Collect the statistics of specific types of behavior in the sched_ext core,
which are not easily visible but still interesting to an scx scheduler.
Also, add a core event, SCX_EVENT_INVAL_SELECT_CPU, which represents how
many times ops.select_cpu() returns a CPU that the task can't use.
Signed-off-by: Changwoo Min <changwoo@igalia.com>
---
kernel/sched/ext.c | 120 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 118 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 0bcdd1a31676..7e12d5b8322e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1458,6 +1458,66 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
+/*
+ * Collection of event counters.
+ */
+struct scx_event_stat {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ u64 INVAL_SELECT_CPU;
+};
+
+#define SCX_EVENT_IDX(e) (offsetof(struct scx_event_stat, e)/sizeof(u64))
+#define SCX_EVENT_END_IDX() (sizeof(struct scx_event_stat)/sizeof(u64))
+#define SCX_EVENT_DEFINE(e) SCX_EVENT_##e = SCX_EVENT_IDX(e)
+
+/*
+ * Types of event counters.
+ */
+enum scx_event_kind {
+ SCX_EVENT_BEGIN = 0,
+ SCX_EVENT_DEFINE(INVAL_SELECT_CPU),
+ SCX_EVENT_END = SCX_EVENT_END_IDX(),
+};
+
+static const char *scx_event_stat_str[] = {
+ [SCX_EVENT_INVAL_SELECT_CPU] = "invalid_select_cpu",
+};
+
+/*
+ * The event counter is organized by a per-CPU variable to minimize the
+ * accounting overhead without synchronization. A system-wide view on the
+ * event counter is constructed when requested by scx_bpf_get_event_stat().
+ */
+static DEFINE_PER_CPU(struct scx_event_stat, event_stats);
+
+/**
+ * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stat
+ * @cnt: the number of the event occured
+ */
+#define scx_add_event(name, cnt) ({ \
+ struct scx_event_stat *__e; \
+ __e = get_cpu_ptr(&event_stats); \
+ WRITE_ONCE(__e->name, __e->name+ (cnt)); \
+ put_cpu_ptr(&event_stats); \
+})
+
+
+/**
+ * scx_read_event_kind - Read an event from 'e' with 'kind'
+ * @e: a pointer to an event collected by scx_bpf_event_stat()
+ * @kine: an event type defined in scx_event_kind
+ */
+#define scx_read_event_kind(e, kind) ({ \
+ u64 *__e64 = (u64 *)(e); \
+ __e64[kind]; \
+})
+
+static void scx_bpf_event_stat(struct scx_event_stat *event, size_t event__sz);
+
static enum scx_ops_enable_state scx_ops_enable_state(void)
{
return atomic_read(&scx_ops_enable_state_var);
@@ -3607,8 +3667,10 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
*ddsp_taskp = NULL;
if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
return cpu;
- else
+ else {
+ scx_add_event(INVAL_SELECT_CPU, 1);
return prev_cpu;
+ }
} else {
bool found;
s32 cpu;
@@ -5053,6 +5115,15 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_rq_clock_invalidate(rq);
}
+ /*
+ * Clear event counters so the next scx scheduler always gets
+ * fresh event counter values.
+ */
+ for_each_possible_cpu(cpu) {
+ struct scx_event_stat *e = per_cpu_ptr(&event_stats, cpu);
+ memset(e, 0, sizeof(*e));
+ }
+
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_ops_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
@@ -5309,9 +5380,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
.at_jiffies = jiffies,
};
struct seq_buf s;
+ struct scx_event_stat event;
unsigned long flags;
char *buf;
- int cpu;
+ int cpu, kind;
spin_lock_irqsave(&dump_lock, flags);
@@ -5417,6 +5489,16 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
rq_unlock(rq, &rf);
}
+ dump_newline(&s);
+ dump_line(&s, "Event counters");
+ dump_line(&s, "--------------");
+
+ scx_bpf_event_stat(&event, sizeof(event));
+ for (kind = SCX_EVENT_BEGIN; kind < SCX_EVENT_END; kind++) {
+ dump_line(&s, "%25s : %llu", scx_event_stat_str[kind],
+ scx_read_event_kind(&event, kind));
+ }
+
if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
memcpy(ei->dump + dump_len - sizeof(trunc_marker),
trunc_marker, sizeof(trunc_marker));
@@ -7720,6 +7802,39 @@ __bpf_kfunc u64 scx_bpf_now(void)
return clock;
}
+/*
+ * scx_bpf_event_stat - Get a system-wide event counter to
+ * @event: output buffer from a BPF program
+ * @event__sz: @event len, must end in '__sz'' for the verifier
+ */
+__bpf_kfunc void scx_bpf_event_stat(struct scx_event_stat *event,
+ size_t event__sz)
+{
+ struct scx_event_stat *e;
+ u64 *event64, *e64;
+ int cpu, kind, event_end;
+
+ /*
+ * We cannot entirely trust a BPF-provided size since a BPF program
+ * might be compiled against a different vmlinux.h, of which
+ * scx_event_stat would be larger (a newer vmlinux.h) or smaller
+ * (an older vmlinux.h). Hence, we use the smaller size to avoid
+ * memory corruption.
+ */
+ event__sz = min(event__sz, sizeof(*event));
+ event_end = event__sz / sizeof(u64);
+
+ event64 = (u64 *)event;
+ memset(event, 0, event__sz);
+ for_each_possible_cpu(cpu) {
+ e = per_cpu_ptr(&event_stats, cpu);
+ e64 = (u64 *)e;
+ for (kind = 0; kind < event_end; kind++) {
+ event64[kind] += READ_ONCE(e64[kind]);
+ }
+ }
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7752,6 +7867,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
BTF_ID_FLAGS(func, scx_bpf_now)
+BTF_ID_FLAGS(func, scx_bpf_event_stat, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
--
2.48.1
Hello,
On 1/16/2025 4:15 PM, Changwoo Min wrote:
> Collect the statistics of specific types of behavior in the sched_ext core,
> which are not easily visible but still interesting to an scx scheduler.
>
> Also, add a core event, SCX_EVENT_INVAL_SELECT_CPU, which represents how
> many times ops.select_cpu() returns a CPU that the task can't use.
>
> Signed-off-by: Changwoo Min <changwoo@igalia.com>
> ---
> kernel/sched/ext.c | 120 ++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 118 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 0bcdd1a31676..7e12d5b8322e 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1458,6 +1458,66 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
> return p;
> }
>
> +/*
> + * Collection of event counters.
> + */
> +struct scx_event_stat {
> + /*
> + * If ops.select_cpu() returns a CPU which can't be used by the task,
> + * the core scheduler code silently picks a fallback CPU.
> + */
> + u64 INVAL_SELECT_CPU;
> +};
> +
> +#define SCX_EVENT_IDX(e) (offsetof(struct scx_event_stat, e)/sizeof(u64))
> +#define SCX_EVENT_END_IDX() (sizeof(struct scx_event_stat)/sizeof(u64))
> +#define SCX_EVENT_DEFINE(e) SCX_EVENT_##e = SCX_EVENT_IDX(e)
> +
scx_event_stat fields are required to be u64, otherwise the macros below
don't work correctly. Perhaps a comment could highlight this "hidden"
constraint? As well as a BUILD_BUG_ON to check at build time that the
constraint holds.
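For example, a partial file-scope check could look like this (untested
sketch; it only catches a struct whose total size stops being a multiple
of u64, not e.g. two u32 fields that happen to add up to one):

	/*
	 * Untested sketch: catch (some) violations of the "u64 fields
	 * only" rule at build time.  static_assert() from
	 * <linux/build_bug.h> works at file scope, right below the
	 * struct definition.
	 */
	static_assert(sizeof(struct scx_event_stat) % sizeof(u64) == 0);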
> +/*
> + * Types of event counters.
> + */
> +enum scx_event_kind {
> + SCX_EVENT_BEGIN = 0,
> + SCX_EVENT_DEFINE(INVAL_SELECT_CPU),
> + SCX_EVENT_END = SCX_EVENT_END_IDX(),
> +};
> +
> +static const char *scx_event_stat_str[] = {
> + [SCX_EVENT_INVAL_SELECT_CPU] = "invalid_select_cpu",
> +};
> +
If an event is added to scx_event_kind but not scx_event_stat_str, the
array can possibly be accessed out of bounds (or into a NULL string
depending on the missing index). The GNU C extension could be used to
initialize all elements to the empty string "", like this:
static const char *scx_event_stat_str[SCX_EVENT_END] = {
[0 ... SCX_EVENT_END - 1] = "",
[SCX_EVENT_INVAL_SELECT_CPU] = "invalid_select_cpu",
};
Alternatively a helper could be used:
static const char *scx_event_name(enum scx_event_kind kind)
{
return scx_event_stat_str[kind] ? : "";
}
> +/*
> + * The event counter is organized by a per-CPU variable to minimize the
> + * accounting overhead without synchronization. A system-wide view on the
> + * event counter is constructed when requested by scx_bpf_get_event_stat().
> + */
> +static DEFINE_PER_CPU(struct scx_event_stat, event_stats);
> +
> +/**
> + * scx_add_event - Increase an event counter for 'name' by 'cnt'
> + * @name: an event name defined in struct scx_event_stat
> + * @cnt: the number of the event occured
> + */
> +#define scx_add_event(name, cnt) ({ \
> + struct scx_event_stat *__e; \
> + __e = get_cpu_ptr(&event_stats); \
> + WRITE_ONCE(__e->name, __e->name+ (cnt)); \
> + put_cpu_ptr(&event_stats); \
> +})
> +
> +
> +/**
> + * scx_read_event_kind - Read an event from 'e' with 'kind'
> + * @e: a pointer to an event collected by scx_bpf_event_stat()
> + * @kine: an event type defined in scx_event_kind
> + */
> +#define scx_read_event_kind(e, kind) ({ \
> + u64 *__e64 = (u64 *)(e); \
> + __e64[kind]; \
> +})
> +
nit: typo, "@kine" instead of "@kind"
> +static void scx_bpf_event_stat(struct scx_event_stat *event, size_t event__sz);
> +
> static enum scx_ops_enable_state scx_ops_enable_state(void)
> {
> return atomic_read(&scx_ops_enable_state_var);
> @@ -3607,8 +3667,10 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
> *ddsp_taskp = NULL;
> if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
> return cpu;
> - else
> + else {
> + scx_add_event(INVAL_SELECT_CPU, 1);
> return prev_cpu;
> + }
> } else {
> bool found;
> s32 cpu;
> @@ -5053,6 +5115,15 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
> scx_rq_clock_invalidate(rq);
> }
>
> + /*
> + * Clear event counters so the next scx scheduler always gets
> + * fresh event counter values.
> + */
> + for_each_possible_cpu(cpu) {
> + struct scx_event_stat *e = per_cpu_ptr(&event_stats, cpu);
> + memset(e, 0, sizeof(*e));
> + }
> +
> /* no task is on scx, turn off all the switches and flush in-progress calls */
> static_branch_disable(&__scx_ops_enabled);
> for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
> @@ -5309,9 +5380,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
> .at_jiffies = jiffies,
> };
> struct seq_buf s;
> + struct scx_event_stat event;
> unsigned long flags;
> char *buf;
> - int cpu;
> + int cpu, kind;
>
> spin_lock_irqsave(&dump_lock, flags);
>
> @@ -5417,6 +5489,16 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
> rq_unlock(rq, &rf);
> }
>
> + dump_newline(&s);
> + dump_line(&s, "Event counters");
> + dump_line(&s, "--------------");
> +
> + scx_bpf_event_stat(&event, sizeof(event));
> + for (kind = SCX_EVENT_BEGIN; kind < SCX_EVENT_END; kind++) {
> + dump_line(&s, "%25s : %llu", scx_event_stat_str[kind],
> + scx_read_event_kind(&event, kind));
> + }
> +
> if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
> memcpy(ei->dump + dump_len - sizeof(trunc_marker),
> trunc_marker, sizeof(trunc_marker));
> @@ -7720,6 +7802,39 @@ __bpf_kfunc u64 scx_bpf_now(void)
> return clock;
> }
>
> +/*
> + * scx_bpf_event_stat - Get a system-wide event counter to
> + * @event: output buffer from a BPF program
> + * @event__sz: @event len, must end in '__sz'' for the verifier
> + */
> +__bpf_kfunc void scx_bpf_event_stat(struct scx_event_stat *event,
> + size_t event__sz)
> +{
> + struct scx_event_stat *e;
> + u64 *event64, *e64;
> + int cpu, kind, event_end;
> +
> + /*
> + * We cannot entirely trust a BPF-provided size since a BPF program
> + * might be compiled against a different vmlinux.h, of which
> + * scx_event_stat would be larger (a newer vmlinux.h) or smaller
> + * (an older vmlinux.h). Hence, we use the smaller size to avoid
> + * memory corruption.
> + */
> + event__sz = min(event__sz, sizeof(*event));
> + event_end = event__sz / sizeof(u64);
> +
> + event64 = (u64 *)event;
> + memset(event, 0, event__sz);
> + for_each_possible_cpu(cpu) {
> + e = per_cpu_ptr(&event_stats, cpu);
> + e64 = (u64 *)e;
> + for (kind = 0; kind < event_end; kind++) {
> + event64[kind] += READ_ONCE(e64[kind]);
> + }
> + }
> +}
> +
> __bpf_kfunc_end_defs();
>
> BTF_KFUNCS_START(scx_kfunc_ids_any)
> @@ -7752,6 +7867,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
> BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
> #endif
> BTF_ID_FLAGS(func, scx_bpf_now)
> +BTF_ID_FLAGS(func, scx_bpf_event_stat, KF_TRUSTED_ARGS)
> BTF_KFUNCS_END(scx_kfunc_ids_any)
>
> static const struct btf_kfunc_id_set scx_kfunc_set_any = {
Hello Leonardo,
Thanks for the review.
On 25. 1. 22. 00:00, Leonardo Temperanza wrote:
> scx_event_stat fields are required to be u64, otherwise the macros below
> don't work correctly. Perhaps a comment could highlight this "hidden"
> constraint? As well as a BUILD_BUG_ON to verify that this constraint is
> verified at build time.
That's a good suggestion. I will document any hidden constraints in the comments.
> If an event is added to scx_event_kind but not scx_event_stat_str, the
> array can possibly be accessed out of bounds (or into a NULL string
> depending on the missing index). The GNU C extension could be used to
> initialize all elements to the empty string "", like this:
>
> static const char *scx_event_stat_str[SCX_EVENT_END] = {
> [0 ... SCX_EVENT_END - 1] = "",
> [SCX_EVENT_INVAL_SELECT_CPU] = "invalid_select_cpu",
> };
>
> Alternatively a helper could be used:
>
> static const char *scx_event_name(enum scx_event_kind kind)
> {
> return scx_event_stat_str[kind] ? : "";
> }
Thanks for the suggestion. I am considering dropping the event enum and
event_str in the next version, since their only purpose is to print dump
messages. At this point, the simpler the better, I think.
>> +/**
>> + * scx_read_event_kind - Read an event from 'e' with 'kind'
>> + * @e: a pointer to an event collected by scx_bpf_event_stat()
>> + * @kine: an event type defined in scx_event_kind
>> + */
>> +#define scx_read_event_kind(e, kind) ({ \
>> + u64 *__e64 = (u64 *)(e); \
>> + __e64[kind]; \
>> +})
>> +
>
>
> nit: typo, "@kine" instead of "@kind"
Good catch. Will fix it.
Regards,
Changwoo Min
On Fri, Jan 17, 2025 at 12:15:37AM +0900, Changwoo Min wrote:
...
> +/*
> + * The event counter is organized by a per-CPU variable to minimize the
> + * accounting overhead without synchronization. A system-wide view on the
> + * event counter is constructed when requested by scx_bpf_get_event_stat().
> + */
> +static DEFINE_PER_CPU(struct scx_event_stat, event_stats);

Should we consider tracking these statistics per-scheduler rather than
globally (like adding scx_event_stat to sched_ext_ops)?

It's not particularly important for now, but in the future, if we allow
multiple scx schedulers to be loaded at the same time, tracking separate
stats per-scheduler would be preferable.

Thanks,
-Andrea
Hello,

Thank you for the suggestion, Andrea!

On 25. 1. 17. 18:49, Andrea Righi wrote:
> On Fri, Jan 17, 2025 at 12:15:37AM +0900, Changwoo Min wrote:
> ...
>> +/*
>> + * The event counter is organized by a per-CPU variable to minimize the
>> + * accounting overhead without synchronization. A system-wide view on the
>> + * event counter is constructed when requested by scx_bpf_get_event_stat().
>> + */
>> +static DEFINE_PER_CPU(struct scx_event_stat, event_stats);
>
> Should we consider tracking these statistics per-scheduler rather than
> globally (like adding scx_event_stat to sched_ext_ops)?
>
> It's not particularly important for now, but in the future, if we allow
> multiple scx schedulers to be loaded at the same time, tracking separate
> stats per-scheduler would be preferable.

It will be useful to get per-scheduler information. Since how the
scheduler composability will be embodied is wide open at this point, I
will revisit the interface design as the composability design gets more
concrete.

Regards,
Changwoo Min
Hello,

On Fri, Jan 17, 2025 at 10:49:55AM +0100, Andrea Righi wrote:
> On Fri, Jan 17, 2025 at 12:15:37AM +0900, Changwoo Min wrote:
> ...
> > +/*
> > + * The event counter is organized by a per-CPU variable to minimize the
> > + * accounting overhead without synchronization. A system-wide view on the
> > + * event counter is constructed when requested by scx_bpf_get_event_stat().
> > + */
> > +static DEFINE_PER_CPU(struct scx_event_stat, event_stats);
>
> Should we consider tracking these statistics per-scheduler rather than
> globally (like adding scx_event_stat to sched_ext_ops)?
>
> It's not particularly important for now, but in the future, if we allow
> multiple scx schedulers to be loaded at the same time, tracking separate
> stats per-scheduler would be preferable.

Yeah, eventually, but I don't think it affects anything user visible right
now. Later, we may add an interface to query the stats for a specific
scheduler provided the scheduler is a child of the current one and so on
but we'd still need an interface to query "my stats" which the current
interface can become.

Thanks.

--
tejun
Hello,
On Fri, Jan 17, 2025 at 12:15:37AM +0900, Changwoo Min wrote:
> +/*
> + * Collection of event counters.
> + */
> +struct scx_event_stat {
If we keep this struct, maybe name it scx_event_stats?
> + /*
> + * If ops.select_cpu() returns a CPU which can't be used by the task,
> + * the core scheduler code silently picks a fallback CPU.
> + */
> + u64 INVAL_SELECT_CPU;
Let's do $COMPONENT_$EVENT, so SELECT_CPU_INVALID (or maybe something more
specific than invalid, like disallowed or fallback?).
> +#define SCX_EVENT_IDX(e) (offsetof(struct scx_event_stat, e)/sizeof(u64))
> +#define SCX_EVENT_END_IDX() (sizeof(struct scx_event_stat)/sizeof(u64))
> +#define SCX_EVENT_DEFINE(e) SCX_EVENT_##e = SCX_EVENT_IDX(e)
> +
> +/*
> + * Types of event counters.
> + */
> +enum scx_event_kind {
> + SCX_EVENT_BEGIN = 0,
I think 0 BEGIN value can be implicit.
> + SCX_EVENT_DEFINE(INVAL_SELECT_CPU),
> + SCX_EVENT_END = SCX_EVENT_END_IDX(),
SCX_NR_EVENTS? Also, we don't really need to macro to count the enums.
> +};
This feels a bit odd to me. Why does it need both the struct fields and
indices? Can we do either one of those?
> +static const char *scx_event_stat_str[] = {
> + [SCX_EVENT_INVAL_SELECT_CPU] = "invalid_select_cpu",
> +};
and hopefully derive this through # macro arg expansion?
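For instance, something along these lines (untested sketch; the strings
then simply become the field names):

	#define SCX_EVENT_LIST(X)					\
		X(INVAL_SELECT_CPU)

	#define SCX_EVENT_ENUM(e)	SCX_EVENT_##e = SCX_EVENT_IDX(e),
	#define SCX_EVENT_STR(e)	[SCX_EVENT_##e] = #e,

	enum scx_event_kind {
		SCX_EVENT_LIST(SCX_EVENT_ENUM)
		SCX_NR_EVENTS = sizeof(struct scx_event_stat) / sizeof(u64),
	};

	static const char *scx_event_stat_str[] = {
		SCX_EVENT_LIST(SCX_EVENT_STR)
	};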
> +/*
> + * The event counter is organized by a per-CPU variable to minimize the
> + * accounting overhead without synchronization. A system-wide view on the
> + * event counter is constructed when requested by scx_bpf_get_event_stat().
> + */
> +static DEFINE_PER_CPU(struct scx_event_stat, event_stats);
May be better to include "cpu" in the variable name.
> +/**
> + * scx_add_event - Increase an event counter for 'name' by 'cnt'
> + * @name: an event name defined in struct scx_event_stat
> + * @cnt: the number of the event occured
> + */
> +#define scx_add_event(name, cnt) ({ \
> + struct scx_event_stat *__e; \
> + __e = get_cpu_ptr(&event_stats); \
> + WRITE_ONCE(__e->name, __e->name+ (cnt)); \
> + put_cpu_ptr(&event_stats); \
this_cpu_add()?
> @@ -3607,8 +3667,10 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
> *ddsp_taskp = NULL;
> if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
> return cpu;
> - else
> + else {
> + scx_add_event(INVAL_SELECT_CPU, 1);
> return prev_cpu;
> + }
formatting:
if () {
} else {
}
Also, I'm not sure this is a useful event to count. False ops_cpu_valid()
indicates that the returned CPU is not even possible and the scheduler is
ejected right away. What's more interesting is
kernel/sched/core.c::select_task_rq() tripping on !is_cpu_allowed() and
falling back using select_fallback_rq().
We can either hook into core.c or just compare the ops.select_cpu() picked
CPU against the CPU the task ends up on in enqueue_task_scx().
> @@ -5053,6 +5115,15 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
> scx_rq_clock_invalidate(rq);
> }
>
> + /*
> + * Clear event counters so the next scx scheduler always gets
> + * fresh event counter values.
> + */
> + for_each_possible_cpu(cpu) {
> + struct scx_event_stat *e = per_cpu_ptr(&event_stats, cpu);
> + memset(e, 0, sizeof(*e));
> + }
> +
Wouldn't we want to keep these counters intact on ejection so that the
scheduler's ejection path can capture the counters if necessary? Resetting
on load probably is better.
Thanks.
--
tejun
Hello,
Thank you for the review!
On 25. 1. 17. 10:33, Tejun Heo wrote:
> Hello,
>
> On Fri, Jan 17, 2025 at 12:15:37AM +0900, Changwoo Min wrote:
>> +/*
>> + * Collection of event counters.
>> + */
>> +struct scx_event_stat {
>
> If we keep this struct, maybe name it scx_event_stats?
Sure, I will change it as suggested. Also, I will change
scx_bpf_event_stat() to scx_bpf_event_stats() for consistency.
>
>> + /*
>> + * If ops.select_cpu() returns a CPU which can't be used by the task,
>> + * the core scheduler code silently picks a fallback CPU.
>> + */
>> + u64 INVAL_SELECT_CPU;
>
> Let's do $COMPONENT_$EVENT, so SELECT_CPU_INVALID (or maybe something more
> specific than invalid, like disallowed or fallback?).
$COMPONENT_$EVENT sounds more systematic. Thanks for the
suggestion. What about SELECT_CPU_FALLBACK?
>> +#define SCX_EVENT_IDX(e) (offsetof(struct scx_event_stat, e)/sizeof(u64))
>> +#define SCX_EVENT_END_IDX() (sizeof(struct scx_event_stat)/sizeof(u64))
>> +#define SCX_EVENT_DEFINE(e) SCX_EVENT_##e = SCX_EVENT_IDX(e)
>> +
>> +/*
>> + * Types of event counters.
>> + */
>> +enum scx_event_kind {
>> + SCX_EVENT_BEGIN = 0,
>
> I think 0 BEGIN value can be implicit.
Yup, I will remove it in the next version.
>> + SCX_EVENT_DEFINE(INVAL_SELECT_CPU),
>> + SCX_EVENT_END = SCX_EVENT_END_IDX(),
>
> SCX_NR_EVENTS? Also, we don't really need to macro to count the enums.
For SCX_EVENT_BEGIN/END, I followed the style of
SCX_OPI_BEGIN/END. You are right. I will remove
SCX_EVENT_END_IDX() in the next version.
> This feels a bit odd to me. Why does it need both the struct fields and
> indices? Can we do either one of those?
Before submitting the patch set, I tested one approach using
purely enums storing the actual counters in an array and another
(current) approach using struct fields. I rejected the first
approach because the BPF verifier does not allow changing the
array size, causing the BPF binary compatibility problem. The
second approach is satisfactory regarding the BPF binary
compatibility by CO-RE. However, it is a little bit cumbersome to
iterate all the events, so I added the enums. I know the enums
are only usable in the kernel code due to the binary
compatibility, but I thought adding the enums rather than
bloating the scx_dump_state() code at the end would be better.
Does it make sense to you?
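For reference, on the BPF side I expect the usage to look roughly like
this (a sketch only; BPF_STRUCT_OPS() and scx_bpf_dump() are the usual
helpers from the scx BPF headers, and the op name is just for
illustration):

	void BPF_STRUCT_OPS(sched_dump, struct scx_dump_ctx *dctx)
	{
		/* Zero-init so any field unknown to the kernel stays 0. */
		struct scx_event_stat e = {};

		/*
		 * The kernel clamps the copy to the smaller of the BPF
		 * and kernel struct sizes, so a vmlinux.h mismatch is
		 * not fatal.
		 */
		scx_bpf_event_stat(&e, sizeof(e));
		scx_bpf_dump("INVAL_SELECT_CPU=%llu", e.INVAL_SELECT_CPU);
	}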
>
>> +static const char *scx_event_stat_str[] = {
>> + [SCX_EVENT_INVAL_SELECT_CPU] = "invalid_select_cpu",
>> +};
>
> and hopefully derive this through # macro arg expansion?
That's a good idea. I will change it.
>
>> +/*
>> + * The event counter is organized by a per-CPU variable to minimize the
>> + * accounting overhead without synchronization. A system-wide view on the
>> + * event counter is constructed when requested by scx_bpf_get_event_stat().
>> + */
>> +static DEFINE_PER_CPU(struct scx_event_stat, event_stats);
>
> May be better to include "cpu" in the variable name.
I will add a "_cpu" suffix, renaming it to "event_stats_cpu".
>> +/**
>> + * scx_add_event - Increase an event counter for 'name' by 'cnt'
>> + * @name: an event name defined in struct scx_event_stat
>> + * @cnt: the number of the event occured
>> + */
>> +#define scx_add_event(name, cnt) ({ \
>> + struct scx_event_stat *__e; \
>> + __e = get_cpu_ptr(&event_stats); \
>> + WRITE_ONCE(__e->name, __e->name+ (cnt)); \
>> + put_cpu_ptr(&event_stats); \
>
> this_cpu_add()?
That's handier. I will change it.
>
>> @@ -3607,8 +3667,10 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
>> *ddsp_taskp = NULL;
>> if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
>> return cpu;
>> - else
>> + else {
>> + scx_add_event(INVAL_SELECT_CPU, 1);
>> return prev_cpu;
>> + }
>
> formatting:
>
> if () {
> } else {
> }
My bad. Will fix it.
> Also, I'm not sure this is a useful event to count. False ops_cpu_valid()
> indicates that the returned CPU is not even possible and the scheduler is
> ejected right away. What's more interesting is
> kernel/sched/core.c::select_task_rq() tripping on !is_cpu_allowed() and
> falling back using select_fallback_rq().
>
> We can either hook into core.c or just compare the ops.select_cpu() picked
> CPU against the CPU the task ends up on in enqueue_task_scx().
Modifying core.c will be more direct and straightforward. Also,
I think it would be better to separate this commit into two: one
for the infrastructure and another for the SELECT_CPU_FALLBACK
event, which touches core.c. I will move the necessary
infrastructure code into kernel/sched/sched.h, so we can use
scx_add_event() in core.c.
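For the core.c side, I am thinking of something roughly like the
following (just a sketch; the exact hook point, the event name, and the
!CONFIG_SCHED_CLASS_EXT handling still need to be worked out):

	/* in kernel/sched/core.c::select_task_rq(), sketch only */
	if (unlikely(!is_cpu_allowed(p, cpu))) {
		if (task_on_scx(p))
			scx_add_event(SELECT_CPU_FALLBACK, 1);
		cpu = select_fallback_rq(task_cpu(p), p);
	}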
>
>> @@ -5053,6 +5115,15 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
>> scx_rq_clock_invalidate(rq);
>> }
>>
>> + /*
>> + * Clear event counters so the next scx scheduler always gets
>> + * fresh event counter values.
>> + */
>> + for_each_possible_cpu(cpu) {
>> + struct scx_event_stat *e = per_cpu_ptr(&event_stats, cpu);
>> + memset(e, 0, sizeof(*e));
>> + }
>> +
>
> Wouldn't we want to keep these counters intact on ejection so that the
> scheduler's ejection path can capture the counters if necessary? Resetting
> on load probably is better.
That makes sense. I will move the resetting code on load.
Regards,
Changwoo Min
Hello,
On Fri, Jan 17, 2025 at 04:08:45PM +0900, Changwoo Min wrote:
> $COMPONENT_$EVENT sounds more systematic. Thanks for the
> suggestion. What about SELECT_CPU_FALLBACK?
Sounds good.
> > This feels a bit odd to me. Why does it need both the struct fields and
> > indices? Can we do either one of those?
>
> Before submitting the patch set, I tested one approach using
> purely enums storing the actual counters in an array and another
> (current) approach using struct fields. I rejected the first
> approach because the BPF verifier does not allow changing the
> array size, causing the BPF binary compatibility problem. The
That makes sense. It'd be useful to note that on top of the struct
definition.
> second approach is satisfactory regarding the BPF binary
> compatibility by CO-RE. However, it is a little bit cumbersome to
> iterate all the events, so I added the enums. I know the enums
> are only usable in the kernel code due to the binary
> compatibility, but I thought adding the enums rather than
> bloating the scx_dump_state() code at the end would be better.
> Does it make sense to you?
Let's just use the structs and open code dumping. We can pack the dump
output better that way too and I don't think BPF scheds would have much use
for enum iterations.
...
> > > +/**
> > > + * scx_add_event - Increase an event counter for 'name' by 'cnt'
> > > + * @name: an event name defined in struct scx_event_stat
> > > + * @cnt: the number of the event occured
> > > + */
> > > +#define scx_add_event(name, cnt) ({ \
> > > + struct scx_event_stat *__e; \
> > > + __e = get_cpu_ptr(&event_stats); \
> > > + WRITE_ONCE(__e->name, __e->name+ (cnt)); \
> > > + put_cpu_ptr(&event_stats); \
> >
> > this_cpu_add()?
>
> That's handier. I will change it.
Note that there's __this_cpu_add() too which can be used from sections of
code that already disables preemption. On x86, both variants cost the same
but on arm __this_cpu_add() is cheaper as it can skip preemption off/on.
Given that most of these counters are modified while holding rq lock, it may
make sense to provide __ version too.
...
> > Also, I'm not sure this is a useful event to count. False ops_cpu_valid()
> > indicates that the returned CPU is not even possible and the scheduler is
> > ejected right away. What's more interesting is
> > kernel/sched/core.c::select_task_rq() tripping on !is_cpu_allowed() and
> > falling back using select_fallback_rq().
> >
> > We can either hook into core.c or just compare the ops.select_cpu() picked
> > CPU against the CPU the task ends up on in enqueue_task_scx().
>
> Modifying core.c will be more direct and straightforward. Also,
> I think it would be better to separate this commit into two: one
> for the infra-structure and another for the SELECT_CPU_FALLBACK
> event, which touches core.c. I will move the necessary code in
> the infrastructure into kernel/sched/sched.h, so we can use
> scx_add_event() in core.c.
Hmm.... yeah, both approaches have pros and cons but I kinda like the idea
of restricting the events within ext.c and here detecting the fallback
condition is pretty trivial. I don't know.
Thanks.
--
tejun
Hello,
On 25. 1. 18. 01:24, Tejun Heo wrote:
>> second approach is satisfactory regarding the BPF binary
>> compatibility by CO-RE. However, it is a little bit cumbersome to
>> iterate all the events, so I added the enums. I know the enums
>> are only usable in the kernel code due to the binary
>> compatibility, but I thought adding the enums rather than
>> bloating the scx_dump_state() code at the end would be better.
>> Does it make sense to you?
>
> Let's just use the structs and open code dumping. We can pack the dump
> output better that way too and I don't think BPF scheds would have much use
> for enum iterations.
Currently, enums are not exposed to BPF schedulers. Also, BPF
schedulers should not rely on the enums because the enums are
coupled with a specific layout of the struct, so that would cause
a BPF binary compatibility problem.
Could you explain more about the code dumping? I want to at least print
the event string for easier interpretation.
> ...
>>>> +/**
>>>> + * scx_add_event - Increase an event counter for 'name' by 'cnt'
>>>> + * @name: an event name defined in struct scx_event_stat
>>>> + * @cnt: the number of the event occured
>>>> + */
>>>> +#define scx_add_event(name, cnt) ({ \
>>>> + struct scx_event_stat *__e; \
>>>> + __e = get_cpu_ptr(&event_stats); \
>>>> + WRITE_ONCE(__e->name, __e->name+ (cnt)); \
>>>> + put_cpu_ptr(&event_stats); \
>>>
>>> this_cpu_add()?
>>
>> That's handier. I will change it.
>
> Note that there's __this_cpu_add() too which can be used from sections of
> code that already disables preemption. On x86, both variants cost the same
> but on arm __this_cpu_add() is cheaper as it can skip preemption off/on.
> Given that most of these counters are modified while holding rq lock, it may
> make sense to provide __ version too.
Sounds good. I will add __scx_add_event() following the
convention of this_cpu_add() and __this_cpu_add().
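Concretely, something like the following (a sketch, using the
event_stats_cpu name mentioned above):

	/*
	 * scx_add_event() for preemptible contexts, __scx_add_event()
	 * for paths that already run with preemption disabled, e.g.
	 * under the rq lock.
	 */
	#define scx_add_event(name, cnt) do {				\
		this_cpu_add(event_stats_cpu.name, (cnt));		\
	} while (0)

	#define __scx_add_event(name, cnt) do {				\
		__this_cpu_add(event_stats_cpu.name, (cnt));		\
	} while (0)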
>
> ...
>>> Also, I'm not sure this is a useful event to count. False ops_cpu_valid()
>>> indicates that the returned CPU is not even possible and the scheduler is
>>> ejected right away. What's more interesting is
>>> kernel/sched/core.c::select_task_rq() tripping on !is_cpu_allowed() and
>>> falling back using select_fallback_rq().
>>>
>>> We can either hook into core.c or just compare the ops.select_cpu() picked
>>> CPU against the CPU the task ends up on in enqueue_task_scx().
>>
>> Modifying core.c will be more direct and straightforward. Also,
>> I think it would be better to separate this commit into two: one
>> for the infra-structure and another for the SELECT_CPU_FALLBACK
>> event, which touches core.c. I will move the necessary code in
>> the infrastructure into kernel/sched/sched.h, so we can use
>> scx_add_event() in core.c.
>
> Hmm.... yeah, both approaches have pros and cons but I kinda like the idea
> of restricting the events within ext.c and here detecting the fallback
> condition is pretty trivial. I don't know.
Right, it will be possible to track the same thing without touching
core.c (that's too much) by replicating the conditions from core.c
in ext.c. I will dig into the code more to figure out what the best
approach is.
Regards,
Changwoo Min
Hello,
On Sat, Jan 18, 2025 at 09:27:18AM +0900, Changwoo Min wrote:
...
> Could you explain more on the code dumping? I want at least print
> the event string for easier interpretation.
I meant that we don't need the enums to print out the counters. e.g. we
can just do:
dump_line(&s, "selcpu_inval: %16llu dsp_keep_last: %16llu",
events.select_cpu_inval, events.dispatch_keep_last);
Thanks.
--
tejun
Hello,

On 25. 1. 22. 09:42, Tejun Heo wrote:
> I meant that we don't need the enums to print out the counters. e.g. we
> can just do:
>
> dump_line(&s, "selcpu_inval: %16llu dsp_keep_last: %16llu",
> events.select_cpu_inval, events.dispatch_keep_last);

Haha. Thanks for the clarification. I got it.

Regards,
Changwoo Min