include/trace/events/sched_ext.h | 21 +++++++++++++++++++++ kernel/sched/ext.c | 4 ++++ 2 files changed, 25 insertions(+)
Add tracing support, which may be useful for debugging sched_ext schedulers
that trigger a certain event.
Signed-off-by: Changwoo Min <changwoo@igalia.com>
---
include/trace/events/sched_ext.h | 21 +++++++++++++++++++++
kernel/sched/ext.c | 4 ++++
2 files changed, 25 insertions(+)
diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
index fe19da7315a9..88527b9316de 100644
--- a/include/trace/events/sched_ext.h
+++ b/include/trace/events/sched_ext.h
@@ -26,6 +26,27 @@ TRACE_EVENT(sched_ext_dump,
)
);
+TRACE_EVENT(sched_ext_add_event,
+ TP_PROTO(const char *name, int offset, __u64 added),
+ TP_ARGS(name, offset, added),
+
+ TP_STRUCT__entry(
+ __string(name, name)
+ __field( int, offset )
+ __field( __u64, added )
+ ),
+
+ TP_fast_assign(
+ __assign_str(name);
+ __entry->offset = offset;
+ __entry->added = added;
+ ),
+
+ TP_printk("name %s offset %d added %llu",
+ __get_str(name), __entry->offset, __entry->added
+ )
+);
+
#endif /* _TRACE_SCHED_EXT_H */
/* This part must be outside protection */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 986b655911df..825e79863057 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1554,6 +1554,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
*/
#define scx_add_event(name, cnt) do { \
this_cpu_add(event_stats_cpu.name, cnt); \
+ trace_sched_ext_add_event(#name, \
+ offsetof(struct scx_event_stats, name), cnt); \
} while(0)
/**
@@ -1565,6 +1567,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
*/
#define __scx_add_event(name, cnt) do { \
__this_cpu_add(event_stats_cpu.name, cnt); \
+ trace_sched_ext_add_event(#name, \
+ offsetof(struct scx_event_stats, name), cnt); \
} while(0)
/**
--
2.48.1
Hi Changwoo,
On Wed, Feb 26, 2025 at 11:33:27PM +0900, Changwoo Min wrote:
> Add tracing support, which may be useful for debugging sched_ext schedulers
> that trigger a certain event.
>
> Signed-off-by: Changwoo Min <changwoo@igalia.com>
> ---
> include/trace/events/sched_ext.h | 21 +++++++++++++++++++++
> kernel/sched/ext.c | 4 ++++
> 2 files changed, 25 insertions(+)
>
> diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
> index fe19da7315a9..88527b9316de 100644
> --- a/include/trace/events/sched_ext.h
> +++ b/include/trace/events/sched_ext.h
> @@ -26,6 +26,27 @@ TRACE_EVENT(sched_ext_dump,
> )
> );
>
> +TRACE_EVENT(sched_ext_add_event,
> + TP_PROTO(const char *name, int offset, __u64 added),
> + TP_ARGS(name, offset, added),
> +
> + TP_STRUCT__entry(
> + __string(name, name)
> + __field( int, offset )
> + __field( __u64, added )
> + ),
> +
> + TP_fast_assign(
> + __assign_str(name);
> + __entry->offset = offset;
> + __entry->added = added;
> + ),
> +
> + TP_printk("name %s offset %d added %llu",
> + __get_str(name), __entry->offset, __entry->added
> + )
> +);
Isn't the name enough to determine which event has been triggered? What are
the benefits of reporting also the offset within struct scx_event_stats?
Thanks,
-Andrea
> +
> #endif /* _TRACE_SCHED_EXT_H */
>
> /* This part must be outside protection */
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 986b655911df..825e79863057 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1554,6 +1554,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
> */
> #define scx_add_event(name, cnt) do { \
> this_cpu_add(event_stats_cpu.name, cnt); \
> + trace_sched_ext_add_event(#name, \
> + offsetof(struct scx_event_stats, name), cnt); \
> } while(0)
>
> /**
> @@ -1565,6 +1567,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
> */
> #define __scx_add_event(name, cnt) do { \
> __this_cpu_add(event_stats_cpu.name, cnt); \
> + trace_sched_ext_add_event(#name, \
> + offsetof(struct scx_event_stats, name), cnt); \
> } while(0)
>
> /**
> --
> 2.48.1
>
Hi Andrea,
Thank you for the review!
On 25. 2. 27. 16:38, Andrea Righi wrote:
> Hi Changwoo,
>
> On Wed, Feb 26, 2025 at 11:33:27PM +0900, Changwoo Min wrote:
>> Add tracing support, which may be useful for debugging sched_ext schedulers
>> that trigger a certain event.
>>
>> Signed-off-by: Changwoo Min <changwoo@igalia.com>
>> ---
>> include/trace/events/sched_ext.h | 21 +++++++++++++++++++++
>> kernel/sched/ext.c | 4 ++++
>> 2 files changed, 25 insertions(+)
>>
>> diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
>> index fe19da7315a9..88527b9316de 100644
>> --- a/include/trace/events/sched_ext.h
>> +++ b/include/trace/events/sched_ext.h
>> @@ -26,6 +26,27 @@ TRACE_EVENT(sched_ext_dump,
>> )
>> );
>>
>> +TRACE_EVENT(sched_ext_add_event,
>> + TP_PROTO(const char *name, int offset, __u64 added),
>> + TP_ARGS(name, offset, added),
>> +
>> + TP_STRUCT__entry(
>> + __string(name, name)
>> + __field( int, offset )
>> + __field( __u64, added )
>> + ),
>> +
>> + TP_fast_assign(
>> + __assign_str(name);
>> + __entry->offset = offset;
>> + __entry->added = added;
>> + ),
>> +
>> + TP_printk("name %s offset %d added %llu",
>> + __get_str(name), __entry->offset, __entry->added
>> + )
>> +);
>
> Isn't the name enough to determine which event has been triggered? What are
> the benefits of reporting also the offset within struct scx_event_stats?
>
@name and @offset are duplicated information. However, I thought
having two is more convenient from the users' point of view
because they have different pros and cons.
@offset is quick to compare and can be used easily in the BPF
code, but the offset of an event can change across kernel
versions when new events are added. @offset would be good to
write a quick trace hook for debugging.
On the other hand, @name won't change across kernel versions,
which is good. However, it requires more code to actually read
the string in the BPF code (__data_loc for string is a 32-bit
integer encoding string length and location).
Does it make sense to you?
Regards,
Changwoo Min
On Thu, Feb 27, 2025 at 05:05:54PM +0900, Changwoo Min wrote:
> Hi Andrea,
>
> Thank you for the review!
>
> On 25. 2. 27. 16:38, Andrea Righi wrote:
> > Hi Changwoo,
> >
> > On Wed, Feb 26, 2025 at 11:33:27PM +0900, Changwoo Min wrote:
> > > Add tracing support, which may be useful for debugging sched_ext schedulers
> > > that trigger a certain event.
> > >
> > > Signed-off-by: Changwoo Min <changwoo@igalia.com>
> > > ---
> > > include/trace/events/sched_ext.h | 21 +++++++++++++++++++++
> > > kernel/sched/ext.c | 4 ++++
> > > 2 files changed, 25 insertions(+)
> > >
> > > diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
> > > index fe19da7315a9..88527b9316de 100644
> > > --- a/include/trace/events/sched_ext.h
> > > +++ b/include/trace/events/sched_ext.h
> > > @@ -26,6 +26,27 @@ TRACE_EVENT(sched_ext_dump,
> > > )
> > > );
> > > +TRACE_EVENT(sched_ext_add_event,
> > > + TP_PROTO(const char *name, int offset, __u64 added),
> > > + TP_ARGS(name, offset, added),
> > > +
> > > + TP_STRUCT__entry(
> > > + __string(name, name)
> > > + __field( int, offset )
> > > + __field( __u64, added )
> > > + ),
> > > +
> > > + TP_fast_assign(
> > > + __assign_str(name);
> > > + __entry->offset = offset;
> > > + __entry->added = added;
> > > + ),
> > > +
> > > + TP_printk("name %s offset %d added %llu",
> > > + __get_str(name), __entry->offset, __entry->added
> > > + )
> > > +);
> >
> > Isn't the name enough to determine which event has been triggered? What are
> > the benefits of reporting also the offset within struct scx_event_stats?
> >
>
> @name and @offset are duplicated information. However, I thought
> having two is more convenient from the users' point of view
> because they have different pros and cons.
>
> @offset is quick to compare and can be used easily in the BPF
> code, but the offset of an event can change across kernel
> versions when new events are added. @offset would be good to
> write a quick trace hook for debugging.
>
> On the other hand, @name won't change across kernel versions,
> which is good. However, it requires more code to acutally read
> the string in the BPF code (__data_loc for string is a 32-bit
> integer encoding string length and location).
>
> Does it make sense to you?
So, IMHO @offset would make sense only if we guarantee that it won't
change across kernel versions, and that's probably doable, we just need to
make sure that we always add new events at the bottom of scx_event_stats.
Otherwise there's a risk of breaking potential users of this tracepoint that
may consider the offset like a portable ID.
Maybe we can call it @id or @event_id or similar and guarantee its
portability? What do you think?
-Andrea
On 25. 2. 27. 17:19, Andrea Righi wrote:
> On Thu, Feb 27, 2025 at 05:05:54PM +0900, Changwoo Min wrote:
>> Hi Andrea,
>>
>> Thank you for the review!
>>
>> On 25. 2. 27. 16:38, Andrea Righi wrote:
>>> Hi Changwoo,
>>>
>>> On Wed, Feb 26, 2025 at 11:33:27PM +0900, Changwoo Min wrote:
>>>> Add tracing support, which may be useful for debugging sched_ext schedulers
>>>> that trigger a certain event.
>>>>
>>>> Signed-off-by: Changwoo Min <changwoo@igalia.com>
>>>> ---
>>>> include/trace/events/sched_ext.h | 21 +++++++++++++++++++++
>>>> kernel/sched/ext.c | 4 ++++
>>>> 2 files changed, 25 insertions(+)
>>>>
>>>> diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
>>>> index fe19da7315a9..88527b9316de 100644
>>>> --- a/include/trace/events/sched_ext.h
>>>> +++ b/include/trace/events/sched_ext.h
>>>> @@ -26,6 +26,27 @@ TRACE_EVENT(sched_ext_dump,
>>>> )
>>>> );
>>>> +TRACE_EVENT(sched_ext_add_event,
>>>> + TP_PROTO(const char *name, int offset, __u64 added),
>>>> + TP_ARGS(name, offset, added),
>>>> +
>>>> + TP_STRUCT__entry(
>>>> + __string(name, name)
>>>> + __field( int, offset )
>>>> + __field( __u64, added )
>>>> + ),
>>>> +
>>>> + TP_fast_assign(
>>>> + __assign_str(name);
>>>> + __entry->offset = offset;
>>>> + __entry->added = added;
>>>> + ),
>>>> +
>>>> + TP_printk("name %s offset %d added %llu",
>>>> + __get_str(name), __entry->offset, __entry->added
>>>> + )
>>>> +);
>>>
>>> Isn't the name enough to determine which event has been triggered? What are
>>> the benefits of reporting also the offset within struct scx_event_stats?
>>>
>>
>> @name and @offset are duplicated information. However, I thought
>> having two is more convenient from the users' point of view
>> because they have different pros and cons.
>>
>> @offset is quick to compare and can be used easily in the BPF
>> code, but the offset of an event can change across kernel
>> versions when new events are added. @offset would be good to
>> write a quick trace hook for debugging.
>>
>> On the other hand, @name won't change across kernel versions,
>> which is good. However, it requires more code to acutally read
>> the string in the BPF code (__data_loc for string is a 32-bit
>> integer encoding string length and location).
>>
>> Does it make sense to you?
> So, IMHO @offset to me would make sense if we guarantee that it won't
> change across kernel versions, and that's probably doable, we just need to
> make sure that we always add new events at the bottom of scx_event_stats.
Keeping the offset across versions is possible if we add new
events to the bottom. However, I am not sure if that is what we
want because we lose the nice logical grouping of the events in
the scx_event_stats struct.
> Otherwise there's the risk to break potential users of this tracepoint that
> may consider the offset like a portable ID.
Hmm... I agree. The @offset would be too low level and could be a
potential source of confusion.
> Maybe we can call it @id or @event_id or similar and guarantee its
> portability? What do you think?
Now I think dropping @offset would be better in the long run
because we can maintain scx_event_stats clean and do not create
a source of confusion. Regarding the ease of using @name, adding
a code example in the commit message will suffice, something
like this:
struct tp_add_event {
struct trace_entry ent;
u32 __data_loc_name;
u64 delta;
};
SEC("tracepoint/sched_ext/sched_ext_add_event")
int tp_add_event(struct tp_add_event *ctx)
{
char event_name[128];
unsigned short offset = ctx->__data_loc_name & 0xFFFF;
bpf_probe_read_str((void *)event_name, 128, (char *)ctx + offset);
bpf_printk("name %s delta %llu", event_name, ctx->delta);
return 0;
}
The downside of not having a numerical ID (@offset or @event_id)
is the cost of string comparison to distinguish an event type. If
we assume that probing the event is rare, it will be okay.
@Tejun, @Andrea -- What do you think? Should we provide
a portability-guaranteed @event_id after dropping @offset? Or
would it be more than sufficient to have a string-type event name?
Regards,
Changwoo Min
On Thu, Feb 27, 2025 at 07:23:23PM +0900, Changwoo Min wrote:
> On 25. 2. 27. 17:19, Andrea Righi wrote:
> > On Thu, Feb 27, 2025 at 05:05:54PM +0900, Changwoo Min wrote:
> > Otherwise there's the risk to break potential users of this tracepoint that
...
> > Maybe we can call it @id or @event_id or similar and guarantee its
> > portability? What do you think?
>
> Now I think dropping @offset would be better in the long run
> because we can maintain scx_event_stats clean and do not create
> a source of confusion. Regarding the ease of using @name, adding
> an code example in the commit message will suffice, something
> like this:
>
> struct tp_add_event {
> struct trace_entry ent;
> u32 __data_loc_name;
> u64 delta;
> };
>
> SEC("tracepoint/sched_ext/sched_ext_add_event")
> int tp_add_event(struct tp_add_event *ctx)
> {
> char event_name[128];
> unsigned short offset = ctx->__data_loc_name & 0xFFFF;
> bpf_probe_read_str((void *)event_name, 128, (char *)ctx + offset);
>
> bpf_printk("name %s delta %llu", event_name, ctx->delta);
> return 0;
> }
We can definitely add a BPF code example, but keep in mind that tracepoints
can be used also outside of BPF, like:
$ sudo perf trace -e sched_ext:sched_ext_add_event
In this case I think just having the name is totally fine.
>
> The downside of not having a numerical ID (@offset or @event_id)
> is the cost of string comparison to distinguish an event type. If
> we assume the probing the event is rare, it will be okay.
>
> @Tejun, @Andrea -- What do you think? Should we provide
> a portability-guaranteed @event_id after dropping @offset? Or
> would it be more than sufficient to have a string-type event name?
I think a tracepoint should be used mostly for tracing purposes, not in
critical hot paths. So, under this assumption, the overhead of the string
comparison is probably acceptable and it allows us to not worry too much
about breaking compatibility.
Also, perf trace allows using filters based on strings, so in our case we
can do something like this for example:
$ sudo perf trace -e sched_ext:sched_ext_add_event --filter 'name == "SCX_EV_ENQ_SLICE_DFL"'
While at it, what do you think about renaming this tracepoint
to sched_ext_event or maybe sched_ext_core_event?
Thanks,
-Andrea
On 25. 2. 27. 19:55, Andrea Righi wrote:
> On Thu, Feb 27, 2025 at 07:23:23PM +0900, Changwoo Min wrote:
>> On 25. 2. 27. 17:19, Andrea Righi wrote:
>>> On Thu, Feb 27, 2025 at 05:05:54PM +0900, Changwoo Min wrote:
>>> Otherwise there's the risk to break potential users of this tracepoint that
> ...
>>> Maybe we can call it @id or @event_id or similar and guarantee its
>>> portability? What do you think?
>>
>> Now I think dropping @offset would be better in the long run
>> because we can maintain scx_event_stats clean and do not create
>> a source of confusion. Regarding the ease of using @name, adding
>> an code example in the commit message will suffice, something
>> like this:
>>
>> struct tp_add_event {
>> struct trace_entry ent;
>> u32 __data_loc_name;
>> u64 delta;
>> };
>>
>> SEC("tracepoint/sched_ext/sched_ext_add_event")
>> int tp_add_event(struct tp_add_event *ctx)
>> {
>> char event_name[128];
>> unsigned short offset = ctx->__data_loc_name & 0xFFFF;
>> bpf_probe_read_str((void *)event_name, 128, (char *)ctx + offset);
>>
>> bpf_printk("name %s delta %llu", event_name, ctx->delta);
>> return 0;
>> }
>
> We can definitely add a BPF code example, but keep in mind that tracepoints
> can be used also outside of BPF, like:
>
> $ sudo perf trace -e sched_ext:sched_ext_add_event
>
> In this case I think just having the name is totally fine.
Sure.
>
>>
>> The downside of not having a numerical ID (@offset or @event_id)
>> is the cost of string comparison to distinguish an event type. If
>> we assume the probing the event is rare, it will be okay.
>>
>> @Tejun, @Andrea -- What do you think? Should we provide
>> a portability-guaranteed @event_id after dropping @offset? Or
>> would it be more than sufficient to have a string-type event name?
>
> I think a tracepoint should be used mostly for tracing purposes, not in
> critical hot paths. So, under this assumption, the overhead of the string
> comparison is probably acceptable and it allows us to not worry too much
> about breaking compatibility.
I agree.
>
> Also, perf trace allows to use filters based on strings, so in our case we
> can do something like this for example:
>
> $ sudo perf trace -e sched_ext:sched_ext_add_event --filter 'name == "SCX_EV_ENQ_SLICE_DFL"'
>
> While at it, what do you think about renaming this tracepoint
> sched_ext_event or maybe sched_ext_core_event?
To me, sched_ext_event sounds better than others as it is simple.
Regards,
Changwoo Min
On Wed, Feb 26, 2025 at 11:33:27PM +0900, Changwoo Min wrote:
> Add tracing support, which may be useful for debugging sched_ext schedulers
> that trigger a certain event.
>
> Signed-off-by: Changwoo Min <changwoo@igalia.com>
> ---
> include/trace/events/sched_ext.h | 21 +++++++++++++++++++++
> kernel/sched/ext.c | 4 ++++
> 2 files changed, 25 insertions(+)
>
> diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
> index fe19da7315a9..88527b9316de 100644
> --- a/include/trace/events/sched_ext.h
> +++ b/include/trace/events/sched_ext.h
> @@ -26,6 +26,27 @@ TRACE_EVENT(sched_ext_dump,
> )
> );
>
> +TRACE_EVENT(sched_ext_add_event,
> + TP_PROTO(const char *name, int offset, __u64 added),
> + TP_ARGS(name, offset, added),
> +
> + TP_STRUCT__entry(
> + __string(name, name)
> + __field( int, offset )
> + __field( __u64, added )
> + ),
> +
> + TP_fast_assign(
> + __assign_str(name);
> + __entry->offset = offset;
> + __entry->added = added;
> + ),
> +
> + TP_printk("name %s offset %d added %llu",
> + __get_str(name), __entry->offset, __entry->added
> + )
> +);
> +
> #endif /* _TRACE_SCHED_EXT_H */
>
> /* This part must be outside protection */
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 986b655911df..825e79863057 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1554,6 +1554,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
> */
> #define scx_add_event(name, cnt) do { \
> this_cpu_add(event_stats_cpu.name, cnt); \
> + trace_sched_ext_add_event(#name, \
> + offsetof(struct scx_event_stats, name), cnt); \
> } while(0)
>
> /**
> @@ -1565,6 +1567,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
> */
> #define __scx_add_event(name, cnt) do { \
> __this_cpu_add(event_stats_cpu.name, cnt); \
> + trace_sched_ext_add_event(#name, \
> + offsetof(struct scx_event_stats, name), cnt); \
> } while(0)
>
> /**
LGTM
Reviewed-by: Mukesh Kumar Chaurasiya <mchauras@linux.ibm.com>
> --
> 2.48.1
>
Hello,
On Wed, Feb 26, 2025 at 11:33:27PM +0900, Changwoo Min wrote:
> Add tracing support, which may be useful for debugging sched_ext schedulers
> that trigger a certain event.
>
> Signed-off-by: Changwoo Min <changwoo@igalia.com>
> ---
> include/trace/events/sched_ext.h | 21 +++++++++++++++++++++
> kernel/sched/ext.c | 4 ++++
> 2 files changed, 25 insertions(+)
>
> diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
> index fe19da7315a9..88527b9316de 100644
> --- a/include/trace/events/sched_ext.h
> +++ b/include/trace/events/sched_ext.h
> @@ -26,6 +26,27 @@ TRACE_EVENT(sched_ext_dump,
> )
> );
>
> +TRACE_EVENT(sched_ext_add_event,
> + TP_PROTO(const char *name, int offset, __u64 added),
> + TP_ARGS(name, offset, added),
Can we do sched_ext_event with @delta? Otherwise, looks fine to me.
Thanks.
--
tejun
Hi Tejun,
On 25. 2. 27. 03:51, Tejun Heo wrote:
> Hello,
>
> On Wed, Feb 26, 2025 at 11:33:27PM +0900, Changwoo Min wrote:
>> Add tracing support, which may be useful for debugging sched_ext schedulers
>> that trigger a certain event.
>>
>> Signed-off-by: Changwoo Min <changwoo@igalia.com>
>> ---
>> include/trace/events/sched_ext.h | 21 +++++++++++++++++++++
>> kernel/sched/ext.c | 4 ++++
>> 2 files changed, 25 insertions(+)
>>
>> diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
>> index fe19da7315a9..88527b9316de 100644
>> --- a/include/trace/events/sched_ext.h
>> +++ b/include/trace/events/sched_ext.h
>> @@ -26,6 +26,27 @@ TRACE_EVENT(sched_ext_dump,
>> )
>> );
>>
>> +TRACE_EVENT(sched_ext_add_event,
>> + TP_PROTO(const char *name, int offset, __u64 added),
>> + TP_ARGS(name, offset, added),
>
> Can we do sched_ext_event with @delta? Otherwise, looks fine to me.
Sure, @delta is clearer. I will change it as suggested.
Regards,
Changwoo Min
© 2016 - 2025 Red Hat, Inc.