Trace task's preemption due to a wakeups, getting delayed. Which
can occurs when the running task requested extra time on cpu.
Also, indicate the NEED_RESCHED flag set on the task getting
cleared.
Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
---
include/trace/events/sched.h | 28 ++++++++++++++++++++++++++++
kernel/entry/common.c | 8 ++++++--
2 files changed, 34 insertions(+), 2 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 8994e97d86c1..4aa04044b14a 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -296,6 +296,34 @@ TRACE_EVENT(sched_migrate_task,
__entry->orig_cpu, __entry->dest_cpu)
);
+/*
+ * Tracepoint for delayed resched requested by task:
+ */
+TRACE_EVENT(sched_delay_resched,
+
+ TP_PROTO(struct task_struct *p, unsigned int resched_flg),
+
+ TP_ARGS(p, resched_flg),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, cpu )
+ __field( int, flg )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->cpu = task_cpu(p);
+ __entry->flg = resched_flg;
+ ),
+
+ TP_printk("comm=%s pid=%d cpu=%d resched_flg_cleared=0x%x",
+ __entry->comm, __entry->pid, __entry->cpu, __entry->flg)
+
+);
+
DECLARE_EVENT_CLASS(sched_process_template,
TP_PROTO(struct task_struct *p),
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index b26adccb32df..1951e6a4e9bc 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -12,6 +12,7 @@
#include "common.h"
+#include <trace/events/sched.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -100,9 +101,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_enable_exit_to_user(ti_work);
if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
- if (irq && rseq_delay_resched())
+ if (irq && rseq_delay_resched()) {
clear_tsk_need_resched(current);
- else
+ trace_sched_delay_resched(current,
+ ti_work & (_TIF_NEED_RESCHED |
+ _TIF_NEED_RESCHED_LAZY));
+ } else
schedule();
}
--
2.43.5
On Fri, May 02, 2025 at 01:59:55AM +0000, Prakash Sangappa wrote:
> Trace task's preemption due to a wakeups, getting delayed. Which
> can occurs when the running task requested extra time on cpu.
> Also, indicate the NEED_RESCHED flag set on the task getting
> cleared.
>
> Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
> ---
> include/trace/events/sched.h | 28 ++++++++++++++++++++++++++++
> kernel/entry/common.c | 8 ++++++--
> 2 files changed, 34 insertions(+), 2 deletions(-)
>
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index 8994e97d86c1..4aa04044b14a 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -296,6 +296,34 @@ TRACE_EVENT(sched_migrate_task,
> __entry->orig_cpu, __entry->dest_cpu)
> );
>
> +/*
> + * Tracepoint for delayed resched requested by task:
> + */
> +TRACE_EVENT(sched_delay_resched,
> +
> + TP_PROTO(struct task_struct *p, unsigned int resched_flg),
> +
> + TP_ARGS(p, resched_flg),
> +
> + TP_STRUCT__entry(
> + __array( char, comm, TASK_COMM_LEN )
> + __field( pid_t, pid )
> + __field( int, cpu )
> + __field( int, flg )
> + ),
> +
> + TP_fast_assign(
> + memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
> + __entry->pid = p->pid;
> + __entry->cpu = task_cpu(p);
> + __entry->flg = resched_flg;
> + ),
> +
> + TP_printk("comm=%s pid=%d cpu=%d resched_flg_cleared=0x%x",
> + __entry->comm, __entry->pid, __entry->cpu, __entry->flg)
> +
> +);
> +
> DECLARE_EVENT_CLASS(sched_process_template,
>
> TP_PROTO(struct task_struct *p),
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index b26adccb32df..1951e6a4e9bc 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -12,6 +12,7 @@
>
> #include "common.h"
>
> +#include <trace/events/sched.h>
> #define CREATE_TRACE_POINTS
> #include <trace/events/syscalls.h>
>
> @@ -100,9 +101,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> local_irq_enable_exit_to_user(ti_work);
>
> if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
> - if (irq && rseq_delay_resched())
> + if (irq && rseq_delay_resched()) {
> clear_tsk_need_resched(current);
> - else
> + trace_sched_delay_resched(current,
> + ti_work & (_TIF_NEED_RESCHED |
> + _TIF_NEED_RESCHED_LAZY));
> + } else
> schedule();
> }
This is horrific coding style. But really why do we need this? I'm not,
in general, a fan of tracepoints.
> On May 2, 2025, at 2:14 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, May 02, 2025 at 01:59:55AM +0000, Prakash Sangappa wrote:
>> Trace task's preemption due to a wakeups, getting delayed. Which
>> can occurs when the running task requested extra time on cpu.
>> Also, indicate the NEED_RESCHED flag set on the task getting
>> cleared.
>>
>> Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
>> ---
>> include/trace/events/sched.h | 28 ++++++++++++++++++++++++++++
>> kernel/entry/common.c | 8 ++++++--
>> 2 files changed, 34 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
>> index 8994e97d86c1..4aa04044b14a 100644
>> --- a/include/trace/events/sched.h
>> +++ b/include/trace/events/sched.h
>> @@ -296,6 +296,34 @@ TRACE_EVENT(sched_migrate_task,
>> __entry->orig_cpu, __entry->dest_cpu)
>> );
>>
>> +/*
>> + * Tracepoint for delayed resched requested by task:
>> + */
>> +TRACE_EVENT(sched_delay_resched,
>> +
>> + TP_PROTO(struct task_struct *p, unsigned int resched_flg),
>> +
>> + TP_ARGS(p, resched_flg),
>> +
>> + TP_STRUCT__entry(
>> + __array( char, comm, TASK_COMM_LEN )
>> + __field( pid_t, pid )
>> + __field( int, cpu )
>> + __field( int, flg )
>> + ),
>> +
>> + TP_fast_assign(
>> + memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
>> + __entry->pid = p->pid;
>> + __entry->cpu = task_cpu(p);
>> + __entry->flg = resched_flg;
>> + ),
>> +
>> + TP_printk("comm=%s pid=%d cpu=%d resched_flg_cleared=0x%x",
>> + __entry->comm, __entry->pid, __entry->cpu, __entry->flg)
>> +
>> +);
>> +
>> DECLARE_EVENT_CLASS(sched_process_template,
>>
>> TP_PROTO(struct task_struct *p),
>> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
>> index b26adccb32df..1951e6a4e9bc 100644
>> --- a/kernel/entry/common.c
>> +++ b/kernel/entry/common.c
>> @@ -12,6 +12,7 @@
>>
>> #include "common.h"
>>
>> +#include <trace/events/sched.h>
>> #define CREATE_TRACE_POINTS
>> #include <trace/events/syscalls.h>
>>
>> @@ -100,9 +101,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>> local_irq_enable_exit_to_user(ti_work);
>>
>> if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
>> - if (irq && rseq_delay_resched())
>> + if (irq && rseq_delay_resched()) {
>> clear_tsk_need_resched(current);
>> - else
>> + trace_sched_delay_resched(current,
>> + ti_work & (_TIF_NEED_RESCHED |
>> + _TIF_NEED_RESCHED_LAZY));
>> + } else
>> schedule();
>> }
>
> This is horrific coding style. But really why do we need this? I'm not,
> in general, a fan of tracepoints.
I will move the trace_sched_delay* call towards the end of the routine,
assuming we are keeping this tracepoint.
On 2025-05-02 11:14:34 [+0200], Peter Zijlstra wrote: > This is horrific coding style. But really why do we need this? I'm not, > in general, a fan of tracepoints. I suggested a tracepoint so that we see why the NEED_RESCHED flag disappeared. The trace would show a wakeup, the need_resched flag would be set and the flag would disappear without an explanation. Sebastian
On Fri, May 02, 2025 at 01:02:10PM +0200, Sebastian Andrzej Siewior wrote: > On 2025-05-02 11:14:34 [+0200], Peter Zijlstra wrote: > > This is horrific coding style. But really why do we need this? I'm not, > > in general, a fan of tracepoints. > > I suggested a tracepoint so that we see why the NEED_RESCHED flag > disappeared. The trace would show a wakeup, the need_resched flag would > be set and the flag would disappear without an explanation. Would we not see a timer_start event for the hrtick or somesuch?
On 2025-05-02 13:10:44 [+0200], Peter Zijlstra wrote: > On Fri, May 02, 2025 at 01:02:10PM +0200, Sebastian Andrzej Siewior wrote: > > On 2025-05-02 11:14:34 [+0200], Peter Zijlstra wrote: > > > This is horrific coding style. But really why do we need this? I'm not, > > > in general, a fan of tracepoints. > > > > I suggested a tracepoint so that we see why the NEED_RESCHED flag > > disappeared. The trace would show a wakeup, the need_resched flag would > > be set and the flag would disappear without an explanation. > > Would we not see a timer_start event for the hrtick or somesuch? Yes, we do. That would mean we need timer:hrtimer* in order for the sched:* to make sense. Sebastian
© 2016 - 2026 Red Hat, Inc.