[RFC PATCH V3] sched: psi: Add psi events trace point

Xuewen Yan posted 1 patch 3 weeks ago
There is a newer version of this series
include/trace/events/sched.h | 27 +++++++++++++++++++++++++++
kernel/sched/psi.c           |  4 ++++
2 files changed, 31 insertions(+)
[RFC PATCH V3] sched: psi: Add psi events trace point
Posted by Xuewen Yan 3 weeks ago
Add a tracepoint to the PSI triggers. This makes it possible to
observe PSI events from kernel space.

One use case is monitoring memory pressure: when the pressure
gets too high, a process can be killed from kernel space to
prevent OOM.

Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
---
v3:
- expose the event in tracefs;
---
v2:
- fix a compilation error;
- export the tracepoint;
- expand the commit message;
---
 include/trace/events/sched.h | 27 +++++++++++++++++++++++++++
 kernel/sched/psi.c           |  4 ++++
 2 files changed, 31 insertions(+)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7b2645b50e78..db8b8f25466e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -826,6 +826,33 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
 	TP_printk("cpu=%d", __entry->cpu)
 );
 
+#ifdef CONFIG_PSI
+TRACE_EVENT(psi_event,
+
+	TP_PROTO(int aggregator, int state, u64 threshold, u64 win_size),
+
+	TP_ARGS(aggregator, state, threshold, win_size),
+
+	TP_STRUCT__entry(
+		__field(int,	aggregator)
+		__field(int,	state)
+		__field(u64,	threshold)
+		__field(u64,	win_size)
+	),
+
+	TP_fast_assign(
+		__entry->aggregator	= aggregator;
+		__entry->state		= state;
+		__entry->threshold	= threshold;
+		__entry->win_size	= win_size;
+	),
+
+	TP_printk("aggregator=%d state=%d threshold=%llu window_size=%llu",
+		__entry->aggregator, __entry->state, __entry->threshold,
+		__entry->win_size)
+);
+#endif /* CONFIG_PSI */
+
 /*
  * Following tracepoints are not exported in tracefs and provide hooking
  * mechanisms only for testing and debugging purposes.
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 59fdb7ebbf22..5b7954b653ed 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -141,6 +141,8 @@
 #include <linux/psi.h>
 #include "sched.h"
 
+EXPORT_TRACEPOINT_SYMBOL_GPL(psi_event);
+
 static int psi_bug __read_mostly;
 
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
@@ -509,6 +511,8 @@ static void update_triggers(struct psi_group *group, u64 now,
 		if (now < t->last_event_time + t->win.size)
 			continue;
 
+		trace_psi_event(aggregator, t->state, t->threshold, t->win.size);
+
 		/* Generate an event */
 		if (cmpxchg(&t->event, 0, 1) == 0) {
 			if (t->of)
-- 
2.25.1
Re: [RFC PATCH V3] sched: psi: Add psi events trace point
Posted by Xuewen Yan 1 week, 2 days ago
A very gentle ping on this patch.
Best regards,
Thanks!

On Thu, Sep 11, 2025 at 11:30 AM Xuewen Yan <xuewen.yan@unisoc.com> wrote:
>
> Add trace point to psi triggers. This is useful to
> observe the psi events in the kernel space.
>
> One use of this is to monitor memory pressure.
> When the pressure is too high, we can kill the process
> in the kernel space to prevent OOM.
>
> Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
> ---
> V3:
> -export it in the tracefs;
> ---
> v2:
> -fix compilation error;
> -export the tp;
> -add more commit message;
> ---
>  include/trace/events/sched.h | 27 +++++++++++++++++++++++++++
>  kernel/sched/psi.c           |  4 ++++
>  2 files changed, 31 insertions(+)
>
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index 7b2645b50e78..db8b8f25466e 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -826,6 +826,33 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
>         TP_printk("cpu=%d", __entry->cpu)
>  );
>
> +#ifdef CONFIG_PSI
> +TRACE_EVENT(psi_event,
> +
> +       TP_PROTO(int aggregator, int state, u64 threshold, u64 win_size),
> +
> +       TP_ARGS(aggregator, state, threshold, win_size),
> +
> +       TP_STRUCT__entry(
> +               __field(int,    aggregator)
> +               __field(int,    state)
> +               __field(u64,    threshold)
> +               __field(u64,    win_size)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->aggregator     = aggregator;
> +               __entry->state          = state;
> +               __entry->threshold      = threshold;
> +               __entry->win_size       = win_size;
> +       ),
> +
> +       TP_printk("aggregator=%d state=%d threshold=%llu window_size=%llu",
> +               __entry->aggregator, __entry->state, __entry->threshold,
> +               __entry->win_size)
> +);
> +#endif /* CONFIG_PSI */
> +
>  /*
>   * Following tracepoints are not exported in tracefs and provide hooking
>   * mechanisms only for testing and debugging purposes.
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 59fdb7ebbf22..5b7954b653ed 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -141,6 +141,8 @@
>  #include <linux/psi.h>
>  #include "sched.h"
>
> +EXPORT_TRACEPOINT_SYMBOL_GPL(psi_event);
> +
>  static int psi_bug __read_mostly;
>
>  DEFINE_STATIC_KEY_FALSE(psi_disabled);
> @@ -509,6 +511,8 @@ static void update_triggers(struct psi_group *group, u64 now,
>                 if (now < t->last_event_time + t->win.size)
>                         continue;
>
> +               trace_psi_event(aggregator, t->state, t->threshold, t->win.size);
> +
>                 /* Generate an event */
>                 if (cmpxchg(&t->event, 0, 1) == 0) {
>                         if (t->of)
> --
> 2.25.1
>
Re: [RFC PATCH V3] sched: psi: Add psi events trace point
Posted by Suren Baghdasaryan 1 week, 2 days ago
On Mon, Sep 22, 2025 at 7:58 PM Xuewen Yan <xuewen.yan94@gmail.com> wrote:
>
> A very gentle ping on this patch.
> Best regards,
> Thanks!

Thanks! Sorry I missed it.

>
> On Thu, Sep 11, 2025 at 11:30 AM Xuewen Yan <xuewen.yan@unisoc.com> wrote:
> >
> > Add trace point to psi triggers. This is useful to
> > observe the psi events in the kernel space.
> >
> > One use of this is to monitor memory pressure.
> > When the pressure is too high, we can kill the process
> > in the kernel space to prevent OOM.
> >
> > Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
> > ---
> > V3:
> > -export it in the tracefs;
> > ---
> > v2:
> > -fix compilation error;
> > -export the tp;
> > -add more commit message;
> > ---
> >  include/trace/events/sched.h | 27 +++++++++++++++++++++++++++
> >  kernel/sched/psi.c           |  4 ++++
> >  2 files changed, 31 insertions(+)
> >
> > diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> > index 7b2645b50e78..db8b8f25466e 100644
> > --- a/include/trace/events/sched.h
> > +++ b/include/trace/events/sched.h
> > @@ -826,6 +826,33 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
> >         TP_printk("cpu=%d", __entry->cpu)
> >  );
> >
> > +#ifdef CONFIG_PSI
> > +TRACE_EVENT(psi_event,
> > +
> > +       TP_PROTO(int aggregator, int state, u64 threshold, u64 win_size),
> > +
> > +       TP_ARGS(aggregator, state, threshold, win_size),
> > +
> > +       TP_STRUCT__entry(
> > +               __field(int,    aggregator)
> > +               __field(int,    state)
> > +               __field(u64,    threshold)
> > +               __field(u64,    win_size)
> > +       ),
> > +
> > +       TP_fast_assign(
> > +               __entry->aggregator     = aggregator;
> > +               __entry->state          = state;
> > +               __entry->threshold      = threshold;
> > +               __entry->win_size       = win_size;
> > +       ),
> > +
> > +       TP_printk("aggregator=%d state=%d threshold=%llu window_size=%llu",
> > +               __entry->aggregator, __entry->state, __entry->threshold,
> > +               __entry->win_size)
> > +);
> > +#endif /* CONFIG_PSI */
> > +
> >  /*
> >   * Following tracepoints are not exported in tracefs and provide hooking
> >   * mechanisms only for testing and debugging purposes.
> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > index 59fdb7ebbf22..5b7954b653ed 100644
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -141,6 +141,8 @@
> >  #include <linux/psi.h>
> >  #include "sched.h"
> >
> > +EXPORT_TRACEPOINT_SYMBOL_GPL(psi_event);
> > +
> >  static int psi_bug __read_mostly;
> >
> >  DEFINE_STATIC_KEY_FALSE(psi_disabled);
> > @@ -509,6 +511,8 @@ static void update_triggers(struct psi_group *group, u64 now,
> >                 if (now < t->last_event_time + t->win.size)
> >                         continue;
> >
> > +               trace_psi_event(aggregator, t->state, t->threshold, t->win.size);

Again, this trace event should be generated only after cmpxchg()
passes the check. Otherwise trace events might be generated when
actual PSI events are not (false positives). That disconnect is not
acceptable.

> > +
> >                 /* Generate an event */
> >                 if (cmpxchg(&t->event, 0, 1) == 0) {
> >                         if (t->of)
> > --
> > 2.25.1
> >
Re: [RFC PATCH V3] sched: psi: Add psi events trace point
Posted by Xuewen Yan 1 week, 2 days ago
On Tue, Sep 23, 2025 at 11:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Sep 22, 2025 at 7:58 PM Xuewen Yan <xuewen.yan94@gmail.com> wrote:
> >
> > A very gentle ping on this patch.
> > Best regards,
> > Thanks!
>
> Thanks! Sorry I missed it.
Not at all, thanks for your reply:)

>
> >
> > On Thu, Sep 11, 2025 at 11:30 AM Xuewen Yan <xuewen.yan@unisoc.com> wrote:
> > >
> > > Add trace point to psi triggers. This is useful to
> > > observe the psi events in the kernel space.
> > >
> > > One use of this is to monitor memory pressure.
> > > When the pressure is too high, we can kill the process
> > > in the kernel space to prevent OOM.
> > >
> > > Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
> > > ---
> > > V3:
> > > -export it in the tracefs;
> > > ---
> > > v2:
> > > -fix compilation error;
> > > -export the tp;
> > > -add more commit message;
> > > ---
> > >  include/trace/events/sched.h | 27 +++++++++++++++++++++++++++
> > >  kernel/sched/psi.c           |  4 ++++
> > >  2 files changed, 31 insertions(+)
> > >
> > > diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> > > index 7b2645b50e78..db8b8f25466e 100644
> > > --- a/include/trace/events/sched.h
> > > +++ b/include/trace/events/sched.h
> > > @@ -826,6 +826,33 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
> > >         TP_printk("cpu=%d", __entry->cpu)
> > >  );
> > >
> > > +#ifdef CONFIG_PSI
> > > +TRACE_EVENT(psi_event,
> > > +
> > > +       TP_PROTO(int aggregator, int state, u64 threshold, u64 win_size),
> > > +
> > > +       TP_ARGS(aggregator, state, threshold, win_size),
> > > +
> > > +       TP_STRUCT__entry(
> > > +               __field(int,    aggregator)
> > > +               __field(int,    state)
> > > +               __field(u64,    threshold)
> > > +               __field(u64,    win_size)
> > > +       ),
> > > +
> > > +       TP_fast_assign(
> > > +               __entry->aggregator     = aggregator;
> > > +               __entry->state          = state;
> > > +               __entry->threshold      = threshold;
> > > +               __entry->win_size       = win_size;
> > > +       ),
> > > +
> > > +       TP_printk("aggregator=%d state=%d threshold=%llu window_size=%llu",
> > > +               __entry->aggregator, __entry->state, __entry->threshold,
> > > +               __entry->win_size)
> > > +);
> > > +#endif /* CONFIG_PSI */
> > > +
> > >  /*
> > >   * Following tracepoints are not exported in tracefs and provide hooking
> > >   * mechanisms only for testing and debugging purposes.
> > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > index 59fdb7ebbf22..5b7954b653ed 100644
> > > --- a/kernel/sched/psi.c
> > > +++ b/kernel/sched/psi.c
> > > @@ -141,6 +141,8 @@
> > >  #include <linux/psi.h>
> > >  #include "sched.h"
> > >
> > > +EXPORT_TRACEPOINT_SYMBOL_GPL(psi_event);
> > > +
> > >  static int psi_bug __read_mostly;
> > >
> > >  DEFINE_STATIC_KEY_FALSE(psi_disabled);
> > > @@ -509,6 +511,8 @@ static void update_triggers(struct psi_group *group, u64 now,
> > >                 if (now < t->last_event_time + t->win.size)
> > >                         continue;
> > >
> > > +               trace_psi_event(aggregator, t->state, t->threshold, t->win.size);
>
> Again, this trace event should be generated only after cmpxchg()
> passes the check. Otherwise trace events might be generated when
> actual PSI events are not (false positives). That disconnect is not
> acceptable.

You’re absolutely right. I’ll resolve it in the next patch.

Thanks!

BR
---
xuewen

>
> > > +
> > >                 /* Generate an event */
> > >                 if (cmpxchg(&t->event, 0, 1) == 0) {
> > >                         if (t->of)
> > > --
> > > 2.25.1
> > >