[RFC PATCH V4] sched: psi: Add psi events trace point

Xuewen Yan posted 1 patch 2 days, 19 hours ago
include/trace/events/sched.h | 27 +++++++++++++++++++++++++++
kernel/sched/psi.c           |  5 +++++
2 files changed, 32 insertions(+)
[RFC PATCH V4] sched: psi: Add psi events trace point
Posted by Xuewen Yan 2 days, 19 hours ago
Add a tracepoint to the psi triggers. This is useful for
observing psi events from kernel space.

One use case is monitoring memory pressure:
when the pressure is too high, we can kill a process
from kernel space to prevent OOM.

Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
---
V4:
-generate the event only after cmpxchg() passes the check
---
V3:
-export it in the tracefs;
---
v2:
-fix compilation error;
-export the tp;
-add more commit message;
---
 include/trace/events/sched.h | 27 +++++++++++++++++++++++++++
 kernel/sched/psi.c           |  5 +++++
 2 files changed, 32 insertions(+)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7b2645b50e78..db8b8f25466e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -826,6 +826,33 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
 	TP_printk("cpu=%d", __entry->cpu)
 );
 
+#ifdef CONFIG_PSI
+TRACE_EVENT(psi_event,
+
+	TP_PROTO(int aggregator, int state, u64 threshold, u64 win_size),
+
+	TP_ARGS(aggregator, state, threshold, win_size),
+
+	TP_STRUCT__entry(
+		__field(int,	aggregator)
+		__field(int,	state)
+		__field(u64,	threshold)
+		__field(u64,	win_size)
+	),
+
+	TP_fast_assign(
+		__entry->aggregator	= aggregator;
+		__entry->state		= state;
+		__entry->threshold	= threshold;
+		__entry->win_size	= win_size;
+	),
+
+	TP_printk("aggregator=%d state=%d threshold=%llu window_size=%llu",
+		__entry->aggregator, __entry->state, __entry->threshold,
+		__entry->win_size)
+);
+#endif /* CONFIG_PSI */
+
 /*
  * Following tracepoints are not exported in tracefs and provide hooking
  * mechanisms only for testing and debugging purposes.
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 59fdb7ebbf22..e8a7fd04ba9f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -141,6 +141,8 @@
 #include <linux/psi.h>
 #include "sched.h"
 
+EXPORT_TRACEPOINT_SYMBOL_GPL(psi_event);
+
 static int psi_bug __read_mostly;
 
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
@@ -515,6 +517,9 @@ static void update_triggers(struct psi_group *group, u64 now,
 				kernfs_notify(t->of->kn);
 			else
 				wake_up_interruptible(&t->event_wait);
+
+			trace_psi_event(aggregator, t->state, t->threshold,
+					t->win.size);
 		}
 		t->last_event_time = now;
 		/* Reset threshold breach flag once event got generated */
-- 
2.25.1
Re: [RFC PATCH V4] sched: psi: Add psi events trace point
Posted by Suren Baghdasaryan 1 day, 22 hours ago
On Sun, Sep 28, 2025 at 6:43 PM Xuewen Yan <xuewen.yan@unisoc.com> wrote:
>
> Add trace point to psi triggers. This is useful to
> observe the psi events in the kernel space.
>
> One use of this is to monitor memory pressure.
> When the pressure is too high, we can kill the process
> in the kernel space to prevent OOM.

Just FYI, Roman is working on a BPF-based oom-killer solution [1]
which might be also interesting for you and this tracepoint might be
useful for Roman as well. CC'ing him here.

[1] https://lore.kernel.org/all/20250818170136.209169-1-roman.gushchin@linux.dev/
>
> Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>

Acked-by: Suren Baghdasaryan <surenb@google.com>

> ---
> V4:
> -generate the event only after cmpxchg() passes the check
> ---
> V3:
> -export it in the tracefs;
> ---
> v2:
> -fix compilation error;
> -export the tp;
> -add more commit message;
> ---
>  include/trace/events/sched.h | 27 +++++++++++++++++++++++++++
>  kernel/sched/psi.c           |  5 +++++
>  2 files changed, 32 insertions(+)
>
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index 7b2645b50e78..db8b8f25466e 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -826,6 +826,33 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
>         TP_printk("cpu=%d", __entry->cpu)
>  );
>
> +#ifdef CONFIG_PSI
> +TRACE_EVENT(psi_event,
> +
> +       TP_PROTO(int aggregator, int state, u64 threshold, u64 win_size),
> +
> +       TP_ARGS(aggregator, state, threshold, win_size),
> +
> +       TP_STRUCT__entry(
> +               __field(int,    aggregator)
> +               __field(int,    state)
> +               __field(u64,    threshold)
> +               __field(u64,    win_size)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->aggregator     = aggregator;
> +               __entry->state          = state;
> +               __entry->threshold      = threshold;
> +               __entry->win_size       = win_size;
> +       ),
> +
> +       TP_printk("aggregator=%d state=%d threshold=%llu window_size=%llu",
> +               __entry->aggregator, __entry->state, __entry->threshold,
> +               __entry->win_size)
> +);
> +#endif /* CONFIG_PSI */
> +
>  /*
>   * Following tracepoints are not exported in tracefs and provide hooking
>   * mechanisms only for testing and debugging purposes.
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 59fdb7ebbf22..e8a7fd04ba9f 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -141,6 +141,8 @@
>  #include <linux/psi.h>
>  #include "sched.h"
>
> +EXPORT_TRACEPOINT_SYMBOL_GPL(psi_event);
> +
>  static int psi_bug __read_mostly;
>
>  DEFINE_STATIC_KEY_FALSE(psi_disabled);
> @@ -515,6 +517,9 @@ static void update_triggers(struct psi_group *group, u64 now,
>                                 kernfs_notify(t->of->kn);
>                         else
>                                 wake_up_interruptible(&t->event_wait);
> +
> +                       trace_psi_event(aggregator, t->state, t->threshold,
> +                                       t->win.size);
>                 }
>                 t->last_event_time = now;
>                 /* Reset threshold breach flag once event got generated */
> --
> 2.25.1
>
>
Re: [RFC PATCH V4] sched: psi: Add psi events trace point
Posted by Xuewen Yan 1 day, 18 hours ago
On Tue, Sep 30, 2025 at 7:17 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Sun, Sep 28, 2025 at 6:43 PM Xuewen Yan <xuewen.yan@unisoc.com> wrote:
> >
> > Add trace point to psi triggers. This is useful to
> > observe the psi events in the kernel space.
> >
> > One use of this is to monitor memory pressure.
> > When the pressure is too high, we can kill the process
> > in the kernel space to prevent OOM.
>
> Just FYI, Roman is working on a BPF-based oom-killer solution [1]
> which might be also interesting for you and this tracepoint might be
> useful for Roman as well. CC'ing him here.
>
> [1] https://lore.kernel.org/all/20250818170136.209169-1-roman.gushchin@linux.dev/

Thanks for the review and the BPF patch; I'm interested in these.
> >
> > Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
>
> Acked-by: Suren Baghdasaryan <surenb@google.com>

Thanks!
---
BR
>
> > ---
> > V4:
> > -generate the event only after cmpxchg() passes the check
> > ---
> > V3:
> > -export it in the tracefs;
> > ---
> > v2:
> > -fix compilation error;
> > -export the tp;
> > -add more commit message;
> > ---
> >  include/trace/events/sched.h | 27 +++++++++++++++++++++++++++
> >  kernel/sched/psi.c           |  5 +++++
> >  2 files changed, 32 insertions(+)
> >
> > diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> > index 7b2645b50e78..db8b8f25466e 100644
> > --- a/include/trace/events/sched.h
> > +++ b/include/trace/events/sched.h
> > @@ -826,6 +826,33 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
> >         TP_printk("cpu=%d", __entry->cpu)
> >  );
> >
> > +#ifdef CONFIG_PSI
> > +TRACE_EVENT(psi_event,
> > +
> > +       TP_PROTO(int aggregator, int state, u64 threshold, u64 win_size),
> > +
> > +       TP_ARGS(aggregator, state, threshold, win_size),
> > +
> > +       TP_STRUCT__entry(
> > +               __field(int,    aggregator)
> > +               __field(int,    state)
> > +               __field(u64,    threshold)
> > +               __field(u64,    win_size)
> > +       ),
> > +
> > +       TP_fast_assign(
> > +               __entry->aggregator     = aggregator;
> > +               __entry->state          = state;
> > +               __entry->threshold      = threshold;
> > +               __entry->win_size       = win_size;
> > +       ),
> > +
> > +       TP_printk("aggregator=%d state=%d threshold=%llu window_size=%llu",
> > +               __entry->aggregator, __entry->state, __entry->threshold,
> > +               __entry->win_size)
> > +);
> > +#endif /* CONFIG_PSI */
> > +
> >  /*
> >   * Following tracepoints are not exported in tracefs and provide hooking
> >   * mechanisms only for testing and debugging purposes.
> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > index 59fdb7ebbf22..e8a7fd04ba9f 100644
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -141,6 +141,8 @@
> >  #include <linux/psi.h>
> >  #include "sched.h"
> >
> > +EXPORT_TRACEPOINT_SYMBOL_GPL(psi_event);
> > +
> >  static int psi_bug __read_mostly;
> >
> >  DEFINE_STATIC_KEY_FALSE(psi_disabled);
> > @@ -515,6 +517,9 @@ static void update_triggers(struct psi_group *group, u64 now,
> >                                 kernfs_notify(t->of->kn);
> >                         else
> >                                 wake_up_interruptible(&t->event_wait);
> > +
> > +                       trace_psi_event(aggregator, t->state, t->threshold,
> > +                                       t->win.size);
> >                 }
> >                 t->last_event_time = now;
> >                 /* Reset threshold breach flag once event got generated */
> > --
> > 2.25.1
> >
> >