[PATCH v2 03/11] sched: Add sched tracepoints for RV task model

Gabriele Monaco posted 11 patches 10 months, 1 week ago
There is a newer version of this series
[PATCH v2 03/11] sched: Add sched tracepoints for RV task model
Posted by Gabriele Monaco 10 months, 1 week ago
Add the following tracepoints:
* sched_entry(bool preempt, ip)
    Called while entering __schedule
* sched_exit(bool is_switch, ip)
    Called while exiting __schedule
* sched_set_state(task, curr_state, state)
    Called when a task changes its state (to and from running)

These tracepoints are useful to describe the Linux task model and are
adapted from the patches by Daniel Bristot de Oliveira
(https://bristot.me/linux-task-model/).

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/rv.h                 |  2 +-
 include/linux/sched.h              | 16 ++++++++++++++++
 include/trace/events/sched.h       | 13 +++++++++++++
 kernel/sched/core.c                | 26 +++++++++++++++++++++++++-
 tools/verification/rv/include/rv.h |  2 +-
 5 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/include/linux/rv.h b/include/linux/rv.h
index 8883b41d88ec4..55d458be53a4c 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -7,7 +7,7 @@
 #ifndef _LINUX_RV_H
 #define _LINUX_RV_H
 
-#define MAX_DA_NAME_LEN	24
+#define MAX_DA_NAME_LEN	32
 
 #ifdef CONFIG_RV
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9632e3318e0d6..60d2cda16a136 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -46,6 +46,7 @@
 #include <linux/rv.h>
 #include <linux/livepatch_sched.h>
 #include <linux/uidgid_types.h>
+#include <linux/tracepoint-defs.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
@@ -186,6 +187,12 @@ struct user_event_mm;
 # define debug_rtlock_wait_restore_state()	do { } while (0)
 #endif
 
+#define trace_set_current_state(state_value)                     \
+	do {                                                     \
+		if (tracepoint_enabled(sched_set_state_tp))      \
+			__do_trace_set_current_state(state_value); \
+	} while (0)
+
 /*
  * set_current_state() includes a barrier so that the write of current->__state
  * is correctly serialised wrt the caller's subsequent test of whether to
@@ -226,12 +233,14 @@ struct user_event_mm;
 #define __set_current_state(state_value)				\
 	do {								\
 		debug_normal_state_change((state_value));		\
+		trace_set_current_state(state_value);			\
 		WRITE_ONCE(current->__state, (state_value));		\
 	} while (0)
 
 #define set_current_state(state_value)					\
 	do {								\
 		debug_normal_state_change((state_value));		\
+		trace_set_current_state(state_value);			\
 		smp_store_mb(current->__state, (state_value));		\
 	} while (0)
 
@@ -247,6 +256,7 @@ struct user_event_mm;
 									\
 		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
 		debug_special_state_change((state_value));		\
+		trace_set_current_state(state_value);			\
 		WRITE_ONCE(current->__state, (state_value));		\
 		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
 	} while (0)
@@ -282,6 +292,7 @@ struct user_event_mm;
 		raw_spin_lock(&current->pi_lock);			\
 		current->saved_state = current->__state;		\
 		debug_rtlock_wait_set_state();				\
+		trace_set_current_state(TASK_RTLOCK_WAIT);		\
 		WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);		\
 		raw_spin_unlock(&current->pi_lock);			\
 	} while (0);
@@ -291,6 +302,7 @@ struct user_event_mm;
 		lockdep_assert_irqs_disabled();				\
 		raw_spin_lock(&current->pi_lock);			\
 		debug_rtlock_wait_restore_state();			\
+		trace_set_current_state(TASK_RUNNING);			\
 		WRITE_ONCE(current->__state, current->saved_state);	\
 		current->saved_state = TASK_RUNNING;			\
 		raw_spin_unlock(&current->pi_lock);			\
@@ -327,6 +339,10 @@ extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
+/* wrapper function to trace from this header file */
+DECLARE_TRACEPOINT(sched_set_state_tp);
+extern void __do_trace_set_current_state(int state_value);
+
 /**
  * struct prev_cputime - snapshot of system and user cputime
  * @utime: time spent in user mode
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9ea4c404bd4ef..cc3be04fe9986 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -824,6 +824,19 @@ DECLARE_TRACE(sched_compute_energy_tp,
 		 unsigned long max_util, unsigned long busy_time),
 	TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
 
+DECLARE_TRACE(sched_entry_tp,
+	TP_PROTO(bool preempt, unsigned long ip),
+	TP_ARGS(preempt, ip));
+
+DECLARE_TRACE(sched_exit_tp,
+	TP_PROTO(bool is_switch, unsigned long ip),
+	TP_ARGS(is_switch, ip));
+
+DECLARE_TRACE_CONDITION(sched_set_state_tp,
+	TP_PROTO(struct task_struct *tsk, int curr_state, int state),
+	TP_ARGS(tsk, curr_state, state),
+	TP_CONDITION(!!curr_state != !!state));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 165c90ba64ea9..4aa6af026e05c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -491,6 +491,19 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
 
 #endif /* CONFIG_SCHED_CORE */
 
+/* need a wrapper since we may need to trace from modules */
+EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
+
+/*
+ * Do not call this function directly since it won't check if the tp is enabled.
+ * Call the helper macro trace_set_current_state instead.
+ */
+void __do_trace_set_current_state(int state_value)
+{
+	__do_trace_sched_set_state_tp(current, current->__state, state_value);
+}
+EXPORT_SYMBOL(__do_trace_set_current_state);
+
 /*
  * Serialization rules:
  *
@@ -5306,6 +5319,12 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	 */
 
 	finish_task_switch(prev);
+	/*
+	 * This is a special case: the newly created task has just
+	 * switched the context for the fist time. It is returning from
+	 * schedule for the first time in this path.
+	 */
+	trace_sched_exit_tp(true, CALLER_ADDR0);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -6649,12 +6668,15 @@ static void __sched notrace __schedule(int sched_mode)
 	 * as a preemption by schedule_debug() and RCU.
 	 */
 	bool preempt = sched_mode > SM_NONE;
+	bool is_switch = false;
 	unsigned long *switch_count;
 	unsigned long prev_state;
 	struct rq_flags rf;
 	struct rq *rq;
 	int cpu;
 
+	trace_sched_entry_tp(preempt, CALLER_ADDR0);
+
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
@@ -6722,7 +6744,8 @@ static void __sched notrace __schedule(int sched_mode)
 	rq->last_seen_need_resched_ns = 0;
 #endif
 
-	if (likely(prev != next)) {
+	is_switch = prev != next;
+	if (likely(is_switch)) {
 		rq->nr_switches++;
 		/*
 		 * RCU users of rcu_dereference(rq->curr) may not see
@@ -6767,6 +6790,7 @@ static void __sched notrace __schedule(int sched_mode)
 		__balance_callbacks(rq);
 		raw_spin_rq_unlock_irq(rq);
 	}
+	trace_sched_exit_tp(is_switch, CALLER_ADDR0);
 }
 
 void __noreturn do_task_dead(void)
diff --git a/tools/verification/rv/include/rv.h b/tools/verification/rv/include/rv.h
index 770fd6da36107..0cab1037a98f7 100644
--- a/tools/verification/rv/include/rv.h
+++ b/tools/verification/rv/include/rv.h
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #define MAX_DESCRIPTION 1024
-#define MAX_DA_NAME_LEN	24
+#define MAX_DA_NAME_LEN	32
 
 struct monitor {
 	char name[MAX_DA_NAME_LEN];
-- 
2.48.1
Re: [PATCH v2 03/11] sched: Add sched tracepoints for RV task model
Posted by Steven Rostedt 10 months ago
On Thu, 13 Feb 2025 10:08:01 +0100
Gabriele Monaco <gmonaco@redhat.com> wrote:

> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index 9ea4c404bd4ef..cc3be04fe9986 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -824,6 +824,19 @@ DECLARE_TRACE(sched_compute_energy_tp,
>  		 unsigned long max_util, unsigned long busy_time),
>  	TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
>  
> +DECLARE_TRACE(sched_entry_tp,
> +	TP_PROTO(bool preempt, unsigned long ip),
> +	TP_ARGS(preempt, ip));
> +
> +DECLARE_TRACE(sched_exit_tp,
> +	TP_PROTO(bool is_switch, unsigned long ip),
> +	TP_ARGS(is_switch, ip));
> +
> +DECLARE_TRACE_CONDITION(sched_set_state_tp,
> +	TP_PROTO(struct task_struct *tsk, int curr_state, int state),
> +	TP_ARGS(tsk, curr_state, state),
> +	TP_CONDITION(!!curr_state != !!state));

I don't think you need to pass in state. Just have it be:

	TP_CONDITION(!!(tsk->__state) != !!state));


> +
>  #endif /* _TRACE_SCHED_H */
>  
>  /* This part must be outside protection */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 165c90ba64ea9..4aa6af026e05c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -491,6 +491,19 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
>  
>  #endif /* CONFIG_SCHED_CORE */
>  
> +/* need a wrapper since we may need to trace from modules */
> +EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
> +
> +/*
> + * Do not call this function directly since it won't check if the tp is enabled.
> + * Call the helper macro trace_set_current_state instead.
> + */
> +void __do_trace_set_current_state(int state_value)
> +{
> +	__do_trace_sched_set_state_tp(current, current->__state, state_value);

And this should not be using the internal macros of a trace point. It should be:

	trace_sched_set_state_tp(current, state_value);

(I removed the current->__state as mentioned above).


> +}
> +EXPORT_SYMBOL(__do_trace_set_current_state);
> +

-- Steve
Re: [PATCH v2 03/11] sched: Add sched tracepoints for RV task model
Posted by Peter Zijlstra 10 months ago
On Mon, Feb 17, 2025 at 11:38:44AM -0500, Steven Rostedt wrote:

> > +void __do_trace_set_current_state(int state_value)
> > +{
> > +	__do_trace_sched_set_state_tp(current, current->__state, state_value);
> 
> And this should not be using the internal macros of a trace point. It should be:
> 
> 	trace_sched_set_state_tp(current, state_value);
> 
> (I removed the current->__state as mentioned above).

But the static branch is already in the caller, no point duplicating
that.
Re: [PATCH v2 03/11] sched: Add sched tracepoints for RV task model
Posted by Steven Rostedt 10 months ago
On Mon, 17 Feb 2025 17:49:17 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Feb 17, 2025 at 11:38:44AM -0500, Steven Rostedt wrote:
> 
> > > +void __do_trace_set_current_state(int state_value)
> > > +{
> > > +	__do_trace_sched_set_state_tp(current, current->__state, state_value);  
> > 
> > And this should not be using the internal macros of a trace point. It should be:
> > 
> > 	trace_sched_set_state_tp(current, state_value);
> > 
> > (I removed the current->__state as mentioned above).  
> 
> But the static branch is already in the caller, no point duplicating
> that.

Regardless, you should not be using internals of the tracepoints. That can
change at any time and is not reliable (as the kernel test robot pointed out).

It's a static branch, who cares if it's added twice? One is used to jump to
the function, the other is for the tracepoint logic itself.

There's several places that do this.

Perhaps, in the future we could create a normal API that will always call
the tracepoint, but until then, let's not use code that wasn't meant for
that purpose.

-- Steve
Re: [PATCH v2 03/11] sched: Add sched tracepoints for RV task model
Posted by Gabriele Monaco 10 months ago

On Mon, 2025-02-17 at 12:16 -0500, Steven Rostedt wrote:
> On Mon, 17 Feb 2025 17:49:17 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Mon, Feb 17, 2025 at 11:38:44AM -0500, Steven Rostedt wrote:
> > 
> > > > +void __do_trace_set_current_state(int state_value)
> > > > +{
> > > > +	__do_trace_sched_set_state_tp(current, current-
> > > > >__state, state_value);  
> > > 
> > > And this should not be using the internal macros of a trace
> > > point. It should be:
> > > 
> > > 	trace_sched_set_state_tp(current, state_value);
> > > 
> > > (I removed the current->__state as mentioned above).  
> > 
> > But the static branch is already in the caller, no point
> > duplicating
> > that.
> 
> Regardless, you should not be using internals of the tracepoints.
> That can
> change at any time and is not reliable (as the kernel test robot
> pointed out).
> 
> It's a static branch, who cares if it's added twice? One is used to
> jump to
> the function, the other is for the tracepoint logic itself.
> 
> There's several places that do this.
> 
> Perhaps, in the future we could create a normal API that will always
> call
> the tracepoint, but until then, let's not use code that wasn't meant
> for
> that purpose.
> 
> -- Steve
> 

Mmh I get your point. We definitely don't want this piece of code to
break whenever something changes in the tracepoint API..

I will revert the change and prepare a V3

Thanks,
Gabriele
Re: [PATCH v2 03/11] sched: Add sched tracepoints for RV task model
Posted by kernel test robot 10 months, 1 week ago
Hi Gabriele,

kernel test robot noticed the following build errors:

[auto build test ERROR on 4dc1d1bec89864d8076e5ab314f86f46442bfb02]

url:    https://github.com/intel-lab-lkp/linux/commits/Gabriele-Monaco/tracing-Fix-DECLARE_TRACE_CONDITION/20250213-171642
base:   4dc1d1bec89864d8076e5ab314f86f46442bfb02
patch link:    https://lore.kernel.org/r/20250213090819.419470-4-gmonaco%40redhat.com
patch subject: [PATCH v2 03/11] sched: Add sched tracepoints for RV task model
config: arm-randconfig-001-20250214 (https://download.01.org/0day-ci/archive/20250214/202502141719.Jdm44al4-lkp@intel.com/config)
compiler: clang version 16.0.6 (https://github.com/llvm/llvm-project 7cbf1a2591520c2491aa35339f227775f4d3adf6)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250214/202502141719.Jdm44al4-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202502141719.Jdm44al4-lkp@intel.com/

All errors (new ones prefixed by >>):

>> kernel/sched/core.c:503:2: error: call to undeclared function '__do_trace_sched_set_state_tp'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
           __do_trace_sched_set_state_tp(current, current->__state, state_value);
           ^
   kernel/sched/core.c:503:2: note: did you mean 'trace_sched_set_state_tp'?
   include/trace/events/sched.h:835:1: note: 'trace_sched_set_state_tp' declared here
   DECLARE_TRACE_CONDITION(sched_set_state_tp,
   ^
   include/linux/tracepoint.h:472:2: note: expanded from macro 'DECLARE_TRACE_CONDITION'
           __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),              \
           ^
   include/linux/tracepoint.h:409:2: note: expanded from macro '__DECLARE_TRACE'
           __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto))
           ^
   include/linux/tracepoint.h:385:21: note: expanded from macro '__DECLARE_TRACE_COMMON'
           static inline void trace_##name(proto)                          \
                              ^
   <scratch space>:84:1: note: expanded from here
   trace_sched_set_state_tp
   ^
   1 error generated.


vim +/__do_trace_sched_set_state_tp +503 kernel/sched/core.c

   496	
   497	/*
   498	 * Do not call this function directly since it won't check if the tp is enabled.
   499	 * Call the helper macro trace_set_current_state instead.
   500	 */
   501	void __do_trace_set_current_state(int state_value)
   502	{
 > 503		__do_trace_sched_set_state_tp(current, current->__state, state_value);
   504	}
   505	EXPORT_SYMBOL(__do_trace_set_current_state);
   506	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v2 03/11] sched: Add sched tracepoints for RV task model
Posted by kernel test robot 10 months, 1 week ago
Hi Gabriele,

kernel test robot noticed the following build errors:

[auto build test ERROR on 4dc1d1bec89864d8076e5ab314f86f46442bfb02]

url:    https://github.com/intel-lab-lkp/linux/commits/Gabriele-Monaco/tracing-Fix-DECLARE_TRACE_CONDITION/20250213-171642
base:   4dc1d1bec89864d8076e5ab314f86f46442bfb02
patch link:    https://lore.kernel.org/r/20250213090819.419470-4-gmonaco%40redhat.com
patch subject: [PATCH v2 03/11] sched: Add sched tracepoints for RV task model
config: i386-buildonly-randconfig-001-20250214 (https://download.01.org/0day-ci/archive/20250214/202502141516.OkUInaxw-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250214/202502141516.OkUInaxw-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202502141516.OkUInaxw-lkp@intel.com/

All errors (new ones prefixed by >>):

   kernel/sched/core.c: In function '__do_trace_set_current_state':
>> kernel/sched/core.c:503:9: error: implicit declaration of function '__do_trace_sched_set_state_tp'; did you mean 'trace_sched_set_state_tp'? [-Werror=implicit-function-declaration]
     503 |         __do_trace_sched_set_state_tp(current, current->__state, state_value);
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         |         trace_sched_set_state_tp
   cc1: some warnings being treated as errors


vim +503 kernel/sched/core.c

   496	
   497	/*
   498	 * Do not call this function directly since it won't check if the tp is enabled.
   499	 * Call the helper macro trace_set_current_state instead.
   500	 */
   501	void __do_trace_set_current_state(int state_value)
   502	{
 > 503		__do_trace_sched_set_state_tp(current, current->__state, state_value);
   504	}
   505	EXPORT_SYMBOL(__do_trace_set_current_state);
   506	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v2 03/11] sched: Add sched tracepoints for RV task model
Posted by Gabriele Monaco 10 months, 1 week ago
 
On Fri, 2025-02-14 at 16:00 +0800, kernel test robot wrote: 
 
>  >    kernel/sched/core.c: In function
> '__do_trace_set_current_state': 
>  
 
>  
> >  
> > >  > > > kernel/sched/core.c:503:9: error: implicit declaration of 
> > > > > > function '__do_trace_sched_set_state_tp'; did you mean 
> > > > > > 'trace_sched_set_state_tp'? 
> > > > > > [-Werror=implicit-function-declaration] 
> > >  
> >  
>  
 
>  >      503 |         __do_trace_sched_set_state_tp(current, 
> > current->__state, state_value); 
> >          |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
> >          |         trace_sched_set_state_tp 
> >    cc1: some warnings being treated as errors 
> > 
> > 
> > vim +503 kernel/sched/core.c 
> > 
> >    496 
> >    497 /* 
> >    498 * Do not call this function directly since it won't check if
> > the tp is enabled. 
> >    499 * Call the helper macro trace_set_current_state instead. 
> >    500 */ 
> >    501 void __do_trace_set_current_state(int state_value) 
> >    502 { 
> >  > 503 __do_trace_sched_set_state_tp(current, current->__state, 
> > state_value); 
> >    504 } 
> >    505 EXPORT_SYMBOL(__do_trace_set_current_state); 
> >    506 
>  
 
I honestly don't get why this build failed. The function __do_trace_
exists since cff6d93eab00ba ("tracepoint: Reduce duplication of
__DO_TRACE_CALL"), a while before that it was just a macro and not an
inline function, reason why no one so far used it directly.

Both failed builds are based on 4dc1d1bec898 (where my patchset is
based) and there __do_trace_ does exist.

Unless there's a strong opinion not to use it although the compiler
allows it, I'd consider the two kernel robot results false negatives.

Or am I missing something?
Re: [PATCH v2 03/11] sched: Add sched tracepoints for RV task model
Posted by Steven Rostedt 10 months ago
On Fri, 14 Feb 2025 12:15:31 +0100
Gabriele Monaco <gmonaco@redhat.com> wrote:

> > >  > 503 __do_trace_sched_set_state_tp(current, current->__state,   
> > > state_value); 
> > >    504 } 
> > >    505 EXPORT_SYMBOL(__do_trace_set_current_state); 
> > >    506   
> >    
>  
> I honestly don't get why this build failed. The function __do_trace_
> exists since cff6d93eab00ba ("tracepoint: Reduce duplication of
> __DO_TRACE_CALL"), a while before that it was just a macro and not an
> inline function, reason why no one so far used it directly.
> 
> Both failed builds are based on 4dc1d1bec898 (where my patchset is
> based) and there __do_trace_ does exist.
> 
> Unless there's a strong opinion not to use it although the compiler
> allows it, I'd consider the two kernel robot results false negatives.
> 
> Or am I missing something?

It's because you should not be using the internal macros of a tracepoint.
Just use the tracepoint itself.

I replied to the patch.

-- Steve