[PATCH v3 12/17] sched: Adapt sched tracepoints for RV task model

Posted by Gabriele Monaco 5 months ago
Add the following tracepoints:
* sched_set_need_resched(tsk, cpu, tif)
    Called when the need resched [lazy] flag is set for a task
* sched_switch_vain(preempt, tsk, tsk_state)
    Called when a task is selected again during __schedule,
    i.e. prev == next == tsk: no real context switch occurs

Add a new parameter to sched_set_state to identify whether the state
change was due to an explicit call or to a signal pending while
scheduling. We now also trace from try_to_block_task, in case a signal
was pending and the task is set back to runnable.

Also adapt all monitors using sched_set_state to avoid breaking the build.

These tracepoints are useful to describe the Linux task model and are
adapted from the patches by Daniel Bristot de Oliveira
(https://bristot.me/linux-task-model/).
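
For illustration, a monitor could attach to the new need_resched
tracepoint in the usual RV way; a minimal sketch (monitor name and
handler body are examples only, not part of this patch):

	static void handle_set_need_resched(void *data, struct task_struct *tsk,
					    int cpu, int tif)
	{
		/* tif is TIF_NEED_RESCHED or TIF_NEED_RESCHED_LAZY */
	}

	/* in the monitor's enable function: */
	rv_attach_trace_probe("example", sched_set_need_resched_tp,
			      handle_set_need_resched);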

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/sched.h                  |  7 ++++++-
 include/trace/events/sched.h           | 17 +++++++++++++++--
 kernel/sched/core.c                    | 10 +++++++++-
 kernel/trace/rv/monitors/sco/sco.c     |  3 ++-
 kernel/trace/rv/monitors/sleep/sleep.c |  3 ++-
 kernel/trace/rv/monitors/snroc/snroc.c |  3 ++-
 6 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7bce4c7ae3b4f..19ab4597c97d3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -339,9 +339,11 @@ extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
-/* wrapper function to trace from this header file */
+/* wrapper functions to trace from this header file */
 DECLARE_TRACEPOINT(sched_set_state_tp);
 extern void __trace_set_current_state(int state_value);
+DECLARE_TRACEPOINT(sched_set_need_resched_tp);
+extern void __trace_set_need_resched(struct task_struct *curr, int tif);
 
 /**
  * struct prev_cputime - snapshot of system and user cputime
@@ -2059,6 +2061,9 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
 
 static inline void set_tsk_need_resched(struct task_struct *tsk)
 {
+	if (tracepoint_enabled(sched_set_need_resched_tp) &&
+	    !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED))
+		__trace_set_need_resched(tsk, TIF_NEED_RESCHED);
 	set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
 }
 
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 4e6b2910cec3f..c9dec6d38ad2d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -889,11 +889,24 @@ DECLARE_TRACE(sched_exit,
 	TP_PROTO(bool is_switch, unsigned long ip),
 	TP_ARGS(is_switch, ip));
 
+/*
+ * Tracepoint called when setting the state of a task;
+ * this tracepoint is guaranteed to be called from the waking context of the
+ * task setting the state.
+ */
 DECLARE_TRACE_CONDITION(sched_set_state,
-	TP_PROTO(struct task_struct *tsk, int state),
-	TP_ARGS(tsk, state),
+	TP_PROTO(struct task_struct *tsk, int state, bool from_signal),
+	TP_ARGS(tsk, state, from_signal),
 	TP_CONDITION(!!(tsk->__state) != !!state));
 
+DECLARE_TRACE(sched_set_need_resched,
+	TP_PROTO(struct task_struct *tsk, int cpu, int tif),
+	TP_ARGS(tsk, cpu, tif));
+
+DECLARE_TRACE(sched_switch_vain,
+	TP_PROTO(bool preempt, struct task_struct *tsk, unsigned int prev_state),
+	TP_ARGS(preempt, tsk, prev_state));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 81c6df746df17..6cb70e6f7fa17 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -495,7 +495,7 @@ EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
 /* Call via the helper macro trace_set_current_state. */
 void __trace_set_current_state(int state_value)
 {
-	trace_sched_set_state_tp(current, state_value);
+	trace_sched_set_state_tp(current, state_value, false);
 }
 EXPORT_SYMBOL(__trace_set_current_state);
 
@@ -1110,6 +1110,7 @@ static void __resched_curr(struct rq *rq, int tif)
 
 	cpu = cpu_of(rq);
 
+	trace_sched_set_need_resched_tp(curr, cpu, tif);
 	if (cpu == smp_processor_id()) {
 		set_ti_thread_flag(cti, tif);
 		if (tif == TIF_NEED_RESCHED)
@@ -1125,6 +1126,11 @@ static void __resched_curr(struct rq *rq, int tif)
 	}
 }
 
+void __trace_set_need_resched(struct task_struct *curr, int tif)
+{
+	trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif);
+}
+
 void resched_curr(struct rq *rq)
 {
 	__resched_curr(rq, TIF_NEED_RESCHED);
@@ -6592,6 +6598,7 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
 	int flags = DEQUEUE_NOCLOCK;
 
 	if (signal_pending_state(task_state, p)) {
+		trace_sched_set_state_tp(p, TASK_RUNNING, true);
 		WRITE_ONCE(p->__state, TASK_RUNNING);
 		*task_state_p = TASK_RUNNING;
 		return false;
@@ -6786,6 +6793,7 @@ static void __sched notrace __schedule(int sched_mode)
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
 		rq_unpin_lock(rq, &rf);
+		trace_sched_switch_vain_tp(preempt, prev, prev_state);
 		__balance_callbacks(rq);
 		raw_spin_rq_unlock_irq(rq);
 	}
diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c
index 66f4639d46ac4..c9206aa12c319 100644
--- a/kernel/trace/rv/monitors/sco/sco.c
+++ b/kernel/trace/rv/monitors/sco/sco.c
@@ -19,7 +19,8 @@
 static struct rv_monitor rv_sco;
 DECLARE_DA_MON_PER_CPU(sco, unsigned char);
 
-static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
+static void handle_sched_set_state(void *data, struct task_struct *tsk,
+				   int state, bool from_signal)
 {
 	da_handle_start_event_sco(sched_set_state_sco);
 }
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
index eea447b069071..5103a98818c53 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.c
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -82,7 +82,8 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo
 
 }
 
-static void handle_sched_set_state(void *data, struct task_struct *task, int state)
+static void handle_sched_set_state(void *data, struct task_struct *task,
+				   int state, bool from_signal)
 {
 	if (state & TASK_INTERRUPTIBLE)
 		ltl_atom_pulse(task, LTL_SLEEP, true);
diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c
index 540e686e699f4..2651f589d1554 100644
--- a/kernel/trace/rv/monitors/snroc/snroc.c
+++ b/kernel/trace/rv/monitors/snroc/snroc.c
@@ -19,7 +19,8 @@
 static struct rv_monitor rv_snroc;
 DECLARE_DA_MON_PER_TASK(snroc, unsigned char);
 
-static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
+static void handle_sched_set_state(void *data, struct task_struct *tsk,
+				   int state, bool from_signal)
 {
 	da_handle_event_snroc(tsk, sched_set_state_snroc);
 }
-- 
2.50.1
Re: [PATCH v3 12/17] sched: Adapt sched tracepoints for RV task model
Posted by Peter Zijlstra 5 months ago
On Tue, Jul 15, 2025 at 09:14:29AM +0200, Gabriele Monaco wrote:
> Add the following tracepoints:
> * sched_set_need_resched(tsk, cpu, tif)
>     Called when the need resched [lazy] flag is set for a task
> * sched_switch_vain(preempt, tsk, tsk_state)
>     Called when a task is selected again during __schedule,
>     i.e. prev == next == tsk: no real context switch occurs

> @@ -6592,6 +6598,7 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
>  	int flags = DEQUEUE_NOCLOCK;
>  
>  	if (signal_pending_state(task_state, p)) {
> +		trace_sched_set_state_tp(p, TASK_RUNNING, true);
>  		WRITE_ONCE(p->__state, TASK_RUNNING);
>  		*task_state_p = TASK_RUNNING;
>  		return false;

I'm confused on the purpose of this. How does this relate to, say, the
wakeup in signal_wake_up_state()?

> @@ -6786,6 +6793,7 @@ static void __sched notrace __schedule(int sched_mode)
>  		rq = context_switch(rq, prev, next, &rf);
>  	} else {
>  		rq_unpin_lock(rq, &rf);
> +		trace_sched_switch_vain_tp(preempt, prev, prev_state);
>  		__balance_callbacks(rq);
>  		raw_spin_rq_unlock_irq(rq);
>  	}

Hurmph... don't you already have this covered by: trace_sched_exit_tp()?

Specifically, the only case where is_switch := false is this case.
Re: [PATCH v3 12/17] sched: Adapt sched tracepoints for RV task model
Posted by Gabriele Monaco 5 months ago
On Wed, 2025-07-16 at 14:38 +0200, Peter Zijlstra wrote:
> On Tue, Jul 15, 2025 at 09:14:29AM +0200, Gabriele Monaco wrote:
> > Add the following tracepoints:
> > * sched_set_need_resched(tsk, cpu, tif)
> >     Called when the need resched [lazy] flag is set for a task
> > * sched_switch_vain(preempt, tsk, tsk_state)
> >     Called when a task is selected again during __schedule,
> >     i.e. prev == next == tsk: no real context switch occurs
> 
> > @@ -6592,6 +6598,7 @@ static bool try_to_block_task(struct rq *rq,
> > struct task_struct *p,
> >  	int flags = DEQUEUE_NOCLOCK;
> >  
> >  	if (signal_pending_state(task_state, p)) {
> > +		trace_sched_set_state_tp(p, TASK_RUNNING, true);
> >  		WRITE_ONCE(p->__state, TASK_RUNNING);
> >  		*task_state_p = TASK_RUNNING;
> >  		return false;
> 
> I'm confused on the purpose of this. How does this relate to, say, the
> wakeup in signal_wake_up_state()?

Also, to add more context: models like sssw (in this series) expect
that, after a task is set to sleepable, it either goes to sleep or is
woken up/set to runnable.

In this specific case, the task is set to runnable without tracing it,
so the model doesn't know what happened, since it may not see a wakeup
after that (the task is already runnable).

Now I'm not sure if there are other events that we are guaranteed to
see to reconstruct this specific case (at some point we should see the
signal, I assume).
This just simplified things as that is the only state change that was
not traced.

Am I missing anything obvious here?

Thanks,
Gabriele
Re: [PATCH v3 12/17] sched: Adapt sched tracepoints for RV task model
Posted by Peter Zijlstra 5 months ago
On Wed, Jul 16, 2025 at 05:09:43PM +0200, Gabriele Monaco wrote:
> On Wed, 2025-07-16 at 14:38 +0200, Peter Zijlstra wrote:
> > On Tue, Jul 15, 2025 at 09:14:29AM +0200, Gabriele Monaco wrote:
> > > Add the following tracepoints:
> > > * sched_set_need_resched(tsk, cpu, tif)
> > >     Called when the need resched [lazy] flag is set for a task
> > > * sched_switch_vain(preempt, tsk, tsk_state)
> > >     Called when a task is selected again during __schedule,
> > >     i.e. prev == next == tsk: no real context switch occurs
> > 
> > > @@ -6592,6 +6598,7 @@ static bool try_to_block_task(struct rq *rq,
> > > struct task_struct *p,
> > >  	int flags = DEQUEUE_NOCLOCK;
> > >  
> > >  	if (signal_pending_state(task_state, p)) {
> > > +		trace_sched_set_state_tp(p, TASK_RUNNING, true);
> > >  		WRITE_ONCE(p->__state, TASK_RUNNING);
> > >  		*task_state_p = TASK_RUNNING;
> > >  		return false;
> > 
> > I'm confused on the purpose of this. How does this relate to, say, the
> > wakeup in signal_wake_up_state()?
> 
> Also this adds more context: models like sssw (in this series) expect
> that, after a task is set to sleepable, it either goes to sleep or is
> woken up/set to runnable.
> 
> In this specific case, the task is set to runnable without tracing it,
> so the model doesn't know what happened, since it may not see a wakeup
> after that (the task is already runnable).
> 
> Now I'm not sure if there are other events that we are guaranteed to
> see to reconstruct this specific case (at some point we should see the
> signal, I assume).
> This just simplified things as that is the only state change that was
> not traced.
> 
> Am I missing anything obvious here?


AFAICT this is just a normal wakeup race.

Given:

  for (;;) {
    set_current_state(TASK_UNINTERRUPTIBLE);
    if (COND)
      break;
    schedule();
  }
  __set_current_state(TASK_RUNNING);

vs

  COND=1;
  wake_up_state(p, TASK_UNINTERRUPTIBLE);

The race is seeing COND before or after hitting schedule(). With
interruptible this simply becomes:


  for (;;) {
    set_current_state(TASK_INTERRUPTIBLE);
    if (SIGPENDING || COND)
      break;
    schedule();
  }
  __set_current_state(TASK_RUNNING);

vs

  COND=1;
  wake_up_state(p, TASK_INTERRUPTIBLE);

vs

  set_tsk_thread_flag(p, TIF_SIGPENDING);
  wake_up_state(p, TASK_INTERRUPTIBLE);


(same with KILLABLE, but for fatal signals only)
(except the signal thing will often exit with -EINTR / -ERESTARTSYS, but
that shouldn't matter here, right?)

How is the race for seeing SIGPENDING different from seeing COND?

Both will issue a wakeup; except in one case the wakeup is superfluous
because schedule didn't end up blocking because it already saw the
condition, while in the other case it did block and the wakeup is indeed
needed.

Anyway, I don't mind tracing it, but I am confused on that new (3rd)
argument to trace_sched_set_state_tp().
Re: [PATCH v3 12/17] sched: Adapt sched tracepoints for RV task model
Posted by Gabriele Monaco 5 months ago
On Wed, 2025-07-16 at 14:38 +0200, Peter Zijlstra wrote:
> On Tue, Jul 15, 2025 at 09:14:29AM +0200, Gabriele Monaco wrote:
> > Add the following tracepoints:
> > * sched_set_need_resched(tsk, cpu, tif)
> > >     Called when the need resched [lazy] flag is set for a task
> > > * sched_switch_vain(preempt, tsk, tsk_state)
> > >     Called when a task is selected again during __schedule,
> > >     i.e. prev == next == tsk: no real context switch occurs
> 
> > @@ -6592,6 +6598,7 @@ static bool try_to_block_task(struct rq *rq,
> > struct task_struct *p,
> >  	int flags = DEQUEUE_NOCLOCK;
> >  
> >  	if (signal_pending_state(task_state, p)) {
> > +		trace_sched_set_state_tp(p, TASK_RUNNING, true);
> >  		WRITE_ONCE(p->__state, TASK_RUNNING);
> >  		*task_state_p = TASK_RUNNING;
> >  		return false;
> 
> I'm confused on the purpose of this. How does this relate to, say, the
> wakeup in signal_wake_up_state()?
> 
> > @@ -6786,6 +6793,7 @@ static void __sched notrace __schedule(int
> > sched_mode)
> >  		rq = context_switch(rq, prev, next, &rf);
> >  	} else {
> >  		rq_unpin_lock(rq, &rf);
> > +		trace_sched_switch_vain_tp(preempt, prev,
> > prev_state);
> >  		__balance_callbacks(rq);
> >  		raw_spin_rq_unlock_irq(rq);
> >  	}
> 
> Hurmph... don't you already have this covered by:
> trace_sched_exit_tp()?
> 
> Specifically, the only case where is_switch := false is this case.

Mostly, it may work in some cases, but sched_exit happens with
interrupts enabled, while all types of switches (including the vain ones)
must occur with interrupts disabled.

Some assumptions don't stand without this tracepoint, but I guess I
could adapt monitors to live without this if you believe it's not worth
adding a new tracepoint there.

Thanks,
Gabriele
Re: [PATCH v3 12/17] sched: Adapt sched tracepoints for RV task model
Posted by Peter Zijlstra 5 months ago
On Wed, Jul 16, 2025 at 03:40:13PM +0200, Gabriele Monaco wrote:
> On Wed, 2025-07-16 at 14:38 +0200, Peter Zijlstra wrote:

> > > @@ -6786,6 +6793,7 @@ static void __sched notrace __schedule(int
> > > sched_mode)
> > >  		rq = context_switch(rq, prev, next, &rf);
> > >  	} else {
> > >  		rq_unpin_lock(rq, &rf);
> > > +		trace_sched_switch_vain_tp(preempt, prev,
> > > prev_state);
> > >  		__balance_callbacks(rq);
> > >  		raw_spin_rq_unlock_irq(rq);
> > >  	}
> > 
> > Hurmph... don't you already have this covered by:
> > trace_sched_exit_tp()?
> > 
> > Specifically, the only case where is_switch := false is this case.
> 
> Mostly, it may work in some cases, but sched_exit happens with
> interrupts enabled, while all types of switches (including the vain ones)
> must occur with interrupts disabled.
> 
> Some assumptions don't stand without this tracepoint, but I guess I
> could adapt monitors to live without this if you believe it's not worth
> adding a new tracepoint there.

I'm not sure I understand the importance of IRQ state when describing
task transitions.

You know both:

 - schedule() invocations for any one task are in-order;
 - schedule() invocations for any one CPU are in-order.
Re: [PATCH v3 12/17] sched: Adapt sched tracepoints for RV task model
Posted by Gabriele Monaco 5 months ago
On Wed, 2025-07-16 at 15:45 +0200, Peter Zijlstra wrote:
> 
> I'm not sure I understand the importance of IRQ state when describing
> task transitions.
> 
> You know both:
> 
>  - schedule() invocations for any one task are in-order;
>  - schedule() invocations for any one CPU are in-order.
> 

It's to describe latencies, which is the original purpose of the
scheduler model: if the event supposed to wake up an RT task occurs
with interrupts disabled while scheduling, we'd need to wait for that
run to complete.
Now to be fair, it doesn't really matter whether that call to
__schedule switches or not, but always having a switch event simplifies
things while modelling.
I can rearrange models like sts (added in this series) not to expect
that.

Thanks,
Gabriele
Re: [PATCH v3 12/17] sched: Adapt sched tracepoints for RV task model
Posted by Peter Zijlstra 5 months ago
On Wed, Jul 16, 2025 at 04:07:16PM +0200, Gabriele Monaco wrote:
> On Wed, 2025-07-16 at 15:45 +0200, Peter Zijlstra wrote:
> > 
> > I'm not sure I understand the importance of IRQ state when describing
> > task transitions.
> > 
> > You know both:
> > 
> >  - schedule() invocations for any one task are in-order;
> >  - schedule() invocations for any one CPU are in-order.
> > 
> 
> It's to describe latencies, which is the original purpose of the
> scheduler model: if the event supposed to wake up an RT task occurs
> with interrupts disabled while scheduling, we'd need to wait for that
> run to complete.
> Now to be fair, it doesn't really matter whether that call to
> __schedule switches or not, but always having a switch event simplifies
> things while modelling.

So in general I'm a minimalist; less is more etc. 

Even if you care about latencies, I don't see what that tracepoint
adds. In fact, I don't even see the point of the .is_switch argument to
trace_sched_exit_tp(). That state can be fully reconstructed from having
seen trace_sched_switch() between trace_sched_{enter,exit}_tp().
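
Something like this on the consumer side would do (rough sketch, probe
prototypes from memory, not code from this series):

	static DEFINE_PER_CPU(bool, saw_switch);

	static void probe_entry(void *data, bool preempt, unsigned long ip)
	{
		this_cpu_write(saw_switch, false);
	}

	static void probe_switch(void *data, bool preempt, struct task_struct *prev,
				 struct task_struct *next, unsigned int prev_state)
	{
		this_cpu_write(saw_switch, true);
	}

	static void probe_exit(void *data, bool is_switch, unsigned long ip)
	{
		/* is_switch is redundant: this_cpu_read(saw_switch) */
	}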

As for the IRQ state, if you see:

 trace_sched_enter_tp();
 trace_irq_disable();
 trace_irq_enable();

You already know all you need to know; there was no switch, otherwise it
would've been:

 trace_sched_enter_tp();
 trace_irq_disable();
 trace_sched_switch();
 trace_irq_enable();

Also, can we get rid of that CALLER_ADDR0 argument as well?
Re: [PATCH v3 12/17] sched: Adapt sched tracepoints for RV task model
Posted by Gabriele Monaco 5 months ago
On Wed, 2025-07-16 at 16:19 +0200, Peter Zijlstra wrote:
> So in general I'm a minimalist; less is more etc.
>
> Even if you care about latencies, I don't see what that tracepoint adds. In
> fact, I don't even see the point of the .is_switch argument to
> trace_sched_exit_tp(). That state can be fully reconstructed from having seen
> trace_sched_switch() between trace_sched_{enter,exit}_tp().
>
> As for the IRQ state, if you see:
>
>  trace_sched_enter_tp();
>  trace_irq_disable();
>  trace_irq_enable();
>
> You already know all you need to know; there was no switch, otherwise
> it would've been:
>
>  trace_sched_enter_tp();
>  trace_irq_disable();
>  trace_sched_switch();
>  trace_irq_enable();
>

Or you could see:

 trace_sched_enter_tp();
 trace_irq_disable();
 trace_irq_enable();
 trace_irq_disable();
 trace_sched_switch();
 trace_irq_enable();

Where in fact there /was/ a switch, that trace was actually something
like:

 trace_sched_enter_tp();
 trace_irq_disable();
 **irq_entry();**
 **irq_exit();**
 trace_irq_enable();
 trace_irq_disable();
 trace_sched_switch();
 trace_irq_enable();

So as you said, we can still reconstruct what happened from the trace, but the
model suddenly needs more states and more events.
If we could directly tell whether interrupts were disabled manually or from an
actual interrupt, that wouldn't be necessary, for instance (as in the original
model by Daniel).

I get your point why we don't really need the additional tracepoint, but some
arguments giving more context come almost for free.

> Also, can we get rid of that CALLER_ADDR0 argument as well?

Yeah, good point. This might have been useful to understand some things
(__schedule called from preempt_irq, for instance), but it's a pain to
make sense of it.
Re: [PATCH v3 12/17] sched: Adapt sched tracepoints for RV task model
Posted by Peter Zijlstra 5 months ago
On Wed, Jul 16, 2025 at 04:38:36PM +0200, Gabriele Monaco wrote:

> So as you said, we can still reconstruct what happened from the trace, but the
> model suddenly needs more states and more events.

So given a sequence like:

  trace_sched_enter_tp();
  { trace_irq_disable();
    **irq_entry();**
    **irq_exit();**
    trace_irq_enable(); } * Ni
  trace_irq_disable();
  { trace_sched_switch(); } * Nj
  trace_irq_enable();
  { trace_irq_disable();
    **irq_entry();**
    **irq_exit();**
    trace_irq_enable(); } * Nk
  trace_sched_exit_tp();

It becomes somewhat hard to figure out which exact IRQ disabled section
the switch did not happen in (Nj == 0).

> If we could directly tell whether interrupts were disabled manually or from an
> actual interrupt, that wouldn't be necessary, for instance (as in the original
> model by Daniel).

Hmm.. we do indeed appear to trace the IRQ state before adding
HARDIRQ_OFFSET to preempt_count(). Yes, that complicates things a
little.

So... it *might* be possible to lift lockdep_hardirq_enter() to before
we start tracing. But then you're stuck running with lockdep
enabled -- I'm thinking that's not ideal, given those other patches you
sent.

I'm going to go on holidays soon, but I've made a note to see if we can
lift setting HARDIRQ_OFFSET before we start tracing. IIRC the current
order is because setting HARDIRQ_OFFSET is using preempt_count_add()
which can be instrumented itself.

But we could use __preempt_count_add() instead; then we lose the
tracing from setting HARDIRQ_OFFSET, but I don't think that is a
problem. We already get the latency from the IRQ tracepoints after all.
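
That is, roughly (a sketch of the idea only, untested):

	/* in __irq_enter(), before any tracepoints fire: */
	__preempt_count_add(HARDIRQ_OFFSET);	/* raw variant, itself untraced */
	lockdep_hardirq_enter();
	/* irqtime accounting and tracing only after this point */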

> I get your point why we don't really need the additional tracepoint, but some
> arguments giving more context come almost for free.

Right. So please always try and justify adding tracepoints.
Re: [PATCH v3 12/17] sched: Adapt sched tracepoints for RV task model
Posted by Gabriele Monaco 5 months ago
On Wed, 2025-07-16 at 17:31 +0200, Peter Zijlstra wrote:
> On Wed, Jul 16, 2025 at 04:38:36PM +0200, Gabriele Monaco wrote:
> 
> > So as you said, we can still reconstruct what happened from the
> > trace, but the model suddenly needs more states and more events.
> 
> So given a sequence like:
> 
>   trace_sched_enter_tp();
>   { trace_irq_disable();
>     **irq_entry();**
>     **irq_exit();**
>     trace_irq_enable(); } * Ni
>   trace_irq_disable();
>   { trace_sched_switch(); } * Nj
>   trace_irq_enable();
>   { trace_irq_disable();
>     **irq_entry();**
>     **irq_exit();**
>     trace_irq_enable(); } * Nk
>   trace_sched_exit_tp();
> 
> It becomes somewhat hard to figure out which exact IRQ disabled
> section
> the switch did not happen in (Nj == 0).
> 
> > If we could directly tell whether interrupts were disabled manually
> > or from an actual interrupt, that wouldn't be necessary, for
> > instance (as in the original model by Daniel).
> 
> Hmm.. we do indeed appear to trace the IRQ state before adding
> HARDIRQ_OFFSET to preempt_count(). Yes, that complicates things a
> little.
> 
> So... it *might* be possible to lift lockdep_hardirq_enter() to
> before we start tracing. But then you're stuck running with
> lockdep enabled -- I'm thinking that's not ideal, given those other
> patches you sent.
> 
> I'm going to go on holidays soon, but I've made a note to see if we
> can lift setting HARDIRQ_OFFSET before we start tracing. IIRC the
> current order is because setting HARDIRQ_OFFSET is using
> preempt_count_add() which can be instrumented itself.
> 

Yeah I wondered if that was something perhaps required by RCU or
something else (some calls are in the way). NMIs have it set during the
tracepoints, for instance.

Thanks again and enjoy your holiday!

Gabriele

> But we could use __preempt_count_add() instead; then we lose the
> tracing from setting HARDIRQ_OFFSET, but I don't think that is a
> problem. We already get the latency from the IRQ tracepoints after
> all.
> 
> > I get your point why we don't really need the additional
> > tracepoint, but some
> > arguments giving more context come almost for free.
> 
> Right. So please always try and justify adding tracepoints.