[PATCH v2] tracing: Fix inconsistency in irq tracking on NMIs

Gabriele Monaco posted 1 patch 3 months, 2 weeks ago
arch/arm64/kernel/entry-common.c |  5 ++---
kernel/entry/common.c            |  5 ++---
kernel/trace/trace_preemptirq.c  | 12 +++++++-----
3 files changed, 11 insertions(+), 11 deletions(-)
[PATCH v2] tracing: Fix inconsistency in irq tracking on NMIs
Posted by Gabriele Monaco 3 months, 2 weeks ago
The irq_enable/irq_disable tracepoints fire only when there's an actual
transition (enabled->disabled and vice versa), this needs special care
in NMIs, as they can potentially start with interrupts already disabled.
The current implementation takes care of this by tracking the lockdep
state on nmi_entry as well as using the variable tracing_irq_cpu to
synchronise with other calls (e.g. local_irq_disable/enable).

This can be racy in case of NMIs when lockdep is enabled, and can lead
to missing events when lockdep is disabled.

Remove dependency on the lockdep status in the NMI common entry/exit
code and adapt the tracing code to make sure that:

- The first call disabling interrupts fires the tracepoint
- The first non-NMI call enabling interrupts fires the tracepoint
- The last NMI call enabling interrupts fires the tracepoint unless
  interrupts were disabled before the NMI
- All other calls don't fire

Fixes: ba1f2b2eaa2a ("x86/entry: Fix NMI vs IRQ state tracking")
Fixes: f0cd5ac1e4c5 ("arm64: entry: fix NMI {user, kernel}->kernel transitions")
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---

The inconsistency is visible with the sncid RV monitor and particularly
likely on machines with the following setup:
- x86 bare-metal with 40+ CPUs
- tuned throughput-performance (activating regular perf NMIs)
- workload: stress-ng --cpu-sched 21 --timer 11 --signal 11

The presence of the RV monitor is useful to see the error but it is not
necessary to trigger it.

Changes since V1:
* Reworded confusing changelog
* Remove dependency on lockdep counters for tracepoints
* Ensure we don't drop valid tracepoints
* Extend change to arm64 code

 arch/arm64/kernel/entry-common.c |  5 ++---
 kernel/entry/common.c            |  5 ++---
 kernel/trace/trace_preemptirq.c  | 12 +++++++-----
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 7c1970b341b8c..7f1844123642e 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -213,10 +213,9 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs)
 	bool restore = regs->lockdep_hardirqs;
 
 	ftrace_nmi_exit();
-	if (restore) {
-		trace_hardirqs_on_prepare();
+	trace_hardirqs_on_prepare();
+	if (restore)
 		lockdep_hardirqs_on_prepare();
-	}
 
 	ct_nmi_exit();
 	lockdep_hardirq_exit();
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a8dd1f27417cf..e234f264fb495 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -343,10 +343,9 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
 {
 	instrumentation_begin();
 	ftrace_nmi_exit();
-	if (irq_state.lockdep) {
-		trace_hardirqs_on_prepare();
+	trace_hardirqs_on_prepare();
+	if (irq_state.lockdep)
 		lockdep_hardirqs_on_prepare();
-	}
 	instrumentation_end();
 
 	ct_nmi_exit();
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 0c42b15c38004..fa45474fc54f1 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -58,7 +58,11 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
  */
 void trace_hardirqs_on_prepare(void)
 {
-	if (this_cpu_read(tracing_irq_cpu)) {
+	int tracing_count = this_cpu_read(tracing_irq_cpu);
+
+	if (in_nmi() && tracing_count > 1)
+		this_cpu_dec(tracing_irq_cpu);
+	else if (tracing_count) {
 		trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
 		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
 		this_cpu_write(tracing_irq_cpu, 0);
@@ -89,8 +93,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on);
  */
 void trace_hardirqs_off_finish(void)
 {
-	if (!this_cpu_read(tracing_irq_cpu)) {
-		this_cpu_write(tracing_irq_cpu, 1);
+	if (this_cpu_inc_return(tracing_irq_cpu) == 1) {
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
 		trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
 	}
@@ -103,8 +106,7 @@ void trace_hardirqs_off(void)
 {
 	lockdep_hardirqs_off(CALLER_ADDR0);
 
-	if (!this_cpu_read(tracing_irq_cpu)) {
-		this_cpu_write(tracing_irq_cpu, 1);
+	if (this_cpu_inc_return(tracing_irq_cpu) == 1) {
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
 		trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
 	}

base-commit: 78f4e737a53e1163ded2687a922fce138aee73f5
-- 
2.49.0
Re: [PATCH v2] tracing: Fix inconsistency in irq tracking on NMIs
Posted by Peter Zijlstra 3 months, 1 week ago
On Wed, Jun 25, 2025 at 02:08:22PM +0200, Gabriele Monaco wrote:
> The irq_enable/irq_disable tracepoints fire only when there's an actual
> transition (enabled->disabled and vice versa), this needs special care
> in NMIs, as they can potentially start with interrupts already disabled.
> The current implementation takes care of this by tracking the lockdep
> state on nmi_entry as well as using the variable tracing_irq_cpu to
> synchronise with other calls (e.g. local_irq_disable/enable).
> 
> This can be racy in case of NMIs when lockdep is enabled, and can lead
> to missing events when lockdep is disabled.
> 
> Remove dependency on the lockdep status in the NMI common entry/exit
> code and adapt the tracing code to make sure that:
> 
> - The first call disabling interrupts fires the tracepoint
> - The first non-NMI call enabling interrupts fires the tracepoint
> - The last NMI call enabling interrupts fires the tracepoint unless
>   interrupts were disabled before the NMI
> - All other calls don't fire

I'm not at all convinced this is correct. Nor can I understand anything
much about what you wrote above.


>  arch/arm64/kernel/entry-common.c |  5 ++---
>  kernel/entry/common.c            |  5 ++---
>  kernel/trace/trace_preemptirq.c  | 12 +++++++-----
>  3 files changed, 11 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> index 7c1970b341b8c..7f1844123642e 100644
> --- a/arch/arm64/kernel/entry-common.c
> +++ b/arch/arm64/kernel/entry-common.c
> @@ -213,10 +213,9 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs)
>  	bool restore = regs->lockdep_hardirqs;
>  
>  	ftrace_nmi_exit();
> -	if (restore) {
> -		trace_hardirqs_on_prepare();
> +	trace_hardirqs_on_prepare();
> +	if (restore)
>  		lockdep_hardirqs_on_prepare();
> -	}
>  
>  	ct_nmi_exit();
>  	lockdep_hardirq_exit();
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index a8dd1f27417cf..e234f264fb495 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -343,10 +343,9 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
>  {
>  	instrumentation_begin();
>  	ftrace_nmi_exit();
> -	if (irq_state.lockdep) {
> -		trace_hardirqs_on_prepare();
> +	trace_hardirqs_on_prepare();
> +	if (irq_state.lockdep)
>  		lockdep_hardirqs_on_prepare();
> -	}
>  	instrumentation_end();
>  
>  	ct_nmi_exit();
> diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
> index 0c42b15c38004..fa45474fc54f1 100644
> --- a/kernel/trace/trace_preemptirq.c
> +++ b/kernel/trace/trace_preemptirq.c
> @@ -58,7 +58,11 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
>   */
>  void trace_hardirqs_on_prepare(void)
>  {
> -	if (this_cpu_read(tracing_irq_cpu)) {
> +	int tracing_count = this_cpu_read(tracing_irq_cpu);
> +
> +	if (in_nmi() && tracing_count > 1)
> +		this_cpu_dec(tracing_irq_cpu);
> +	else if (tracing_count) {
>  		trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
>  		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
>  		this_cpu_write(tracing_irq_cpu, 0);
> @@ -89,8 +93,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on);
>   */
>  void trace_hardirqs_off_finish(void)
>  {
> -	if (!this_cpu_read(tracing_irq_cpu)) {
> -		this_cpu_write(tracing_irq_cpu, 1);
> +	if (this_cpu_inc_return(tracing_irq_cpu) == 1) {
>  		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
>  		trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
>  	}
> @@ -103,8 +106,7 @@ void trace_hardirqs_off(void)
>  {
>  	lockdep_hardirqs_off(CALLER_ADDR0);
>  
> -	if (!this_cpu_read(tracing_irq_cpu)) {
> -		this_cpu_write(tracing_irq_cpu, 1);
> +	if (this_cpu_inc_return(tracing_irq_cpu) == 1) {
>  		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
>  		trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
>  	}

So what about lovely things like:

trace_hardirqs_on_prepare()
  if (tracing_irq_cpu) {
    tracer_hardirqs_on();
    <NMI>
      trace_hardirqs_off_finish();
        if (this_cpu_inc_return() == 1) // will be > 1


So now we've traced IRQs are on, start an NMI, and loose the IRQs off
event. Well done.


This was all safe in that it would occasionally emit a duplicate state,
but no state was wrong. Both your attempts have broken things.

How about you fix you tool to accept duplicate state in the face of NMI
instead?
Re: [PATCH v2] tracing: Fix inconsistency in irq tracking on NMIs
Posted by Gabriele Monaco 3 months, 1 week ago

On Tue, 2025-07-01 at 14:54 +0200, Peter Zijlstra wrote:
> On Wed, Jun 25, 2025 at 02:08:22PM +0200, Gabriele Monaco wrote:
> > The irq_enable/irq_disable tracepoints fire only when there's an
> > actual
> > transition (enabled->disabled and vice versa), this needs special
> > care
> > in NMIs, as they can potentially start with interrupts already
> > disabled.
> > The current implementation takes care of this by tracking the
> > lockdep
> > state on nmi_entry as well as using the variable tracing_irq_cpu to
> > synchronise with other calls (e.g. local_irq_disable/enable).
> > 
> > This can be racy in case of NMIs when lockdep is enabled, and can
> > lead
> > to missing events when lockdep is disabled.
> > 
> > Remove dependency on the lockdep status in the NMI common
> > entry/exit
> > code and adapt the tracing code to make sure that:
> > 
> > - The first call disabling interrupts fires the tracepoint
> > - The first non-NMI call enabling interrupts fires the tracepoint
> > - The last NMI call enabling interrupts fires the tracepoint unless
> >   interrupts were disabled before the NMI
> > - All other calls don't fire
> 
> I'm not at all convinced this is correct. Nor can I understand
> anything
> much about what you wrote above.
> 
> 
> >  arch/arm64/kernel/entry-common.c |  5 ++---
> >  kernel/entry/common.c            |  5 ++---
> >  kernel/trace/trace_preemptirq.c  | 12 +++++++-----
> >  3 files changed, 11 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/entry-common.c
> > b/arch/arm64/kernel/entry-common.c
> > index 7c1970b341b8c..7f1844123642e 100644
> > --- a/arch/arm64/kernel/entry-common.c
> > +++ b/arch/arm64/kernel/entry-common.c
> > @@ -213,10 +213,9 @@ static void noinstr arm64_exit_nmi(struct
> > pt_regs *regs)
> >  	bool restore = regs->lockdep_hardirqs;
> >  
> >  	ftrace_nmi_exit();
> > -	if (restore) {
> > -		trace_hardirqs_on_prepare();
> > +	trace_hardirqs_on_prepare();
> > +	if (restore)
> >  		lockdep_hardirqs_on_prepare();
> > -	}
> >  
> >  	ct_nmi_exit();
> >  	lockdep_hardirq_exit();
> > diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> > index a8dd1f27417cf..e234f264fb495 100644
> > --- a/kernel/entry/common.c
> > +++ b/kernel/entry/common.c
> > @@ -343,10 +343,9 @@ void noinstr irqentry_nmi_exit(struct pt_regs
> > *regs, irqentry_state_t irq_state)
> >  {
> >  	instrumentation_begin();
> >  	ftrace_nmi_exit();
> > -	if (irq_state.lockdep) {
> > -		trace_hardirqs_on_prepare();
> > +	trace_hardirqs_on_prepare();
> > +	if (irq_state.lockdep)
> >  		lockdep_hardirqs_on_prepare();
> > -	}
> >  	instrumentation_end();
> >  
> >  	ct_nmi_exit();
> > diff --git a/kernel/trace/trace_preemptirq.c
> > b/kernel/trace/trace_preemptirq.c
> > index 0c42b15c38004..fa45474fc54f1 100644
> > --- a/kernel/trace/trace_preemptirq.c
> > +++ b/kernel/trace/trace_preemptirq.c
> > @@ -58,7 +58,11 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
> >   */
> >  void trace_hardirqs_on_prepare(void)
> >  {
> > -	if (this_cpu_read(tracing_irq_cpu)) {
> > +	int tracing_count = this_cpu_read(tracing_irq_cpu);
> > +
> > +	if (in_nmi() && tracing_count > 1)
> > +		this_cpu_dec(tracing_irq_cpu);
> > +	else if (tracing_count) {
> >  		trace(irq_enable, TP_ARGS(CALLER_ADDR0,
> > CALLER_ADDR1));
> >  		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
> >  		this_cpu_write(tracing_irq_cpu, 0);
> > @@ -89,8 +93,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on);
> >   */
> >  void trace_hardirqs_off_finish(void)
> >  {
> > -	if (!this_cpu_read(tracing_irq_cpu)) {
> > -		this_cpu_write(tracing_irq_cpu, 1);
> > +	if (this_cpu_inc_return(tracing_irq_cpu) == 1) {
> >  		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
> >  		trace(irq_disable, TP_ARGS(CALLER_ADDR0,
> > CALLER_ADDR1));
> >  	}
> > @@ -103,8 +106,7 @@ void trace_hardirqs_off(void)
> >  {
> >  	lockdep_hardirqs_off(CALLER_ADDR0);
> >  
> > -	if (!this_cpu_read(tracing_irq_cpu)) {
> > -		this_cpu_write(tracing_irq_cpu, 1);
> > +	if (this_cpu_inc_return(tracing_irq_cpu) == 1) {
> >  		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
> >  		trace(irq_disable, TP_ARGS(CALLER_ADDR0,
> > CALLER_ADDR1));
> >  	}
> 
> So what about lovely things like:
> 
> trace_hardirqs_on_prepare()
>   if (tracing_irq_cpu) {
>     tracer_hardirqs_on();
>     <NMI>
>       trace_hardirqs_off_finish();
>         if (this_cpu_inc_return() == 1) // will be > 1
> 
> 
> So now we've traced IRQs are on, start an NMI, and loose the IRQs off
> event. Well done.
> 
> 
> This was all safe in that it would occasionally emit a duplicate
> state,
> but no state was wrong. Both your attempts have broken things.
> 
> How about you fix you tool to accept duplicate state in the face of
> NMI
> instead?

Alright, I get this is yet another broken solution.. Thanks for
pointing it out.

The problem here is that, from the model's perspective, we don't only
get duplicated events, but also missing ones:

 d..1. ..932: irq_enable:   caller=finish_task_switch
 d.h3. ..933: event_sncid:  can_sched x irq_disable -> cant_sched
 d.h2. ..933: irq_disable:  caller=irqentry_nmi_enter+0x53 
 d.h2. ..936: nmi_handler:  perf_event_nmi_handler()
< missing irq_enable from NMI >
 ...2. ..942: error_sncid:  event schedule_exit not expected in the
state cant_sched

[This monitor verifies exiting and entering __schedule occur only with
interrupts enabled]

I first thought this was just possible with racing NMIs, but as Thomas
pointed out [1], this can happen much easier when irqsoff is enabled
but lockdep is not.

I could probably only fix this without even considering NMIs when
interrupts are disabled, but I believe if that happens, the tracepoints
would report something wrong, since using tracing_irq_cpu alone does:

local_irq_disable -> trace_irq_off
  nmi_enter -> no trace
  nmi_exit -> trace_irq_on
  // here interrupts are still off, aren't they?
local_irq_enable -> no trace

The idea that I described poorly, was to use tracing_irq_cpu in a way
that the first context disabling interrupts fires the tracepoint
(current behaviour), but when it's time to enable interrupts, an NMI
which didn't disable interrupts shouldn't trace but let the next
context trace.

Thanks,
Gabriele

[1] - https://lore.kernel.org/lkml/87sejup1fe.ffs@tglx
Re: [PATCH v2] tracing: Fix inconsistency in irq tracking on NMIs
Posted by Thomas Gleixner 3 months, 1 week ago
On Wed, Jul 02 2025 at 09:18, Gabriele Monaco wrote:
> On Tue, 2025-07-01 at 14:54 +0200, Peter Zijlstra wrote:
> I could probably only fix this without even considering NMIs when
> interrupts are disabled, but I believe if that happens, the tracepoints
> would report something wrong, since using tracing_irq_cpu alone does:
>
> local_irq_disable -> trace_irq_off
>   nmi_enter -> no trace
>   nmi_exit -> trace_irq_on
>   // here interrupts are still off, aren't they?
> local_irq_enable -> no trace
>
> The idea that I described poorly, was to use tracing_irq_cpu in a way
> that the first context disabling interrupts fires the tracepoint
> (current behaviour), but when it's time to enable interrupts, an NMI
> which didn't disable interrupts shouldn't trace but let the next
> context trace.
...
> [1] - https://lore.kernel.org/lkml/87sejup1fe.ffs@tglx

As I told you before:

 "As you correctly described, the two states are asynchronous in the
  context of NMIs, so the obvious thing is to treat them as seperate
  entities so that the trace really contains the off/on events
  independent of lockdep enablement and lockdep's internal state. No?"

So I would have expected that you look at the lockdep implementation and
figure out why that one works correctly. Let's have a look at it:

nmi_entry()
   irq_state.lockdep = lockdep_hardirqs_enabled();

   lockdep_hardirqs_off(CALLER_ADDR0);
   ...
   return irq_state;

nmi_exit(irq_state)

   if (irq_state.lockdep)
	lockdep_hardirqs_on(CALLER_ADDR0);

On NMI entry the internal state of lockdep tracking is recorded and
lockdep_hardirqs_off() handles a redundant off gracefully. As I said it
might be arguable to invoke it only when irq_state.lockdep == true, but
that's a cosmetic or performance issue not a correctness problem.

On NMI exit lockdep_hardirqs_on() is only invoked when the lockdep internal
state at entry was 'enabled' so it can restore the state correctly.

If you model the tracer exactly the same way:

   irq_state.lockdep = lockdep_hardirqs_enabled();
   irq_state.tracer = tracer_hardirqs_enabled();
   ....

   You get the idea...

That will be "correct" in terms of sequencing depending on your trace side
implementation:

  trace_hardirqs_off()
        if (trace_hardirqs_enabled()) {
        	trace_irqs_enabled = false;     // A
                emit_trace(OFF);	        // B

If the NMI hits before A, everything is fine because it will record a
ON->OFF transition on NMI entry and a OFF->ON transition on NMI
exit. Then the interrupted context records a ON->OFF transition again.

If it hits between A and B, then the NMI tracing wont do anything
because enabled state is already false, but the resulting trace
sequencing is messed up:

     irqs on
     ...
     nmi_entry
     nmi_exit
     irqs off

IOW, it misses the ON->OFF OFF->ON transitions in the NMI.

So you want to do it the other way round:

  trace_hardirqs_off()
        if (trace_hardirqs_enabled()) {
                emit_trace(OFF);	        // A
        	trace_irqs_enabled = false;     // B

If the NMI hits before A, everything is fine because it will record a
ON->OFF transition on NMI entry and a OFF->ON transition on NMI
exit. Then the interrupted context records a ON->OFF transition again.

If it hits between A and B then you get a duplicate

     irqs on
     ...
     irqs off
     nmi_entry
     irqs off
     irqs on
     nmi_exit

There is nothing you can do about it, but that's easy enough to filter
out, no?

I'm pretty sure you can figure out how that should be modeled on the
on() side of affairs.

Thanks,

        tglx
Re: [PATCH v2] tracing: Fix inconsistency in irq tracking on NMIs
Posted by Thomas Gleixner 3 months, 1 week ago
On Wed, Jun 25 2025 at 14:08, Gabriele Monaco wrote:
> The irq_enable/irq_disable tracepoints fire only when there's an actual
> transition (enabled->disabled and vice versa), this needs special care

vice versa). This needs ...

> in NMIs, as they can potentially start with interrupts already disabled.
> The current implementation takes care of this by tracking the lockdep
> state on nmi_entry as well as using the variable tracing_irq_cpu to
> synchronise with other calls (e.g. local_irq_disable/enable).
>
> This can be racy in case of NMIs when lockdep is enabled, and can lead
> to missing events when lockdep is disabled.
>
> Remove dependency on the lockdep status in the NMI common entry/exit
> code and adapt the tracing code to make sure that:
>
> - The first call disabling interrupts fires the tracepoint
> - The first non-NMI call enabling interrupts fires the tracepoint
> - The last NMI call enabling interrupts fires the tracepoint unless
>   interrupts were disabled before the NMI
> - All other calls don't fire

Please mention, that you fix the same problem in the ARM64 specific variant.

> Fixes: ba1f2b2eaa2a ("x86/entry: Fix NMI vs IRQ state tracking")
> Fixes: f0cd5ac1e4c5 ("arm64: entry: fix NMI {user, kernel}->kernel transitions")
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> ---
>
> The inconsistency is visible with the sncid RV monitor and particularly
> likely on machines with the following setup:
> - x86 bare-metal with 40+ CPUs
> - tuned throughput-performance (activating regular perf NMIs)
> - workload: stress-ng --cpu-sched 21 --timer 11 --signal 11
>
> The presence of the RV monitor is useful to see the error but it is not
> necessary to trigger it.
>
> Changes since V1:
> * Reworded confusing changelog
> * Remove dependency on lockdep counters for tracepoints
> * Ensure we don't drop valid tracepoints
> * Extend change to arm64 code
>
>  arch/arm64/kernel/entry-common.c |  5 ++---
>  kernel/entry/common.c            |  5 ++---
>  kernel/trace/trace_preemptirq.c  | 12 +++++++-----
>  3 files changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> index 7c1970b341b8c..7f1844123642e 100644
> --- a/arch/arm64/kernel/entry-common.c
> +++ b/arch/arm64/kernel/entry-common.c
> @@ -213,10 +213,9 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs)
>  	bool restore = regs->lockdep_hardirqs;
>  
>  	ftrace_nmi_exit();
> -	if (restore) {
> -		trace_hardirqs_on_prepare();
> +	trace_hardirqs_on_prepare();
> +	if (restore)
>  		lockdep_hardirqs_on_prepare();
> -	}
>  
>  	ct_nmi_exit();
>  	lockdep_hardirq_exit();
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index a8dd1f27417cf..e234f264fb495 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -343,10 +343,9 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
>  {
>  	instrumentation_begin();
>  	ftrace_nmi_exit();
> -	if (irq_state.lockdep) {
> -		trace_hardirqs_on_prepare();
> +	trace_hardirqs_on_prepare();
> +	if (irq_state.lockdep)
>  		lockdep_hardirqs_on_prepare();
> -	}
>  	instrumentation_end();
>  
>  	ct_nmi_exit();
> diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
> index 0c42b15c38004..fa45474fc54f1 100644
> --- a/kernel/trace/trace_preemptirq.c
> +++ b/kernel/trace/trace_preemptirq.c
> @@ -58,7 +58,11 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
>   */
>  void trace_hardirqs_on_prepare(void)
>  {
> -	if (this_cpu_read(tracing_irq_cpu)) {
> +	int tracing_count = this_cpu_read(tracing_irq_cpu);
> +
> +	if (in_nmi() && tracing_count > 1)
> +		this_cpu_dec(tracing_irq_cpu);

This if clause wants curly brackets and please add a comment explaining
this in_nmi() magic. Two month down the road everyone forgot including
you :)

Thanks,

        tglx