drivers/hwtracing/intel_th/core.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-)
In certain cases intel_th_irq can reference a null entry in
the th->thdev array. This results in the splat shown below.
The problem is that intel_th_output_enable() can modify the
thdev[] array at the same time intel_th_irq is referencing
the same array. This can be fixed by disabling interrupts
during the call to intel_th_output_enable().
BUG: kernel NULL pointer dereference, address: 0000000000000304
Oops: Oops: 0000 [#1] SMP NOPTI
RIP: 0010:intel_th_irq+0x26/0x70 [intel_th]
Call Trace:
<IRQ>
? show_trace_log_lvl+0x1b0/0x2f0
? show_trace_log_lvl+0x1b0/0x2f0
? __handle_irq_event_percpu+0x4a/0x180
? __die_body.cold+0x8/0x12
? page_fault_oops+0x148/0x160
? exc_page_fault+0x73/0x160
? asm_exc_page_fault+0x26/0x30
? intel_th_irq+0x26/0x70 [intel_th]
__handle_irq_event_percpu+0x4a/0x180
handle_irq_event+0x38/0x80
handle_fasteoi_irq+0x78/0x200
__common_interrupt+0x3e/0x90
common_interrupt+0x80/0xa0
</IRQ>
Fixes: a753bfcfdb1f ("intel_th: Make the switch allocate its subdevices")
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: linux-kernel@vger.kernel.org
Cc: Jerry Hoemann <jerry.hoemann@hpe.com>
Signed-off-by: David Arcari <darcari@redhat.com>
---
drivers/hwtracing/intel_th/core.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/drivers/hwtracing/intel_th/core.c b/drivers/hwtracing/intel_th/core.c
index 47d9e6c3bac0..c6f6153fcc88 100644
--- a/drivers/hwtracing/intel_th/core.c
+++ b/drivers/hwtracing/intel_th/core.c
@@ -715,7 +715,9 @@ intel_th_subdevice_alloc(struct intel_th *th,
int intel_th_output_enable(struct intel_th *th, unsigned int otype)
{
struct intel_th_device *thdev;
- int src = 0, dst = 0;
+ int src = 0, dst = 0, ret = 0;
+
+ disable_irq(th->irq);
for (src = 0, dst = 0; dst <= th->num_thdevs; src++, dst++) {
for (; src < ARRAY_SIZE(intel_th_subdevices); src++) {
@@ -730,7 +732,7 @@ int intel_th_output_enable(struct intel_th *th, unsigned int otype)
/* no unallocated matching subdevices */
if (src == ARRAY_SIZE(intel_th_subdevices))
- return -ENODEV;
+ goto nodev;
for (; dst < th->num_thdevs; dst++) {
if (th->thdev[dst]->type != INTEL_TH_OUTPUT)
@@ -750,16 +752,19 @@ int intel_th_output_enable(struct intel_th *th, unsigned int otype)
goto found;
}
+nodev:
+ enable_irq(th->irq);
return -ENODEV;
found:
thdev = intel_th_subdevice_alloc(th, &intel_th_subdevices[src]);
if (IS_ERR(thdev))
- return PTR_ERR(thdev);
-
- th->thdev[th->num_thdevs++] = thdev;
+ ret = PTR_ERR(thdev);
+ else
+ th->thdev[th->num_thdevs++] = thdev;
- return 0;
+ enable_irq(th->irq);
+ return ret;
}
EXPORT_SYMBOL_GPL(intel_th_output_enable);
--
2.50.1
… > +++ b/drivers/hwtracing/intel_th/core.c > @@ -715,7 +715,9 @@ intel_th_subdevice_alloc(struct intel_th *th, > int intel_th_output_enable(struct intel_th *th, unsigned int otype) > { > struct intel_th_device *thdev; > - int src = 0, dst = 0; > + int src = 0, dst = 0, ret = 0; > + > + disable_irq(th->irq); … > - return 0; > + enable_irq(th->irq); > + return ret; > } … How do you think about to increase the application of scope-based resource management? https://elixir.bootlin.com/linux/v6.17-rc7/source/include/linux/interrupt.h#L239-L240 Regards, Markus
Hi Markus, On 9/27/25 10:54 AM, Markus Elfring wrote: > … >> +++ b/drivers/hwtracing/intel_th/core.c >> @@ -715,7 +715,9 @@ intel_th_subdevice_alloc(struct intel_th *th, >> int intel_th_output_enable(struct intel_th *th, unsigned int otype) >> { >> struct intel_th_device *thdev; >> - int src = 0, dst = 0; >> + int src = 0, dst = 0, ret = 0; >> + >> + disable_irq(th->irq); > … >> - return 0; >> + enable_irq(th->irq); >> + return ret; >> } > … > > How do you think about to increase the application of scope-based resource management? > https://elixir.bootlin.com/linux/v6.17-rc7/source/include/linux/interrupt.h#L239-L240 At this point, I think that Alex is the best person to handle the resolution of this issue. Best, -DA > > Regards, > Markus >
On 2025-08-25 20:45, David Arcari wrote: > In certain cases intel_th_irq can reference a null entry in > the th->thdev array. This results in the splat shown below. > The problem is that intel_th_output_enable() can modify the > thdev[] array at the same time intel_th_irq is referencing > the same array. This can be fixed by disabling interrupts > during the call to intel_th_output_enable(). Hi David, Thank you for the bug report and rootcausing! Can you please also detail the sequence of actions by which this is reproduced, so that I can test my fix and not bother you with a back-and-forth over-email debugging and also add it to our regression testing? Doesn't have to be a shell script (although I wouldn't say no to that), plain english would work in a pinch. If you have the time, I'm also curious about your use case for intel_th. This has eluded our testing for about 10 years, so I'm very interested in the reproducer. > BUG: kernel NULL pointer dereference, address: 0000000000000304 > Oops: Oops: 0000 [#1] SMP NOPTI > RIP: 0010:intel_th_irq+0x26/0x70 [intel_th] Yes, this is absolutely a bug. > @@ -715,7 +715,9 @@ intel_th_subdevice_alloc(struct intel_th *th, > int intel_th_output_enable(struct intel_th *th, unsigned int otype) > { > struct intel_th_device *thdev; > - int src = 0, dst = 0; > + int src = 0, dst = 0, ret = 0; > + > + disable_irq(th->irq); > > for (src = 0, dst = 0; dst <= th->num_thdevs; src++, dst++) { > for (; src < ARRAY_SIZE(intel_th_subdevices); src++) { [...] > @@ -750,16 +752,19 @@ int intel_th_output_enable(struct intel_th *th, > unsigned int otype) > goto found; > } > > +nodev: > + enable_irq(th->irq); > return -ENODEV; > > found: > thdev = intel_th_subdevice_alloc(th, &intel_th_subdevices[src]); > if (IS_ERR(thdev)) > - return PTR_ERR(thdev); > - > - th->thdev[th->num_thdevs++] = thdev; > + ret = PTR_ERR(thdev); > + else > + th->thdev[th->num_thdevs++] = thdev; > > - return 0; > + enable_irq(th->irq); > + return ret; > } > EXPORT_SYMBOL_GPL(intel_th_output_enable); This is indeed a possible fix, but I believe a little bit of serialization can be employed here. Lastly, my apologies for tardiness. Thanks! -- Alex
Hi Alex, On 9/26/25 12:19 PM, alex@ash.works wrote: > On 2025-08-25 20:45, David Arcari wrote: >> In certain cases intel_th_irq can reference a null entry in >> the th->thdev array. This results in the splat shown below. >> The problem is that intel_th_output_enable() can modify the >> thdev[] array at the same time intel_th_irq is referencing >> the same array. This can be fixed by disabling interrupts >> during the call to intel_th_output_enable(). > > Hi David, > > Thank you for the bug report and rootcausing! Can you please also > detail the sequence of actions by which this is reproduced, so > that I can test my fix and not bother you with a back-and-forth > over-email debugging and also add it to our regression testing? > Doesn't have to be a shell script (although I wouldn't say no > to that), plain english would work in a pinch. If you have the > time, I'm also curious about your use case for intel_th. Unfortunately, I don't have a great reproducer. I have a system which is afflicted in ~ 1 out of every 100 reboots. Adding debug code made the problem easier to reproduce. > > This has eluded our testing for about 10 years, so I'm very > interested in the reproducer. > >> BUG: kernel NULL pointer dereference, address: 0000000000000304 >> Oops: Oops: 0000 [#1] SMP NOPTI >> RIP: 0010:intel_th_irq+0x26/0x70 [intel_th] > > Yes, this is absolutely a bug. > >> @@ -715,7 +715,9 @@ intel_th_subdevice_alloc(struct intel_th *th, >> int intel_th_output_enable(struct intel_th *th, unsigned int otype) >> { >> struct intel_th_device *thdev; >> - int src = 0, dst = 0; >> + int src = 0, dst = 0, ret = 0; >> + >> + disable_irq(th->irq); >> >> for (src = 0, dst = 0; dst <= th->num_thdevs; src++, dst++) { >> for (; src < ARRAY_SIZE(intel_th_subdevices); src++) { > > [...] > >> @@ -750,16 +752,19 @@ int intel_th_output_enable(struct intel_th *th, >> unsigned int otype) >> goto found; >> } >> >> +nodev: >> + enable_irq(th->irq); >> return -ENODEV; >> >> found: >> thdev = intel_th_subdevice_alloc(th, &intel_th_subdevices[src]); >> if (IS_ERR(thdev)) >> - return PTR_ERR(thdev); >> - >> - th->thdev[th->num_thdevs++] = thdev; >> + ret = PTR_ERR(thdev); >> + else >> + th->thdev[th->num_thdevs++] = thdev; >> >> - return 0; >> + enable_irq(th->irq); >> + return ret; >> } >> EXPORT_SYMBOL_GPL(intel_th_output_enable); > > This is indeed a possible fix, but I believe a little bit of > serialization can be employed here. I was thinking there was a better approach. Given the situation if you'd like me to test a fix, I can do so, > > Lastly, my apologies for tardiness. No worries. Best, -DA > > Thanks! > -- > Alex >
Hi, On 8/25/25 1:45 PM, David Arcari wrote: > In certain cases intel_th_irq can reference a null entry in > the th->thdev array. This results in the splat shown below. > The problem is that intel_th_output_enable() can modify the > thdev[] array at the same time intel_th_irq is referencing > the same array. This can be fixed by disabling interrupts > during the call to intel_th_output_enable(). > > BUG: kernel NULL pointer dereference, address: 0000000000000304 > Oops: Oops: 0000 [#1] SMP NOPTI > RIP: 0010:intel_th_irq+0x26/0x70 [intel_th] > Call Trace: > <IRQ> > ? show_trace_log_lvl+0x1b0/0x2f0 > ? show_trace_log_lvl+0x1b0/0x2f0 > ? __handle_irq_event_percpu+0x4a/0x180 > ? __die_body.cold+0x8/0x12 > ? page_fault_oops+0x148/0x160 > ? exc_page_fault+0x73/0x160 > ? asm_exc_page_fault+0x26/0x30 > ? intel_th_irq+0x26/0x70 [intel_th] > __handle_irq_event_percpu+0x4a/0x180 > handle_irq_event+0x38/0x80 > handle_fasteoi_irq+0x78/0x200 > __common_interrupt+0x3e/0x90 > common_interrupt+0x80/0xa0 > </IRQ> > > Fixes: a753bfcfdb1f ("intel_th: Make the switch allocate its subdevices") > Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> > Cc: linux-kernel@vger.kernel.org > Cc: Jerry Hoemann <jerry.hoemann@hpe.com> > Signed-off-by: David Arcari <darcari@redhat.com> > --- > drivers/hwtracing/intel_th/core.c | 17 +++++++++++------ > 1 file changed, 11 insertions(+), 6 deletions(-) > > diff --git a/drivers/hwtracing/intel_th/core.c b/drivers/hwtracing/intel_th/core.c > index 47d9e6c3bac0..c6f6153fcc88 100644 > --- a/drivers/hwtracing/intel_th/core.c > +++ b/drivers/hwtracing/intel_th/core.c > @@ -715,7 +715,9 @@ intel_th_subdevice_alloc(struct intel_th *th, > int intel_th_output_enable(struct intel_th *th, unsigned int otype) > { > struct intel_th_device *thdev; > - int src = 0, dst = 0; > + int src = 0, dst = 0, ret = 0; > + > + disable_irq(th->irq); > > for (src = 0, dst = 0; dst <= th->num_thdevs; src++, dst++) { > for (; src < ARRAY_SIZE(intel_th_subdevices); src++) { > @@ -730,7 +732,7 @@ int intel_th_output_enable(struct intel_th *th, unsigned int otype) > > /* no unallocated matching subdevices */ > if (src == ARRAY_SIZE(intel_th_subdevices)) > - return -ENODEV; > + goto nodev; > > for (; dst < th->num_thdevs; dst++) { > if (th->thdev[dst]->type != INTEL_TH_OUTPUT) > @@ -750,16 +752,19 @@ int intel_th_output_enable(struct intel_th *th, unsigned int otype) > goto found; > } > > +nodev: > + enable_irq(th->irq); > return -ENODEV; > > found: > thdev = intel_th_subdevice_alloc(th, &intel_th_subdevices[src]); > if (IS_ERR(thdev)) > - return PTR_ERR(thdev); > - > - th->thdev[th->num_thdevs++] = thdev; > + ret = PTR_ERR(thdev); > + else > + th->thdev[th->num_thdevs++] = thdev; > > - return 0; > + enable_irq(th->irq); > + return ret; > } > EXPORT_SYMBOL_GPL(intel_th_output_enable); > I suspect there may be a better approach to this problem, but I did want to add that after extensive testing this did resolve the issue. -DA
© 2016 - 2025 Red Hat, Inc.