drivers/base/power/main.c | 10 ++++++++++ 1 file changed, 10 insertions(+)
Add dpm_watchdog_all_cpu_backtrace module parameter which
controls all CPU backtrace dump before DPM panics the system.
This is expected to help understanding what might have caused
device timeout.
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
drivers/base/power/main.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index e83503bdc1fd..7a8807ec9a5d 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -34,6 +34,7 @@
#include <linux/cpufreq.h>
#include <linux/devfreq.h>
#include <linux/timer.h>
+#include <linux/nmi.h>
#include "../base.h"
#include "power.h"
@@ -515,6 +516,11 @@ struct dpm_watchdog {
#define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
struct dpm_watchdog wd
+static bool __read_mostly dpm_watchdog_all_cpu_backtrace;
+module_param(dpm_watchdog_all_cpu_backtrace, bool, 0644);
+MODULE_PARM_DESC(dpm_watchdog_all_cpu_backtrace,
+ "Backtrace all CPUs on DPM watchdog timeout");
+
/**
* dpm_watchdog_handler - Driver suspend / resume watchdog handler.
* @t: The timer that PM watchdog depends on.
@@ -530,8 +536,12 @@ static void dpm_watchdog_handler(struct timer_list *t)
unsigned int time_left;
if (wd->fatal) {
+ unsigned int this_cpu = smp_processor_id();
+
dev_emerg(wd->dev, "**** DPM device timeout ****\n");
show_stack(wd->tsk, NULL, KERN_EMERG);
+ if (dpm_watchdog_all_cpu_backtrace)
+ trigger_allbutcpu_cpu_backtrace(this_cpu);
panic("%s %s: unrecoverable failure\n",
dev_driver_string(wd->dev), dev_name(wd->dev));
}
--
2.51.0.618.g983fd99d29-goog
On Oct 07, 2025 at 15:35:40 +0900, Sergey Senozhatsky wrote:
> Add dpm_watchdog_all_cpu_backtrace module parameter which
> controls all CPU backtrace dump before DPM panics the system.
> This is expected to help understanding what might have caused
> device timeout.
This will indeed be really helpful for debugging some nasty bugs!
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
> drivers/base/power/main.c | 10 ++++++++++
> 1 file changed, 10 insertions(+)
>
> diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
> index e83503bdc1fd..7a8807ec9a5d 100644
> --- a/drivers/base/power/main.c
> +++ b/drivers/base/power/main.c
> @@ -34,6 +34,7 @@
> #include <linux/cpufreq.h>
> #include <linux/devfreq.h>
> #include <linux/timer.h>
> +#include <linux/nmi.h>
>
> #include "../base.h"
> #include "power.h"
> @@ -515,6 +516,11 @@ struct dpm_watchdog {
> #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
> struct dpm_watchdog wd
>
> +static bool __read_mostly dpm_watchdog_all_cpu_backtrace;
> +module_param(dpm_watchdog_all_cpu_backtrace, bool, 0644);
> +MODULE_PARM_DESC(dpm_watchdog_all_cpu_backtrace,
> + "Backtrace all CPUs on DPM watchdog timeout");
> +
Have you considered runtime configurability instead of a module param?
> /**
> * dpm_watchdog_handler - Driver suspend / resume watchdog handler.
> * @t: The timer that PM watchdog depends on.
> @@ -530,8 +536,12 @@ static void dpm_watchdog_handler(struct timer_list *t)
> unsigned int time_left;
>
> if (wd->fatal) {
> + unsigned int this_cpu = smp_processor_id();
> +
> dev_emerg(wd->dev, "**** DPM device timeout ****\n");
> show_stack(wd->tsk, NULL, KERN_EMERG);
> + if (dpm_watchdog_all_cpu_backtrace)
> + trigger_allbutcpu_cpu_backtrace(this_cpu);
IMO it would be useful to check the ret val of this as well, I mean just
incase this silently returns false it maybe confusing to figure out what
hapenned in the system inspite of setting the mod param.
--
Best regards,
Dhruva Gole
Texas Instruments Incorporated
On (25/10/08 15:44), Dhruva Gole wrote: [..] > > dev_emerg(wd->dev, "**** DPM device timeout ****\n"); > > show_stack(wd->tsk, NULL, KERN_EMERG); > > + if (dpm_watchdog_all_cpu_backtrace) > > + trigger_allbutcpu_cpu_backtrace(this_cpu); > > IMO it would be useful to check the ret val of this as well, I mean just > incase this silently returns false it maybe confusing to figure out what > hapenned in the system inspite of setting the mod param. Honestly, I haven't seen a system that constantly modifies its modules' params at runtime. It's usually a pretty static configuration, so I'm not sure if this will address any real world problem.
On Wed, Oct 8, 2025 at 12:14 PM Dhruva Gole <d-gole@ti.com> wrote:
>
> On Oct 07, 2025 at 15:35:40 +0900, Sergey Senozhatsky wrote:
> > Add dpm_watchdog_all_cpu_backtrace module parameter which
> > controls all CPU backtrace dump before DPM panics the system.
> > This is expected to help understanding what might have caused
> > device timeout.
>
> This will indeed be really helpful for debugging some nasty bugs!
>
> >
> > Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> > ---
> > drivers/base/power/main.c | 10 ++++++++++
> > 1 file changed, 10 insertions(+)
> >
> > diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
> > index e83503bdc1fd..7a8807ec9a5d 100644
> > --- a/drivers/base/power/main.c
> > +++ b/drivers/base/power/main.c
> > @@ -34,6 +34,7 @@
> > #include <linux/cpufreq.h>
> > #include <linux/devfreq.h>
> > #include <linux/timer.h>
> > +#include <linux/nmi.h>
> >
> > #include "../base.h"
> > #include "power.h"
> > @@ -515,6 +516,11 @@ struct dpm_watchdog {
> > #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
> > struct dpm_watchdog wd
> >
> > +static bool __read_mostly dpm_watchdog_all_cpu_backtrace;
> > +module_param(dpm_watchdog_all_cpu_backtrace, bool, 0644);
> > +MODULE_PARM_DESC(dpm_watchdog_all_cpu_backtrace,
> > + "Backtrace all CPUs on DPM watchdog timeout");
> > +
>
> Have you considered runtime configurability instead of a module param?
This one can be updated at run time AFAICS.
On Oct 08, 2025 at 13:23:33 +0200, Rafael J. Wysocki wrote:
> On Wed, Oct 8, 2025 at 12:14 PM Dhruva Gole <d-gole@ti.com> wrote:
> >
> > On Oct 07, 2025 at 15:35:40 +0900, Sergey Senozhatsky wrote:
> > > Add dpm_watchdog_all_cpu_backtrace module parameter which
> > > controls all CPU backtrace dump before DPM panics the system.
> > > This is expected to help understanding what might have caused
> > > device timeout.
> >
> > This will indeed be really helpful for debugging some nasty bugs!
> >
> > >
> > > Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> > > ---
> > > drivers/base/power/main.c | 10 ++++++++++
> > > 1 file changed, 10 insertions(+)
> > >
> > > diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
> > > index e83503bdc1fd..7a8807ec9a5d 100644
> > > --- a/drivers/base/power/main.c
> > > +++ b/drivers/base/power/main.c
> > > @@ -34,6 +34,7 @@
> > > #include <linux/cpufreq.h>
> > > #include <linux/devfreq.h>
> > > #include <linux/timer.h>
> > > +#include <linux/nmi.h>
> > >
> > > #include "../base.h"
> > > #include "power.h"
> > > @@ -515,6 +516,11 @@ struct dpm_watchdog {
> > > #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
> > > struct dpm_watchdog wd
> > >
> > > +static bool __read_mostly dpm_watchdog_all_cpu_backtrace;
> > > +module_param(dpm_watchdog_all_cpu_backtrace, bool, 0644);
> > > +MODULE_PARM_DESC(dpm_watchdog_all_cpu_backtrace,
> > > + "Backtrace all CPUs on DPM watchdog timeout");
> > > +
> >
> > Have you considered runtime configurability instead of a module param?
>
> This one can be updated at run time AFAICS.
What I meant really was to consider another path instead of a mod param,
something like a /sys/kernel/ entry, but looking more into it I think this
is perfectly fine too.
Reviewed-by: Dhruva Gole <d-gole@ti.com>
--
Best regards,
Dhruva Gole
Texas Instruments Incorporated
On (25/10/08 18:32), Dhruva Gole wrote: > What I meant really was to consider another path instead of a mod param, > something like a /sys/kernel/ Modules' params are exposed to sysfs and are writeable.
On Wed, Oct 8, 2025 at 3:10 PM Sergey Senozhatsky <senozhatsky@chromium.org> wrote: > > On (25/10/08 18:32), Dhruva Gole wrote: > > What I meant really was to consider another path instead of a mod param, > > something like a /sys/kernel/ > > Modules' params are exposed to sysfs and are writeable. Some of them aren't writable, but this particular one is.
On Tue, Oct 7, 2025 at 3:36 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> Add dpm_watchdog_all_cpu_backtrace module parameter which
> controls all CPU backtrace dump before DPM panics the system.
> This is expected to help understanding what might have caused
> device timeout.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
> drivers/base/power/main.c | 10 ++++++++++
> 1 file changed, 10 insertions(+)
>
> diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
> index e83503bdc1fd..7a8807ec9a5d 100644
> --- a/drivers/base/power/main.c
> +++ b/drivers/base/power/main.c
> @@ -34,6 +34,7 @@
> #include <linux/cpufreq.h>
> #include <linux/devfreq.h>
> #include <linux/timer.h>
> +#include <linux/nmi.h>
>
> #include "../base.h"
> #include "power.h"
> @@ -515,6 +516,11 @@ struct dpm_watchdog {
> #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
> struct dpm_watchdog wd
>
> +static bool __read_mostly dpm_watchdog_all_cpu_backtrace;
> +module_param(dpm_watchdog_all_cpu_backtrace, bool, 0644);
> +MODULE_PARM_DESC(dpm_watchdog_all_cpu_backtrace,
> + "Backtrace all CPUs on DPM watchdog timeout");
> +
> /**
> * dpm_watchdog_handler - Driver suspend / resume watchdog handler.
> * @t: The timer that PM watchdog depends on.
> @@ -530,8 +536,12 @@ static void dpm_watchdog_handler(struct timer_list *t)
> unsigned int time_left;
>
> if (wd->fatal) {
> + unsigned int this_cpu = smp_processor_id();
> +
> dev_emerg(wd->dev, "**** DPM device timeout ****\n");
> show_stack(wd->tsk, NULL, KERN_EMERG);
> + if (dpm_watchdog_all_cpu_backtrace)
> + trigger_allbutcpu_cpu_backtrace(this_cpu);
> panic("%s %s: unrecoverable failure\n",
> dev_driver_string(wd->dev), dev_name(wd->dev));
> }
> --
> 2.51.0.618.g983fd99d29-goog
>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Best,
Tomasz
On Tue, Oct 7, 2025 at 9:51 AM Tomasz Figa <tfiga@chromium.org> wrote:
>
> On Tue, Oct 7, 2025 at 3:36 PM Sergey Senozhatsky
> <senozhatsky@chromium.org> wrote:
> >
> > Add dpm_watchdog_all_cpu_backtrace module parameter which
> > controls all CPU backtrace dump before DPM panics the system.
> > This is expected to help understanding what might have caused
> > device timeout.
> >
> > Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> > ---
> > drivers/base/power/main.c | 10 ++++++++++
> > 1 file changed, 10 insertions(+)
> >
> > diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
> > index e83503bdc1fd..7a8807ec9a5d 100644
> > --- a/drivers/base/power/main.c
> > +++ b/drivers/base/power/main.c
> > @@ -34,6 +34,7 @@
> > #include <linux/cpufreq.h>
> > #include <linux/devfreq.h>
> > #include <linux/timer.h>
> > +#include <linux/nmi.h>
> >
> > #include "../base.h"
> > #include "power.h"
> > @@ -515,6 +516,11 @@ struct dpm_watchdog {
> > #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
> > struct dpm_watchdog wd
> >
> > +static bool __read_mostly dpm_watchdog_all_cpu_backtrace;
> > +module_param(dpm_watchdog_all_cpu_backtrace, bool, 0644);
> > +MODULE_PARM_DESC(dpm_watchdog_all_cpu_backtrace,
> > + "Backtrace all CPUs on DPM watchdog timeout");
> > +
> > /**
> > * dpm_watchdog_handler - Driver suspend / resume watchdog handler.
> > * @t: The timer that PM watchdog depends on.
> > @@ -530,8 +536,12 @@ static void dpm_watchdog_handler(struct timer_list *t)
> > unsigned int time_left;
> >
> > if (wd->fatal) {
> > + unsigned int this_cpu = smp_processor_id();
> > +
> > dev_emerg(wd->dev, "**** DPM device timeout ****\n");
> > show_stack(wd->tsk, NULL, KERN_EMERG);
> > + if (dpm_watchdog_all_cpu_backtrace)
> > + trigger_allbutcpu_cpu_backtrace(this_cpu);
> > panic("%s %s: unrecoverable failure\n",
> > dev_driver_string(wd->dev), dev_name(wd->dev));
> > }
> > --
> > 2.51.0.618.g983fd99d29-goog
> >
>
> Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Applied as 6.19 material with some edits in the subject and changelog, thanks!
© 2016 - 2026 Red Hat, Inc.