[v1] PM: dpm: add module param to backtrace all CPUs

[RFC PATCH] PM: dpm: add module param to backtrace all CPUs

Posted by Sergey Senozhatsky 6 months, 2 weeks ago

Add a dpm_all_cpu_backtrace module parameter which controls
whether backtraces of all CPUs are dumped before DPM panics
the system.  This is expected to help understand what might
have caused the device timeout.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 drivers/base/power/main.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index dbf5456cd891..9fb943afe246 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -34,6 +34,7 @@
 #include <linux/cpufreq.h>
 #include <linux/devfreq.h>
 #include <linux/timer.h>
+#include <linux/nmi.h>
 
 #include "../base.h"
 #include "power.h"
@@ -517,6 +518,9 @@ struct dpm_watchdog {
 #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
 	struct dpm_watchdog wd
 
+static bool __read_mostly dpm_all_cpu_backtrace;
+module_param(dpm_all_cpu_backtrace, bool, 0644);
+
 /**
  * dpm_watchdog_handler - Driver suspend / resume watchdog handler.
  * @t: The timer that PM watchdog depends on.
@@ -532,8 +536,12 @@ static void dpm_watchdog_handler(struct timer_list *t)
 	unsigned int time_left;
 
 	if (wd->fatal) {
+		unsigned int this_cpu = smp_processor_id();
+
 		dev_emerg(wd->dev, "**** DPM device timeout ****\n");
 		show_stack(wd->tsk, NULL, KERN_EMERG);
+		if (dpm_all_cpu_backtrace)
+			trigger_single_cpu_backtrace(this_cpu);
 		panic("%s %s: unrecoverable failure\n",
 			dev_driver_string(wd->dev), dev_name(wd->dev));
 	}
-- 
2.50.0.727.gbf7dc18ff4-goog

Re: [RFC PATCH] PM: dpm: add module param to backtrace all CPUs

Posted by Sergey Senozhatsky 6 months, 2 weeks ago

On (25/07/23 12:59), Sergey Senozhatsky wrote:
> Add dpm_all_cpu_backtrace module parameter which controls
> all CPU backtrace dump before DPM panics the system.  This
> is expected to help understanding what might have caused
> device timeout.
> 
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
>  drivers/base/power/main.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
> index dbf5456cd891..9fb943afe246 100644
> --- a/drivers/base/power/main.c
> +++ b/drivers/base/power/main.c
> @@ -34,6 +34,7 @@
>  #include <linux/cpufreq.h>
>  #include <linux/devfreq.h>
>  #include <linux/timer.h>
> +#include <linux/nmi.h>
>  
>  #include "../base.h"
>  #include "power.h"
> @@ -517,6 +518,9 @@ struct dpm_watchdog {
>  #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
>  	struct dpm_watchdog wd
>  
> +static bool __read_mostly dpm_all_cpu_backtrace;
> +module_param(dpm_all_cpu_backtrace, bool, 0644);
> +
>  /**
>   * dpm_watchdog_handler - Driver suspend / resume watchdog handler.
>   * @t: The timer that PM watchdog depends on.
> @@ -532,8 +536,12 @@ static void dpm_watchdog_handler(struct timer_list *t)
>  	unsigned int time_left;
>  
>  	if (wd->fatal) {
> +		unsigned int this_cpu = smp_processor_id();
> +
>  		dev_emerg(wd->dev, "**** DPM device timeout ****\n");
>  		show_stack(wd->tsk, NULL, KERN_EMERG);
> +		if (dpm_all_cpu_backtrace)
> +			trigger_single_cpu_backtrace(this_cpu);

This is silly, I do apologize.  This should be

			trigger_allbutcpu_cpu_backtrace(this_cpu);

We want to backtrace all CPUs, except the current one.  A silly
copy-paste mistake.