[PATCH] PM / core: Allow configuring the DPM watchdog to warn earlier than panic

Douglas Anderson posted 1 patch 1 year, 1 month ago
There is a newer version of this series
drivers/base/power/main.c | 22 ++++++++++++++++++----
kernel/power/Kconfig      |  8 +++++++-
2 files changed, 25 insertions(+), 5 deletions(-)
[PATCH] PM / core: Allow configuring the DPM watchdog to warn earlier than panic
Posted by Douglas Anderson 1 year, 1 month ago
Allow configuring the DPM watchdog to warn about slow suspend/resume
functions without causing a system panic(). This allows you to set the
DPM_WATCHDOG_WARNING_TIMEOUT to something like 5 or 10 seconds to get
warnings about slow suspend/resume functions that eventually succeed.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
---

 drivers/base/power/main.c | 22 ++++++++++++++++++----
 kernel/power/Kconfig      |  8 +++++++-
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 4a67e83300e1..239bcf93f821 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -496,6 +496,7 @@ struct dpm_watchdog {
 	struct device		*dev;
 	struct task_struct	*tsk;
 	struct timer_list	timer;
+	bool			fatal;
 };
 
 #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
@@ -512,11 +513,23 @@ struct dpm_watchdog {
 static void dpm_watchdog_handler(struct timer_list *t)
 {
 	struct dpm_watchdog *wd = from_timer(wd, t, timer);
+	struct timer_list *timer = &wd->timer;
+	unsigned int time_left;
+
+	if (wd->fatal) {
+		dev_emerg(wd->dev, "**** DPM device timeout ****\n");
+		show_stack(wd->tsk, NULL, KERN_EMERG);
+		panic("%s %s: unrecoverable failure\n",
+			dev_driver_string(wd->dev), dev_name(wd->dev));
+	}
 
-	dev_emerg(wd->dev, "**** DPM device timeout ****\n");
+	time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
+	dev_emerg(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n",
+		  CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left);
 	show_stack(wd->tsk, NULL, KERN_EMERG);
-	panic("%s %s: unrecoverable failure\n",
-		dev_driver_string(wd->dev), dev_name(wd->dev));
+
+	wd->fatal = true;
+	mod_timer(timer, jiffies + HZ * time_left);
 }
 
 /**
@@ -530,10 +543,11 @@ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev)
 
 	wd->dev = dev;
 	wd->tsk = current;
+	wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
 
 	timer_setup_on_stack(timer, dpm_watchdog_handler, 0);
 	/* use same timeout value for both suspend and resume */
-	timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT;
+	timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
 	add_timer(timer);
 }
 
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index afce8130d8b9..3387b94e76c1 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -257,11 +257,17 @@ config DPM_WATCHDOG
 	  boot session.
 
 config DPM_WATCHDOG_TIMEOUT
-	int "Watchdog timeout in seconds"
+	int "Watchdog timeout to panic in seconds"
 	range 1 120
 	default 120
 	depends on DPM_WATCHDOG
 
+config DPM_WATCHDOG_WARNING_TIMEOUT
+	int "Watchdog timeout to warn in seconds"
+	range 1 DPM_WATCHDOG_TIMEOUT
+	default DPM_WATCHDOG_TIMEOUT
+	depends on DPM_WATCHDOG
+
 config PM_TRACE
 	bool
 	help
-- 
2.47.1.613.gc27f4b7a9f-goog
Re: [PATCH] PM / core: Allow configuring the DPM watchdog to warn earlier than panic
Posted by Tomasz Figa 1 year, 1 month ago
Hi Doug,

On Thu, Dec 19, 2024 at 2:56 AM Douglas Anderson <dianders@chromium.org> wrote:
>
> Allow configuring the DPM watchdog to warn about slow suspend/resume
> functions without causing a system panic(). This allows you to set the
> DPM_WATCHDOG_WARNING_TIMEOUT to something like 5 or 10 seconds to get
> warnings about slow suspend/resume functions that eventually succeed.

Thanks for the patch. I agree that it would be very useful to avoid
completely bringing the user's system down, while still having a
warning that lets them notice that something is not right.

That said, please see my comments below.

>
> Signed-off-by: Douglas Anderson <dianders@chromium.org>
> ---
>
>  drivers/base/power/main.c | 22 ++++++++++++++++++----
>  kernel/power/Kconfig      |  8 +++++++-
>  2 files changed, 25 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
> index 4a67e83300e1..239bcf93f821 100644
> --- a/drivers/base/power/main.c
> +++ b/drivers/base/power/main.c
> @@ -496,6 +496,7 @@ struct dpm_watchdog {
>         struct device           *dev;
>         struct task_struct      *tsk;
>         struct timer_list       timer;
> +       bool                    fatal;
>  };
>
>  #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
> @@ -512,11 +513,23 @@ struct dpm_watchdog {
>  static void dpm_watchdog_handler(struct timer_list *t)
>  {
>         struct dpm_watchdog *wd = from_timer(wd, t, timer);
> +       struct timer_list *timer = &wd->timer;
> +       unsigned int time_left;
> +
> +       if (wd->fatal) {
> +               dev_emerg(wd->dev, "**** DPM device timeout ****\n");
> +               show_stack(wd->tsk, NULL, KERN_EMERG);
> +               panic("%s %s: unrecoverable failure\n",
> +                       dev_driver_string(wd->dev), dev_name(wd->dev));
> +       }
>
> -       dev_emerg(wd->dev, "**** DPM device timeout ****\n");
> +       time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
> +       dev_emerg(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n",
> +                 CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left);

Should this be printed using dev_warn() instead, since it's not fatal?

>         show_stack(wd->tsk, NULL, KERN_EMERG);
> -       panic("%s %s: unrecoverable failure\n",
> -               dev_driver_string(wd->dev), dev_name(wd->dev));
> +
> +       wd->fatal = true;
> +       mod_timer(timer, jiffies + HZ * time_left);
>  }
>
>  /**
> @@ -530,10 +543,11 @@ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev)
>
>         wd->dev = dev;
>         wd->tsk = current;
> +       wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
>
>         timer_setup_on_stack(timer, dpm_watchdog_handler, 0);
>         /* use same timeout value for both suspend and resume */
> -       timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT;
> +       timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
>         add_timer(timer);
>  }
>
> diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
> index afce8130d8b9..3387b94e76c1 100644
> --- a/kernel/power/Kconfig
> +++ b/kernel/power/Kconfig
> @@ -257,11 +257,17 @@ config DPM_WATCHDOG
>           boot session.
>
>  config DPM_WATCHDOG_TIMEOUT
> -       int "Watchdog timeout in seconds"
> +       int "Watchdog timeout to panic in seconds"
>         range 1 120
>         default 120
>         depends on DPM_WATCHDOG
>
> +config DPM_WATCHDOG_WARNING_TIMEOUT
> +       int "Watchdog timeout to warn in seconds"
> +       range 1 DPM_WATCHDOG_TIMEOUT
> +       default DPM_WATCHDOG_TIMEOUT
> +       depends on DPM_WATCHDOG
> +

If I'm reading the code correctly, if we set
DPM_WATCHDOG_WARNING_TIMEOUT to the same value as
DPM_WATCHDOG_TIMEOUT, then we get the current behavior when the
timeout is fatal. Would it make sense to document this in the help
text of the new symbol?

Best regards,
Tomasz