drivers/base/power/main.c | 22 ++++++++++++++++++---- kernel/power/Kconfig | 8 +++++++- 2 files changed, 25 insertions(+), 5 deletions(-)
Allow configuring the DPM watchdog to warn about slow suspend/resume
functions without causing a system panic(). This allows you to set the
DPM_WATCHDOG_WARNING_TIMEOUT to something like 5 or 10 seconds to get
warnings about slow suspend/resume functions that eventually succeed.
Signed-off-by: Douglas Anderson <dianders@chromium.org>
---
drivers/base/power/main.c | 22 ++++++++++++++++++----
kernel/power/Kconfig | 8 +++++++-
2 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 4a67e83300e1..239bcf93f821 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -496,6 +496,7 @@ struct dpm_watchdog {
struct device *dev;
struct task_struct *tsk;
struct timer_list timer;
+ bool fatal;
};
#define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
@@ -512,11 +513,23 @@ struct dpm_watchdog {
static void dpm_watchdog_handler(struct timer_list *t)
{
struct dpm_watchdog *wd = from_timer(wd, t, timer);
+ struct timer_list *timer = &wd->timer;
+ unsigned int time_left;
+
+ if (wd->fatal) {
+ dev_emerg(wd->dev, "**** DPM device timeout ****\n");
+ show_stack(wd->tsk, NULL, KERN_EMERG);
+ panic("%s %s: unrecoverable failure\n",
+ dev_driver_string(wd->dev), dev_name(wd->dev));
+ }
- dev_emerg(wd->dev, "**** DPM device timeout ****\n");
+ time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
+ dev_emerg(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n",
+ CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left);
show_stack(wd->tsk, NULL, KERN_EMERG);
- panic("%s %s: unrecoverable failure\n",
- dev_driver_string(wd->dev), dev_name(wd->dev));
+
+ wd->fatal = true;
+ mod_timer(timer, jiffies + HZ * time_left);
}
/**
@@ -530,10 +543,11 @@ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev)
wd->dev = dev;
wd->tsk = current;
+ wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
timer_setup_on_stack(timer, dpm_watchdog_handler, 0);
/* use same timeout value for both suspend and resume */
- timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT;
+ timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
add_timer(timer);
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index afce8130d8b9..3387b94e76c1 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -257,11 +257,17 @@ config DPM_WATCHDOG
boot session.
config DPM_WATCHDOG_TIMEOUT
- int "Watchdog timeout in seconds"
+ int "Watchdog timeout to panic in seconds"
range 1 120
default 120
depends on DPM_WATCHDOG
+config DPM_WATCHDOG_WARNING_TIMEOUT
+ int "Watchdog timeout to warn in seconds"
+ range 1 DPM_WATCHDOG_TIMEOUT
+ default DPM_WATCHDOG_TIMEOUT
+ depends on DPM_WATCHDOG
+
config PM_TRACE
bool
help
--
2.47.1.613.gc27f4b7a9f-goog
Hi Doug,
On Thu, Dec 19, 2024 at 2:56 AM Douglas Anderson <dianders@chromium.org> wrote:
>
> Allow configuring the DPM watchdog to warn about slow suspend/resume
> functions without causing a system panic(). This allows you to set the
> DPM_WATCHDOG_WARNING_TIMEOUT to something like 5 or 10 seconds to get
> warnings about slow suspend/resume functions that eventually succeed.
Thanks for the patch. I agree that it would be very useful to avoid
completely bringing the user's system down, while still having a
warning that lets them notice that something is not right.
That said, please see my comments below.
>
> Signed-off-by: Douglas Anderson <dianders@chromium.org>
> ---
>
> drivers/base/power/main.c | 22 ++++++++++++++++++----
> kernel/power/Kconfig | 8 +++++++-
> 2 files changed, 25 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
> index 4a67e83300e1..239bcf93f821 100644
> --- a/drivers/base/power/main.c
> +++ b/drivers/base/power/main.c
> @@ -496,6 +496,7 @@ struct dpm_watchdog {
> struct device *dev;
> struct task_struct *tsk;
> struct timer_list timer;
> + bool fatal;
> };
>
> #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
> @@ -512,11 +513,23 @@ struct dpm_watchdog {
> static void dpm_watchdog_handler(struct timer_list *t)
> {
> struct dpm_watchdog *wd = from_timer(wd, t, timer);
> + struct timer_list *timer = &wd->timer;
> + unsigned int time_left;
> +
> + if (wd->fatal) {
> + dev_emerg(wd->dev, "**** DPM device timeout ****\n");
> + show_stack(wd->tsk, NULL, KERN_EMERG);
> + panic("%s %s: unrecoverable failure\n",
> + dev_driver_string(wd->dev), dev_name(wd->dev));
> + }
>
> - dev_emerg(wd->dev, "**** DPM device timeout ****\n");
> + time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
> + dev_emerg(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n",
> + CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left);
Should this be printed using dev_warn() instead, since it's not fatal?
> show_stack(wd->tsk, NULL, KERN_EMERG);
> - panic("%s %s: unrecoverable failure\n",
> - dev_driver_string(wd->dev), dev_name(wd->dev));
> +
> + wd->fatal = true;
> + mod_timer(timer, jiffies + HZ * time_left);
> }
>
> /**
> @@ -530,10 +543,11 @@ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev)
>
> wd->dev = dev;
> wd->tsk = current;
> + wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
>
> timer_setup_on_stack(timer, dpm_watchdog_handler, 0);
> /* use same timeout value for both suspend and resume */
> - timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT;
> + timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
> add_timer(timer);
> }
>
> diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
> index afce8130d8b9..3387b94e76c1 100644
> --- a/kernel/power/Kconfig
> +++ b/kernel/power/Kconfig
> @@ -257,11 +257,17 @@ config DPM_WATCHDOG
> boot session.
>
> config DPM_WATCHDOG_TIMEOUT
> - int "Watchdog timeout in seconds"
> + int "Watchdog timeout to panic in seconds"
> range 1 120
> default 120
> depends on DPM_WATCHDOG
>
> +config DPM_WATCHDOG_WARNING_TIMEOUT
> + int "Watchdog timeout to warn in seconds"
> + range 1 DPM_WATCHDOG_TIMEOUT
> + default DPM_WATCHDOG_TIMEOUT
> + depends on DPM_WATCHDOG
> +
If I'm reading the code correctly, if we set
DPM_WATCHDOG_WARNING_TIMEOUT to the same value as
DPM_WATCHDOG_TIMEOUT, then we get the current behavior when the
timeout is fatal. Would it make sense to document this in the help
text of the new symbol?
Best regards,
Tomasz
© 2016 - 2026 Red Hat, Inc.