include/linux/clockchips.h | 2 ++ kernel/time/clockevents.c | 37 +++++++++++++++++++++++-------------- 2 files changed, 25 insertions(+), 14 deletions(-)
Calvin reported an odd NMI watchdog lockup which claims that the CPU locked
up in user space. He provided a reproducer, which sets up a timerfd based
timer and then rearms it in a loop with an absolute expiry time of 1ns.
As the expiry time is in the past, the timer ends up as the first expiring
timer in the per CPU hrtimer base and the clockevent device is programmed
with the minimum delta value. If the machine is fast enough, this ends up
in an endless loop of programming the delta value to the minimum value
defined by the clock event device, before the timer interrupt can fire,
which starves the interrupt and consequently triggers the lockup detector
because the hrtimer callback of the lockup mechanism is never invoked.
As a first step to prevent this, avoid reprogramming the clock event device
when:
- a forced minimum delta event is pending
- the new expiry delta is less than or equal to the minimum delta
Thanks to Calvin for providing the reproducer and to Borislav for testing
and providing data from his Zen5 machine.
The problem is not limited to Zen5, but depending on the underlying
clock event device (e.g. TSC deadline timer on Intel) and the CPU speed
not necessarily observable.
This change serves only as the last resort and further changes will be made
to prevent this scenario earlier in the call chain.
Reported-by: Calvin Owens <calvin@wbinvd.org>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
P.S: I'm working on the other changes, but wanted to get this out ASAP
for testing.
---
include/linux/clockchips.h | 2 ++
kernel/time/clockevents.c | 37 +++++++++++++++++++++++--------------
2 files changed, 25 insertions(+), 14 deletions(-)
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -80,6 +80,7 @@ enum clock_event_state {
* @shift: nanoseconds to cycles divisor (power of two)
* @state_use_accessors:current state of the device, assigned by the core code
* @features: features
+ * @next_event_forced: True if the last programming was a forced event
* @retries: number of forced programming retries
* @set_state_periodic: switch state to periodic
* @set_state_oneshot: switch state to oneshot
@@ -108,6 +109,7 @@ struct clock_event_device {
u32 shift;
enum clock_event_state state_use_accessors;
unsigned int features;
+ unsigned int next_event_forced;
unsigned long retries;
int (*set_state_periodic)(struct clock_event_device *);
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -172,6 +172,7 @@ void clockevents_shutdown(struct clock_e
{
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
}
/**
@@ -224,13 +225,7 @@ static int clockevents_increase_min_delt
return 0;
}
-/**
- * clockevents_program_min_delta - Set clock event device to the minimum delay.
- * @dev: device to program
- *
- * Returns 0 on success, -ETIME when the retry loop failed.
- */
-static int clockevents_program_min_delta(struct clock_event_device *dev)
+static int __clockevents_program_min_delta(struct clock_event_device *dev)
{
unsigned long long clc;
int64_t delta;
@@ -263,13 +258,7 @@ static int clockevents_program_min_delta
#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
-/**
- * clockevents_program_min_delta - Set clock event device to the minimum delay.
- * @dev: device to program
- *
- * Returns 0 on success, -ETIME when the retry loop failed.
- */
-static int clockevents_program_min_delta(struct clock_event_device *dev)
+static int __clockevents_program_min_delta(struct clock_event_device *dev)
{
unsigned long long clc;
int64_t delta = 0;
@@ -293,6 +282,21 @@ static int clockevents_program_min_delta
#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
/**
+ * clockevents_program_min_delta - Set clock event device to the minimum delay.
+ * @dev: device to program
+ *
+ * Returns 0 on success, -ETIME when the retry loop failed.
+ */
+static int clockevents_program_min_delta(struct clock_event_device *dev)
+{
+ if (dev->next_event_forced)
+ return 0;
+
+ dev->next_event_forced = 1;
+ return __clockevents_program_min_delta(dev);
+}
+
+/**
* clockevents_program_event - Reprogram the clock event device.
* @dev: device to program
* @expires: absolute expiry time (monotonic clock)
@@ -324,6 +328,11 @@ int clockevents_program_event(struct clo
return dev->set_next_ktime(expires, dev);
delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
+
+ /* Don't reprogram when a forced event is pending */
+ if (dev->next_event_forced && delta <= (int64_t)dev->min_delta_ns)
+ return 0;
+
if (delta <= 0)
return force ? clockevents_program_min_delta(dev) : -ETIME;
On Thu, Apr 02, 2026 at 07:07:49PM +0200, Thomas Gleixner wrote:
> --- a/kernel/time/clockevents.c
> +++ b/kernel/time/clockevents.c
> @@ -172,6 +172,7 @@ void clockevents_shutdown(struct clock_e
> {
> clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
> dev->next_event = KTIME_MAX;
> + dev->next_event_forced = 0;
> }
>
> /**
> @@ -224,13 +225,7 @@ static int clockevents_increase_min_delt
> return 0;
> }
>
> -/**
> - * clockevents_program_min_delta - Set clock event device to the minimum delay.
> - * @dev: device to program
> - *
> - * Returns 0 on success, -ETIME when the retry loop failed.
> - */
> -static int clockevents_program_min_delta(struct clock_event_device *dev)
> +static int __clockevents_program_min_delta(struct clock_event_device *dev)
> {
> unsigned long long clc;
> int64_t delta;
> @@ -263,13 +258,7 @@ static int clockevents_program_min_delta
>
> #else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
>
> -/**
> - * clockevents_program_min_delta - Set clock event device to the minimum delay.
> - * @dev: device to program
> - *
> - * Returns 0 on success, -ETIME when the retry loop failed.
> - */
> -static int clockevents_program_min_delta(struct clock_event_device *dev)
> +static int __clockevents_program_min_delta(struct clock_event_device *dev)
> {
> unsigned long long clc;
> int64_t delta = 0;
> @@ -293,6 +282,21 @@ static int clockevents_program_min_delta
> #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
>
> /**
> + * clockevents_program_min_delta - Set clock event device to the minimum delay.
> + * @dev: device to program
> + *
> + * Returns 0 on success, -ETIME when the retry loop failed.
> + */
> +static int clockevents_program_min_delta(struct clock_event_device *dev)
> +{
> + if (dev->next_event_forced)
> + return 0;
> +
> + dev->next_event_forced = 1;
> + return __clockevents_program_min_delta(dev);
> +}
> +
> +/**
> * clockevents_program_event - Reprogram the clock event device.
> * @dev: device to program
> * @expires: absolute expiry time (monotonic clock)
> @@ -324,6 +328,11 @@ int clockevents_program_event(struct clo
> return dev->set_next_ktime(expires, dev);
>
> delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
> +
> + /* Don't reprogram when a forced event is pending */
> + if (dev->next_event_forced && delta <= (int64_t)dev->min_delta_ns)
> + return 0;
> +
> if (delta <= 0)
> return force ? clockevents_program_min_delta(dev) : -ETIME;
>
This last hunk seems duplicate of the clockevents_program_min_delta()
change, that also will bail when next_event_forced is set.
Also, I note, nothing (except shutdown) will ever clear next_event_forced;
and I'm thinking that any interrupt or normal reprogram should be doing
that.
Now, there doesn't seem to be a generic intercept for
dev->event_handler, so clearing on interrupt is hard, but would
something like the completely untested below work?
---
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index b0df28ddd394..a79f8fa10104 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -108,6 +108,7 @@ struct clock_event_device {
u32 shift;
enum clock_event_state state_use_accessors;
unsigned int features;
+ unsigned int next_event_forced;
unsigned long retries;
int (*set_state_periodic)(struct clock_event_device *);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index eaae1ce9f060..8f6621361e46 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -172,6 +172,7 @@ void clockevents_shutdown(struct clock_event_device *dev)
{
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
}
/**
@@ -324,16 +325,31 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
return dev->set_next_ktime(expires, dev);
delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
- if (delta <= 0)
- return force ? clockevents_program_min_delta(dev) : -ETIME;
+ if (delta <= 0) {
+ rc = -ETIME;
+ goto error;
+ }
delta = min(delta, (int64_t) dev->max_delta_ns);
delta = max(delta, (int64_t) dev->min_delta_ns);
clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
rc = dev->set_next_event((unsigned long) clc, dev);
+ if (rc)
+ goto error;
- return (rc && force) ? clockevents_program_min_delta(dev) : rc;
+ dev->next_event_forced = 0;
+ return 0;
+
+error:
+ if (force) {
+ if (dev->next_event_forced)
+ return 0;
+
+ dev->next_event_forced = 1;
+ return clockevents_program_min_delta(dev);
+ }
+ return rc;
}
/*
On Fri, Apr 03 2026 at 14:16, Peter Zijlstra wrote:
> On Thu, Apr 02, 2026 at 07:07:49PM +0200, Thomas Gleixner wrote:
> /**
> @@ -324,16 +325,31 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
> return dev->set_next_ktime(expires, dev);
>
> delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
> - if (delta <= 0)
> - return force ? clockevents_program_min_delta(dev) : -ETIME;
> + if (delta <= 0) {
> + rc = -ETIME;
> + goto error;
> + }
That's not working in the case that user space manages to set the expiry
time so it stays in the min_delta_ns window, which is doable. I just
tried. Then we are back to square one.
Less convoluted but untested version of my initial idea below.
Thanks,
tglx
---
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -80,6 +80,7 @@ enum clock_event_state {
* @shift: nanoseconds to cycles divisor (power of two)
* @state_use_accessors:current state of the device, assigned by the core code
* @features: features
+ * @next_event_forced: True if the last programming was a forced event
* @retries: number of forced programming retries
* @set_state_periodic: switch state to periodic
* @set_state_oneshot: switch state to oneshot
@@ -108,6 +109,7 @@ struct clock_event_device {
u32 shift;
enum clock_event_state state_use_accessors;
unsigned int features;
+ unsigned int next_event_forced;
unsigned long retries;
int (*set_state_periodic)(struct clock_event_device *);
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -172,6 +172,7 @@ void clockevents_shutdown(struct clock_e
{
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
}
/**
@@ -224,13 +225,7 @@ static int clockevents_increase_min_delt
return 0;
}
-/**
- * clockevents_program_min_delta - Set clock event device to the minimum delay.
- * @dev: device to program
- *
- * Returns 0 on success, -ETIME when the retry loop failed.
- */
-static int clockevents_program_min_delta(struct clock_event_device *dev)
+static int __clockevents_program_min_delta(struct clock_event_device *dev)
{
unsigned long long clc;
int64_t delta;
@@ -263,13 +258,7 @@ static int clockevents_program_min_delta
#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
-/**
- * clockevents_program_min_delta - Set clock event device to the minimum delay.
- * @dev: device to program
- *
- * Returns 0 on success, -ETIME when the retry loop failed.
- */
-static int clockevents_program_min_delta(struct clock_event_device *dev)
+static int __clockevents_program_min_delta(struct clock_event_device *dev)
{
unsigned long long clc;
int64_t delta = 0;
@@ -293,6 +282,21 @@ static int clockevents_program_min_delta
#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
/**
+ * clockevents_program_min_delta - Set clock event device to the minimum delay.
+ * @dev: device to program
+ *
+ * Returns 0 on success, -ETIME when the retry loop failed.
+ */
+static int clockevents_program_min_delta(struct clock_event_device *dev)
+{
+ if (dev->next_event_forced)
+ return 0;
+
+ dev->next_event_forced = 1;
+ return __clockevents_program_min_delta(dev);
+}
+
+/**
* clockevents_program_event - Reprogram the clock event device.
* @dev: device to program
* @expires: absolute expiry time (monotonic clock)
@@ -324,16 +328,18 @@ int clockevents_program_event(struct clo
return dev->set_next_ktime(expires, dev);
delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
- if (delta <= 0)
- return force ? clockevents_program_min_delta(dev) : -ETIME;
- delta = min(delta, (int64_t) dev->max_delta_ns);
- delta = max(delta, (int64_t) dev->min_delta_ns);
+ if (!dev->next_event_forced || delta > dev->min_delta_ns) {
- clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
- rc = dev->set_next_event((unsigned long) clc, dev);
+ delta = min(delta, (int64_t) dev->max_delta_ns);
+ delta = max(delta, (int64_t) dev->min_delta_ns);
+ clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ rc = dev->set_next_event((unsigned long) clc, dev);
+ if (!rc)
+ return 0;
+ }
- return (rc && force) ? clockevents_program_min_delta(dev) : rc;
+ return force ? clockevents_program_min_delta(dev) : rc;
}
/*
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1888,6 +1888,7 @@ void hrtimer_interrupt(struct clock_even
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
entry_time = now = hrtimer_update_base(cpu_base);
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -110,6 +110,7 @@ void tick_handle_periodic(struct clock_e
int cpu = smp_processor_id();
ktime_t next = dev->next_event;
+ dev->next_event_forced = 0;
tick_periodic(cpu);
/*
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1513,6 +1513,7 @@ static void tick_nohz_lowres_handler(str
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
On Fri, Apr 03, 2026 at 06:17:15PM +0200, Thomas Gleixner wrote:
> --- a/kernel/time/hrtimer.c
> +++ b/kernel/time/hrtimer.c
> @@ -1888,6 +1888,7 @@ void hrtimer_interrupt(struct clock_even
> BUG_ON(!cpu_base->hres_active);
> cpu_base->nr_events++;
> dev->next_event = KTIME_MAX;
> + dev->next_event_forced = 0;
>
> raw_spin_lock_irqsave(&cpu_base->lock, flags);
> entry_time = now = hrtimer_update_base(cpu_base);
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -110,6 +110,7 @@ void tick_handle_periodic(struct clock_e
> int cpu = smp_processor_id();
> ktime_t next = dev->next_event;
>
> + dev->next_event_forced = 0;
> tick_periodic(cpu);
>
> /*
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -1513,6 +1513,7 @@ static void tick_nohz_lowres_handler(str
> struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
>
> dev->next_event = KTIME_MAX;
> + dev->next_event_forced = 0;
>
> if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
> tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
Doesn't that want to be something like this instead?
---
virtual patch
@@
expression E;
struct clock_event_device *ptr;
@@
- if (E->event_handler)
- E->event_handler(E);
+ clockevent_handler(E);
@@
expression E;
struct clock_event_device *ptr;
@@
- E->event_handler(E);
+ clockevent_handler(E);
---
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -159,6 +159,13 @@ static inline bool clockevent_state_ones
return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT_STOPPED;
}
+static inline void clockevent_handler(struct clock_event_device *dev)
+{
+ dev->next_event_forced = 0;
+ if (dev->event_handler)
+ dev->event_handler(dev);
+}
+
/*
* Calculate a multiplication factor for scaled math, which is used to convert
* nanoseconds based values to clock ticks:
On Fri, Apr 03 2026 at 23:01, Peter Zijlstra wrote:
> On Fri, Apr 03, 2026 at 06:17:15PM +0200, Thomas Gleixner wrote:
>
> Doesn't that want to be something like this instead?
Duh. Yes. I'm clearly not seeing the forest for the trees anymore.
On Fri, Apr 03 2026 at 23:24, Thomas Gleixner wrote:
> On Fri, Apr 03 2026 at 23:01, Peter Zijlstra wrote:
>> On Fri, Apr 03, 2026 at 06:17:15PM +0200, Thomas Gleixner wrote:
>>
>> Doesn't that want to be something like this instead?
>
> Duh. Yes. I'm clearly not seeing the forest for the trees anymore.
But thinking more about it. That's a stretch for backporting.
So I rather stay with the current approach and do the cleanup on top,
which then also should obviously do
dev->next_event = KTIME_MAX;
and remove it from the handlers, no?
Thanks,
tglx
On Sat, Apr 04, 2026 at 12:14:03AM +0200, Thomas Gleixner wrote:
> On Fri, Apr 03 2026 at 23:24, Thomas Gleixner wrote:
>
> > On Fri, Apr 03 2026 at 23:01, Peter Zijlstra wrote:
> >> On Fri, Apr 03, 2026 at 06:17:15PM +0200, Thomas Gleixner wrote:
> >>
> >> Doesn't that want to be something like this instead?
> >
> > Duh. Yes. I'm clearly not seeing the forest for the trees anymore.
>
> But thinking more about it. That's a stretch for backporting.
Yeah, that cocci patch results in quite a large actual patch :-)
> So I rather stay with the current approach and do the cleanup on top,
> which then also should obviously do
>
> dev->next_event = KTIME_MAX;
>
> and remove it from the handlers, no?
Works for me.
On Fri, Apr 03 2026 at 14:16, Peter Zijlstra wrote:
>> delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
>> +
>> + /* Don't reprogram when a forced event is pending */
>> + if (dev->next_event_forced && delta <= (int64_t)dev->min_delta_ns)
>> + return 0;
>> +
>> if (delta <= 0)
>> return force ? clockevents_program_min_delta(dev) : -ETIME;
>>
>
> This last hunk seems duplicate of the clockevents_program_min_delta()
> change, that also will bail when next_event_forced is set.
Yeah. I was planning to look into it with brain awake today again.
> Also, I note, thing (except shutdown) will ever clear next_event_forced;
> and I'm thinking that any interrupt or normal reprogram should be doing
> that.
See my reply to Calvin. :)
On Thursday 04/02 at 19:07 +0200, Thomas Gleixner wrote:
> Calvin reported an odd NMI watchdog lockup which claims that the CPU locked
> up in user space. He provided a reproducer, which set's up a timerfd based
> timer and then rearms it in a loop with an absolute expiry time of 1ns.
>
> As the expiry time is in the past, the timer ends up as the first expiring
> timer in the per CPU hrtimer base and the clockevent device is programmed
> with the minimum delta value. If the machine is fast enough, this ends up
> in a endless loop of programming the delta value to the minimum value
> defined by the clock event device, before the timer interrupt can fire,
> which starves the interrupt and consequently triggers the lockup detector
> because the hrtimer callback of the lockup mechanism is never invoked.
>
> As a first step to prevent this, avoid reprogramming the clock event device
> when:
> - a forced minimum delta event is pending
> - the new expiry delta is less then or equal to the minimum delta
>
> Thanks to Calvin for providing the reproducer and to Borislav for testing
> and providing data from his Zen5 machine.
>
> The problem is not limited to Zen5, but depending on the underlying
> clock event device (e.g. TSC deadline timer on Intel) and the CPU speed
> not necessarily observable.
>
> This change serves only as the last resort and further changes will be made
> to prevent this scenario earlier in the call chain.
>
> Reported-by: Calvin Owens <calvin@wbinvd.org>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> ---
> P.S: I'm working on the other changes, but wanted to get this out ASAP
> for testing.
Unfortunately the AMD boxes won't boot with this: one gives me no
video console output, the other gets to userspace but hangs trying to
mount the rootfs and then prints hard lockup traces with idle stacks.
Sorry not to have more info yet, I'll have time tomorrow to sit down and
get more data for you. If there's anything specific that you'd like me
grab just let me know.
Thanks,
Calvin
> ---
> include/linux/clockchips.h | 2 ++
> kernel/time/clockevents.c | 37 +++++++++++++++++++++++--------------
> 2 files changed, 25 insertions(+), 14 deletions(-)
>
> --- a/include/linux/clockchips.h
> +++ b/include/linux/clockchips.h
> @@ -80,6 +80,7 @@ enum clock_event_state {
> * @shift: nanoseconds to cycles divisor (power of two)
> * @state_use_accessors:current state of the device, assigned by the core code
> * @features: features
> + * @next_event_forced: True if the last programming was a forced event
> * @retries: number of forced programming retries
> * @set_state_periodic: switch state to periodic
> * @set_state_oneshot: switch state to oneshot
> @@ -108,6 +109,7 @@ struct clock_event_device {
> u32 shift;
> enum clock_event_state state_use_accessors;
> unsigned int features;
> + unsigned int next_event_forced;
> unsigned long retries;
>
> int (*set_state_periodic)(struct clock_event_device *);
> --- a/kernel/time/clockevents.c
> +++ b/kernel/time/clockevents.c
> @@ -172,6 +172,7 @@ void clockevents_shutdown(struct clock_e
> {
> clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
> dev->next_event = KTIME_MAX;
> + dev->next_event_forced = 0;
> }
>
> /**
> @@ -224,13 +225,7 @@ static int clockevents_increase_min_delt
> return 0;
> }
>
> -/**
> - * clockevents_program_min_delta - Set clock event device to the minimum delay.
> - * @dev: device to program
> - *
> - * Returns 0 on success, -ETIME when the retry loop failed.
> - */
> -static int clockevents_program_min_delta(struct clock_event_device *dev)
> +static int __clockevents_program_min_delta(struct clock_event_device *dev)
> {
> unsigned long long clc;
> int64_t delta;
> @@ -263,13 +258,7 @@ static int clockevents_program_min_delta
>
> #else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
>
> -/**
> - * clockevents_program_min_delta - Set clock event device to the minimum delay.
> - * @dev: device to program
> - *
> - * Returns 0 on success, -ETIME when the retry loop failed.
> - */
> -static int clockevents_program_min_delta(struct clock_event_device *dev)
> +static int __clockevents_program_min_delta(struct clock_event_device *dev)
> {
> unsigned long long clc;
> int64_t delta = 0;
> @@ -293,6 +282,21 @@ static int clockevents_program_min_delta
> #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
>
> /**
> + * clockevents_program_min_delta - Set clock event device to the minimum delay.
> + * @dev: device to program
> + *
> + * Returns 0 on success, -ETIME when the retry loop failed.
> + */
> +static int clockevents_program_min_delta(struct clock_event_device *dev)
> +{
> + if (dev->next_event_forced)
> + return 0;
> +
> + dev->next_event_forced = 1;
> + return __clockevents_program_min_delta(dev);
> +}
> +
> +/**
> * clockevents_program_event - Reprogram the clock event device.
> * @dev: device to program
> * @expires: absolute expiry time (monotonic clock)
> @@ -324,6 +328,11 @@ int clockevents_program_event(struct clo
> return dev->set_next_ktime(expires, dev);
>
> delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
> +
> + /* Don't reprogram when a forced event is pending */
> + if (dev->next_event_forced && delta <= (int64_t)dev->min_delta_ns)
> + return 0;
> +
> if (delta <= 0)
> return force ? clockevents_program_min_delta(dev) : -ETIME;
>
On Thu, Apr 02 2026 at 22:11, Calvin Owens wrote:
> On Thursday 04/02 at 19:07 +0200, Thomas Gleixner wrote:
>> Calvin reported an odd NMI watchdog lockup which claims that the CPU locked
>> up in user space. He provided a reproducer, which set's up a timerfd based
>> timer and then rearms it in a loop with an absolute expiry time of 1ns.
>>
>> As the expiry time is in the past, the timer ends up as the first expiring
>> timer in the per CPU hrtimer base and the clockevent device is programmed
>> with the minimum delta value. If the machine is fast enough, this ends up
>> in a endless loop of programming the delta value to the minimum value
>> defined by the clock event device, before the timer interrupt can fire,
>> which starves the interrupt and consequently triggers the lockup detector
>> because the hrtimer callback of the lockup mechanism is never invoked.
>>
>> As a first step to prevent this, avoid reprogramming the clock event device
>> when:
>> - a forced minimum delta event is pending
>> - the new expiry delta is less then or equal to the minimum delta
>>
>> Thanks to Calvin for providing the reproducer and to Borislav for testing
>> and providing data from his Zen5 machine.
>>
>> The problem is not limited to Zen5, but depending on the underlying
>> clock event device (e.g. TSC deadline timer on Intel) and the CPU speed
>> not necessarily observable.
>>
>> This change serves only as the last resort and further changes will be made
>> to prevent this scenario earlier in the call chain.
>>
>> Reported-by: Calvin Owens <calvin@wbinvd.org>
>> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
>> ---
>> P.S: I'm working on the other changes, but wanted to get this out ASAP
>> for testing.
>
> Unfortunately the AMD boxes won't boot with this: one gives me no
> video console output, the other gets to userspace but hangs trying to
> mount the rootfs and then prints hard lockup traces with idle stacks.
>
> Sorry not to have more info yet, I'll have time tomorrow to sit down and
> get more data for you. If there's anything specific that you'd like me
> grab just let me know.
I'm an idiot. When I polished the patch up, I dropped the hunks which
clear the flag in the interrupt handler and tired brain did not notice
despite checking five times in a row. Updated version below.
Thanks,
tglx
---
From: Thomas Gleixner <tglx@kernel.org>
Subject: clockevents: Prevent timer interrupt starvation
Date: Thu, 02 Apr 2026 19:07:49 +0200
From: Thomas Gleixner <tglx@kernel.org>
Calvin reported an odd NMI watchdog lockup which claims that the CPU locked
up in user space. He provided a reproducer, which sets up a timerfd based
timer and then rearms it in a loop with an absolute expiry time of 1ns.
As the expiry time is in the past, the timer ends up as the first expiring
timer in the per CPU hrtimer base and the clockevent device is programmed
with the minimum delta value. If the machine is fast enough, this ends up
in an endless loop of programming the delta value to the minimum value
defined by the clock event device, before the timer interrupt can fire,
which starves the interrupt and consequently triggers the lockup detector
because the hrtimer callback of the lockup mechanism is never invoked.
As a first step to prevent this, avoid reprogramming the clock event device
when:
- a forced minimum delta event is pending
- the new expiry delta is less than or equal to the minimum delta
Thanks to Calvin for providing the reproducer and to Borislav for testing
and providing data from his Zen5 machine.
The problem is not limited to Zen5, but depending on the underlying
clock event device (e.g. TSC deadline timer on Intel) and the CPU speed
not necessarily observable.
This change serves only as the last resort and further changes will be made
to prevent this scenario earlier in the call chain.
Reported-by: Calvin Owens <calvin@wbinvd.org>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
P.S: I'm working on the other changes, but wanted to get this out ASAP
for testing.
---
include/linux/clockchips.h | 2 ++
kernel/time/clockevents.c | 37 +++++++++++++++++++++++--------------
kernel/time/hrtimer.c | 1 +
kernel/time/tick-common.c | 1 +
kernel/time/tick-sched.c | 1 +
5 files changed, 28 insertions(+), 14 deletions(-)
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -80,6 +80,7 @@ enum clock_event_state {
* @shift: nanoseconds to cycles divisor (power of two)
* @state_use_accessors:current state of the device, assigned by the core code
* @features: features
+ * @next_event_forced: True if the last programming was a forced event
* @retries: number of forced programming retries
* @set_state_periodic: switch state to periodic
* @set_state_oneshot: switch state to oneshot
@@ -108,6 +109,7 @@ struct clock_event_device {
u32 shift;
enum clock_event_state state_use_accessors;
unsigned int features;
+ unsigned int next_event_forced;
unsigned long retries;
int (*set_state_periodic)(struct clock_event_device *);
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -172,6 +172,7 @@ void clockevents_shutdown(struct clock_e
{
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
}
/**
@@ -224,13 +225,7 @@ static int clockevents_increase_min_delt
return 0;
}
-/**
- * clockevents_program_min_delta - Set clock event device to the minimum delay.
- * @dev: device to program
- *
- * Returns 0 on success, -ETIME when the retry loop failed.
- */
-static int clockevents_program_min_delta(struct clock_event_device *dev)
+static int __clockevents_program_min_delta(struct clock_event_device *dev)
{
unsigned long long clc;
int64_t delta;
@@ -263,13 +258,7 @@ static int clockevents_program_min_delta
#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
-/**
- * clockevents_program_min_delta - Set clock event device to the minimum delay.
- * @dev: device to program
- *
- * Returns 0 on success, -ETIME when the retry loop failed.
- */
-static int clockevents_program_min_delta(struct clock_event_device *dev)
+static int __clockevents_program_min_delta(struct clock_event_device *dev)
{
unsigned long long clc;
int64_t delta = 0;
@@ -293,6 +282,21 @@ static int clockevents_program_min_delta
#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
/**
+ * clockevents_program_min_delta - Set clock event device to the minimum delay.
+ * @dev: device to program
+ *
+ * Returns 0 on success, -ETIME when the retry loop failed.
+ */
+static int clockevents_program_min_delta(struct clock_event_device *dev)
+{
+ if (dev->next_event_forced)
+ return 0;
+
+ dev->next_event_forced = 1;
+ return __clockevents_program_min_delta(dev);
+}
+
+/**
* clockevents_program_event - Reprogram the clock event device.
* @dev: device to program
* @expires: absolute expiry time (monotonic clock)
@@ -324,6 +328,11 @@ int clockevents_program_event(struct clo
return dev->set_next_ktime(expires, dev);
delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
+
+ /* Don't reprogram when a forced event is pending */
+ if (dev->next_event_forced && delta <= (int64_t)dev->min_delta_ns)
+ return 0;
+
if (delta <= 0)
return force ? clockevents_program_min_delta(dev) : -ETIME;
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1888,6 +1888,7 @@ void hrtimer_interrupt(struct clock_even
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
entry_time = now = hrtimer_update_base(cpu_base);
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -110,6 +110,7 @@ void tick_handle_periodic(struct clock_e
int cpu = smp_processor_id();
ktime_t next = dev->next_event;
+ dev->next_event_forced = 0;
tick_periodic(cpu);
/*
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1513,6 +1513,7 @@ static void tick_nohz_lowres_handler(str
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
On Friday 04/03 at 16:41 +0200, Thomas Gleixner wrote:
> On Thu, Apr 02 2026 at 22:11, Calvin Owens wrote:
> > On Thursday 04/02 at 19:07 +0200, Thomas Gleixner wrote:
> >> Calvin reported an odd NMI watchdog lockup which claims that the CPU locked
> >> up in user space. He provided a reproducer, which set's up a timerfd based
> >> timer and then rearms it in a loop with an absolute expiry time of 1ns.
> >>
> >> As the expiry time is in the past, the timer ends up as the first expiring
> >> timer in the per CPU hrtimer base and the clockevent device is programmed
> >> with the minimum delta value. If the machine is fast enough, this ends up
> >> in a endless loop of programming the delta value to the minimum value
> >> defined by the clock event device, before the timer interrupt can fire,
> >> which starves the interrupt and consequently triggers the lockup detector
> >> because the hrtimer callback of the lockup mechanism is never invoked.
> >>
> >> As a first step to prevent this, avoid reprogramming the clock event device
> >> when:
> >> - a forced minimum delta event is pending
> >> - the new expiry delta is less then or equal to the minimum delta
> >>
> >> Thanks to Calvin for providing the reproducer and to Borislav for testing
> >> and providing data from his Zen5 machine.
> >>
> >> The problem is not limited to Zen5, but depending on the underlying
> >> clock event device (e.g. TSC deadline timer on Intel) and the CPU speed
> >> not necessarily observable.
> >>
> >> This change serves only as the last resort and further changes will be made
> >> to prevent this scenario earlier in the call chain.
> >>
> >> Reported-by: Calvin Owens <calvin@wbinvd.org>
> >> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> >> ---
> >> P.S: I'm working on the other changes, but wanted to get this out ASAP
> >> for testing.
> >
> > Unfortunately the AMD boxes won't boot with this: one gives me no
> > video console output, the other gets to userspace but hangs trying to
> > mount the rootfs and then prints hard lockup traces with idle stacks.
> >
> > Sorry not to have more info yet, I'll have time tomorrow to sit down and
> > get more data for you. If there's anything specific that you'd like me
> > grab just let me know.
>
> I'm an idiot. When I polished the patch up, I dropped the hunks which
> clear the flag in the interrupt handler and tired brain did not notice
> despite checking five times in a row. Updated version below.
That did it, both AMD machines survive the reproducer and are well
behaved afterwards.
If you like:
Tested-By: Calvin Owens <calvin@wbinvd.org>
Thanks,
Calvin
On Fri, Apr 03 2026 at 08:58, Calvin Owens wrote:
> On Friday 04/03 at 16:41 +0200, Thomas Gleixner wrote:
>> I'm an idiot. When I polished the patch up, I dropped the hunks which
>> clear the flag in the interrupt handler and tired brain did not notice
>> despite checking five times in a row. Updated version below.
>
> That did it, both AMD machines survive the reproducer and are well
> behaved afterwards.
Thank you and sorry for the nuisance.
> If you like:
>
> Tested-By: Calvin Owens <calvin@wbinvd.org>
I will probably post a slightly different version similar to the one I
sent in the reply to Peter and if you have time then I would appreciate
a tested-by on that final, to-be-polished version.
Btw, I'm really curious how you deduced the reproducer from systemd
code. I assume you somehow figured out which program triggered the
behaviour and then inspected the source to find something fishy. Can you
provide a pointer to the code in question? If they really do what your
reproducer does, then this code needs to be fixed too :)
Thanks,
tglx
On Friday 04/03 at 21:00 +0200, Thomas Gleixner wrote:
> On Fri, Apr 03 2026 at 08:58, Calvin Owens wrote:
> > On Friday 04/03 at 16:41 +0200, Thomas Gleixner wrote:
> >> I'm an idiot. When I polished the patch up, I dropped the hunks which
> >> clear the flag in the interrupt handler and tired brain did not notice
> >> despite checking five times in a row. Updated version below.
> >
> > That did it, both AMD machines survive the reproducer and are well
> > behaved afterwards.
>
> Thank you and sorry for the nuisance.
>
> > If you like:
> >
> > Tested-By: Calvin Owens <calvin@wbinvd.org>
>
> I will probably post a slightly different version similar to the one I
> sent in the reply to Peter and if you have time then I would appreciate
> a tested-by on that final to be polished version.
I will take a look.
> Btw, I'm really curious how you deduced the reproducer from systemd
> code. I assume you figured somehow out which program triggered the
> behaviour and then inspected the source to find something fishy. Can you
> provide a pointer to the code in question? If they really do what your
> reproducer does, then this code needs to be fixed too :)
I pulled the text that was executing when the NMI fired out of the dump:
00 ba 38 03 00 00 48 8d 35 ce 40 18 00 48 8d 3d 16 41 18 00 e8 11 14
e8 ff b8 f4 ff ff ff e9 6d ff ff ff 0f 1f 80 00 00 00 00 0f b6 4f 2f
48 8d 15 e5 5f 26 00 48 89 c8 83 e0 03 48 c1 e0 05 48
...and searched for it in systemd-networkd and all its libs. It appears
in one spot in libsystemd-shared-259.so in path_hash_func(), so that
must be where the userspace %ip was when the NMI fired.
Unfortunately that has too many callers: I couldn't narrow it down
meaningfully from there. Despite staring at a lot of timer code in
systemd, I haven't yet found anything concrete that might cause buggy
behavior.
But, it stuck out at me that the detritus on the stack wasn't futex() or
poll() or read() related. It seemed wildly improbable that the NMI
would have just happened to catch systemd-networkd running like that, I
guessed it was probably spinning around timerfd_settime() in userspace
when the NMI fired (with calls to path_hash_func() somehow in-between).
My initial guess was that the trigger was something about waiting on the
timer in a different thread than it was set on. I started to write that
out as a small reproducer, but almost jokingly thought, "well, I should
just try setting them blindly first and see if that works", and then my
head exploded when it actually did :)
I've tried overloading the machine, and triggering some unrealistically
large time steps back and forth underneath it. But I can't get systemd
to stick itself in any sort of loop like that, or even set a single
timer expiry to an unreasonable value.
I think I will set up a little BPF thing to force systemd-networkd to
dump core if it makes timerfd_settime() calls too quickly or with
abstime arguments in the past, hopefully from the core I can work out
what was going on. But any better suggestions are welcome.
Thanks,
Calvin
On Fri, Apr 03 2026 at 17:15, Calvin Owens wrote:
> On Friday 04/03 at 21:00 +0200, Thomas Gleixner wrote:
>> Btw, I'm really curious how you deduced the reproducer from systemd
>> code. I assume you figured somehow out which program triggered the
>> behaviour and then inspected the source to find something fishy. Can you
>> provide a pointer to the code in question? If they really do what your
>> reproducer does, then this code needs to be fixed too :)
>
> I pulled the text that was executing when the NMI fired out of the dump:
>
> 00 ba 38 03 00 00 48 8d 35 ce 40 18 00 48 8d 3d 16 41 18 00 e8 11 14
> e8 ff b8 f4 ff ff ff e9 6d ff ff ff 0f 1f 80 00 00 00 00 0f b6 4f 2f
> 48 8d 15 e5 5f 26 00 48 89 c8 83 e0 03 48 c1 e0 05 48
>
> ...and searched for it in systemd-networkd and all its libs. It appears
> in one spot in libsystemd-shared-259.so in path_hash_func(), so that
> must be where the userspace %ip was when the NMI fired.
Amazing.
> Unfortunately that has too many callers: I couldn't narrow it down
> meaningfully from there. Despite staring at a lot of timer code in
> systemd, I haven't yet found anything concrete that might cause buggy
> behavior.
>
> But, it stuck out at me that the detritus on the stack wasn't futex() or
> poll() or read() related. It seemed wildly improbable that the NMI
> would have just happened to catch systemd-networkd running like that, I
> guessed it was probably spinning around timerfd_settime() in userspace
> when the NMI fired (with calls to path_hash_func() somehow in-between).
Right and there is an explicit timerfd_settime(... { 0, 1 }) in the
event management code.
> My initial guess was that the trigger was something about waiting on the
> timer in a different thread than it was set on. I started to write that
> out as a small reproducer, but almost jokingly thought, "well, I should
> just try setting them blindly first and see if that works", and then my
> head exploded when it actually did :)
:)
> I've tried overloading the machine, and triggering some unrealistically
> large time steps back and forth underneath it. But I can't get systemd
> to stick itself in any sort of loop like that, or even set a single
> timer expiry to an unreasonable value.
>
> I think I will set up a little BPF thing to force systemd-networkd to
> dump core if it makes timerfd_settime() calls too quickly or with
> abstime arguments in the past, hopefully from the core I can work out
> what was going on. But any better suggestions are welcome.
It just occurred to me that with the hrtimer changes, you might be able
to utilize the new hrtimer_start_expires tracepoint and enable user
stack traces to get down to the actual root cause.
Thanks,
tglx
© 2016 - 2026 Red Hat, Inc.