Currently hrtimer_interrupt() runs expired timers, which can re-arm
themselves, after which it computes the next expiration time and
re-programs the hardware.
However, some timers, such as HRTICK (the highres timer driving
preemption), cannot re-arm themselves at the point of running, since the
next task has not been determined yet. The schedule() in the interrupt
return path will switch to the next task, which then causes a new
hrtimer to be programmed.
This then results in reprogramming the hardware at least twice, once
after running the timers, and once upon selecting the new task.
Notably, *both* events happen in the interrupt.
By pushing the hrtimer reprogram all the way into the interrupt return
path, it runs after schedule() and this double reprogram can be
avoided.
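Schematically (an illustration only; simplified call chains, with HRTICK
as the example):

	/* current: two clockevent programs */
	hrtimer_interrupt()
	    __run_hrtimer()			// HRTICK expires, cannot re-arm yet
	    tick_program_event()		// reprogram #1
	irqentry_exit()
	    schedule()				// picks the next task
	        hrtick_start()
	            tick_program_event()	// reprogram #2

	/* with this patch: a single program, after schedule() */
	hrtimer_interrupt()
	    __run_hrtimer()
	    set_thread_flag(TIF_HRTIMER_REARM)
	irqentry_exit()
	    schedule()
	        hrtick_start()			// only queues the timer
	        hrtimer_rearm()			// one reprogram, after pick_next_task()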
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/asm-generic/thread_info_tif.h |    5 ++++-
 include/linux/hrtimer.h               |   17 +++++++++++++++++
 include/linux/irq-entry-common.h      |    2 ++
 kernel/entry/common.c                 |   13 +++++++++++++
 kernel/sched/core.c                   |   10 ++++++++++
 kernel/time/hrtimer.c                 |   28 ++++++++++++++++++++++++----
 6 files changed, 70 insertions(+), 5 deletions(-)
--- a/include/asm-generic/thread_info_tif.h
+++ b/include/asm-generic/thread_info_tif.h
@@ -41,11 +41,14 @@
#define _TIF_PATCH_PENDING	BIT(TIF_PATCH_PENDING)

#ifdef HAVE_TIF_RESTORE_SIGMASK
-# define TIF_RESTORE_SIGMASK	10	// Restore signal mask in do_signal() */
+# define TIF_RESTORE_SIGMASK	10	// Restore signal mask in do_signal()
# define _TIF_RESTORE_SIGMASK	BIT(TIF_RESTORE_SIGMASK)
#endif

#define TIF_RSEQ		11	// Run RSEQ fast path
#define _TIF_RSEQ		BIT(TIF_RSEQ)
+#define TIF_HRTIMER_REARM	12	// re-arm the timer
+#define _TIF_HRTIMER_REARM	BIT(TIF_HRTIMER_REARM)
+
#endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -175,10 +175,27 @@ extern void hrtimer_interrupt(struct clo
extern unsigned int hrtimer_resolution;
+#ifdef TIF_HRTIMER_REARM
+extern void _hrtimer_rearm(void);
+/*
+ * This is to be called on all irqentry_exit() paths that will enable
+ * interrupts; as well as in the context switch path before switch_to().
+ */
+static inline void hrtimer_rearm(void)
+{
+	if (test_thread_flag(TIF_HRTIMER_REARM))
+		_hrtimer_rearm();
+}
+#else
+static inline void hrtimer_rearm(void) { }
+#endif /* TIF_HRTIMER_REARM */
+
#else
#define hrtimer_resolution (unsigned int)LOW_RES_NSEC
+static inline void hrtimer_rearm(void) { }
+
#endif
static inline ktime_t
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -224,6 +224,8 @@ static __always_inline void __exit_to_us
	ti_work = read_thread_flags();
	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
		ti_work = exit_to_user_mode_loop(regs, ti_work);
+	else
+		hrtimer_rearm();

	arch_exit_to_user_mode_prepare(regs, ti_work);
}
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -7,6 +7,7 @@
#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/tick.h>
+#include <linux/hrtimer.h>
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
@@ -26,6 +27,16 @@ static __always_inline unsigned long __e
*/
	while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
+		/*
+		 * If hrtimers need re-arming, do so before enabling IRQs,
+		 * except when a reschedule is needed; in that case schedule()
+		 * will do this.
+		 */
+		if ((ti_work & (_TIF_NEED_RESCHED |
+				_TIF_NEED_RESCHED_LAZY |
+				_TIF_HRTIMER_REARM)) == _TIF_HRTIMER_REARM)
+			hrtimer_rearm();
+
		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
@@ -202,6 +213,7 @@ noinstr void irqentry_exit(struct pt_reg
*/
	if (state.exit_rcu) {
		instrumentation_begin();
+		hrtimer_rearm();
		/* Tell the tracer that IRET will enable interrupts */
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare();
@@ -215,6 +227,7 @@ noinstr void irqentry_exit(struct pt_reg
	if (IS_ENABLED(CONFIG_PREEMPTION))
		irqentry_exit_cond_resched();
+	hrtimer_rearm();
	/* Covers both tracing and lockdep */
	trace_hardirqs_on();
	instrumentation_end();
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6814,6 +6814,16 @@ static void __sched notrace __schedule(i
keep_resched:
	rq->last_seen_need_resched_ns = 0;

+	/*
+	 * Notably, this must be called after pick_next_task() but before
+	 * switch_to(), since the new task need not be on the return from
+	 * interrupt path. Additionally, exit_to_user_mode_loop() relies on
+	 * any schedule() call to imply this call, so do it unconditionally.
+	 *
+	 * We've just cleared TIF_NEED_RESCHED, the TIF word should be in cache.
+	 */
+	hrtimer_rearm();
+
	is_switch = prev != next;
	if (likely(is_switch)) {
		rq->nr_switches++;
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1892,10 +1892,9 @@ static __latent_entropy void hrtimer_run
 * Very similar to hrtimer_force_reprogram(), except it deals with
 * in_hrirq and hang_detected.
 */
-static void __hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now)
+static void __hrtimer_rearm(struct hrtimer_cpu_base *cpu_base,
+			    ktime_t now, ktime_t expires_next)
{
-	ktime_t expires_next = hrtimer_update_next_event(cpu_base);
-
	cpu_base->expires_next = expires_next;
	cpu_base->in_hrtirq = 0;
@@ -1970,9 +1969,30 @@ void hrtimer_interrupt(struct clock_even
		cpu_base->hang_detected = 1;
	}
-	__hrtimer_rearm(cpu_base, now);
+#ifdef TIF_HRTIMER_REARM
+	set_thread_flag(TIF_HRTIMER_REARM);
+#else
+	__hrtimer_rearm(cpu_base, now, expires_next);
+#endif
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
+
+#ifdef TIF_HRTIMER_REARM
+void _hrtimer_rearm(void)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	ktime_t now, expires_next;
+
+	lockdep_assert_irqs_disabled();
+
+	scoped_guard (raw_spinlock, &cpu_base->lock) {
+		now = hrtimer_update_base(cpu_base);
+		expires_next = hrtimer_update_next_event(cpu_base);
+		__hrtimer_rearm(cpu_base, now, expires_next);
+		clear_thread_flag(TIF_HRTIMER_REARM);
+	}
+}
+#endif /* TIF_HRTIMER_REARM */
#endif /* !CONFIG_HIGH_RES_TIMERS */
/*
On Wed, Jan 21 2026 at 17:20, Peter Zijlstra wrote:
> 	while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
>
> +		/*
> +		 * If hrtimers need re-arming, do so before enabling IRQs,
> +		 * except when a reschedule is needed; in that case schedule()
> +		 * will do this.
> +		 */
> +		if ((ti_work & (_TIF_NEED_RESCHED |
> +				_TIF_NEED_RESCHED_LAZY |
> +				_TIF_HRTIMER_REARM)) == _TIF_HRTIMER_REARM)
> +			hrtimer_rearm();
Two things I'm not convinced are handled correctly:
1) Interrupts

   After reenabling interrupts and before reaching schedule() an
   interrupt comes in and runs soft interrupt processing for a while
   on the way back, which delays the update until that processing
   completes.

2) Time slice extension

   When the time slice is granted this will not rearm the clockevent
   device unless the slice hrtimer becomes the first expiring timer
   on that CPU, but even then that misses the full reevaluation of
   the next timer event.
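IOW, roughly this (a sketch, based on the hunk above):

	exit_to_user_mode_loop()
		// TIF_NEED_RESCHED and TIF_HRTIMER_REARM are both set,
		// so the check above skips hrtimer_rearm() ...
		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
			rseq_grant_slice_extension()
			// ... granted: no schedule(), nothing re-arms
			// the clockevent device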
> -static void __hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now)
> +static void __hrtimer_rearm(struct hrtimer_cpu_base *cpu_base,
> +			    ktime_t now, ktime_t expires_next)
>  {
> -	ktime_t expires_next = hrtimer_update_next_event(cpu_base);
> -
>  	cpu_base->expires_next = expires_next;
>  	cpu_base->in_hrtirq = 0;
>
> @@ -1970,9 +1969,30 @@ void hrtimer_interrupt(struct clock_even
>  		cpu_base->hang_detected = 1;
>  	}
>
> -	__hrtimer_rearm(cpu_base, now);
> +#ifdef TIF_HRTIMER_REARM
> +	set_thread_flag(TIF_HRTIMER_REARM);
> +#else
> +	__hrtimer_rearm(cpu_base, now, expires_next);
> +#endif
in hrtimer.h, where you already have the #ifdef TIF_HRTIMER_REARM
section:

	static inline bool hrtimer_set_rearm_delayed(void)
	{
		set_thread_flag(TIF_HRTIMER_REARM);
		return true;
	}

and an empty stub returning false for the other case; then this becomes:

	if (!hrtimer_set_rearm_delayed())
		hrtimer_rearm(...);

and the ugly ifdef in the code goes away.
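Completely untested sketch of the pair:

	#ifdef TIF_HRTIMER_REARM
	static inline bool hrtimer_set_rearm_delayed(void)
	{
		set_thread_flag(TIF_HRTIMER_REARM);
		return true;
	}
	#else
	static inline bool hrtimer_set_rearm_delayed(void)
	{
		return false;
	}
	#endif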
> 	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
> }
> +
> +#ifdef TIF_HRTIMER_REARM
> +void _hrtimer_rearm(void)
Grr. I had to read this five times to figure out that we now have:

	  hrtimer_rearm()
	 _hrtimer_rearm()
	__hrtimer_rearm()

You clearly ran out of characters to make that obvious:

	hrtimer_rearm_delayed()
	hrtimer_rearm()
	hrtimer_do_rearm()

or something like that.
Thanks,
tglx
On Mon, Feb 02, 2026 at 03:37:13PM +0100, Thomas Gleixner wrote:
> On Wed, Jan 21 2026 at 17:20, Peter Zijlstra wrote:
> > 	while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
> >
> > +		/*
> > +		 * If hrtimers need re-arming, do so before enabling IRQs,
> > +		 * except when a reschedule is needed; in that case schedule()
> > +		 * will do this.
> > +		 */
> > +		if ((ti_work & (_TIF_NEED_RESCHED |
> > +				_TIF_NEED_RESCHED_LAZY |
> > +				_TIF_HRTIMER_REARM)) == _TIF_HRTIMER_REARM)
> > +			hrtimer_rearm();
>
> Two things I'm not convinced are handled correctly:
>
> 1) Interrupts
>
>    After reenabling interrupts and before reaching schedule() an
>    interrupt comes in and runs soft interrupt processing for a while
>    on the way back, which delays the update until that processing
>    completes.
So the basic thing looks like:
<USER-MODE>
  irqentry_enter()
  run_irq_on_irqstack_cond()
      if (user_mode() || hardirq_stack_inuse)
          irq_enter_rcu();
          func_c();
          irq_exit_rcu()
              __irq_exit_rcu()
                  invoke_softirq()
  irqentry_exit()
      irqentry_exit_to_user_mode()
          irqentry_exit_to_user_mode_prepare()
              __exit_to_user_mode_prepare()
                  exit_to_user_mode_loop()
                      ...here...
So a nested IRQ at this point will have !user_mode(), but I think it can
still end up in softirqs due to that hardirq_stack_inuse. Should we
perhaps make sure only user_mode() ends up in softirqs?
> 2) Time slice extension
>
>    When the time slice is granted this will not rearm the clockevent
>    device unless the slice hrtimer becomes the first expiring timer
>    on that CPU, but even then that misses the full reevaluation of
>    the next timer event.
Oh crud yes, that should be something like:
	if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY))
		schedule();
	else
		hrtimer_rearm();
On Mon, Feb 02 2026 at 17:33, Peter Zijlstra wrote:
> On Mon, Feb 02, 2026 at 03:37:13PM +0100, Thomas Gleixner wrote:
>> On Wed, Jan 21 2026 at 17:20, Peter Zijlstra wrote:
>> > 	while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
>> >
>> > +		/*
>> > +		 * If hrtimers need re-arming, do so before enabling IRQs,
>> > +		 * except when a reschedule is needed; in that case schedule()
>> > +		 * will do this.
>> > +		 */
>> > +		if ((ti_work & (_TIF_NEED_RESCHED |
>> > +				_TIF_NEED_RESCHED_LAZY |
>> > +				_TIF_HRTIMER_REARM)) == _TIF_HRTIMER_REARM)
>> > +			hrtimer_rearm();
>>
>> Two things I'm not convinced are handled correctly:
>>
>> 1) Interrupts
>>
>>    After reenabling interrupts and before reaching schedule() an
>>    interrupt comes in and runs soft interrupt processing for a while
>>    on the way back, which delays the update until that processing
>>    completes.
>
> So the basic thing looks like:
>
> <USER-MODE>
>   irqentry_enter()
>   run_irq_on_irqstack_cond()
>       if (user_mode() || hardirq_stack_inuse)
>           irq_enter_rcu();
>           func_c();
>           irq_exit_rcu()
>               __irq_exit_rcu()
>                   invoke_softirq()
>   irqentry_exit()
>       irqentry_exit_to_user_mode()
>           irqentry_exit_to_user_mode_prepare()
>               __exit_to_user_mode_prepare()
>                   exit_to_user_mode_loop()
>                       ...here...
>
> So a nested IRQ at this point will have !user_mode(), but I think it can
> still end up in softirqs due to that hardirq_stack_inuse. Should we
> perhaps make sure only user_mode() ends up in softirqs?
All interrupts, independent of the mode they hit, end up in
irq_exit_rcu() and therefore in __irq_exit_rcu():
    run_irq_on_irqstack_cond()
        if (user_mode() || hardirq_stack_inuse)
            // Stay on user or hardirq stack
            irq_enter_rcu();
            func_c();
            irq_exit_rcu()
        else
            // MAGIC ASM to switch to hardirq stack
            call irq_enter_rcu
            call func_c
            call irq_exit_rcu
The only reason why invoke_softirq() won't be called is when the
interrupt hits into the softirq processing region of the previous
interrupt, which means it's already on the hardirq stack.
But looking at this there is already a problem without interrupt
nesting:
    irq_enter_rcu();
    timer_interrupt()
        hrtimer_interrupt()
            delay_rearm();
    irq_exit_rcu()
        __irq_exit_rcu()
            invoke_softirq()		<- Here
Soft interrupts can run for quite some time, which means this already
can cause timers to be delayed for way too long. I think in
__irq_exit_rcu() you want to do:
	if (!in_interrupt() && local_softirq_pending()) {
		hrtimer_rearm();
		invoke_softirq();
	}
Thanks,
tglx
On Tue, Feb 03, 2026 at 12:28:13AM +0100, Thomas Gleixner wrote:
> But looking at this there is already a problem without interrupt
> nesting:
>
>     irq_enter_rcu();
>     timer_interrupt()
>         hrtimer_interrupt()
>             delay_rearm();
>     irq_exit_rcu()
>         __irq_exit_rcu()
>             invoke_softirq()		<- Here
>
> Soft interrupts can run for quite some time, which means this already
> can cause timers to be delayed for way too long. I think in
> __irq_exit_rcu() you want to do:
>
> 	if (!in_interrupt() && local_softirq_pending()) {
> 		hrtimer_rearm();
> 		invoke_softirq();
> 	}
Right, and we can do the same on (nested) IRQ entry. Something like so:
---
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -63,6 +63,8 @@ static __always_inline unsigned long __e
		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
			if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY))
				schedule();
+			else
+				hrtimer_rearm();
		}

		if (ti_work & _TIF_UPROBE)
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -663,6 +663,13 @@ void irq_enter_rcu(void)
{
	__irq_enter_raw();

+	/*
+	 * If this is a nested IRQ that hits the exit_to_user_mode_loop
+	 * where it has enabled IRQs, but before it has hit schedule(),
+	 * we could have hrtimers in an undefined state. Fix it up here.
+	 */
+	hrtimer_rearm();
+
	if (tick_nohz_full_cpu(smp_processor_id()) ||
	    (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)))
		tick_irq_enter();
@@ -719,8 +726,14 @@ static inline void __irq_exit_rcu(void)
#endif
	account_hardirq_exit(current);
	preempt_count_sub(HARDIRQ_OFFSET);
-	if (!in_interrupt() && local_softirq_pending())
+	if (!in_interrupt() && local_softirq_pending()) {
+		/*
+		 * If we left hrtimers unarmed, make sure to arm them now,
+		 * before enabling interrupts to run SoftIRQ.
+		 */
+		hrtimer_rearm();
		invoke_softirq();
+	}

	if (IS_ENABLED(CONFIG_IRQ_FORCED_THREADING) && force_irqthreads() &&
	    local_timers_pending_force_th() && !(in_nmi() | in_hardirq()))
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1279,8 +1279,8 @@ static int __hrtimer_start_range_ns(stru
	if (timer->is_fuzzy) {
		/*
-		 * XXX fuzzy implies pinned! not sure how to deal with
-		 * retrigger_next_event() for the !local case.
+		 * Fuzzy requires pinned as the lazy programming only works
+		 * for CPU local timers.
		 */
		WARN_ON_ONCE(!(mode & HRTIMER_MODE_PINNED));

		/*
@@ -1898,7 +1898,7 @@ static __latent_entropy void hrtimer_run
/*
 * Very similar to hrtimer_force_reprogram(), except it deals with
- * in_hrirq and hang_detected.
+ * in_hrtirq and hang_detected.
 */
static void __hrtimer_rearm(struct hrtimer_cpu_base *cpu_base,
			    ktime_t now, ktime_t expires_next)
On Tue, Feb 03 2026 at 00:28, Thomas Gleixner wrote:
> On Mon, Feb 02 2026 at 17:33, Peter Zijlstra wrote:
>> So a nested IRQ at this point will have !user_mode(), but I think it can
>> still end up in softirqs due to that hardirq_stack_inuse. Should we
>> perhaps make sure only user_mode() ends up in softirqs?
>
> All interrupts, independent of the mode they hit, end up in
> irq_exit_rcu() and therefore in __irq_exit_rcu():
>
>     run_irq_on_irqstack_cond()
>         if (user_mode() || hardirq_stack_inuse)
>             // Stay on user or hardirq stack
>             irq_enter_rcu();
>             func_c();
>             irq_exit_rcu()
>         else
>             // MAGIC ASM to switch to hardirq stack
>             call irq_enter_rcu
>             call func_c
>             call irq_exit_rcu
>
> The only reason why invoke_softirq() won't be called is when the
> interrupt hits into the softirq processing region of the previous
> interrupt, which means it's already on the hardirq stack.
In the case I pointed out, where the second interrupt hits right after
exit to user enabled interrupts, there is no nesting and it will happily
take the second path, which switches to the hardirq stack and then on
return processes soft interrupts.
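IOW (sketch):

	exit_to_user_mode_loop()
	    local_irq_enable_exit_to_user()
	    <IRQ>				// !user_mode(), stack unused
	        // MAGIC ASM switches to the hardirq stack
	        irq_exit_rcu()
	            invoke_softirq()		// re-arm still pending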
> But looking at this there is already a problem without interrupt
> nesting:
>
>     irq_enter_rcu();
>     timer_interrupt()
>         hrtimer_interrupt()
>             delay_rearm();
>     irq_exit_rcu()
>         __irq_exit_rcu()
>             invoke_softirq()		<- Here
>
> Soft interrupts can run for quite some time, which means this already
> can cause timers to be delayed for way too long. I think in
> __irq_exit_rcu() you want to do:
>
> 	if (!in_interrupt() && local_softirq_pending()) {
> 		hrtimer_rearm();
> 		invoke_softirq();
> 	}
Actually it's worse. Assume the CPU on which this happens has the
jiffies duty. As the timer does not fire, jiffies become stale. So
anything which relies on jiffies going forward will get stuck until some
other condition breaks the tie. That's going to be fun to debug :)
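For reference, the dependency is roughly this (simplified from
tick_sched_do_timer()):

	/* Only the CPU which has the jiffies duty advances jiffies */
	if (tick_do_timer_cpu == cpu)
		tick_do_update_jiffies64(now);

No timer interrupt on that CPU, no jiffies update.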
Thanks,
tglx
On Wed, 21 Jan 2026 17:20:15 +0100
Peter Zijlstra <peterz@infradead.org> wrote:
> +#ifdef TIF_HRTIMER_REARM
> +void _hrtimer_rearm(void)
> +{
> +	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
> +	ktime_t now, expires_next;
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	scoped_guard (raw_spinlock, &cpu_base->lock) {
> +		now = hrtimer_update_base(cpu_base);
> +		expires_next = hrtimer_update_next_event(cpu_base);
> +		__hrtimer_rearm(cpu_base, now, expires_next);
> +		clear_thread_flag(TIF_HRTIMER_REARM);
> +	}
> +}
I'm curious as to why you decided to use scoped_guard() here and not just
guard(), avoiding the extra indentation? The function is small enough
that everything is expected to be protected by the spinlock.
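i.e. something like this (untested):

	void _hrtimer_rearm(void)
	{
		struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);

		lockdep_assert_irqs_disabled();

		guard(raw_spinlock)(&cpu_base->lock);

		ktime_t now = hrtimer_update_base(cpu_base);
		ktime_t expires_next = hrtimer_update_next_event(cpu_base);

		__hrtimer_rearm(cpu_base, now, expires_next);
		clear_thread_flag(TIF_HRTIMER_REARM);
	}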
-- Steve
On Fri, Jan 23, 2026 at 03:08:43PM -0500, Steven Rostedt wrote:
> On Wed, 21 Jan 2026 17:20:15 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
>
> > +#ifdef TIF_HRTIMER_REARM
> > +void _hrtimer_rearm(void)
> > +{
> > +	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
> > +	ktime_t now, expires_next;
> > +
> > +	lockdep_assert_irqs_disabled();
> > +
> > +	scoped_guard (raw_spinlock, &cpu_base->lock) {
> > +		now = hrtimer_update_base(cpu_base);
> > +		expires_next = hrtimer_update_next_event(cpu_base);
> > +		__hrtimer_rearm(cpu_base, now, expires_next);
> > +		clear_thread_flag(TIF_HRTIMER_REARM);
> > +	}
> > +}
>
> I'm curious as to why you decided to use scoped_guard() here and not just
> guard(), avoiding the extra indentation? The function is small enough
> that everything is expected to be protected by the spinlock.
Yeah, I'm not entirely sure... it's been over 6 months since I wrote this
code :-/