On large NUMA systems, while running a test program that saturates the
inter-processor and inter-NUMA links, acquiring the jiffies_lock can
be very expensive. If the CPU designated to do jiffies updates
(tick_do_timer_cpu) gets delayed, other CPUs decide to do the
jiffies update themselves, and a large number of them do so at
the same time. The inexpensive check against tick_next_period is far
quicker than actually acquiring the lock, so most of these get in line
to obtain the lock. If obtaining the lock is slow enough, this
spirals into the vast majority of CPUs continuously being stuck
waiting for the lock, only to obtain it and find out that time has
already been updated by another CPU. For example, on one random entry
into kdb via a manually-injected NMI, I saw 2912 of 3840 CPUs stuck here.
To avoid this, in tick_sched_do_timer() have CPUs that are not the
official timekeeper only try for the lock, and if it is held by
another CPU, leave the updating of jiffies to the lock holder. If the
update is not yet guaranteed to be complete, do not reset
ts->stalled_jiffies, so the check for stalled jiffies continues on the
next tick.
With this change, when manually interrupting the test I find at most
one CPU in the tick_do_update_jiffies64() function.
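For illustration only, here is a minimal userspace analogue of the
check-then-trylock pattern described above. It is not kernel code: the
pthread mutex stands in for jiffies_lock, and the names (fake_jiffies,
next_period, NTHREADS) are invented for the sketch. Threads first do
the cheap lockless check; only when it fails do they try for the lock,
and a thread that loses the trylock simply skips the update, roughly as
non-timekeeper CPUs do in the patch.

/*
 * Userspace sketch (not kernel code) of the check-then-trylock pattern.
 * All names here are illustrative, not taken from tick-sched.c.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NTHREADS 64
#define PERIODS  1000

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_long fake_jiffies;      /* stands in for jiffies_64        */
static atomic_long next_period;       /* stands in for tick_next_period  */
static atomic_long lock_acquisitions; /* how often the slow path was hit */

static bool try_update(long now)
{
	/* Quick, lockless check: someone else already advanced time. */
	if (now < atomic_load(&next_period))
		return true;

	/* Quick check failed; only one updater is useful, so trylock. */
	if (pthread_mutex_trylock(&update_lock) != 0)
		return false;	/* the lock holder will do the update */

	atomic_fetch_add(&lock_acquisitions, 1);
	/* Re-check under the lock, then advance time. */
	if (now >= atomic_load(&next_period)) {
		atomic_fetch_add(&fake_jiffies, 1);
		atomic_store(&next_period, now + 1);
	}
	pthread_mutex_unlock(&update_lock);
	return true;
}

static void *ticker(void *arg)
{
	(void)arg;
	for (long now = 0; now < PERIODS; now++)
		try_update(now);
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];

	for (int i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, ticker, NULL);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);

	printf("fake_jiffies=%ld slow-path acquisitions=%ld\n",
	       atomic_load(&fake_jiffies), atomic_load(&lock_acquisitions));
	return 0;
}

The point of the sketch is that losers of the trylock never queue on
the mutex, so only the current lock holder is ever in the slow path.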
Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
---
kernel/time/tick-sched.c | 46 ++++++++++++++++++++++++++++++++--------
1 file changed, 37 insertions(+), 9 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c527b421c865..706d4e235989 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -54,7 +54,7 @@ static ktime_t last_jiffies_update;
/*
* Must be called with interrupts disabled !
*/
-static void tick_do_update_jiffies64(ktime_t now)
+static bool _tick_do_update_jiffies64(ktime_t now, bool trylock)
{
unsigned long ticks = 1;
ktime_t delta, nextp;
@@ -70,7 +70,7 @@ static void tick_do_update_jiffies64(ktime_t now)
*/
if (IS_ENABLED(CONFIG_64BIT)) {
if (ktime_before(now, smp_load_acquire(&tick_next_period)))
- return;
+ return true;
} else {
unsigned int seq;
@@ -84,18 +84,24 @@ static void tick_do_update_jiffies64(ktime_t now)
} while (read_seqcount_retry(&jiffies_seq, seq));
if (ktime_before(now, nextp))
- return;
+ return true;
}
/* Quick check failed, i.e. update is required. */
- raw_spin_lock(&jiffies_lock);
+ if (trylock) {
+ /* The cpu holding the lock will do the update. */
+ if (!raw_spin_trylock(&jiffies_lock))
+ return false;
+ } else {
+ raw_spin_lock(&jiffies_lock);
+ }
/*
* Re-evaluate with the lock held. Another CPU might have done the
* update already.
*/
if (ktime_before(now, tick_next_period)) {
raw_spin_unlock(&jiffies_lock);
- return;
+ return true;
}
write_seqcount_begin(&jiffies_seq);
@@ -147,6 +153,27 @@ static void tick_do_update_jiffies64(ktime_t now)
raw_spin_unlock(&jiffies_lock);
update_wall_time();
+ return true;
+}
+
+/*
+ * Obtains the lock and does not return until the update is complete.
+ * Must be called with interrupts disabled.
+ */
+static void tick_do_update_jiffies64(ktime_t now)
+{
+ _tick_do_update_jiffies64(now, false);
+}
+
+/*
+ * Returns early if another cpu holds the lock; in that case the lock
+ * holder is doing the update, which may not have completed yet.
+ * Must be called with interrupts disabled.
+ * Returns false if the update might not yet be complete.
+ */
+static bool tick_attempt_update_jiffies64(ktime_t now)
+{
+ return _tick_do_update_jiffies64(now, true);
}
/*
@@ -239,10 +266,11 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies);
} else {
- if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
- tick_do_update_jiffies64(now);
- ts->stalled_jiffies = 0;
- ts->last_tick_jiffies = READ_ONCE(jiffies);
+ if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) {
+ if (tick_attempt_update_jiffies64(now)) {
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ }
}
}
--
2.26.2
On Mon, Oct 13 2025 at 10:09, Steve Wahl wrote:
> -static void tick_do_update_jiffies64(ktime_t now)
> +static bool _tick_do_update_jiffies64(ktime_t now, bool trylock)
> {
> unsigned long ticks = 1;
> ktime_t delta, nextp;
> @@ -70,7 +70,7 @@ static void tick_do_update_jiffies64(ktime_t now)
> */
> if (IS_ENABLED(CONFIG_64BIT)) {
> if (ktime_before(now, smp_load_acquire(&tick_next_period)))
> - return;
> + return true;
> } else {
> unsigned int seq;
>
> @@ -84,18 +84,24 @@ static void tick_do_update_jiffies64(ktime_t now)
> } while (read_seqcount_retry(&jiffies_seq, seq));
>
> if (ktime_before(now, nextp))
> - return;
> + return true;
> }
>
> /* Quick check failed, i.e. update is required. */
> - raw_spin_lock(&jiffies_lock);
> + if (trylock) {
> + /* The cpu holding the lock will do the update. */
> + if (!raw_spin_trylock(&jiffies_lock))
> + return false;
> + } else {
> + raw_spin_lock(&jiffies_lock);
> + }
Why inflict this horrible conditional locking scheme on the main
path? This can be solved, without all this churn, completely
independently of this function.
Something like the uncompiled below. You get the idea.
Thanks,
tglx
---
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -203,6 +203,21 @@ static inline void tick_sched_flag_clear
#define MAX_STALLED_JIFFIES 5
+static bool tick_try_update_jiffies64(struct tick_sched *ts, ktime_t now)
+{
+ static atomic_t in_progress;
+ int inp;
+
+ inp = atomic_read(&in_progress);
+ if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1))
+ return false;
+
+ if (ts->last_tick_jiffies == jiffies)
+ tick_do_update_jiffies64(now);
+ atomic_set(&in_progress, 0);
+ return true;
+}
+
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
int tick_cpu, cpu = smp_processor_id();
@@ -239,10 +254,11 @@ static void tick_sched_do_timer(struct t
ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies);
} else {
- if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
- tick_do_update_jiffies64(now);
- ts->stalled_jiffies = 0;
- ts->last_tick_jiffies = READ_ONCE(jiffies);
+ if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) {
+ if (tick_try_update_jiffies64(ts, now)) {
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ }
}
}
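For readers unfamiliar with the pattern, here is a minimal userspace
sketch of the single-updater gate suggested above, using C11 atomics in
place of the kernel's atomic_t helpers. do_slow_update() and the other
names are placeholders, not kernel symbols.

/*
 * Userspace sketch (not kernel code) of the single-updater gate:
 * one thread wins the compare-exchange and performs the (possibly
 * slow) update; everyone else bails out immediately instead of
 * queueing on a contended lock.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int in_progress;	/* 0 = idle, 1 = an update is running */

static void do_slow_update(void)
{
	/* stands in for tick_do_update_jiffies64(): may block on a lock */
}

static bool try_update(void)
{
	int inp = atomic_load(&in_progress);

	/* Someone is already updating, or we lost the race to claim it. */
	if (inp || !atomic_compare_exchange_strong(&in_progress, &inp, 1))
		return false;

	do_slow_update();
	atomic_store(&in_progress, 0);
	return true;
}

int main(void)
{
	return try_update() ? 0 : 1;	/* single-threaded demo call */
}

The gate admits at most one caller at a time; everyone else gets false
back immediately and, as in the hunk above, leaves ts->stalled_jiffies
alone so the check fires again on the next tick.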
On Thu, Oct 16, 2025 at 10:07:07PM +0200, Thomas Gleixner wrote:
> On Mon, Oct 13 2025 at 10:09, Steve Wahl wrote:
> > -static void tick_do_update_jiffies64(ktime_t now)
> > +static bool _tick_do_update_jiffies64(ktime_t now, bool trylock)
> > {
> > unsigned long ticks = 1;
> > ktime_t delta, nextp;
> > @@ -70,7 +70,7 @@ static void tick_do_update_jiffies64(ktime_t now)
> > */
> > if (IS_ENABLED(CONFIG_64BIT)) {
> > if (ktime_before(now, smp_load_acquire(&tick_next_period)))
> > - return;
> > + return true;
> > } else {
> > unsigned int seq;
> >
> > @@ -84,18 +84,24 @@ static void tick_do_update_jiffies64(ktime_t now)
> > } while (read_seqcount_retry(&jiffies_seq, seq));
> >
> > if (ktime_before(now, nextp))
> > - return;
> > + return true;
> > }
> >
> > /* Quick check failed, i.e. update is required. */
> > - raw_spin_lock(&jiffies_lock);
> > + if (trylock) {
> > + /* The cpu holding the lock will do the update. */
> > + if (!raw_spin_trylock(&jiffies_lock))
> > + return false;
> > + } else {
> > + raw_spin_lock(&jiffies_lock);
> > + }
>
> Why inflict this horrible conditional locking scheme on the main
> path? This can be solved, without all this churn, completely
> independently of this function.
Why? Because my first crack at the problem was to just change to a
trylock at all times. But I saw some callers might depend on time
being updated before return. And I would say I didn't "zoom out" far
enough from the simple fix when trying to accommodate that.
> Something like the uncompiled below. You get the idea.
I like this approach.
The reason I'm getting into this situation seems to be that the
designated timekeeper is actually doing the jiffies update work: it is
holding the lock and hasn't finished processing yet. Under those
conditions, this approach will have one extra CPU stuck waiting on the
jiffies lock.
But that's far better than thousands, and I think it would be an
acceptable tradeoff for code readability. I will make it compile and
see how it tests, and proceed to make it patch v2 if it seems OK.
> Thanks,
>
> tglx
Thank you!
--> Steve Wahl
> ---
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -203,6 +203,21 @@ static inline void tick_sched_flag_clear
>
> #define MAX_STALLED_JIFFIES 5
>
> +static bool tick_try_update_jiffies64(struct tick_sched *ts, ktime_t now)
> +{
> + static atomic_t in_progress;
> + int inp;
> +
> + inp = atomic_read(&in_progress);
> + if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1))
> + return false;
> +
> + if (ts->last_tick_jiffies == jiffies)
> + tick_do_update_jiffies64(now);
> + atomic_set(&in_progress, 0);
> + return true;
> +}
> +
> static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
> {
> int tick_cpu, cpu = smp_processor_id();
> @@ -239,10 +254,11 @@ static void tick_sched_do_timer(struct t
> ts->stalled_jiffies = 0;
> ts->last_tick_jiffies = READ_ONCE(jiffies);
> } else {
> - if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
> - tick_do_update_jiffies64(now);
> - ts->stalled_jiffies = 0;
> - ts->last_tick_jiffies = READ_ONCE(jiffies);
> + if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) {
> + if (tick_try_update_jiffies64(ts, now)) {
> + ts->stalled_jiffies = 0;
> + ts->last_tick_jiffies = READ_ONCE(jiffies);
> + }
> }
> }
>
--
Steve Wahl, Hewlett Packard Enterprise