[patch 2/2] timekeeping: Always check for negative motion

Thomas Gleixner posted 2 patches 1 year, 3 months ago
[patch 2/2] timekeeping: Always check for negative motion
Posted by Thomas Gleixner 1 year, 3 months ago
clocksource_delta() has two variants. One with a check for negative motion,
which is only selected by x86. This is a historic leftover as this function
was previously used in the time getter hot paths.

Since 135225a363ae timekeeping_cycles_to_ns() has unconditional protection
against this as a by-product of the protection against 64bit math overflow.

clocksource_delta() is only used in the clocksource watchdog and in
timekeeping_advance(). The extra conditional there is not hurting anyone.

Remove the config option and unconditionally prevent negative motion of the
readout.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/Kconfig                   |    1 -
 kernel/time/Kconfig                |    5 -----
 kernel/time/timekeeping_internal.h |    7 -------
 3 files changed, 13 deletions(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -145,7 +145,6 @@ config X86
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select BUILDTIME_TABLE_SORT
 	select CLKEVT_I8253
-	select CLOCKSOURCE_VALIDATE_LAST_CYCLE
 	select CLOCKSOURCE_WATCHDOG
 	# Word-size accesses may read uninitialized data past the trailing \0
 	# in strings and cause false KMSAN reports.
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -17,11 +17,6 @@ config ARCH_CLOCKSOURCE_DATA
 config ARCH_CLOCKSOURCE_INIT
 	bool
 
-# Clocksources require validation of the clocksource against the last
-# cycle update - x86/TSC misfeature
-config CLOCKSOURCE_VALIDATE_LAST_CYCLE
-	bool
-
 # Timekeeping vsyscall support
 config GENERIC_TIME_VSYSCALL
 	bool
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -30,7 +30,6 @@ static inline void timekeeping_inc_mg_fl
 
 #endif
 
-#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
 static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
 {
 	u64 ret = (now - last) & mask;
@@ -41,12 +40,6 @@ static inline u64 clocksource_delta(u64
 	 */
 	return ret & ~(mask >> 1) ? 0 : ret;
 }
-#else
-static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
-{
-	return (now - last) & mask;
-}
-#endif
 
 /* Semi public for serialization of non timekeeper VDSO updates. */
 unsigned long timekeeper_lock_irqsave(void);
Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by Guenter Roeck 1 year, 2 months ago
Hi,

On Thu, Oct 31, 2024 at 01:04:08PM +0100, Thomas Gleixner wrote:
> clocksource_delta() has two variants. One with a check for negative motion,
> which is only selected by x86. This is a historic leftover as this function
> was previously used in the time getter hot paths.
> 
> Since 135225a363ae timekeeping_cycles_to_ns() has unconditional protection
> against this as a by-product of the protection against 64bit math overflow.
> 
> clocksource_delta() is only used in the clocksource watchdog and in
> timekeeping_advance(). The extra conditional there is not hurting anyone.
> 
> Remove the config option and unconditionally prevent negative motion of the
> readout.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

This patch causes the kuda-bmc qemu emulation to stall. Reverting it fixes
the problem.

Bisecting the problem was a bit difficult. I attached two bisect results.
The first is on mainline, the second from timers-core-2024-11-18 after
rebasing it on top of mainline. The second log shows how the problem is
introduced.

Guenter

---
bisect on mainline:

# bad: [06afb0f36106ecb839c5e2509905e68c1e2677de] Merge tag 'trace-v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
# good: [adc218676eef25575469234709c2d87185ca223a] Linux 6.12
git bisect start 'HEAD' 'v6.12'
# bad: [6e95ef0258ff4ee23ae3b06bf6b00b33dbbd5ef7] Merge tag 'bpf-next-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
git bisect bad 6e95ef0258ff4ee23ae3b06bf6b00b33dbbd5ef7
# bad: [aad3a0d084513e811233ad48bf234fbfcfcd0a14] Merge tag 'ftrace-v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
git bisect bad aad3a0d084513e811233ad48bf234fbfcfcd0a14
# good: [ba1f9c8fe3d443a78814cdf8ac8f9829b5ca7095] Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
git bisect good ba1f9c8fe3d443a78814cdf8ac8f9829b5ca7095
# good: [9d7d4ad222aea8ab482e78858d03b10221c7fe78] Merge tag 'objtool-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect good 9d7d4ad222aea8ab482e78858d03b10221c7fe78
# good: [035238752319a58244d86facd442c5f40b0e97e2] Merge tag 'timers-vdso-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect good 035238752319a58244d86facd442c5f40b0e97e2
# bad: [e5cfc0989d9a2849c51c720a16b90b2c061a1aeb] clocksource/drivers/timer-ti-dm: Fix child node refcount handling
git bisect bad e5cfc0989d9a2849c51c720a16b90b2c061a1aeb
# bad: [147ba943024e564e89d9ac265d6a07a0d2c03988] timekeeping: Merge timekeeping_update_staged() and timekeeping_update()
git bisect bad 147ba943024e564e89d9ac265d6a07a0d2c03988
# bad: [f36eb171410839325fff9cd9b7b7400f7e606962] timers: Update function descriptions of sleep/delay related functions
git bisect bad f36eb171410839325fff9cd9b7b7400f7e606962
# bad: [9d7130dfc0e1c53112fcbed4b9f566d0f6fbc949] ntp: Move pps_jitter into ntp_data
git bisect bad 9d7130dfc0e1c53112fcbed4b9f566d0f6fbc949
# bad: [ec93ec22aa10fb5311c0f068ee66c5b6d39788fe] ntp: Move tick_length* into ntp_data
git bisect bad ec93ec22aa10fb5311c0f068ee66c5b6d39788fe
# bad: [38007dc032bd90920463c5d2e6a27d89f7617d6d] ntp: Cleanup formatting of code
git bisect bad 38007dc032bd90920463c5d2e6a27d89f7617d6d
# bad: [66606a93849bfe3cbe9f0b801b40f60b87c54e11] ntp: Make tick_usec static
git bisect bad 66606a93849bfe3cbe9f0b801b40f60b87c54e11
# bad: [a849a0273d0f73a252d14d31c5003a8059ea51fc] ntp: Remove unused tick_nsec
git bisect bad a849a0273d0f73a252d14d31c5003a8059ea51fc
# first bad commit: [a849a0273d0f73a252d14d31c5003a8059ea51fc] ntp: Remove unused tick_nsec

---
bisect on timers-core-2024-11-18 after rebasing it on top of mainline
prior to the merge:

# bad: [997be8ee5090ece72d32e0386400a3e0046d29c7] posix-timers: Fix spurious warning on double enqueue versus do_exit()
# good: [035238752319a58244d86facd442c5f40b0e97e2] Merge tag 'timers-vdso-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect start 'HEAD' 'bf9aa14fc523~1'
# good: [820128b09e6443aa6e2fc4efa1488e6c6c9d41d6] signal: Confine POSIX_TIMERS properly
git bisect good 820128b09e6443aa6e2fc4efa1488e6c6c9d41d6
# bad: [3349e563d11aa9fdb957be9ccd1a3f0db771a748] signal: Provide ignored_posix_timers list
git bisect bad 3349e563d11aa9fdb957be9ccd1a3f0db771a748
# good: [6e882ecdf76efe3846d3d327c5e17f7ca1252397] clocksource/drivers/timer-tegra: Remove clockevents shutdown call on offlining
git bisect good 6e882ecdf76efe3846d3d327c5e17f7ca1252397
# bad: [5c6b73e3bf36c30005948f072e9d5d351db4d7e0] posix-timers: Add a refcount to struct k_itimer
git bisect bad 5c6b73e3bf36c30005948f072e9d5d351db4d7e0
# bad: [6ab61583280a183828024101549d69b28ca1be9e] posix-cpu-timers: Correctly update timer status in posix_cpu_timer_del()
git bisect bad 6ab61583280a183828024101549d69b28ca1be9e
# good: [1c4cf0acb9a94d3d22ae24ba5a22b9892bf6d025] timekeeping: Remove CONFIG_DEBUG_TIMEKEEPING
git bisect good 1c4cf0acb9a94d3d22ae24ba5a22b9892bf6d025
# bad: [cc9811d2b494c3e12bab01a79be8f2a0c16b32c8] timekeeping: Always check for negative motion
git bisect bad cc9811d2b494c3e12bab01a79be8f2a0c16b32c8
# first bad commit: [cc9811d2b494c3e12bab01a79be8f2a0c16b32c8] timekeeping: Always check for negative motion
Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by John Stultz 1 year, 2 months ago
On Sun, Nov 24, 2024 at 4:48 PM Guenter Roeck <linux@roeck-us.net> wrote:
> On Thu, Oct 31, 2024 at 01:04:08PM +0100, Thomas Gleixner wrote:
> > clocksource_delta() has two variants. One with a check for negative motion,
> > which is only selected by x86. This is a historic leftover as this function
> > was previously used in the time getter hot paths.
> >
> > Since 135225a363ae timekeeping_cycles_to_ns() has unconditional protection
> > against this as a by-product of the protection against 64bit math overflow.
> >
> > clocksource_delta() is only used in the clocksource watchdog and in
> > timekeeping_advance(). The extra conditional there is not hurting anyone.
> >
> > Remove the config option and unconditionally prevent negative motion of the
> > readout.
> >
> > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>
> This patch causes the kuda-bmc qemu emulation to stall. Reverting it fixes
> the problem.
>

I'm not familiar with kuda-bmc and I'm not finding too many details
searching on it.
From other qemu bmc reults I'm guessing this is an arm32 architecture?

Do you have any more details about where it's stalling?  Also dmesg
details might help illuminate what clocksource was used, etc.
I'm wondering if the clocksource mask value is incorrect for the
clocksource being used here?

thanks
-john
Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by Guenter Roeck 1 year, 2 months ago
On 11/27/24 14:08, John Stultz wrote:
> On Sun, Nov 24, 2024 at 4:48 PM Guenter Roeck <linux@roeck-us.net> wrote:
>> On Thu, Oct 31, 2024 at 01:04:08PM +0100, Thomas Gleixner wrote:
>>> clocksource_delta() has two variants. One with a check for negative motion,
>>> which is only selected by x86. This is a historic leftover as this function
>>> was previously used in the time getter hot paths.
>>>
>>> Since 135225a363ae timekeeping_cycles_to_ns() has unconditional protection
>>> against this as a by-product of the protection against 64bit math overflow.
>>>
>>> clocksource_delta() is only used in the clocksource watchdog and in
>>> timekeeping_advance(). The extra conditional there is not hurting anyone.
>>>
>>> Remove the config option and unconditionally prevent negative motion of the
>>> readout.
>>>
>>> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>>
>> This patch causes the kuda-bmc qemu emulation to stall. Reverting it fixes
>> the problem.
>>
> 
> I'm not familiar with kuda-bmc and I'm not finding too many details
> searching on it.

Sorry, I keep misspelling it. kudo-bmc.

>>From other qemu bmc reults I'm guessing this is an arm32 architecture?
> 
Yes.

> Do you have any more details about where it's stalling?  Also dmesg
> details might help illuminate what clocksource was used, etc.
> I'm wondering if the clocksource mask value is incorrect for the
> clocksource being used here?
> 

An example log is at [1]. It says

clocksource: npcm7xx-timer1: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 597268854 ns
...
clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
...
clocksource: Switched to clocksource npcm7xx-timer1

I don't know where exactly it stalls; sometime after handover to userspace.
I'll be happy to do some more debugging, but you'll nee to let me know what
to look for.

Thanks,
Guenter

---
[1] https://kerneltests.org/builders/qemu-arm-v7-master/builds/324/steps/qemubuildcommand/logs/stdio

Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by Thomas Gleixner 1 year, 2 months ago
On Wed, Nov 27 2024 at 15:02, Guenter Roeck wrote:
> On 11/27/24 14:08, John Stultz wrote:
> An example log is at [1]. It says
>
> clocksource: npcm7xx-timer1: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 597268854 ns

That's a 24bit counter. So negative motion happens when the readouts are
more than (1 << 23) apart. AFAICT the counter runs with about 14MHz, but
I'd like to have that confirmed.

> clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
> ...
> clocksource: Switched to clocksource npcm7xx-timer1
>
> I don't know where exactly it stalls; sometime after handover to userspace.
> I'll be happy to do some more debugging, but you'll nee to let me know what
> to look for.

On that platform max_idle_ns should correspond to 50% of the counter
width. So if both CPUs go deep idle for max_idle_ns, then the next timer
interrupt doing the timeeeping advancement sees a delta of > (1 << 23)
and timekeeping stalls.

If my ssumption is correct, then the below should fix it.

Thanks,

        tglx
---
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2221,7 +2221,7 @@ static bool timekeeping_advance(enum tim
 	struct timekeeper *real_tk = &tk_core.timekeeper;
 	unsigned int clock_set = 0;
 	int shift = 0, maxshift;
-	u64 offset;
+	u64 offset, maxcyc;
 
 	guard(raw_spinlock_irqsave)(&tk_core.lock);
 
@@ -2229,8 +2229,13 @@ static bool timekeeping_advance(enum tim
 	if (unlikely(timekeeping_suspended))
 		return false;
 
-	offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
-				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+	offset = tk_clock_read(&tk->tkr_mono) - tk->tkr_mono.cycle_last;
+	offset &= tk->tkr_mono.mask;
+
+	maxcyc = tk->tkr_mono.mask >>= 1;
+	maxcyc += tk->tkr_mono.mask >>= 2;
+	if (offset > maxcyc)
+		offset = 0;
 
 	/* Check if there's really nothing to do */
 	if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by Guenter Roeck 1 year, 2 months ago
On 11/28/24 06:51, Thomas Gleixner wrote:
> On Wed, Nov 27 2024 at 15:02, Guenter Roeck wrote:
>> On 11/27/24 14:08, John Stultz wrote:
>> An example log is at [1]. It says
>>
>> clocksource: npcm7xx-timer1: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 597268854 ns
> 
> That's a 24bit counter. So negative motion happens when the readouts are
> more than (1 << 23) apart. AFAICT the counter runs with about 14MHz, but
> I'd like to have that confirmed.
> 
>> clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
>> ...
>> clocksource: Switched to clocksource npcm7xx-timer1
>>
>> I don't know where exactly it stalls; sometime after handover to userspace.
>> I'll be happy to do some more debugging, but you'll nee to let me know what
>> to look for.
> 
> On that platform max_idle_ns should correspond to 50% of the counter
> width. So if both CPUs go deep idle for max_idle_ns, then the next timer
> interrupt doing the timeeeping advancement sees a delta of > (1 << 23)
> and timekeeping stalls.
> 
> If my ssumption is correct, then the below should fix it.
> 

While that didn't work, the following code does.

Guenter

---
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0ca85ff4fbb4..bd88c04ae357 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2190,7 +2190,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
         /* Accumulate one shifted interval */
         offset -= interval;
         tk->tkr_mono.cycle_last += interval;
-       tk->tkr_raw.cycle_last  += interval;
+       tk->tkr_raw.cycle_last  = (tk->tkr_raw.cycle_last + interval) & tk->tkr_mono.mask;

         tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
         *clock_set |= accumulate_nsecs_to_secs(tk);
Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by Guenter Roeck 1 year, 2 months ago
On 11/28/24 07:57, Guenter Roeck wrote:
> On 11/28/24 06:51, Thomas Gleixner wrote:
>> On Wed, Nov 27 2024 at 15:02, Guenter Roeck wrote:
>>> On 11/27/24 14:08, John Stultz wrote:
>>> An example log is at [1]. It says
>>>
>>> clocksource: npcm7xx-timer1: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 597268854 ns
>>
>> That's a 24bit counter. So negative motion happens when the readouts are
>> more than (1 << 23) apart. AFAICT the counter runs with about 14MHz, but
>> I'd like to have that confirmed.
>>
>>> clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
>>> ...
>>> clocksource: Switched to clocksource npcm7xx-timer1
>>>
>>> I don't know where exactly it stalls; sometime after handover to userspace.
>>> I'll be happy to do some more debugging, but you'll nee to let me know what
>>> to look for.
>>
>> On that platform max_idle_ns should correspond to 50% of the counter
>> width. So if both CPUs go deep idle for max_idle_ns, then the next timer
>> interrupt doing the timeeeping advancement sees a delta of > (1 << 23)
>> and timekeeping stalls.
>>
>> If my ssumption is correct, then the below should fix it.
>>
> 
> While that didn't work, the following code does.
> 
> Guenter
> 
> ---
> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> index 0ca85ff4fbb4..bd88c04ae357 100644
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -2190,7 +2190,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
>          /* Accumulate one shifted interval */
>          offset -= interval;
>          tk->tkr_mono.cycle_last += interval;
> -       tk->tkr_raw.cycle_last  += interval;
> +       tk->tkr_raw.cycle_last  = (tk->tkr_raw.cycle_last + interval) & tk->tkr_mono.mask;
                                          ^^^^^^^                              ^^^^^^^^

No idea what I was testing earlier, but that obviously doesn't work either, and masking
both tkr_raw.cycle_last and tk->tkr_mono.cycle_last also doesn't work. Sorry for the noise.

Guenter

Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by Guenter Roeck 1 year, 2 months ago
On 11/28/24 09:13, Guenter Roeck wrote:
> On 11/28/24 07:57, Guenter Roeck wrote:
>> On 11/28/24 06:51, Thomas Gleixner wrote:
>>> On Wed, Nov 27 2024 at 15:02, Guenter Roeck wrote:
>>>> On 11/27/24 14:08, John Stultz wrote:
>>>> An example log is at [1]. It says
>>>>
>>>> clocksource: npcm7xx-timer1: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 597268854 ns
>>>
>>> That's a 24bit counter. So negative motion happens when the readouts are
>>> more than (1 << 23) apart. AFAICT the counter runs with about 14MHz, but
>>> I'd like to have that confirmed.
>>>
>>>> clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
>>>> ...
>>>> clocksource: Switched to clocksource npcm7xx-timer1
>>>>
>>>> I don't know where exactly it stalls; sometime after handover to userspace.
>>>> I'll be happy to do some more debugging, but you'll nee to let me know what
>>>> to look for.
>>>
>>> On that platform max_idle_ns should correspond to 50% of the counter
>>> width. So if both CPUs go deep idle for max_idle_ns, then the next timer
>>> interrupt doing the timeeeping advancement sees a delta of > (1 << 23)
>>> and timekeeping stalls.
>>>
>>> If my ssumption is correct, then the below should fix it.
>>>
>>
>> While that didn't work, the following code does.
>>
>> Guenter
>>
>> ---
>> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
>> index 0ca85ff4fbb4..bd88c04ae357 100644
>> --- a/kernel/time/timekeeping.c
>> +++ b/kernel/time/timekeeping.c
>> @@ -2190,7 +2190,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
>>          /* Accumulate one shifted interval */
>>          offset -= interval;
>>          tk->tkr_mono.cycle_last += interval;
>> -       tk->tkr_raw.cycle_last  += interval;
>> +       tk->tkr_raw.cycle_last  = (tk->tkr_raw.cycle_last + interval) & tk->tkr_mono.mask;
>                                           ^^^^^^^                              ^^^^^^^^
> 
> No idea what I was testing earlier, but that obviously doesn't work either, and masking
> both tkr_raw.cycle_last and tk->tkr_mono.cycle_last also doesn't work. Sorry for the noise.
> 

The patch below works better, but it still stalls as soon as the counter wraps,
i.e., when the function returns 0.

Guenter

---
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 63e600e943a7..3fe10407b1d5 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -32,7 +32,7 @@ static inline void timekeeping_inc_mg_floor_swaps(void)

  static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
  {
-       u64 ret = (now - last) & mask;
+       u64 ret = (now - (last & mask)) & mask;

         /*
          * Prevent time going backwards by checking the MSB of mask in

Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by Guenter Roeck 1 year, 2 months ago
On 11/28/24 06:51, Thomas Gleixner wrote:
> On Wed, Nov 27 2024 at 15:02, Guenter Roeck wrote:
>> On 11/27/24 14:08, John Stultz wrote:
>> An example log is at [1]. It says
>>
>> clocksource: npcm7xx-timer1: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 597268854 ns
> 
> That's a 24bit counter. So negative motion happens when the readouts are
> more than (1 << 23) apart. AFAICT the counter runs with about 14MHz, but
> I'd like to have that confirmed.
> 
>> clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
>> ...
>> clocksource: Switched to clocksource npcm7xx-timer1
>>
>> I don't know where exactly it stalls; sometime after handover to userspace.
>> I'll be happy to do some more debugging, but you'll nee to let me know what
>> to look for.
> 
> On that platform max_idle_ns should correspond to 50% of the counter
> width. So if both CPUs go deep idle for max_idle_ns, then the next timer
> interrupt doing the timeeeping advancement sees a delta of > (1 << 23)
> and timekeeping stalls.
> 
> If my ssumption is correct, then the below should fix it.
> 
Unfortunately not, it just makes things worse and stalls earlier.

[    2.080000] Fusion MPT SAS Host driver 3.04.20
[    2.080000] npcm7xx-ehci f0806000.usb: EHCI Host Controller
[    2.080000] npcm7xx-ehci f0806000.usb: new USB bus registered, assigned bus number 1
[    2.080000] npcm7xx-ehci f0806000.usb: irq 39, io mem 0xf0806000
qemu-system-arm: terminating on signal 15 from pid 17184 (/bin/bash)

There is something else going on. I reverted the offending patch and
added

+       pr_warn("###### now: %llx last: %llx mask: %llx return: %llx", now, last, mask, (now - last) & mask);

into clocksource_delta(). Here is what I get:

...
[    3.310000] ###### now: f8c502 last: f6c964 mask: ffffff return: 1fb9e
[    3.320000] ###### now: fbc513 last: f8b1ac mask: ffffff return: 31367
[    3.330000] ###### now: fd368d last: fa99f4 mask: ffffff return: 29c99
[    3.340000] ###### now: ffe8b3 last: fc823c mask: ffffff return: 36677
[    3.350000] ###### now: 6c4f last: fe6a84 mask: ffffff return: 201cb      <---
[    3.360000] ###### now: 40427 last: 10052cc mask: ffffff return: 3b15b    <---
[    3.370000] ###### now: 43bfe last: 1023b14 mask: ffffff return: 200ea
[    3.380000] ###### now: 6f438 last: 104235c mask: ffffff return: 2d0dc
[    3.390000] ###### now: 8e9a3 last: 1060ba4 mask: ffffff return: 2ddff
[    3.400000] ###### now: ac9d3 last: 107f3ec mask: ffffff return: 2d5e7
[    3.410000] ###### now: cb9b4 last: 109dc34 mask: ffffff return: 2dd80
[    3.420000] ###### now: e9ba2 last: 10bc47c mask: ffffff return: 2d726
[    3.430000] ###### now: 108994 last: 10dacc4 mask: ffffff return: 2dcd0
[    3.440000] ###### now: 1200a0 last: 10f950c mask: ffffff return: 26b94
[    3.450000] ###### now: 14c55f last: 1117d54 mask: ffffff return: 3480b
[    3.460000] ###### now: 15d43a last: 113659c mask: ffffff return: 26e9e
...
[   18.100000] ###### now: f8c39f last: ceea990 mask: ffffff return: a1a0f
[   18.290000] ###### now: 1d01af last: cf832f8 mask: ffffff return: 24ceb7
[   18.500000] ###### now: 45b0a2 last: d1c7050 mask: ffffff return: 294052
[   19.090000] ###### now: b5ac62 last: d447e38 mask: ffffff return: 712e2a
[   19.110000] ###### now: b9ebc3 last: db506d0 mask: ffffff return: 4e4f3
[   19.120000] ###### now: bb842f last: db8d760 mask: ffffff return: 2accf
[   19.160000] ###### now: c43f2e last: dbabfa8 mask: ffffff return: 97f86

'last' advances beyond 'mask', and after that the result is always bad. The call to
clocksource_delta() is from timekeeping_advance().

Guenter
Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by Thomas Gleixner 1 year, 2 months ago
On Thu, Nov 28 2024 at 07:30, Guenter Roeck wrote:
> On 11/28/24 06:51, Thomas Gleixner wrote:
> [   19.090000] ###### now: b5ac62 last: d447e38 mask: ffffff return: 712e2a
> [   19.110000] ###### now: b9ebc3 last: db506d0 mask: ffffff return: 4e4f3
> [   19.120000] ###### now: bb842f last: db8d760 mask: ffffff return: 2accf
> [   19.160000] ###### now: c43f2e last: dbabfa8 mask: ffffff return: 97f86
>
> 'last' advances beyond 'mask', and after that the result is always bad. The call to
> clocksource_delta() is from timekeeping_advance().

This does not make any sense. The bits above the mask in cycle_last are
irrelevant:

        delta = (now - last) & mask;

> [    3.350000] ###### now:  6c4f last:  fe6a84 mask: ffffff return: 201cb    <---
> [    3.360000] ###### now: 40427 last: 10052cc mask: ffffff return: 3b15b    <---

       0x40427 - 0x10052cc = 0xffffffffff03b15b
       0xffffffffff03b15b & 0xffffff = 0x3b15b

That's not any different than before. The only difference is that the
return value is checked:

       return delta & ~(mask >> 1) ? 0 : delta;

But clearly none of the resulting deltas (after masking) has bit 23
set. So the function can't return 0, right?

Coming back to my earlier assumption vs. the max idle time. Here is a
long idle sleep:

> [   18.500000] ###### now: 45b0a2 last: d1c7050 mask: ffffff return: 294052
> [   19.090000] ###### now: b5ac62 last: d447e38 mask: ffffff return: 712e2a

The cycle interval is 125000 clock cycles per tick. That's a HZ=100
kernel, so the nominal clock frequency is 12.5 MHz.

  0x712e2a/12.5e6 = 0.593391s

which is close to the 597268854ns max_idle_ns value.

That's about 0.0776978s away from the point where the delta becomes >
mask/2. So there is a valid concern vs. these long idle sleeps on
machines with a small counter width.

But none of this explains the problems you are observing.

Can you instrument clocksource_delta() to print the values only when the
negative motion detection triggers?

         if (delta & ~(mask >> 1))
         	pr_info(....);

Thanks,

        tglx
Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by Guenter Roeck 1 year, 2 months ago
On 11/29/24 04:16, Thomas Gleixner wrote:
> On Thu, Nov 28 2024 at 07:30, Guenter Roeck wrote:
>> On 11/28/24 06:51, Thomas Gleixner wrote:
>> [   19.090000] ###### now: b5ac62 last: d447e38 mask: ffffff return: 712e2a
>> [   19.110000] ###### now: b9ebc3 last: db506d0 mask: ffffff return: 4e4f3
>> [   19.120000] ###### now: bb842f last: db8d760 mask: ffffff return: 2accf
>> [   19.160000] ###### now: c43f2e last: dbabfa8 mask: ffffff return: 97f86
>>
>> 'last' advances beyond 'mask', and after that the result is always bad. The call to
>> clocksource_delta() is from timekeeping_advance().
> 
> This does not make any sense. The bits above the mask in cycle_last are
> irrelevant:
> 
>          delta = (now - last) & mask;
> 
>> [    3.350000] ###### now:  6c4f last:  fe6a84 mask: ffffff return: 201cb    <---
>> [    3.360000] ###### now: 40427 last: 10052cc mask: ffffff return: 3b15b    <---
> 
>         0x40427 - 0x10052cc = 0xffffffffff03b15b
>         0xffffffffff03b15b & 0xffffff = 0x3b15b
> 
> That's not any different than before. The only difference is that the
> return value is checked:
> 
>         return delta & ~(mask >> 1) ? 0 : delta;
> 
> But clearly none of the resulting deltas (after masking) has bit 23
> set. So the function can't return 0, right?
> 
> Coming back to my earlier assumption vs. the max idle time. Here is a
> long idle sleep:
> 
>> [   18.500000] ###### now: 45b0a2 last: d1c7050 mask: ffffff return: 294052
>> [   19.090000] ###### now: b5ac62 last: d447e38 mask: ffffff return: 712e2a
> 
> The cycle interval is 125000 clock cycles per tick. That's a HZ=100
> kernel, so the nominal clock frequency is 12.5 MHz.
> 
>    0x712e2a/12.5e6 = 0.593391s
> 
> which is close to the 597268854ns max_idle_ns value.
> 
> That's about 0.0776978s away from the point where the delta becomes >
> mask/2. So there is a valid concern vs. these long idle sleeps on
> machines with a small counter width.
> 
> But none of this explains the problems you are observing.
> 
> Can you instrument clocksource_delta() to print the values only when the
> negative motion detection triggers?
> 

With added WARN_ONCE() after the problem is seen for the first time:

[   13.860000] ##### now=0xd60127 last=0x85170f4 mask=0xffffff ret=0x849033
[   13.860000] ------------[ cut here ]------------
[   13.860000] WARNING: CPU: 0 PID: 0 at kernel/time/timekeeping_internal.h:44 timekeeping_advance+0x844/0x9d0
[   13.860000] clocksource_delta() time going backward: now=0xd60127 last=0x85170f4 mask=0xffffff ret=0x849033
[   13.860000] Modules linked in:
[   13.860000] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.12.0-10689-g7af08b57bcb9-dirty #1
[   13.860000] Hardware name: Generic DT based system
[   13.860000] Call trace:
[   13.860000]  unwind_backtrace from show_stack+0x10/0x14
[   13.860000]  show_stack from dump_stack_lvl+0x64/0x88
[   13.860000]  dump_stack_lvl from __warn+0x7c/0x1ac
[   13.860000]  __warn from warn_slowpath_fmt+0x130/0x1a4
[   13.860000]  warn_slowpath_fmt from timekeeping_advance+0x844/0x9d0
[   13.860000]  timekeeping_advance from update_wall_time+0xc/0x1c
[   13.860000]  update_wall_time from tick_irq_enter+0x6c/0xb0
[   13.860000]  tick_irq_enter from generic_handle_arch_irq+0xc/0x44
[   13.860000]  generic_handle_arch_irq from __irq_svc+0x88/0xb0
[   13.860000] Exception stack(0xc2201f28 to 0xc2201f70)
[   13.860000] 1f20:                   00000001 2d639000 c220da80 00000001 c220da80 c2208f14
[   13.860000] 1f40: c24af000 c2208f54 00000000 00000000 00000097 00000000 c2208f14 c2201f78
[   13.860000] 1f60: c1568504 c1568488 20000013 ffffffff
[   13.860000]  __irq_svc from default_idle_call+0x1c/0x1bc
[   13.860000]  default_idle_call from do_idle+0x200/0x288
[   13.860000]  do_idle from cpu_startup_entry+0x28/0x2c
[   13.860000]  cpu_startup_entry from kernel_init+0x0/0x12c
[   13.860000]  kernel_init from start_kernel+0x6a8/0x6c4
[   13.860000] ---[ end trace 0000000000000000 ]---
[   13.870000] ##### now=0xd747fa last=0x85170f4 mask=0xffffff ret=0x85d706
[   13.880000] ##### now=0xda5bf3 last=0x85170f4 mask=0xffffff ret=0x88eaff
[   13.890000] ##### now=0xdb1868 last=0x85170f4 mask=0xffffff ret=0x89a774
[   13.940000] ##### now=0xe541ba last=0x85170f4 mask=0xffffff ret=0x93d0c6
[   13.950000] ##### now=0xe63e66 last=0x85170f4 mask=0xffffff ret=0x94cd72
[   13.950000] platform iio-hwmon: deferred probe pending: iio_hwmon: Failed to get channels
[   13.960000] ##### now=0xe863a8 last=0x85170f4 mask=0xffffff ret=0x96f2b4
[   13.970000] ##### now=0xeba3a3 last=0x85170f4 mask=0xffffff ret=0x9a32af
[   13.980000] ##### now=0xec1406 last=0x85170f4 mask=0xffffff ret=0x9aa312
[   13.990000] ##### now=0xef85a4 last=0x85170f4 mask=0xffffff ret=0x9e14b0
[   14.000000] ##### now=0xf0f07b last=0x85170f4 mask=0xffffff ret=0x9f7f87
[   14.010000] ##### now=0xf1bff6 last=0x85170f4 mask=0xffffff ret=0xa04f02
[   14.150000] ##### now=0xc609c last=0x85170f4 mask=0xffffff ret=0xbaefa8
[   14.360000] ##### now=0x357f2d last=0x85170f4 mask=0xffffff ret=0xe40e39
[   14.410000] ##### now=0x3e0a3a last=0x85170f4 mask=0xffffff ret=0xec9946
[   14.420000] ##### now=0x3fed36 last=0x85170f4 mask=0xffffff ret=0xee7c42
[   14.430000] ##### now=0x42b58a last=0x85170f4 mask=0xffffff ret=0xf14496
[   14.440000] ##### now=0x44ba17 last=0x85170f4 mask=0xffffff ret=0xf34923
[   14.450000] ##### now=0x46635e last=0x85170f4 mask=0xffffff ret=0xf4f26a
[   14.480000] ##### now=0x4bd58e last=0x85170f4 mask=0xffffff ret=0xfa649a
[   14.490000] ##### now=0x4d3b0c last=0x85170f4 mask=0xffffff ret=0xfbca18
[   14.510000] ##### now=0x510ef4 last=0x85170f4 mask=0xffffff ret=0xff9e00

With more context (printing for each call to clocksource_delta() as well as the
actual return value):

[   14.020000] ##### now=0x165aa2 last=0x913b49f mask=0xffffff ret=0x2a603 -> 0x2a603
[   14.030000] ##### now=0x18aa49 last=0x9159ce7 mask=0xffffff ret=0x30d62 -> 0x30d62
[   14.040000] ##### now=0x1a2971 last=0x917852f mask=0xffffff ret=0x2a442 -> 0x2a442
[   14.050000] ##### now=0x1c78d5 last=0x9196d77 mask=0xffffff ret=0x30b5e -> 0x30b5e
[   14.060000] ##### now=0x1dfe88 last=0x91b55bf mask=0xffffff ret=0x2a8c9 -> 0x2a8c9
[   14.090000] ##### now=0x236d57 last=0x91d3e07 mask=0xffffff ret=0x62f50 -> 0x62f50
[   14.180000] ##### now=0x34aaa3 last=0x922f6df mask=0xffffff ret=0x11b3c4 -> 0x11b3c4
[   14.890000] ##### now=0xbd0aad last=0x9342167 mask=0xffffff ret=0x88e946 -> 0x0
[   14.890000] ------------[ cut here ]------------
[   14.890000] WARNING: CPU: 0 PID: 0 at kernel/time/timekeeping_internal.h:44 clocksource_delta+0x108/0x118
[   14.890000] clocksource_delta() time going backward: now=0xbd0aad last=0x9342167 mask=0xffffff ret=0x88e946
[   14.890000] Modules linked in:
[   14.890000] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.12.0-10689-g7af08b57bcb9-dirty #1
[   14.890000] Hardware name: Generic DT based system
[   14.890000] Call trace:
[   14.890000]  unwind_backtrace from show_stack+0x10/0x14
[   14.890000]  show_stack from dump_stack_lvl+0x64/0x88
[   14.890000]  dump_stack_lvl from __warn+0x7c/0x1ac
[   14.890000]  __warn from warn_slowpath_fmt+0x130/0x1a4
[   14.890000]  warn_slowpath_fmt from clocksource_delta+0x108/0x118
[   14.890000]  clocksource_delta from timekeeping_advance+0x6c/0x8e4
[   14.890000]  timekeeping_advance from update_wall_time+0xc/0x1c
[   14.890000]  update_wall_time from tick_irq_enter+0x6c/0xb0
[   14.890000]  tick_irq_enter from generic_handle_arch_irq+0xc/0x44
[   14.890000]  generic_handle_arch_irq from __irq_svc+0x88/0xb0
[   14.890000] Exception stack(0xc2201f28 to 0xc2201f70)
[   14.890000] 1f20:                   00000001 2d639000 c220da80 00000001 c220da80 c2208f14
[   14.890000] 1f40: c24af000 c2208f54 00000000 00000000 00000091 00000000 c2208f14 c2201f78
[   14.890000] 1f60: c15684c4 c1568448 20000013 ffffffff
[   14.890000]  __irq_svc from default_idle_call+0x1c/0x1bc
[   14.890000]  default_idle_call from do_idle+0x200/0x288
[   14.890000]  do_idle from cpu_startup_entry+0x28/0x2c
[   14.890000]  cpu_startup_entry from kernel_init+0x0/0x12c
[   14.890000]  kernel_init from start_kernel+0x6a8/0x6c4
[   14.890000] ---[ end trace 0000000000000000 ]---
[   14.900000] ##### now=0xbdfc5c last=0x9342167 mask=0xffffff ret=0x89daf5 -> 0x0
[   14.950000] ##### now=0xc8827a last=0x9342167 mask=0xffffff ret=0x946113 -> 0x0
[   15.060000] ##### now=0xdc3907 last=0x9342167 mask=0xffffff ret=0xa817a0 -> 0x0
[   15.110000] ##### now=0xe6b484 last=0x9342167 mask=0xffffff ret=0xb2931d -> 0x0
Sent SIGKILL to all processes
Requesting system reboot
[   15.120000] ##### now=0xe7fd9a last=0x9342167 mask=0xffffff ret=0xb3dc33 -> 0x0
[   15.170000] ##### now=0xf277d5 last=0x9342167 mask=0xffffff ret=0xbe566e -> 0x0
[   15.260000] ##### now=0x26c10 last=0x9342167 mask=0xffffff ret=0xce4aa9 -> 0x0
[   15.310000] ##### now=0xce799 last=0x9342167 mask=0xffffff ret=0xd8c632 -> 0x0

So, yes, it looks like the problem is seen after a large delta, and persists from there.

Guenter
Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by Thomas Gleixner 1 year, 2 months ago
On Fri, Nov 29 2024 at 08:09, Guenter Roeck wrote:
> On 11/29/24 04:16, Thomas Gleixner wrote:
> [   13.860000] WARNING: CPU: 0 PID: 0 at kernel/time/timekeeping_internal.h:44 timekeeping_advance+0x844/0x9d0
> [   13.860000] clocksource_delta() time going backward: now=0xd60127 last=0x85170f4 mask=0xffffff ret=0x849033

So this is a idle sleep which took longer than max_idle_ns. The rest is
the consequence of this as timekeeping does not advance and the timers
are rearmed on the stale time.

Can you try the patch below?

Thanks,

        tglx
---
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -49,6 +49,7 @@ struct module;
  * @archdata:		Optional arch-specific data
  * @max_cycles:		Maximum safe cycle value which won't overflow on
  *			multiplication
+ * @max_raw_delta:	Maximum safe delta value for negative motion detection
  * @name:		Pointer to clocksource name
  * @list:		List head for registration (internal)
  * @freq_khz:		Clocksource frequency in khz.
@@ -109,6 +110,7 @@ struct clocksource {
 	struct arch_clocksource_data archdata;
 #endif
 	u64			max_cycles;
+	u64			max_raw_delta;
 	const char		*name;
 	struct list_head	list;
 	u32			freq_khz;
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -24,7 +24,7 @@ static void clocksource_enqueue(struct c
 
 static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
 {
-	u64 delta = clocksource_delta(end, start, cs->mask);
+	u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta);
 
 	if (likely(delta < cs->max_cycles))
 		return clocksource_cyc2ns(delta, cs->mult, cs->shift);
@@ -993,6 +993,15 @@ static inline void clocksource_update_ma
 	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
 						cs->maxadj, cs->mask,
 						&cs->max_cycles);
+
+	/*
+	 * Threshold for detecting negative motion in clocksource_delta().
+	 *
+	 * Allow for 0.875 of the mask value so that overly long idle
+	 * sleeps which go slightly over mask/2 do not trigger the negative
+	 * motion detection.
+	 */
+	cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3);
 }
 
 static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -755,7 +755,8 @@ static void timekeeping_forward_now(stru
 	u64 cycle_now, delta;
 
 	cycle_now = tk_clock_read(&tk->tkr_mono);
-	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+				  tk->tkr_mono.clock->max_raw_delta);
 	tk->tkr_mono.cycle_last = cycle_now;
 	tk->tkr_raw.cycle_last  = cycle_now;
 
@@ -2230,7 +2231,8 @@ static bool timekeeping_advance(enum tim
 		return false;
 
 	offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
-				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+				   tk->tkr_mono.clock->max_raw_delta);
 
 	/* Check if there's really nothing to do */
 	if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -30,15 +30,15 @@ static inline void timekeeping_inc_mg_fl
 
 #endif
 
-static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
+static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta)
 {
 	u64 ret = (now - last) & mask;
 
 	/*
-	 * Prevent time going backwards by checking the MSB of mask in
-	 * the result. If set, return 0.
+	 * Prevent time going backwards by checking against the result
+	 * against @max_delta. If greater, return 0.
 	 */
-	return ret & ~(mask >> 1) ? 0 : ret;
+	return ret > max_delta ? 0 : ret;
 }
 
 /* Semi public for serialization of non timekeeper VDSO updates. */
Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by Guenter Roeck 1 year, 2 months ago
On 11/30/24 03:09, Thomas Gleixner wrote:
> On Fri, Nov 29 2024 at 08:09, Guenter Roeck wrote:
>> On 11/29/24 04:16, Thomas Gleixner wrote:
>> [   13.860000] WARNING: CPU: 0 PID: 0 at kernel/time/timekeeping_internal.h:44 timekeeping_advance+0x844/0x9d0
>> [   13.860000] clocksource_delta() time going backward: now=0xd60127 last=0x85170f4 mask=0xffffff ret=0x849033
> 
> So this is a idle sleep which took longer than max_idle_ns. The rest is
> the consequence of this as timekeeping does not advance and the timers
> are rearmed on the stale time.
> 
> Can you try the patch below?
> 

That one works. I booted kudo-bmc 60 times without failure. I also tested the
patch twice with all my qemu emulations, for a total of 1000+ boots with
different machines/configurations, and did not see a single failure.

Tested-by: Guenter Roeck <linux@roeck-us.net>

Thanks,
Guenter

> Thanks,
> 
>          tglx
> ---
> --- a/include/linux/clocksource.h
> +++ b/include/linux/clocksource.h
> @@ -49,6 +49,7 @@ struct module;
>    * @archdata:		Optional arch-specific data
>    * @max_cycles:		Maximum safe cycle value which won't overflow on
>    *			multiplication
> + * @max_raw_delta:	Maximum safe delta value for negative motion detection
>    * @name:		Pointer to clocksource name
>    * @list:		List head for registration (internal)
>    * @freq_khz:		Clocksource frequency in khz.
> @@ -109,6 +110,7 @@ struct clocksource {
>   	struct arch_clocksource_data archdata;
>   #endif
>   	u64			max_cycles;
> +	u64			max_raw_delta;
>   	const char		*name;
>   	struct list_head	list;
>   	u32			freq_khz;
> --- a/kernel/time/clocksource.c
> +++ b/kernel/time/clocksource.c
> @@ -24,7 +24,7 @@ static void clocksource_enqueue(struct c
>   
>   static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
>   {
> -	u64 delta = clocksource_delta(end, start, cs->mask);
> +	u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta);
>   
>   	if (likely(delta < cs->max_cycles))
>   		return clocksource_cyc2ns(delta, cs->mult, cs->shift);
> @@ -993,6 +993,15 @@ static inline void clocksource_update_ma
>   	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
>   						cs->maxadj, cs->mask,
>   						&cs->max_cycles);
> +
> +	/*
> +	 * Threshold for detecting negative motion in clocksource_delta().
> +	 *
> +	 * Allow for 0.875 of the mask value so that overly long idle
> +	 * sleeps which go slightly over mask/2 do not trigger the negative
> +	 * motion detection.
> +	 */
> +	cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3);
>   }
>   
>   static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -755,7 +755,8 @@ static void timekeeping_forward_now(stru
>   	u64 cycle_now, delta;
>   
>   	cycle_now = tk_clock_read(&tk->tkr_mono);
> -	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
> +	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
> +				  tk->tkr_mono.clock->max_raw_delta);
>   	tk->tkr_mono.cycle_last = cycle_now;
>   	tk->tkr_raw.cycle_last  = cycle_now;
>   
> @@ -2230,7 +2231,8 @@ static bool timekeeping_advance(enum tim
>   		return false;
>   
>   	offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
> -				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
> +				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
> +				   tk->tkr_mono.clock->max_raw_delta);
>   
>   	/* Check if there's really nothing to do */
>   	if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
> --- a/kernel/time/timekeeping_internal.h
> +++ b/kernel/time/timekeeping_internal.h
> @@ -30,15 +30,15 @@ static inline void timekeeping_inc_mg_fl
>   
>   #endif
>   
> -static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
> +static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta)
>   {
>   	u64 ret = (now - last) & mask;
>   
>   	/*
> -	 * Prevent time going backwards by checking the MSB of mask in
> -	 * the result. If set, return 0.
> +	 * Prevent time going backwards by checking against the result
> +	 * against @max_delta. If greater, return 0.
>   	 */
> -	return ret & ~(mask >> 1) ? 0 : ret;
> +	return ret > max_delta ? 0 : ret;
>   }
>   
>   /* Semi public for serialization of non timekeeper VDSO updates. */
> 
>
[patch] clocksource: Make negative motion detection more robust
Posted by Thomas Gleixner 1 year, 2 months ago
Guenter reported boot stalls on a emulated ARM 32-bit platform, which has a
24-bit wide clocksource.

It turns out that the calculated maximal idle time, which limits idle
sleeps to prevent clocksource wrap arounds, is close to the point where the
negative motion detection triggers.

  max_idle_ns:                    597268854 ns
  negative motion tripping point: 671088640 ns

If the idle wakeup is delayed beyond that point, the clocksource
advances far enough to trigger the negative motion detection. This
prevents the clock to advance and in the worst case the system stalls
completely if the consecutive sleeps based on the stale clock are
delayed as well.

Cure this by calculating a more robust cut-off value for negative motion,
which covers 87.5% of the actual clocksource counter width. Compare the
delta against this value to catch negative motion. This is specifically for
clock sources with a small counter width as their wrap around time is close
to the half counter width. For clock sources with wide counters this is not
a problem because the maximum idle time is far from the half counter width
due to the math overflow protection constraints.

For the case at hand this results in a tripping point of 1174405120ns.

Note, that this cannot prevent issues when the delay exceeds the 87.5%
margin, but that's not different from the previous unchecked version which
allowed arbitrary time jumps.

Systems with small counter width are prone to invalid results, but this
problem is unlikely to be seen on real hardware. If such a system
completely stalls for more than half a second, then there are other more
urgent problems than the counter wrapping around.

Fixes: c163e40af9b2 ("timekeeping: Always check for negative motion")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Closes: https://lore.kernel.org/all/387b120b-d68a-45e8-b6ab-768cd95d11c2@roeck-us.net
---
 include/linux/clocksource.h        |    2 ++
 kernel/time/clocksource.c          |   11 ++++++++++-
 kernel/time/timekeeping.c          |    6 ++++--
 kernel/time/timekeeping_internal.h |    8 ++++----
 4 files changed, 20 insertions(+), 7 deletions(-)

--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -49,6 +49,7 @@ struct module;
  * @archdata:		Optional arch-specific data
  * @max_cycles:		Maximum safe cycle value which won't overflow on
  *			multiplication
+ * @max_raw_delta:	Maximum safe delta value for negative motion detection
  * @name:		Pointer to clocksource name
  * @list:		List head for registration (internal)
  * @freq_khz:		Clocksource frequency in khz.
@@ -109,6 +110,7 @@ struct clocksource {
 	struct arch_clocksource_data archdata;
 #endif
 	u64			max_cycles;
+	u64			max_raw_delta;
 	const char		*name;
 	struct list_head	list;
 	u32			freq_khz;
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -24,7 +24,7 @@ static void clocksource_enqueue(struct c
 
 static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
 {
-	u64 delta = clocksource_delta(end, start, cs->mask);
+	u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta);
 
 	if (likely(delta < cs->max_cycles))
 		return clocksource_cyc2ns(delta, cs->mult, cs->shift);
@@ -993,6 +993,15 @@ static inline void clocksource_update_ma
 	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
 						cs->maxadj, cs->mask,
 						&cs->max_cycles);
+
+	/*
+	 * Threshold for detecting negative motion in clocksource_delta().
+	 *
+	 * Allow for 0.875 of the counter width so that overly long idle
+	 * sleeps, which go slightly over mask/2, do not trigger the
+	 * negative motion detection.
+	 */
+	cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3);
 }
 
 static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -755,7 +755,8 @@ static void timekeeping_forward_now(stru
 	u64 cycle_now, delta;
 
 	cycle_now = tk_clock_read(&tk->tkr_mono);
-	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+				  tk->tkr_mono.clock->max_raw_delta);
 	tk->tkr_mono.cycle_last = cycle_now;
 	tk->tkr_raw.cycle_last  = cycle_now;
 
@@ -2230,7 +2231,8 @@ static bool timekeeping_advance(enum tim
 		return false;
 
 	offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
-				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+				   tk->tkr_mono.clock->max_raw_delta);
 
 	/* Check if there's really nothing to do */
 	if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -30,15 +30,15 @@ static inline void timekeeping_inc_mg_fl
 
 #endif
 
-static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
+static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta)
 {
 	u64 ret = (now - last) & mask;
 
 	/*
-	 * Prevent time going backwards by checking the MSB of mask in
-	 * the result. If set, return 0.
+	 * Prevent time going backwards by checking the result against
+	 * @max_delta. If greater, return 0.
 	 */
-	return ret & ~(mask >> 1) ? 0 : ret;
+	return ret > max_delta ? 0 : ret;
 }
 
 /* Semi public for serialization of non timekeeper VDSO updates. */
[tip: timers/urgent] clocksource: Make negative motion detection more robust
Posted by tip-bot2 for Thomas Gleixner 1 year, 2 months ago
The following commit has been merged into the timers/urgent branch of tip:

Commit-ID:     76031d9536a076bf023bedbdb1b4317fc801dd67
Gitweb:        https://git.kernel.org/tip/76031d9536a076bf023bedbdb1b4317fc801dd67
Author:        Thomas Gleixner <tglx@linutronix.de>
AuthorDate:    Tue, 03 Dec 2024 11:16:30 +01:00
Committer:     Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Thu, 05 Dec 2024 16:03:24 +01:00

clocksource: Make negative motion detection more robust

Guenter reported boot stalls on a emulated ARM 32-bit platform, which has a
24-bit wide clocksource.

It turns out that the calculated maximal idle time, which limits idle
sleeps to prevent clocksource wrap arounds, is close to the point where the
negative motion detection triggers.

  max_idle_ns:                    597268854 ns
  negative motion tripping point: 671088640 ns

If the idle wakeup is delayed beyond that point, the clocksource
advances far enough to trigger the negative motion detection. This
prevents the clock to advance and in the worst case the system stalls
completely if the consecutive sleeps based on the stale clock are
delayed as well.

Cure this by calculating a more robust cut-off value for negative motion,
which covers 87.5% of the actual clocksource counter width. Compare the
delta against this value to catch negative motion. This is specifically for
clock sources with a small counter width as their wrap around time is close
to the half counter width. For clock sources with wide counters this is not
a problem because the maximum idle time is far from the half counter width
due to the math overflow protection constraints.

For the case at hand this results in a tripping point of 1174405120ns.

Note, that this cannot prevent issues when the delay exceeds the 87.5%
margin, but that's not different from the previous unchecked version which
allowed arbitrary time jumps.

Systems with small counter width are prone to invalid results, but this
problem is unlikely to be seen on real hardware. If such a system
completely stalls for more than half a second, then there are other more
urgent problems than the counter wrapping around.

Fixes: c163e40af9b2 ("timekeeping: Always check for negative motion")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/all/8734j5ul4x.ffs@tglx
Closes: https://lore.kernel.org/all/387b120b-d68a-45e8-b6ab-768cd95d11c2@roeck-us.net
---
 include/linux/clocksource.h        |  2 ++
 kernel/time/clocksource.c          | 11 ++++++++++-
 kernel/time/timekeeping.c          |  6 ++++--
 kernel/time/timekeeping_internal.h |  8 ++++----
 4 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index ef1b16d..65b7c41 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -49,6 +49,7 @@ struct module;
  * @archdata:		Optional arch-specific data
  * @max_cycles:		Maximum safe cycle value which won't overflow on
  *			multiplication
+ * @max_raw_delta:	Maximum safe delta value for negative motion detection
  * @name:		Pointer to clocksource name
  * @list:		List head for registration (internal)
  * @freq_khz:		Clocksource frequency in khz.
@@ -109,6 +110,7 @@ struct clocksource {
 	struct arch_clocksource_data archdata;
 #endif
 	u64			max_cycles;
+	u64			max_raw_delta;
 	const char		*name;
 	struct list_head	list;
 	u32			freq_khz;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index aab6472..7304d7c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -24,7 +24,7 @@ static void clocksource_enqueue(struct clocksource *cs);
 
 static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
 {
-	u64 delta = clocksource_delta(end, start, cs->mask);
+	u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta);
 
 	if (likely(delta < cs->max_cycles))
 		return clocksource_cyc2ns(delta, cs->mult, cs->shift);
@@ -993,6 +993,15 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs)
 	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
 						cs->maxadj, cs->mask,
 						&cs->max_cycles);
+
+	/*
+	 * Threshold for detecting negative motion in clocksource_delta().
+	 *
+	 * Allow for 0.875 of the counter width so that overly long idle
+	 * sleeps, which go slightly over mask/2, do not trigger the
+	 * negative motion detection.
+	 */
+	cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3);
 }
 
 static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0ca85ff..3d12882 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -755,7 +755,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)
 	u64 cycle_now, delta;
 
 	cycle_now = tk_clock_read(&tk->tkr_mono);
-	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+				  tk->tkr_mono.clock->max_raw_delta);
 	tk->tkr_mono.cycle_last = cycle_now;
 	tk->tkr_raw.cycle_last  = cycle_now;
 
@@ -2230,7 +2231,8 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
 		return false;
 
 	offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
-				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+				   tk->tkr_mono.clock->max_raw_delta);
 
 	/* Check if there's really nothing to do */
 	if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 63e600e..8c90791 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -30,15 +30,15 @@ static inline void timekeeping_inc_mg_floor_swaps(void)
 
 #endif
 
-static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
+static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta)
 {
 	u64 ret = (now - last) & mask;
 
 	/*
-	 * Prevent time going backwards by checking the MSB of mask in
-	 * the result. If set, return 0.
+	 * Prevent time going backwards by checking the result against
+	 * @max_delta. If greater, return 0.
 	 */
-	return ret & ~(mask >> 1) ? 0 : ret;
+	return ret > max_delta ? 0 : ret;
 }
 
 /* Semi public for serialization of non timekeeper VDSO updates. */
Re: [patch 2/2] timekeeping: Always check for negative motion
Posted by John Stultz 1 year, 3 months ago
On Thu, Oct 31, 2024 at 5:04 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> clocksource_delta() has two variants. One with a check for negative motion,
> which is only selected by x86. This is a historic leftover as this function
> was previously used in the time getter hot paths.
>
> Since 135225a363ae timekeeping_cycles_to_ns() has unconditional protection
> against this as a by-product of the protection against 64bit math overflow.
>
> clocksource_delta() is only used in the clocksource watchdog and in
> timekeeping_advance(). The extra conditional there is not hurting anyone.
>
> Remove the config option and unconditionally prevent negative motion of the
> readout.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Acked-by: John Stultz <jstultz@google.com>
[tip: timers/core] timekeeping: Always check for negative motion
Posted by tip-bot2 for Thomas Gleixner 1 year, 3 months ago
The following commit has been merged into the timers/core branch of tip:

Commit-ID:     c163e40af9b2331b2c629fd4ec8b703ed4d4ae39
Gitweb:        https://git.kernel.org/tip/c163e40af9b2331b2c629fd4ec8b703ed4d4ae39
Author:        Thomas Gleixner <tglx@linutronix.de>
AuthorDate:    Thu, 31 Oct 2024 13:04:08 +01:00
Committer:     Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Sat, 02 Nov 2024 10:14:31 +01:00

timekeeping: Always check for negative motion

clocksource_delta() has two variants. One with a check for negative motion,
which is only selected by x86. This is a historic leftover as this function
was previously used in the time getter hot paths.

Since 135225a363ae timekeeping_cycles_to_ns() has unconditional protection
against this as a by-product of the protection against 64bit math overflow.

clocksource_delta() is only used in the clocksource watchdog and in
timekeeping_advance(). The extra conditional there is not hurting anyone.

Remove the config option and unconditionally prevent negative motion of the
readout.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20241031120328.599430157@linutronix.de

---
 arch/x86/Kconfig                   | 1 -
 kernel/time/Kconfig                | 5 -----
 kernel/time/timekeeping_internal.h | 7 -------
 3 files changed, 13 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2852fcd..53a5eda 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -145,7 +145,6 @@ config X86
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select BUILDTIME_TABLE_SORT
 	select CLKEVT_I8253
-	select CLOCKSOURCE_VALIDATE_LAST_CYCLE
 	select CLOCKSOURCE_WATCHDOG
 	# Word-size accesses may read uninitialized data past the trailing \0
 	# in strings and cause false KMSAN reports.
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8ebb6d5..b0b97a6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -17,11 +17,6 @@ config ARCH_CLOCKSOURCE_DATA
 config ARCH_CLOCKSOURCE_INIT
 	bool
 
-# Clocksources require validation of the clocksource against the last
-# cycle update - x86/TSC misfeature
-config CLOCKSOURCE_VALIDATE_LAST_CYCLE
-	bool
-
 # Timekeeping vsyscall support
 config GENERIC_TIME_VSYSCALL
 	bool
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index b3dca83..63e600e 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -30,7 +30,6 @@ static inline void timekeeping_inc_mg_floor_swaps(void)
 
 #endif
 
-#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
 static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
 {
 	u64 ret = (now - last) & mask;
@@ -41,12 +40,6 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
 	 */
 	return ret & ~(mask >> 1) ? 0 : ret;
 }
-#else
-static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
-{
-	return (now - last) & mask;
-}
-#endif
 
 /* Semi public for serialization of non timekeeper VDSO updates. */
 unsigned long timekeeper_lock_irqsave(void);