arch/x86/kernel/tsc.c | 2 -- kernel/sched/clock.c | 3 +++ kernel/sched/cputime.c | 9 +++++---- kernel/sched/sched.h | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-)
Read-mostly sched_clock_irqtime may share the same cacheline with the
frequently updated nohz struct. Make it a static_key to avoid the
false sharing issue.
The only user of disable_sched_clock_irqtime()
is tsc_.*mark_unstable(), which may be invoked in atomic context
and requires a workqueue to disable a static_key. But both of them
call clear_sched_clock_stable() just before doing
disable_sched_clock_irqtime(). We can reuse
"sched_clock_work" to also disable sched_clock_irqtime().
One additional case that needs handling is when the TSC is marked unstable
before the late_initcall() phase: sched_clock_work will not be invoked
and sched_clock_irqtime will stay enabled although the clock is unstable:
tsc_init()
enable_sched_clock_irqtime() # irqtime accounting is enabled here
...
if (unsynchronized_tsc()) # true
mark_tsc_unstable()
clear_sched_clock_stable()
__sched_clock_stable_early = 0;
...
if (static_key_count(&sched_clock_running.key) == 2)
# Only happens at sched_clock_init_late()
__clear_sched_clock_stable(); # Never executed
...
# late_initcall() phase
sched_clock_init_late()
if (__sched_clock_stable_early) # Already false
__set_sched_clock_stable(); # sched_clock is never marked stable
# TSC unstable, but sched_clock_work won't run to disable irqtime
So we need to disable_sched_clock_irqtime() in sched_clock_init_late()
if clock is unstable.
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Reported-by: Benjamin Lei <benjamin.lei@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Tianyou Li <tianyou.li@intel.com>
Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
---
v7 -> v6:
- move irqtime_enabled() check to disable_sched_clock_irqtime()
v6 -> v5:
- Only disable_sched_clock_irqtime() if irqtime_enabled() in
sched_clock_init_late() to avoid unnecessary overhead.
V5 -> v4:
- Changelog update to reflect static_key changes
V4 -> V3:
- Avoid creating a new workqueue to disable static_key
- Specify kernel version for c2c result in changelog
V2 -> V3:
- Use static_key instead of a __read_mostly var.
V1 -> V2:
- Use __read_mostly instead of __cacheline_aligned to avoid wasting
spaces.
History:
v6: https://lore.kernel.org/all/20260127044159.2254247-1-wangyang.guo@intel.com/
v5: https://lore.kernel.org/all/20260127031602.1907377-1-wangyang.guo@intel.com/
v4: https://lore.kernel.org/all/20260126021401.1490163-1-wangyang.guo@intel.com/
v3: https://lore.kernel.org/all/20260116023945.1849329-1-wangyang.guo@intel.com/
v2: https://lore.kernel.org/all/20260113074807.3404180-1-wangyang.guo@intel.com/
v1: https://lore.kernel.org/all/20260113022958.3379650-1-wangyang.guo@intel.com/
prev discussions: https://lore.kernel.org/all/20251211055612.4071266-1-wangyang.guo@intel.com/T/#u
---
arch/x86/kernel/tsc.c | 2 --
kernel/sched/clock.c | 3 +++
kernel/sched/cputime.c | 9 +++++----
kernel/sched/sched.h | 4 ++--
4 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 87e749106dda..9a62e18d1bff 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1142,7 +1142,6 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
tsc_unstable = 1;
if (using_native_sched_clock())
clear_sched_clock_stable();
- disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
@@ -1212,7 +1211,6 @@ void mark_tsc_unstable(char *reason)
tsc_unstable = 1;
if (using_native_sched_clock())
clear_sched_clock_stable();
- disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to %s\n", reason);
clocksource_mark_unstable(&clocksource_tsc_early);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index f5e6dd6a6b3a..2ae4fbf13431 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -173,6 +173,7 @@ notrace static void __sched_clock_work(struct work_struct *work)
scd->tick_gtod, __gtod_offset,
scd->tick_raw, __sched_clock_offset);
+ disable_sched_clock_irqtime();
static_branch_disable(&__sched_clock_stable);
}
@@ -238,6 +239,8 @@ static int __init sched_clock_init_late(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
+ else
+ disable_sched_clock_irqtime(); /* disable if clock unstable. */
return 0;
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 7097de2c8cda..556a70f344d0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -12,6 +12,8 @@
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
/*
* There are no locks covering percpu hardirq/softirq time.
* They are only modified in vtime_account, on corresponding CPU
@@ -25,16 +27,15 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-int sched_clock_irqtime;
-
void enable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 1;
+ static_branch_enable(&sched_clock_irqtime);
}
void disable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 0;
+ if (irqtime_enabled())
+ static_branch_disable(&sched_clock_irqtime);
}
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index adfb6e3409d7..ec963314287a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3172,11 +3172,11 @@ struct irqtime {
};
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
-extern int sched_clock_irqtime;
+DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
static inline int irqtime_enabled(void)
{
- return sched_clock_irqtime;
+ return static_branch_likely(&sched_clock_irqtime);
}
/*
--
2.47.3
On Tue, 27 Jan 2026 at 08:28, Wangyang Guo <wangyang.guo@intel.com> wrote:
>
> Read-mostly sched_clock_irqtime may share the same cacheline with
> frequently updated nohz struct. Make it as static_key to avoid
> false sharing issue.
>
> The only user of disable_sched_clock_irqtime()
> is tsc_.*mark_unstable() which may be invoked under atomic context
> and require a workqueue to disable static_key. But both of them
> calls clear_sched_clock_stable() just before doing
> disable_sched_clock_irqtime(). We can reuse
> "sched_clock_work" to also disable sched_clock_irqtime().
>
> One additional case need to handle is if the tsc is marked unstable
> before late_initcall() phase, sched_clock_work will not be invoked
> and sched_clock_irqtime will stay enabled although clock is unstable:
> tsc_init()
> enable_sched_clock_irqtime() # irqtime accounting is enabled here
> ...
> if (unsynchronized_tsc()) # true
> mark_tsc_unstable()
> clear_sched_clock_stable()
> __sched_clock_stable_early = 0;
> ...
> if (static_key_count(&sched_clock_running.key) == 2)
> # Only happens at sched_clock_init_late()
> __clear_sched_clock_stable(); # Never executed
> ...
>
> # late_initcall() phase
> sched_clock_init_late()
> if (__sched_clock_stable_early) # Already false
> __set_sched_clock_stable(); # sched_clock is never marked stable
> # TSC unstable, but sched_clock_work won't run to disable irqtime
>
> So we need to disable_sched_clock_irqtime() in sched_clock_init_late()
> if clock is unstable.
>
> Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
> Reported-by: Benjamin Lei <benjamin.lei@intel.com>
> Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
> Reviewed-by: Tianyou Li <tianyou.li@intel.com>
> Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> v7 -> v6:
> - move irqtime_enabled() check to disable_sched_clock_irqtime()
>
> v6 -> v5:
> - Only disable_sched_clock_irqtime() if irqtime_enabled() in
> sched_lock_init_late() to avoid unnessary overhead.
>
> V5 -> v4:
> - Changelog update to reflect static_key changes
>
> V4 -> V3:
> - Avoid creating a new workqueue to disable static_key
> - Specify kernel version for c2c result in changelog
>
> V2 -> V3:
> - Use static_key instead of a __read_mostly var.
>
> V1 -> V2:
> - Use __read_mostly instead of __cacheline_aligned to avoid wasting
> spaces.
>
> History:
> v6: https://lore.kernel.org/all/20260127044159.2254247-1-wangyang.guo@intel.com/
> v5: https://lore.kernel.org/all/20260127031602.1907377-1-wangyang.guo@intel.com/
> v4: https://lore.kernel.org/all/20260126021401.1490163-1-wangyang.guo@intel.com/
> v3: https://lore.kernel.org/all/20260116023945.1849329-1-wangyang.guo@intel.com/
> v2: https://lore.kernel.org/all/20260113074807.3404180-1-wangyang.guo@intel.com/
> v1: https://lore.kernel.org/all/20260113022958.3379650-1-wangyang.guo@intel.com/
> prev discussions: https://lore.kernel.org/all/20251211055612.4071266-1-wangyang.guo@intel.com/T/#u
> ---
> arch/x86/kernel/tsc.c | 2 --
> kernel/sched/clock.c | 3 +++
> kernel/sched/cputime.c | 9 +++++----
> kernel/sched/sched.h | 4 ++--
> 4 files changed, 10 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 87e749106dda..9a62e18d1bff 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -1142,7 +1142,6 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
> tsc_unstable = 1;
> if (using_native_sched_clock())
> clear_sched_clock_stable();
> - disable_sched_clock_irqtime();
> pr_info("Marking TSC unstable due to clocksource watchdog\n");
> }
>
> @@ -1212,7 +1211,6 @@ void mark_tsc_unstable(char *reason)
> tsc_unstable = 1;
> if (using_native_sched_clock())
> clear_sched_clock_stable();
> - disable_sched_clock_irqtime();
> pr_info("Marking TSC unstable due to %s\n", reason);
>
> clocksource_mark_unstable(&clocksource_tsc_early);
> diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
> index f5e6dd6a6b3a..2ae4fbf13431 100644
> --- a/kernel/sched/clock.c
> +++ b/kernel/sched/clock.c
> @@ -173,6 +173,7 @@ notrace static void __sched_clock_work(struct work_struct *work)
> scd->tick_gtod, __gtod_offset,
> scd->tick_raw, __sched_clock_offset);
>
> + disable_sched_clock_irqtime();
> static_branch_disable(&__sched_clock_stable);
> }
>
> @@ -238,6 +239,8 @@ static int __init sched_clock_init_late(void)
>
> if (__sched_clock_stable_early)
> __set_sched_clock_stable();
> + else
> + disable_sched_clock_irqtime(); /* disable if clock unstable. */
>
> return 0;
> }
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index 7097de2c8cda..556a70f344d0 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -12,6 +12,8 @@
>
> #ifdef CONFIG_IRQ_TIME_ACCOUNTING
>
> +DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
> +
> /*
> * There are no locks covering percpu hardirq/softirq time.
> * They are only modified in vtime_account, on corresponding CPU
> @@ -25,16 +27,15 @@
> */
> DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
>
> -int sched_clock_irqtime;
> -
> void enable_sched_clock_irqtime(void)
> {
> - sched_clock_irqtime = 1;
> + static_branch_enable(&sched_clock_irqtime);
> }
>
> void disable_sched_clock_irqtime(void)
> {
> - sched_clock_irqtime = 0;
> + if (irqtime_enabled())
> + static_branch_disable(&sched_clock_irqtime);
> }
>
> static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index adfb6e3409d7..ec963314287a 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3172,11 +3172,11 @@ struct irqtime {
> };
>
> DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
> -extern int sched_clock_irqtime;
> +DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
>
> static inline int irqtime_enabled(void)
> {
> - return sched_clock_irqtime;
> + return static_branch_likely(&sched_clock_irqtime);
> }
>
> /*
> --
> 2.47.3
>
Hi Wangyang, Prateek.
On 1/27/26 12:55 PM, Wangyang Guo wrote:
> Read-mostly sched_clock_irqtime may share the same cacheline with
> frequently updated nohz struct. Make it as static_key to avoid
> false sharing issue.
>
> The only user of disable_sched_clock_irqtime()
> is tsc_.*mark_unstable() which may be invoked under atomic context
> and require a workqueue to disable static_key. But both of them
> calls clear_sched_clock_stable() just before doing
> disable_sched_clock_irqtime(). We can reuse
> "sched_clock_work" to also disable sched_clock_irqtime().
>
> One additional case need to handle is if the tsc is marked unstable
> before late_initcall() phase, sched_clock_work will not be invoked
> and sched_clock_irqtime will stay enabled although clock is unstable:
> tsc_init()
> enable_sched_clock_irqtime() # irqtime accounting is enabled here
> ...
> if (unsynchronized_tsc()) # true
> mark_tsc_unstable()
> clear_sched_clock_stable()
> __sched_clock_stable_early = 0;
> ...
> if (static_key_count(&sched_clock_running.key) == 2)
> # Only happens at sched_clock_init_late()
> __clear_sched_clock_stable(); # Never executed
> ...
>
> # late_initcall() phase
> sched_clock_init_late()
> if (__sched_clock_stable_early) # Already false
> __set_sched_clock_stable(); # sched_clock is never marked stable
> # TSC unstable, but sched_clock_work won't run to disable irqtime
>
> So we need to disable_sched_clock_irqtime() in sched_clock_init_late()
> if clock is unstable.
>
Do you see this as a valid case? Have you tested with CONFIG_PARAVIRT?
Lets say you have a non native sched clock such as kvm_sched_clock_read.
tsc_init -> sets enable_sched_clock_irqtime()
->mark_tsc_unstable -> if using_native_sched_clock -> clear_sched_clock_stable
In this case, since clear_sched_clock_stable won't be called you may not disable the
sched clock irqtime since __sched_clock_stable_early is reset only in clear_sched_clock_stable
Bigger concern(maybe) is clock source marked as stable still, though called mark_tsc_unstable in
non native sched clock?
Disclaimer: (just curious, seeing this x86 code for first time, so may not know all paths)
On 1/27/2026 7:04 PM, Shrikanth Hegde wrote:
> Hi Wangyang, Prateek.
>
> On 1/27/26 12:55 PM, Wangyang Guo wrote:
>> Read-mostly sched_clock_irqtime may share the same cacheline with
>> frequently updated nohz struct. Make it as static_key to avoid
>> false sharing issue.
>>
>> The only user of disable_sched_clock_irqtime()
>> is tsc_.*mark_unstable() which may be invoked under atomic context
>> and require a workqueue to disable static_key. But both of them
>> calls clear_sched_clock_stable() just before doing
>> disable_sched_clock_irqtime(). We can reuse
>> "sched_clock_work" to also disable sched_clock_irqtime().
>>
>> One additional case need to handle is if the tsc is marked unstable
>> before late_initcall() phase, sched_clock_work will not be invoked
>> and sched_clock_irqtime will stay enabled although clock is unstable:
>> tsc_init()
>> enable_sched_clock_irqtime() # irqtime accounting is enabled here
>> ...
>> if (unsynchronized_tsc()) # true
>> mark_tsc_unstable()
>> clear_sched_clock_stable()
>> __sched_clock_stable_early = 0;
>> ...
>> if (static_key_count(&sched_clock_running.key) == 2)
>> # Only happens at sched_clock_init_late()
>> __clear_sched_clock_stable(); # Never executed
>> ...
>>
>> # late_initcall() phase
>> sched_clock_init_late()
>> if (__sched_clock_stable_early) # Already false
>> __set_sched_clock_stable(); # sched_clock is never marked stable
>> # TSC unstable, but sched_clock_work won't run to disable irqtime
>>
>> So we need to disable_sched_clock_irqtime() in sched_clock_init_late()
>> if clock is unstable.
>>
>
> Do you this as a valid case? have you tested with CONFIG_PARAVIRT?
>
> Lets say you have a non native sched clock such as kvm_sched_clock_read.
> tsc_init -> sets enable_sched_clock_irqtime()
> ->mark_tsc_unstable -> if using_native_sched_clock ->
> clear_sched_clock_stable
>
> In this case, since clear_sched_clock_stable won't be called you may not
> disable the
> sched clock irqtime since __sched_clock_stable_early is reset only in
> clear_sched_clock_stable
For hypervisor, I see this path may call clear_sched_clock_stable when
clock is unstable at init:
kvm_init_platform() ->
kvmclock_init() -> kvm_sched_clock_init(stable):
if (!stable) clear_sched_clock_stable()
paravirt_set_sched_clock(kvm_sched_clock_read)
>
> Bigger concern(maybe) is clock source marked as stable still, though
> called mark_tsc_unstable in
> non native sched clock?
>
> Disclaimer: (just curious, seeing this x86 code for first time, so may
> not know all paths)
>
Yes, when the clock is marked unstable through tsc_.*mark_unstable() with a
non-native sched_clock, clear_sched_clock_stable won't be called, thus
sched_clock_irqtime still stays enabled.
Maybe the dedicated workqueue for sched_clock_irqtime is still needed
considering this case.
On 1/28/2026 7:49 AM, Guo, Wangyang wrote: > Yes, when clock mark unstable through tsc_.*mark_unstable() with non-native_sched_clock, clear_sched_clock_stable won't be called, thus sched_clock_irqtime still keep enabled. > > Maybe the dedicated workqueue for sched_clock_irqtime is still needed considering this case. In that case, shouldn't tsc_init() only enable irqtime when using_native_sched_clock()? How can tsc_init() make a call on irqtime if TSC isn't being used as the sched_clock() ultimately? For kvmclock, if PVCLOCK_TSC_STABLE_BIT is not set, it'll call clear_sched_clock_stable() at kvm_sched_clock_init() but none of the other clocksources do so we can assume once we override the sched_clock() it is up to the sched_clock() provider to deal with the clock stability. -- Thanks and Regards, Prateek
On 1/28/26 8:35 AM, K Prateek Nayak wrote: > On 1/28/2026 7:49 AM, Guo, Wangyang wrote: >> Yes, when clock mark unstable through tsc_.*mark_unstable() with non-native_sched_clock, clear_sched_clock_stable won't be called, thus sched_clock_irqtime still keep enabled. >> >> Maybe the dedicated workqueue for sched_clock_irqtime is still needed considering this case. > > In that case, shouldn't tsc_init() only enable irqtime when > using_native_sched_clock()? How can tsc_init() make a call on irqtime if > TSC isn't being used as the sched_clock() ultimately? > > For kvmclock, if PVCLOCK_TSC_STABLE_BIT is not set, it'll call > clear_sched_clock_stable() at kvm_sched_clock_init() but none of the > other clocksources do so we can assume once we override the sched_clock() > it is up to the sched_clock() provider to deal with the clock stability. > I think this would depend if mark_tsc_unstable happens after system boot, specially while running kvm guest?
On 1/28/2026 11:56 AM, Shrikanth Hegde wrote:
>
>
> On 1/28/26 8:35 AM, K Prateek Nayak wrote:
>> On 1/28/2026 7:49 AM, Guo, Wangyang wrote:
>>> Yes, when clock mark unstable through tsc_.*mark_unstable() with non-native_sched_clock, clear_sched_clock_stable won't be called, thus sched_clock_irqtime still keep enabled.
>>>
>>> Maybe the dedicated workqueue for sched_clock_irqtime is still needed considering this case.
>>
>> In that case, shouldn't tsc_init() only enable irqtime when
>> using_native_sched_clock()? How can tsc_init() make a call on irqtime if
>> TSC isn't being used as the sched_clock() ultimately?
>>
>> For kvmclock, if PVCLOCK_TSC_STABLE_BIT is not set, it'll call
>> clear_sched_clock_stable() at kvm_sched_clock_init() but none of the
>> other clocksources do so we can assume once we override the sched_clock()
>> it is up to the sched_clock() provider to deal with the clock stability.
>>
>
> I think this would depend if mark_tsc_unstable happens after system boot,
> specially while running kvm guest?
I don't see anything on the guest side that would mark the kvmclock as
unstable if host's TSC turns unstable post init and since kvmclock
doesn't set CLOCK_SOURCE_MUST_VERIFY, I doubt if a watchdog runs to
verify it in the guest.
I have the following in the guest:
$ sudo dmesg | grep -i clock
[ 0.000000] kvm-clock: Using msrs 4b564d01 and 4b564d00
[ 0.000000] kvm-clock: using sched offset of 423259259 cycles
[ 0.000002] clocksource: kvm-clock: mask: 0xffffffffffffffff max_cycles: 0x1cd42e4dffb, max_idle_ns: 881590591483 ns
[ 0.071675] clocksource: refined-jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645519600211568 ns
[ 0.378467] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604467 ns
[ 0.388678] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x398cb1e4d56, max_idle_ns: 881590790753 ns
[ 0.679262] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
[ 0.903121] PTP clock support registered
[ 0.927243] clocksource: Switched to clocksource kvm-clock
[ 0.944986] clocksource: acpi_pm: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 2085701024 ns
[ 0.993198] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x398cb1e4d56, max_idle_ns: 881590790753 ns
[ 1.123796] rtc_cmos 00:05: setting system clock to 2026-01-28T07:03:45 UTC (1769583825)
[ 1.155755] sched_clock: Marking stable (940009972, 212965288)->(1171254846, -18279586)
[ 1.712598] clk: Disabling unused clocks
Then I mark TSC unstable on the host
tsc: Marking TSC unstable due to Faking unreliable TSC!
TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.
clocksource: Checking clocksource tsc synchronization from CPU 93 to CPUs 0,2,26,75,101,114,118,195.
sched_clock: Marking unstable (945948313746, 69389667)<-(947618130068, -1600430832)
clocksource: CPU 93 check durations 3436ns - 25277ns for clocksource tsc.
clocksource: Switched to clocksource hpet
And nothing happens in the guest.
cat /sys/devices/system/clocksource/clocksource0/current_clocksource
kvm-clock
If I launch the guest after marking host TSC unstable, I see:
Unstable clock detected, switching default tracing clock to "global"
and I don't get any "sched_clock: Marking stable" messages.
--
Thanks and Regards,
Prateek
On 1/28/26 12:48 PM, K Prateek Nayak wrote: > On 1/28/2026 11:56 AM, Shrikanth Hegde wrote: >> >> >> On 1/28/26 8:35 AM, K Prateek Nayak wrote: >>> On 1/28/2026 7:49 AM, Guo, Wangyang wrote: >>>> Yes, when clock mark unstable through tsc_.*mark_unstable() with non-native_sched_clock, clear_sched_clock_stable won't be called, thus sched_clock_irqtime still keep enabled. >>>> >>>> Maybe the dedicated workqueue for sched_clock_irqtime is still needed considering this case. >>> >>> In that case, shouldn't tsc_init() only enable irqtime when >>> using_native_sched_clock()? How can tsc_init() make a call on irqtime if >>> TSC isn't being used as the sched_clock() ultimately? >>> >>> For kvmclock, if PVCLOCK_TSC_STABLE_BIT is not set, it'll call >>> clear_sched_clock_stable() at kvm_sched_clock_init() but none of the >>> other clocksources do so we can assume once we override the sched_clock() >>> it is up to the sched_clock() provider to deal with the clock stability. >>> >> >> I think this would depend if mark_tsc_unstable happens after system boot, >> specially while running kvm guest? > > I don't see anything on the guest side that would mark the kvmclock as > unstable if host's TSC turns unstable post init and since kvmclock > doesn't set CLOCK_SOURCE_MUST_VERIFY, I doubt if a watchdog runs to > verify it in the guest. > > I have the following in the guest: > > $ sudo dmesg | grep -i clock > [ 0.000000] kvm-clock: Using msrs 4b564d01 and 4b564d00 > [ 0.000000] kvm-clock: using sched offset of 423259259 cycles This means pv_sched_clock is kvm_sched_clock_read from now. and irqtime is enabled in the guest. right? 
> [ 0.000002] clocksource: kvm-clock: mask: 0xffffffffffffffff max_cycles: 0x1cd42e4dffb, max_idle_ns: 881590591483 ns > [ 0.071675] clocksource: refined-jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645519600211568 ns > [ 0.378467] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604467 ns > [ 0.388678] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x398cb1e4d56, max_idle_ns: 881590790753 ns > [ 0.679262] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns > [ 0.903121] PTP clock support registered > [ 0.927243] clocksource: Switched to clocksource kvm-clock > [ 0.944986] clocksource: acpi_pm: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 2085701024 ns > [ 0.993198] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x398cb1e4d56, max_idle_ns: 881590790753 ns > [ 1.123796] rtc_cmos 00:05: setting system clock to 2026-01-28T07:03:45 UTC (1769583825) > [ 1.155755] sched_clock: Marking stable (940009972, 212965288)->(1171254846, -18279586) > [ 1.712598] clk: Disabling unused clocks > > Then I mark TSC unstable on the host > > tsc: Marking TSC unstable due to Faking unreliable TSC! > TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'. > clocksource: Checking clocksource tsc synchronization from CPU 93 to CPUs 0,2,26,75,101,114,118,195. > sched_clock: Marking unstable (945948313746, 69389667)<-(947618130068, -1600430832) > clocksource: CPU 93 check durations 3436ns - 25277ns for clocksource tsc. > clocksource: Switched to clocksource hpet > so now, using_native_sched_clock should fail in guest? If so, with the patch, irqtime won't be disabled no? > And nothing happens in the guest. 
> > cat /sys/devices/system/clocksource/clocksource0/current_clocksource > kvm-clock > > > If I launch the guest after marking host TSC unstable, I see: > > Unstable clock detected, switching default tracing clock to "global" > > and I don't get any "sched_clock: Marking stable" messages. > Maybe kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) won't be set.
On 1/28/2026 1:02 PM, Shrikanth Hegde wrote:
>
>
> On 1/28/26 12:48 PM, K Prateek Nayak wrote:
>> On 1/28/2026 11:56 AM, Shrikanth Hegde wrote:
>>>
>>>
>>> On 1/28/26 8:35 AM, K Prateek Nayak wrote:
>>>> On 1/28/2026 7:49 AM, Guo, Wangyang wrote:
>>>>> Yes, when clock mark unstable through tsc_.*mark_unstable() with non-native_sched_clock, clear_sched_clock_stable won't be called, thus sched_clock_irqtime still keep enabled.
>>>>>
>>>>> Maybe the dedicated workqueue for sched_clock_irqtime is still needed considering this case.
>>>>
>>>> In that case, shouldn't tsc_init() only enable irqtime when
>>>> using_native_sched_clock()? How can tsc_init() make a call on irqtime if
>>>> TSC isn't being used as the sched_clock() ultimately?
>>>>
>>>> For kvmclock, if PVCLOCK_TSC_STABLE_BIT is not set, it'll call
>>>> clear_sched_clock_stable() at kvm_sched_clock_init() but none of the
>>>> other clocksources do so we can assume once we override the sched_clock()
>>>> it is up to the sched_clock() provider to deal with the clock stability.
>>>>
>>>
>>> I think this would depend if mark_tsc_unstable happens after system boot,
>>> specially while running kvm guest?
>>
>> I don't see anything on the guest side that would mark the kvmclock as
>> unstable if host's TSC turns unstable post init and since kvmclock
>> doesn't set CLOCK_SOURCE_MUST_VERIFY, I doubt if a watchdog runs to
>> verify it in the guest.
>>
>> I have the following in the guest:
>>
>> $ sudo dmesg | grep -i clock
>> [ 0.000000] kvm-clock: Using msrs 4b564d01 and 4b564d00
>> [ 0.000000] kvm-clock: using sched offset of 423259259 cycles
>
> This means pv_sched_clock is kvm_sched_clock_read from now. and
> irqtime is enabled in the guest. right?
So within the guest today ...
$ sudo dmesg | grep -i "clock\|tsc"
[ 0.000000] kvm-clock: Using msrs 4b564d01 and 4b564d00
[ 0.000000] kvm-clock: using sched offset of 504626078 cycles
# kvm_sched_clock_init() happens here so it can potentially do
# clear_sched_clock_stable() here if !PVCLOCK_TSC_STABLE_BIT.
[ 0.000002] clocksource: kvm-clock: mask: 0xffffffffffffffff max_cycles: 0x1cd42e4dffb, max_idle_ns: 881590591483 ns
[ 0.000004] tsc: Detected 1996.251 MHz processor
# We enable irqtime here once TSC frequency has been determined
# without considering using_native_sched_clock()
After that TSC is never selected so we don't care if it is stable
or not since it is not the clocksource - the guest continues on
with unstable sched_clock() but also irqtime enabled since TSC
was calibrated successfully.
>
>> [ 0.000002] clocksource: kvm-clock: mask: 0xffffffffffffffff max_cycles: 0x1cd42e4dffb, max_idle_ns: 881590591483 ns
>> [ 0.071675] clocksource: refined-jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645519600211568 ns
>> [ 0.378467] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604467 ns
>> [ 0.388678] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x398cb1e4d56, max_idle_ns: 881590790753 ns
>> [ 0.679262] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
>> [ 0.903121] PTP clock support registered
>> [ 0.927243] clocksource: Switched to clocksource kvm-clock
>> [ 0.944986] clocksource: acpi_pm: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 2085701024 ns
>> [ 0.993198] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x398cb1e4d56, max_idle_ns: 881590790753 ns
>> [ 1.123796] rtc_cmos 00:05: setting system clock to 2026-01-28T07:03:45 UTC (1769583825)
>> [ 1.155755] sched_clock: Marking stable (940009972, 212965288)->(1171254846, -18279586)
>> [ 1.712598] clk: Disabling unused clocks
>>
>> Then I mark TSC unstable on the host
>>
>> tsc: Marking TSC unstable due to Faking unreliable TSC!
>> TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.
>> clocksource: Checking clocksource tsc synchronization from CPU 93 to CPUs 0,2,26,75,101,114,118,195.
>> sched_clock: Marking unstable (945948313746, 69389667)<-(947618130068, -1600430832)
>> clocksource: CPU 93 check durations 3436ns - 25277ns for clocksource tsc.
>> clocksource: Switched to clocksource hpet
>>
>
> so now, using_native_sched_clock should fail in guest? If so, with the patch,
> irqtime won't be disabled no?
Ideally yes, but the guest continues using kvmclock without any hitch.
I think the x86 KVM layer has something to ensure stability but I'm
not 100% sure.
Since I don't see "tsc: Marking TSC unstable ..." or "sched_clock:
Marking unstable ..." in the guest, we don't hit the mark_tsc_unstable()
path within the guest which would disable irqtime today, so essentially
the host's TSC turning unstable doesn't seem to affect the guest.
>
>> And nothing happens in the guest.
>>
>> cat /sys/devices/system/clocksource/clocksource0/current_clocksource
>> kvm-clock
>>
>>
>> If I launch the guest after marking host TSC unstable, I see:
>>
>> Unstable clock detected, switching default tracing clock to "global"
>>
>> and I don't get any "sched_clock: Marking stable" messages.
>>
>
> Maybe kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) won't be set.
True but that is only for new guests launches past marking TSC unstable
on the host.
--
Thanks and Regards,
Prateek
On 1/28/26 1:20 PM, K Prateek Nayak wrote: > On 1/28/2026 1:02 PM, Shrikanth Hegde wrote: >> >> >> On 1/28/26 12:48 PM, K Prateek Nayak wrote: >>> On 1/28/2026 11:56 AM, Shrikanth Hegde wrote: >>>> >>>> >>>> On 1/28/26 8:35 AM, K Prateek Nayak wrote: >>>>> On 1/28/2026 7:49 AM, Guo, Wangyang wrote: >>>>>> Yes, when clock mark unstable through tsc_.*mark_unstable() with non-native_sched_clock, clear_sched_clock_stable won't be called, thus sched_clock_irqtime still keep enabled. >>>>>> >>>>>> Maybe the dedicated workqueue for sched_clock_irqtime is still needed considering this case. >>>>> >>>>> In that case, shouldn't tsc_init() only enable irqtime when >>>>> using_native_sched_clock()? How can tsc_init() make a call on irqtime if >>>>> TSC isn't being used as the sched_clock() ultimately? >>>>> >>>>> For kvmclock, if PVCLOCK_TSC_STABLE_BIT is not set, it'll call >>>>> clear_sched_clock_stable() at kvm_sched_clock_init() but none of the >>>>> other clocksources do so we can assume once we override the sched_clock() >>>>> it is up to the sched_clock() provider to deal with the clock stability. >>>>> >>>> >>>> I think this would depend if mark_tsc_unstable happens after system boot, >>>> specially while running kvm guest? >>> >>> I don't see anything on the guest side that would mark the kvmclock as >>> unstable if host's TSC turns unstable post init and since kvmclock >>> doesn't set CLOCK_SOURCE_MUST_VERIFY, I doubt if a watchdog runs to >>> verify it in the guest. >>> >>> I have the following in the guest: >>> >>> $ sudo dmesg | grep -i clock >>> [ 0.000000] kvm-clock: Using msrs 4b564d01 and 4b564d00 >>> [ 0.000000] kvm-clock: using sched offset of 423259259 cycles >> >> This means pv_sched_clock is kvm_sched_clock_read from now. and >> irqtime is enabled in the guest. right? > > So within the guest today ... 
> > $ sudo dmesg | grep -i "clock\|tsc" > [ 0.000000] kvm-clock: Using msrs 4b564d01 and 4b564d00 > [ 0.000000] kvm-clock: using sched offset of 504626078 cycles > > # kvm_sched_clock_init() happens here so it can potentially do > # clear_sched_clock_stable() here if !PVCLOCK_TSC_STABLE_BIT. > > [ 0.000002] clocksource: kvm-clock: mask: 0xffffffffffffffff max_cycles: 0x1cd42e4dffb, max_idle_ns: 881590591483 ns > [ 0.000004] tsc: Detected 1996.251 MHz processor > > # We enable irqtime here once TSC frequency has been determined > # without considering using_native_sched_clock() > > > After that TSC is never selected so we don't care if it is stable > or not since it is not the clocksource - the guest continues on > with unstable sched_clock() but also irqtime enabled since TSC > was calibrated successfully. > >> >>> [ 0.000002] clocksource: kvm-clock: mask: 0xffffffffffffffff max_cycles: 0x1cd42e4dffb, max_idle_ns: 881590591483 ns >>> [ 0.071675] clocksource: refined-jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645519600211568 ns >>> [ 0.378467] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604467 ns >>> [ 0.388678] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x398cb1e4d56, max_idle_ns: 881590790753 ns >>> [ 0.679262] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns >>> [ 0.903121] PTP clock support registered >>> [ 0.927243] clocksource: Switched to clocksource kvm-clock >>> [ 0.944986] clocksource: acpi_pm: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 2085701024 ns >>> [ 0.993198] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x398cb1e4d56, max_idle_ns: 881590790753 ns >>> [ 1.123796] rtc_cmos 00:05: setting system clock to 2026-01-28T07:03:45 UTC (1769583825) >>> [ 1.155755] sched_clock: Marking stable (940009972, 212965288)->(1171254846, -18279586) >>> [ 1.712598] clk: Disabling unused clocks >>> >>> Then I mark TSC unstable on the 
host >>> >>> tsc: Marking TSC unstable due to Faking unreliable TSC! >>> TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'. >>> clocksource: Checking clocksource tsc synchronization from CPU 93 to CPUs 0,2,26,75,101,114,118,195. >>> sched_clock: Marking unstable (945948313746, 69389667)<-(947618130068, -1600430832) >>> clocksource: CPU 93 check durations 3436ns - 25277ns for clocksource tsc. >>> clocksource: Switched to clocksource hpet >>> >> >> so now, using_native_sched_clock should fail in guest? If so, with the patch, >> irqtime won't be disabled no? > > Ideally yes, but the guest continues using kvmclock without any hitch. > I think the x86 KVM layer has something to ensure stability but I'm > not 100% sure. > > Since I don't see "tsc: Marking TSC unstable ..." or "sched_clock: > Marking unstable ..." in the guest, we don't hit the mark_tsc_unstable() > path within the guest which would disable irqtime today so essentially > host's TSC turning changing doesn't seem to affect the guest. > >> Okay. Fair enough. Then v7 should cover all scenarios i think. with that, Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 505da6689305b1103e9a8ab6636c6a7cf74cd5b1
Gitweb: https://git.kernel.org/tip/505da6689305b1103e9a8ab6636c6a7cf74cd5b1
Author: Wangyang Guo <wangyang.guo@intel.com>
AuthorDate: Tue, 27 Jan 2026 15:25:09 +08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 03 Feb 2026 12:04:19 +01:00
sched/clock: Avoid false sharing for sched_clock_irqtime
Read-mostly sched_clock_irqtime may share the same cacheline with
the frequently updated nohz struct. Make it a static_key to avoid
the false sharing issue.
The only users of disable_sched_clock_irqtime()
are tsc_.*mark_unstable(), which may be invoked in atomic context
and would require a workqueue to disable the static_key. But both of
them call clear_sched_clock_stable() just before doing
disable_sched_clock_irqtime(). We can reuse
"sched_clock_work" to also disable sched_clock_irqtime().
One additional case that needs handling is when the TSC is marked unstable
before the late_initcall() phase: sched_clock_work will not be invoked
and sched_clock_irqtime will stay enabled although the clock is unstable:
tsc_init()
enable_sched_clock_irqtime() # irqtime accounting is enabled here
...
if (unsynchronized_tsc()) # true
mark_tsc_unstable()
clear_sched_clock_stable()
__sched_clock_stable_early = 0;
...
if (static_key_count(&sched_clock_running.key) == 2)
# Only happens at sched_clock_init_late()
__clear_sched_clock_stable(); # Never executed
...
# late_initcall() phase
sched_clock_init_late()
if (__sched_clock_stable_early) # Already false
__set_sched_clock_stable(); # sched_clock is never marked stable
# TSC unstable, but sched_clock_work won't run to disable irqtime
So we need to call disable_sched_clock_irqtime() in sched_clock_init_late()
if the clock is unstable.
Reported-by: Benjamin Lei <benjamin.lei@intel.com>
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Tianyou Li <tianyou.li@intel.com>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260127072509.2627346-1-wangyang.guo@intel.com
---
arch/x86/kernel/tsc.c | 2 --
kernel/sched/clock.c | 3 +++
kernel/sched/cputime.c | 9 +++++----
kernel/sched/sched.h | 4 ++--
4 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7d3e13e..7be44b5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1143,7 +1143,6 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
tsc_unstable = 1;
if (using_native_sched_clock())
clear_sched_clock_stable();
- disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
@@ -1213,7 +1212,6 @@ void mark_tsc_unstable(char *reason)
tsc_unstable = 1;
if (using_native_sched_clock())
clear_sched_clock_stable();
- disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to %s\n", reason);
clocksource_mark_unstable(&clocksource_tsc_early);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index f5e6dd6..2ae4fbf 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -173,6 +173,7 @@ notrace static void __sched_clock_work(struct work_struct *work)
scd->tick_gtod, __gtod_offset,
scd->tick_raw, __sched_clock_offset);
+ disable_sched_clock_irqtime();
static_branch_disable(&__sched_clock_stable);
}
@@ -238,6 +239,8 @@ static int __init sched_clock_init_late(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
+ else
+ disable_sched_clock_irqtime(); /* disable if clock unstable. */
return 0;
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 4f97896..ff0dfca 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -12,6 +12,8 @@
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
/*
* There are no locks covering percpu hardirq/softirq time.
* They are only modified in vtime_account, on corresponding CPU
@@ -25,16 +27,15 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-int sched_clock_irqtime;
-
void enable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 1;
+ static_branch_enable(&sched_clock_irqtime);
}
void disable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 0;
+ if (irqtime_enabled())
+ static_branch_disable(&sched_clock_irqtime);
}
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2aa4251..a821cc8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3333,11 +3333,11 @@ struct irqtime {
};
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
-extern int sched_clock_irqtime;
+DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
static inline int irqtime_enabled(void)
{
- return sched_clock_irqtime;
+ return static_branch_likely(&sched_clock_irqtime);
}
/*
© 2016 - 2026 Red Hat, Inc.