Waiting loops like the one in flush_command_buffer() will degenerate to
infinite ones when used early enough for NOW() to still return constant
zero. Make sure the returned value at least monotonically increases. When
available, use nominal frequency values as initial approximation.
Do this only in get_s_time(), as producing a sane value in
get_s_time_fixed() for non-zero inputs won't be reasonably possible.
Put an assertion there.
Reported-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC: This breaks at least the TSM_BOOT case printk_start_of_line(), which
checks for NOW() returning 0 (falling back to TSM_RAW in this case).
For now I have no idea how to avoid this; perhaps that's tolerable at
least in the case where we put in place an early estimate? Should we
maybe weaken the fallback condition to take effect for any value
below 1μs?
RFC: While generally the mentioned waiting loops will take longer to time
out, on a very fast CPU tight loops may time out too early.
RFC: For the AMD/Hygon case, if the "nominal" value isn't available, we
could use the "high" one. That would cause NOW() to run too slowly
(until the scale is properly set), but maybe that's still better than
it returning 0? (As it stands, I can't really test the new code
there, as my Rome system only supplies the lo/hi pair of values.)
RFC: On the 2nd pass through early_cpu_init() it may be okay to skip the
new additions.
With "x86/time: set AP's TSC scale estimate earlier" the counter update
may not need to be atomic anymore, as then only the BSP can reasonably hit
that path.
I don't think Fixes: tags should be put here. If we did, we'd have to
enumerate all introductions of early uses of NOW() (or get_s_time()), with
the exception of those dealing with getting back 0 (which I expect is only
printk_start_of_line()). Will want backporting nevertheless (unless deemed
too risky).
---
v2: Add assertion to get_s_time_fixed(). Use nominal frequencies for very
early setting, if available.
--- unstable.orig/xen/arch/x86/cpu/common.c 2026-05-13 08:35:28.640503356 +0200
+++ unstable/xen/arch/x86/cpu/common.c 2026-05-12 12:30:35.475284195 +0200
@@ -19,6 +19,7 @@
#include <asm/random.h>
#include <asm/setup.h>
#include <asm/shstk.h>
+#include <asm/time.h>
#include <asm/xstate.h>
#include <public/sysctl.h>
@@ -403,6 +404,25 @@ void __init early_cpu_init(bool verbose)
&c->x86_capability[FEATURESET_7d1]);
}
+ if (c->cpuid_level >= 0x15) {
+ cpuid(0x15, &eax, &ebx, &ecx, &edx);
+
+ if (ecx && ebx && eax)
+ preset_tsc_scale(DIV_ROUND_UP(ecx * 1UL * ebx, eax));
+ else if (c->cpuid_level >= 0x16) {
+ /* Assume CPU base freq ≈ TSC freq. */
+ cpuid(0x16, &eax, &ebx, &ecx, &edx);
+ if (eax)
+ preset_tsc_scale(eax * 1000000UL);
+ }
+ } else if (c->vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) {
+ unsigned int nom_mhz = 0;
+
+ amd_process_freq(c, NULL, &nom_mhz, NULL);
+ if (nom_mhz)
+ preset_tsc_scale(nom_mhz * 1000000UL);
+ }
+
eax = cpuid_eax(0x80000000);
if ((eax >> 16) == 0x8000 && eax >= 0x80000008) {
ebx = eax >= 0x8000001f ? cpuid_ebx(0x8000001f) : 0;
--- unstable.orig/xen/arch/x86/include/asm/time.h 2026-05-13 08:35:28.640503356 +0200
+++ unstable/xen/arch/x86/include/asm/time.h 2026-05-12 12:25:14.435489339 +0200
@@ -23,6 +23,7 @@ mktime (unsigned int year, unsigned int
int time_suspend(void);
int time_resume(void);
+void preset_tsc_scale(unsigned long freq);
void init_percpu_time(void);
void time_latch_stamps(void);
--- unstable.orig/xen/arch/x86/time.c 2026-05-13 08:35:28.640503356 +0200
+++ unstable/xen/arch/x86/time.c 2026-05-13 08:33:54.000000000 +0200
@@ -1655,6 +1655,9 @@ s_time_t get_s_time_fixed(u64 at_tsc)
const struct cpu_time *t = &this_cpu(cpu_time);
u64 tsc, delta;
+ /* scale_delta() degenerates when the scale wasn't set yet. */
+ ASSERT(t->tsc_scale.mul_frac);
+
if ( at_tsc )
tsc = at_tsc;
else
@@ -1670,6 +1673,20 @@ s_time_t get_s_time_fixed(u64 at_tsc)
s_time_t get_s_time(void)
{
+ /*
+ * Before the TSC scale is set, avoid returning constant 0 (or whatever
+ * this_cpu(cpu_time).stamp.local_stime is set to). While the returned
+ * value is in no way representing time, it at least increases
+ * monotonically, thus avoiding e.g. waiting loops to degenerate to
+ * entirely infinite ones.
+ */
+ if ( unlikely(!this_cpu(cpu_time).tsc_scale.mul_frac) )
+ {
+ static s_time_t counter;
+
+ return arch_fetch_and_add(&counter, 1);
+ }
+
return get_s_time_fixed(0);
}
@@ -2623,6 +2640,21 @@ int __init init_xen_time(void)
return 0;
}
+/* BSP-only function to pre-set an approximate TSC scale. */
+void __init preset_tsc_scale(unsigned long freq)
+{
+ struct cpu_time *t = &this_cpu(cpu_time);
+
+ /*
+ * The incoming frequency is only approximate (nominal). Increase it by
+ * 1% to make NOW() output rather a little too slow than too fast, thus
+ * avoiding a possible backwards jump once the final scale is set.
+ */
+ freq += DIV_ROUND_UP(freq, 100);
+
+ set_time_scale(&t->tsc_scale, freq);
+ t->stamp.local_tsc = boot_tsc_stamp;
+}
/* Early init function. */
void __init early_time_init(void)
@@ -2640,6 +2672,9 @@ void __init early_time_init(void)
"TSC ADJUST set to %lx on boot CPU - clearing\n", tmp);
wrmsrl(MSR_IA32_TSC_ADJUST, 0);
boot_tsc_stamp -= tmp;
+
+ if ( t->stamp.local_tsc )
+ t->stamp.local_tsc -= tmp;
}
}
On Wed, May 13, 2026 at 08:44:46AM +0200, Jan Beulich wrote:
> Waiting loops like the one in flush_command_buffer() will degenerate to
> infinite ones when used early enough for NOW() to still return constant
> zero. Make sure the returned value at least monotonically increases. When
> available, use nominal frequency values as initial approximation.
>
> Do this only in get_s_time(), as producing a sane value in
> get_s_time_fixed() for non-zero inputs won't be reasonably possible.
> Put an assertion there.
>
> Reported-by: Roger Pau Monné <roger.pau@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> RFC: This breaks at least the TSM_BOOT case printk_start_of_line(), which
> checks for NOW() returning 0 (falling back to TSM_RAW in this case).
> For now I have no idea how to avoid this; perhaps that's tolerable at
> least in the case where we put in place an early estimate? Should we
> maybe weaken the fallback condition to take effect for any value
> below 1μs?
Maybe it's fine to print cycles unconditionally until we reach
SYS_STATE_smp_boot when we know the per-cpu scale is correctly set?
>
> RFC: While generally the mentioned waiting loops will take longer to time
> out, on a very fast CPU tight loops may time out too early.
>
> RFC: For the AMD/Hygon case, if the "nominal" value isn't available, we
> could use the "high" one. That would cause NOW() to run too slowly
> (until the scale is properly set), but maybe that's still better than
> it returning 0? (As it stands, I can't really test the new code
> there, as my Rome system only supplies the lo/hi pair of values.)
Using the "high" frequency would seem fine to me.
>
> RFC: On the 2nd pass through early_cpu_init() it may be okay to skip the
> new additions.
>
> With "x86/time: set AP's TSC scale estimate earlier" the counter update
> may not need to be atomic anymore, as then only the BSP can reasonably hit
> that path.
>
> I don't think Fixes: tags should be put here. If we did, we'd have to
> enumerate all introductions of early uses of NOW() (or get_s_time()), with
> the exception of those dealing with getting back 0 (which I expect is only
> printk_start_of_line()). Will want backporting nevertheless (unless deemed
> too risky).
> ---
> v2: Add assertion to get_s_time_fixed(). Use nominal frequencies for very
> early setting, if available.
>
> --- unstable.orig/xen/arch/x86/cpu/common.c 2026-05-13 08:35:28.640503356 +0200
> +++ unstable/xen/arch/x86/cpu/common.c 2026-05-12 12:30:35.475284195 +0200
> @@ -19,6 +19,7 @@
> #include <asm/random.h>
> #include <asm/setup.h>
> #include <asm/shstk.h>
> +#include <asm/time.h>
> #include <asm/xstate.h>
>
> #include <public/sysctl.h>
> @@ -403,6 +404,25 @@ void __init early_cpu_init(bool verbose)
> &c->x86_capability[FEATURESET_7d1]);
> }
>
> + if (c->cpuid_level >= 0x15) {
> + cpuid(0x15, &eax, &ebx, &ecx, &edx);
> +
> + if (ecx && ebx && eax)
> + preset_tsc_scale(DIV_ROUND_UP(ecx * 1UL * ebx, eax));
> + else if (c->cpuid_level >= 0x16) {
> + /* Assume CPU base freq ≈ TSC freq. */
> + cpuid(0x16, &eax, &ebx, &ecx, &edx);
> + if (eax)
> + preset_tsc_scale(eax * 1000000UL);
> + }
> + } else if (c->vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) {
> + unsigned int nom_mhz = 0;
> +
> + amd_process_freq(c, NULL, &nom_mhz, NULL);
> + if (nom_mhz)
> + preset_tsc_scale(nom_mhz * 1000000UL);
> + }
> +
> eax = cpuid_eax(0x80000000);
> if ((eax >> 16) == 0x8000 && eax >= 0x80000008) {
> ebx = eax >= 0x8000001f ? cpuid_ebx(0x8000001f) : 0;
> --- unstable.orig/xen/arch/x86/include/asm/time.h 2026-05-13 08:35:28.640503356 +0200
> +++ unstable/xen/arch/x86/include/asm/time.h 2026-05-12 12:25:14.435489339 +0200
> @@ -23,6 +23,7 @@ mktime (unsigned int year, unsigned int
> int time_suspend(void);
> int time_resume(void);
>
> +void preset_tsc_scale(unsigned long freq);
> void init_percpu_time(void);
> void time_latch_stamps(void);
>
> --- unstable.orig/xen/arch/x86/time.c 2026-05-13 08:35:28.640503356 +0200
> +++ unstable/xen/arch/x86/time.c 2026-05-13 08:33:54.000000000 +0200
> @@ -1655,6 +1655,9 @@ s_time_t get_s_time_fixed(u64 at_tsc)
> const struct cpu_time *t = &this_cpu(cpu_time);
> u64 tsc, delta;
>
> + /* scale_delta() degenerates when the scale wasn't set yet. */
> + ASSERT(t->tsc_scale.mul_frac);
> +
> if ( at_tsc )
> tsc = at_tsc;
> else
> @@ -1670,6 +1673,20 @@ s_time_t get_s_time_fixed(u64 at_tsc)
>
> s_time_t get_s_time(void)
> {
> + /*
> + * Before the TSC scale is set, avoid returning constant 0 (or whatever
> + * this_cpu(cpu_time).stamp.local_stime is set to). While the returned
> + * value is in no way representing time, it at least increases
> + * monotonically, thus avoiding e.g. waiting loops to degenerate to
> + * entirely infinite ones.
> + */
> + if ( unlikely(!this_cpu(cpu_time).tsc_scale.mul_frac) )
> + {
> + static s_time_t counter;
> +
> + return arch_fetch_and_add(&counter, 1);
> + }
> +
> return get_s_time_fixed(0);
> }
>
> @@ -2623,6 +2640,21 @@ int __init init_xen_time(void)
> return 0;
> }
>
> +/* BSP-only function to pre-set an approximate TSC scale. */
> +void __init preset_tsc_scale(unsigned long freq)
> +{
> + struct cpu_time *t = &this_cpu(cpu_time);
> +
> + /*
> + * The incoming frequency is only approximate (nominal). Increase it by
> + * 1% to make NOW() output rather a little too slow than too fast, thus
> + * avoiding a possible backwards jump once the final scale is set.
> + */
> + freq += DIV_ROUND_UP(freq, 100);
To avoid such possible jump backwards, won't it safer to also update
the ->local_stime and ->local_tsc fields at the time the new scale is
set? Updatign those ahead of setting the new scale should avoid any
backward jumps.
Thanks, Roger.
On 14.05.2026 17:56, Roger Pau Monné wrote:
> On Wed, May 13, 2026 at 08:44:46AM +0200, Jan Beulich wrote:
>> Waiting loops like the one in flush_command_buffer() will degenerate to
>> infinite ones when used early enough for NOW() to still return constant
>> zero. Make sure the returned value at least monotonically increases. When
>> available, use nominal frequency values as initial approximation.
>>
>> Do this only in get_s_time(), as producing a sane value in
>> get_s_time_fixed() for non-zero inputs won't be reasonably possible.
>> Put an assertion there.
>>
>> Reported-by: Roger Pau Monné <roger.pau@citrix.com>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> RFC: This breaks at least the TSM_BOOT case printk_start_of_line(), which
>> checks for NOW() returning 0 (falling back to TSM_RAW in this case).
>> For now I have no idea how to avoid this; perhaps that's tolerable at
>> least in the case where we put in place an early estimate? Should we
>> maybe weaken the fallback condition to take effect for any value
>> below 1μs?
>
> Maybe it's fine to print cycles unconditionally until we reach
> SYS_STATE_smp_boot when we know the per-cpu scale is correctly set?
I remain of the opinion (as said in reply to your similar v1 comment) that
this isn't very desirable. Tying to SYS_STATE_smp_boot also would feel
pretty arbitrary. Other ports may have NOW() properly working much earlier.
If anything we may want to add a global indicator of NOW() properly working.
>> RFC: While generally the mentioned waiting loops will take longer to time
>> out, on a very fast CPU tight loops may time out too early.
>>
>> RFC: For the AMD/Hygon case, if the "nominal" value isn't available, we
>> could use the "high" one. That would cause NOW() to run too slowly
>> (until the scale is properly set), but maybe that's still better than
>> it returning 0? (As it stands, I can't really test the new code
>> there, as my Rome system only supplies the lo/hi pair of values.)
>
> Using the "high" frequency would seem fine to me.
Okay, will do then for v3.
Related aspect: With these family/model specific additions for AMD, we could
also separate out intel_log_freq()'s model specific part, to leverage from
here as well.
>> @@ -2623,6 +2640,21 @@ int __init init_xen_time(void)
>> return 0;
>> }
>>
>> +/* BSP-only function to pre-set an approximate TSC scale. */
>> +void __init preset_tsc_scale(unsigned long freq)
>> +{
>> + struct cpu_time *t = &this_cpu(cpu_time);
>> +
>> + /*
>> + * The incoming frequency is only approximate (nominal). Increase it by
>> + * 1% to make NOW() output rather a little too slow than too fast, thus
>> + * avoiding a possible backwards jump once the final scale is set.
>> + */
>> + freq += DIV_ROUND_UP(freq, 100);
>
> To avoid such possible jump backwards, won't it safer to also update
> the ->local_stime and ->local_tsc fields at the time the new scale is
> set? Updatign those ahead of setting the new scale should avoid any
> backward jumps.
->stamp.local_tsc does get updated; you merely dropped that line from reply
context. As to local_stime - how could we possibly set that, when we didn't
get through init_platform_timer() yet? Leaving it at 0 is the correct
match for setting local_tsc to boot_tsc_stamp.
Jan
On Fri, May 15, 2026 at 09:15:40AM +0200, Jan Beulich wrote:
> On 14.05.2026 17:56, Roger Pau Monné wrote:
> > On Wed, May 13, 2026 at 08:44:46AM +0200, Jan Beulich wrote:
> >> Waiting loops like the one in flush_command_buffer() will degenerate to
> >> infinite ones when used early enough for NOW() to still return constant
> >> zero. Make sure the returned value at least monotonically increases. When
> >> available, use nominal frequency values as initial approximation.
> >>
> >> Do this only in get_s_time(), as producing a sane value in
> >> get_s_time_fixed() for non-zero inputs won't be reasonably possible.
> >> Put an assertion there.
> >>
> >> Reported-by: Roger Pau Monné <roger.pau@citrix.com>
> >> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> >> ---
> >> RFC: This breaks at least the TSM_BOOT case printk_start_of_line(), which
> >> checks for NOW() returning 0 (falling back to TSM_RAW in this case).
> >> For now I have no idea how to avoid this; perhaps that's tolerable at
> >> least in the case where we put in place an early estimate? Should we
> >> maybe weaken the fallback condition to take effect for any value
> >> below 1μs?
> >
> > Maybe it's fine to print cycles unconditionally until we reach
> > SYS_STATE_smp_boot when we know the per-cpu scale is correctly set?
>
> I remain of the opinion (as said in reply to your similar v1 comment) that
> this isn't very desirable. Tying to SYS_STATE_smp_boot also would feel
> pretty arbitrary. Other ports may have NOW() properly working much earlier.
> If anything we may want to add a global indicator of NOW() properly working.
I would be fine with such indicator.
> >> RFC: While generally the mentioned waiting loops will take longer to time
> >> out, on a very fast CPU tight loops may time out too early.
> >>
> >> RFC: For the AMD/Hygon case, if the "nominal" value isn't available, we
> >> could use the "high" one. That would cause NOW() to run too slowly
> >> (until the scale is properly set), but maybe that's still better than
> >> it returning 0? (As it stands, I can't really test the new code
> >> there, as my Rome system only supplies the lo/hi pair of values.)
> >
> > Using the "high" frequency would seem fine to me.
>
> Okay, will do then for v3.
>
> Related aspect: With these family/model specific additions for AMD, we could
> also separate out intel_log_freq()'s model specific part, to leverage from
> here as well.
Hm, yes, that would reduce the duplication of the added logic.
> >> @@ -2623,6 +2640,21 @@ int __init init_xen_time(void)
> >> return 0;
> >> }
> >>
> >> +/* BSP-only function to pre-set an approximate TSC scale. */
> >> +void __init preset_tsc_scale(unsigned long freq)
> >> +{
> >> + struct cpu_time *t = &this_cpu(cpu_time);
> >> +
> >> + /*
> >> + * The incoming frequency is only approximate (nominal). Increase it by
> >> + * 1% to make NOW() output rather a little too slow than too fast, thus
> >> + * avoiding a possible backwards jump once the final scale is set.
> >> + */
> >> + freq += DIV_ROUND_UP(freq, 100);
> >
> > To avoid such possible jump backwards, won't it safer to also update
> > the ->local_stime and ->local_tsc fields at the time the new scale is
> > set? Updatign those ahead of setting the new scale should avoid any
> > backward jumps.
>
> ->stamp.local_tsc does get updated; you merely dropped that line from reply
> context. As to local_stime - how could we possibly set that, when we didn't
> get through init_platform_timer() yet? Leaving it at 0 is the correct
> match for setting local_tsc to boot_tsc_stamp.
Please bear with me, maybe I'm not understanding exactly to what the
code comment refers to as "possible backwards jump once the final
scale is set". I assume you refer to the setting of scale
early_time_init()? The ->stamp.local_tsc value also gets updated at
that point, so it's not possible for the timer going backwards?
This changed with the addition of the init_percpu_time() call in
early_time_init(), and makes the setting of "t->stamp.local_tsc =
boot_tsc_stamp" pointless, as it will get overwritten by the logic in
init_percpu_time() a couple of lines after?
Thanks, _Roger.
On 15.05.2026 15:12, Roger Pau Monné wrote:
> On Fri, May 15, 2026 at 09:15:40AM +0200, Jan Beulich wrote:
>> On 14.05.2026 17:56, Roger Pau Monné wrote:
>>> On Wed, May 13, 2026 at 08:44:46AM +0200, Jan Beulich wrote:
>>>> @@ -2623,6 +2640,21 @@ int __init init_xen_time(void)
>>>> return 0;
>>>> }
>>>>
>>>> +/* BSP-only function to pre-set an approximate TSC scale. */
>>>> +void __init preset_tsc_scale(unsigned long freq)
>>>> +{
>>>> + struct cpu_time *t = &this_cpu(cpu_time);
>>>> +
>>>> + /*
>>>> + * The incoming frequency is only approximate (nominal). Increase it by
>>>> + * 1% to make NOW() output rather a little too slow than too fast, thus
>>>> + * avoiding a possible backwards jump once the final scale is set.
>>>> + */
>>>> + freq += DIV_ROUND_UP(freq, 100);
>>>
>>> To avoid such possible jump backwards, won't it safer to also update
>>> the ->local_stime and ->local_tsc fields at the time the new scale is
>>> set? Updatign those ahead of setting the new scale should avoid any
>>> backward jumps.
>>
>> ->stamp.local_tsc does get updated; you merely dropped that line from reply
>> context. As to local_stime - how could we possibly set that, when we didn't
>> get through init_platform_timer() yet? Leaving it at 0 is the correct
>> match for setting local_tsc to boot_tsc_stamp.
>
> Please bear with me, maybe I'm not understanding exactly to what the
> code comment refers to as "possible backwards jump once the final
> scale is set". I assume you refer to the setting of scale
> early_time_init()? The ->stamp.local_tsc value also gets updated at
> that point, so it's not possible for the timer going backwards?
It is updated there, but only to boot_tsc_stamp. I.e. no change at all
if preset_tsc_scale() set the field already.
> This changed with the addition of the init_percpu_time() call in
> early_time_init(), and makes the setting of "t->stamp.local_tsc =
> boot_tsc_stamp" pointless, as it will get overwritten by the logic in
> init_percpu_time() a couple of lines after?
When making these changes, I first thought so too. But no, that write
isn't pointless: In case preset_tsc_scale() wasn't called, leaving the
field at 0 would break the use of get_s_time_fixed() out of
init_percpu_time(). (Iirc I only noticed this because of having put
debug printk()s there for other purposes.)
Jan
On Mon, May 18, 2026 at 10:05:41AM +0200, Jan Beulich wrote:
> On 15.05.2026 15:12, Roger Pau Monné wrote:
> > On Fri, May 15, 2026 at 09:15:40AM +0200, Jan Beulich wrote:
> >> On 14.05.2026 17:56, Roger Pau Monné wrote:
> >>> On Wed, May 13, 2026 at 08:44:46AM +0200, Jan Beulich wrote:
> >>>> @@ -2623,6 +2640,21 @@ int __init init_xen_time(void)
> >>>> return 0;
> >>>> }
> >>>>
> >>>> +/* BSP-only function to pre-set an approximate TSC scale. */
> >>>> +void __init preset_tsc_scale(unsigned long freq)
> >>>> +{
> >>>> + struct cpu_time *t = &this_cpu(cpu_time);
> >>>> +
> >>>> + /*
> >>>> + * The incoming frequency is only approximate (nominal). Increase it by
> >>>> + * 1% to make NOW() output rather a little too slow than too fast, thus
> >>>> + * avoiding a possible backwards jump once the final scale is set.
> >>>> + */
> >>>> + freq += DIV_ROUND_UP(freq, 100);
> >>>
> >>> To avoid such possible jump backwards, won't it safer to also update
> >>> the ->local_stime and ->local_tsc fields at the time the new scale is
> >>> set? Updatign those ahead of setting the new scale should avoid any
> >>> backward jumps.
> >>
> >> ->stamp.local_tsc does get updated; you merely dropped that line from reply
> >> context. As to local_stime - how could we possibly set that, when we didn't
> >> get through init_platform_timer() yet? Leaving it at 0 is the correct
> >> match for setting local_tsc to boot_tsc_stamp.
> >
> > Please bear with me, maybe I'm not understanding exactly to what the
> > code comment refers to as "possible backwards jump once the final
> > scale is set". I assume you refer to the setting of scale
> > early_time_init()? The ->stamp.local_tsc value also gets updated at
> > that point, so it's not possible for the timer going backwards?
>
> It is updated there, but only to boot_tsc_stamp. I.e. no change at all
> if preset_tsc_scale() set the field already.
Couldn't we do the following in early_init_time() to ensure time
doesn't go backwards:
if ( t->tsc_scale.mul_frac )
{
/*
* Update time snapshot to ensure time doesn't go backwards as a
* result of the scale change done below.
*/
t->stamp.local_tsc = rdtsc_ordered();
t->stamp.local_stime = get_s_time_fixed(t->stamp.local_tsc);
}
else
t->stamp.local_tsc = boot_tsc_stamp;
set_time_scale(&t->tsc_scale, tmp);
init_percpu_time();
That's kind of the same logic that's used in cpu_frequency_change()
ahead of calling set_time_scale().
> > This changed with the addition of the init_percpu_time() call in
> > early_time_init(), and makes the setting of "t->stamp.local_tsc =
> > boot_tsc_stamp" pointless, as it will get overwritten by the logic in
> > init_percpu_time() a couple of lines after?
>
> When making these changes, I first thought so too. But no, that write
> isn't pointless: In case preset_tsc_scale() wasn't called, leaving the
> field at 0 would break the use of get_s_time_fixed() out of
> init_percpu_time(). (Iirc I only noticed this because of having put
> debug printk()s there for other purposes.)
I see, yes, that would be needed.
Regards, Roger.
On 20.05.2026 12:02, Roger Pau Monné wrote:
> On Mon, May 18, 2026 at 10:05:41AM +0200, Jan Beulich wrote:
>> On 15.05.2026 15:12, Roger Pau Monné wrote:
>>> On Fri, May 15, 2026 at 09:15:40AM +0200, Jan Beulich wrote:
>>>> On 14.05.2026 17:56, Roger Pau Monné wrote:
>>>>> On Wed, May 13, 2026 at 08:44:46AM +0200, Jan Beulich wrote:
>>>>>> @@ -2623,6 +2640,21 @@ int __init init_xen_time(void)
>>>>>> return 0;
>>>>>> }
>>>>>>
>>>>>> +/* BSP-only function to pre-set an approximate TSC scale. */
>>>>>> +void __init preset_tsc_scale(unsigned long freq)
>>>>>> +{
>>>>>> + struct cpu_time *t = &this_cpu(cpu_time);
>>>>>> +
>>>>>> + /*
>>>>>> + * The incoming frequency is only approximate (nominal). Increase it by
>>>>>> + * 1% to make NOW() output rather a little too slow than too fast, thus
>>>>>> + * avoiding a possible backwards jump once the final scale is set.
>>>>>> + */
>>>>>> + freq += DIV_ROUND_UP(freq, 100);
>>>>>
>>>>> To avoid such possible jump backwards, won't it safer to also update
>>>>> the ->local_stime and ->local_tsc fields at the time the new scale is
>>>>> set? Updatign those ahead of setting the new scale should avoid any
>>>>> backward jumps.
>>>>
>>>> ->stamp.local_tsc does get updated; you merely dropped that line from reply
>>>> context. As to local_stime - how could we possibly set that, when we didn't
>>>> get through init_platform_timer() yet? Leaving it at 0 is the correct
>>>> match for setting local_tsc to boot_tsc_stamp.
>>>
>>> Please bear with me, maybe I'm not understanding exactly to what the
>>> code comment refers to as "possible backwards jump once the final
>>> scale is set". I assume you refer to the setting of scale
>>> early_time_init()? The ->stamp.local_tsc value also gets updated at
>>> that point, so it's not possible for the timer going backwards?
>>
>> It is updated there, but only to boot_tsc_stamp. I.e. no change at all
>> if preset_tsc_scale() set the field already.
>
> Couldn't we do the following in early_init_time() to ensure time
> doesn't go backwards:
>
> if ( t->tsc_scale.mul_frac )
> {
> /*
> * Update time snapshot to ensure time doesn't go backwards as a
> * result of the scale change done below.
> */
> t->stamp.local_tsc = rdtsc_ordered();
> t->stamp.local_stime = get_s_time_fixed(t->stamp.local_tsc);
> }
> else
> t->stamp.local_tsc = boot_tsc_stamp;
>
> set_time_scale(&t->tsc_scale, tmp);
> init_percpu_time();
Yes, this would prevent stime going backwards. But at the same time it
would latch the imprecise calculations done initially. The farther off
the initial scale (e.g. when using "high" in case "nominal" isn't
available, as discussed previously), the bigger the error. That may be
mainly (only?) a cosmetic thing, but still.
Actually, if we did that, what would prevent stime going backwards by
the update done in init_percpu_time()? Aiui we really want .local_stime
to be written there for the very first time.
> That's kind of the same logic that's used in cpu_frequency_change()
> ahead of calling set_time_scale().
That function is ripe for removal anyway, so I'm hesitant to derive
anything from it.
Jan
© 2016 - 2026 Red Hat, Inc.