Although the kernel switches over to the stable TSC clocksource instead of
kvmclock, the scheduler still keeps using kvmclock as the sched clock. This
is because kvm_sched_clock_init() updates pv_sched_clock() unconditionally.

Use the clocksource enable/disable callbacks to call kvm_sched_clock_init()
and to restore the previous pv_sched_clock() handler.

As clocksource selection happens in stop machine context, schedule delayed
work to update the static_call().

Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
arch/x86/kernel/kvmclock.c | 34 +++++++++++++++++++++++++++++-----
1 file changed, 29 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 5b2c15214a6b..5cd3717e103b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -21,6 +21,7 @@
 #include <asm/hypervisor.h>
 #include <asm/x86_init.h>
 #include <asm/kvmclock.h>
+#include <asm/timer.h>
 
 static int kvmclock __initdata = 1;
 static int kvmclock_vsyscall __initdata = 1;
@@ -148,12 +149,39 @@ bool kvm_check_and_clear_guest_paused(void)
         return ret;
 }
 
+static u64 (*old_pv_sched_clock)(void);
+
+static void enable_kvm_sc_work(struct work_struct *work)
+{
+        u8 flags;
+
+        old_pv_sched_clock = static_call_query(pv_sched_clock);
+        flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+        kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+}
+
+static DECLARE_DELAYED_WORK(enable_kvm_sc, enable_kvm_sc_work);
+
+static void disable_kvm_sc_work(struct work_struct *work)
+{
+        if (old_pv_sched_clock)
+                paravirt_set_sched_clock(old_pv_sched_clock);
+}
+static DECLARE_DELAYED_WORK(disable_kvm_sc, disable_kvm_sc_work);
+
 static int kvm_cs_enable(struct clocksource *cs)
 {
         vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
+        schedule_delayed_work(&enable_kvm_sc, 0);
+
         return 0;
 }
 
+static void kvm_cs_disable(struct clocksource *cs)
+{
+        schedule_delayed_work(&disable_kvm_sc, 0);
+}
+
 static struct clocksource kvm_clock = {
         .name = "kvm-clock",
         .read = kvm_clock_get_cycles,
@@ -162,6 +190,7 @@ static struct clocksource kvm_clock = {
         .flags = CLOCK_SOURCE_IS_CONTINUOUS,
         .id = CSID_X86_KVM_CLK,
         .enable = kvm_cs_enable,
+        .disable = kvm_cs_disable,
 };
 
 static void kvm_register_clock(char *txt)
@@ -287,8 +316,6 @@ static int kvmclock_setup_percpu(unsigned int cpu)
 
 void __init kvmclock_init(void)
 {
-        u8 flags;
-
         if (!kvm_para_available() || !kvmclock)
                 return;
 
@@ -317,9 +344,6 @@ void __init kvmclock_init(void)
         if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
                 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 
-        flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
-        kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
-
         x86_platform.calibrate_tsc = kvm_get_tsc_khz;
         x86_platform.calibrate_cpu = kvm_get_tsc_khz;
         x86_platform.get_wallclock = kvm_get_wallclock;
--
2.34.1
On Wed, Oct 09, 2024, Nikunj A Dadhania wrote:
> Although the kernel switches over to stable TSC clocksource instead of
> kvmclock, the scheduler still keeps on using kvmclock as the sched clock.
> This is due to kvm_sched_clock_init() updating the pv_sched_clock()
> unconditionally.

All PV clocks are affected by this, no?  This seems like something that should
be handled in common code, which is the point I was trying to make in v11.

> Use the clock source enable/disable callbacks to initialize
> kvm_sched_clock_init() and update the pv_sched_clock().
>
> As the clock selection happens in the stop machine context, schedule
> delayed work to update the static_call()
>
> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
> ---
>  arch/x86/kernel/kvmclock.c | 34 +++++++++++++++++++++++++++++-----
>  1 file changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 5b2c15214a6b..5cd3717e103b 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -21,6 +21,7 @@
>  #include <asm/hypervisor.h>
>  #include <asm/x86_init.h>
>  #include <asm/kvmclock.h>
> +#include <asm/timer.h>
>
>  static int kvmclock __initdata = 1;
>  static int kvmclock_vsyscall __initdata = 1;
> @@ -148,12 +149,39 @@ bool kvm_check_and_clear_guest_paused(void)
>          return ret;
>  }
>
> +static u64 (*old_pv_sched_clock)(void);
> +
> +static void enable_kvm_sc_work(struct work_struct *work)
> +{
> +        u8 flags;
> +
> +        old_pv_sched_clock = static_call_query(pv_sched_clock);
> +        flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
> +        kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
> +}
> +
> +static DECLARE_DELAYED_WORK(enable_kvm_sc, enable_kvm_sc_work);
> +
> +static void disable_kvm_sc_work(struct work_struct *work)
> +{
> +        if (old_pv_sched_clock)

This feels like it should be a WARN condition, as IIUC, pv_sched_clock() should
never be null.  And it _looks_ wrong too, as it means kvm_clock will remain the
sched clock if there was no old clock, which should be impossible.

> +                paravirt_set_sched_clock(old_pv_sched_clock);
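To make the suggestion concrete, a minimal sketch of the guard Sean is describing, assuming the check simply becomes a WARN_ON() (illustrative only, not code from the posted series):

static void disable_kvm_sc_work(struct work_struct *work)
{
        /* A missing saved sched clock should be impossible; flag it loudly. */
        if (!WARN_ON(!old_pv_sched_clock))
                paravirt_set_sched_clock(old_pv_sched_clock);
}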
On 10/9/2024 9:28 PM, Sean Christopherson wrote:
> On Wed, Oct 09, 2024, Nikunj A Dadhania wrote:
>> Although the kernel switches over to stable TSC clocksource instead of
>> kvmclock, the scheduler still keeps on using kvmclock as the sched clock.
>> This is due to kvm_sched_clock_init() updating the pv_sched_clock()
>> unconditionally.
>
> All PV clocks are affected by this, no?

There are two things that we are trying to associate: a registered PV
clocksource, and a PV sched_clock override provided by that PV. Looking at
the code of the various x86 PVs:

a) HyperV does not override the sched clock when the TSC_INVARIANT feature is
   set. It implements something similar to calling kvm_sched_clock_init() only
   when the TSC is not stable [1].

b) VMWare: Exports a reliable TSC to the guest. Does not register a clocksource.
   Overrides the pv_sched_clock with its own version that uses rdtsc().

c) Xen: Overrides the pv_sched_clock. Xen registers its own clocksource. It has
   the same problem as KVM: pv_sched_clock is not switched back to
   native_sched_clock().

Effectively, KVM, Xen and HyperV (when an invariant TSC is not available) can be
handled in a manner similar to this patch, by registering a callback to
override/restore the pv_sched_clock when the corresponding clocksource is chosen
as the default clocksource.

However, since VMWare only wants to override the pv_sched_clock without
registering a PV clocksource, I will need to give some more thought to it as
there is no callback available in this case.

> This seems like something that should
> be handled in common code, which is the point I was trying to make in v11.

Let me think about whether this can be handled in common clocksource code. We
will also need to look at how other archs are using this.

>
>> Use the clock source enable/disable callbacks to initialize
>> kvm_sched_clock_init() and update the pv_sched_clock().
>>
>> As the clock selection happens in the stop machine context, schedule
>> delayed work to update the static_call()
>>
>> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
>> ---
>>  arch/x86/kernel/kvmclock.c | 34 +++++++++++++++++++++++++++++-----
>>  1 file changed, 29 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
>> index 5b2c15214a6b..5cd3717e103b 100644
>> --- a/arch/x86/kernel/kvmclock.c
>> +++ b/arch/x86/kernel/kvmclock.c
>> @@ -21,6 +21,7 @@
>>  #include <asm/hypervisor.h>
>>  #include <asm/x86_init.h>
>>  #include <asm/kvmclock.h>
>> +#include <asm/timer.h>
>>
>>  static int kvmclock __initdata = 1;
>>  static int kvmclock_vsyscall __initdata = 1;
>> @@ -148,12 +149,39 @@ bool kvm_check_and_clear_guest_paused(void)
>>          return ret;
>>  }
>>
>> +static u64 (*old_pv_sched_clock)(void);
>> +
>> +static void enable_kvm_sc_work(struct work_struct *work)
>> +{
>> +        u8 flags;
>> +
>> +        old_pv_sched_clock = static_call_query(pv_sched_clock);
>> +        flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
>> +        kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
>> +}
>> +
>> +static DECLARE_DELAYED_WORK(enable_kvm_sc, enable_kvm_sc_work);
>> +
>> +static void disable_kvm_sc_work(struct work_struct *work)
>> +{
>> +        if (old_pv_sched_clock)
>
> This feels like it should be a WARN condition, as IIUC, pv_sched_clock() should
> never be null.  And it _looks_ wrong too, as it means kvm_clock will remain the
> sched clock if there was no old clock, which should be impossible.

Makes sense, I will add a WARN_ON to catch this condition.
>
>> +                paravirt_set_sched_clock(old_pv_sched_clock);

Regards
Nikunj

1. https://lore.kernel.org/lkml/ef194c25-22d8-204e-ffb6-8f9f0a0621fb@amd.com/
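For comparison, a minimal sketch of the HyperV-style approach described in (a) above: install the paravirt sched clock at init time only when the TSC cannot be trusted, so nothing needs to be restored later. pv_sched_clock_read() and pv_sched_clock_setup() are hypothetical names used purely for illustration:

/* Hypothetical per-hypervisor sched clock reader, for illustration only. */
static u64 pv_sched_clock_read(void);

static void __init pv_sched_clock_setup(void)
{
        /* Invariant TSC: keep native_sched_clock() as the sched clock. */
        if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
            boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                return;

        paravirt_set_sched_clock(pv_sched_clock_read);
}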
On 10/10/2024 3:44 PM, Nikunj A. Dadhania wrote:
>
>
> On 10/9/2024 9:28 PM, Sean Christopherson wrote:
>> On Wed, Oct 09, 2024, Nikunj A Dadhania wrote:
>>> Although the kernel switches over to stable TSC clocksource instead of
>>> kvmclock, the scheduler still keeps on using kvmclock as the sched clock.
>>> This is due to kvm_sched_clock_init() updating the pv_sched_clock()
>>> unconditionally.
>>
>> All PV clocks are affected by this, no?
>
> There are two things that we are trying to associate: a registered PV
> clocksource, and a PV sched_clock override provided by that PV. Looking at
> the code of the various x86 PVs:
>
> a) HyperV does not override the sched clock when the TSC_INVARIANT feature is
>    set. It implements something similar to calling kvm_sched_clock_init() only
>    when the TSC is not stable [1].
>
> b) VMWare: Exports a reliable TSC to the guest. Does not register a clocksource.
>    Overrides the pv_sched_clock with its own version that uses rdtsc().
>
> c) Xen: Overrides the pv_sched_clock. Xen registers its own clocksource. It has
>    the same problem as KVM: pv_sched_clock is not switched back to
>    native_sched_clock().
>
> Effectively, KVM, Xen and HyperV (when an invariant TSC is not available) can be
> handled in a manner similar to this patch, by registering a callback to
> override/restore the pv_sched_clock when the corresponding clocksource is chosen
> as the default clocksource.
>
> However, since VMWare only wants to override the pv_sched_clock without
> registering a PV clocksource, I will need to give some more thought to it as
> there is no callback available in this case.

Adding Xen and VMWare folks for comments/review:

For modern systems that provide a constant, non-stop and stable TSC, the guest
kernel will switch to TSC as the clocksource, and sched_clock should also be
switched to native_sched_clock(). The patch below and the patch here [1] make
the above mentioned changes.

The proposed change overrides the kvm_sched_clock_read()/vmware_sched_clock()/
xen_sched_clock() routine whenever TSC (early or regular) is selected as the
clocksource.

Special note to VMWare folks: commit 80e9a4f21fd7 ("x86/vmware: Add paravirt
sched clock") introduced vmware_sched_clock() in 2016. In the current upstream
version, native_sched_clock() uses __cyc2ns_read(), which is optimized and uses
per-CPU multipliers and shifts that do not change for a constant TSC. Is it fine
for a Linux guest running on VMWare to use native_sched_clock() instead of
vmware_sched_clock()?

From: Nikunj A Dadhania <nikunj@amd.com>
Date: Tue, 28 Nov 2023 18:29:56 +0530
Subject: [RFC PATCH] tsc: Switch to native sched clock

Although the kernel switches over to the stable TSC clocksource instead of a PV
clocksource, the scheduler still keeps using PV clocks as the sched clock
source. This is because KVM, Xen and VMWare switch the paravirt sched clock
handler in their init routines. HyperV is the only PV clock source that checks
whether the platform provides an invariant TSC and does not switch to the PV
sched clock in that case.

When switching back to stable TSC, restore the scheduler clock to
native_sched_clock().
As the clock selection happens in the stop machine context, schedule delayed
work to update the static_call()

Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
 arch/x86/kernel/tsc.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8150f2104474..48ce7afd69dc 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -272,10 +272,25 @@ bool using_native_sched_clock(void)
 {
         return static_call_query(pv_sched_clock) == native_sched_clock;
 }
+
+static void enable_native_sc_work(struct work_struct *work)
+{
+        pr_info("using native sched clock\n");
+        paravirt_set_sched_clock(native_sched_clock);
+}
+static DECLARE_DELAYED_WORK(enable_native_sc, enable_native_sc_work);
+
+static void enable_native_sched_clock(void)
+{
+        if (!using_native_sched_clock())
+                schedule_delayed_work(&enable_native_sc, 0);
+}
 #else
 u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
 
 bool using_native_sched_clock(void) { return true; }
+
+void enable_native_sched_clock(void) { }
 #endif
 
 notrace u64 sched_clock(void)
@@ -1157,6 +1172,10 @@ static void tsc_cs_tick_stable(struct clocksource *cs)
 static int tsc_cs_enable(struct clocksource *cs)
 {
         vclocks_set_used(VDSO_CLOCKMODE_TSC);
+
+        /* Restore native_sched_clock() when switching to TSC */
+        enable_native_sched_clock();
+
         return 0;
 }
 
-- 
2.34.1

1. https://lore.kernel.org/lkml/20241009092850.197575-16-nikunj@amd.com/