[PATCH v12 16/19] x86/kvmclock: Use clock source callback to update kvm sched clock

Posted by Nikunj A Dadhania 1 month, 2 weeks ago
Although the kernel switches over to stable TSC clocksource instead of
kvmclock, the scheduler still keeps on using kvmclock as the sched clock.
This is due to kvm_sched_clock_init() updating the pv_sched_clock()
unconditionally.

Use the clock source enable/disable callbacks to invoke
kvm_sched_clock_init() and update pv_sched_clock().

As the clock selection happens in the stop machine context, schedule
delayed work to update the static_call().

Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
 arch/x86/kernel/kvmclock.c | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 5b2c15214a6b..5cd3717e103b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -21,6 +21,7 @@
 #include <asm/hypervisor.h>
 #include <asm/x86_init.h>
 #include <asm/kvmclock.h>
+#include <asm/timer.h>
 
 static int kvmclock __initdata = 1;
 static int kvmclock_vsyscall __initdata = 1;
@@ -148,12 +149,39 @@ bool kvm_check_and_clear_guest_paused(void)
 	return ret;
 }
 
+static u64 (*old_pv_sched_clock)(void);
+
+static void enable_kvm_sc_work(struct work_struct *work)
+{
+	u8 flags;
+
+	old_pv_sched_clock = static_call_query(pv_sched_clock);
+	flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+}
+
+static DECLARE_DELAYED_WORK(enable_kvm_sc, enable_kvm_sc_work);
+
+static void disable_kvm_sc_work(struct work_struct *work)
+{
+	if (old_pv_sched_clock)
+		paravirt_set_sched_clock(old_pv_sched_clock);
+}
+static DECLARE_DELAYED_WORK(disable_kvm_sc, disable_kvm_sc_work);
+
 static int kvm_cs_enable(struct clocksource *cs)
 {
 	vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
+	schedule_delayed_work(&enable_kvm_sc, 0);
+
 	return 0;
 }
 
+static void kvm_cs_disable(struct clocksource *cs)
+{
+	schedule_delayed_work(&disable_kvm_sc, 0);
+}
+
 static struct clocksource kvm_clock = {
 	.name	= "kvm-clock",
 	.read	= kvm_clock_get_cycles,
@@ -162,6 +190,7 @@ static struct clocksource kvm_clock = {
 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 	.id     = CSID_X86_KVM_CLK,
 	.enable	= kvm_cs_enable,
+	.disable = kvm_cs_disable,
 };
 
 static void kvm_register_clock(char *txt)
@@ -287,8 +316,6 @@ static int kvmclock_setup_percpu(unsigned int cpu)
 
 void __init kvmclock_init(void)
 {
-	u8 flags;
-
 	if (!kvm_para_available() || !kvmclock)
 		return;
 
@@ -317,9 +344,6 @@ void __init kvmclock_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
 		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 
-	flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
-	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
-
 	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
 	x86_platform.calibrate_cpu = kvm_get_tsc_khz;
 	x86_platform.get_wallclock = kvm_get_wallclock;
-- 
2.34.1
Re: [PATCH v12 16/19] x86/kvmclock: Use clock source callback to update kvm sched clock
Posted by Sean Christopherson 1 month, 2 weeks ago
On Wed, Oct 09, 2024, Nikunj A Dadhania wrote:
> Although the kernel switches over to stable TSC clocksource instead of
> kvmclock, the scheduler still keeps on using kvmclock as the sched clock.
> This is due to kvm_sched_clock_init() updating the pv_sched_clock()
> unconditionally.

All PV clocks are affected by this, no?  This seems like something that should
be handled in common code, which is the point I was trying to make in v11.

> Use the clock source enable/disable callbacks to invoke
> kvm_sched_clock_init() and update pv_sched_clock().
> 
> As the clock selection happens in the stop machine context, schedule
> delayed work to update the static_call().
> 
> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
> ---
>  arch/x86/kernel/kvmclock.c | 34 +++++++++++++++++++++++++++++-----
>  1 file changed, 29 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 5b2c15214a6b..5cd3717e103b 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -21,6 +21,7 @@
>  #include <asm/hypervisor.h>
>  #include <asm/x86_init.h>
>  #include <asm/kvmclock.h>
> +#include <asm/timer.h>
>  
>  static int kvmclock __initdata = 1;
>  static int kvmclock_vsyscall __initdata = 1;
> @@ -148,12 +149,39 @@ bool kvm_check_and_clear_guest_paused(void)
>  	return ret;
>  }
>  
> +static u64 (*old_pv_sched_clock)(void);
> +
> +static void enable_kvm_sc_work(struct work_struct *work)
> +{
> +	u8 flags;
> +
> +	old_pv_sched_clock = static_call_query(pv_sched_clock);
> +	flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
> +	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
> +}
> +
> +static DECLARE_DELAYED_WORK(enable_kvm_sc, enable_kvm_sc_work);
> +
> +static void disable_kvm_sc_work(struct work_struct *work)
> +{
> +	if (old_pv_sched_clock)

This feels like it should be a WARN condition, as IIUC, pv_sched_clock() should
never be null.  And it _looks_ wrong too, as it means kvm_clock will remain the
sched clock if there was no old clock, which should be impossible.

> +		paravirt_set_sched_clock(old_pv_sched_clock);
Re: [PATCH v12 16/19] x86/kvmclock: Use clock source callback to update kvm sched clock
Posted by Nikunj A. Dadhania 1 month, 2 weeks ago

On 10/9/2024 9:28 PM, Sean Christopherson wrote:
> On Wed, Oct 09, 2024, Nikunj A Dadhania wrote:
>> Although the kernel switches over to stable TSC clocksource instead of
>> kvmclock, the scheduler still keeps on using kvmclock as the sched clock.
>> This is due to kvm_sched_clock_init() updating the pv_sched_clock()
>> unconditionally.
> 
> All PV clocks are affected by this, no?

There are two things that we are trying to associate: a registered PV
clocksource and a PV sched_clock override provided by that PV. Looking at
the code of the various x86 PV implementations:

a) HyperV does not override the sched clock when the TSC_INVARIANT feature is set.
   It implements something similar to calling kvm_sched_clock_init() only when the
   TSC is not stable [1].

b) VMware: Exports a reliable TSC to the guest. Does not register a clocksource.
   Overrides the pv_sched_clock with its own version that uses rdtsc().

c) Xen: Overrides the pv_sched_clock. Xen registers its own clocksource. It has
   the same problem as KVM: pv_sched_clock is not switched back to native_sched_clock().

Effectively, KVM, Xen and HyperV (when the TSC_INVARIANT feature is not available) can
be handled in a manner similar to this patch, by registering a callback to override/restore
the pv_sched_clock when the corresponding clocksource is chosen as the default clocksource.
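
As a rough illustration of that idea for Xen (untested, and glossing over the
fact that, as in this patch, the actual static_call() update would have to be
deferred to a workqueue because clocksource selection runs in stop machine
context), something like:

static u64 (*xen_saved_sched_clock)(void);

static int xen_cs_enable(struct clocksource *cs)
{
	/* Remember the current sched clock and switch to Xen's version. */
	xen_saved_sched_clock = static_call_query(pv_sched_clock);
	paravirt_set_sched_clock(xen_sched_clock);
	return 0;
}

static void xen_cs_disable(struct clocksource *cs)
{
	/* Xen clocksource deselected, restore the previous sched clock. */
	paravirt_set_sched_clock(xen_saved_sched_clock);
}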

However, since VMware only wants to override the pv_sched_clock without registering a
PV clocksource, I will need to give it some more thought, as there is no clocksource
callback available in that case.

> This seems like something that should
> be handled in common code, which is the point I was trying to make in v11.

Let me think about whether this can be handled in common clocksource code.
We will also need to look at how other archs use this.

> 
>> Use the clock source enable/disable callbacks to invoke
>> kvm_sched_clock_init() and update pv_sched_clock().
>>
>> As the clock selection happens in the stop machine context, schedule
>> delayed work to update the static_call().
>>
>> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
>> ---
>>  arch/x86/kernel/kvmclock.c | 34 +++++++++++++++++++++++++++++-----
>>  1 file changed, 29 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
>> index 5b2c15214a6b..5cd3717e103b 100644
>> --- a/arch/x86/kernel/kvmclock.c
>> +++ b/arch/x86/kernel/kvmclock.c
>> @@ -21,6 +21,7 @@
>>  #include <asm/hypervisor.h>
>>  #include <asm/x86_init.h>
>>  #include <asm/kvmclock.h>
>> +#include <asm/timer.h>
>>  
>>  static int kvmclock __initdata = 1;
>>  static int kvmclock_vsyscall __initdata = 1;
>> @@ -148,12 +149,39 @@ bool kvm_check_and_clear_guest_paused(void)
>>  	return ret;
>>  }
>>  
>> +static u64 (*old_pv_sched_clock)(void);
>> +
>> +static void enable_kvm_sc_work(struct work_struct *work)
>> +{
>> +	u8 flags;
>> +
>> +	old_pv_sched_clock = static_call_query(pv_sched_clock);
>> +	flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
>> +	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
>> +}
>> +
>> +static DECLARE_DELAYED_WORK(enable_kvm_sc, enable_kvm_sc_work);
>> +
>> +static void disable_kvm_sc_work(struct work_struct *work)
>> +{
>> +	if (old_pv_sched_clock)
> 
> This feels like it should be a WARN condition, as IIUC, pv_sched_clock() should
> never be null.  And it _looks_ wrong too, as it means kvm_clock will remain the
> sched clock if there was no old clock, which should be impossible.

Makes sense, I will add a WARN_ON to catch this condition.
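
Something along these lines on top of this patch (untested):

static void disable_kvm_sc_work(struct work_struct *work)
{
	/*
	 * The enable path should always have saved the previous sched clock
	 * before kvm-clock can be deselected.  Warn and bail out otherwise,
	 * rather than silently keeping kvm_sched_clock_read() as the sched
	 * clock.
	 */
	if (WARN_ON(!old_pv_sched_clock))
		return;

	paravirt_set_sched_clock(old_pv_sched_clock);
}
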

> 
>> +		paravirt_set_sched_clock(old_pv_sched_clock);

Regards
Nikunj

1. https://lore.kernel.org/lkml/ef194c25-22d8-204e-ffb6-8f9f0a0621fb@amd.com/
Re: [PATCH v12 16/19] x86/kvmclock: Use clock source callback to update kvm sched clock
Posted by Nikunj A. Dadhania 1 month, 1 week ago

On 10/10/2024 3:44 PM, Nikunj A. Dadhania wrote:
> 
> 
> On 10/9/2024 9:28 PM, Sean Christopherson wrote:
>> On Wed, Oct 09, 2024, Nikunj A Dadhania wrote:
>>> Although the kernel switches over to stable TSC clocksource instead of
>>> kvmclock, the scheduler still keeps on using kvmclock as the sched clock.
>>> This is due to kvm_sched_clock_init() updating the pv_sched_clock()
>>> unconditionally.
>>
>> All PV clocks are affected by this, no?
> 
> There are two things that we are trying to associate: a registered PV
> clocksource and a PV sched_clock override provided by that PV. Looking at
> the code of the various x86 PV implementations:
> 
> a) HyperV does not override the sched clock when the TSC_INVARIANT feature is set.
>    It implements something similar to calling kvm_sched_clock_init() only when the
>    TSC is not stable [1].
> 
> b) VMware: Exports a reliable TSC to the guest. Does not register a clocksource.
>    Overrides the pv_sched_clock with its own version that uses rdtsc().
> 
> c) Xen: Overrides the pv_sched_clock. Xen registers its own clocksource. It has
>    the same problem as KVM: pv_sched_clock is not switched back to native_sched_clock().
> 
> Effectively, KVM, Xen and HyperV (when the TSC_INVARIANT feature is not available) can
> be handled in a manner similar to this patch, by registering a callback to override/restore
> the pv_sched_clock when the corresponding clocksource is chosen as the default clocksource.
> 
> However, since VMware only wants to override the pv_sched_clock without registering a
> PV clocksource, I will need to give it some more thought, as there is no clocksource
> callback available in that case.

Adding Xen and VMware folks for comments/review:
For modern systems that provide a constant, non-stop and stable TSC, the guest
kernel will switch to TSC as the clocksource, and sched_clock should also be
switched to native_sched_clock().

The patch below and the patch here [1] make the above-mentioned changes. The
proposed change will override the kvm_sched_clock_read()/vmware_sched_clock()/
xen_sched_clock() routine whenever TSC (early or regular) is selected as the
clocksource.

Special note to the VMware folks:
Commit 80e9a4f21fd7 ("x86/vmware: Add paravirt sched clock") introduced
vmware_sched_clock() in 2016. In the current upstream version,
native_sched_clock() uses __cyc2ns_read(), which is optimized and uses a
per-CPU multiplier and shift that do not change for a constant TSC. Is it
fine for a Linux guest running on VMware to use native_sched_clock()
instead of vmware_sched_clock()?
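
For reference, a simplified sketch of the per-CPU mult/shift conversion that
native_sched_clock() builds on (not the exact upstream code, which is per-CPU,
seqcount protected and noinstr-safe in arch/x86/kernel/tsc.c):

#include <linux/math64.h>

struct cyc2ns_data {
	u32 cyc2ns_mul;
	u32 cyc2ns_shift;
	u64 cyc2ns_offset;
};

static u64 cycles_to_ns(struct cyc2ns_data *d, u64 cycles)
{
	/* ns = offset + (cycles * mul) >> shift; mul/shift stay fixed for a constant TSC */
	return d->cyc2ns_offset +
	       mul_u64_u32_shr(cycles, d->cyc2ns_mul, d->cyc2ns_shift);
}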

From: Nikunj A Dadhania <nikunj@amd.com>
Date: Tue, 28 Nov 2023 18:29:56 +0530
Subject: [RFC PATCH] tsc: Switch to native sched clock

Although the kernel switches over to the stable TSC clocksource instead of a
PV clocksource, the scheduler still keeps using the PV clock as the sched
clock source. This is because KVM, Xen and VMware switch the paravirt sched
clock handler in their init routines. HyperV is the only PV clock source
that checks whether the platform provides an invariant TSC and does not
switch to the PV sched clock.

When switching back to the stable TSC, restore the scheduler clock to
native_sched_clock().

As the clock selection happens in the stop machine context, schedule
delayed work to update the static_call().

Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
 arch/x86/kernel/tsc.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8150f2104474..48ce7afd69dc 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -272,10 +272,25 @@ bool using_native_sched_clock(void)
 {
 	return static_call_query(pv_sched_clock) == native_sched_clock;
 }
+
+static void enable_native_sc_work(struct work_struct *work)
+{
+	pr_info("using native sched clock\n");
+	paravirt_set_sched_clock(native_sched_clock);
+}
+static DECLARE_DELAYED_WORK(enable_native_sc, enable_native_sc_work);
+
+static void enable_native_sched_clock(void)
+{
+	if (!using_native_sched_clock())
+		schedule_delayed_work(&enable_native_sc, 0);
+}
 #else
 u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
 
 bool using_native_sched_clock(void) { return true; }
+
+void enable_native_sched_clock(void) { }
 #endif
 
 notrace u64 sched_clock(void)
@@ -1157,6 +1172,10 @@ static void tsc_cs_tick_stable(struct clocksource *cs)
 static int tsc_cs_enable(struct clocksource *cs)
 {
 	vclocks_set_used(VDSO_CLOCKMODE_TSC);
+
+	/* Restore native_sched_clock() when switching to TSC */
+	enable_native_sched_clock();
+
 	return 0;
 }
 
-- 
2.34.1

1. https://lore.kernel.org/lkml/20241009092850.197575-16-nikunj@amd.com/