Documentation/virt/kvm/api.rst | 6 ++++++ arch/x86/kvm/x86.c | 28 +++++++++++++++++++++++----- arch/x86/kvm/xen.c | 3 ++- include/uapi/linux/kvm.h | 1 + 4 files changed, 32 insertions(+), 6 deletions(-)
From: Paul Durrant <pdurrant@amazon.com>
Unless explicitly told to do so (by passing 'clocksource=tsc' and
'tsc=stable:socket', and then jumping through some hoops concerning
potential CPU hotplug) Xen will never use TSC as its clocksource.
Hence, by default, a Xen guest will not see PVCLOCK_TSC_STABLE_BIT set
in either the primary or secondary pvclock memory areas. This has
led to bugs in some guest kernels which only become evident if
PVCLOCK_TSC_STABLE_BIT *is* set in the pvclocks. Hence, to support
such guests, give the VMM a new Xen HVM config flag to tell KVM to
forcibly clear the bit in the Xen pvclocks.
Signed-off-by: Paul Durrant <pdurrant@amazon.com>
---
Documentation/virt/kvm/api.rst | 6 ++++++
arch/x86/kvm/x86.c | 28 +++++++++++++++++++++++-----
arch/x86/kvm/xen.c | 3 ++-
include/uapi/linux/kvm.h | 1 +
4 files changed, 32 insertions(+), 6 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 21a7578142a1..9752a01270df 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8252,6 +8252,7 @@ PVHVM guests. Valid flags are::
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6)
+ #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7)
The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
ioctl is available, for the guest to set its hypercall page.
@@ -8295,6 +8296,11 @@ behave more correctly, not using the XEN_RUNSTATE_UPDATE flag until/unless
specifically enabled (by the guest making the hypercall, causing the VMM
to enable the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute).
+The KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE flag indicates that KVM supports
+clearing the PVCLOCK_TSC_STABLE_BIT flag in Xen pvclock sources. This will be
+done when the KVM_CAP_XEN_HVM ioctl sets the
+KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE flag.
+
8.31 KVM_CAP_PPC_MULTITCE
-------------------------
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 41cce5031126..6abad6dacf07 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3096,7 +3096,8 @@ u64 get_kvmclock_ns(struct kvm *kvm)
static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
struct gfn_to_pfn_cache *gpc,
- unsigned int offset)
+ unsigned int offset,
+ bool force_tsc_unstable)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info *guest_hv_clock;
@@ -3122,6 +3123,10 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
*/
guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
+
+ if (force_tsc_unstable)
+ guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
+
smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
@@ -3154,6 +3159,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
u8 pvclock_flags;
bool use_master_clock;
+ /*
+ * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
+ * explicitly told to use TSC as its clocksource Xen will not set this bit.
+ * This default behaviour led to bugs in some guest kernels which cause
+ * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
+ */
+ bool xen_pvclock_tsc_unstable =
+ ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+
kernel_ns = 0;
host_tsc = 0;
@@ -3231,12 +3245,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
if (vcpu->pv_time.active)
- kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+ kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
+
if (vcpu->xen.vcpu_info_cache.active)
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
- offsetof(struct compat_vcpu_info, time));
+ offsetof(struct compat_vcpu_info, time),
+ xen_pvclock_tsc_unstable);
if (vcpu->xen.vcpu_time_info_cache.active)
- kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
+ kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
+ xen_pvclock_tsc_unstable);
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -4531,7 +4548,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
KVM_XEN_HVM_CONFIG_SHARED_INFO |
KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
- KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 40edf4d1974c..959061315953 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1113,7 +1113,8 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
{
/* Only some feature flags need to be *enabled* by userspace */
u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
- KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
if (xhc->flags & ~permitted_flags)
return -EINVAL;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 13065dd96132..e21b53e8358d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1282,6 +1282,7 @@ struct kvm_x86_mce {
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6)
+#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7)
struct kvm_xen_hvm_config {
__u32 flags;
--
2.39.2
On Tue, Oct 31, 2023, Paul Durrant wrote:
> From: Paul Durrant <pdurrant@amazon.com>
>
> Unless explicitly told to do so (by passing 'clocksource=tsc' and
> 'tsc=stable:socket', and then jumping through some hoops concerning
> potential CPU hotplug) Xen will never use TSC as its clocksource.
> Hence, by default, a Xen guest will not see PVCLOCK_TSC_STABLE_BIT set
> in either the primary or secondary pvclock memory areas. This has
> led to bugs in some guest kernels which only become evident if
> PVCLOCK_TSC_STABLE_BIT *is* set in the pvclocks. Hence, to support
> such guests, give the VMM a new Xen HVM config flag to tell KVM to
> forcibly clear the bit in the Xen pvclocks.
>
> Signed-off-by: Paul Durrant <pdurrant@amazon.com>
> ---
> Documentation/virt/kvm/api.rst | 6 ++++++
> arch/x86/kvm/x86.c | 28 +++++++++++++++++++++++-----
> arch/x86/kvm/xen.c | 3 ++-
> include/uapi/linux/kvm.h | 1 +
> 4 files changed, 32 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 21a7578142a1..9752a01270df 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -8252,6 +8252,7 @@ PVHVM guests. Valid flags are::
> #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
> #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
> #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6)
> + #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7)
>
> The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
> ioctl is available, for the guest to set its hypercall page.
> @@ -8295,6 +8296,11 @@ behave more correctly, not using the XEN_RUNSTATE_UPDATE flag until/unless
> specifically enabled (by the guest making the hypercall, causing the VMM
> to enable the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute).
>
> +The KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE flag indicates that KVM supports
> +clearing the PVCLOCK_TSC_STABLE_BIT flag in Xen pvclock sources. This will be
> +done when the KVM_CAP_XEN_HVM ioctl sets the
> +KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE flag.
> +
> 8.31 KVM_CAP_PPC_MULTITCE
> -------------------------
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 41cce5031126..6abad6dacf07 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3096,7 +3096,8 @@ u64 get_kvmclock_ns(struct kvm *kvm)
>
> static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
> struct gfn_to_pfn_cache *gpc,
> - unsigned int offset)
> + unsigned int offset,
> + bool force_tsc_unstable)
> {
> struct kvm_vcpu_arch *vcpu = &v->arch;
> struct pvclock_vcpu_time_info *guest_hv_clock;
> @@ -3122,6 +3123,10 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
> */
>
> guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
> +
> + if (force_tsc_unstable)
> + guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
I don't see how this works. This clears the bit in the guest copy, then clobbers
all of guest_hv_clock with a memcpy().
if (force_tsc_unstable)
guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
if (vcpu->pvclock_set_guest_stopped_request) {
vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
vcpu->pvclock_set_guest_stopped_request = false;
}
memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); <= sets PVCLOCK_TSC_STABLE_BIT again, no?
smp_wmb();
Any reason not to make this a generic "capability" instead of a Xen specific flag?
E.g. I assume these problematic guests would mishandle PVCLOCK_TSC_STABLE_BIT if
it showed up in kvmclock, but they don't use kvmclock so it's not a problem in
practice.
I doubt there's a real need or use case, but it'd require less churn and IMO is
simpler overall, e.g.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e3eb608b6692..731b201bfd5a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3225,7 +3225,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* If the host uses TSC clocksource, then it is stable */
pvclock_flags = 0;
- if (use_master_clock)
+ if (use_master_clock && !vcpu->kvm.force_tsc_unstable)
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
vcpu->hv_clock.flags = pvclock_flags;
I also assume this is a "set and forget" thing? I.e. KVM can require the flag
to be set before any vCPUs are created.
© 2016 - 2025 Red Hat, Inc.