[PATCH v7 1/3] KVM: x86: Advance guest TSC after deep suspend.
Posted by Suleiman Souhlal 2 months, 3 weeks ago
Try to advance the guest TSC to the current time after a suspend
during which the host TSCs went backwards.

This makes the behavior consistent between suspends where the host
TSC resets and suspends where it doesn't, such as suspend-to-idle:
previously, when the host TSC reset, the guests' TSCs would appear
"frozen" across the suspend due to KVM's backwards-TSC prevention,
whereas across a suspend-to-idle they would keep advancing.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Suleiman Souhlal <suleiman@google.com>
---
 arch/x86/include/asm/kvm_host.h |  3 +++
 arch/x86/kvm/x86.c              | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7b9ccdd99f32..3650a513ba19 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1414,6 +1414,9 @@ struct kvm_arch {
 	u64 cur_tsc_offset;
 	u64 cur_tsc_generation;
 	int nr_vcpus_matched_tsc;
+#ifdef CONFIG_X86_64
+	bool host_was_suspended;
+#endif
 
 	u32 default_tsc_khz;
 	bool user_set_tsc;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e21f5f2fe059..6539af701016 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5035,7 +5035,36 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	/* Apply any externally detected TSC adjustments (due to suspend) */
 	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+#ifdef CONFIG_X86_64
+		unsigned long flags;
+		struct kvm *kvm;
+		bool advance;
+		u64 kernel_ns, l1_tsc, offset, tsc_now;
+
+		kvm = vcpu->kvm;
+		advance = kvm_get_time_and_clockread(&kernel_ns, &tsc_now);
+		raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+		/*
+		 * Advance the guest's TSC to current time instead of only
+		 * preventing it from going backwards, while making sure
+		 * all the vCPUs use the same offset.
+		 */
+		if (kvm->arch.host_was_suspended && advance) {
+			l1_tsc = nsec_to_cycles(vcpu,
+						kvm->arch.kvmclock_offset + kernel_ns);
+			offset = kvm_compute_l1_tsc_offset(vcpu, l1_tsc);
+			kvm->arch.cur_tsc_offset = offset;
+			kvm_vcpu_write_tsc_offset(vcpu, offset);
+		} else if (advance) {
+			kvm_vcpu_write_tsc_offset(vcpu, kvm->arch.cur_tsc_offset);
+		} else {
+			adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+		}
+		kvm->arch.host_was_suspended = false;
+		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+#else
 		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+#endif /* CONFIG_X86_64 */
 		vcpu->arch.tsc_offset_adjustment = 0;
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 	}
@@ -12729,6 +12758,9 @@ int kvm_arch_enable_virtualization_cpu(void)
 				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 			}
 
+#ifdef CONFIG_X86_64
+			kvm->arch.host_was_suspended = true;
+#endif
 			/*
 			 * We have to disable TSC offset matching.. if you were
 			 * booting a VM while issuing an S4 host suspend....
-- 
2.50.0.727.gbf7dc18ff4-goog
Re: [PATCH v7 1/3] KVM: x86: Advance guest TSC after deep suspend.
Posted by John Stultz 2 months, 3 weeks ago
On Sun, Jul 13, 2025 at 8:37 PM Suleiman Souhlal <suleiman@google.com> wrote:
>
> [...]
> @@ -5035,7 +5035,36 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>
>         /* Apply any externally detected TSC adjustments (due to suspend) */
>         if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
> +#ifdef CONFIG_X86_64
> +               unsigned long flags;
> +               struct kvm *kvm;
> +               bool advance;
> +               u64 kernel_ns, l1_tsc, offset, tsc_now;
> +
> +               kvm = vcpu->kvm;
> +               advance = kvm_get_time_and_clockread(&kernel_ns, &tsc_now);
> +               raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
> +               /*
> +                * Advance the guest's TSC to current time instead of only
> +                * preventing it from going backwards, while making sure
> +                * all the vCPUs use the same offset.
> +                */
> +               if (kvm->arch.host_was_suspended && advance) {
> +                       l1_tsc = nsec_to_cycles(vcpu,
> +                                               kvm->arch.kvmclock_offset + kernel_ns);
> +                       offset = kvm_compute_l1_tsc_offset(vcpu, l1_tsc);
> +                       kvm->arch.cur_tsc_offset = offset;
> +                       kvm_vcpu_write_tsc_offset(vcpu, offset);
> +               } else if (advance) {
> +                       kvm_vcpu_write_tsc_offset(vcpu, kvm->arch.cur_tsc_offset);
> +               } else {
> +                       adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
> +               }
> +               kvm->arch.host_was_suspended = false;
> +               raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
> +#else
>                 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
> +#endif /* CONFIG_X86_64 */

Just style-wise, it seems like renaming adjust_tsc_offset_host() to
__adjust_tsc_offset_host(), and then moving the ifdefed logic into a
new adjust_tsc_offset_host() implementation might be cleaner?
Then you could have:

#ifdef CONFIG_X86_64
static inline void adjust_tsc_offset_host(...)
{
    /* added logic above */
}
#else
static inline void adjust_tsc_offset_host(...)
{
    __adjust_tsc_offset_host(...);
}
#endif

>                 vcpu->arch.tsc_offset_adjustment = 0;
>                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
>         }
> @@ -12729,6 +12758,9 @@ int kvm_arch_enable_virtualization_cpu(void)
>                                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
>                         }
>
> +#ifdef CONFIG_X86_64
> +                       kvm->arch.host_was_suspended = true;
> +#endif

Similarly I'd wrap this in a:

#ifdef CONFIG_X86_64
static inline void kvm_set_host_was_suspended(struct kvm *kvm)
{
    kvm->arch.host_was_suspended = true;
}
#else
static inline void kvm_set_host_was_suspended(struct kvm *kvm)
{
}
#endif

then call kvm_set_host_was_suspended(kvm) unconditionally in the logic above.
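
Putting the two together, it might end up looking something like this
(rough, untested sketch just to show the shape; __adjust_tsc_offset_host()
here is the renamed original):

#ifdef CONFIG_X86_64
static void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
{
	struct kvm *kvm = vcpu->kvm;
	u64 kernel_ns, l1_tsc, offset, tsc_now;
	unsigned long flags;
	bool advance;

	advance = kvm_get_time_and_clockread(&kernel_ns, &tsc_now);
	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
	if (kvm->arch.host_was_suspended && advance) {
		/* First vCPU in after resume computes and publishes the offset. */
		l1_tsc = nsec_to_cycles(vcpu,
					kvm->arch.kvmclock_offset + kernel_ns);
		offset = kvm_compute_l1_tsc_offset(vcpu, l1_tsc);
		kvm->arch.cur_tsc_offset = offset;
		kvm_vcpu_write_tsc_offset(vcpu, offset);
	} else if (advance) {
		/* Later vCPUs reuse the already-published offset. */
		kvm_vcpu_write_tsc_offset(vcpu, kvm->arch.cur_tsc_offset);
	} else {
		/* No usable clockread; fall back to the old behavior. */
		__adjust_tsc_offset_host(vcpu, adjustment);
	}
	kvm->arch.host_was_suspended = false;
	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
}

static inline void kvm_set_host_was_suspended(struct kvm *kvm)
{
	kvm->arch.host_was_suspended = true;
}
#else
static void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
{
	__adjust_tsc_offset_host(vcpu, adjustment);
}

static inline void kvm_set_host_was_suspended(struct kvm *kvm)
{
}
#endif /* CONFIG_X86_64 */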

thanks
-john
Re: [PATCH v7 1/3] KVM: x86: Advance guest TSC after deep suspend.
Posted by Suleiman Souhlal 2 months, 3 weeks ago
On Fri, Jul 18, 2025 at 5:43 AM John Stultz <jstultz@google.com> wrote:
>
> [...]
> Just style-wise, it seems like renaming adjust_tsc_offset_host() to
> __adjust_tsc_offset_host(), and then moving the ifdefed logic into a
> new adjust_tsc_offset_host() implementation might be cleaner?
> [...]
> then call kvm_set_host_was_suspended(kvm) unconditionally in the logic above.

Thanks for the good suggestions. I'll incorporate them into v8.

-- Suleiman
Re: [PATCH v7 1/3] KVM: x86: Advance guest TSC after deep suspend.
Posted by Tzung-Bi Shih 2 months, 3 weeks ago
On Mon, Jul 14, 2025 at 12:36:47PM +0900, Suleiman Souhlal wrote:
> [...]

Tested again by comparing `date` before and after suspend-to-RAM:
  echo deep >/sys/power/mem_sleep
  echo $(date '+%s' -d '+3 minutes') >/sys/class/rtc/rtc0/wakealarm
  echo mem >/sys/power/state

Without the patch, the guest's `date` is slower (~3 mins) than the host's
after resuming.

Tested-by: Tzung-Bi Shih <tzungbi@kernel.org>

> @@ -5035,7 +5035,36 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  
>  	/* Apply any externally detected TSC adjustments (due to suspend) */
>  	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
> +#ifdef CONFIG_X86_64
> +		unsigned long flags;
> +		struct kvm *kvm;
> +		bool advance;
> +		u64 kernel_ns, l1_tsc, offset, tsc_now;
> +
> +		kvm = vcpu->kvm;
> +		advance = kvm_get_time_and_clockread(&kernel_ns, &tsc_now);
> +		raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
> +		/*
> +		 * Advance the guest's TSC to current time instead of only
> +		 * preventing it from going backwards, while making sure
> +		 * all the vCPUs use the same offset.
> +		 */
> +		if (kvm->arch.host_was_suspended && advance) {
> +			l1_tsc = nsec_to_cycles(vcpu,
> +						kvm->arch.kvmclock_offset + kernel_ns);
> +			offset = kvm_compute_l1_tsc_offset(vcpu, l1_tsc);
> +			kvm->arch.cur_tsc_offset = offset;
> +			kvm_vcpu_write_tsc_offset(vcpu, offset);
> +		} else if (advance) {
> +			kvm_vcpu_write_tsc_offset(vcpu, kvm->arch.cur_tsc_offset);
> +		} else {
> +			adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
> +		}
> +		kvm->arch.host_was_suspended = false;
> +		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
> +#else
>  		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
> +#endif /* CONFIG_X86_64 */

Wondering if it needs to acquire the `tsc_write_lock`, given that:
- The original code adjust_tsc_offset_host() doesn't acquire.  Note:
  adjust_tsc_offset_host() eventually calls kvm_vcpu_write_tsc_offset() too.
- Documentation/virt/kvm/locking.rst [1].

[1] https://elixir.bootlin.com/linux/v6.15/source/Documentation/virt/kvm/locking.rst#L264
Re: [PATCH v7 1/3] KVM: x86: Advance guest TSC after deep suspend.
Posted by Suleiman Souhlal 2 months, 3 weeks ago
On Tue, Jul 15, 2025 at 2:29 PM Tzung-Bi Shih <tzungbi@kernel.org> wrote:
>
> On Mon, Jul 14, 2025 at 12:36:47PM +0900, Suleiman Souhlal wrote:
> > [...]
>
> Tested again by comparing `date` before and after suspend-to-RAM:
>   echo deep >/sys/power/mem_sleep
>   echo $(date '+%s' -d '+3 minutes') >/sys/class/rtc/rtc0/wakealarm
>   echo mem >/sys/power/state
>
> Without the patch, the guest's `date` is slower (~3 mins) than the host's
> after resuming.
>
> Tested-by: Tzung-Bi Shih <tzungbi@kernel.org>

Thanks for testing!

>
> > [...]
>
> Wondering if it needs to acquire the `tsc_write_lock`, given that:
> - The original code adjust_tsc_offset_host() doesn't acquire.  Note:
>   adjust_tsc_offset_host() eventually calls kvm_vcpu_write_tsc_offset() too.
> - Documentation/virt/kvm/locking.rst [1].
>
> [1] https://elixir.bootlin.com/linux/v6.15/source/Documentation/virt/kvm/locking.rst#L264

This is an excellent question.
I used a lock here to make sure that only one VCPU computes the offset
and that all the others reuse it.
It might be doable with atomic operations, but using a lock seemed
simpler to me.
I don't think it has to be tsc_write_lock specifically, but reusing it
for this purpose seemed appropriate to me.
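
For what it's worth, a cmpxchg-based variant might look roughly like
this (hypothetical, untested sketch):

	if (advance && cmpxchg(&kvm->arch.host_was_suspended, true, false)) {
		/* The first vCPU to claim the flag publishes the offset. */
		l1_tsc = nsec_to_cycles(vcpu,
					kvm->arch.kvmclock_offset + kernel_ns);
		smp_store_release(&kvm->arch.cur_tsc_offset,
				  kvm_compute_l1_tsc_offset(vcpu, l1_tsc));
	}
	if (advance)
		kvm_vcpu_write_tsc_offset(vcpu,
				smp_load_acquire(&kvm->arch.cur_tsc_offset));

But a vCPU that loses the race can still load cur_tsc_offset before the
winner has stored it, so it would have to wait for the publication
anyway, at which point the lock is the simpler tool.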

Thanks,
-- Suleiman