[PATCH v5 1/2] KVM: x86: Advance guest TSC after deep suspend.

Posted by Suleiman Souhlal 8 months, 4 weeks ago
Advance guest TSC to current time after suspend when the host
TSCs went backwards.

This makes the behavior consistent between suspends where the host TSC
resets and suspends where it doesn't, such as suspend-to-idle: in the
former case, the guests' TSCs would previously be "frozen" due to KVM's
backwards TSC prevention, while in the latter case they would advance.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Suleiman Souhlal <suleiman@google.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 33 ++++++++++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 32ae3aa50c7e38..f5ce2c2782142b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1399,6 +1399,7 @@ struct kvm_arch {
 	u64 cur_tsc_offset;
 	u64 cur_tsc_generation;
 	int nr_vcpus_matched_tsc;
+	bool host_was_suspended;
 
 	u32 default_tsc_khz;
 	bool user_set_tsc;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b64ab350bcd4d..6b4ea3be66e814 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4971,7 +4971,37 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	/* Apply any externally detected TSC adjustments (due to suspend) */
 	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
-		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+		unsigned long flags;
+		struct kvm *kvm;
+		bool advance;
+		u64 kernel_ns, l1_tsc, offset, tsc_now;
+
+		kvm = vcpu->kvm;
+		advance = kvm_get_time_and_clockread(&kernel_ns,
+		    &tsc_now);
+		raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+		/*
+		 * Advance the guest's TSC to current time instead of only
+		 * preventing it from going backwards, while making sure
+		 * all the vCPUs use the same offset.
+		 */
+		if (kvm->arch.host_was_suspended && advance) {
+			l1_tsc = nsec_to_cycles(vcpu,
+			    vcpu->kvm->arch.kvmclock_offset +
+			    kernel_ns);
+			offset = kvm_compute_l1_tsc_offset(vcpu,
+			    l1_tsc);
+			kvm->arch.cur_tsc_offset = offset;
+			kvm_vcpu_write_tsc_offset(vcpu, offset);
+		} else if (advance)
+			kvm_vcpu_write_tsc_offset(vcpu,
+			    vcpu->kvm->arch.cur_tsc_offset);
+		else
+			adjust_tsc_offset_host(vcpu,
+			    vcpu->arch.tsc_offset_adjustment);
+		kvm->arch.host_was_suspended = 0;
+		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock,
+		    flags);
 		vcpu->arch.tsc_offset_adjustment = 0;
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 	}
@@ -12640,6 +12670,7 @@ int kvm_arch_enable_virtualization_cpu(void)
 				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 			}
 
+			kvm->arch.host_was_suspended = 1;
 			/*
 			 * We have to disable TSC offset matching.. if you were
 			 * booting a VM while issuing an S4 host suspend....
-- 
2.49.0.395.g12beb8f557-goog
Re: [PATCH v5 1/2] KVM: x86: Advance guest TSC after deep suspend.
Posted by Tzung-Bi Shih 8 months ago
On Tue, Mar 25, 2025 at 01:13:49PM +0900, Suleiman Souhlal wrote:
> Advance guest TSC to current time after suspend when the host
> TSCs went backwards.
> 
> This makes the behavior consistent between suspends where the host TSC
> resets and suspends where it doesn't, such as suspend-to-idle: in the
> former case, the guests' TSCs would previously be "frozen" due to KVM's
> backwards TSC prevention, while in the latter case they would advance.
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Suleiman Souhlal <suleiman@google.com>

Tested by comparing `date` before and after suspend-to-RAM[1]:
  echo deep >/sys/power/mem_sleep
  echo $(date '+%s' -d '+3 minutes') >/sys/class/rtc/rtc0/wakealarm
  echo mem >/sys/power/state

Without the patch, the guest's `date` is slower (~3 mins) than the host's
after resuming.

Tested-by: Tzung-Bi Shih <tzungbi@kernel.org>

[1]: https://www.kernel.org/doc/Documentation/power/states.txt

Some non-functional comments inline below.

> @@ -4971,7 +4971,37 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  
>  	/* Apply any externally detected TSC adjustments (due to suspend) */
>  	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
> -		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
> +		unsigned long flags;
> +		struct kvm *kvm;
> +		bool advance;
> +		u64 kernel_ns, l1_tsc, offset, tsc_now;
> +
> +		kvm = vcpu->kvm;

It would be clearer (at least to me) to combine the assignment with the declaration:
  struct kvm *kvm = vcpu->kvm;

Other than that, the following code should make better use of the local
variable, e.g. s/vcpu->kvm/kvm/g.
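That is, the two spots flagged below would become roughly (untested,
just the substitution applied):

		l1_tsc = nsec_to_cycles(vcpu, kvm->arch.kvmclock_offset + kernel_ns);
		kvm_vcpu_write_tsc_offset(vcpu, kvm->arch.cur_tsc_offset);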

> +		advance = kvm_get_time_and_clockread(&kernel_ns,
> +		    &tsc_now);
> +		raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
> +		/*
> +		 * Advance the guest's TSC to current time instead of only
> +		 * preventing it from going backwards, while making sure
> +		 * all the vCPUs use the same offset.
> +		 */
> +		if (kvm->arch.host_was_suspended && advance) {
> +			l1_tsc = nsec_to_cycles(vcpu,
> +			    vcpu->kvm->arch.kvmclock_offset +
                            ^^^^^^^^^
                            kvm

> +			    kernel_ns);
> +			offset = kvm_compute_l1_tsc_offset(vcpu,
> +			    l1_tsc);
> +			kvm->arch.cur_tsc_offset = offset;
> +			kvm_vcpu_write_tsc_offset(vcpu, offset);
> +		} else if (advance)
> +			kvm_vcpu_write_tsc_offset(vcpu,
> +			    vcpu->kvm->arch.cur_tsc_offset);
                            ^^^^^^^^^
			    kvm

> +		else
> +			adjust_tsc_offset_host(vcpu,
> +			    vcpu->arch.tsc_offset_adjustment);

Need braces in `else if` and `else` cases [2].

[2]: https://www.kernel.org/doc/html/latest/process/coding-style.html#placing-braces-and-spaces
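
Since the `if` branch is a multi-line block with braces, the `else if`
and `else` arms should get braces too, e.g. (illustrative only):

		} else if (advance) {
			kvm_vcpu_write_tsc_offset(vcpu, vcpu->kvm->arch.cur_tsc_offset);
		} else {
			adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
		}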


> @@ -12640,6 +12670,7 @@ int kvm_arch_enable_virtualization_cpu(void)
>  				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
>  			}
>  
> +			kvm->arch.host_was_suspended = 1;

Given that it is a bool, how about using `true`?
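That is:

			kvm->arch.host_was_suspended = true;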
Re: [PATCH v5 1/2] KVM: x86: Advance guest TSC after deep suspend.
Posted by Sean Christopherson 7 months, 2 weeks ago
On Tue, Apr 22, 2025, Tzung-Bi Shih wrote:
> On Tue, Mar 25, 2025 at 01:13:49PM +0900, Suleiman Souhlal wrote:
> > Advance guest TSC to current time after suspend when the host
> > TSCs went backwards.
> > 
> > This makes the behavior consistent between suspends where the host TSC
> > resets and suspends where it doesn't, such as suspend-to-idle: in the
> > former case, the guests' TSCs would previously be "frozen" due to KVM's
> > backwards TSC prevention, while in the latter case they would advance.
> > 
> > Suggested-by: Sean Christopherson <seanjc@google.com>
> > Signed-off-by: Suleiman Souhlal <suleiman@google.com>
> 
> Tested by comparing `date` before and after suspend-to-RAM[1]:
>   echo deep >/sys/power/mem_sleep
>   echo $(date '+%s' -d '+3 minutes') >/sys/class/rtc/rtc0/wakealarm
>   echo mem >/sys/power/state
> 
> Without the patch, the guest's `date` is slower (~3 mins) than the host's
> after resuming.
> 
> Tested-by: Tzung-Bi Shih <tzungbi@kernel.org>
> 
> [1]: https://www.kernel.org/doc/Documentation/power/states.txt
> 
> Some non-functional comments inline below.
> 
> > @@ -4971,7 +4971,37 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >  
> >  	/* Apply any externally detected TSC adjustments (due to suspend) */
> >  	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
> > -		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
> > +		unsigned long flags;
> > +		struct kvm *kvm;
> > +		bool advance;
> > +		u64 kernel_ns, l1_tsc, offset, tsc_now;
> > +
> > +		kvm = vcpu->kvm;
> 
> It would be clearer (at least to me) to combine the assignment with the declaration:
>   struct kvm *kvm = vcpu->kvm;
> 
> Other than that, the following code should make better use of the local
> variable, e.g. s/vcpu->kvm/kvm/g.
> 
> > +		advance = kvm_get_time_and_clockread(&kernel_ns,
> > +		    &tsc_now);

In addition to Tzung-Bi's feedback...

Please don't wrap at weird points, and align when you do wrap.  The 80 char limit
isn't a super hard limit, and many of these wraps are well below that anyways.

		advance = kvm_get_time_and_clockread(&kernel_ns, &tsc_now);
		raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
		/*
		 * Advance the guest's TSC to current time instead of only
		 * preventing it from going backwards, while making sure
		 * all the vCPUs use the same offset.
		 */
		if (kvm->arch.host_was_suspended && advance) {
			l1_tsc = nsec_to_cycles(vcpu,
						vcpu->kvm->arch.kvmclock_offset + kernel_ns);
			offset = kvm_compute_l1_tsc_offset(vcpu, l1_tsc);
			kvm->arch.cur_tsc_offset = offset;
			kvm_vcpu_write_tsc_offset(vcpu, offset);
		} else if (advance) {
			kvm_vcpu_write_tsc_offset(vcpu, vcpu->kvm->arch.cur_tsc_offset);
		} else {
			adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
		}
		kvm->arch.host_was_suspended = 0;
		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);


As for the correctness of this code with respect to masterclock and TSC
synchronization, I'm definitely going to have to stare even more, and probably
bring in at least Paolo for a consult, because KVM's TSC code is all kinds of
brittle and complex.