Move KVM's swapping of XFEATURE masks, i.e. XCR0 and XSS, out of the
fastpath loop now that the guts of the #MC handler run in task context,
i.e. won't invoke schedule() with preemption disabled and clobber state
(or crash the kernel) due to trying to context switch XSTATE with a mix
of host and guest state.
For all intents and purposes, this reverts commit 1811d979c716 ("x86/kvm:
move kvm_load/put_guest_xcr0 into atomic context"), which papered over an
egregious bug/flaw in the #MC handler where it would call schedule() even
though IRQs are disabled. E.g. the call stack from the commit:
  kvm_load_guest_xcr0
  ...
  kvm_x86_ops->run(vcpu)
    vmx_vcpu_run
      vmx_complete_atomic_exit
        kvm_machine_check
          do_machine_check
            do_memory_failure
              memory_failure
                lock_page
Commit 1811d979c716 "fixed" the immediate issue of XRSTORS exploding, but
completely ignored that scheduling out a vCPU task while IRQs and
preemption are disabled is wildly broken. Thankfully, commit 5567d11c21a1
("x86/mce: Send #MC singal from task work") (somewhat incidentally?) fixed
that flaw by pushing the meat of the work to the user-return path, i.e. to
task context.
KVM has also hardened itself against #MC goofs by moving #MC forwarding to
kvm_x86_ops.handle_exit_irqoff(), i.e. out of the fastpath. While that's
by no means a robust fix, restoring as much state as possible before
handling the #MC will hopefully provide some measure of protection in the
event that #MC handling goes off the rails again.
Note, KVM always intercepts XCR0 writes for vCPUs without protected state,
i.e. there's no risk of consuming a stale XCR0 when determining if a PKRU
update is needed; kvm_load_host_xfeatures() only reads, and never writes,
vcpu->arch.xcr0.
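For reference, the PKRU update check in question (the tail of
kvm_load_guest_xsave_state(), paraphrased below for context) only ever
reads vcpu->arch.xcr0:

	/* Paraphrased from kvm_load_guest_xsave_state(); reads, never writes, xcr0. */
	if (cpu_feature_enabled(X86_FEATURE_PKU) &&
	    vcpu->arch.pkru != vcpu->arch.host_pkru &&
	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
	     kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
		wrpkru(vcpu->arch.pkru);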
Deferring the XCR0 and XSS loads shaves ~300 cycles off the fastpath for
Intel, and ~500 cycles for AMD. E.g. using INVD in KVM-Unit-Tests'
vmexit.c, with an extra hack to enable CR4.OSXSAVE, latency numbers for
AMD Turin go from ~2000 => ~1500, and for Intel Emerald Rapids from
~1300 => ~1000.
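For the curious, the "extra hack" boils down to something like the below
in KVM-Unit-Tests' x86/vmexit.c (a hypothetical sketch, not the actual
test modification; the helper name and placement are illustrative):

	/*
	 * Hypothetical sketch: force CR4.OSXSAVE on so that KVM exercises
	 * the XCR0/XSS switching paths when measuring INVD latency.
	 */
	static void enable_osxsave(void)
	{
		if (this_cpu_has(X86_FEATURE_XSAVE))
			write_cr4(read_cr4() | X86_CR4_OSXSAVE);
	}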
Cc: Jon Kohler <jon@nutanix.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/x86.c | 39 ++++++++++++++++++++++++++-------------
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4b5d2d09634..b5c2879e3330 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1203,13 +1203,12 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
 
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_guest_xfeatures(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.guest_state_protected)
 		return;
 
 	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
 		if (vcpu->arch.xcr0 != kvm_host.xcr0)
 			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
 
@@ -1217,6 +1216,27 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
 		    vcpu->arch.ia32_xss != kvm_host.xss)
 			wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss);
 	}
+}
+
+static void kvm_load_host_xfeatures(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.guest_state_protected)
+		return;
+
+	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
+		if (vcpu->arch.xcr0 != kvm_host.xcr0)
+			xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
+
+		if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
+		    vcpu->arch.ia32_xss != kvm_host.xss)
+			wrmsrq(MSR_IA32_XSS, kvm_host.xss);
+	}
+}
+
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.guest_state_protected)
+		return;
 
 	if (cpu_feature_enabled(X86_FEATURE_PKU) &&
 	    vcpu->arch.pkru != vcpu->arch.host_pkru &&
@@ -1238,17 +1258,6 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
 		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
 			wrpkru(vcpu->arch.host_pkru);
 	}
-
-	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
-		if (vcpu->arch.xcr0 != kvm_host.xcr0)
-			xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
-
-		if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
-		    vcpu->arch.ia32_xss != kvm_host.xss)
-			wrmsrq(MSR_IA32_XSS, kvm_host.xss);
-	}
-
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state);
 
@@ -11292,6 +11301,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.guest_fpu.xfd_err)
 		wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
 
+	kvm_load_guest_xfeatures(vcpu);
+
 	if (unlikely(vcpu->arch.switch_db_regs &&
 		     !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
 		set_debugreg(DR7_FIXED_1, 7);
@@ -11378,6 +11389,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
 
+	kvm_load_host_xfeatures(vcpu);
+
 	/*
 	 * Sync xfd before calling handle_exit_irqoff() which may
 	 * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
--
2.51.1.930.gacf6e81ea2-goog
On 10/31/2025 6:42 AM, Sean Christopherson wrote:
[...]
>
> -void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
> +static void kvm_load_guest_xfeatures(struct kvm_vcpu *vcpu)
>  {
>  	if (vcpu->arch.guest_state_protected)
>  		return;
>  
>  	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
> -
>  		if (vcpu->arch.xcr0 != kvm_host.xcr0)
>  			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
>  
> @@ -1217,6 +1216,27 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
>  		    vcpu->arch.ia32_xss != kvm_host.xss)
>  			wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss);
>  	}
> +}
> +
> +static void kvm_load_host_xfeatures(struct kvm_vcpu *vcpu)
> +{
> +	if (vcpu->arch.guest_state_protected)
> +		return;
> +
> +	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
> +		if (vcpu->arch.xcr0 != kvm_host.xcr0)
> +			xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
> +
> +		if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
> +		    vcpu->arch.ia32_xss != kvm_host.xss)
> +			wrmsrq(MSR_IA32_XSS, kvm_host.xss);
> +	}
> +}
kvm_load_guest_xfeatures() and kvm_load_host_xfeatures() are almost identical,
differing only in whether guest or host values are loaded.
I am wondering if it is worth adding a helper to dedup the code, like:
static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, u64 xcr0, u64 xss)
{
	if (vcpu->arch.guest_state_protected)
		return;

	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
		if (vcpu->arch.xcr0 != kvm_host.xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, xcr0);

		if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
		    vcpu->arch.ia32_xss != kvm_host.xss)
			wrmsrq(MSR_IA32_XSS, xss);
	}
}
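The two existing functions would then reduce to thin wrappers over the
suggested helper, e.g. (untested sketch, for illustration):

/* Untested sketch: kvm_load_{guest,host}_xfeatures() atop the helper. */
static void kvm_load_guest_xfeatures(struct kvm_vcpu *vcpu)
{
	kvm_load_xfeatures(vcpu, vcpu->arch.xcr0, vcpu->arch.ia32_xss);
}

static void kvm_load_host_xfeatures(struct kvm_vcpu *vcpu)
{
	kvm_load_xfeatures(vcpu, kvm_host.xcr0, kvm_host.xss);
}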
On Wed, Nov 05, 2025, Binbin Wu wrote:
>
>
> On 10/31/2025 6:42 AM, Sean Christopherson wrote:
> [...]
> > [...]
>
> kvm_load_guest_xfeatures() and kvm_load_host_xfeatures() are almost identical,
> differing only in whether guest or host values are loaded.
> I am wondering if it is worth adding a helper to dedup the code, like:
>
> static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, u64 xcr0, u64 xss)
> {
> 	if (vcpu->arch.guest_state_protected)
> 		return;
> 
> 	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
> 		if (vcpu->arch.xcr0 != kvm_host.xcr0)
> 			xsetbv(XCR_XFEATURE_ENABLED_MASK, xcr0);
> 
> 		if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
> 		    vcpu->arch.ia32_xss != kvm_host.xss)
> 			wrmsrq(MSR_IA32_XSS, xss);
> 	}
> }
Nice! I like it. Want to send a proper patch (relative to this series)? Or
I can turn the above into a patch with a Suggested-by. Either way works for me.
On 11/5/2025 10:43 PM, Sean Christopherson wrote:
> On Wed, Nov 05, 2025, Binbin Wu wrote:
>>
>> On 10/31/2025 6:42 AM, Sean Christopherson wrote:
>> [...]
>>> [...]
>> kvm_load_guest_xfeatures() and kvm_load_host_xfeatures() are almost identical,
>> differing only in whether guest or host values are loaded.
>> I am wondering if it is worth adding a helper to dedup the code, like:
>>
>> [...]
> Nice! I like it. Want to send a proper patch (relative to this series)? Or
> I can turn the above into a patch with a Suggested-by. Either way works for me.
>
I can send a patch.