From: Sean Christopherson <seanjc@google.com>
Load the guest's FPU state if userspace is accessing MSRs whose values
are managed by XSAVES. Introduce two helpers, kvm_{get,set}_xstate_msr(),
to facilitate access to such MSRs.
If MSRs supported in kvm_caps.supported_xss are passed through to the guest,
the guest MSRs are swapped with the host's before the vCPU exits to userspace
and after it reenters the kernel before the next VM-entry.
Because the modified code is also used for the KVM_GET_MSRS device ioctl(),
explicitly check @vcpu is non-null before attempting to load guest state.
The XSAVE-managed MSRs cannot be retrieved via the device ioctl() without
loading guest FPU state (which doesn't exist).
Note that guest_cpuid_has() is not queried as host userspace is allowed to
access MSRs that have not been exposed to the guest, e.g. it might do
KVM_SET_MSRS prior to KVM_SET_CPUID2.
The two helpers are placed here to make clear that accessing XSAVE-managed
MSRs requires special checks and handling to guarantee the correctness of
reads from and writes to those MSRs.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Yang Weijiang <weijiang.yang@intel.com>
Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Tested-by: Mathias Krause <minipli@grsecurity.net>
Tested-by: John Allen <john.allen@amd.com>
Tested-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Chao Gao <chao.gao@intel.com>
---
arch/x86/kvm/x86.c | 35 ++++++++++++++++++++++++++++++++++-
arch/x86/kvm/x86.h | 24 ++++++++++++++++++++++++
2 files changed, 58 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6b01c6e9330e..799ac76679c9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -136,6 +136,9 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static DEFINE_MUTEX(vendor_module_lock);
+static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
+static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+
struct kvm_x86_ops kvm_x86_ops __read_mostly;
#define KVM_X86_OP(func) \
@@ -4566,6 +4569,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
EXPORT_SYMBOL_GPL(kvm_get_msr_common);
+/*
+ * Returns true if the MSR in question is managed via XSTATE, i.e. is context
+ * switched with the rest of guest FPU state.
+ */
+static bool is_xstate_managed_msr(u32 index)
+{
+ switch (index) {
+ case MSR_IA32_U_CET:
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
* Read or write a bunch of msrs. All parameters are kernel addresses.
*
@@ -4576,11 +4594,26 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
int (*do_msr)(struct kvm_vcpu *vcpu,
unsigned index, u64 *data))
{
+ bool fpu_loaded = false;
int i;
- for (i = 0; i < msrs->nmsrs; ++i)
+ for (i = 0; i < msrs->nmsrs; ++i) {
+ /*
+ * If userspace is accessing one or more XSTATE-managed MSRs,
+ * temporarily load the guest's FPU state so that the guest's
+ * MSR value(s) is resident in hardware, i.e. so that KVM can
+ * get/set the MSR via RDMSR/WRMSR.
+ */
+ if (vcpu && !fpu_loaded && kvm_caps.supported_xss &&
+ is_xstate_managed_msr(entries[i].index)) {
+ kvm_load_guest_fpu(vcpu);
+ fpu_loaded = true;
+ }
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
+ }
+ if (fpu_loaded)
+ kvm_put_guest_fpu(vcpu);
return i;
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index eb3088684e8a..d90f1009ac10 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -701,4 +701,28 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl,
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+/*
+ * Lock and/or reload guest FPU and access xstate MSRs. For accesses initiated
+ * by host, guest FPU is loaded in __msr_io(). For accesses initiated by guest,
+ * guest FPU should have been loaded already.
+ */
+
+static inline void kvm_get_xstate_msr(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info)
+{
+ KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
+ kvm_fpu_get();
+ rdmsrl(msr_info->index, msr_info->data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_set_xstate_msr(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info)
+{
+ KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
+ kvm_fpu_get();
+ wrmsrl(msr_info->index, msr_info->data);
+ kvm_fpu_put();
+}
+
#endif
--
2.47.3
On 8/21/2025 6:30 AM, Chao Gao wrote: > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > index eb3088684e8a..d90f1009ac10 100644 > --- a/arch/x86/kvm/x86.h > +++ b/arch/x86/kvm/x86.h > @@ -701,4 +701,28 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, > > int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); > > +/* > + * Lock and/or reload guest FPU and access xstate MSRs. For accesses initiated > + * by host, guest FPU is loaded in __msr_io(). For accesses initiated by guest, > + * guest FPU should have been loaded already. > + */ > + > +static inline void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, > + struct msr_data *msr_info) > +{ > + KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); > + kvm_fpu_get(); > + rdmsrl(msr_info->index, msr_info->data); s/rdmsrl/rdmsrq/ > + kvm_fpu_put(); > +} > + > +static inline void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, > + struct msr_data *msr_info) > +{ > + KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); > + kvm_fpu_get(); > + wrmsrl(msr_info->index, msr_info->data); s/wrmsrl/wrmsrq/ Perhaps it's time to remove rdmsrl() and wrmsrl(), as keeping them around won't trigger any errors when the old APIs are still being used. > + kvm_fpu_put(); > +} > + > #endif
On Tue, Aug 26, 2025, Xin Li wrote: > On 8/21/2025 6:30 AM, Chao Gao wrote: > > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > > index eb3088684e8a..d90f1009ac10 100644 > > --- a/arch/x86/kvm/x86.h > > +++ b/arch/x86/kvm/x86.h > > @@ -701,4 +701,28 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, > > int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); > > +/* > > + * Lock and/or reload guest FPU and access xstate MSRs. For accesses initiated > > + * by host, guest FPU is loaded in __msr_io(). For accesses initiated by guest, > > + * guest FPU should have been loaded already. > > + */ > > + > > +static inline void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, > > + struct msr_data *msr_info) > > +{ > > + KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); > > + kvm_fpu_get(); > > + rdmsrl(msr_info->index, msr_info->data); > > s/rdmsrl/rdmsrq/ > > > + kvm_fpu_put(); > > +} > > + > > +static inline void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, > > + struct msr_data *msr_info) > > +{ > > + KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); > > + kvm_fpu_get(); > > + wrmsrl(msr_info->index, msr_info->data); > > s/wrmsrl/wrmsrq/ > > > Perhaps it's time to remove rdmsrl() and wrmsrl(), as keeping them around > won't trigger any errors when the old APIs are still being used. Yeah, we should bite the bullet and force in-flight code to adapt. I was _this_ close to making the same goof in the mediated PMU series, and IIRC it was only some random conflict that alerted me to using the old/wrong APIs.
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 6b01c6e9330e..799ac76679c9 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -4566,6 +4569,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > } > EXPORT_SYMBOL_GPL(kvm_get_msr_common); > > +/* > + * Returns true if the MSR in question is managed via XSTATE, i.e. is context > + * switched with the rest of guest FPU state. > + */ > +static bool is_xstate_managed_msr(u32 index) > +{ > + switch (index) { > + case MSR_IA32_U_CET: Why MSR_IA32_S_CET is not included here? > + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: > + return true; > + default: > + return false; > + } > +} Is it better to do? static bool is_xstate_managed_msr(u32 index) { if (!kvm_caps.supported_xss) return false; switch (index) { case MSR_IA32_U_CET: case MSR_IA32_S_CET: case MSR_IA32_PL1_SSP ... MSR_IA32_PL3_SSP: return kvm_caps.supported_xss & XFEATURE_MASK_CET_USER && kvm_caps.supported_xss & XFEATURE_MASK_CET_KERNEL; default: return false; } } And it would be obvious how to add new MSRs related to other XFEATURE bits. Thanks! Xin
On Sun, Aug 24, 2025 at 06:52:55PM -0700, Xin Li wrote: >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >> index 6b01c6e9330e..799ac76679c9 100644 >> --- a/arch/x86/kvm/x86.c >> +++ b/arch/x86/kvm/x86.c >> @@ -4566,6 +4569,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) >> } >> EXPORT_SYMBOL_GPL(kvm_get_msr_common); >> +/* >> + * Returns true if the MSR in question is managed via XSTATE, i.e. is context >> + * switched with the rest of guest FPU state. >> + */ >> +static bool is_xstate_managed_msr(u32 index) >> +{ >> + switch (index) { >> + case MSR_IA32_U_CET: > > >Why MSR_IA32_S_CET is not included here? Emm. I didn't think about this. MSR_IA32_S_CET is read from or written to a dedicated VMCS/B field, so KVM doesn't need to load the guest FPU to access MSR_IA32_S_CET. This pairs with the kvm_{get,set}_xstate_msr() in kvm_{get,set}_msr_common(). That said, userspace writes can indeed cause an inconsistency between the guest FPU and VMCS fields regarding MSR_IA32_S_CET. If migration occurs right after a userspace write (without a VM-entry, which would bring them in sync) and userspace just restores MSR_IA32_S_CET from the guest FPU, the write before migration could be lost. If that migration issue is a practical problem, I think MSR_IA32_S_CET should be included here, and we need to perform a kvm_set_xstate_msr() after writing to the VMCS/B. > > >> + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: >> + return true; >> + default: >> + return false; >> + } >> +} > > >Is it better to do? > >static bool is_xstate_managed_msr(u32 index) >{ > if (!kvm_caps.supported_xss) > return false; > > switch (index) { > case MSR_IA32_U_CET: > case MSR_IA32_S_CET: > case MSR_IA32_PL1_SSP ... MSR_IA32_PL3_SSP: > return kvm_caps.supported_xss & XFEATURE_MASK_CET_USER && > kvm_caps.supported_xss & XFEATURE_MASK_CET_KERNEL; > default: > return false; This will duplicate checks in other functions. 
I slightly prefer to keep this function super simple and do all capability checks in __kvm_{set,get}_msr() or kvm_emulate_msr_{write,read}. > } >} > >And it would be obvious how to add new MSRs related to other XFEATURE bits. Just return true for all those MSRs, regardless of host capabilities. If kvm_caps doesn't support them, those MSRs are not advertised to userspace either (see kvm_probe_msr_to_save()). Loading or putting the guest FPU when userspace attempts to read/write those unsupported MSRs shouldn't cause any performance issues, as userspace is unlikely to access them in hot paths. > >Thanks! > Xin
On 8/24/2025 7:55 PM, Chao Gao wrote: >> static bool is_xstate_managed_msr(u32 index) >> { >> if (!kvm_caps.supported_xss) >> return false; >> >> switch (index) { >> case MSR_IA32_U_CET: >> case MSR_IA32_S_CET: >> case MSR_IA32_PL1_SSP ... MSR_IA32_PL3_SSP: >> return kvm_caps.supported_xss & XFEATURE_MASK_CET_USER && >> kvm_caps.supported_xss & XFEATURE_MASK_CET_KERNEL; >> default: >> return false; > This will duplicate checks in other functions. I slightly prefer to keep this > function super simple and do all capability checks in __kvm_{set,get}_msr() > or kvm_emulate_msr_{write,read}. > >> } >> } >> >> And it would be obvious how to add new MSRs related to other XFEATURE bits. > Just return true for all those MSRs, regardless of host capabilities. If > kvm_caps doesn't support them, those MSRs are not advertised to userspace > either (see kvm_probe_msr_to_save()). Loading or putting the guest FPU when > userspace attempts to read/write those unsupported MSRs shouldn't cause any > performance issues, as userspace is unlikely to access them in hot paths. There is no problem as of now, because there are only two CET related bits set in KVM_SUPPORTED_XSS. So if !CET, the two bits are cleared thus kvm_caps.supported_xss is 0, and kvm_load_guest_fpu() is never executed in __msr_io(). However after any new bit is added to KVM_SUPPORTED_XSS in future, if !CET, kvm_caps.supported_xss could be non-zero. There should still be no problem because we don't expect any access to CET MSRs. The trouble comes with MSR_IA32_PL0_SSP when FRED and !CET, because it will be accessed even !CET. And we need to have to do the following: static bool is_xstate_managed_msr(u32 index) { switch (index) { case MSR_IA32_U_CET: case MSR_IA32_PL1_SSP ... 
MSR_IA32_PL3_SSP: return true; case MSR_IA32_PL0_SSP: return kvm_caps.supported_xss & XFEATURE_MASK_CET_USER && kvm_caps.supported_xss & XFEATURE_MASK_CET_KERNEL; default: return false; } } Then it makes more sense to handle all CET MSRs consistently. Thanks! Xin
© 2016 - 2025 Red Hat, Inc.