From: Sean Christopherson <seanjc@google.com>
Allow the use of kvm_load_host_xsave_state() with
vcpu->arch.guest_state_protected == true. This will allow TDX to reuse
kvm_load_host_xsave_state() instead of creating its own version.
For consistency, amend kvm_load_guest_xsave_state() also.
Ensure that guest state that kvm_load_host_xsave_state() depends upon,
such as MSR_IA32_XSS, cannot be changed by user space, if
guest_state_protected.
[Adrian: wrote commit message]
Link: https://lore.kernel.org/r/Z2GiQS_RmYeHU09L@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
TD vcpu enter/exit v2:
- New patch
---
arch/x86/kvm/svm/svm.c | 7 +++++--
arch/x86/kvm/x86.c | 18 +++++++++++-------
2 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7640a84e554a..b4bcfe15ad5e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4253,7 +4253,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
svm_set_dr6(svm, DR6_ACTIVE_LOW);
clgi();
- kvm_load_guest_xsave_state(vcpu);
+
+ if (!vcpu->arch.guest_state_protected)
+ kvm_load_guest_xsave_state(vcpu);
kvm_wait_lapic_expire(vcpu);
@@ -4282,7 +4284,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
- kvm_load_host_xsave_state(vcpu);
+ if (!vcpu->arch.guest_state_protected)
+ kvm_load_host_xsave_state(vcpu);
stgi();
/* Any pending NMI will happen here */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bbb6b7f40b3a..5cf9f023fd4b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1169,11 +1169,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw);
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
- if (vcpu->arch.guest_state_protected)
- return;
+ WARN_ON_ONCE(vcpu->arch.guest_state_protected);
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
if (vcpu->arch.xcr0 != kvm_host.xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
@@ -1192,13 +1190,11 @@ EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
- if (vcpu->arch.guest_state_protected)
- return;
-
if (cpu_feature_enabled(X86_FEATURE_PKU) &&
((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
- vcpu->arch.pkru = rdpkru();
+ if (!vcpu->arch.guest_state_protected)
+ vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
wrpkru(vcpu->arch.host_pkru);
}
@@ -3916,6 +3912,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
return 1;
+
+ if (vcpu->arch.guest_state_protected)
+ return 1;
+
/*
* KVM supports exposing PT to the guest, but does not support
* IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
@@ -4375,6 +4375,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
return 1;
+
+ if (vcpu->arch.guest_state_protected)
+ return 1;
+
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_K7_CLK_CTL:
--
2.43.0
On 1/29/2025 5:58 PM, Adrian Hunter wrote:
> From: Sean Christopherson <seanjc@google.com>
>
> Allow the use of kvm_load_host_xsave_state() with
> vcpu->arch.guest_state_protected == true. This will allow TDX to reuse
> kvm_load_host_xsave_state() instead of creating its own version.
>
> For consistency, amend kvm_load_guest_xsave_state() also.
>
> Ensure that guest state that kvm_load_host_xsave_state() depends upon,
> such as MSR_IA32_XSS, cannot be changed by user space, if
> guest_state_protected.
>
> [Adrian: wrote commit message]
>
> Link: https://lore.kernel.org/r/Z2GiQS_RmYeHU09L@google.com
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> ---
> TD vcpu enter/exit v2:
> - New patch
> ---
> arch/x86/kvm/svm/svm.c | 7 +++++--
> arch/x86/kvm/x86.c | 18 +++++++++++-------
> 2 files changed, 16 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 7640a84e554a..b4bcfe15ad5e 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4253,7 +4253,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
> svm_set_dr6(svm, DR6_ACTIVE_LOW);
>
> clgi();
> - kvm_load_guest_xsave_state(vcpu);
> +
> + if (!vcpu->arch.guest_state_protected)
> + kvm_load_guest_xsave_state(vcpu);
>
> kvm_wait_lapic_expire(vcpu);
>
> @@ -4282,7 +4284,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
> if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
> kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
>
> - kvm_load_host_xsave_state(vcpu);
> + if (!vcpu->arch.guest_state_protected)
> + kvm_load_host_xsave_state(vcpu);
> stgi();
>
> /* Any pending NMI will happen here */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index bbb6b7f40b3a..5cf9f023fd4b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1169,11 +1169,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw);
>
> void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
> {
> - if (vcpu->arch.guest_state_protected)
> - return;
> + WARN_ON_ONCE(vcpu->arch.guest_state_protected);
>
> if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
> -
> if (vcpu->arch.xcr0 != kvm_host.xcr0)
> xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
>
> @@ -1192,13 +1190,11 @@ EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
>
> void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
> {
> - if (vcpu->arch.guest_state_protected)
> - return;
> -
> if (cpu_feature_enabled(X86_FEATURE_PKU) &&
> ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
> kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
> - vcpu->arch.pkru = rdpkru();
> + if (!vcpu->arch.guest_state_protected)
> + vcpu->arch.pkru = rdpkru();
this needs justification.
> if (vcpu->arch.pkru != vcpu->arch.host_pkru)
> wrpkru(vcpu->arch.host_pkru);
> }
> @@ -3916,6 +3912,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> if (!msr_info->host_initiated &&
> !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
> return 1;
> +
> + if (vcpu->arch.guest_state_protected)
> + return 1;
> +
This and the change below need to be in a separate patch, so that we can
discuss them independently.

I see no reason to make MSR_IA32_XSS more special than other MSRs. When
guest_state_protected, most of the MSRs that aren't emulated by KVM are
inaccessible to KVM.
> /*
> * KVM supports exposing PT to the guest, but does not support
> * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
> @@ -4375,6 +4375,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> if (!msr_info->host_initiated &&
> !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
> return 1;
> +
> + if (vcpu->arch.guest_state_protected)
> + return 1;
> +
> msr_info->data = vcpu->arch.ia32_xss;
> break;
> case MSR_K7_CLK_CTL:
On 2/20/25 11:50, Xiaoyao Li wrote:
> On 1/29/2025 5:58 PM, Adrian Hunter wrote:
>> From: Sean Christopherson <seanjc@google.com>
>>
>> Allow the use of kvm_load_host_xsave_state() with
>> vcpu->arch.guest_state_protected == true. This will allow TDX to reuse
>> kvm_load_host_xsave_state() instead of creating its own version.
>>
>> For consistency, amend kvm_load_guest_xsave_state() also.
>>
>> Ensure that guest state that kvm_load_host_xsave_state() depends upon,
>> such as MSR_IA32_XSS, cannot be changed by user space, if
>> guest_state_protected.
>>
>> [Adrian: wrote commit message]
>>
>> Link: https://lore.kernel.org/r/Z2GiQS_RmYeHU09L@google.com
>> Signed-off-by: Sean Christopherson <seanjc@google.com>
>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>> ---
>> TD vcpu enter/exit v2:
>> - New patch
>> ---
>> arch/x86/kvm/svm/svm.c | 7 +++++--
>> arch/x86/kvm/x86.c | 18 +++++++++++-------
>> 2 files changed, 16 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index 7640a84e554a..b4bcfe15ad5e 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -4253,7 +4253,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct
>> kvm_vcpu *vcpu,
>> svm_set_dr6(svm, DR6_ACTIVE_LOW);
>> clgi();
>> - kvm_load_guest_xsave_state(vcpu);
>> +
>> + if (!vcpu->arch.guest_state_protected)
>> + kvm_load_guest_xsave_state(vcpu);
>> kvm_wait_lapic_expire(vcpu);
>> @@ -4282,7 +4284,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct
>> kvm_vcpu *vcpu,
>> if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
>> kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
>> - kvm_load_host_xsave_state(vcpu);
>> + if (!vcpu->arch.guest_state_protected)
>> + kvm_load_host_xsave_state(vcpu);
>> stgi();
>> /* Any pending NMI will happen here */
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index bbb6b7f40b3a..5cf9f023fd4b 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -1169,11 +1169,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw);
>> void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
>> {
>> - if (vcpu->arch.guest_state_protected)
>> - return;
>> + WARN_ON_ONCE(vcpu->arch.guest_state_protected);
>> if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
>> -
>> if (vcpu->arch.xcr0 != kvm_host.xcr0)
>> xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
>> @@ -1192,13 +1190,11 @@ EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
>> void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
>> {
>> - if (vcpu->arch.guest_state_protected)
>> - return;
>> -
>> if (cpu_feature_enabled(X86_FEATURE_PKU) &&
>> ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
>> kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
>> - vcpu->arch.pkru = rdpkru();
>> + if (!vcpu->arch.guest_state_protected)
>> + vcpu->arch.pkru = rdpkru();
>
> this needs justification.
>
>> if (vcpu->arch.pkru != vcpu->arch.host_pkru)
>> wrpkru(vcpu->arch.host_pkru);
>> }
>
>
>> @@ -3916,6 +3912,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu,
>> struct msr_data *msr_info)
>> if (!msr_info->host_initiated &&
>> !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
>> return 1;
>> +
>> + if (vcpu->arch.guest_state_protected)
>> + return 1;
>> +
>
> this and below change need to be a separate patch. So that we can
> discuss independently.
>
> I see no reason to make MSR_IA32_XSS special than other MSRs. When
> guest_state_protected, most of the MSRs that aren't emulated by KVM are
> inaccessible by KVM.
I agree with Xiaoyao that this change is sensible but should be proposed
separately for both SNP and TDX.
Allowing the use of kvm_load_host_xsave_state() is really ugly,
especially since the corresponding code is so simple:
    if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != 0)
            wrpkru(vcpu->arch.host_pkru);

    if (kvm_host.xcr0 != kvm_tdx->xfam & kvm_caps.supported_xcr0)
            xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);

    /*
     * All TDX hosts support XSS; but even if they didn't, both
     * arms of the comparison would be 0 and the wrmsrl would be
     * skipped.
     */
    if (kvm_host.xss != kvm_tdx->xfam & kvm_caps.supported_xss)
            wrmsrl(MSR_IA32_XSS, kvm_host.xss);
This is really all that should be in patch 7. I'll test it and decide
what to do.
Paolo
>> /*
>> * KVM supports exposing PT to the guest, but does not support
>> * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
>> @@ -4375,6 +4375,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu,
>> struct msr_data *msr_info)
>> if (!msr_info->host_initiated &&
>> !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
>> return 1;
>> +
>> + if (vcpu->arch.guest_state_protected)
>> + return 1;
>> +
>> msr_info->data = vcpu->arch.ia32_xss;
>> break;
>> case MSR_K7_CLK_CTL:
>
>
On Thu, Mar 06, 2025, Paolo Bonzini wrote:
> I agree with Xiaoyao that this change is sensible but should be proposed
> separately for both SNP and TDX.
>
> Allowing the use of kvm_load_host_xsave_state() is really ugly, especially
> since the corresponding code is so simple:
>
>     if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != 0)
>             wrpkru(vcpu->arch.host_pkru);

It's clearly not "so simple", because this code is buggy.

The justification for using kvm_load_host_xsave_state() is that either KVM gets
the TDX state model correct and the existing flows Just Work, or we handle all
that state as one-offs and at best replicate concepts and flows, and at worst
have bugs that are unique to TDX, e.g. because we get the "simple" code wrong,
we miss flows that subtly consume state, etc.
On Thu, Mar 6, 2025 at 21:44, Sean Christopherson <seanjc@google.com> wrote:
> > Allowing the use of kvm_load_host_xsave_state() is really ugly, especially
> > since the corresponding code is so simple:
> >
> >     if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != 0)
> >             wrpkru(vcpu->arch.host_pkru);
>
> It's clearly not "so simple", because this code is buggy.
>
> The justification for using kvm_load_host_xsave_state() is that either KVM gets
> the TDX state model correct and the existing flows Just Work, or we handle all
> that state as one-offs and at best replicate concepts and flows, and at worst
> have bugs that are unique to TDX, e.g. because we get the "simple" code wrong,
> we miss flows that subtly consume state, etc.

A typo doesn't change the fact that kvm_load_host_xsave_state is
optimized with knowledge of the guest CR0 and CR4; faking the values
so that the same field means both "exit value" and "guest value",
just so that the common code does the right thing for pkru/xcr0/xss,
is unmaintainable and conceptually just wrong.

And while the change for XSS (and possibly other MSRs) is actually
correct, it should be justified for both SEV-ES/SNP and TDX rather
than sneaked into the TDX patches.

While there could be other flows that consume guest state, they're
just as bound to do the wrong thing if vcpu->arch is only guaranteed
to be somehow plausible (think anything that for whatever reason uses
cpu_role). There's no way the existing flows for !guest_state_protected
should run _at all_ when the register state is not there. If they do,
it's a bug and fixing them is the right thing to do (it may feel like
whack-a-mole but isn't).

See for example the KVM_CAP_SYNC_REGS calls that Adrian mentioned in
the reply to 05/12, and for which I'll send a patch too: I haven't
checked if it's possible, but I wonder if userspace could misuse
sync_regs() to change guest xcr0 and xss, and trick the TDX *host*
into running with some bits of xcr0 and xss unexpectedly cleared.

TDX provides well known values for the host for all three registers
that are being restored here, so there's no need to reuse code that
is meant to do something completely different. I'm not singling out
TDX, the same would be true for SEV-ES/SNP swap type C.

Paolo
On Thu, Mar 06, 2025, Paolo Bonzini wrote:
> Il gio 6 mar 2025, 21:44 Sean Christopherson <seanjc@google.com> ha scritto:
> > > Allowing the use of kvm_load_host_xsave_state() is really ugly, especially
> > > since the corresponding code is so simple:
> > >
> > > if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != 0)
> > > wrpkru(vcpu->arch.host_pkru);
> >
> > It's clearly not "so simple", because this code is buggy.
> >
> > The justification for using kvm_load_host_xsave_state() is that either KVM gets
> > the TDX state model correct and the existing flows Just Work, or we handle all
> > that state as one-offs and at best replicate concepts and flows, and at worst
> > have bugs that are unique to TDX, e.g. because we get the "simple" code wrong,
> > we miss flows that subtly consume state, etc.
>
> A typo doesn't change the fact that kvm_load_host_xsave_state is
> optimized with knowledge of the guest CR0 and CR4; faking the values
> so that the same field means both "exit value" and "guest value",
I can't argue against that, but I still absolutely detest carrying dedicated code
for SEV and TDX state management. It's bad enough that figuring out WTF actually
happens basically requires encyclopedic knowledge of massive specs.
I tried to figure out a way to share code, but everything I can come up with that
doesn't fake vCPU state makes the non-TDX code a mess. :-(
> just so that the common code does the right thing for pkru/xcr0/xss,
FWIW, it's not just so that KVM does the right thing for those values, it's a
defense in depth mechanism so that *when*, not if, KVM screws up, the odds of the
bug being fatal to KVM and/or the guest are reduced.
> is unmaintainable and conceptually just wrong.
I don't necessarily disagree, but what we have today isn't maintainable either.
Without actual sanity checks and safeguards in the low level helpers, we absolutely
are playing a game of whack-a-mole.
E.g. see commit 9b42d1e8e4fe ("KVM: x86: Play nice with protected guests in
complete_hypercall_exit()").
At a glance, kvm_hv_hypercall() is still broken, because is_protmode() will return
false incorrectly.
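(For reference, is_protmode() is roughly the one-liner below from
arch/x86/kvm/x86.h; it only consults KVM's tracked CR0.PE, which for a
protected guest never reflects the real value, hence the bogus "not
protected mode" result.)

    static inline bool is_protmode(struct kvm_vcpu *vcpu)
    {
            /* Reads only KVM's tracked CR0, not the real (protected) guest CR0. */
            return kvm_is_cr0_bit_set(vcpu, X86_CR0_PE);
    }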
> And while the change for XSS (and possibly other MSRs) is actually correct,
> it should be justified for both SEV-ES/SNP and TDX rather than sneaked into
> the TDX patches.
>
> While there could be other flows that consume guest state, they're
> just as bound to do the wrong thing if vcpu->arch is only guaranteed
> to be somehow plausible (think anything that for whatever reason uses
> cpu_role).
But the MMU code is *already* broken. kvm_init_mmu() => vcpu_to_role_regs(). It
"works" because the fubar role is never truly consumed. I'm sure there are more
examples.
> There's no way the existing flows for !guest_state_protected should run _at
> all_ when the register state is not there. If they do, it's a bug and fixing
> them is the right thing to do (it may feel like whack-a-mole but isn't)
Eh, it's still whack-a-mole, there just happen to be a finite number of moles :-)
On Sat, Mar 8, 2025 at 12:04 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Mar 06, 2025, Paolo Bonzini wrote:
> I still absolutely detest carrying dedicated code
> for SEV and TDX state management. It's bad enough that figuring out WTF actually
> happens basically requires encyclopedic knowledge of massive specs.
>
> I tried to figure out a way to share code, but everything I can come up with that
> doesn't fake vCPU state makes the non-TDX code a mess. :-(
The only thing worse is requiring encyclopedic knowledge of both the
specs and KVM. :)  And yeah, we do require some knowledge of parts of
KVM that *shouldn't* matter for protected-state guests, but it
shouldn't be worse than needed.

There's different microcode/firmware for VMX/SVM/SEV-ES+/TDX, so the
chance of sharing code gets lower and lower as more stuff is added
there---as is the case for SEV-ES/SNP and TDX. Which is why state
management code for TDX is doing its own thing most of the time
anyway---there's no point in sharing a little bit which is not even
the hardest part.
> > just so that the common code does the right thing for pkru/xcr0/xss,
>
> FWIW, it's not just to that KVM does the right thing for those values, it's a
> defense in depth mechanism so that *when*, not if, KVM screws up, the odds of the
> bug being fatal to KVM and/or the guest are reduced.
I would say the other way round is true too. Not relying too much on
fake values in vcpu->arch can be more robust.
> Without actual sanity check and safeguards in the low level helpers, we absolutely
> are playing a game of whack-a-mole.
>
> E.g. see commit 9b42d1e8e4fe ("KVM: x86: Play nice with protected guests in
> complete_hypercall_exit()").
>
> At a glance, kvm_hv_hypercall() is still broken, because is_protmode() will return
> false incorrectly.
So the fixes are needed anyway and we're playing the game anyway. :(
> > And while the change for XSS (and possibly other MSRs) is actually correct,
> > it should be justified for both SEV-ES/SNP and TDX rather than sneaked into
> > the TDX patches.
> >
> > While there could be other flows that consume guest state, they're
> > just as bound to do the wrong thing if vcpu->arch is only guaranteed
> > to be somehow plausible (think anything that for whatever reason uses
> > cpu_role).
>
> But the MMU code is *already* broken. kvm_init_mmu() => vcpu_to_role_regs(). It
> "works" because the fubar role is never truly consumed. I'm sure there are more
> examples.
Yes, and there should be at least a WARN_ON_ONCE when it is accessed,
even if we don't completely cull the initialization of cpu_role...
Loading the XSAVE state isn't any different.
I'm okay with placing some values in cr0/cr4 or even xcr0/xss, but do
not wish to use them more than the absolute minimum necessary. And I
would rather not set more than the bare minimum needed in CR4... why
set CR4.PKE for example, if KVM anyway has no business using the guest
PKRU.
Paolo
> > There's no way the existing flows for !guest_state_protected should run _at
> > all_ when the register state is not there. If they do, it's a bug and fixing
> > them is the right thing to do (it may feel like whack-a-mole but isn't)
>
> Eh, it's still whack-a-mole, there just happen to be a finite number of moles :-)
On 20/02/25 12:50, Xiaoyao Li wrote:
> On 1/29/2025 5:58 PM, Adrian Hunter wrote:
>> From: Sean Christopherson <seanjc@google.com>
>>
>> Allow the use of kvm_load_host_xsave_state() with
>> vcpu->arch.guest_state_protected == true. This will allow TDX to reuse
>> kvm_load_host_xsave_state() instead of creating its own version.
>>
>> For consistency, amend kvm_load_guest_xsave_state() also.
>>
>> Ensure that guest state that kvm_load_host_xsave_state() depends upon,
>> such as MSR_IA32_XSS, cannot be changed by user space, if
>> guest_state_protected.
>>
>> [Adrian: wrote commit message]
>>
>> Link: https://lore.kernel.org/r/Z2GiQS_RmYeHU09L@google.com
>> Signed-off-by: Sean Christopherson <seanjc@google.com>
>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>> ---
>> TD vcpu enter/exit v2:
>> - New patch
>> ---
>> arch/x86/kvm/svm/svm.c | 7 +++++--
>> arch/x86/kvm/x86.c | 18 +++++++++++-------
>> 2 files changed, 16 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index 7640a84e554a..b4bcfe15ad5e 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -4253,7 +4253,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
>> svm_set_dr6(svm, DR6_ACTIVE_LOW);
>> clgi();
>> - kvm_load_guest_xsave_state(vcpu);
>> +
>> + if (!vcpu->arch.guest_state_protected)
>> + kvm_load_guest_xsave_state(vcpu);
>> kvm_wait_lapic_expire(vcpu);
>> @@ -4282,7 +4284,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
>> if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
>> kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
>> - kvm_load_host_xsave_state(vcpu);
>> + if (!vcpu->arch.guest_state_protected)
>> + kvm_load_host_xsave_state(vcpu);
>> stgi();
>> /* Any pending NMI will happen here */
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index bbb6b7f40b3a..5cf9f023fd4b 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -1169,11 +1169,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw);
>> void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
>> {
>> - if (vcpu->arch.guest_state_protected)
>> - return;
>> + WARN_ON_ONCE(vcpu->arch.guest_state_protected);
>> if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
>> -
>> if (vcpu->arch.xcr0 != kvm_host.xcr0)
>> xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
>> @@ -1192,13 +1190,11 @@ EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
>> void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
>> {
>> - if (vcpu->arch.guest_state_protected)
>> - return;
>> -
>> if (cpu_feature_enabled(X86_FEATURE_PKU) &&
>> ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
>> kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
>> - vcpu->arch.pkru = rdpkru();
>> + if (!vcpu->arch.guest_state_protected)
>> + vcpu->arch.pkru = rdpkru();
>
> this needs justification.
It was proposed by Sean here:
https://lore.kernel.org/all/Z2WZ091z8GmGjSbC@google.com/
which is part of the email thread referenced by the "Link:" tag above
>
>> if (vcpu->arch.pkru != vcpu->arch.host_pkru)
>> wrpkru(vcpu->arch.host_pkru);
>> }
>
>
>> @@ -3916,6 +3912,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> if (!msr_info->host_initiated &&
>> !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
>> return 1;
>> +
>> + if (vcpu->arch.guest_state_protected)
>> + return 1;
>> +
>
> this and below change need to be a separate patch. So that we can discuss independently.
>
> I see no reason to make MSR_IA32_XSS special than other MSRs. When guest_state_protected, most of the MSRs that aren't emulated by KVM are inaccessible by KVM.
Yes, TDX will block access to MSR_IA32_XSS anyway because
tdx_has_emulated_msr() will return false for MSR_IA32_XSS.
However kvm_load_host_xsave_state() is not TDX-specific code and it
relies upon vcpu->arch.ia32_xss, so there is reason to block
access to it when vcpu->arch.guest_state_protected is true.
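To make that dependency concrete, the tail end of
kvm_load_host_xsave_state() is roughly the snippet below (simplified
from arch/x86/kvm/x86.c): it compares the tracked guest
vcpu->arch.ia32_xss against the host value, so letting user space write
the MSR would feed a bogus guest value into the host-restore path.

    if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
            if (vcpu->arch.xcr0 != kvm_host.xcr0)
                    xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);

            /* Only rewrite XSS when the tracked guest value differs from the host's. */
            if (vcpu->arch.ia32_xss != kvm_host.xss)
                    wrmsrl(MSR_IA32_XSS, kvm_host.xss);
    }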
>
>> /*
>> * KVM supports exposing PT to the guest, but does not support
>> * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
>> @@ -4375,6 +4375,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> if (!msr_info->host_initiated &&
>> !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
>> return 1;
>> +
>> + if (vcpu->arch.guest_state_protected)
>> + return 1;
>> +
>> msr_info->data = vcpu->arch.ia32_xss;
>> break;
>> case MSR_K7_CLK_CTL:
>
On 2/24/2025 7:38 PM, Adrian Hunter wrote:
> On 20/02/25 12:50, Xiaoyao Li wrote:
>> On 1/29/2025 5:58 PM, Adrian Hunter wrote:
>>> From: Sean Christopherson <seanjc@google.com>
>>>
>>> Allow the use of kvm_load_host_xsave_state() with
>>> vcpu->arch.guest_state_protected == true. This will allow TDX to reuse
>>> kvm_load_host_xsave_state() instead of creating its own version.
>>>
>>> For consistency, amend kvm_load_guest_xsave_state() also.
>>>
>>> Ensure that guest state that kvm_load_host_xsave_state() depends upon,
>>> such as MSR_IA32_XSS, cannot be changed by user space, if
>>> guest_state_protected.
>>>
>>> [Adrian: wrote commit message]
>>>
>>> Link: https://lore.kernel.org/r/Z2GiQS_RmYeHU09L@google.com
>>> Signed-off-by: Sean Christopherson <seanjc@google.com>
>>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>>> ---
>>> TD vcpu enter/exit v2:
>>> - New patch
>>> ---
>>> arch/x86/kvm/svm/svm.c | 7 +++++--
>>> arch/x86/kvm/x86.c | 18 +++++++++++-------
>>> 2 files changed, 16 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>>> index 7640a84e554a..b4bcfe15ad5e 100644
>>> --- a/arch/x86/kvm/svm/svm.c
>>> +++ b/arch/x86/kvm/svm/svm.c
>>> @@ -4253,7 +4253,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
>>> svm_set_dr6(svm, DR6_ACTIVE_LOW);
>>> clgi();
>>> - kvm_load_guest_xsave_state(vcpu);
>>> +
>>> + if (!vcpu->arch.guest_state_protected)
>>> + kvm_load_guest_xsave_state(vcpu);
>>> kvm_wait_lapic_expire(vcpu);
>>> @@ -4282,7 +4284,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
>>> if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
>>> kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
>>> - kvm_load_host_xsave_state(vcpu);
>>> + if (!vcpu->arch.guest_state_protected)
>>> + kvm_load_host_xsave_state(vcpu);
>>> stgi();
>>> /* Any pending NMI will happen here */
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index bbb6b7f40b3a..5cf9f023fd4b 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -1169,11 +1169,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw);
>>> void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
>>> {
>>> - if (vcpu->arch.guest_state_protected)
>>> - return;
>>> + WARN_ON_ONCE(vcpu->arch.guest_state_protected);
>>> if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
>>> -
>>> if (vcpu->arch.xcr0 != kvm_host.xcr0)
>>> xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
>>> @@ -1192,13 +1190,11 @@ EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
>>> void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
>>> {
>>> - if (vcpu->arch.guest_state_protected)
>>> - return;
>>> -
>>> if (cpu_feature_enabled(X86_FEATURE_PKU) &&
>>> ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
>>> kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
>>> - vcpu->arch.pkru = rdpkru();
>>> + if (!vcpu->arch.guest_state_protected)
>>> + vcpu->arch.pkru = rdpkru();
>>
>> this needs justification.
>
> It was proposed by Sean here:
>
> https://lore.kernel.org/all/Z2WZ091z8GmGjSbC@google.com/
>
> which is part of the email thread referenced by the "Link:" tag above
IMHO, this change needs to be put in patch 07, which is the better place
to justify it.
>>
>>> if (vcpu->arch.pkru != vcpu->arch.host_pkru)
>>> wrpkru(vcpu->arch.host_pkru);
>>> }
>>
>>
>>> @@ -3916,6 +3912,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>> if (!msr_info->host_initiated &&
>>> !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
>>> return 1;
>>> +
>>> + if (vcpu->arch.guest_state_protected)
>>> + return 1;
>>> +
>>
>> this and below change need to be a separate patch. So that we can discuss independently.
>>
>> I see no reason to make MSR_IA32_XSS special than other MSRs. When guest_state_protected, most of the MSRs that aren't emulated by KVM are inaccessible by KVM.
>
> Yes, TDX will block access to MSR_IA32_XSS anyway because
> tdx_has_emulated_msr() will return false for MSR_IA32_XSS.
>
> However kvm_load_host_xsave_state() is not TDX-specific code and it
> relies upon vcpu->arch.ia32_xss, so there is reason to block
> access to it when vcpu->arch.guest_state_protected is true.
It is TDX-specific logic that requires vcpu->arch.ia32_xss to remain
unchanged, since TDX is going to use kvm_load_host_xsave_state() to
restore the host xsave state and relies on vcpu->arch.ia32_xss always
being the value of XFAM & XSS_MASK.
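(Roughly, that invariant amounts to something like the lines below at
vCPU creation, reusing the names already quoted earlier in this thread;
the exact placement in the TDX series is not shown here, so treat this
as an illustrative sketch only.)

    /* Hypothetical vCPU-creation step: derive the fixed guest view from XFAM. */
    vcpu->arch.xcr0     = kvm_tdx->xfam & kvm_caps.supported_xcr0;
    vcpu->arch.ia32_xss = kvm_tdx->xfam & kvm_caps.supported_xss;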
So please put this change into the TDX-specific patch with a clear
justification.
>>
>>> /*
>>> * KVM supports exposing PT to the guest, but does not support
>>> * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
>>> @@ -4375,6 +4375,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>> if (!msr_info->host_initiated &&
>>> !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
>>> return 1;
>>> +
>>> + if (vcpu->arch.guest_state_protected)
>>> + return 1;
>>> +
>>> msr_info->data = vcpu->arch.ia32_xss;
>>> break;
>>> case MSR_K7_CLK_CTL:
>>
>
On 25/02/25 07:56, Xiaoyao Li wrote:
> On 2/24/2025 7:38 PM, Adrian Hunter wrote:
>> On 20/02/25 12:50, Xiaoyao Li wrote:
>>> On 1/29/2025 5:58 PM, Adrian Hunter wrote:
>>>> From: Sean Christopherson <seanjc@google.com>
>>>>
>>>> Allow the use of kvm_load_host_xsave_state() with
>>>> vcpu->arch.guest_state_protected == true. This will allow TDX to reuse
>>>> kvm_load_host_xsave_state() instead of creating its own version.
>>>>
>>>> For consistency, amend kvm_load_guest_xsave_state() also.
>>>>
>>>> Ensure that guest state that kvm_load_host_xsave_state() depends upon,
>>>> such as MSR_IA32_XSS, cannot be changed by user space, if
>>>> guest_state_protected.
>>>>
>>>> [Adrian: wrote commit message]
>>>>
>>>> Link: https://lore.kernel.org/r/Z2GiQS_RmYeHU09L@google.com
>>>> Signed-off-by: Sean Christopherson <seanjc@google.com>
>>>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>>>> ---
>>>> TD vcpu enter/exit v2:
>>>> - New patch
>>>> ---
>>>> arch/x86/kvm/svm/svm.c | 7 +++++--
>>>> arch/x86/kvm/x86.c | 18 +++++++++++-------
>>>> 2 files changed, 16 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>>>> index 7640a84e554a..b4bcfe15ad5e 100644
>>>> --- a/arch/x86/kvm/svm/svm.c
>>>> +++ b/arch/x86/kvm/svm/svm.c
>>>> @@ -4253,7 +4253,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
>>>> svm_set_dr6(svm, DR6_ACTIVE_LOW);
>>>> clgi();
>>>> - kvm_load_guest_xsave_state(vcpu);
>>>> +
>>>> + if (!vcpu->arch.guest_state_protected)
>>>> + kvm_load_guest_xsave_state(vcpu);
>>>> kvm_wait_lapic_expire(vcpu);
>>>> @@ -4282,7 +4284,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
>>>> if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
>>>> kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
>>>> - kvm_load_host_xsave_state(vcpu);
>>>> + if (!vcpu->arch.guest_state_protected)
>>>> + kvm_load_host_xsave_state(vcpu);
>>>> stgi();
>>>> /* Any pending NMI will happen here */
>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>> index bbb6b7f40b3a..5cf9f023fd4b 100644
>>>> --- a/arch/x86/kvm/x86.c
>>>> +++ b/arch/x86/kvm/x86.c
>>>> @@ -1169,11 +1169,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw);
>>>> void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
>>>> {
>>>> - if (vcpu->arch.guest_state_protected)
>>>> - return;
>>>> + WARN_ON_ONCE(vcpu->arch.guest_state_protected);
>>>> if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
>>>> -
>>>> if (vcpu->arch.xcr0 != kvm_host.xcr0)
>>>> xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
>>>> @@ -1192,13 +1190,11 @@ EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
>>>> void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
>>>> {
>>>> - if (vcpu->arch.guest_state_protected)
>>>> - return;
>>>> -
>>>> if (cpu_feature_enabled(X86_FEATURE_PKU) &&
>>>> ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
>>>> kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
>>>> - vcpu->arch.pkru = rdpkru();
>>>> + if (!vcpu->arch.guest_state_protected)
>>>> + vcpu->arch.pkru = rdpkru();
>>>
>>> this needs justification.
>>
>> It was proposed by Sean here:
>>
>> https://lore.kernel.org/all/Z2WZ091z8GmGjSbC@google.com/
>>
>> which is part of the email thread referenced by the "Link:" tag above
>
> IMHO, this change needs to be put in patch 07, which is the better place to justify it.
>
>>>
>>>> if (vcpu->arch.pkru != vcpu->arch.host_pkru)
>>>> wrpkru(vcpu->arch.host_pkru);
>>>> }
>>>
>>>
>>>> @@ -3916,6 +3912,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>>> if (!msr_info->host_initiated &&
>>>> !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
>>>> return 1;
>>>> +
>>>> + if (vcpu->arch.guest_state_protected)
>>>> + return 1;
>>>> +
>>>
>>> this and below change need to be a separate patch. So that we can discuss independently.
>>>
>>> I see no reason to make MSR_IA32_XSS special than other MSRs. When guest_state_protected, most of the MSRs that aren't emulated by KVM are inaccessible by KVM.
>>
>> Yes, TDX will block access to MSR_IA32_XSS anyway because
>> tdx_has_emulated_msr() will return false for MSR_IA32_XSS.
>>
>> However kvm_load_host_xsave_state() is not TDX-specific code and it
>> relies upon vcpu->arch.ia32_xss, so there is reason to block
>> access to it when vcpu->arch.guest_state_protected is true.
>
> It is TDX specific logic that TDX requires vcpu->arch.ia32_xss unchanged since TDX is going to utilize kvm_load_host_xsave_state() to restore host xsave state and relies on vcpu->arch.ia32_xss to be always the value of XFAM & XSS_MASK.
>
> So please put this change into the TDX specific patch with the clear justification.
This patch set is owned by Paolo now, so it is up to him.
>>>
>>>> /*
>>>> * KVM supports exposing PT to the guest, but does not support
>>>> * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
>>>> @@ -4375,6 +4375,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>>> if (!msr_info->host_initiated &&
>>>> !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
>>>> return 1;
>>>> +
>>>> + if (vcpu->arch.guest_state_protected)
>>>> + return 1;
>>>> +
>>>> msr_info->data = vcpu->arch.ia32_xss;
>>>> break;
>>>> case MSR_K7_CLK_CTL:
>>>
>>
>