Set all user-return MSRs to their post-TD-exit value when preparing to run
a TDX vCPU to ensure the value that KVM expects to be loaded after running
the vCPU is indeed the value that's loaded in hardware. If the TDX-Module
doesn't actually enter the guest, i.e. doesn't do VM-Enter, then it won't
"restore" VMM state, i.e. won't clobber user-return MSRs to their expected
post-run values, in which case simply updating KVM's "cached" value will
effectively corrupt the cache due to hardware still holding the original
value.

In theory, KVM could conditionally update the current user-return value if
and only if tdh_vp_enter() succeeds, but in practice "success" doesn't
guarantee the TDX-Module actually entered the guest, e.g. if the TDX-Module
synthesizes an EPT Violation because it suspects a zero-step attack.

Force-load the expected values instead of trying to decipher whether or
not the TDX-Module restored/clobbered MSRs, as the risk doesn't justify
the benefits. Effectively avoiding four WRMSRs once per run loop (even if
the vCPU is scheduled out, user-return MSRs only need to be reloaded if
the CPU exits to userspace or runs a non-TDX vCPU) is likely in the noise
when amortized over all entries, given the cost of running a TDX vCPU.
E.g. the cost of the WRMSRs is somewhere between ~300 and ~500 cycles,
whereas the cost of a _single_ roundtrip to/from a TDX guest is thousands
of cycles.
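
For reference, the reason a stale cache is harmful is that the common user-return
MSR helper elides the WRMSR when the requested value matches the cached value, so
a bogus cache update can suppress a write that is actually needed. Roughly (a
simplified sketch paraphrased from kvm_set_user_return_msr() in x86.c; error
handling and per-CPU plumbing trimmed):

  int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
  {
          struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);

          value = (value & mask) | (msrs->values[slot].host & ~mask);

          /* Skip the WRMSR if the cache says hardware already holds 'value'. */
          if (value == msrs->values[slot].curr)
                  return 0;

          if (wrmsrl_safe(kvm_uret_msrs_list[slot], value))
                  return 1;

          msrs->values[slot].curr = value;
          kvm_user_return_register_notifier(msrs);
          return 0;
  }
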
Fixes: e0b4f31a3c65 ("KVM: TDX: restore user ret MSRs")
Cc: stable@vger.kernel.org
Cc: Yan Zhao <yan.y.zhao@intel.com>
Cc: Xiaoyao Li <xiaoyao.li@intel.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm_host.h | 1 -
arch/x86/kvm/vmx/tdx.c | 52 +++++++++++++++------------------
arch/x86/kvm/vmx/tdx.h | 1 -
arch/x86/kvm/x86.c | 9 ------
4 files changed, 23 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 48598d017d6f..d158dfd1842e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2378,7 +2378,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
int kvm_add_user_return_msr(u32 msr);
int kvm_find_user_return_msr(u32 msr);
int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
-void kvm_user_return_msr_update_cache(unsigned int index, u64 val);
u64 kvm_get_user_return_msr(unsigned int slot);
static inline bool kvm_is_supported_user_return_msr(u32 msr)
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 326db9b9c567..cde91a995076 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -763,25 +763,6 @@ static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
}
-/*
- * Compared to vmx_prepare_switch_to_guest(), there is not much to do
- * as SEAMCALL/SEAMRET calls take care of most of save and restore.
- */
-void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vt *vt = to_vt(vcpu);
-
- if (vt->guest_state_loaded)
- return;
-
- if (likely(is_64bit_mm(current->mm)))
- vt->msr_host_kernel_gs_base = current->thread.gsbase;
- else
- vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
-
- vt->guest_state_loaded = true;
-}
-
struct tdx_uret_msr {
u32 msr;
unsigned int slot;
@@ -795,19 +776,38 @@ static struct tdx_uret_msr tdx_uret_msrs[] = {
{.msr = MSR_TSC_AUX,},
};
-static void tdx_user_return_msr_update_cache(void)
+void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vt *vt = to_vt(vcpu);
int i;
+ if (vt->guest_state_loaded)
+ return;
+
+ if (likely(is_64bit_mm(current->mm)))
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
+ else
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+
+ vt->guest_state_loaded = true;
+
+ /*
+ * Explicitly set user-return MSRs that are clobbered by the TDX-Module
+ * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
+ * written by the TDX-Module. Don't rely on the TDX-Module to actually
+ * clobber the MSRs, as the contract is poorly defined and not upheld.
+ * E.g. the TDX-Module will synthesize an EPT Violation without doing
+ * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
+ * state.
+ */
for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
- kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
- tdx_uret_msrs[i].defval);
+ kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
+ tdx_uret_msrs[i].defval, -1ull);
}
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
{
struct vcpu_vt *vt = to_vt(vcpu);
- struct vcpu_tdx *tdx = to_tdx(vcpu);
if (!vt->guest_state_loaded)
return;
@@ -815,11 +815,6 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
++vcpu->stat.host_state_reload;
wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
- if (tdx->guest_entered) {
- tdx_user_return_msr_update_cache();
- tdx->guest_entered = false;
- }
-
vt->guest_state_loaded = false;
}
@@ -1059,7 +1054,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
update_debugctlmsr(vcpu->arch.host_debugctl);
tdx_load_host_xsave_state(vcpu);
- tdx->guest_entered = true;
vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index ca39a9391db1..7f258870dc41 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -67,7 +67,6 @@ struct vcpu_tdx {
u64 vp_enter_ret;
enum vcpu_tdx_state state;
- bool guest_entered;
u64 map_gpa_next;
u64 map_gpa_end;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4b5d2d09634..639589af7cbe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -681,15 +681,6 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
-void kvm_user_return_msr_update_cache(unsigned int slot, u64 value)
-{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
-
- msrs->values[slot].curr = value;
- kvm_user_return_register_notifier(msrs);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache);
-
u64 kvm_get_user_return_msr(unsigned int slot)
{
return this_cpu_ptr(user_return_msrs)->values[slot].curr;
--
2.51.1.930.gacf6e81ea2-goog
On 10/31/2025 3:15 AM, Sean Christopherson wrote:
> Set all user-return MSRs to their post-TD-exit value when preparing to run
> a TDX vCPU to ensure the value that KVM expects to be loaded after running
> the vCPU is indeed the value that's loaded in hardware. If the TDX-Module
> doesn't actually enter the guest, i.e. doesn't do VM-Enter, then it won't
> "restore" VMM state, i.e. won't clobber user-return MSRs to their expected
> post-run values, in which case simply updating KVM's "cached" value will
> effectively corrupt the cache due to hardware still holding the original
> value.
>
> In theory, KVM could conditionally update the current user-return value if
> and only if tdh_vp_enter() succeeds, but in practice "success" doesn't
> guarantee the TDX-Module actually entered the guest, e.g. if the TDX-Module
> synthesizes an EPT Violation because it suspects a zero-step attack.
>
> Force-load the expected values instead of trying to decipher whether or
> not the TDX-Module restored/clobbered MSRs, as the risk doesn't justify
> the benefits. Effectively avoiding four WRMSRs once per run loop (even if
> the vCPU is scheduled out, user-return MSRs only need to be reloaded if
> the CPU exits to userspace or runs a non-TDX vCPU) is likely in the noise
> when amortized over all entries, given the cost of running a TDX vCPU.
> E.g. the cost of the WRMSRs is somewhere between ~300 and ~500 cycles,
> whereas the cost of a _single_ roundtrip to/from a TDX guest is thousands
> of cycles.
>
> Fixes: e0b4f31a3c65 ("KVM: TDX: restore user ret MSRs")
> Cc: stable@vger.kernel.org
> Cc: Yan Zhao <yan.y.zhao@intel.com>
> Cc: Xiaoyao Li <xiaoyao.li@intel.com>
> Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
> ---
> arch/x86/include/asm/kvm_host.h | 1 -
> arch/x86/kvm/vmx/tdx.c | 52 +++++++++++++++------------------
> arch/x86/kvm/vmx/tdx.h | 1 -
> arch/x86/kvm/x86.c | 9 ------
> 4 files changed, 23 insertions(+), 40 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 48598d017d6f..d158dfd1842e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -2378,7 +2378,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
> int kvm_add_user_return_msr(u32 msr);
> int kvm_find_user_return_msr(u32 msr);
> int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
> -void kvm_user_return_msr_update_cache(unsigned int index, u64 val);
> u64 kvm_get_user_return_msr(unsigned int slot);
>
> static inline bool kvm_is_supported_user_return_msr(u32 msr)
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 326db9b9c567..cde91a995076 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -763,25 +763,6 @@ static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
> return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
> }
>
> -/*
> - * Compared to vmx_prepare_switch_to_guest(), there is not much to do
> - * as SEAMCALL/SEAMRET calls take care of most of save and restore.
> - */
> -void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
> -{
> - struct vcpu_vt *vt = to_vt(vcpu);
> -
> - if (vt->guest_state_loaded)
> - return;
> -
> - if (likely(is_64bit_mm(current->mm)))
> - vt->msr_host_kernel_gs_base = current->thread.gsbase;
> - else
> - vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
> -
> - vt->guest_state_loaded = true;
> -}
> -
> struct tdx_uret_msr {
> u32 msr;
> unsigned int slot;
> @@ -795,19 +776,38 @@ static struct tdx_uret_msr tdx_uret_msrs[] = {
> {.msr = MSR_TSC_AUX,},
> };
>
> -static void tdx_user_return_msr_update_cache(void)
> +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
> {
> + struct vcpu_vt *vt = to_vt(vcpu);
> int i;
>
> + if (vt->guest_state_loaded)
> + return;
> +
> + if (likely(is_64bit_mm(current->mm)))
> + vt->msr_host_kernel_gs_base = current->thread.gsbase;
> + else
> + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
> +
> + vt->guest_state_loaded = true;
> +
> + /*
> + * Explicitly set user-return MSRs that are clobbered by the TDX-Module
> + * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
> + * written by the TDX-Module. Don't rely on the TDX-Module to actually
> + * clobber the MSRs, as the contract is poorly defined and not upheld.
> + * E.g. the TDX-Module will synthesize an EPT Violation without doing
> + * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
> + * state.
> + */
> for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
> - kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
> - tdx_uret_msrs[i].defval);
> + kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
> + tdx_uret_msrs[i].defval, -1ull);
> }
>
> static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
> {
> struct vcpu_vt *vt = to_vt(vcpu);
> - struct vcpu_tdx *tdx = to_tdx(vcpu);
>
> if (!vt->guest_state_loaded)
> return;
> @@ -815,11 +815,6 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
> ++vcpu->stat.host_state_reload;
> wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
>
> - if (tdx->guest_entered) {
> - tdx_user_return_msr_update_cache();
> - tdx->guest_entered = false;
> - }
> -
> vt->guest_state_loaded = false;
> }
>
> @@ -1059,7 +1054,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
> update_debugctlmsr(vcpu->arch.host_debugctl);
>
> tdx_load_host_xsave_state(vcpu);
> - tdx->guest_entered = true;
>
> vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
>
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index ca39a9391db1..7f258870dc41 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -67,7 +67,6 @@ struct vcpu_tdx {
> u64 vp_enter_ret;
>
> enum vcpu_tdx_state state;
> - bool guest_entered;
>
> u64 map_gpa_next;
> u64 map_gpa_end;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b4b5d2d09634..639589af7cbe 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -681,15 +681,6 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
> }
> EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
>
> -void kvm_user_return_msr_update_cache(unsigned int slot, u64 value)
> -{
> - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
> -
> - msrs->values[slot].curr = value;
> - kvm_user_return_register_notifier(msrs);
> -}
> -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache);
> -
> u64 kvm_get_user_return_msr(unsigned int slot)
> {
> return this_cpu_ptr(user_return_msrs)->values[slot].curr;
On Thu, Oct 30, 2025 at 12:15:25PM -0700, Sean Christopherson wrote:
> Set all user-return MSRs to their post-TD-exit value when preparing to run
> a TDX vCPU to ensure the value that KVM expects to be loaded after running
> the vCPU is indeed the value that's loaded in hardware. If the TDX-Module
> doesn't actually enter the guest, i.e. doesn't do VM-Enter, then it won't
> "restore" VMM state, i.e. won't clobber user-return MSRs to their expected
> post-run values, in which case simply updating KVM's "cached" value will
> effectively corrupt the cache due to hardware still holding the original
> value.
This paragraph is confusing.

The flow for the TDX module for the user-return MSRs is:

1. Before entering guest, i.e., inside tdh_vp_enter(),
   a) if VM-Enter is guaranteed to succeed, load MSRs with saved guest value;
   b) otherwise, do nothing and return to VMM.

2. After VMExit, before returning to VMM,
   save guest value and restore MSRs to default values.

Failure of tdh_vp_enter() (i.e., in case of 1.b), the hardware values of the
MSRs should be either host value or default value, while with
msrs->values[slot].curr being default value.

As a result, the reasoning of "hardware still holding the original value" is not
convincing, since the original value is exactly the host value.
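
For context, the exit-to-userspace side that consumes .curr looks roughly like the
below (a simplified sketch paraphrased from kvm_on_user_return() in x86.c; notifier
unregistration and irq handling omitted), i.e. whether a WRMSR happens there is
driven entirely by .curr, not by what the hardware actually holds:

  static void kvm_on_user_return(struct user_return_notifier *urn)
  {
          struct kvm_user_return_msrs *msrs =
                  container_of(urn, struct kvm_user_return_msrs, urn);
          unsigned int slot;

          for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
                  /* Only rewrite MSRs whose cached value differs from the host value. */
                  if (msrs->values[slot].curr != msrs->values[slot].host) {
                          wrmsrl(kvm_uret_msrs_list[slot], msrs->values[slot].host);
                          msrs->values[slot].curr = msrs->values[slot].host;
                  }
          }
  }
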
> In theory, KVM could conditionally update the current user-return value if
> and only if tdh_vp_enter() succeeds, but in practice "success" doesn't
> guarantee the TDX-Module actually entered the guest, e.g. if the TDX-Module
> synthesizes an EPT Violation because it suspects a zero-step attack.
>
> Force-load the expected values instead of trying to decipher whether or
> not the TDX-Module restored/clobbered MSRs, as the risk doesn't justify
> the benefits. Effectively avoiding four WRMSRs once per run loop (even if
> the vCPU is scheduled out, user-return MSRs only need to be reloaded if
> the CPU exits to userspace or runs a non-TDX vCPU) is likely in the noise
> when amortized over all entries, given the cost of running a TDX vCPU.
> E.g. the cost of the WRMSRs is somewhere between ~300 and ~500 cycles,
> whereas the cost of a _single_ roundtrip to/from a TDX guest is thousands
> of cycles.
>
> Fixes: e0b4f31a3c65 ("KVM: TDX: restore user ret MSRs")
> Cc: stable@vger.kernel.org
> Cc: Yan Zhao <yan.y.zhao@intel.com>
> Cc: Xiaoyao Li <xiaoyao.li@intel.com>
> Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> arch/x86/include/asm/kvm_host.h | 1 -
> arch/x86/kvm/vmx/tdx.c | 52 +++++++++++++++------------------
> arch/x86/kvm/vmx/tdx.h | 1 -
> arch/x86/kvm/x86.c | 9 ------
> 4 files changed, 23 insertions(+), 40 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 48598d017d6f..d158dfd1842e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -2378,7 +2378,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
> int kvm_add_user_return_msr(u32 msr);
> int kvm_find_user_return_msr(u32 msr);
> int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
> -void kvm_user_return_msr_update_cache(unsigned int index, u64 val);
> u64 kvm_get_user_return_msr(unsigned int slot);
>
> static inline bool kvm_is_supported_user_return_msr(u32 msr)
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 326db9b9c567..cde91a995076 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -763,25 +763,6 @@ static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
> return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
> }
>
> -/*
> - * Compared to vmx_prepare_switch_to_guest(), there is not much to do
> - * as SEAMCALL/SEAMRET calls take care of most of save and restore.
> - */
> -void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
> -{
> - struct vcpu_vt *vt = to_vt(vcpu);
> -
> - if (vt->guest_state_loaded)
> - return;
> -
> - if (likely(is_64bit_mm(current->mm)))
> - vt->msr_host_kernel_gs_base = current->thread.gsbase;
> - else
> - vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
> -
> - vt->guest_state_loaded = true;
> -}
> -
> struct tdx_uret_msr {
> u32 msr;
> unsigned int slot;
> @@ -795,19 +776,38 @@ static struct tdx_uret_msr tdx_uret_msrs[] = {
> {.msr = MSR_TSC_AUX,},
> };
>
> -static void tdx_user_return_msr_update_cache(void)
> +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
> {
> + struct vcpu_vt *vt = to_vt(vcpu);
> int i;
>
> + if (vt->guest_state_loaded)
> + return;
> +
> + if (likely(is_64bit_mm(current->mm)))
> + vt->msr_host_kernel_gs_base = current->thread.gsbase;
> + else
> + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
> +
> + vt->guest_state_loaded = true;
> +
> + /*
> + * Explicitly set user-return MSRs that are clobbered by the TDX-Module
> + * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
> + * written by the TDX-Module. Don't rely on the TDX-Module to actually
> + * clobber the MSRs, as the contract is poorly defined and not upheld.
> + * E.g. the TDX-Module will synthesize an EPT Violation without doing
> + * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
> + * state.
> + */
> for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
> - kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
> - tdx_uret_msrs[i].defval);
> + kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
> + tdx_uret_msrs[i].defval, -1ull);
> }
>
> static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
> {
> struct vcpu_vt *vt = to_vt(vcpu);
> - struct vcpu_tdx *tdx = to_tdx(vcpu);
>
> if (!vt->guest_state_loaded)
> return;
> @@ -815,11 +815,6 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
> ++vcpu->stat.host_state_reload;
> wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
>
> - if (tdx->guest_entered) {
> - tdx_user_return_msr_update_cache();
> - tdx->guest_entered = false;
> - }
> -
> vt->guest_state_loaded = false;
> }
>
> @@ -1059,7 +1054,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
> update_debugctlmsr(vcpu->arch.host_debugctl);
>
> tdx_load_host_xsave_state(vcpu);
> - tdx->guest_entered = true;
>
> vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
>
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index ca39a9391db1..7f258870dc41 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -67,7 +67,6 @@ struct vcpu_tdx {
> u64 vp_enter_ret;
>
> enum vcpu_tdx_state state;
> - bool guest_entered;
>
> u64 map_gpa_next;
> u64 map_gpa_end;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b4b5d2d09634..639589af7cbe 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -681,15 +681,6 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
> }
> EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
>
> -void kvm_user_return_msr_update_cache(unsigned int slot, u64 value)
> -{
> - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
> -
> - msrs->values[slot].curr = value;
> - kvm_user_return_register_notifier(msrs);
> -}
> -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache);
> -
> u64 kvm_get_user_return_msr(unsigned int slot)
> {
> return this_cpu_ptr(user_return_msrs)->values[slot].curr;
> --
> 2.51.1.930.gacf6e81ea2-goog
>
Another nit:
Remove the tdx_user_return_msr_update_cache() in the comment of __tdx_bringup().
Or could we just invoke tdx_user_return_msr_update_cache() in
tdx_prepare_switch_to_guest()?
On Mon, Nov 03, 2025 at 02:20:18PM +0800, Yan Zhao wrote:
> On Thu, Oct 30, 2025 at 12:15:25PM -0700, Sean Christopherson wrote:
> > Set all user-return MSRs to their post-TD-exit value when preparing to run
> > a TDX vCPU to ensure the value that KVM expects to be loaded after running
> > the vCPU is indeed the value that's loaded in hardware. If the TDX-Module
> > doesn't actually enter the guest, i.e. doesn't do VM-Enter, then it won't
> > "restore" VMM state, i.e. won't clobber user-return MSRs to their expected
> > post-run values, in which case simply updating KVM's "cached" value will
> > effectively corrupt the cache due to hardware still holding the original
> > value.
> This paragraph is confusing.
>
> The flow for the TDX module for the user-return MSRs is:
>
> 1. Before entering guest, i.e., inside tdh_vp_enter(),
> a) if VM-Enter is guaranteed to succeed, load MSRs with saved guest value;
> b) otherwise, do nothing and return to VMM.
>
> 2. After VMExit, before returning to VMM,
> save guest value and restore MSRs to default values.
>
>
> Failure of tdh_vp_enter() (i.e., in case of 1.b), the hardware values of the
> MSRs should be either host value or default value, while with
> msrs->values[slot].curr being default value.
>
> As a result, the reasoning of "hardware still holding the original value" is not
> convincing, since the original value is exactly the host value.
>
> > In theory, KVM could conditionally update the current user-return value if
> > and only if tdh_vp_enter() succeeds, but in practice "success" doesn't
> > guarantee the TDX-Module actually entered the guest, e.g. if the TDX-Module
> > synthesizes an EPT Violation because it suspects a zero-step attack.
> >
> > Force-load the expected values instead of trying to decipher whether or
> > not the TDX-Module restored/clobbered MSRs, as the risk doesn't justify
> > the benefits. Effectively avoiding four WRMSRs once per run loop (even if
> > the vCPU is scheduled out, user-return MSRs only need to be reloaded if
> > the CPU exits to userspace or runs a non-TDX vCPU) is likely in the noise
> > when amortized over all entries, given the cost of running a TDX vCPU.
> > E.g. the cost of the WRMSRs is somewhere between ~300 and ~500 cycles,
> > whereas the cost of a _single_ roundtrip to/from a TDX guest is thousands
> > of cycles.
> >
> > Fixes: e0b4f31a3c65 ("KVM: TDX: restore user ret MSRs")
> > Cc: stable@vger.kernel.org
> > Cc: Yan Zhao <yan.y.zhao@intel.com>
> > Cc: Xiaoyao Li <xiaoyao.li@intel.com>
> > Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
> > arch/x86/include/asm/kvm_host.h | 1 -
> > arch/x86/kvm/vmx/tdx.c | 52 +++++++++++++++------------------
> > arch/x86/kvm/vmx/tdx.h | 1 -
> > arch/x86/kvm/x86.c | 9 ------
> > 4 files changed, 23 insertions(+), 40 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 48598d017d6f..d158dfd1842e 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -2378,7 +2378,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
> > int kvm_add_user_return_msr(u32 msr);
> > int kvm_find_user_return_msr(u32 msr);
> > int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
> > -void kvm_user_return_msr_update_cache(unsigned int index, u64 val);
> > u64 kvm_get_user_return_msr(unsigned int slot);
> >
> > static inline bool kvm_is_supported_user_return_msr(u32 msr)
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 326db9b9c567..cde91a995076 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -763,25 +763,6 @@ static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
> > return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
> > }
> >
> > -/*
> > - * Compared to vmx_prepare_switch_to_guest(), there is not much to do
> > - * as SEAMCALL/SEAMRET calls take care of most of save and restore.
> > - */
> > -void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
> > -{
> > - struct vcpu_vt *vt = to_vt(vcpu);
> > -
> > - if (vt->guest_state_loaded)
> > - return;
> > -
> > - if (likely(is_64bit_mm(current->mm)))
> > - vt->msr_host_kernel_gs_base = current->thread.gsbase;
> > - else
> > - vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
> > -
> > - vt->guest_state_loaded = true;
> > -}
> > -
> > struct tdx_uret_msr {
> > u32 msr;
> > unsigned int slot;
> > @@ -795,19 +776,38 @@ static struct tdx_uret_msr tdx_uret_msrs[] = {
> > {.msr = MSR_TSC_AUX,},
> > };
> >
> > -static void tdx_user_return_msr_update_cache(void)
> > +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
> > {
> > + struct vcpu_vt *vt = to_vt(vcpu);
> > int i;
> >
> > + if (vt->guest_state_loaded)
> > + return;
> > +
> > + if (likely(is_64bit_mm(current->mm)))
> > + vt->msr_host_kernel_gs_base = current->thread.gsbase;
> > + else
> > + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
> > +
> > + vt->guest_state_loaded = true;
> > +
> > + /*
> > + * Explicitly set user-return MSRs that are clobbered by the TDX-Module
> > + * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
> > + * written by the TDX-Module. Don't rely on the TDX-Module to actually
> > + * clobber the MSRs, as the contract is poorly defined and not upheld.
> > + * E.g. the TDX-Module will synthesize an EPT Violation without doing
> > + * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
> > + * state.
> > + */
> > for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
> > - kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
> > - tdx_uret_msrs[i].defval);
> > + kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
> > + tdx_uret_msrs[i].defval, -1ull);
> > }
> >
> > static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
> > {
> > struct vcpu_vt *vt = to_vt(vcpu);
> > - struct vcpu_tdx *tdx = to_tdx(vcpu);
> >
> > if (!vt->guest_state_loaded)
> > return;
> > @@ -815,11 +815,6 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
> > ++vcpu->stat.host_state_reload;
> > wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
> >
> > - if (tdx->guest_entered) {
> > - tdx_user_return_msr_update_cache();
> > - tdx->guest_entered = false;
> > - }
> > -
> > vt->guest_state_loaded = false;
> > }
> >
> > @@ -1059,7 +1054,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
> > update_debugctlmsr(vcpu->arch.host_debugctl);
> >
> > tdx_load_host_xsave_state(vcpu);
> > - tdx->guest_entered = true;
> >
> > vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
> >
> > diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> > index ca39a9391db1..7f258870dc41 100644
> > --- a/arch/x86/kvm/vmx/tdx.h
> > +++ b/arch/x86/kvm/vmx/tdx.h
> > @@ -67,7 +67,6 @@ struct vcpu_tdx {
> > u64 vp_enter_ret;
> >
> > enum vcpu_tdx_state state;
> > - bool guest_entered;
> >
> > u64 map_gpa_next;
> > u64 map_gpa_end;
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index b4b5d2d09634..639589af7cbe 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -681,15 +681,6 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
> > }
> > EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
> >
> > -void kvm_user_return_msr_update_cache(unsigned int slot, u64 value)
> > -{
> > - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
> > -
> > - msrs->values[slot].curr = value;
> > - kvm_user_return_register_notifier(msrs);
> > -}
> > -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache);
> > -
> > u64 kvm_get_user_return_msr(unsigned int slot)
> > {
> > return this_cpu_ptr(user_return_msrs)->values[slot].curr;
> > --
> > 2.51.1.930.gacf6e81ea2-goog
> >
On 11/4/2025 3:06 PM, Yan Zhao wrote:
> Another nit:
> Remove the tdx_user_return_msr_update_cache() in the comment of __tdx_bringup().
>
> Or could we just invoke tdx_user_return_msr_update_cache() in
> tdx_prepare_switch_to_guest()?
No. It lacks the WRMSR operation to update the hardware value, which is
the key of this patch.
> On Mon, Nov 03, 2025 at 02:20:18PM +0800, Yan Zhao wrote:
>> On Thu, Oct 30, 2025 at 12:15:25PM -0700, Sean Christopherson wrote:
>>> Set all user-return MSRs to their post-TD-exit value when preparing to run
>>> a TDX vCPU to ensure the value that KVM expects to be loaded after running
>>> the vCPU is indeed the value that's loaded in hardware. If the TDX-Module
>>> doesn't actually enter the guest, i.e. doesn't do VM-Enter, then it won't
>>> "restore" VMM state, i.e. won't clobber user-return MSRs to their expected
>>> post-run values, in which case simply updating KVM's "cached" value will
>>> effectively corrupt the cache due to hardware still holding the original
>>> value.
>> This paragraph is confusing.
>>
>> The flow for the TDX module for the user-return MSRs is:
>>
>> 1. Before entering guest, i.e., inside tdh_vp_enter(),
>> a) if VM-Enter is guaranteed to succeed, load MSRs with saved guest value;
>> b) otherwise, do nothing and return to VMM.
>>
>> 2. After VMExit, before returning to VMM,
>> save guest value and restore MSRs to default values.
>>
>>
>> Failure of tdh_vp_enter() (i.e., in case of 1.b), the hardware values of the
>> MSRs should be either host value or default value, while with
>> msrs->values[slot].curr being default value.
>>
>> As a result, the reasoning of "hardware still holding the original value" is not
>> convincing, since the original value is exactly the host value.
>>
>>> In theory, KVM could conditionally update the current user-return value if
>>> and only if tdh_vp_enter() succeeds, but in practice "success" doesn't
>>> guarantee the TDX-Module actually entered the guest, e.g. if the TDX-Module
>>> synthesizes an EPT Violation because it suspects a zero-step attack.
>>>
>>> Force-load the expected values instead of trying to decipher whether or
>>> not the TDX-Module restored/clobbered MSRs, as the risk doesn't justify
>>> the benefits. Effectively avoiding four WRMSRs once per run loop (even if
>>> the vCPU is scheduled out, user-return MSRs only need to be reloaded if
>>> the CPU exits to userspace or runs a non-TDX vCPU) is likely in the noise
>>> when amortized over all entries, given the cost of running a TDX vCPU.
>>> E.g. the cost of the WRMSRs is somewhere between ~300 and ~500 cycles,
>>> whereas the cost of a _single_ roundtrip to/from a TDX guest is thousands
>>> of cycles.
>>>
>>> Fixes: e0b4f31a3c65 ("KVM: TDX: restore user ret MSRs")
>>> Cc: stable@vger.kernel.org
>>> Cc: Yan Zhao <yan.y.zhao@intel.com>
>>> Cc: Xiaoyao Li <xiaoyao.li@intel.com>
>>> Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
>>> Signed-off-by: Sean Christopherson <seanjc@google.com>
>>> ---
>>> arch/x86/include/asm/kvm_host.h | 1 -
>>> arch/x86/kvm/vmx/tdx.c | 52 +++++++++++++++------------------
>>> arch/x86/kvm/vmx/tdx.h | 1 -
>>> arch/x86/kvm/x86.c | 9 ------
>>> 4 files changed, 23 insertions(+), 40 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>> index 48598d017d6f..d158dfd1842e 100644
>>> --- a/arch/x86/include/asm/kvm_host.h
>>> +++ b/arch/x86/include/asm/kvm_host.h
>>> @@ -2378,7 +2378,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
>>> int kvm_add_user_return_msr(u32 msr);
>>> int kvm_find_user_return_msr(u32 msr);
>>> int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
>>> -void kvm_user_return_msr_update_cache(unsigned int index, u64 val);
>>> u64 kvm_get_user_return_msr(unsigned int slot);
>>>
>>> static inline bool kvm_is_supported_user_return_msr(u32 msr)
>>> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
>>> index 326db9b9c567..cde91a995076 100644
>>> --- a/arch/x86/kvm/vmx/tdx.c
>>> +++ b/arch/x86/kvm/vmx/tdx.c
>>> @@ -763,25 +763,6 @@ static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
>>> return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
>>> }
>>>
>>> -/*
>>> - * Compared to vmx_prepare_switch_to_guest(), there is not much to do
>>> - * as SEAMCALL/SEAMRET calls take care of most of save and restore.
>>> - */
>>> -void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>>> -{
>>> - struct vcpu_vt *vt = to_vt(vcpu);
>>> -
>>> - if (vt->guest_state_loaded)
>>> - return;
>>> -
>>> - if (likely(is_64bit_mm(current->mm)))
>>> - vt->msr_host_kernel_gs_base = current->thread.gsbase;
>>> - else
>>> - vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
>>> -
>>> - vt->guest_state_loaded = true;
>>> -}
>>> -
>>> struct tdx_uret_msr {
>>> u32 msr;
>>> unsigned int slot;
>>> @@ -795,19 +776,38 @@ static struct tdx_uret_msr tdx_uret_msrs[] = {
>>> {.msr = MSR_TSC_AUX,},
>>> };
>>>
>>> -static void tdx_user_return_msr_update_cache(void)
>>> +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>>> {
>>> + struct vcpu_vt *vt = to_vt(vcpu);
>>> int i;
>>>
>>> + if (vt->guest_state_loaded)
>>> + return;
>>> +
>>> + if (likely(is_64bit_mm(current->mm)))
>>> + vt->msr_host_kernel_gs_base = current->thread.gsbase;
>>> + else
>>> + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
>>> +
>>> + vt->guest_state_loaded = true;
>>> +
>>> + /*
>>> + * Explicitly set user-return MSRs that are clobbered by the TDX-Module
>>> + * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
>>> + * written by the TDX-Module. Don't rely on the TDX-Module to actually
>>> + * clobber the MSRs, as the contract is poorly defined and not upheld.
>>> + * E.g. the TDX-Module will synthesize an EPT Violation without doing
>>> + * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
>>> + * state.
>>> + */
>>> for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
>>> - kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
>>> - tdx_uret_msrs[i].defval);
>>> + kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
>>> + tdx_uret_msrs[i].defval, -1ull);
>>> }
>>>
>>> static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
>>> {
>>> struct vcpu_vt *vt = to_vt(vcpu);
>>> - struct vcpu_tdx *tdx = to_tdx(vcpu);
>>>
>>> if (!vt->guest_state_loaded)
>>> return;
>>> @@ -815,11 +815,6 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
>>> ++vcpu->stat.host_state_reload;
>>> wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
>>>
>>> - if (tdx->guest_entered) {
>>> - tdx_user_return_msr_update_cache();
>>> - tdx->guest_entered = false;
>>> - }
>>> -
>>> vt->guest_state_loaded = false;
>>> }
>>>
>>> @@ -1059,7 +1054,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
>>> update_debugctlmsr(vcpu->arch.host_debugctl);
>>>
>>> tdx_load_host_xsave_state(vcpu);
>>> - tdx->guest_entered = true;
>>>
>>> vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
>>>
>>> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
>>> index ca39a9391db1..7f258870dc41 100644
>>> --- a/arch/x86/kvm/vmx/tdx.h
>>> +++ b/arch/x86/kvm/vmx/tdx.h
>>> @@ -67,7 +67,6 @@ struct vcpu_tdx {
>>> u64 vp_enter_ret;
>>>
>>> enum vcpu_tdx_state state;
>>> - bool guest_entered;
>>>
>>> u64 map_gpa_next;
>>> u64 map_gpa_end;
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index b4b5d2d09634..639589af7cbe 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -681,15 +681,6 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
>>> }
>>> EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
>>>
>>> -void kvm_user_return_msr_update_cache(unsigned int slot, u64 value)
>>> -{
>>> - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
>>> -
>>> - msrs->values[slot].curr = value;
>>> - kvm_user_return_register_notifier(msrs);
>>> -}
>>> -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache);
>>> -
>>> u64 kvm_get_user_return_msr(unsigned int slot)
>>> {
>>> return this_cpu_ptr(user_return_msrs)->values[slot].curr;
>>> --
>>> 2.51.1.930.gacf6e81ea2-goog
>>>
On Tue, Nov 04, 2025 at 04:40:44PM +0800, Xiaoyao Li wrote:
> On 11/4/2025 3:06 PM, Yan Zhao wrote:
> > Another nit:
> > Remove the tdx_user_return_msr_update_cache() in the comment of __tdx_bringup().
> >
> > Or could we just invoke tdx_user_return_msr_update_cache() in
> > tdx_prepare_switch_to_guest()?
>
> No. It lacks the WRMSR operation to update the hardware value, which is the
> key of this patch.
As [1], I don't think the WRMSR operation to update the hardware value is
necessary. The value will be updated to guest value soon any way if
tdh_vp_enter() succeeds, or the hardware value remains to be the host value or
the default value.

But I think invoking tdx_user_return_msr_update_cache() in
tdx_prepare_switch_to_guest() is better than in
tdx_prepare_switch_to_host().

[1] https://lore.kernel.org/kvm/aQhJol0CvT6bNCJQ@yzhao56-desk.sh.intel.com/
On Tue, Nov 04, 2025, Yan Zhao wrote:
> On Tue, Nov 04, 2025 at 04:40:44PM +0800, Xiaoyao Li wrote:
> > On 11/4/2025 3:06 PM, Yan Zhao wrote:
> > > Another nit:
> > > Remove the tdx_user_return_msr_update_cache() in the comment of __tdx_bringup().
> > >
> > > Or could we just invoke tdx_user_return_msr_update_cache() in
> > > tdx_prepare_switch_to_guest()?
> >
> > No. It lacks the WRMSR operation to update the hardware value, which is the
> > key of this patch.
> As [1], I don't think the WRMSR operation to update the hardware value is
> necessary. The value will be updated to guest value soon any way if
> tdh_vp_enter() succeeds, or the hardware value remains to be the host value or
> the default value.

As explained in the original thread:

: > If the MSR's do not get clobbered, does it matter whether or not they get
: > restored.
:
: It matters because KVM needs to know the actual value in hardware. If KVM thinks
: an MSR is 'X', but it's actually 'Y', then KVM could fail to write the correct
: value into hardware when returning to userspace and/or when running a different
: vCPU.

I.e. updating the cache effectively corrupts state if the TDX-Module doesn't
clobber MSRs as expected, i.e. if the current value is preserved in hardware.

> But I think invoking tdx_user_return_msr_update_cache() in
> tdx_prepare_switch_to_guest() is better than in
> tdx_prepare_switch_to_host().
>
> [1] https://lore.kernel.org/kvm/aQhJol0CvT6bNCJQ@yzhao56-desk.sh.intel.com/
>
On Tue, Nov 04, 2025 at 09:55:54AM -0800, Sean Christopherson wrote:
> On Tue, Nov 04, 2025, Yan Zhao wrote:
> > On Tue, Nov 04, 2025 at 04:40:44PM +0800, Xiaoyao Li wrote:
> > > On 11/4/2025 3:06 PM, Yan Zhao wrote:
> > > > Another nit:
> > > > Remove the tdx_user_return_msr_update_cache() in the comment of __tdx_bringup().
> > > >
> > > > Or could we just invoke tdx_user_return_msr_update_cache() in
> > > > tdx_prepare_switch_to_guest()?
> > >
> > > No. It lacks the WRMSR operation to update the hardware value, which is the
> > > key of this patch.
> > As [1], I don't think the WRMSR operation to update the hardware value is
> > necessary. The value will be updated to guest value soon any way if
> > tdh_vp_enter() succeeds, or the hardware value remains to be the host value or
> > the default value.
>
> As explained in the original thread:
>
> : > If the MSR's do not get clobbered, does it matter whether or not they get
> : > restored.
> :
> : It matters because KVM needs to know the actual value in hardware. If KVM thinks
> : an MSR is 'X', but it's actually 'Y', then KVM could fail to write the correct
> : value into hardware when returning to userspace and/or when running a different
> : vCPU.
>
> I.e. updating the cache effectively corrupts state if the TDX-Module doesn't
> clobber MSRs as expected, i.e. if the current value is preserved in hardware.
I'm not against this patch. But I think the above explanation is not that
convincing, (or somewhat confusing).


By "if the TDX-Module doesn't clobber MSRs as expected",
- if it occurs due to tdh_vp_enter() failure, I think it's fine.
Though KVM thinks the MSR is 'X', the actual value in hardware should be
either 'Y' (the host value) or 'X' (the expected clobbered value).
It's benign to preserving value 'Y', no?
- if it occurs due to TDX module bugs, e.g., if after a successful
tdh_vp_enter() and VM exits, the TDX module clobbers the MSR to 'Z', while
the host value for the MSR is 'Y' and KVM thinks the actual value is 'X'.
Then the hardware state will be incorrect after returning to userspace if
'X' == 'Y'. But this patch can't guard against this condition as well, right?

> > But I think invoking tdx_user_return_msr_update_cache() in
> > tdx_prepare_switch_to_guest() is better than in
> > tdx_prepare_switch_to_host().
> >
> > [1] https://lore.kernel.org/kvm/aQhJol0CvT6bNCJQ@yzhao56-desk.sh.intel.com/
> >
On 11/5/2025 9:52 AM, Yan Zhao wrote:
> On Tue, Nov 04, 2025 at 09:55:54AM -0800, Sean Christopherson wrote:
>> On Tue, Nov 04, 2025, Yan Zhao wrote:
>>> On Tue, Nov 04, 2025 at 04:40:44PM +0800, Xiaoyao Li wrote:
>>>> On 11/4/2025 3:06 PM, Yan Zhao wrote:
>>>>> Another nit:
>>>>> Remove the tdx_user_return_msr_update_cache() in the comment of __tdx_bringup().
>>>>>
>>>>> Or could we just invoke tdx_user_return_msr_update_cache() in
>>>>> tdx_prepare_switch_to_guest()?
>>>>
>>>> No. It lacks the WRMSR operation to update the hardware value, which is the
>>>> key of this patch.
>>> As [1], I don't think the WRMSR operation to update the hardware value is
>>> necessary. The value will be updated to guest value soon any way if
>>> tdh_vp_enter() succeeds, or the hardware value remains to be the host value or
>>> the default value.
>>
>> As explained in the original thread:
>>
>> : > If the MSR's do not get clobbered, does it matter whether or not they get
>> : > restored.
>> :
>> : It matters because KVM needs to know the actual value in hardware. If KVM thinks
>> : an MSR is 'X', but it's actually 'Y', then KVM could fail to write the correct
>> : value into hardware when returning to userspace and/or when running a different
>> : vCPU.
>>
>> I.e. updating the cache effectively corrupts state if the TDX-Module doesn't
>> clobber MSRs as expected, i.e. if the current value is preserved in hardware.
> I'm not against this patch. But I think the above explanation is not that
> convincing, (or somewhat confusing).
>
>
> By "if the TDX-Module doesn't clobber MSRs as expected",
> - if it occurs due to tdh_vp_enter() failure, I think it's fine.
> Though KVM thinks the MSR is 'X', the actual value in hardware should be
> either 'Y' (the host value) or 'X' (the expected clobbered value).
> It's benign to preserving value 'Y', no?
For example, after tdh_vp_enter() failure, the state becomes
.curr == 'X'
hardware == 'Y'
and the TD vcpu thread is preempted and the pcpu is scheduled to run
another VM's vcpu, which is a normal VMX vcpu and it happens to have the
MSR value of 'X'. So in
vmx_prepare_switch_to_guest()
-> kvm_set_user_return_msr()
it will skip the WRMSR because written_value == .curr == 'X', but the
hardware value is 'Y'. Then KVM fails to load the expected value 'X' for
the VMX vcpu.
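
For reference, the path named above looks roughly like the below (loop paraphrased
from vmx_prepare_switch_to_guest() in vmx.c; field names approximate). With .curr
stale at 'X', the kvm_set_user_return_msr() call here is the one that wrongly skips
the WRMSR even though hardware still holds 'Y':

  if (!vmx->guest_uret_msrs_loaded) {
          vmx->guest_uret_msrs_loaded = true;

          for (i = 0; i < kvm_nr_uret_msrs; ++i) {
                  if (!vmx->guest_uret_msrs[i].load_into_hardware)
                          continue;

                  /* Elides the WRMSR if .data matches the (possibly stale) .curr. */
                  kvm_set_user_return_msr(i, vmx->guest_uret_msrs[i].data,
                                          vmx->guest_uret_msrs[i].mask);
          }
  }
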
> - if it occurs due to TDX module bugs, e.g., if after a successful
> tdh_vp_enter() and VM exits, the TDX module clobbers the MSR to 'Z', while
> the host value for the MSR is 'Y' and KVM thinks the actual value is 'X'.
> Then the hardware state will be incorrect after returning to userspace if
> 'X' == 'Y'. But this patch can't guard against this condition as well, right?
>
>
>>> But I think invoking tdx_user_return_msr_update_cache() in
>>> tdx_prepare_switch_to_guest() is better than in
>>> tdx_prepare_switch_to_host().
>>>
>>> [1] https://lore.kernel.org/kvm/aQhJol0CvT6bNCJQ@yzhao56-desk.sh.intel.com/
>>>
On Wed, Nov 05, 2025 at 05:16:56PM +0800, Xiaoyao Li wrote:
> On 11/5/2025 9:52 AM, Yan Zhao wrote:
> > On Tue, Nov 04, 2025 at 09:55:54AM -0800, Sean Christopherson wrote:
> > > On Tue, Nov 04, 2025, Yan Zhao wrote:
> > > > On Tue, Nov 04, 2025 at 04:40:44PM +0800, Xiaoyao Li wrote:
> > > > > On 11/4/2025 3:06 PM, Yan Zhao wrote:
> > > > > > Another nit:
> > > > > > Remove the tdx_user_return_msr_update_cache() in the comment of __tdx_bringup().
> > > > > >
> > > > > > Or could we just invoke tdx_user_return_msr_update_cache() in
> > > > > > tdx_prepare_switch_to_guest()?
> > > > >
> > > > > No. It lacks the WRMSR operation to update the hardware value, which is the
> > > > > key of this patch.
> > > > As [1], I don't think the WRMSR operation to update the hardware value is
> > > > necessary. The value will be updated to guest value soon any way if
> > > > tdh_vp_enter() succeeds, or the hardware value remains to be the host value or
> > > > the default value.
> > >
> > > As explained in the original thread:
> > >
> > > : > If the MSR's do not get clobbered, does it matter whether or not they get
> > > : > restored.
> > > :
> > > : It matters because KVM needs to know the actual value in hardware. If KVM thinks
> > > : an MSR is 'X', but it's actually 'Y', then KVM could fail to write the correct
> > > : value into hardware when returning to userspace and/or when running a different
> > > : vCPU.
> > >
> > > I.e. updating the cache effectively corrupts state if the TDX-Module doesn't
> > > clobber MSRs as expected, i.e. if the current value is preserved in hardware.
> > I'm not against this patch. But I think the above explanation is not that
> > convincing, (or somewhat confusing).
> >
> > By "if the TDX-Module doesn't clobber MSRs as expected",
> > - if it occurs due to tdh_vp_enter() failure, I think it's fine.
> > Though KVM thinks the MSR is 'X', the actual value in hardware should be
> > either 'Y' (the host value) or 'X' (the expected clobbered value).
> > It's benign to preserving value 'Y', no?
>
> For example, after tdh_vp_enter() failure, the state becomes
>
> .curr == 'X'
> hardware == 'Y'
>
> and the TD vcpu thread is preempted and the pcpu is scheduled to run another
> VM's vcpu, which is a normal VMX vcpu and it happens to have the MSR value
> of 'X'. So in
>
> vmx_prepare_switch_to_guest()
> -> kvm_set_user_return_msr()
>
> it will skip the WRMSR because written_value == .curr == 'X', but the
> hardware value is 'Y'. Then KVM fails to load the expected value 'X' for the
> VMX vcpu.
Oh. Thanks! I overlooked that there's another checking of .curr in
kvm_set_user_return_msr(). It explains why .curr must be equal to the hardware
value when outside guest mode.

> > - if it occurs due to TDX module bugs, e.g., if after a successful
> > tdh_vp_enter() and VM exits, the TDX module clobbers the MSR to 'Z', while
> > the host value for the MSR is 'Y' and KVM thinks the actual value is 'X'.
> > Then the hardware state will be incorrect after returning to userspace if
> > 'X' == 'Y'. But this patch can't guard against this condition as well, right?
> >
> >
> > > > But I think invoking tdx_user_return_msr_update_cache() in
> > > > tdx_prepare_switch_to_guest() is better than in
> > > > tdx_prepare_switch_to_host().
> > > >
> > > > [1] https://lore.kernel.org/kvm/aQhJol0CvT6bNCJQ@yzhao56-desk.sh.intel.com/
>