From: Isaku Yamahata <isaku.yamahata@intel.com>
Implement callbacks to enter/exit a TDX VCPU by calling tdh_vp_enter().
Ensure the TDX VCPU is in a correct state to run.
Do not pass arguments from/to vcpu->arch.regs[] unconditionally. Instead,
marshal state to/from the appropriate x86 registers only when needed,
i.e., to handle some TDVMCALL sub-leaves following KVM's ABI to leverage
the existing code.
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
TD vcpu enter/exit v2:
- Move VCPU_TD_STATE_INITIALIZED check to tdx_vcpu_pre_run() (Xiaoyao)
- Check TD_STATE_RUNNABLE also in tdx_vcpu_pre_run() (Yan)
- Add back 'noinstr' for tdx_vcpu_enter_exit() (Sean)
- Add WARN_ON_ONCE if force_immediate_exit (Sean)
- Add vp_enter_args to vcpu_tdx to store the input/output arguments for
tdh_vp_enter().
- Don't copy arguments to/from vcpu->arch.regs[] unconditionally. (Sean)
TD vcpu enter/exit v1:
- Make argument of tdx_vcpu_enter_exit() struct kvm_vcpu.
- Update for the wrapper functions for SEAMCALLs. (Sean)
- Remove noinstr (Sean)
- Add a missing comma, clarify sched_in part, and update changelog to
match code by dropping the PMU related paragraph (Binbin)
https://lore.kernel.org/lkml/c0029d4d-3dee-4f11-a929-d64d2651bfb3@linux.intel.com/
- Remove the union tdx_exit_reason. (Sean)
https://lore.kernel.org/kvm/ZfSExlemFMKjBtZb@google.com/
- Remove the code of special handling of vcpu->kvm->vm_bugged (Rick)
https://lore.kernel.org/kvm/20240318234010.GD1645738@ls.amr.corp.intel.com/
- For !tdx->initialized case, set tdx->vp_enter_ret to TDX_SW_ERROR to avoid
collision with EXIT_REASON_EXCEPTION_NMI.
v19:
- Moved export_symbol_gpl(host_xcr0) to the patch that uses it
Changes v15 -> v16:
- use __seamcall_saved_ret()
- As struct tdx_module_args doesn't match with vcpu.arch.regs, copy regs
before/after calling __seamcall_saved_ret().
---
arch/x86/kvm/vmx/main.c | 20 ++++++++++++++--
arch/x86/kvm/vmx/tdx.c | 47 ++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/tdx.h | 3 +++
arch/x86/kvm/vmx/x86_ops.h | 7 ++++++
4 files changed, 75 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 1cc1c06461f2..301c1a26606f 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -133,6 +133,22 @@ static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vmx_vcpu_load(vcpu, cpu);
}
+static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_pre_run(vcpu);
+
+ return vmx_vcpu_pre_run(vcpu);
+}
+
+static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_run(vcpu, force_immediate_exit);
+
+ return vmx_vcpu_run(vcpu, force_immediate_exit);
+}
+
static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
{
if (is_td_vcpu(vcpu)) {
@@ -272,8 +288,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.flush_tlb_gva = vt_flush_tlb_gva,
.flush_tlb_guest = vt_flush_tlb_guest,
- .vcpu_pre_run = vmx_vcpu_pre_run,
- .vcpu_run = vmx_vcpu_run,
+ .vcpu_pre_run = vt_vcpu_pre_run,
+ .vcpu_run = vt_vcpu_run,
.handle_exit = vmx_handle_exit,
.skip_emulated_instruction = vmx_skip_emulated_instruction,
.update_emulated_instruction = vmx_update_emulated_instruction,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index a7ebdafdfd82..95420ffd0022 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -11,6 +11,8 @@
#include "vmx.h"
#include "mmu/spte.h"
#include "common.h"
+#include <trace/events/kvm.h>
+#include "trace.h"
#pragma GCC poison to_vmx
@@ -673,6 +675,51 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
tdx->state = VCPU_TD_STATE_UNINITIALIZED;
}
+int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
+ to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
+ return -EINVAL;
+
+ return 1;
+}
+
+static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ guest_state_enter_irqoff();
+
+ tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
+
+ guest_state_exit_irqoff();
+}
+
+#define TDX_REGS_UNSUPPORTED_SET (BIT(VCPU_EXREG_RFLAGS) | \
+ BIT(VCPU_EXREG_SEGMENTS))
+
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+{
+ /*
+ * force_immediate_exit requires entering the vCPU for event injection,
+ * followed by an immediate exit. But the TDX module doesn't guarantee
+ * entry; it's already possible for KVM to _think_ it completed entry
+ * to the guest without actually having done so.
+ * Since KVM never needs to force an immediate exit for TDX, and can't
+ * do direct injection, just warn on force_immediate_exit.
+ */
+ WARN_ON_ONCE(force_immediate_exit);
+
+ trace_kvm_entry(vcpu, force_immediate_exit);
+
+ tdx_vcpu_enter_exit(vcpu);
+
+ vcpu->arch.regs_avail &= ~TDX_REGS_UNSUPPORTED_SET;
+
+ trace_kvm_exit(vcpu, KVM_ISA_VMX);
+
+ return EXIT_FASTPATH_NONE;
+}
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
{
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index ba880dae547f..8339bbf0fdd4 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -46,11 +46,14 @@ enum vcpu_tdx_state {
struct vcpu_tdx {
struct kvm_vcpu vcpu;
struct vcpu_vt vt;
+ struct tdx_module_args vp_enter_args;
struct tdx_vp vp;
struct list_head cpu_list;
+ u64 vp_enter_ret;
+
enum vcpu_tdx_state state;
};
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ff6370787926..83aac44b779b 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -131,6 +131,8 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
int tdx_vcpu_create(struct kvm_vcpu *vcpu);
void tdx_vcpu_free(struct kvm_vcpu *vcpu);
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
@@ -158,6 +160,11 @@ static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOP
static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {}
+static inline int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
+static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+{
+ return EXIT_FASTPATH_NONE;
+}
static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
--
2.43.0
On 1/29/2025 5:58 PM, Adrian Hunter wrote:
> +#define TDX_REGS_UNSUPPORTED_SET (BIT(VCPU_EXREG_RFLAGS) | \
> + BIT(VCPU_EXREG_SEGMENTS))
> +
> +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
> +{
> + /*
> + * force_immediate_exit requires vCPU entering for events injection with
> + * an immediately exit followed. But The TDX module doesn't guarantee
> + * entry, it's already possible for KVM to_think_ it completely entry
> + * to the guest without actually having done so.
> + * Since KVM never needs to force an immediate exit for TDX, and can't
> + * do direct injection, just warn on force_immediate_exit.
> + */
> + WARN_ON_ONCE(force_immediate_exit);
> +
> + trace_kvm_entry(vcpu, force_immediate_exit);
> +
> + tdx_vcpu_enter_exit(vcpu);
> +
> + vcpu->arch.regs_avail &= ~TDX_REGS_UNSUPPORTED_SET;
I don't understand this. Why only clear RFLAGS and SEGMENTS?
When creating the vcpu, vcpu->arch.regs_avail = ~0 in
kvm_arch_vcpu_create().
Now it only clears RFLAGS and SEGMENTS for a TDX vcpu, which leaves the other
bits set. But I don't see any code that syncs the guest value into
vcpu->arch.regs[reg].
> + trace_kvm_exit(vcpu, KVM_ISA_VMX);
> +
> + return EXIT_FASTPATH_NONE;
> +}
On 20/02/25 15:16, Xiaoyao Li wrote:
> On 1/29/2025 5:58 PM, Adrian Hunter wrote:
>> +#define TDX_REGS_UNSUPPORTED_SET (BIT(VCPU_EXREG_RFLAGS) | \
>> + BIT(VCPU_EXREG_SEGMENTS))
>> +
>> +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
>> +{
>> + /*
>> + * force_immediate_exit requires vCPU entering for events injection with
>> + * an immediately exit followed. But The TDX module doesn't guarantee
>> + * entry, it's already possible for KVM to_think_ it completely entry
>> + * to the guest without actually having done so.
>> + * Since KVM never needs to force an immediate exit for TDX, and can't
>> + * do direct injection, just warn on force_immediate_exit.
>> + */
>> + WARN_ON_ONCE(force_immediate_exit);
>> +
>> + trace_kvm_entry(vcpu, force_immediate_exit);
>> +
>> + tdx_vcpu_enter_exit(vcpu);
>> +
>> + vcpu->arch.regs_avail &= ~TDX_REGS_UNSUPPORTED_SET;
>
> I don't understand this. Why only clear RFLAGS and SEGMENTS?
>
> When creating the vcpu, vcpu->arch.regs_avail = ~0 in kvm_arch_vcpu_create().
>
> now it only clears RFLAGS and SEGMENTS for TDX vcpu, which leaves other bits set. But I don't see any code that syncs the guest value of into vcpu->arch.regs[reg].
TDX guest registers are generally not known but
values are placed into vcpu->arch.regs when needed
to work with common code.
We used to use ~VMX_REGS_LAZY_LOAD_SET and tdx_cache_reg()
which has since been removed.
tdx_cache_reg() did not support RFLAGS, SEGMENTS,
EXIT_INFO_1/EXIT_INFO_2 but EXIT_INFO_1/EXIT_INFO_2 became
needed, so that just left RFLAGS, SEGMENTS.
>
>> + trace_kvm_exit(vcpu, KVM_ISA_VMX);
>> +
>> + return EXIT_FASTPATH_NONE;
>> +}
>
On 2/24/2025 8:27 PM, Adrian Hunter wrote:
> On 20/02/25 15:16, Xiaoyao Li wrote:
>> On 1/29/2025 5:58 PM, Adrian Hunter wrote:
>>> +#define TDX_REGS_UNSUPPORTED_SET (BIT(VCPU_EXREG_RFLAGS) | \
>>> + BIT(VCPU_EXREG_SEGMENTS))
>>> +
>>> +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
>>> +{
>>> + /*
>>> + * force_immediate_exit requires vCPU entering for events injection with
>>> + * an immediately exit followed. But The TDX module doesn't guarantee
>>> + * entry, it's already possible for KVM to_think_ it completely entry
>>> + * to the guest without actually having done so.
>>> + * Since KVM never needs to force an immediate exit for TDX, and can't
>>> + * do direct injection, just warn on force_immediate_exit.
>>> + */
>>> + WARN_ON_ONCE(force_immediate_exit);
>>> +
>>> + trace_kvm_entry(vcpu, force_immediate_exit);
>>> +
>>> + tdx_vcpu_enter_exit(vcpu);
>>> +
>>> + vcpu->arch.regs_avail &= ~TDX_REGS_UNSUPPORTED_SET;
>>
>> I don't understand this. Why only clear RFLAGS and SEGMENTS?
>>
>> When creating the vcpu, vcpu->arch.regs_avail = ~0 in kvm_arch_vcpu_create().
>>
>> now it only clears RFLAGS and SEGMENTS for TDX vcpu, which leaves other bits set. But I don't see any code that syncs the guest value of into vcpu->arch.regs[reg].
>
> TDX guest registers are generally not known but
> values are placed into vcpu->arch.regs when needed
> to work with common code.
>
> We used to use ~VMX_REGS_LAZY_LOAD_SET and tdx_cache_reg()
> which has since been removed.
>
> tdx_cache_reg() did not support RFLAGS, SEGMENTS,
> EXIT_INFO_1/EXIT_INFO_2 but EXIT_INFO_1/EXIT_INFO_2 became
> needed, so that just left RFLAGS, SEGMENTS.
Quote what Sean said [1]
“I'm also not convinced letting KVM read garbage for RIP, RSP, CR3, or
PDPTRs is at all reasonable. CR3 and PDPTRs should be unreachable,
and I gotta imagine the same holds true for RSP. Allow reads/writes
to RIP is fine, in that it probably simplifies the overall code.”
We need to justify why letting KVM read "garbage" for VCPU_REGS_RIP,
VCPU_EXREG_PDPTR, VCPU_EXREG_CR0, VCPU_EXREG_CR3, VCPU_EXREG_CR4,
VCPU_EXREG_EXIT_INFO_1, and VCPU_EXREG_EXIT_INFO_2 is needed.
The changelog does not justify it at all.
By the way, how did EXIT_INFO_1/EXIT_INFO_2 become needed? I cannot find
any TDX code that uses them.
[1] https://lore.kernel.org/all/Z2GiQS_RmYeHU09L@google.com/
>>
>>> + trace_kvm_exit(vcpu, KVM_ISA_VMX);
>>> +
>>> + return EXIT_FASTPATH_NONE;
>>> +}
>>
>
On 25/02/25 08:15, Xiaoyao Li wrote:
> On 2/24/2025 8:27 PM, Adrian Hunter wrote:
>> On 20/02/25 15:16, Xiaoyao Li wrote:
>>> On 1/29/2025 5:58 PM, Adrian Hunter wrote:
>>>> +#define TDX_REGS_UNSUPPORTED_SET (BIT(VCPU_EXREG_RFLAGS) | \
>>>> + BIT(VCPU_EXREG_SEGMENTS))
>>>> +
>>>> +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
>>>> +{
>>>> + /*
>>>> + * force_immediate_exit requires vCPU entering for events injection with
>>>> + * an immediately exit followed. But The TDX module doesn't guarantee
>>>> + * entry, it's already possible for KVM to_think_ it completely entry
>>>> + * to the guest without actually having done so.
>>>> + * Since KVM never needs to force an immediate exit for TDX, and can't
>>>> + * do direct injection, just warn on force_immediate_exit.
>>>> + */
>>>> + WARN_ON_ONCE(force_immediate_exit);
>>>> +
>>>> + trace_kvm_entry(vcpu, force_immediate_exit);
>>>> +
>>>> + tdx_vcpu_enter_exit(vcpu);
>>>> +
>>>> + vcpu->arch.regs_avail &= ~TDX_REGS_UNSUPPORTED_SET;
>>>
>>> I don't understand this. Why only clear RFLAGS and SEGMENTS?
>>>
>>> When creating the vcpu, vcpu->arch.regs_avail = ~0 in kvm_arch_vcpu_create().
>>>
>>> now it only clears RFLAGS and SEGMENTS for TDX vcpu, which leaves other bits set. But I don't see any code that syncs the guest value of into vcpu->arch.regs[reg].
>>
>> TDX guest registers are generally not known but
>> values are placed into vcpu->arch.regs when needed
>> to work with common code.
>>
>> We used to use ~VMX_REGS_LAZY_LOAD_SET and tdx_cache_reg()
>> which has since been removed.
>>
>> tdx_cache_reg() did not support RFLAGS, SEGMENTS,
>> EXIT_INFO_1/EXIT_INFO_2 but EXIT_INFO_1/EXIT_INFO_2 became
>> needed, so that just left RFLAGS, SEGMENTS.
>
> Quote what Sean said [1]
>
> “I'm also not convinced letting KVM read garbage for RIP, RSP, CR3, or
> PDPTRs is at all reasonable. CR3 and PDPTRs should be unreachable,
> and I gotta imagine the same holds true for RSP. Allow reads/writes
> to RIP is fine, in that it probably simplifies the overall code.”
>
> We need to justify why to let KVM read "garbage" of VCPU_REGS_RIP,
> VCPU_EXREG_PDPTR, VCPU_EXREG_CR0, VCPU_EXREG_CR3, VCPU_EXREG_CR4,
> VCPU_EXREG_EXIT_INFO_1, and VCPU_EXREG_EXIT_INFO_2 are neeed.
>
> The changelog justify nothing for it.
We could add VCPU_REGS_RIP, VCPU_REGS_RSP, VCPU_EXREG_CR3 and VCPU_EXREG_PDPTR
to TDX_REGS_UNSUPPORTED_SET, but not VCPU_EXREG_CR0 or VCPU_EXREG_CR4, since we
have started using them.
>
> btw, how EXIT_INFO_1/EXIT_INFO_2 became needed? It seems I cannot find any TDX code use them.
vmx_get_exit_qual() / vmx_get_intr_info() are now used by TDX.
>
> [1] https://lore.kernel.org/all/Z2GiQS_RmYeHU09L@google.com/
>
>>>
>>>> + trace_kvm_exit(vcpu, KVM_ISA_VMX);
>>>> +
>>>> + return EXIT_FASTPATH_NONE;
>>>> +}
>>>
>>
>
On 2/27/25 19:37, Adrian Hunter wrote:
> On 25/02/25 08:15, Xiaoyao Li wrote:
>> On 2/24/2025 8:27 PM, Adrian Hunter wrote:
>>> On 20/02/25 15:16, Xiaoyao Li wrote:
>>>> On 1/29/2025 5:58 PM, Adrian Hunter wrote:
>>>>> +#define TDX_REGS_UNSUPPORTED_SET (BIT(VCPU_EXREG_RFLAGS) | \
>>>>> + BIT(VCPU_EXREG_SEGMENTS))
>>>>> +
>>>>> +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
>>>>> +{
>>>>> + /*
>>>>> + * force_immediate_exit requires vCPU entering for events injection with
>>>>> + * an immediately exit followed. But The TDX module doesn't guarantee
>>>>> + * entry, it's already possible for KVM to_think_ it completely entry
>>>>> + * to the guest without actually having done so.
>>>>> + * Since KVM never needs to force an immediate exit for TDX, and can't
>>>>> + * do direct injection, just warn on force_immediate_exit.
>>>>> + */
>>>>> + WARN_ON_ONCE(force_immediate_exit);
>>>>> +
>>>>> + trace_kvm_entry(vcpu, force_immediate_exit);
>>>>> +
>>>>> + tdx_vcpu_enter_exit(vcpu);
>>>>> +
>>>>> + vcpu->arch.regs_avail &= ~TDX_REGS_UNSUPPORTED_SET;
>>>>
>>>> I don't understand this. Why only clear RFLAGS and SEGMENTS?
>>>>
>>>> When creating the vcpu, vcpu->arch.regs_avail = ~0 in kvm_arch_vcpu_create().
>>>>
>>>> now it only clears RFLAGS and SEGMENTS for TDX vcpu, which leaves other bits set. But I don't see any code that syncs the guest value of into vcpu->arch.regs[reg].
>>>
>>> TDX guest registers are generally not known but
>>> values are placed into vcpu->arch.regs when needed
>>> to work with common code.
>>>
>>> We used to use ~VMX_REGS_LAZY_LOAD_SET and tdx_cache_reg()
>>> which has since been removed.
>>>
>>> tdx_cache_reg() did not support RFLAGS, SEGMENTS,
>>> EXIT_INFO_1/EXIT_INFO_2 but EXIT_INFO_1/EXIT_INFO_2 became
>>> needed, so that just left RFLAGS, SEGMENTS.
>>
>> Quote what Sean said [1]
>>
>> “I'm also not convinced letting KVM read garbage for RIP, RSP, CR3, or
>> PDPTRs is at all reasonable. CR3 and PDPTRs should be unreachable,
>> and I gotta imagine the same holds true for RSP. Allow reads/writes
>> to RIP is fine, in that it probably simplifies the overall code.”
>>
>> We need to justify why to let KVM read "garbage" of VCPU_REGS_RIP,
>> VCPU_EXREG_PDPTR, VCPU_EXREG_CR0, VCPU_EXREG_CR3, VCPU_EXREG_CR4,
>> VCPU_EXREG_EXIT_INFO_1, and VCPU_EXREG_EXIT_INFO_2 are neeed.
>>
>> The changelog justify nothing for it.
>
> Could add VCPU_REGS_RIP, VCPU_REGS_RSP, VCPU_EXREG_CR3, VCPU_EXREG_PDPTR.
> But not VCPU_EXREG_CR0 nor VCPU_EXREG_CR4 since we started using them.
Hi Adrian,
How is CR0 used? And CR4 is only used for loading the XSAVE
state, I think?
I will change this to a list of specific available registers instead of
using "&= ~", and it would be even better if CR0/CR4 are not on the list.
Paolo
>> btw, how EXIT_INFO_1/EXIT_INFO_2 became needed? It seems I cannot find any TDX code use them.
>
> vmx_get_exit_qual() / vmx_get_intr_info() are now used by TDX.
>
>>
>> [1] https://lore.kernel.org/all/Z2GiQS_RmYeHU09L@google.com/
>>
>>>>
>>>>> + trace_kvm_exit(vcpu, KVM_ISA_VMX);
>>>>> +
>>>>> + return EXIT_FASTPATH_NONE;
>>>>> +}
>>>>
>>>
>>
>
>
On 6/03/25 20:19, Paolo Bonzini wrote:
> On 2/27/25 19:37, Adrian Hunter wrote:
>> On 25/02/25 08:15, Xiaoyao Li wrote:
>>> On 2/24/2025 8:27 PM, Adrian Hunter wrote:
>>>> On 20/02/25 15:16, Xiaoyao Li wrote:
>>>>> On 1/29/2025 5:58 PM, Adrian Hunter wrote:
>>>>>> +#define TDX_REGS_UNSUPPORTED_SET (BIT(VCPU_EXREG_RFLAGS) | \
>>>>>> + BIT(VCPU_EXREG_SEGMENTS))
>>>>>> +
>>>>>> +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
>>>>>> +{
>>>>>> + /*
>>>>>> + * force_immediate_exit requires vCPU entering for events injection with
>>>>>> + * an immediately exit followed. But The TDX module doesn't guarantee
>>>>>> + * entry, it's already possible for KVM to_think_ it completely entry
>>>>>> + * to the guest without actually having done so.
>>>>>> + * Since KVM never needs to force an immediate exit for TDX, and can't
>>>>>> + * do direct injection, just warn on force_immediate_exit.
>>>>>> + */
>>>>>> + WARN_ON_ONCE(force_immediate_exit);
>>>>>> +
>>>>>> + trace_kvm_entry(vcpu, force_immediate_exit);
>>>>>> +
>>>>>> + tdx_vcpu_enter_exit(vcpu);
>>>>>> +
>>>>>> + vcpu->arch.regs_avail &= ~TDX_REGS_UNSUPPORTED_SET;
>>>>>
>>>>> I don't understand this. Why only clear RFLAGS and SEGMENTS?
>>>>>
>>>>> When creating the vcpu, vcpu->arch.regs_avail = ~0 in kvm_arch_vcpu_create().
>>>>>
>>>>> now it only clears RFLAGS and SEGMENTS for TDX vcpu, which leaves other bits set. But I don't see any code that syncs the guest value of into vcpu->arch.regs[reg].
>>>>
>>>> TDX guest registers are generally not known but
>>>> values are placed into vcpu->arch.regs when needed
>>>> to work with common code.
>>>>
>>>> We used to use ~VMX_REGS_LAZY_LOAD_SET and tdx_cache_reg()
>>>> which has since been removed.
>>>>
>>>> tdx_cache_reg() did not support RFLAGS, SEGMENTS,
>>>> EXIT_INFO_1/EXIT_INFO_2 but EXIT_INFO_1/EXIT_INFO_2 became
>>>> needed, so that just left RFLAGS, SEGMENTS.
>>>
>>> Quote what Sean said [1]
>>>
>>> “I'm also not convinced letting KVM read garbage for RIP, RSP, CR3, or
>>> PDPTRs is at all reasonable. CR3 and PDPTRs should be unreachable,
>>> and I gotta imagine the same holds true for RSP. Allow reads/writes
>>> to RIP is fine, in that it probably simplifies the overall code.”
>>>
>>> We need to justify why to let KVM read "garbage" of VCPU_REGS_RIP,
>>> VCPU_EXREG_PDPTR, VCPU_EXREG_CR0, VCPU_EXREG_CR3, VCPU_EXREG_CR4,
>>> VCPU_EXREG_EXIT_INFO_1, and VCPU_EXREG_EXIT_INFO_2 are neeed.
>>>
>>> The changelog justify nothing for it.
>>
>> Could add VCPU_REGS_RIP, VCPU_REGS_RSP, VCPU_EXREG_CR3, VCPU_EXREG_PDPTR.
>> But not VCPU_EXREG_CR0 nor VCPU_EXREG_CR4 since we started using them.
>
> Hi Adrian,
>
> how is CR0 used? And CR4 is only used other than for loading the XSAVE state, I think?
I meant it is used in the sense that patch "[PATCH V2 07/12] KVM: TDX:
restore host xsave state when exit from the guest TD" provides a value for it.
But it looks like it might be accessible via:
store_regs()
__get_sregs()
__get_sregs_common()
Sean wanted a maximal CR0 value consistent with the CR4.
CR4 is also being used in kvm_update_cpuid_runtime().
>
> I will change this to a list of specific available registers instead of using "&= ~", and it would be even better if CR0/CR4 are not on the list.
>
> Paolo
>
>>> btw, how EXIT_INFO_1/EXIT_INFO_2 became needed? It seems I cannot find any TDX code use them.
>>
>> vmx_get_exit_qual() / vmx_get_intr_info() are now used by TDX.
>>
>>>
>>> [1] https://lore.kernel.org/all/Z2GiQS_RmYeHU09L@google.com/
>>>
>>>>>
>>>>>> + trace_kvm_exit(vcpu, KVM_ISA_VMX);
>>>>>> +
>>>>>> + return EXIT_FASTPATH_NONE;
>>>>>> +}
>>>>>
>>>>
>>>
>>
>>
>
© 2016 - 2026 Red Hat, Inc.