From: Xin Li <xin3.li@intel.com>
Extend nested VMX context management to include FRED-related VMCS fields.
This enables proper handling of FRED state during nested virtualization.
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Tested-by: Xuelian Guo <xuelian.guo@intel.com>
---
Change in v5:
* Add TB from Xuelian Guo.
Changes in v4:
* Advertise VMX nested exception as if the CPU supports it (Chao Gao).
* Split FRED state management controls (Chao Gao).
Changes in v3:
* Add and use nested_cpu_has_fred(vmcs12) because vmcs02 should be set
from vmcs12 if and only if the field is enabled in L1's VMX config
(Sean Christopherson).
* Fix coding style issues (Sean Christopherson).
Changes in v2:
* Remove hyperv TLFS related changes (Jeremi Piotrowski).
* Use kvm_cpu_cap_has() instead of cpu_feature_enabled() (Chao Gao).
---
Documentation/virt/kvm/x86/nested-vmx.rst | 18 +++++
arch/x86/kvm/vmx/capabilities.h | 5 ++
arch/x86/kvm/vmx/nested.c | 83 ++++++++++++++++++++++-
arch/x86/kvm/vmx/nested.h | 22 ++++++
arch/x86/kvm/vmx/vmcs12.c | 18 +++++
arch/x86/kvm/vmx/vmcs12.h | 36 ++++++++++
arch/x86/kvm/vmx/vmcs_shadow_fields.h | 4 ++
7 files changed, 184 insertions(+), 2 deletions(-)
diff --git a/Documentation/virt/kvm/x86/nested-vmx.rst b/Documentation/virt/kvm/x86/nested-vmx.rst
index e64ef231f310..87fa9f3877ab 100644
--- a/Documentation/virt/kvm/x86/nested-vmx.rst
+++ b/Documentation/virt/kvm/x86/nested-vmx.rst
@@ -218,6 +218,24 @@ struct shadow_vmcs is ever changed.
u16 host_gs_selector;
u16 host_tr_selector;
u64 secondary_vm_exit_controls;
+ u64 guest_ia32_fred_config;
+ u64 guest_ia32_fred_rsp1;
+ u64 guest_ia32_fred_rsp2;
+ u64 guest_ia32_fred_rsp3;
+ u64 guest_ia32_fred_stklvls;
+ u64 guest_ia32_fred_ssp1;
+ u64 guest_ia32_fred_ssp2;
+ u64 guest_ia32_fred_ssp3;
+ u64 host_ia32_fred_config;
+ u64 host_ia32_fred_rsp1;
+ u64 host_ia32_fred_rsp2;
+ u64 host_ia32_fred_rsp3;
+ u64 host_ia32_fred_stklvls;
+ u64 host_ia32_fred_ssp1;
+ u64 host_ia32_fred_ssp2;
+ u64 host_ia32_fred_ssp3;
+ u64 injected_event_data;
+ u64 original_event_data;
};
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 6d446574d770..cf7c93c33a98 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -78,6 +78,11 @@ static inline bool cpu_has_vmx_basic_inout(void)
return vmcs_config.basic & VMX_BASIC_INOUT;
}
+static inline bool cpu_has_vmx_nested_exception(void)
+{
+ return vmcs_config.basic & VMX_BASIC_NESTED_EXCEPTION;
+}
+
static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS &&
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 4405d176cf74..30d2de1243ef 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -705,6 +705,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FRED_RSP0, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FRED_SSP0, MSR_TYPE_RW);
#endif
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
@@ -1272,9 +1278,11 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
VMX_BASIC_INOUT |
- VMX_BASIC_TRUE_CTLS;
+ VMX_BASIC_TRUE_CTLS |
+ VMX_BASIC_NESTED_EXCEPTION;
- const u64 reserved_bits = GENMASK_ULL(63, 56) |
+ const u64 reserved_bits = GENMASK_ULL(63, 59) |
+ GENMASK_ULL(57, 56) |
GENMASK_ULL(47, 45) |
BIT_ULL(31);
@@ -2526,6 +2534,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
vmcs12->vm_entry_instruction_len);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
vmcs12->guest_interruptibility_info);
+ if (cpu_has_vmx_fred())
+ vmcs_write64(INJECTED_EVENT_DATA, vmcs12->injected_event_data);
vmx->loaded_vmcs->nmi_known_unmasked =
!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
} else {
@@ -2578,6 +2588,17 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
vmx_segment_cache_clear(vmx);
+
+ if (nested_cpu_load_guest_fred_states(vmcs12)) {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->guest_ia32_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->guest_ia32_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->guest_ia32_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->guest_ia32_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->guest_ia32_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->guest_ia32_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->guest_ia32_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->guest_ia32_fred_ssp3);
+ }
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -3864,6 +3885,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
u32 idt_vectoring;
unsigned int nr;
+ vmcs12->original_event_data = 0;
+
/*
* Per the SDM, VM-Exits due to double and triple faults are never
* considered to occur during event delivery, even if the double/triple
@@ -3902,6 +3925,13 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
vcpu->arch.exception.error_code;
}
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) &&
+ (vmcs12->guest_cr4 & X86_CR4_FRED) &&
+ (vcpu->arch.exception.nested))
+ idt_vectoring |= VECTORING_INFO_NESTED_EXCEPTION_MASK;
+
+ vmcs12->original_event_data = vcpu->arch.exception.event_data;
+
vmcs12->idt_vectoring_info_field = idt_vectoring;
} else if (vcpu->arch.nmi_injected) {
vmcs12->idt_vectoring_info_field =
@@ -4482,6 +4512,14 @@ static bool is_vmcs12_ext_field(unsigned long field)
case GUEST_TR_BASE:
case GUEST_GDTR_BASE:
case GUEST_IDTR_BASE:
+ case GUEST_IA32_FRED_CONFIG:
+ case GUEST_IA32_FRED_RSP1:
+ case GUEST_IA32_FRED_RSP2:
+ case GUEST_IA32_FRED_RSP3:
+ case GUEST_IA32_FRED_STKLVLS:
+ case GUEST_IA32_FRED_SSP1:
+ case GUEST_IA32_FRED_SSP2:
+ case GUEST_IA32_FRED_SSP3:
case GUEST_PENDING_DBG_EXCEPTIONS:
case GUEST_BNDCFGS:
return true;
@@ -4531,6 +4569,18 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+ if (nested_cpu_save_guest_fred_states(vmcs12)) {
+ vmcs12->guest_ia32_fred_config = vmcs_read64(GUEST_IA32_FRED_CONFIG);
+ vmcs12->guest_ia32_fred_rsp1 = vmcs_read64(GUEST_IA32_FRED_RSP1);
+ vmcs12->guest_ia32_fred_rsp2 = vmcs_read64(GUEST_IA32_FRED_RSP2);
+ vmcs12->guest_ia32_fred_rsp3 = vmcs_read64(GUEST_IA32_FRED_RSP3);
+ vmcs12->guest_ia32_fred_stklvls = vmcs_read64(GUEST_IA32_FRED_STKLVLS);
+ vmcs12->guest_ia32_fred_ssp1 = vmcs_read64(GUEST_IA32_FRED_SSP1);
+ vmcs12->guest_ia32_fred_ssp2 = vmcs_read64(GUEST_IA32_FRED_SSP2);
+ vmcs12->guest_ia32_fred_ssp3 = vmcs_read64(GUEST_IA32_FRED_SSP3);
+ }
+
vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
@@ -4684,6 +4734,21 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->vm_exit_intr_info = exit_intr_info;
vmcs12->vm_exit_instruction_len = exit_insn_len;
+
+ /*
+ * When there is a valid original event, the exiting event is a nested
+ * event during delivery of the earlier original event.
+ *
+ * FRED event delivery reflects this relationship by setting the value
+ * of the nested exception bit of VM-exit interruption information
+ * (aka exiting-event identification) to that of the valid bit of the
+ * IDT-vectoring information (aka original-event identification).
+ */
+ if ((vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) &&
+ (vmcs12->guest_cr4 & X86_CR4_FRED))
+ vmcs12->vm_exit_intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
+
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
/*
@@ -4761,6 +4826,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
+ if (nested_cpu_load_host_fred_states(vmcs12)) {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->host_ia32_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->host_ia32_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->host_ia32_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->host_ia32_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->host_ia32_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->host_ia32_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->host_ia32_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->host_ia32_fred_ssp3);
+ }
+
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
vmcs_write64(GUEST_BNDCFGS, 0);
@@ -7228,6 +7304,9 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
msrs->basic |= VMX_BASIC_TRUE_CTLS;
if (cpu_has_vmx_basic_inout())
msrs->basic |= VMX_BASIC_INOUT;
+
+ if (cpu_has_vmx_nested_exception())
+ msrs->basic |= VMX_BASIC_NESTED_EXCEPTION;
}
static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 6eedcfc91070..c6b69699e28e 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -249,6 +249,11 @@ static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12)
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
}
+static inline bool nested_cpu_has_secondary_vm_exit_controls(struct vmcs12 *vmcs12)
+{
+ return vmcs12->vm_exit_controls & VM_EXIT_ACTIVATE_SECONDARY_CONTROLS;
+}
+
static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
@@ -269,6 +274,23 @@ static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
}
+static inline bool nested_cpu_load_guest_fred_states(struct vmcs12 *vmcs12)
+{
+ return vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_FRED;
+}
+
+static inline bool nested_cpu_save_guest_fred_states(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_secondary_vm_exit_controls(vmcs12) &&
+ vmcs12->secondary_vm_exit_controls & SECONDARY_VM_EXIT_SAVE_IA32_FRED;
+}
+
+static inline bool nested_cpu_load_host_fred_states(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_secondary_vm_exit_controls(vmcs12) &&
+ vmcs12->secondary_vm_exit_controls & SECONDARY_VM_EXIT_LOAD_IA32_FRED;
+}
+
/*
* if fixed0[i] == 1: val[i] must be 1
* if fixed1[i] == 0: val[i] must be 0
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 9fac24fd5b4b..5fa63326deba 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -67,6 +67,24 @@ const unsigned short vmcs12_field_offsets[] = {
FIELD64(HOST_IA32_EFER, host_ia32_efer),
FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
FIELD64(SECONDARY_VM_EXIT_CONTROLS, secondary_vm_exit_controls),
+ FIELD64(INJECTED_EVENT_DATA, injected_event_data),
+ FIELD64(ORIGINAL_EVENT_DATA, original_event_data),
+ FIELD64(GUEST_IA32_FRED_CONFIG, guest_ia32_fred_config),
+ FIELD64(GUEST_IA32_FRED_RSP1, guest_ia32_fred_rsp1),
+ FIELD64(GUEST_IA32_FRED_RSP2, guest_ia32_fred_rsp2),
+ FIELD64(GUEST_IA32_FRED_RSP3, guest_ia32_fred_rsp3),
+ FIELD64(GUEST_IA32_FRED_STKLVLS, guest_ia32_fred_stklvls),
+ FIELD64(GUEST_IA32_FRED_SSP1, guest_ia32_fred_ssp1),
+ FIELD64(GUEST_IA32_FRED_SSP2, guest_ia32_fred_ssp2),
+ FIELD64(GUEST_IA32_FRED_SSP3, guest_ia32_fred_ssp3),
+ FIELD64(HOST_IA32_FRED_CONFIG, host_ia32_fred_config),
+ FIELD64(HOST_IA32_FRED_RSP1, host_ia32_fred_rsp1),
+ FIELD64(HOST_IA32_FRED_RSP2, host_ia32_fred_rsp2),
+ FIELD64(HOST_IA32_FRED_RSP3, host_ia32_fred_rsp3),
+ FIELD64(HOST_IA32_FRED_STKLVLS, host_ia32_fred_stklvls),
+ FIELD64(HOST_IA32_FRED_SSP1, host_ia32_fred_ssp1),
+ FIELD64(HOST_IA32_FRED_SSP2, host_ia32_fred_ssp2),
+ FIELD64(HOST_IA32_FRED_SSP3, host_ia32_fred_ssp3),
FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
FIELD(EXCEPTION_BITMAP, exception_bitmap),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 1fe3ed9108aa..f2a33d7007c9 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -186,6 +186,24 @@ struct __packed vmcs12 {
u16 host_tr_selector;
u16 guest_pml_index;
u64 secondary_vm_exit_controls;
+ u64 guest_ia32_fred_config;
+ u64 guest_ia32_fred_rsp1;
+ u64 guest_ia32_fred_rsp2;
+ u64 guest_ia32_fred_rsp3;
+ u64 guest_ia32_fred_stklvls;
+ u64 guest_ia32_fred_ssp1;
+ u64 guest_ia32_fred_ssp2;
+ u64 guest_ia32_fred_ssp3;
+ u64 host_ia32_fred_config;
+ u64 host_ia32_fred_rsp1;
+ u64 host_ia32_fred_rsp2;
+ u64 host_ia32_fred_rsp3;
+ u64 host_ia32_fred_stklvls;
+ u64 host_ia32_fred_ssp1;
+ u64 host_ia32_fred_ssp2;
+ u64 host_ia32_fred_ssp3;
+ u64 injected_event_data;
+ u64 original_event_data;
};
/*
@@ -362,6 +380,24 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(host_tr_selector, 994);
CHECK_OFFSET(guest_pml_index, 996);
CHECK_OFFSET(secondary_vm_exit_controls, 998);
+ CHECK_OFFSET(guest_ia32_fred_config, 1006);
+ CHECK_OFFSET(guest_ia32_fred_rsp1, 1014);
+ CHECK_OFFSET(guest_ia32_fred_rsp2, 1022);
+ CHECK_OFFSET(guest_ia32_fred_rsp3, 1030);
+ CHECK_OFFSET(guest_ia32_fred_stklvls, 1038);
+ CHECK_OFFSET(guest_ia32_fred_ssp1, 1046);
+ CHECK_OFFSET(guest_ia32_fred_ssp2, 1054);
+ CHECK_OFFSET(guest_ia32_fred_ssp3, 1062);
+ CHECK_OFFSET(host_ia32_fred_config, 1070);
+ CHECK_OFFSET(host_ia32_fred_rsp1, 1078);
+ CHECK_OFFSET(host_ia32_fred_rsp2, 1086);
+ CHECK_OFFSET(host_ia32_fred_rsp3, 1094);
+ CHECK_OFFSET(host_ia32_fred_stklvls, 1102);
+ CHECK_OFFSET(host_ia32_fred_ssp1, 1110);
+ CHECK_OFFSET(host_ia32_fred_ssp2, 1118);
+ CHECK_OFFSET(host_ia32_fred_ssp3, 1126);
+ CHECK_OFFSET(injected_event_data, 1134);
+ CHECK_OFFSET(original_event_data, 1142);
}
extern const unsigned short vmcs12_field_offsets[];
diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
index cad128d1657b..da338327c2b3 100644
--- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
@@ -74,6 +74,10 @@ SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base)
/* 64-bit */
SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address)
SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address)
+SHADOW_FIELD_RO(ORIGINAL_EVENT_DATA, original_event_data)
+SHADOW_FIELD_RO(ORIGINAL_EVENT_DATA_HIGH, original_event_data)
+SHADOW_FIELD_RW(INJECTED_EVENT_DATA, injected_event_data)
+SHADOW_FIELD_RW(INJECTED_EVENT_DATA_HIGH, injected_event_data)
#undef SHADOW_FIELD_RO
#undef SHADOW_FIELD_RW
--
2.50.1
>@@ -2578,6 +2588,17 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) > vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); > > vmx_segment_cache_clear(vmx); >+ >+ if (nested_cpu_load_guest_fred_states(vmcs12)) { >+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->guest_ia32_fred_config); >+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->guest_ia32_fred_rsp1); >+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->guest_ia32_fred_rsp2); >+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->guest_ia32_fred_rsp3); >+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->guest_ia32_fred_stklvls); >+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->guest_ia32_fred_ssp1); >+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->guest_ia32_fred_ssp2); >+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->guest_ia32_fred_ssp3); >+ } I think we need to snapshot L1's FRED MSR values before nested VM entry and propagate them to GUEST_IA32_FRED* of VMCS02 for !nested_cpu_load_guest_fred_states(vmcs12) case. i.e., from guest's view, FRED MSRs shouldn't change across VM-entry if "Load guest FRED states" isn't set. Refer to the comment above 'pre_vmenter_debugctl' definition and also the CET implmenetation*. [*]: https://lore.kernel.org/kvm/20250704085027.182163-22-chao.gao@intel.com/
On 7/23/2025 11:50 PM, Chao Gao wrote: >> @@ -2578,6 +2588,17 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) >> vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); >> >> vmx_segment_cache_clear(vmx); >> + >> + if (nested_cpu_load_guest_fred_states(vmcs12)) { >> + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->guest_ia32_fred_config); >> + vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->guest_ia32_fred_rsp1); >> + vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->guest_ia32_fred_rsp2); >> + vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->guest_ia32_fred_rsp3); >> + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->guest_ia32_fred_stklvls); >> + vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->guest_ia32_fred_ssp1); >> + vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->guest_ia32_fred_ssp2); >> + vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->guest_ia32_fred_ssp3); >> + } > > I think we need to snapshot L1's FRED MSR values before nested VM entry and > propagate them to GUEST_IA32_FRED* of VMCS02 for > !nested_cpu_load_guest_fred_states(vmcs12) case. i.e., from guest's view, > FRED MSRs shouldn't change across VM-entry if "Load guest FRED states" isn't > set. > > Refer to the comment above 'pre_vmenter_debugctl' definition and also the > CET implmenetation*. > > [*]: https://lore.kernel.org/kvm/20250704085027.182163-22-chao.gao@intel.com/ > Nice catch, it really took me a while to understand the issue.
From: Xin Li <xin3.li@intel.com>
Extend nested VMX context management to include FRED-related VMCS fields.
This enables proper handling of FRED state during nested virtualization.
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Tested-by: Xuelian Guo <xuelian.guo@intel.com>
---
Change in v5A:
* Handle FRED MSR pre-vmenter save/restore (Chao Gao).
Change in v5:
* Add TB from Xuelian Guo.
Changes in v4:
* Advertise VMX nested exception as if the CPU supports it (Chao Gao).
* Split FRED state management controls (Chao Gao).
Changes in v3:
* Add and use nested_cpu_has_fred(vmcs12) because vmcs02 should be set
from vmcs12 if and only if the field is enabled in L1's VMX config
(Sean Christopherson).
* Fix coding style issues (Sean Christopherson).
Changes in v2:
* Remove hyperv TLFS related changes (Jeremi Piotrowski).
* Use kvm_cpu_cap_has() instead of cpu_feature_enabled() (Chao Gao).
---
Documentation/virt/kvm/x86/nested-vmx.rst | 18 ++++
arch/x86/kvm/vmx/capabilities.h | 5 +
arch/x86/kvm/vmx/nested.c | 126 +++++++++++++++++++++-
arch/x86/kvm/vmx/nested.h | 22 ++++
arch/x86/kvm/vmx/vmcs12.c | 18 ++++
arch/x86/kvm/vmx/vmcs12.h | 36 +++++++
arch/x86/kvm/vmx/vmcs_shadow_fields.h | 4 +
arch/x86/kvm/vmx/vmx.h | 33 ++++++
8 files changed, 260 insertions(+), 2 deletions(-)
diff --git a/Documentation/virt/kvm/x86/nested-vmx.rst b/Documentation/virt/kvm/x86/nested-vmx.rst
index e64ef231f310..87fa9f3877ab 100644
--- a/Documentation/virt/kvm/x86/nested-vmx.rst
+++ b/Documentation/virt/kvm/x86/nested-vmx.rst
@@ -218,6 +218,24 @@ struct shadow_vmcs is ever changed.
u16 host_gs_selector;
u16 host_tr_selector;
u64 secondary_vm_exit_controls;
+ u64 guest_ia32_fred_config;
+ u64 guest_ia32_fred_rsp1;
+ u64 guest_ia32_fred_rsp2;
+ u64 guest_ia32_fred_rsp3;
+ u64 guest_ia32_fred_stklvls;
+ u64 guest_ia32_fred_ssp1;
+ u64 guest_ia32_fred_ssp2;
+ u64 guest_ia32_fred_ssp3;
+ u64 host_ia32_fred_config;
+ u64 host_ia32_fred_rsp1;
+ u64 host_ia32_fred_rsp2;
+ u64 host_ia32_fred_rsp3;
+ u64 host_ia32_fred_stklvls;
+ u64 host_ia32_fred_ssp1;
+ u64 host_ia32_fred_ssp2;
+ u64 host_ia32_fred_ssp3;
+ u64 injected_event_data;
+ u64 original_event_data;
};
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 6d446574d770..cf7c93c33a98 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -78,6 +78,11 @@ static inline bool cpu_has_vmx_basic_inout(void)
return vmcs_config.basic & VMX_BASIC_INOUT;
}
+static inline bool cpu_has_vmx_nested_exception(void)
+{
+ return vmcs_config.basic & VMX_BASIC_NESTED_EXCEPTION;
+}
+
static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS &&
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 4405d176cf74..1d9de811313a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -705,6 +705,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FRED_RSP0, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FRED_SSP0, MSR_TYPE_RW);
#endif
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
@@ -1272,9 +1278,11 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
VMX_BASIC_INOUT |
- VMX_BASIC_TRUE_CTLS;
+ VMX_BASIC_TRUE_CTLS |
+ VMX_BASIC_NESTED_EXCEPTION;
- const u64 reserved_bits = GENMASK_ULL(63, 56) |
+ const u64 reserved_bits = GENMASK_ULL(63, 59) |
+ GENMASK_ULL(57, 56) |
GENMASK_ULL(47, 45) |
BIT_ULL(31);
@@ -2526,6 +2534,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
vmcs12->vm_entry_instruction_len);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
vmcs12->guest_interruptibility_info);
+ if (cpu_has_vmx_fred())
+ vmcs_write64(INJECTED_EVENT_DATA, vmcs12->injected_event_data);
vmx->loaded_vmcs->nmi_known_unmasked =
!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
} else {
@@ -2578,6 +2588,17 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
vmx_segment_cache_clear(vmx);
+
+ if (nested_cpu_load_guest_fred_state(vmcs12)) {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->guest_ia32_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->guest_ia32_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->guest_ia32_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->guest_ia32_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->guest_ia32_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->guest_ia32_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->guest_ia32_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->guest_ia32_fred_ssp3);
+ }
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -2679,6 +2700,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
}
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_FRED) &&
+ (!vmx->nested.nested_run_pending || !nested_cpu_load_guest_fred_state(vmcs12))) {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmx->nested.pre_vmenter_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmx->nested.pre_vmenter_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmx->nested.pre_vmenter_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmx->nested.pre_vmenter_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmx->nested.pre_vmenter_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmx->nested.pre_vmenter_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmx->nested.pre_vmenter_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmx->nested.pre_vmenter_fred_ssp3);
+ }
+
if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
@@ -3549,6 +3582,18 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
kvm_service_local_tlb_flush_requests(vcpu);
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_FRED) &&
+ (!vmx->nested.nested_run_pending || !nested_cpu_load_guest_fred_state(vmcs12))) {
+ vmx->nested.pre_vmenter_fred_config = vmcs_read64(GUEST_IA32_FRED_CONFIG);
+ vmx->nested.pre_vmenter_fred_rsp1 = vmcs_read64(GUEST_IA32_FRED_RSP1);
+ vmx->nested.pre_vmenter_fred_rsp2 = vmcs_read64(GUEST_IA32_FRED_RSP2);
+ vmx->nested.pre_vmenter_fred_rsp3 = vmcs_read64(GUEST_IA32_FRED_RSP3);
+ vmx->nested.pre_vmenter_fred_stklvls = vmcs_read64(GUEST_IA32_FRED_STKLVLS);
+ vmx->nested.pre_vmenter_fred_ssp1 = vmcs_read64(GUEST_IA32_FRED_SSP1);
+ vmx->nested.pre_vmenter_fred_ssp2 = vmcs_read64(GUEST_IA32_FRED_SSP2);
+ vmx->nested.pre_vmenter_fred_ssp3 = vmcs_read64(GUEST_IA32_FRED_SSP3);
+ }
+
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
@@ -3864,6 +3909,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
u32 idt_vectoring;
unsigned int nr;
+ vmcs12->original_event_data = 0;
+
/*
* Per the SDM, VM-Exits due to double and triple faults are never
* considered to occur during event delivery, even if the double/triple
@@ -3902,6 +3949,13 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
vcpu->arch.exception.error_code;
}
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) &&
+ (vmcs12->guest_cr4 & X86_CR4_FRED) &&
+ (vcpu->arch.exception.nested))
+ idt_vectoring |= VECTORING_INFO_NESTED_EXCEPTION_MASK;
+
+ vmcs12->original_event_data = vcpu->arch.exception.event_data;
+
vmcs12->idt_vectoring_info_field = idt_vectoring;
} else if (vcpu->arch.nmi_injected) {
vmcs12->idt_vectoring_info_field =
@@ -4482,6 +4536,14 @@ static bool is_vmcs12_ext_field(unsigned long field)
case GUEST_TR_BASE:
case GUEST_GDTR_BASE:
case GUEST_IDTR_BASE:
+ case GUEST_IA32_FRED_CONFIG:
+ case GUEST_IA32_FRED_RSP1:
+ case GUEST_IA32_FRED_RSP2:
+ case GUEST_IA32_FRED_RSP3:
+ case GUEST_IA32_FRED_STKLVLS:
+ case GUEST_IA32_FRED_SSP1:
+ case GUEST_IA32_FRED_SSP2:
+ case GUEST_IA32_FRED_SSP3:
case GUEST_PENDING_DBG_EXCEPTIONS:
case GUEST_BNDCFGS:
return true;
@@ -4531,6 +4593,27 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+ vmx->nested.pre_vmexit_fred_config = vmcs_read64(GUEST_IA32_FRED_CONFIG);
+ vmx->nested.pre_vmexit_fred_rsp1 = vmcs_read64(GUEST_IA32_FRED_RSP1);
+ vmx->nested.pre_vmexit_fred_rsp2 = vmcs_read64(GUEST_IA32_FRED_RSP2);
+ vmx->nested.pre_vmexit_fred_rsp3 = vmcs_read64(GUEST_IA32_FRED_RSP3);
+ vmx->nested.pre_vmexit_fred_stklvls = vmcs_read64(GUEST_IA32_FRED_STKLVLS);
+ vmx->nested.pre_vmexit_fred_ssp1 = vmcs_read64(GUEST_IA32_FRED_SSP1);
+ vmx->nested.pre_vmexit_fred_ssp2 = vmcs_read64(GUEST_IA32_FRED_SSP2);
+ vmx->nested.pre_vmexit_fred_ssp3 = vmcs_read64(GUEST_IA32_FRED_SSP3);
+
+ if (nested_cpu_save_guest_fred_state(vmcs12)) {
+ vmcs12->guest_ia32_fred_config = vmx->nested.pre_vmexit_fred_config;
+ vmcs12->guest_ia32_fred_rsp1 = vmx->nested.pre_vmexit_fred_rsp1;
+ vmcs12->guest_ia32_fred_rsp2 = vmx->nested.pre_vmexit_fred_rsp2;
+ vmcs12->guest_ia32_fred_rsp3 = vmx->nested.pre_vmexit_fred_rsp3;
+ vmcs12->guest_ia32_fred_stklvls = vmx->nested.pre_vmexit_fred_stklvls;
+ vmcs12->guest_ia32_fred_ssp1 = vmx->nested.pre_vmexit_fred_ssp1;
+ vmcs12->guest_ia32_fred_ssp2 = vmx->nested.pre_vmexit_fred_ssp2;
+ vmcs12->guest_ia32_fred_ssp3 = vmx->nested.pre_vmexit_fred_ssp3;
+ }
+
vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
@@ -4684,6 +4767,21 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->vm_exit_intr_info = exit_intr_info;
vmcs12->vm_exit_instruction_len = exit_insn_len;
+
+ /*
+ * When there is a valid original event, the exiting event is a nested
+ * event during delivery of the earlier original event.
+ *
+ * FRED event delivery reflects this relationship by setting the value
+ * of the nested exception bit of VM-exit interruption information
+ * (aka exiting-event identification) to that of the valid bit of the
+ * IDT-vectoring information (aka original-event identification).
+ */
+ if ((vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) &&
+ (vmcs12->guest_cr4 & X86_CR4_FRED))
+ vmcs12->vm_exit_intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
+
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
/*
@@ -4712,6 +4810,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
enum vm_entry_failure_code ignored;
struct kvm_segment seg;
@@ -4761,6 +4860,26 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
+ if (nested_cpu_load_host_fred_state(vmcs12)) {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->host_ia32_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->host_ia32_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->host_ia32_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->host_ia32_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->host_ia32_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->host_ia32_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->host_ia32_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->host_ia32_fred_ssp3);
+ } else {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmx->nested.pre_vmexit_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmx->nested.pre_vmexit_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmx->nested.pre_vmexit_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmx->nested.pre_vmexit_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmx->nested.pre_vmexit_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmx->nested.pre_vmexit_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmx->nested.pre_vmexit_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmx->nested.pre_vmexit_fred_ssp3);
+ }
+
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
vmcs_write64(GUEST_BNDCFGS, 0);
@@ -7228,6 +7347,9 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
msrs->basic |= VMX_BASIC_TRUE_CTLS;
if (cpu_has_vmx_basic_inout())
msrs->basic |= VMX_BASIC_INOUT;
+
+ if (cpu_has_vmx_nested_exception())
+ msrs->basic |= VMX_BASIC_NESTED_EXCEPTION;
}
static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 6eedcfc91070..b16dccff3186 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -249,6 +249,11 @@ static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12)
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
}
+static inline bool nested_cpu_has_secondary_vm_exit_controls(struct vmcs12 *vmcs12)
+{
+ return vmcs12->vm_exit_controls & VM_EXIT_ACTIVATE_SECONDARY_CONTROLS;
+}
+
static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
@@ -269,6 +274,23 @@ static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
}
+static inline bool nested_cpu_load_guest_fred_state(struct vmcs12 *vmcs12)
+{
+ return vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_FRED;
+}
+
+static inline bool nested_cpu_save_guest_fred_state(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_secondary_vm_exit_controls(vmcs12) &&
+ vmcs12->secondary_vm_exit_controls & SECONDARY_VM_EXIT_SAVE_IA32_FRED;
+}
+
+static inline bool nested_cpu_load_host_fred_state(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_secondary_vm_exit_controls(vmcs12) &&
+ vmcs12->secondary_vm_exit_controls & SECONDARY_VM_EXIT_LOAD_IA32_FRED;
+}
+
/*
* if fixed0[i] == 1: val[i] must be 1
* if fixed1[i] == 0: val[i] must be 0
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 9fac24fd5b4b..5fa63326deba 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -67,6 +67,24 @@ const unsigned short vmcs12_field_offsets[] = {
FIELD64(HOST_IA32_EFER, host_ia32_efer),
FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
FIELD64(SECONDARY_VM_EXIT_CONTROLS, secondary_vm_exit_controls),
+ FIELD64(INJECTED_EVENT_DATA, injected_event_data),
+ FIELD64(ORIGINAL_EVENT_DATA, original_event_data),
+ FIELD64(GUEST_IA32_FRED_CONFIG, guest_ia32_fred_config),
+ FIELD64(GUEST_IA32_FRED_RSP1, guest_ia32_fred_rsp1),
+ FIELD64(GUEST_IA32_FRED_RSP2, guest_ia32_fred_rsp2),
+ FIELD64(GUEST_IA32_FRED_RSP3, guest_ia32_fred_rsp3),
+ FIELD64(GUEST_IA32_FRED_STKLVLS, guest_ia32_fred_stklvls),
+ FIELD64(GUEST_IA32_FRED_SSP1, guest_ia32_fred_ssp1),
+ FIELD64(GUEST_IA32_FRED_SSP2, guest_ia32_fred_ssp2),
+ FIELD64(GUEST_IA32_FRED_SSP3, guest_ia32_fred_ssp3),
+ FIELD64(HOST_IA32_FRED_CONFIG, host_ia32_fred_config),
+ FIELD64(HOST_IA32_FRED_RSP1, host_ia32_fred_rsp1),
+ FIELD64(HOST_IA32_FRED_RSP2, host_ia32_fred_rsp2),
+ FIELD64(HOST_IA32_FRED_RSP3, host_ia32_fred_rsp3),
+ FIELD64(HOST_IA32_FRED_STKLVLS, host_ia32_fred_stklvls),
+ FIELD64(HOST_IA32_FRED_SSP1, host_ia32_fred_ssp1),
+ FIELD64(HOST_IA32_FRED_SSP2, host_ia32_fred_ssp2),
+ FIELD64(HOST_IA32_FRED_SSP3, host_ia32_fred_ssp3),
FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
FIELD(EXCEPTION_BITMAP, exception_bitmap),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 1fe3ed9108aa..f2a33d7007c9 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -186,6 +186,24 @@ struct __packed vmcs12 {
u16 host_tr_selector;
u16 guest_pml_index;
u64 secondary_vm_exit_controls;
+ u64 guest_ia32_fred_config;
+ u64 guest_ia32_fred_rsp1;
+ u64 guest_ia32_fred_rsp2;
+ u64 guest_ia32_fred_rsp3;
+ u64 guest_ia32_fred_stklvls;
+ u64 guest_ia32_fred_ssp1;
+ u64 guest_ia32_fred_ssp2;
+ u64 guest_ia32_fred_ssp3;
+ u64 host_ia32_fred_config;
+ u64 host_ia32_fred_rsp1;
+ u64 host_ia32_fred_rsp2;
+ u64 host_ia32_fred_rsp3;
+ u64 host_ia32_fred_stklvls;
+ u64 host_ia32_fred_ssp1;
+ u64 host_ia32_fred_ssp2;
+ u64 host_ia32_fred_ssp3;
+ u64 injected_event_data;
+ u64 original_event_data;
};
/*
@@ -362,6 +380,24 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(host_tr_selector, 994);
CHECK_OFFSET(guest_pml_index, 996);
CHECK_OFFSET(secondary_vm_exit_controls, 998);
+ CHECK_OFFSET(guest_ia32_fred_config, 1006);
+ CHECK_OFFSET(guest_ia32_fred_rsp1, 1014);
+ CHECK_OFFSET(guest_ia32_fred_rsp2, 1022);
+ CHECK_OFFSET(guest_ia32_fred_rsp3, 1030);
+ CHECK_OFFSET(guest_ia32_fred_stklvls, 1038);
+ CHECK_OFFSET(guest_ia32_fred_ssp1, 1046);
+ CHECK_OFFSET(guest_ia32_fred_ssp2, 1054);
+ CHECK_OFFSET(guest_ia32_fred_ssp3, 1062);
+ CHECK_OFFSET(host_ia32_fred_config, 1070);
+ CHECK_OFFSET(host_ia32_fred_rsp1, 1078);
+ CHECK_OFFSET(host_ia32_fred_rsp2, 1086);
+ CHECK_OFFSET(host_ia32_fred_rsp3, 1094);
+ CHECK_OFFSET(host_ia32_fred_stklvls, 1102);
+ CHECK_OFFSET(host_ia32_fred_ssp1, 1110);
+ CHECK_OFFSET(host_ia32_fred_ssp2, 1118);
+ CHECK_OFFSET(host_ia32_fred_ssp3, 1126);
+ CHECK_OFFSET(injected_event_data, 1134);
+ CHECK_OFFSET(original_event_data, 1142);
}
extern const unsigned short vmcs12_field_offsets[];
diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
index cad128d1657b..da338327c2b3 100644
--- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
@@ -74,6 +74,10 @@ SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base)
/* 64-bit */
SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address)
SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address)
+SHADOW_FIELD_RO(ORIGINAL_EVENT_DATA, original_event_data)
+SHADOW_FIELD_RO(ORIGINAL_EVENT_DATA_HIGH, original_event_data)
+SHADOW_FIELD_RW(INJECTED_EVENT_DATA, injected_event_data)
+SHADOW_FIELD_RW(INJECTED_EVENT_DATA_HIGH, injected_event_data)
#undef SHADOW_FIELD_RO
#undef SHADOW_FIELD_RW
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 617cbec5c9b3..885e48fe33c4 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -181,6 +181,39 @@ struct nested_vmx {
*/
u64 pre_vmenter_debugctl;
u64 pre_vmenter_bndcfgs;
+ u64 pre_vmenter_fred_config;
+ u64 pre_vmenter_fred_rsp1;
+ u64 pre_vmenter_fred_rsp2;
+ u64 pre_vmenter_fred_rsp3;
+ u64 pre_vmenter_fred_stklvls;
+ u64 pre_vmenter_fred_ssp1;
+ u64 pre_vmenter_fred_ssp2;
+ u64 pre_vmenter_fred_ssp3;
+
+ /*
+ * Used to snapshot MSRs that are conditionally saved on VM-Exit in
+ * order to propagate the guest's pre-VM-Exit value into vmcs12.
+ *
+ * FRED MSRs are *always* saved to vmcs02 since KVM always sets
+ * SECONDARY_VM_EXIT_SAVE_IA32_FRED. However an L1 VMM, although
+ * unlikely, might choose not to set this bit, resulting in FRED MSRs
+ * not being saved to vmcs12.
+ *
+ * It's not a problem when SECONDARY_VM_EXIT_LOAD_IA32_FRED is set,
+ * as the CPU immediately loads the host FRED state from vmcs12 into
+ * the FRED MSRs.
+ *
+ * But an L1 VMM may clear SECONDARY_VM_EXIT_LOAD_IA32_FRED, causing
+ * the CPU to retain the pre VM-Exit FRED MSRs.
+ */
+ u64 pre_vmexit_fred_config;
+ u64 pre_vmexit_fred_rsp1;
+ u64 pre_vmexit_fred_rsp2;
+ u64 pre_vmexit_fred_rsp3;
+ u64 pre_vmexit_fred_stklvls;
+ u64 pre_vmexit_fred_ssp1;
+ u64 pre_vmexit_fred_ssp2;
+ u64 pre_vmexit_fred_ssp3;
/* to migrate it to L1 if L2 writes to L1's CR8 directly */
int l1_tpr_threshold;
--
2.50.1
> @@ -4531,6 +4593,27 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, > vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); > vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); > vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); > + > + vmx->nested.pre_vmexit_fred_config = vmcs_read64(GUEST_IA32_FRED_CONFIG); > + vmx->nested.pre_vmexit_fred_rsp1 = vmcs_read64(GUEST_IA32_FRED_RSP1); > + vmx->nested.pre_vmexit_fred_rsp2 = vmcs_read64(GUEST_IA32_FRED_RSP2); > + vmx->nested.pre_vmexit_fred_rsp3 = vmcs_read64(GUEST_IA32_FRED_RSP3); > + vmx->nested.pre_vmexit_fred_stklvls = vmcs_read64(GUEST_IA32_FRED_STKLVLS); > + vmx->nested.pre_vmexit_fred_ssp1 = vmcs_read64(GUEST_IA32_FRED_SSP1); > + vmx->nested.pre_vmexit_fred_ssp2 = vmcs_read64(GUEST_IA32_FRED_SSP2); > + vmx->nested.pre_vmexit_fred_ssp3 = vmcs_read64(GUEST_IA32_FRED_SSP3); This ... > + > + if (nested_cpu_save_guest_fred_state(vmcs12)) { > + vmcs12->guest_ia32_fred_config = vmx->nested.pre_vmexit_fred_config; > + vmcs12->guest_ia32_fred_rsp1 = vmx->nested.pre_vmexit_fred_rsp1; > + vmcs12->guest_ia32_fred_rsp2 = vmx->nested.pre_vmexit_fred_rsp2; > + vmcs12->guest_ia32_fred_rsp3 = vmx->nested.pre_vmexit_fred_rsp3; > + vmcs12->guest_ia32_fred_stklvls = vmx->nested.pre_vmexit_fred_stklvls; > + vmcs12->guest_ia32_fred_ssp1 = vmx->nested.pre_vmexit_fred_ssp1; > + vmcs12->guest_ia32_fred_ssp2 = vmx->nested.pre_vmexit_fred_ssp2; > + vmcs12->guest_ia32_fred_ssp3 = vmx->nested.pre_vmexit_fred_ssp3; > + } > + > vmcs12->guest_pending_dbg_exceptions = > vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); > > @@ -4761,6 +4860,26 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, > vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); > vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); > > + if (nested_cpu_load_host_fred_state(vmcs12)) { > + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->host_ia32_fred_config); > + vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->host_ia32_fred_rsp1); > + vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->host_ia32_fred_rsp2); > + vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->host_ia32_fred_rsp3); > + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->host_ia32_fred_stklvls); > + vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->host_ia32_fred_ssp1); > + vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->host_ia32_fred_ssp2); > + vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->host_ia32_fred_ssp3); > + } else { > + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmx->nested.pre_vmexit_fred_config); > + vmcs_write64(GUEST_IA32_FRED_RSP1, vmx->nested.pre_vmexit_fred_rsp1); > + vmcs_write64(GUEST_IA32_FRED_RSP2, vmx->nested.pre_vmexit_fred_rsp2); > + vmcs_write64(GUEST_IA32_FRED_RSP3, vmx->nested.pre_vmexit_fred_rsp3); > + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmx->nested.pre_vmexit_fred_stklvls); > + vmcs_write64(GUEST_IA32_FRED_SSP1, vmx->nested.pre_vmexit_fred_ssp1); > + vmcs_write64(GUEST_IA32_FRED_SSP2, vmx->nested.pre_vmexit_fred_ssp2); > + vmcs_write64(GUEST_IA32_FRED_SSP3, vmx->nested.pre_vmexit_fred_ssp3); And this are actually nops. IOW, if I don't add this snippet of code, the CPU still retains the guest FRED MSRs, i.e., using guest FRED state from vmcs02 as that of vmcs01. > + } > + > /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ > if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) > vmcs_write64(GUEST_BNDCFGS, 0); > diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h > index 617cbec5c9b3..885e48fe33c4 100644 > --- a/arch/x86/kvm/vmx/vmx.h > +++ b/arch/x86/kvm/vmx/vmx.h > @@ -181,6 +181,39 @@ struct nested_vmx { > */ > u64 pre_vmenter_debugctl; > u64 pre_vmenter_bndcfgs; > + u64 pre_vmenter_fred_config; > + u64 pre_vmenter_fred_rsp1; > + u64 pre_vmenter_fred_rsp2; > + u64 pre_vmenter_fred_rsp3; > + u64 pre_vmenter_fred_stklvls; > + u64 pre_vmenter_fred_ssp1; > + u64 pre_vmenter_fred_ssp2; > + u64 pre_vmenter_fred_ssp3; > + > + /* > + * Used to snapshot MSRs that are conditionally saved on VM-Exit in > + * order to propagate the guest's pre-VM-Exit value into vmcs12. > + * > + * FRED MSRs are *always* saved to vmcs02 since KVM always sets > + * SECONDARY_VM_EXIT_SAVE_IA32_FRED. However an L1 VMM, although > + * unlikely, might choose not to set this bit, resulting in FRED MSRs > + * not being saved to vmcs12. > + * > + * It's not a problem when SECONDARY_VM_EXIT_LOAD_IA32_FRED is set, > + * as the CPU immediately loads the host FRED state from vmcs12 into > + * the FRED MSRs. > + * > + * But an L1 VMM may clear SECONDARY_VM_EXIT_LOAD_IA32_FRED, causing > + * the CPU to retain the pre VM-Exit FRED MSRs. > + */ However I want to make this logic explicit. So we might end up with adding the comment somewhere and removing all the pre_vmexit_fred_* changes? > + u64 pre_vmexit_fred_config; > + u64 pre_vmexit_fred_rsp1; > + u64 pre_vmexit_fred_rsp2; > + u64 pre_vmexit_fred_rsp3; > + u64 pre_vmexit_fred_stklvls; > + u64 pre_vmexit_fred_ssp1; > + u64 pre_vmexit_fred_ssp2; > + u64 pre_vmexit_fred_ssp3; > > /* to migrate it to L1 if L2 writes to L1's CR8 directly */ > int l1_tpr_threshold; Thanks! Xin
On 8/2/2025 10:33 AM, Xin Li wrote: >> @@ -4531,6 +4593,27 @@ static void sync_vmcs02_to_vmcs12_rare(struct >> kvm_vcpu *vcpu, >> vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); >> vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); >> vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); >> + >> + vmx->nested.pre_vmexit_fred_config = >> vmcs_read64(GUEST_IA32_FRED_CONFIG); >> + vmx->nested.pre_vmexit_fred_rsp1 = >> vmcs_read64(GUEST_IA32_FRED_RSP1); >> + vmx->nested.pre_vmexit_fred_rsp2 = >> vmcs_read64(GUEST_IA32_FRED_RSP2); >> + vmx->nested.pre_vmexit_fred_rsp3 = >> vmcs_read64(GUEST_IA32_FRED_RSP3); >> + vmx->nested.pre_vmexit_fred_stklvls = >> vmcs_read64(GUEST_IA32_FRED_STKLVLS); >> + vmx->nested.pre_vmexit_fred_ssp1 = >> vmcs_read64(GUEST_IA32_FRED_SSP1); >> + vmx->nested.pre_vmexit_fred_ssp2 = >> vmcs_read64(GUEST_IA32_FRED_SSP2); >> + vmx->nested.pre_vmexit_fred_ssp3 = >> vmcs_read64(GUEST_IA32_FRED_SSP3); > > This ... > >> + >> + if (nested_cpu_save_guest_fred_state(vmcs12)) { >> + vmcs12->guest_ia32_fred_config = vmx- >> >nested.pre_vmexit_fred_config; >> + vmcs12->guest_ia32_fred_rsp1 = vmx->nested.pre_vmexit_fred_rsp1; >> + vmcs12->guest_ia32_fred_rsp2 = vmx->nested.pre_vmexit_fred_rsp2; >> + vmcs12->guest_ia32_fred_rsp3 = vmx->nested.pre_vmexit_fred_rsp3; >> + vmcs12->guest_ia32_fred_stklvls = vmx- >> >nested.pre_vmexit_fred_stklvls; >> + vmcs12->guest_ia32_fred_ssp1 = vmx->nested.pre_vmexit_fred_ssp1; >> + vmcs12->guest_ia32_fred_ssp2 = vmx->nested.pre_vmexit_fred_ssp2; >> + vmcs12->guest_ia32_fred_ssp3 = vmx->nested.pre_vmexit_fred_ssp3; >> + } >> + >> vmcs12->guest_pending_dbg_exceptions = >> vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); >> @@ -4761,6 +4860,26 @@ static void load_vmcs12_host_state(struct >> kvm_vcpu *vcpu, >> vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); >> vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); >> + if (nested_cpu_load_host_fred_state(vmcs12)) { >> + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12- >> >host_ia32_fred_config); >> + vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->host_ia32_fred_rsp1); >> + vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->host_ia32_fred_rsp2); >> + vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->host_ia32_fred_rsp3); >> + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12- >> >host_ia32_fred_stklvls); >> + vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->host_ia32_fred_ssp1); >> + vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->host_ia32_fred_ssp2); >> + vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->host_ia32_fred_ssp3); >> + } else { >> + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmx- >> >nested.pre_vmexit_fred_config); >> + vmcs_write64(GUEST_IA32_FRED_RSP1, vmx- >> >nested.pre_vmexit_fred_rsp1); >> + vmcs_write64(GUEST_IA32_FRED_RSP2, vmx- >> >nested.pre_vmexit_fred_rsp2); >> + vmcs_write64(GUEST_IA32_FRED_RSP3, vmx- >> >nested.pre_vmexit_fred_rsp3); >> + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmx- >> >nested.pre_vmexit_fred_stklvls); >> + vmcs_write64(GUEST_IA32_FRED_SSP1, vmx- >> >nested.pre_vmexit_fred_ssp1); >> + vmcs_write64(GUEST_IA32_FRED_SSP2, vmx- >> >nested.pre_vmexit_fred_ssp2); >> + vmcs_write64(GUEST_IA32_FRED_SSP3, vmx- >> >nested.pre_vmexit_fred_ssp3); > > And this are actually nops. IOW, if I don't add this snippet of code, > the CPU still retains the guest FRED MSRs, i.e., using guest FRED state > from vmcs02 as that of vmcs01. I confused myself. They are NOT nops, because __nested_vmx_vmexit() switches from vmcs02 to vmcs01. The code should be (as the patch does): __nested_vmx_vmexit() { ... /* * Save guest FRED state of vmcs02 to nested.pre_vmexit_fred * no matter if SECONDARY_VM_EXIT_SAVE_IA32_FRED is set. */ sync_vmcs02_to_vmcs12(); ... vmx_switch_vmcs(); ... /* * Load nested.pre_vmexit_fred to guest FRED state of vmcs01 * if SECONDARY_VM_EXIT_LOAD_IA32_FRED is NOT set. */ load_vmcs12_host_state(); ... } As not setting any of the two FRED VM-Exit controls are rare cases, we need to add KVM tests with L1 that: 1) doesn't set SECONDARY_VM_EXIT_SAVE_IA32_FRED in VM-Exit controls. 2) doesn't set SECONDARY_VM_EXIT_LOAD_IA32_FRED in VM-Exit controls. 3) doesn't set both of the FRED VM-Exit controls. Looks we need a framework for all VM-Exit controls which control whether to save/load specific MSRs related to CPU features during VM-Exit?
© 2016 - 2025 Red Hat, Inc.