From: Xin Li <xin3.li@intel.com>
Extend nested VMX context management to include FRED-related VMCS fields.
This enables proper handling of FRED state during nested virtualization.
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Tested-by: Xuelian Guo <xuelian.guo@intel.com>
---
Change in v5:
* Add TB from Xuelian Guo.
Changes in v4:
* Advertise VMX nested exception as if the CPU supports it (Chao Gao).
* Split FRED state management controls (Chao Gao).
Changes in v3:
* Add and use nested_cpu_has_fred(vmcs12) because vmcs02 should be set
from vmcs12 if and only if the field is enabled in L1's VMX config
(Sean Christopherson).
* Fix coding style issues (Sean Christopherson).
Changes in v2:
* Remove hyperv TLFS related changes (Jeremi Piotrowski).
* Use kvm_cpu_cap_has() instead of cpu_feature_enabled() (Chao Gao).
---
Documentation/virt/kvm/x86/nested-vmx.rst | 18 +++++
arch/x86/kvm/vmx/capabilities.h | 5 ++
arch/x86/kvm/vmx/nested.c | 83 ++++++++++++++++++++++-
arch/x86/kvm/vmx/nested.h | 22 ++++++
arch/x86/kvm/vmx/vmcs12.c | 18 +++++
arch/x86/kvm/vmx/vmcs12.h | 36 ++++++++++
arch/x86/kvm/vmx/vmcs_shadow_fields.h | 4 ++
7 files changed, 184 insertions(+), 2 deletions(-)
diff --git a/Documentation/virt/kvm/x86/nested-vmx.rst b/Documentation/virt/kvm/x86/nested-vmx.rst
index e64ef231f310..87fa9f3877ab 100644
--- a/Documentation/virt/kvm/x86/nested-vmx.rst
+++ b/Documentation/virt/kvm/x86/nested-vmx.rst
@@ -218,6 +218,24 @@ struct shadow_vmcs is ever changed.
u16 host_gs_selector;
u16 host_tr_selector;
u64 secondary_vm_exit_controls;
+ u64 guest_ia32_fred_config;
+ u64 guest_ia32_fred_rsp1;
+ u64 guest_ia32_fred_rsp2;
+ u64 guest_ia32_fred_rsp3;
+ u64 guest_ia32_fred_stklvls;
+ u64 guest_ia32_fred_ssp1;
+ u64 guest_ia32_fred_ssp2;
+ u64 guest_ia32_fred_ssp3;
+ u64 host_ia32_fred_config;
+ u64 host_ia32_fred_rsp1;
+ u64 host_ia32_fred_rsp2;
+ u64 host_ia32_fred_rsp3;
+ u64 host_ia32_fred_stklvls;
+ u64 host_ia32_fred_ssp1;
+ u64 host_ia32_fred_ssp2;
+ u64 host_ia32_fred_ssp3;
+ u64 injected_event_data;
+ u64 original_event_data;
};
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 6d446574d770..cf7c93c33a98 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -78,6 +78,11 @@ static inline bool cpu_has_vmx_basic_inout(void)
return vmcs_config.basic & VMX_BASIC_INOUT;
}
+static inline bool cpu_has_vmx_nested_exception(void)
+{
+ return vmcs_config.basic & VMX_BASIC_NESTED_EXCEPTION;
+}
+
static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS &&
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 4405d176cf74..30d2de1243ef 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -705,6 +705,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FRED_RSP0, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FRED_SSP0, MSR_TYPE_RW);
#endif
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
@@ -1272,9 +1278,11 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
VMX_BASIC_INOUT |
- VMX_BASIC_TRUE_CTLS;
+ VMX_BASIC_TRUE_CTLS |
+ VMX_BASIC_NESTED_EXCEPTION;
- const u64 reserved_bits = GENMASK_ULL(63, 56) |
+ const u64 reserved_bits = GENMASK_ULL(63, 59) |
+ GENMASK_ULL(57, 56) |
GENMASK_ULL(47, 45) |
BIT_ULL(31);
@@ -2526,6 +2534,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
vmcs12->vm_entry_instruction_len);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
vmcs12->guest_interruptibility_info);
+ if (cpu_has_vmx_fred())
+ vmcs_write64(INJECTED_EVENT_DATA, vmcs12->injected_event_data);
vmx->loaded_vmcs->nmi_known_unmasked =
!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
} else {
@@ -2578,6 +2588,17 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
vmx_segment_cache_clear(vmx);
+
+ if (nested_cpu_load_guest_fred_states(vmcs12)) {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->guest_ia32_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->guest_ia32_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->guest_ia32_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->guest_ia32_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->guest_ia32_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->guest_ia32_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->guest_ia32_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->guest_ia32_fred_ssp3);
+ }
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -3864,6 +3885,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
u32 idt_vectoring;
unsigned int nr;
+ vmcs12->original_event_data = 0;
+
/*
* Per the SDM, VM-Exits due to double and triple faults are never
* considered to occur during event delivery, even if the double/triple
@@ -3902,6 +3925,13 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
vcpu->arch.exception.error_code;
}
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) &&
+ (vmcs12->guest_cr4 & X86_CR4_FRED) &&
+ (vcpu->arch.exception.nested))
+ idt_vectoring |= VECTORING_INFO_NESTED_EXCEPTION_MASK;
+
+ vmcs12->original_event_data = vcpu->arch.exception.event_data;
+
vmcs12->idt_vectoring_info_field = idt_vectoring;
} else if (vcpu->arch.nmi_injected) {
vmcs12->idt_vectoring_info_field =
@@ -4482,6 +4512,14 @@ static bool is_vmcs12_ext_field(unsigned long field)
case GUEST_TR_BASE:
case GUEST_GDTR_BASE:
case GUEST_IDTR_BASE:
+ case GUEST_IA32_FRED_CONFIG:
+ case GUEST_IA32_FRED_RSP1:
+ case GUEST_IA32_FRED_RSP2:
+ case GUEST_IA32_FRED_RSP3:
+ case GUEST_IA32_FRED_STKLVLS:
+ case GUEST_IA32_FRED_SSP1:
+ case GUEST_IA32_FRED_SSP2:
+ case GUEST_IA32_FRED_SSP3:
case GUEST_PENDING_DBG_EXCEPTIONS:
case GUEST_BNDCFGS:
return true;
@@ -4531,6 +4569,18 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+ if (nested_cpu_save_guest_fred_states(vmcs12)) {
+ vmcs12->guest_ia32_fred_config = vmcs_read64(GUEST_IA32_FRED_CONFIG);
+ vmcs12->guest_ia32_fred_rsp1 = vmcs_read64(GUEST_IA32_FRED_RSP1);
+ vmcs12->guest_ia32_fred_rsp2 = vmcs_read64(GUEST_IA32_FRED_RSP2);
+ vmcs12->guest_ia32_fred_rsp3 = vmcs_read64(GUEST_IA32_FRED_RSP3);
+ vmcs12->guest_ia32_fred_stklvls = vmcs_read64(GUEST_IA32_FRED_STKLVLS);
+ vmcs12->guest_ia32_fred_ssp1 = vmcs_read64(GUEST_IA32_FRED_SSP1);
+ vmcs12->guest_ia32_fred_ssp2 = vmcs_read64(GUEST_IA32_FRED_SSP2);
+ vmcs12->guest_ia32_fred_ssp3 = vmcs_read64(GUEST_IA32_FRED_SSP3);
+ }
+
vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
@@ -4684,6 +4734,21 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->vm_exit_intr_info = exit_intr_info;
vmcs12->vm_exit_instruction_len = exit_insn_len;
+
+ /*
+ * When there is a valid original event, the exiting event is a nested
+ * event during delivery of the earlier original event.
+ *
+ * FRED event delivery reflects this relationship by setting the value
+ * of the nested exception bit of VM-exit interruption information
+ * (aka exiting-event identification) to that of the valid bit of the
+ * IDT-vectoring information (aka original-event identification).
+ */
+ if ((vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) &&
+ (vmcs12->guest_cr4 & X86_CR4_FRED))
+ vmcs12->vm_exit_intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
+
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
/*
@@ -4761,6 +4826,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
+ if (nested_cpu_load_host_fred_states(vmcs12)) {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->host_ia32_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->host_ia32_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->host_ia32_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->host_ia32_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->host_ia32_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->host_ia32_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->host_ia32_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->host_ia32_fred_ssp3);
+ }
+
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
vmcs_write64(GUEST_BNDCFGS, 0);
@@ -7228,6 +7304,9 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
msrs->basic |= VMX_BASIC_TRUE_CTLS;
if (cpu_has_vmx_basic_inout())
msrs->basic |= VMX_BASIC_INOUT;
+
+ if (cpu_has_vmx_nested_exception())
+ msrs->basic |= VMX_BASIC_NESTED_EXCEPTION;
}
static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 6eedcfc91070..c6b69699e28e 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -249,6 +249,11 @@ static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12)
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
}
+static inline bool nested_cpu_has_secondary_vm_exit_controls(struct vmcs12 *vmcs12)
+{
+ return vmcs12->vm_exit_controls & VM_EXIT_ACTIVATE_SECONDARY_CONTROLS;
+}
+
static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
@@ -269,6 +274,23 @@ static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
}
+static inline bool nested_cpu_load_guest_fred_states(struct vmcs12 *vmcs12)
+{
+ return vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_FRED;
+}
+
+static inline bool nested_cpu_save_guest_fred_states(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_secondary_vm_exit_controls(vmcs12) &&
+ vmcs12->secondary_vm_exit_controls & SECONDARY_VM_EXIT_SAVE_IA32_FRED;
+}
+
+static inline bool nested_cpu_load_host_fred_states(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_secondary_vm_exit_controls(vmcs12) &&
+ vmcs12->secondary_vm_exit_controls & SECONDARY_VM_EXIT_LOAD_IA32_FRED;
+}
+
/*
* if fixed0[i] == 1: val[i] must be 1
* if fixed1[i] == 0: val[i] must be 0
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 9fac24fd5b4b..5fa63326deba 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -67,6 +67,24 @@ const unsigned short vmcs12_field_offsets[] = {
FIELD64(HOST_IA32_EFER, host_ia32_efer),
FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
FIELD64(SECONDARY_VM_EXIT_CONTROLS, secondary_vm_exit_controls),
+ FIELD64(INJECTED_EVENT_DATA, injected_event_data),
+ FIELD64(ORIGINAL_EVENT_DATA, original_event_data),
+ FIELD64(GUEST_IA32_FRED_CONFIG, guest_ia32_fred_config),
+ FIELD64(GUEST_IA32_FRED_RSP1, guest_ia32_fred_rsp1),
+ FIELD64(GUEST_IA32_FRED_RSP2, guest_ia32_fred_rsp2),
+ FIELD64(GUEST_IA32_FRED_RSP3, guest_ia32_fred_rsp3),
+ FIELD64(GUEST_IA32_FRED_STKLVLS, guest_ia32_fred_stklvls),
+ FIELD64(GUEST_IA32_FRED_SSP1, guest_ia32_fred_ssp1),
+ FIELD64(GUEST_IA32_FRED_SSP2, guest_ia32_fred_ssp2),
+ FIELD64(GUEST_IA32_FRED_SSP3, guest_ia32_fred_ssp3),
+ FIELD64(HOST_IA32_FRED_CONFIG, host_ia32_fred_config),
+ FIELD64(HOST_IA32_FRED_RSP1, host_ia32_fred_rsp1),
+ FIELD64(HOST_IA32_FRED_RSP2, host_ia32_fred_rsp2),
+ FIELD64(HOST_IA32_FRED_RSP3, host_ia32_fred_rsp3),
+ FIELD64(HOST_IA32_FRED_STKLVLS, host_ia32_fred_stklvls),
+ FIELD64(HOST_IA32_FRED_SSP1, host_ia32_fred_ssp1),
+ FIELD64(HOST_IA32_FRED_SSP2, host_ia32_fred_ssp2),
+ FIELD64(HOST_IA32_FRED_SSP3, host_ia32_fred_ssp3),
FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
FIELD(EXCEPTION_BITMAP, exception_bitmap),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 1fe3ed9108aa..f2a33d7007c9 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -186,6 +186,24 @@ struct __packed vmcs12 {
u16 host_tr_selector;
u16 guest_pml_index;
u64 secondary_vm_exit_controls;
+ u64 guest_ia32_fred_config;
+ u64 guest_ia32_fred_rsp1;
+ u64 guest_ia32_fred_rsp2;
+ u64 guest_ia32_fred_rsp3;
+ u64 guest_ia32_fred_stklvls;
+ u64 guest_ia32_fred_ssp1;
+ u64 guest_ia32_fred_ssp2;
+ u64 guest_ia32_fred_ssp3;
+ u64 host_ia32_fred_config;
+ u64 host_ia32_fred_rsp1;
+ u64 host_ia32_fred_rsp2;
+ u64 host_ia32_fred_rsp3;
+ u64 host_ia32_fred_stklvls;
+ u64 host_ia32_fred_ssp1;
+ u64 host_ia32_fred_ssp2;
+ u64 host_ia32_fred_ssp3;
+ u64 injected_event_data;
+ u64 original_event_data;
};
/*
@@ -362,6 +380,24 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(host_tr_selector, 994);
CHECK_OFFSET(guest_pml_index, 996);
CHECK_OFFSET(secondary_vm_exit_controls, 998);
+ CHECK_OFFSET(guest_ia32_fred_config, 1006);
+ CHECK_OFFSET(guest_ia32_fred_rsp1, 1014);
+ CHECK_OFFSET(guest_ia32_fred_rsp2, 1022);
+ CHECK_OFFSET(guest_ia32_fred_rsp3, 1030);
+ CHECK_OFFSET(guest_ia32_fred_stklvls, 1038);
+ CHECK_OFFSET(guest_ia32_fred_ssp1, 1046);
+ CHECK_OFFSET(guest_ia32_fred_ssp2, 1054);
+ CHECK_OFFSET(guest_ia32_fred_ssp3, 1062);
+ CHECK_OFFSET(host_ia32_fred_config, 1070);
+ CHECK_OFFSET(host_ia32_fred_rsp1, 1078);
+ CHECK_OFFSET(host_ia32_fred_rsp2, 1086);
+ CHECK_OFFSET(host_ia32_fred_rsp3, 1094);
+ CHECK_OFFSET(host_ia32_fred_stklvls, 1102);
+ CHECK_OFFSET(host_ia32_fred_ssp1, 1110);
+ CHECK_OFFSET(host_ia32_fred_ssp2, 1118);
+ CHECK_OFFSET(host_ia32_fred_ssp3, 1126);
+ CHECK_OFFSET(injected_event_data, 1134);
+ CHECK_OFFSET(original_event_data, 1142);
}
extern const unsigned short vmcs12_field_offsets[];
diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
index cad128d1657b..da338327c2b3 100644
--- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
@@ -74,6 +74,10 @@ SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base)
/* 64-bit */
SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address)
SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address)
+SHADOW_FIELD_RO(ORIGINAL_EVENT_DATA, original_event_data)
+SHADOW_FIELD_RO(ORIGINAL_EVENT_DATA_HIGH, original_event_data)
+SHADOW_FIELD_RW(INJECTED_EVENT_DATA, injected_event_data)
+SHADOW_FIELD_RW(INJECTED_EVENT_DATA_HIGH, injected_event_data)
#undef SHADOW_FIELD_RO
#undef SHADOW_FIELD_RW
--
2.50.1
>@@ -2578,6 +2588,17 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
> vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
>
> vmx_segment_cache_clear(vmx);
>+
>+ if (nested_cpu_load_guest_fred_states(vmcs12)) {
>+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->guest_ia32_fred_config);
>+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->guest_ia32_fred_rsp1);
>+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->guest_ia32_fred_rsp2);
>+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->guest_ia32_fred_rsp3);
>+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->guest_ia32_fred_stklvls);
>+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->guest_ia32_fred_ssp1);
>+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->guest_ia32_fred_ssp2);
>+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->guest_ia32_fred_ssp3);
>+ }
I think we need to snapshot L1's FRED MSR values before nested VM entry and
propagate them to GUEST_IA32_FRED* of VMCS02 for
!nested_cpu_load_guest_fred_states(vmcs12) case. i.e., from guest's view,
FRED MSRs shouldn't change across VM-entry if "Load guest FRED states" isn't
set.
Refer to the comment above 'pre_vmenter_debugctl' definition and also the
CET implmenetation*.
[*]: https://lore.kernel.org/kvm/20250704085027.182163-22-chao.gao@intel.com/
On 7/23/2025 11:50 PM, Chao Gao wrote:
>> @@ -2578,6 +2588,17 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
>> vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
>>
>> vmx_segment_cache_clear(vmx);
>> +
>> + if (nested_cpu_load_guest_fred_states(vmcs12)) {
>> + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->guest_ia32_fred_config);
>> + vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->guest_ia32_fred_rsp1);
>> + vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->guest_ia32_fred_rsp2);
>> + vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->guest_ia32_fred_rsp3);
>> + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->guest_ia32_fred_stklvls);
>> + vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->guest_ia32_fred_ssp1);
>> + vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->guest_ia32_fred_ssp2);
>> + vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->guest_ia32_fred_ssp3);
>> + }
>
> I think we need to snapshot L1's FRED MSR values before nested VM entry and
> propagate them to GUEST_IA32_FRED* of VMCS02 for
> !nested_cpu_load_guest_fred_states(vmcs12) case. i.e., from guest's view,
> FRED MSRs shouldn't change across VM-entry if "Load guest FRED states" isn't
> set.
>
> Refer to the comment above 'pre_vmenter_debugctl' definition and also the
> CET implmenetation*.
>
> [*]: https://lore.kernel.org/kvm/20250704085027.182163-22-chao.gao@intel.com/
>
Nice catch, it really took me a while to understand the issue.
From: Xin Li <xin3.li@intel.com>
Extend nested VMX context management to include FRED-related VMCS fields.
This enables proper handling of FRED state during nested virtualization.
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Tested-by: Xuelian Guo <xuelian.guo@intel.com>
---
Change in v5A:
* Handle FRED MSR pre-vmenter save/restore (Chao Gao).
Change in v5:
* Add TB from Xuelian Guo.
Changes in v4:
* Advertise VMX nested exception as if the CPU supports it (Chao Gao).
* Split FRED state management controls (Chao Gao).
Changes in v3:
* Add and use nested_cpu_has_fred(vmcs12) because vmcs02 should be set
from vmcs12 if and only if the field is enabled in L1's VMX config
(Sean Christopherson).
* Fix coding style issues (Sean Christopherson).
Changes in v2:
* Remove hyperv TLFS related changes (Jeremi Piotrowski).
* Use kvm_cpu_cap_has() instead of cpu_feature_enabled() (Chao Gao).
---
Documentation/virt/kvm/x86/nested-vmx.rst | 18 ++++
arch/x86/kvm/vmx/capabilities.h | 5 +
arch/x86/kvm/vmx/nested.c | 126 +++++++++++++++++++++-
arch/x86/kvm/vmx/nested.h | 22 ++++
arch/x86/kvm/vmx/vmcs12.c | 18 ++++
arch/x86/kvm/vmx/vmcs12.h | 36 +++++++
arch/x86/kvm/vmx/vmcs_shadow_fields.h | 4 +
arch/x86/kvm/vmx/vmx.h | 33 ++++++
8 files changed, 260 insertions(+), 2 deletions(-)
diff --git a/Documentation/virt/kvm/x86/nested-vmx.rst b/Documentation/virt/kvm/x86/nested-vmx.rst
index e64ef231f310..87fa9f3877ab 100644
--- a/Documentation/virt/kvm/x86/nested-vmx.rst
+++ b/Documentation/virt/kvm/x86/nested-vmx.rst
@@ -218,6 +218,24 @@ struct shadow_vmcs is ever changed.
u16 host_gs_selector;
u16 host_tr_selector;
u64 secondary_vm_exit_controls;
+ u64 guest_ia32_fred_config;
+ u64 guest_ia32_fred_rsp1;
+ u64 guest_ia32_fred_rsp2;
+ u64 guest_ia32_fred_rsp3;
+ u64 guest_ia32_fred_stklvls;
+ u64 guest_ia32_fred_ssp1;
+ u64 guest_ia32_fred_ssp2;
+ u64 guest_ia32_fred_ssp3;
+ u64 host_ia32_fred_config;
+ u64 host_ia32_fred_rsp1;
+ u64 host_ia32_fred_rsp2;
+ u64 host_ia32_fred_rsp3;
+ u64 host_ia32_fred_stklvls;
+ u64 host_ia32_fred_ssp1;
+ u64 host_ia32_fred_ssp2;
+ u64 host_ia32_fred_ssp3;
+ u64 injected_event_data;
+ u64 original_event_data;
};
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 6d446574d770..cf7c93c33a98 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -78,6 +78,11 @@ static inline bool cpu_has_vmx_basic_inout(void)
return vmcs_config.basic & VMX_BASIC_INOUT;
}
+static inline bool cpu_has_vmx_nested_exception(void)
+{
+ return vmcs_config.basic & VMX_BASIC_NESTED_EXCEPTION;
+}
+
static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS &&
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 4405d176cf74..1d9de811313a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -705,6 +705,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FRED_RSP0, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FRED_SSP0, MSR_TYPE_RW);
#endif
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
@@ -1272,9 +1278,11 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
VMX_BASIC_INOUT |
- VMX_BASIC_TRUE_CTLS;
+ VMX_BASIC_TRUE_CTLS |
+ VMX_BASIC_NESTED_EXCEPTION;
- const u64 reserved_bits = GENMASK_ULL(63, 56) |
+ const u64 reserved_bits = GENMASK_ULL(63, 59) |
+ GENMASK_ULL(57, 56) |
GENMASK_ULL(47, 45) |
BIT_ULL(31);
@@ -2526,6 +2534,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
vmcs12->vm_entry_instruction_len);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
vmcs12->guest_interruptibility_info);
+ if (cpu_has_vmx_fred())
+ vmcs_write64(INJECTED_EVENT_DATA, vmcs12->injected_event_data);
vmx->loaded_vmcs->nmi_known_unmasked =
!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
} else {
@@ -2578,6 +2588,17 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
vmx_segment_cache_clear(vmx);
+
+ if (nested_cpu_load_guest_fred_state(vmcs12)) {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->guest_ia32_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->guest_ia32_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->guest_ia32_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->guest_ia32_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->guest_ia32_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->guest_ia32_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->guest_ia32_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->guest_ia32_fred_ssp3);
+ }
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -2679,6 +2700,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
}
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_FRED) &&
+ (!vmx->nested.nested_run_pending || !nested_cpu_load_guest_fred_state(vmcs12))) {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmx->nested.pre_vmenter_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmx->nested.pre_vmenter_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmx->nested.pre_vmenter_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmx->nested.pre_vmenter_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmx->nested.pre_vmenter_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmx->nested.pre_vmenter_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmx->nested.pre_vmenter_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmx->nested.pre_vmenter_fred_ssp3);
+ }
+
if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
@@ -3549,6 +3582,18 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
kvm_service_local_tlb_flush_requests(vcpu);
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_FRED) &&
+ (!vmx->nested.nested_run_pending || !nested_cpu_load_guest_fred_state(vmcs12))) {
+ vmx->nested.pre_vmenter_fred_config = vmcs_read64(GUEST_IA32_FRED_CONFIG);
+ vmx->nested.pre_vmenter_fred_rsp1 = vmcs_read64(GUEST_IA32_FRED_RSP1);
+ vmx->nested.pre_vmenter_fred_rsp2 = vmcs_read64(GUEST_IA32_FRED_RSP2);
+ vmx->nested.pre_vmenter_fred_rsp3 = vmcs_read64(GUEST_IA32_FRED_RSP3);
+ vmx->nested.pre_vmenter_fred_stklvls = vmcs_read64(GUEST_IA32_FRED_STKLVLS);
+ vmx->nested.pre_vmenter_fred_ssp1 = vmcs_read64(GUEST_IA32_FRED_SSP1);
+ vmx->nested.pre_vmenter_fred_ssp2 = vmcs_read64(GUEST_IA32_FRED_SSP2);
+ vmx->nested.pre_vmenter_fred_ssp3 = vmcs_read64(GUEST_IA32_FRED_SSP3);
+ }
+
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
@@ -3864,6 +3909,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
u32 idt_vectoring;
unsigned int nr;
+ vmcs12->original_event_data = 0;
+
/*
* Per the SDM, VM-Exits due to double and triple faults are never
* considered to occur during event delivery, even if the double/triple
@@ -3902,6 +3949,13 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
vcpu->arch.exception.error_code;
}
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) &&
+ (vmcs12->guest_cr4 & X86_CR4_FRED) &&
+ (vcpu->arch.exception.nested))
+ idt_vectoring |= VECTORING_INFO_NESTED_EXCEPTION_MASK;
+
+ vmcs12->original_event_data = vcpu->arch.exception.event_data;
+
vmcs12->idt_vectoring_info_field = idt_vectoring;
} else if (vcpu->arch.nmi_injected) {
vmcs12->idt_vectoring_info_field =
@@ -4482,6 +4536,14 @@ static bool is_vmcs12_ext_field(unsigned long field)
case GUEST_TR_BASE:
case GUEST_GDTR_BASE:
case GUEST_IDTR_BASE:
+ case GUEST_IA32_FRED_CONFIG:
+ case GUEST_IA32_FRED_RSP1:
+ case GUEST_IA32_FRED_RSP2:
+ case GUEST_IA32_FRED_RSP3:
+ case GUEST_IA32_FRED_STKLVLS:
+ case GUEST_IA32_FRED_SSP1:
+ case GUEST_IA32_FRED_SSP2:
+ case GUEST_IA32_FRED_SSP3:
case GUEST_PENDING_DBG_EXCEPTIONS:
case GUEST_BNDCFGS:
return true;
@@ -4531,6 +4593,27 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+ vmx->nested.pre_vmexit_fred_config = vmcs_read64(GUEST_IA32_FRED_CONFIG);
+ vmx->nested.pre_vmexit_fred_rsp1 = vmcs_read64(GUEST_IA32_FRED_RSP1);
+ vmx->nested.pre_vmexit_fred_rsp2 = vmcs_read64(GUEST_IA32_FRED_RSP2);
+ vmx->nested.pre_vmexit_fred_rsp3 = vmcs_read64(GUEST_IA32_FRED_RSP3);
+ vmx->nested.pre_vmexit_fred_stklvls = vmcs_read64(GUEST_IA32_FRED_STKLVLS);
+ vmx->nested.pre_vmexit_fred_ssp1 = vmcs_read64(GUEST_IA32_FRED_SSP1);
+ vmx->nested.pre_vmexit_fred_ssp2 = vmcs_read64(GUEST_IA32_FRED_SSP2);
+ vmx->nested.pre_vmexit_fred_ssp3 = vmcs_read64(GUEST_IA32_FRED_SSP3);
+
+ if (nested_cpu_save_guest_fred_state(vmcs12)) {
+ vmcs12->guest_ia32_fred_config = vmx->nested.pre_vmexit_fred_config;
+ vmcs12->guest_ia32_fred_rsp1 = vmx->nested.pre_vmexit_fred_rsp1;
+ vmcs12->guest_ia32_fred_rsp2 = vmx->nested.pre_vmexit_fred_rsp2;
+ vmcs12->guest_ia32_fred_rsp3 = vmx->nested.pre_vmexit_fred_rsp3;
+ vmcs12->guest_ia32_fred_stklvls = vmx->nested.pre_vmexit_fred_stklvls;
+ vmcs12->guest_ia32_fred_ssp1 = vmx->nested.pre_vmexit_fred_ssp1;
+ vmcs12->guest_ia32_fred_ssp2 = vmx->nested.pre_vmexit_fred_ssp2;
+ vmcs12->guest_ia32_fred_ssp3 = vmx->nested.pre_vmexit_fred_ssp3;
+ }
+
vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
@@ -4684,6 +4767,21 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->vm_exit_intr_info = exit_intr_info;
vmcs12->vm_exit_instruction_len = exit_insn_len;
+
+ /*
+ * When there is a valid original event, the exiting event is a nested
+ * event during delivery of the earlier original event.
+ *
+ * FRED event delivery reflects this relationship by setting the value
+ * of the nested exception bit of VM-exit interruption information
+ * (aka exiting-event identification) to that of the valid bit of the
+ * IDT-vectoring information (aka original-event identification).
+ */
+ if ((vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) &&
+ (vmcs12->guest_cr4 & X86_CR4_FRED))
+ vmcs12->vm_exit_intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;
+
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
/*
@@ -4712,6 +4810,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
enum vm_entry_failure_code ignored;
struct kvm_segment seg;
@@ -4761,6 +4860,26 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
+ if (nested_cpu_load_host_fred_state(vmcs12)) {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->host_ia32_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->host_ia32_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->host_ia32_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->host_ia32_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->host_ia32_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->host_ia32_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->host_ia32_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->host_ia32_fred_ssp3);
+ } else {
+ vmcs_write64(GUEST_IA32_FRED_CONFIG, vmx->nested.pre_vmexit_fred_config);
+ vmcs_write64(GUEST_IA32_FRED_RSP1, vmx->nested.pre_vmexit_fred_rsp1);
+ vmcs_write64(GUEST_IA32_FRED_RSP2, vmx->nested.pre_vmexit_fred_rsp2);
+ vmcs_write64(GUEST_IA32_FRED_RSP3, vmx->nested.pre_vmexit_fred_rsp3);
+ vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmx->nested.pre_vmexit_fred_stklvls);
+ vmcs_write64(GUEST_IA32_FRED_SSP1, vmx->nested.pre_vmexit_fred_ssp1);
+ vmcs_write64(GUEST_IA32_FRED_SSP2, vmx->nested.pre_vmexit_fred_ssp2);
+ vmcs_write64(GUEST_IA32_FRED_SSP3, vmx->nested.pre_vmexit_fred_ssp3);
+ }
+
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
vmcs_write64(GUEST_BNDCFGS, 0);
@@ -7228,6 +7347,9 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
msrs->basic |= VMX_BASIC_TRUE_CTLS;
if (cpu_has_vmx_basic_inout())
msrs->basic |= VMX_BASIC_INOUT;
+
+ if (cpu_has_vmx_nested_exception())
+ msrs->basic |= VMX_BASIC_NESTED_EXCEPTION;
}
static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 6eedcfc91070..b16dccff3186 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -249,6 +249,11 @@ static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12)
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
}
+static inline bool nested_cpu_has_secondary_vm_exit_controls(struct vmcs12 *vmcs12)
+{
+ return vmcs12->vm_exit_controls & VM_EXIT_ACTIVATE_SECONDARY_CONTROLS;
+}
+
static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
@@ -269,6 +274,23 @@ static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
}
+static inline bool nested_cpu_load_guest_fred_state(struct vmcs12 *vmcs12)
+{
+ return vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_FRED;
+}
+
+static inline bool nested_cpu_save_guest_fred_state(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_secondary_vm_exit_controls(vmcs12) &&
+ vmcs12->secondary_vm_exit_controls & SECONDARY_VM_EXIT_SAVE_IA32_FRED;
+}
+
+static inline bool nested_cpu_load_host_fred_state(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_secondary_vm_exit_controls(vmcs12) &&
+ vmcs12->secondary_vm_exit_controls & SECONDARY_VM_EXIT_LOAD_IA32_FRED;
+}
+
/*
* if fixed0[i] == 1: val[i] must be 1
* if fixed1[i] == 0: val[i] must be 0
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 9fac24fd5b4b..5fa63326deba 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -67,6 +67,24 @@ const unsigned short vmcs12_field_offsets[] = {
FIELD64(HOST_IA32_EFER, host_ia32_efer),
FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
FIELD64(SECONDARY_VM_EXIT_CONTROLS, secondary_vm_exit_controls),
+ FIELD64(INJECTED_EVENT_DATA, injected_event_data),
+ FIELD64(ORIGINAL_EVENT_DATA, original_event_data),
+ FIELD64(GUEST_IA32_FRED_CONFIG, guest_ia32_fred_config),
+ FIELD64(GUEST_IA32_FRED_RSP1, guest_ia32_fred_rsp1),
+ FIELD64(GUEST_IA32_FRED_RSP2, guest_ia32_fred_rsp2),
+ FIELD64(GUEST_IA32_FRED_RSP3, guest_ia32_fred_rsp3),
+ FIELD64(GUEST_IA32_FRED_STKLVLS, guest_ia32_fred_stklvls),
+ FIELD64(GUEST_IA32_FRED_SSP1, guest_ia32_fred_ssp1),
+ FIELD64(GUEST_IA32_FRED_SSP2, guest_ia32_fred_ssp2),
+ FIELD64(GUEST_IA32_FRED_SSP3, guest_ia32_fred_ssp3),
+ FIELD64(HOST_IA32_FRED_CONFIG, host_ia32_fred_config),
+ FIELD64(HOST_IA32_FRED_RSP1, host_ia32_fred_rsp1),
+ FIELD64(HOST_IA32_FRED_RSP2, host_ia32_fred_rsp2),
+ FIELD64(HOST_IA32_FRED_RSP3, host_ia32_fred_rsp3),
+ FIELD64(HOST_IA32_FRED_STKLVLS, host_ia32_fred_stklvls),
+ FIELD64(HOST_IA32_FRED_SSP1, host_ia32_fred_ssp1),
+ FIELD64(HOST_IA32_FRED_SSP2, host_ia32_fred_ssp2),
+ FIELD64(HOST_IA32_FRED_SSP3, host_ia32_fred_ssp3),
FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
FIELD(EXCEPTION_BITMAP, exception_bitmap),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 1fe3ed9108aa..f2a33d7007c9 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -186,6 +186,24 @@ struct __packed vmcs12 {
u16 host_tr_selector;
u16 guest_pml_index;
u64 secondary_vm_exit_controls;
+ u64 guest_ia32_fred_config;
+ u64 guest_ia32_fred_rsp1;
+ u64 guest_ia32_fred_rsp2;
+ u64 guest_ia32_fred_rsp3;
+ u64 guest_ia32_fred_stklvls;
+ u64 guest_ia32_fred_ssp1;
+ u64 guest_ia32_fred_ssp2;
+ u64 guest_ia32_fred_ssp3;
+ u64 host_ia32_fred_config;
+ u64 host_ia32_fred_rsp1;
+ u64 host_ia32_fred_rsp2;
+ u64 host_ia32_fred_rsp3;
+ u64 host_ia32_fred_stklvls;
+ u64 host_ia32_fred_ssp1;
+ u64 host_ia32_fred_ssp2;
+ u64 host_ia32_fred_ssp3;
+ u64 injected_event_data;
+ u64 original_event_data;
};
/*
@@ -362,6 +380,24 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(host_tr_selector, 994);
CHECK_OFFSET(guest_pml_index, 996);
CHECK_OFFSET(secondary_vm_exit_controls, 998);
+ CHECK_OFFSET(guest_ia32_fred_config, 1006);
+ CHECK_OFFSET(guest_ia32_fred_rsp1, 1014);
+ CHECK_OFFSET(guest_ia32_fred_rsp2, 1022);
+ CHECK_OFFSET(guest_ia32_fred_rsp3, 1030);
+ CHECK_OFFSET(guest_ia32_fred_stklvls, 1038);
+ CHECK_OFFSET(guest_ia32_fred_ssp1, 1046);
+ CHECK_OFFSET(guest_ia32_fred_ssp2, 1054);
+ CHECK_OFFSET(guest_ia32_fred_ssp3, 1062);
+ CHECK_OFFSET(host_ia32_fred_config, 1070);
+ CHECK_OFFSET(host_ia32_fred_rsp1, 1078);
+ CHECK_OFFSET(host_ia32_fred_rsp2, 1086);
+ CHECK_OFFSET(host_ia32_fred_rsp3, 1094);
+ CHECK_OFFSET(host_ia32_fred_stklvls, 1102);
+ CHECK_OFFSET(host_ia32_fred_ssp1, 1110);
+ CHECK_OFFSET(host_ia32_fred_ssp2, 1118);
+ CHECK_OFFSET(host_ia32_fred_ssp3, 1126);
+ CHECK_OFFSET(injected_event_data, 1134);
+ CHECK_OFFSET(original_event_data, 1142);
}
extern const unsigned short vmcs12_field_offsets[];
diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
index cad128d1657b..da338327c2b3 100644
--- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
@@ -74,6 +74,10 @@ SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base)
/* 64-bit */
SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address)
SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address)
+SHADOW_FIELD_RO(ORIGINAL_EVENT_DATA, original_event_data)
+SHADOW_FIELD_RO(ORIGINAL_EVENT_DATA_HIGH, original_event_data)
+SHADOW_FIELD_RW(INJECTED_EVENT_DATA, injected_event_data)
+SHADOW_FIELD_RW(INJECTED_EVENT_DATA_HIGH, injected_event_data)
#undef SHADOW_FIELD_RO
#undef SHADOW_FIELD_RW
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 617cbec5c9b3..885e48fe33c4 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -181,6 +181,39 @@ struct nested_vmx {
*/
u64 pre_vmenter_debugctl;
u64 pre_vmenter_bndcfgs;
+ u64 pre_vmenter_fred_config;
+ u64 pre_vmenter_fred_rsp1;
+ u64 pre_vmenter_fred_rsp2;
+ u64 pre_vmenter_fred_rsp3;
+ u64 pre_vmenter_fred_stklvls;
+ u64 pre_vmenter_fred_ssp1;
+ u64 pre_vmenter_fred_ssp2;
+ u64 pre_vmenter_fred_ssp3;
+
+ /*
+ * Used to snapshot MSRs that are conditionally saved on VM-Exit in
+ * order to propagate the guest's pre-VM-Exit value into vmcs12.
+ *
+ * FRED MSRs are *always* saved to vmcs02 since KVM always sets
+ * SECONDARY_VM_EXIT_SAVE_IA32_FRED. However an L1 VMM, although
+ * unlikely, might choose not to set this bit, resulting in FRED MSRs
+ * not being saved to vmcs12.
+ *
+ * It's not a problem when SECONDARY_VM_EXIT_LOAD_IA32_FRED is set,
+ * as the CPU immediately loads the host FRED state from vmcs12 into
+ * the FRED MSRs.
+ *
+ * But an L1 VMM may clear SECONDARY_VM_EXIT_LOAD_IA32_FRED, causing
+ * the CPU to retain the pre VM-Exit FRED MSRs.
+ */
+ u64 pre_vmexit_fred_config;
+ u64 pre_vmexit_fred_rsp1;
+ u64 pre_vmexit_fred_rsp2;
+ u64 pre_vmexit_fred_rsp3;
+ u64 pre_vmexit_fred_stklvls;
+ u64 pre_vmexit_fred_ssp1;
+ u64 pre_vmexit_fred_ssp2;
+ u64 pre_vmexit_fred_ssp3;
/* to migrate it to L1 if L2 writes to L1's CR8 directly */
int l1_tpr_threshold;
--
2.50.1
> @@ -4531,6 +4593,27 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
> vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> +
> + vmx->nested.pre_vmexit_fred_config = vmcs_read64(GUEST_IA32_FRED_CONFIG);
> + vmx->nested.pre_vmexit_fred_rsp1 = vmcs_read64(GUEST_IA32_FRED_RSP1);
> + vmx->nested.pre_vmexit_fred_rsp2 = vmcs_read64(GUEST_IA32_FRED_RSP2);
> + vmx->nested.pre_vmexit_fred_rsp3 = vmcs_read64(GUEST_IA32_FRED_RSP3);
> + vmx->nested.pre_vmexit_fred_stklvls = vmcs_read64(GUEST_IA32_FRED_STKLVLS);
> + vmx->nested.pre_vmexit_fred_ssp1 = vmcs_read64(GUEST_IA32_FRED_SSP1);
> + vmx->nested.pre_vmexit_fred_ssp2 = vmcs_read64(GUEST_IA32_FRED_SSP2);
> + vmx->nested.pre_vmexit_fred_ssp3 = vmcs_read64(GUEST_IA32_FRED_SSP3);
This ...
> +
> + if (nested_cpu_save_guest_fred_state(vmcs12)) {
> + vmcs12->guest_ia32_fred_config = vmx->nested.pre_vmexit_fred_config;
> + vmcs12->guest_ia32_fred_rsp1 = vmx->nested.pre_vmexit_fred_rsp1;
> + vmcs12->guest_ia32_fred_rsp2 = vmx->nested.pre_vmexit_fred_rsp2;
> + vmcs12->guest_ia32_fred_rsp3 = vmx->nested.pre_vmexit_fred_rsp3;
> + vmcs12->guest_ia32_fred_stklvls = vmx->nested.pre_vmexit_fred_stklvls;
> + vmcs12->guest_ia32_fred_ssp1 = vmx->nested.pre_vmexit_fred_ssp1;
> + vmcs12->guest_ia32_fred_ssp2 = vmx->nested.pre_vmexit_fred_ssp2;
> + vmcs12->guest_ia32_fred_ssp3 = vmx->nested.pre_vmexit_fred_ssp3;
> + }
> +
> vmcs12->guest_pending_dbg_exceptions =
> vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>
> @@ -4761,6 +4860,26 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
> vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
> vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
>
> + if (nested_cpu_load_host_fred_state(vmcs12)) {
> + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->host_ia32_fred_config);
> + vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->host_ia32_fred_rsp1);
> + vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->host_ia32_fred_rsp2);
> + vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->host_ia32_fred_rsp3);
> + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->host_ia32_fred_stklvls);
> + vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->host_ia32_fred_ssp1);
> + vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->host_ia32_fred_ssp2);
> + vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->host_ia32_fred_ssp3);
> + } else {
> + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmx->nested.pre_vmexit_fred_config);
> + vmcs_write64(GUEST_IA32_FRED_RSP1, vmx->nested.pre_vmexit_fred_rsp1);
> + vmcs_write64(GUEST_IA32_FRED_RSP2, vmx->nested.pre_vmexit_fred_rsp2);
> + vmcs_write64(GUEST_IA32_FRED_RSP3, vmx->nested.pre_vmexit_fred_rsp3);
> + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmx->nested.pre_vmexit_fred_stklvls);
> + vmcs_write64(GUEST_IA32_FRED_SSP1, vmx->nested.pre_vmexit_fred_ssp1);
> + vmcs_write64(GUEST_IA32_FRED_SSP2, vmx->nested.pre_vmexit_fred_ssp2);
> + vmcs_write64(GUEST_IA32_FRED_SSP3, vmx->nested.pre_vmexit_fred_ssp3);
And this are actually nops. IOW, if I don't add this snippet of code,
the CPU still retains the guest FRED MSRs, i.e., using guest FRED state
from vmcs02 as that of vmcs01.
> + }
> +
> /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
> if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
> vmcs_write64(GUEST_BNDCFGS, 0);
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 617cbec5c9b3..885e48fe33c4 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -181,6 +181,39 @@ struct nested_vmx {
> */
> u64 pre_vmenter_debugctl;
> u64 pre_vmenter_bndcfgs;
> + u64 pre_vmenter_fred_config;
> + u64 pre_vmenter_fred_rsp1;
> + u64 pre_vmenter_fred_rsp2;
> + u64 pre_vmenter_fred_rsp3;
> + u64 pre_vmenter_fred_stklvls;
> + u64 pre_vmenter_fred_ssp1;
> + u64 pre_vmenter_fred_ssp2;
> + u64 pre_vmenter_fred_ssp3;
> +
> + /*
> + * Used to snapshot MSRs that are conditionally saved on VM-Exit in
> + * order to propagate the guest's pre-VM-Exit value into vmcs12.
> + *
> + * FRED MSRs are *always* saved to vmcs02 since KVM always sets
> + * SECONDARY_VM_EXIT_SAVE_IA32_FRED. However an L1 VMM, although
> + * unlikely, might choose not to set this bit, resulting in FRED MSRs
> + * not being saved to vmcs12.
> + *
> + * It's not a problem when SECONDARY_VM_EXIT_LOAD_IA32_FRED is set,
> + * as the CPU immediately loads the host FRED state from vmcs12 into
> + * the FRED MSRs.
> + *
> + * But an L1 VMM may clear SECONDARY_VM_EXIT_LOAD_IA32_FRED, causing
> + * the CPU to retain the pre VM-Exit FRED MSRs.
> + */
However I want to make this logic explicit. So we might end up with
adding the comment somewhere and removing all the pre_vmexit_fred_*
changes?
> + u64 pre_vmexit_fred_config;
> + u64 pre_vmexit_fred_rsp1;
> + u64 pre_vmexit_fred_rsp2;
> + u64 pre_vmexit_fred_rsp3;
> + u64 pre_vmexit_fred_stklvls;
> + u64 pre_vmexit_fred_ssp1;
> + u64 pre_vmexit_fred_ssp2;
> + u64 pre_vmexit_fred_ssp3;
>
> /* to migrate it to L1 if L2 writes to L1's CR8 directly */
> int l1_tpr_threshold;
Thanks!
Xin
On 8/2/2025 10:33 AM, Xin Li wrote:
>> @@ -4531,6 +4593,27 @@ static void sync_vmcs02_to_vmcs12_rare(struct
>> kvm_vcpu *vcpu,
>> vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
>> vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
>> vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
>> +
>> + vmx->nested.pre_vmexit_fred_config =
>> vmcs_read64(GUEST_IA32_FRED_CONFIG);
>> + vmx->nested.pre_vmexit_fred_rsp1 =
>> vmcs_read64(GUEST_IA32_FRED_RSP1);
>> + vmx->nested.pre_vmexit_fred_rsp2 =
>> vmcs_read64(GUEST_IA32_FRED_RSP2);
>> + vmx->nested.pre_vmexit_fred_rsp3 =
>> vmcs_read64(GUEST_IA32_FRED_RSP3);
>> + vmx->nested.pre_vmexit_fred_stklvls =
>> vmcs_read64(GUEST_IA32_FRED_STKLVLS);
>> + vmx->nested.pre_vmexit_fred_ssp1 =
>> vmcs_read64(GUEST_IA32_FRED_SSP1);
>> + vmx->nested.pre_vmexit_fred_ssp2 =
>> vmcs_read64(GUEST_IA32_FRED_SSP2);
>> + vmx->nested.pre_vmexit_fred_ssp3 =
>> vmcs_read64(GUEST_IA32_FRED_SSP3);
>
> This ...
>
>> +
>> + if (nested_cpu_save_guest_fred_state(vmcs12)) {
>> + vmcs12->guest_ia32_fred_config = vmx-
>> >nested.pre_vmexit_fred_config;
>> + vmcs12->guest_ia32_fred_rsp1 = vmx->nested.pre_vmexit_fred_rsp1;
>> + vmcs12->guest_ia32_fred_rsp2 = vmx->nested.pre_vmexit_fred_rsp2;
>> + vmcs12->guest_ia32_fred_rsp3 = vmx->nested.pre_vmexit_fred_rsp3;
>> + vmcs12->guest_ia32_fred_stklvls = vmx-
>> >nested.pre_vmexit_fred_stklvls;
>> + vmcs12->guest_ia32_fred_ssp1 = vmx->nested.pre_vmexit_fred_ssp1;
>> + vmcs12->guest_ia32_fred_ssp2 = vmx->nested.pre_vmexit_fred_ssp2;
>> + vmcs12->guest_ia32_fred_ssp3 = vmx->nested.pre_vmexit_fred_ssp3;
>> + }
>> +
>> vmcs12->guest_pending_dbg_exceptions =
>> vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>> @@ -4761,6 +4860,26 @@ static void load_vmcs12_host_state(struct
>> kvm_vcpu *vcpu,
>> vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
>> vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
>> + if (nested_cpu_load_host_fred_state(vmcs12)) {
>> + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12-
>> >host_ia32_fred_config);
>> + vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->host_ia32_fred_rsp1);
>> + vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->host_ia32_fred_rsp2);
>> + vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->host_ia32_fred_rsp3);
>> + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12-
>> >host_ia32_fred_stklvls);
>> + vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->host_ia32_fred_ssp1);
>> + vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->host_ia32_fred_ssp2);
>> + vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->host_ia32_fred_ssp3);
>> + } else {
>> + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmx-
>> >nested.pre_vmexit_fred_config);
>> + vmcs_write64(GUEST_IA32_FRED_RSP1, vmx-
>> >nested.pre_vmexit_fred_rsp1);
>> + vmcs_write64(GUEST_IA32_FRED_RSP2, vmx-
>> >nested.pre_vmexit_fred_rsp2);
>> + vmcs_write64(GUEST_IA32_FRED_RSP3, vmx-
>> >nested.pre_vmexit_fred_rsp3);
>> + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmx-
>> >nested.pre_vmexit_fred_stklvls);
>> + vmcs_write64(GUEST_IA32_FRED_SSP1, vmx-
>> >nested.pre_vmexit_fred_ssp1);
>> + vmcs_write64(GUEST_IA32_FRED_SSP2, vmx-
>> >nested.pre_vmexit_fred_ssp2);
>> + vmcs_write64(GUEST_IA32_FRED_SSP3, vmx-
>> >nested.pre_vmexit_fred_ssp3);
>
> And this are actually nops. IOW, if I don't add this snippet of code,
> the CPU still retains the guest FRED MSRs, i.e., using guest FRED state
> from vmcs02 as that of vmcs01.
I confused myself. They are NOT nops, because __nested_vmx_vmexit()
switches from vmcs02 to vmcs01. The code should be (as the patch does):
__nested_vmx_vmexit()
{
...
/*
* Save guest FRED state of vmcs02 to nested.pre_vmexit_fred
* no matter if SECONDARY_VM_EXIT_SAVE_IA32_FRED is set.
*/
sync_vmcs02_to_vmcs12();
...
vmx_switch_vmcs();
...
/*
* Load nested.pre_vmexit_fred to guest FRED state of vmcs01
* if SECONDARY_VM_EXIT_LOAD_IA32_FRED is NOT set.
*/
load_vmcs12_host_state();
...
}
As not setting any of the two FRED VM-Exit controls are rare cases, we
need to add KVM tests with L1 that:
1) doesn't set SECONDARY_VM_EXIT_SAVE_IA32_FRED in VM-Exit controls.
2) doesn't set SECONDARY_VM_EXIT_LOAD_IA32_FRED in VM-Exit controls.
3) doesn't set both of the FRED VM-Exit controls.
Looks we need a framework for all VM-Exit controls which control whether
to save/load specific MSRs related to CPU features during VM-Exit?
© 2016 - 2026 Red Hat, Inc.