Disallow access (VMREAD/VMWRITE), both emulated and via a shadow VMCS, to
VMCS fields that the loaded incarnation of KVM doesn't support, e.g. due
to lack of hardware support, as a middle ground between allowing access to
any vmcs12 field defined by KVM (current behavior) and gating access based
on the userspace-defined vCPU model (the most functionally correct, but
very costly, implementation).
Disallowing access to unsupported fields helps close the virtualization
hole a tiny bit (see below), but the main motivation is to avoid having to
weed out unsupported fields when synchronizing between vmcs12 and a shadow
VMCS.  Because shadow VMCS accesses are done via VMREAD and VMWRITE, KVM
_must_ filter out unsupported fields (or eat VMREAD/VMWRITE failures), and
filtering out only the shadow VMCS fields would take about the same amount
of effort while arguably being much more confusing.
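
For context, both the emulated VMREAD/VMWRITE paths and shadow VMCS setup
funnel through the same table lookup, so filtering the table once at setup
covers everything.  A minimal sketch of that lookup, mirroring
get_vmcs12_field_offset() as it exists in vmcs12.h (the encoding-validity
check and array_index_nospec() hardening are elided here):

	static inline short get_vmcs12_field_offset(unsigned long field)
	{
		unsigned int index = ENC_TO_VMCS12_IDX(field);

		if (index >= nr_vmcs12_fields)
			return -ENOENT;

		/*
		 * A zeroed entry means the field is unsupported; callers
		 * treat a negative return as "reject the field".
		 */
		if (!vmcs12_field_offsets[index])
			return -ENOENT;

		return vmcs12_field_offsets[index];
	}

With unsupported entries zeroed at setup time, every consumer of the table
rejects those fields without needing its own capability checks.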
As a bonus, this also fixes a KVM-Unit-Tests failure when running on
_hardware_ without support for TSC Scaling, which manifests with the same
signature as the bug fixed by commit ba1f82456ba8 ("KVM: nVMX: Dynamically
compute max VMCS index for vmcs12"):

  FAIL: VMX_VMCS_ENUM.MAX_INDEX expected: 19, actual: 17

Dynamically computing the max VMCS index only resolved the issue of KVM
hardcoding the max index; the computed index was "good enough" only for
CPUs that do support TSC Scaling.
Reviewed-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Xin Li <xin@zytor.com>
Cc: Xiaoyao Li <xiaoyao.li@intel.com>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://lore.kernel.org/all/20251026201911.505204-22-xin@zytor.com
Link: https://lore.kernel.org/all/YR2Tf9WPNEzrE7Xg@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/nested.c | 15 +++++----
 arch/x86/kvm/vmx/vmcs.h   |  8 +++++
 arch/x86/kvm/vmx/vmcs12.c | 70 +++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx/vmcs12.h |  6 ++--
 4 files changed, 89 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 61113ead3d7b..ac7a17560c8f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -111,6 +111,9 @@ static void init_vmcs_shadow_fields(void)
 			  field <= GUEST_TR_AR_BYTES,
 			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
 
+		if (get_vmcs12_field_offset(field) < 0)
+			continue;
+
 		/*
 		 * PML and the preemption timer can be emulated, but the
 		 * processor cannot vmwrite to fields that don't exist
@@ -7074,12 +7077,6 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void)
 	}
 }
 
-/*
- * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
- * that madness to get the encoding for comparison.
- */
-#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
-
 static u64 nested_vmx_calc_vmcs_enum_msr(void)
 {
 	/*
@@ -7407,6 +7404,12 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 {
 	int i;
 
+	/*
+	 * Note! The set of supported vmcs12 fields is consumed by both VMX
+	 * MSR and shadow VMCS setup.
+	 */
+	nested_vmx_setup_vmcs12_fields();
+
 	nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
 
 	if (!cpu_has_vmx_shadow_vmcs())
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 9aa204c87661..66d747e265b1 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -11,7 +11,15 @@
 
 #include "capabilities.h"
 
+/*
+ * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6 as a very
+ * rudimentary compression of the range of indices.  The compression ratio is
+ * good enough to allow KVM to use a (very sparsely populated) array without
+ * wasting too much memory, while the "algorithm" is fast enough to be used to
+ * lookup vmcs12 fields on-demand, e.g. for emulation.
+ */
 #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+#define VMCS12_IDX_TO_ENC(idx) ROL16(idx, 10)
 #define ENC_TO_VMCS12_IDX(enc) ROL16(enc, 6)
 
 struct vmcs_hdr {
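
An editorial aside, not part of the patch, to make the rotation concrete:
TSC_MULTIPLIER's architectural encoding is 0x2032 (per vmx.h), and the
standalone snippet below demonstrates the compression round-trip.

	#include <stdint.h>
	#include <stdio.h>

	#define ROL16(val, n) ((uint16_t)(((uint16_t)(val) << (n)) | \
				((uint16_t)(val) >> (16 - (n)))))
	#define VMCS12_IDX_TO_ENC(idx) ROL16(idx, 10)
	#define ENC_TO_VMCS12_IDX(enc) ROL16(enc, 6)

	int main(void)
	{
		/* TSC_MULTIPLIER's encoding (SDM vol. 3, appendix B). */
		uint16_t enc = 0x2032;
		uint16_t idx = ENC_TO_VMCS12_IDX(enc);

		/* 0x2032 compresses to 0x0c88; rotating back recovers it. */
		printf("enc=0x%04x -> idx=0x%04x -> enc=0x%04x\n",
		       enc, idx, VMCS12_IDX_TO_ENC(idx));
		return 0;
	}

Rotating left by 6 and right by 10 are inverses modulo 16 bits, which is
why the two macros round-trip exactly.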
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index c2ac9e1a50b3..1ebe67c384ad 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -9,7 +9,7 @@
 	FIELD(number, name),						\
 	[ENC_TO_VMCS12_IDX(number##_HIGH)] = VMCS12_OFFSET(name) + sizeof(u32)
 
-const unsigned short vmcs12_field_offsets[] = {
+static const u16 kvm_supported_vmcs12_field_offsets[] __initconst = {
 	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
 	FIELD(POSTED_INTR_NV, posted_intr_nv),
 	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@@ -158,4 +158,70 @@ const unsigned short vmcs12_field_offsets[] = {
 	FIELD(HOST_SSP, host_ssp),
 	FIELD(HOST_INTR_SSP_TABLE, host_ssp_tbl),
 };
-const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);
+
+u16 vmcs12_field_offsets[ARRAY_SIZE(kvm_supported_vmcs12_field_offsets)] __ro_after_init;
+unsigned int nr_vmcs12_fields __ro_after_init;
+
+#define VMCS12_CASE64(enc) case enc##_HIGH: case enc
+
+static __init bool cpu_has_vmcs12_field(unsigned int idx)
+{
+	switch (VMCS12_IDX_TO_ENC(idx)) {
+	case VIRTUAL_PROCESSOR_ID:
+		return cpu_has_vmx_vpid();
+	case POSTED_INTR_NV:
+		return cpu_has_vmx_posted_intr();
+	VMCS12_CASE64(TSC_MULTIPLIER):
+		return cpu_has_vmx_tsc_scaling();
+	case TPR_THRESHOLD:
+	VMCS12_CASE64(VIRTUAL_APIC_PAGE_ADDR):
+		return cpu_has_vmx_tpr_shadow();
+	VMCS12_CASE64(APIC_ACCESS_ADDR):
+		return cpu_has_vmx_virtualize_apic_accesses();
+	VMCS12_CASE64(POSTED_INTR_DESC_ADDR):
+		return cpu_has_vmx_posted_intr();
+	case GUEST_INTR_STATUS:
+		return cpu_has_vmx_virtual_intr_delivery();
+	VMCS12_CASE64(VM_FUNCTION_CONTROL):
+	VMCS12_CASE64(EPTP_LIST_ADDRESS):
+		return cpu_has_vmx_vmfunc();
+	VMCS12_CASE64(EPT_POINTER):
+		return cpu_has_vmx_ept();
+	VMCS12_CASE64(XSS_EXIT_BITMAP):
+		return cpu_has_vmx_xsaves();
+	VMCS12_CASE64(ENCLS_EXITING_BITMAP):
+		return cpu_has_vmx_encls_vmexit();
+	VMCS12_CASE64(GUEST_IA32_PERF_GLOBAL_CTRL):
+	VMCS12_CASE64(HOST_IA32_PERF_GLOBAL_CTRL):
+		return cpu_has_load_perf_global_ctrl();
+	case SECONDARY_VM_EXEC_CONTROL:
+		return cpu_has_secondary_exec_ctrls();
+	case GUEST_S_CET:
+	case GUEST_SSP:
+	case GUEST_INTR_SSP_TABLE:
+	case HOST_S_CET:
+	case HOST_SSP:
+	case HOST_INTR_SSP_TABLE:
+		return cpu_has_load_cet_ctrl();
+
+	/* KVM always emulates PML and the VMX preemption timer in software. */
+	case GUEST_PML_INDEX:
+	case VMX_PREEMPTION_TIMER_VALUE:
+	default:
+		return true;
+	}
+}
+
+void __init nested_vmx_setup_vmcs12_fields(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(kvm_supported_vmcs12_field_offsets); i++) {
+		if (!kvm_supported_vmcs12_field_offsets[i] ||
+		    !cpu_has_vmcs12_field(i))
+			continue;
+
+		vmcs12_field_offsets[i] = kvm_supported_vmcs12_field_offsets[i];
+		nr_vmcs12_fields = i + 1;
+	}
+}
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 7a5fdd9b27ba..21cd1b75e4fd 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -374,8 +374,10 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(guest_pml_index, 996);
}
-extern const unsigned short vmcs12_field_offsets[];
-extern const unsigned int nr_vmcs12_fields;
+extern u16 vmcs12_field_offsets[] __ro_after_init;
+extern unsigned int nr_vmcs12_fields __ro_after_init;
+
+void __init nested_vmx_setup_vmcs12_fields(void);
static inline short get_vmcs12_field_offset(unsigned long field)
{
--
2.52.0.457.g6b5491de43-goog
On 1/9/2026 12:15 PM, Sean Christopherson wrote:
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 61113ead3d7b..ac7a17560c8f 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -111,6 +111,9 @@ static void init_vmcs_shadow_fields(void)
>  			  field <= GUEST_TR_AR_BYTES,
>  			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
>  
> +		if (get_vmcs12_field_offset(field) < 0)
> +			continue;
> +

Why doesn't shadow_read_only_fields[] need such a guard?

IIUC, copy_vmcs12_to_shadow() will VMWRITE a shadowed read-only field even if
it doesn't exist on the hardware?
On Fri, Jan 09, 2026, Xiaoyao Li wrote:
> On 1/9/2026 12:15 PM, Sean Christopherson wrote:
> > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > index 61113ead3d7b..ac7a17560c8f 100644
> > --- a/arch/x86/kvm/vmx/nested.c
> > +++ b/arch/x86/kvm/vmx/nested.c
> > @@ -111,6 +111,9 @@ static void init_vmcs_shadow_fields(void)
> >  			  field <= GUEST_TR_AR_BYTES,
> >  			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
> >  
> > +		if (get_vmcs12_field_offset(field) < 0)
> > +			continue;
> > +
>
> Why doesn't shadow_read_only_fields[] need such a guard?
>
> IIUC, copy_vmcs12_to_shadow() will VMWRITE a shadowed read-only field even
> if it doesn't exist on the hardware?

Because I fixated on the existing checks and didn't look at the first for-loop.

This time around I'll test by hacking in arbitrary shadow fields.
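
[Editorial note: the crux of the question is that copy_vmcs12_to_shadow()
walks both shadow field tables and blindly VMWRITEs every entry, so an
unsupported read-only field reaching that loop would trip a VMWRITE
failure. A trimmed paraphrase of the upstream loop in nested.c, with the
vmcs_load()/vmcs_clear() bookkeeping around it elided:]

	static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
	{
		const struct shadow_vmcs_field *fields[] = {
			shadow_read_write_fields,
			shadow_read_only_fields
		};
		const int max_fields[] = {
			max_shadow_read_write_fields,
			max_shadow_read_only_fields
		};
		struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
		struct shadow_vmcs_field field;
		unsigned long val;
		int i, q;

		for (q = 0; q < ARRAY_SIZE(fields); q++) {
			for (i = 0; i < max_fields[q]; i++) {
				field = fields[q][i];
				val = vmcs12_read_any(vmcs12, field.encoding,
						      field.offset);
				/* VMfails if the field doesn't exist on this CPU. */
				__vmcs_writel(field.encoding, val);
			}
		}
	}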
On Fri, Jan 09, 2026, Sean Christopherson wrote:
> On Fri, Jan 09, 2026, Xiaoyao Li wrote:
> > On 1/9/2026 12:15 PM, Sean Christopherson wrote:
> > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > > index 61113ead3d7b..ac7a17560c8f 100644
> > > --- a/arch/x86/kvm/vmx/nested.c
> > > +++ b/arch/x86/kvm/vmx/nested.c
> > > @@ -111,6 +111,9 @@ static void init_vmcs_shadow_fields(void)
> > >  			  field <= GUEST_TR_AR_BYTES,
> > >  			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
> > > +		if (get_vmcs12_field_offset(field) < 0)
> > > +			continue;
> > > +
> >
> > Why doesn't shadow_read_only_fields[] need such a guard?
> >
> > IIUC, copy_vmcs12_to_shadow() will VMWRITE a shadowed read-only field even
> > if it doesn't exist on the hardware?
>
> Because I fixated on the existing checks and didn't look at the first for-loop.
>
> This time around I'll test by hacking in arbitrary shadow fields.
And with the RO fields handled, the below doesn't explode (I verified there are
failures aplenty if either of the RO or RW checks is commented out).
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index c85c50019523..7d9bedd06afd 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -262,8 +262,12 @@ enum vmcs_field {
 	SHARED_EPT_POINTER = 0x0000203C,
 	PID_POINTER_TABLE = 0x00002042,
 	PID_POINTER_TABLE_HIGH = 0x00002043,
+	INJECTED_EVENT_DATA = 0x00002052,
+	INJECTED_EVENT_DATA_HIGH = 0x00002053,
 	GUEST_PHYSICAL_ADDRESS = 0x00002400,
 	GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
+	ORIGINAL_EVENT_DATA = 0x00002404,
+	ORIGINAL_EVENT_DATA_HIGH = 0x00002405,
 	VMCS_LINK_POINTER = 0x00002800,
 	VMCS_LINK_POINTER_HIGH = 0x00002801,
 	GUEST_IA32_DEBUGCTL = 0x00002802,
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 1ebe67c384ad..7952d58fb2d8 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -157,6 +157,8 @@ static const u16 kvm_supported_vmcs12_field_offsets[] __initconst = {
 	FIELD(HOST_S_CET, host_s_cet),
 	FIELD(HOST_SSP, host_ssp),
 	FIELD(HOST_INTR_SSP_TABLE, host_ssp_tbl),
+	FIELD64(INJECTED_EVENT_DATA, injected_event_data),
+	FIELD64(ORIGINAL_EVENT_DATA, original_event_data),
 };
 
 u16 vmcs12_field_offsets[ARRAY_SIZE(kvm_supported_vmcs12_field_offsets)] __ro_after_init;
@@ -204,6 +206,12 @@ static __init bool cpu_has_vmcs12_field(unsigned int idx)
 	case HOST_INTR_SSP_TABLE:
 		return cpu_has_load_cet_ctrl();
 
+	case ORIGINAL_EVENT_DATA:
+	case ORIGINAL_EVENT_DATA_HIGH:
+	case INJECTED_EVENT_DATA:
+	case INJECTED_EVENT_DATA_HIGH:
+		return false;
+
 	/* KVM always emulates PML and the VMX preemption timer in software. */
 	case GUEST_PML_INDEX:
 	case VMX_PREEMPTION_TIMER_VALUE:
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 21cd1b75e4fd..56565722f527 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -191,6 +191,9 @@ struct __packed vmcs12 {
 	u16 host_gs_selector;
 	u16 host_tr_selector;
 	u16 guest_pml_index;
+
+	u64 injected_event_data;
+	u64 original_event_data;
 };
 
 /*
diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
index cad128d1657b..d23ffedaf25b 100644
--- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
@@ -75,5 +75,10 @@ SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base)
 SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address)
 SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address)
 
+SHADOW_FIELD_RO(ORIGINAL_EVENT_DATA, original_event_data)
+SHADOW_FIELD_RO(ORIGINAL_EVENT_DATA_HIGH, original_event_data)
+SHADOW_FIELD_RW(INJECTED_EVENT_DATA, injected_event_data)
+SHADOW_FIELD_RW(INJECTED_EVENT_DATA_HIGH, injected_event_data)
+
 #undef SHADOW_FIELD_RO
 #undef SHADOW_FIELD_RW