If a feature is not advertised in the guest's CPUID, prevent L1 from
intercepting the unsupported instructions by clearing the corresponding
intercept in KVM's cached vmcb12.
When an L2 guest executes an instruction that is not advertised to L1,
we expect a #UD exception to be injected by L0. However, the nested svm
exit handler first checks if the instruction intercept is set in vmcb12,
and if so, synthesizes an exit from L2 to L1 instead of a #UD exception.
If a feature is not advertised, the L1 intercept should be ignored.
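For context, the check in question is the catch-all in nested_svm_intercept(),
which forwards the exit to L1 based purely on the cached vmcb12 intercept bit,
without consulting guest CPUID (paraphrased):
	default: {
		if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
			vmexit = NESTED_EXIT_DONE;
	}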
Calculate the nested intercept mask by checking all instructions that
can be intercepted and are controlled by a CPUID bit. Use this mask when
copying from the vmcb12 to KVM's cached vmcb12 to effectively ignore the
intercept on nested vm exit handling.
Another option is to handle ignoring the L1 intercepts in the nested vm
exit code path, but I've gone with modifying the cached vmcb12 to keep
it simpler.
Signed-off-by: Kevin Cheng <chengkev@google.com>
---
arch/x86/kvm/svm/nested.c | 30 +++++++++++++++++++++++++++++-
arch/x86/kvm/svm/svm.c | 2 ++
arch/x86/kvm/svm/svm.h | 14 ++++++++++++++
3 files changed, 45 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index c81005b245222..f2ade24908b39 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -184,6 +184,33 @@ void recalc_intercepts(struct vcpu_svm *svm)
}
}
+/*
+ * If a feature is not advertised to L1, set the mask bit for the corresponding
+ * vmcb12 intercept.
+ */
+void svm_recalc_nested_intercepts_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ memset(svm->nested.nested_intercept_mask, 0,
+ sizeof(svm->nested.nested_intercept_mask));
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
+ set_nested_intercept_mask(&svm->nested, INTERCEPT_RDTSCP);
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SKINIT))
+ set_nested_intercept_mask(&svm->nested, INTERCEPT_SKINIT);
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
+ set_nested_intercept_mask(&svm->nested, INTERCEPT_XSETBV);
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RDPRU))
+ set_nested_intercept_mask(&svm->nested, INTERCEPT_RDPRU);
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID))
+ set_nested_intercept_mask(&svm->nested, INTERCEPT_INVPCID);
+}
+
/*
* This array (and its actual size) holds the set of offsets (indexing by chunk
* size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
@@ -408,10 +435,11 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
struct vmcb_ctrl_area_cached *to,
struct vmcb_control_area *from)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
unsigned int i;
for (i = 0; i < MAX_INTERCEPT; i++)
- to->intercepts[i] = from->intercepts[i];
+ to->intercepts[i] = from->intercepts[i] & ~(svm->nested.nested_intercept_mask[i]);
to->iopm_base_pa = from->iopm_base_pa;
to->msrpm_base_pa = from->msrpm_base_pa;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index f56c2d895011c..dd02a076077d8 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1011,6 +1011,8 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
}
}
+
+ svm_recalc_nested_intercepts_mask(vcpu);
}
static void svm_recalc_intercepts(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 9e151dbdef25d..08779d78c0c27 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -217,6 +217,12 @@ struct svm_nested_state {
* on its side.
*/
bool force_msr_bitmap_recalc;
+
+ /*
+ * Reserved bitmask for instruction intercepts that should not be set
+ * by L1 if the feature is not advertised to L1 in guest CPUID.
+ */
+ u32 nested_intercept_mask[MAX_INTERCEPT];
};
struct vcpu_sev_es_state {
@@ -478,6 +484,12 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
recalc_intercepts(svm);
}
+static inline void set_nested_intercept_mask(struct svm_nested_state *nested, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ __set_bit(bit, (unsigned long *)&nested->nested_intercept_mask);
+}
+
static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -746,6 +758,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
+void svm_recalc_nested_intercepts_mask(struct kvm_vcpu *vcpu);
+
int __init nested_svm_init_msrpm_merge_offsets(void);
int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
--
2.52.0.223.gf5cc29aaa4-goog
On Fri, Dec 05, 2025, Kevin Cheng wrote:
> If a feature is not advertised in the guest's CPUID, prevent L1 from
> intercepting the unsupported instructions by clearing the corresponding
> intercept in KVM's cached vmcb12.
>
> When an L2 guest executes an instruction that is not advertised to L1,
> we expect a #UD exception to be injected by L0. However, the nested svm
> exit handler first checks if the instruction intercept is set in vmcb12,
> and if so, synthesizes an exit from L2 to L1 instead of a #UD exception.
> If a feature is not advertised, the L1 intercept should be ignored.
>
> Calculate the nested intercept mask by checking all instructions that
> can be intercepted and are controlled by a CPUID bit. Use this mask when
> copying from the vmcb12 to KVM's cached vmcb12 to effectively ignore the
> intercept on nested vm exit handling.
>
> Another option is to handle ignoring the L1 intercepts in the nested vm
> exit code path, but I've gone with modifying the cached vmcb12 to keep
> it simpler.
>
> Signed-off-by: Kevin Cheng <chengkev@google.com>
> ---
> arch/x86/kvm/svm/nested.c | 30 +++++++++++++++++++++++++++++-
> arch/x86/kvm/svm/svm.c | 2 ++
> arch/x86/kvm/svm/svm.h | 14 ++++++++++++++
> 3 files changed, 45 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index c81005b245222..f2ade24908b39 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -184,6 +184,33 @@ void recalc_intercepts(struct vcpu_svm *svm)
> }
> }
>
> +/*
> + * If a feature is not advertised to L1, set the mask bit for the corresponding
> + * vmcb12 intercept.
> + */
> +void svm_recalc_nested_intercepts_mask(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + memset(svm->nested.nested_intercept_mask, 0,
> + sizeof(svm->nested.nested_intercept_mask));
> +
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
> + set_nested_intercept_mask(&svm->nested, INTERCEPT_RDTSCP);
> +
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SKINIT))
> + set_nested_intercept_mask(&svm->nested, INTERCEPT_SKINIT);
> +
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
> + set_nested_intercept_mask(&svm->nested, INTERCEPT_XSETBV);
> +
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RDPRU))
> + set_nested_intercept_mask(&svm->nested, INTERCEPT_RDPRU);
> +
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID))
> + set_nested_intercept_mask(&svm->nested, INTERCEPT_INVPCID);
Ugh. I don't see any reason for svm->nested.nested_intercept_mask to exist.
guest_cpu_cap_has() is cheap (which is largely why it even exists), just sanitize
the vmcb02 intercepts on-demand. The name is also wonky: it "sets" bits only to
effect a "clear" of those bits.
Gah, and the helpers to access/mutate intercepts can be cleaned up. E.g. if we
do something like this:
static inline void __vmcb_set_intercept(unsigned long *intercepts, u32 bit)
{
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
__set_bit(bit, intercepts);
}
static inline void __vmcb_clr_intercept(unsigned long *intercepts, u32 bit)
{
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
__clear_bit(bit, intercepts);
}
static inline bool __vmcb_is_intercept(unsigned long *intercepts, u32 bit)
{
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
return test_bit(bit, intercepts);
}
static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
{
__vmcb_set_intercept((unsigned long *)&control->intercepts, bit);
}
static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
{
__vmcb_clr_intercept((unsigned long *)&control->intercepts, bit);
}
static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
{
return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit);
}
static inline void vmcb12_clr_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
{
__vmcb_clr_intercept((unsigned long *)&control->intercepts, bit);
}
static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
{
return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit);
}
> +}
> +
> /*
> * This array (and its actual size) holds the set of offsets (indexing by chunk
> * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
> @@ -408,10 +435,11 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
> struct vmcb_ctrl_area_cached *to,
> struct vmcb_control_area *from)
> {
> + struct vcpu_svm *svm = to_svm(vcpu);
> unsigned int i;
>
> for (i = 0; i < MAX_INTERCEPT; i++)
> - to->intercepts[i] = from->intercepts[i];
> + to->intercepts[i] = from->intercepts[i] & ~(svm->nested.nested_intercept_mask[i]);
Then here we can use vmcb_clr_intercept(). And if with macro shenanigans, we
can cut down on the boilerplate like so:
#define __nested_svm_sanitize_intercept(__vcpu, __control, fname, iname) \
do { \
if (!guest_cpu_cap_has(__vcpu, X86_FEATURE_##fname)) \
vmcb12_clr_intercept(__control, INTERCEPT_##iname); \
} while (0)
#define nested_svm_sanitize_intercept(__vcpu, __control, name) \
__nested_svm_sanitize_intercept(__vcpu, __control, name, name)
static
void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
struct vmcb_ctrl_area_cached *to,
struct vmcb_control_area *from)
{
unsigned int i;
for (i = 0; i < MAX_INTERCEPT; i++)
to->intercepts[i] = from->intercepts[i];
nested_svm_sanitize_intercept(vcpu, to, RDTSCP);
nested_svm_sanitize_intercept(vcpu, to, SKINIT);
__nested_svm_sanitize_intercept(vcpu, to, XSAVE, XSETBV);
nested_svm_sanitize_intercept(vcpu, to, RDPRU);
nested_svm_sanitize_intercept(vcpu, to, INVPCID);
Side topic, do we care about handling the case where userspace sets CPUID after
stuffing guest state? I'm very tempted to send a patch disallowing KVM_SET_CPUID
if is_guest_mode() is true, and hoping no one cares.
>
> to->iopm_base_pa = from->iopm_base_pa;
> to->msrpm_base_pa = from->msrpm_base_pa;
On Tue, Dec 9, 2025 at 11:52 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Dec 05, 2025, Kevin Cheng wrote:
> > If a feature is not advertised in the guest's CPUID, prevent L1 from
> > intercepting the unsupported instructions by clearing the corresponding
> > intercept in KVM's cached vmcb12.
> >
> > When an L2 guest executes an instruction that is not advertised to L1,
> > we expect a #UD exception to be injected by L0. However, the nested svm
> > exit handler first checks if the instruction intercept is set in vmcb12,
> > and if so, synthesizes an exit from L2 to L1 instead of a #UD exception.
> > If a feature is not advertised, the L1 intercept should be ignored.
> >
> > Calculate the nested intercept mask by checking all instructions that
> > can be intercepted and are controlled by a CPUID bit. Use this mask when
> > copying from the vmcb12 to KVM's cached vmcb12 to effectively ignore the
> > intercept on nested vm exit handling.
> >
> > Another option is to handle ignoring the L1 intercepts in the nested vm
> > exit code path, but I've gone with modifying the cached vmcb12 to keep
> > it simpler.
> >
> > Signed-off-by: Kevin Cheng <chengkev@google.com>
> > ---
> > arch/x86/kvm/svm/nested.c | 30 +++++++++++++++++++++++++++++-
> > arch/x86/kvm/svm/svm.c | 2 ++
> > arch/x86/kvm/svm/svm.h | 14 ++++++++++++++
> > 3 files changed, 45 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index c81005b245222..f2ade24908b39 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -184,6 +184,33 @@ void recalc_intercepts(struct vcpu_svm *svm)
> > }
> > }
> >
> > +/*
> > + * If a feature is not advertised to L1, set the mask bit for the corresponding
> > + * vmcb12 intercept.
> > + */
> > +void svm_recalc_nested_intercepts_mask(struct kvm_vcpu *vcpu)
> > +{
> > + struct vcpu_svm *svm = to_svm(vcpu);
> > +
> > + memset(svm->nested.nested_intercept_mask, 0,
> > + sizeof(svm->nested.nested_intercept_mask));
> > +
> > + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
> > + set_nested_intercept_mask(&svm->nested, INTERCEPT_RDTSCP);
> > +
> > + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SKINIT))
> > + set_nested_intercept_mask(&svm->nested, INTERCEPT_SKINIT);
> > +
> > + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
> > + set_nested_intercept_mask(&svm->nested, INTERCEPT_XSETBV);
> > +
> > + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RDPRU))
> > + set_nested_intercept_mask(&svm->nested, INTERCEPT_RDPRU);
> > +
> > + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID))
> > + set_nested_intercept_mask(&svm->nested, INTERCEPT_INVPCID);
>
> Ugh. I don't see any reason for svm->nested.nested_intercept_mask to exist.
> guest_cpu_cap_has() is cheap (which is largely why it even exists), just sanitize
> the vmcb02 intercepts on-demand. The name is also wonky: it "sets" bits only to
> effect a "clear" of those bits.
>
> Gah, and the helpers to access/mutate intercepts can be cleaned up. E.g. if we
> do something like this:
>
> static inline void __vmcb_set_intercept(unsigned long *intercepts, u32 bit)
> {
> WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
> __set_bit(bit, intercepts);
> }
>
> static inline void __vmcb_clr_intercept(unsigned long *intercepts, u32 bit)
> {
> WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
> __clear_bit(bit, intercepts);
> }
>
> static inline bool __vmcb_is_intercept(unsigned long *intercepts, u32 bit)
> {
> WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
> return test_bit(bit, intercepts);
> }
>
> static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
> {
> __vmcb_set_intercept((unsigned long *)&control->intercepts, bit);
> }
>
> static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
> {
> __vmcb_clr_intercept((unsigned long *)&control->intercepts, bit);
> }
>
> static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
> {
> return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit);
> }
>
> static inline void vmcb12_clr_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
> {
> __vmcb_clr_intercept((unsigned long *)&control->intercepts, bit);
> }
>
> static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
> {
> return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit);
> }
>
> > +}
> > +
> > /*
> > * This array (and its actual size) holds the set of offsets (indexing by chunk
> > * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
> > @@ -408,10 +435,11 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
> > struct vmcb_ctrl_area_cached *to,
> > struct vmcb_control_area *from)
> > {
> > + struct vcpu_svm *svm = to_svm(vcpu);
> > unsigned int i;
> >
> > for (i = 0; i < MAX_INTERCEPT; i++)
> > - to->intercepts[i] = from->intercepts[i];
> > + to->intercepts[i] = from->intercepts[i] & ~(svm->nested.nested_intercept_mask[i]);
>
> Then here we can use vmcb_clr_intercept(). And if with macro shenanigans, we
> can cut down on the boilerplate like so:
>
> #define __nested_svm_sanitize_intercept(__vcpu, __control, fname, iname) \
> do { \
> if (!guest_cpu_cap_has(__vcpu, X86_FEATURE_##fname)) \
> vmcb12_clr_intercept(__control, INTERCEPT_##iname); \
> } while (0)
>
> #define nested_svm_sanitize_intercept(__vcpu, __control, name) \
> __nested_svm_sanitize_intercept(__vcpu, __control, name, name)
>
> static
> void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
> struct vmcb_ctrl_area_cached *to,
> struct vmcb_control_area *from)
> {
> unsigned int i;
>
> for (i = 0; i < MAX_INTERCEPT; i++)
> to->intercepts[i] = from->intercepts[i];
>
> nested_svm_sanitize_intercept(vcpu, to, RDTSCP);
> nested_svm_sanitize_intercept(vcpu, to, SKINIT);
> __nested_svm_sanitize_intercept(vcpu, to, XSAVE, XSETBV);
> nested_svm_sanitize_intercept(vcpu, to, RDPRU);
> nested_svm_sanitize_intercept(vcpu, to, INVPCID);
>
> Side topic, do we care about handling the case where userspace sets CPUID after
> stuffing guest state? I'm very tempted to send a patch disallowing KVM_SET_CPUID
> if is_guest_mode() is true, and hoping no one cares.
Hmm, good point, I hadn't thought about that. Would it be better to
instead update the nested state in svm_vcpu_after_set_cpuid() if
KVM_SET_CPUID is executed when is_guest_mode() is true?
Also, sorry if this is a dumb question, but in general, if KVM_SET_CPUID
is disallowed, how does userspace handle a failed IOCTL call? Do
they just try again later, or accept that the call has failed? Or is
there an error code that signals that the vCPU is executing in guest
mode, and that userspace should wait until the vCPU is no longer in
guest mode to retry the IOCTL?
>
> >
> > to->iopm_base_pa = from->iopm_base_pa;
> > to->msrpm_base_pa = from->msrpm_base_pa;
On Sat, Dec 13, 2025, Kevin Cheng wrote:
> On Tue, Dec 9, 2025 at 11:52 AM Sean Christopherson <seanjc@google.com> wrote:
> > On Fri, Dec 05, 2025, Kevin Cheng wrote:
> > static
> > void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
> > struct vmcb_ctrl_area_cached *to,
> > struct vmcb_control_area *from)
> > {
> > unsigned int i;
> >
> > for (i = 0; i < MAX_INTERCEPT; i++)
> > to->intercepts[i] = from->intercepts[i];
> >
> > nested_svm_sanitize_intercept(vcpu, to, RDTSCP);
> > nested_svm_sanitize_intercept(vcpu, to, SKINIT);
> > __nested_svm_sanitize_intercept(vcpu, to, XSAVE, XSETBV);
> > nested_svm_sanitize_intercept(vcpu, to, RDPRU);
> > nested_svm_sanitize_intercept(vcpu, to, INVPCID);
> >
> > Side topic, do we care about handling the case where userspace sets CPUID after
> > stuffing guest state? I'm very tempted to send a patch disallowing KVM_SET_CPUID
> > if is_guest_mode() is true, and hoping no one cares.
>
> Hmm good point I haven't thought about that. Would it be better to
> instead update the nested state in svm_vcpu_after_set_cpuid() if
> KVM_SET_CPUID is executed when is_guest_mode() is true?
Allowing CPUID to change (or VMX feature MSRs) when the vCPU is in L2 creates a
massive set of potential complications that would be all but impossible to handle.
E.g. if userspace clears features that are being used to run L2, then KVM could
end up with conflicting/nonsensical state for the vCPU.
Note, the same holds true for non-nested features, which is largely what led to
63f5a1909f9e ("KVM: x86: Alert userspace that KVM_SET_CPUID{,2} after KVM_RUN is
broken") and feb627e8d6f6 ("KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN").
> Also sorry if this is a dumb question, but in general if KVM_SET_CPUID
> is disallowed, then how does userspace handle a failed IOCTL call?
The VMM logs an error and exits.
> Do they just try again later or accept that the call has failed? Or is there
> an error code that signals that the vcpu is executing in guest mode and
> should wait until not in guest mode to call the IOCTL?
In practice, CPUID (and feature MSRs) are set very early on, when userspace is
creating vCPUs. With one exception, any other approach simply can't work, because
as above CPUID can't be configured after the vCPU has run. The only way for the
vCPU to be "running" L2 is if userspace stuffed guest state via KVM_SET_NESTED_STATE,
and actually changing CPUID after that point simply can't work (it'll "succeed",
but KVM provides no guarantees as to the viability of the vCPU).
The exception is that KVM allows setting _identical_ CPUID after KVM_RUN to
support QEMU's vCPU hotplug implementation, which is what commit c6617c61e8fe
("KVM: x86: Partially allow KVM_SET_CPUID{,2} after KVM_RUN") is all about.
Trying to _completely_ protect userspace from itself is a fool's errand, as KVM's
granular set of ioctls for loading vCPU state and the subjectivity of what is a
"sane" vCPU model make it all but impossible to fully prevent userspace from
hurting itself without creating a maintenance nightmare. But I can't think of
any scenario where it would be useful for userspace to set nested state, then go
back and change the vCPU model.
I'll send a patch and see what happens. Worst case scenario we break userspace
and get to learn about crazy VMM behavior :-)
On Fri, Dec 05, 2025 at 07:06:30AM +0000, Kevin Cheng wrote:
> If a feature is not advertised in the guest's CPUID, prevent L1 from
> intercepting the unsupported instructions by clearing the corresponding
> intercept in KVM's cached vmcb12.
>
> When an L2 guest executes an instruction that is not advertised to L1,
> we expect a #UD exception to be injected by L0. However, the nested svm
> exit handler first checks if the instruction intercept is set in vmcb12,
> and if so, synthesizes an exit from L2 to L1 instead of a #UD exception.
> If a feature is not advertised, the L1 intercept should be ignored.
>
> Calculate the nested intercept mask by checking all instructions that
> can be intercepted and are controlled by a CPUID bit. Use this mask when
> copying from the vmcb12 to KVM's cached vmcb12 to effectively ignore the
> intercept on nested vm exit handling.
>
> Another option is to handle ignoring the L1 intercepts in the nested vm
> exit code path, but I've gone with modifying the cached vmcb12 to keep
> it simpler.
Basically, instead of masking the intercept bits in
__nested_copy_vmcb_control_to_cache(), we'd need to do it in both:
- recalc_intercepts() (on copying from g->intercepts to c->intercepts).
- vmcb12_is_intercept().
The current approach has the advantage of applying to any future uses of
the intercepts bits in the VMCB12 as well. The alternative approach has
the advantage of not modifying the intercept bits in the cached VMCB12,
which avoids any potential bugs in the future if we ever directly copy
from the cached VMCB12 to L1's VMCB12.
I think the latter is easier to test for: the new test case in KUTs [*]
could just verify that the ignored intercept bits are not cleared in the
VMCB.
So I prefer the current approach (with the added testing).
[*]https://lore.kernel.org/kvm/20251205081448.4062096-3-chengkev@google.com/
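For reference, the on-demand variant would look roughly like the below
(untested sketch; vmcb12_is_intercept() would need the vCPU plumbed in, and
recalc_intercepts() would need the same checks):
static inline bool vmcb12_is_intercept(struct kvm_vcpu *vcpu,
				       struct vmcb_ctrl_area_cached *control,
				       u32 bit)
{
	/* Ignore L1's intercept if the feature isn't advertised to L1. */
	if (bit == INTERCEPT_RDTSCP && !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
		return false;
	if (bit == INTERCEPT_XSETBV && !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
		return false;
	/* ...ditto for SKINIT, RDPRU, and INVPCID... */
	WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
	return test_bit(bit, (unsigned long *)&control->intercepts);
}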
>
> Signed-off-by: Kevin Cheng <chengkev@google.com>
> ---
> arch/x86/kvm/svm/nested.c | 30 +++++++++++++++++++++++++++++-
> arch/x86/kvm/svm/svm.c | 2 ++
> arch/x86/kvm/svm/svm.h | 14 ++++++++++++++
> 3 files changed, 45 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index c81005b245222..f2ade24908b39 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -184,6 +184,33 @@ void recalc_intercepts(struct vcpu_svm *svm)
> }
> }
>
> +/*
> + * If a feature is not advertised to L1, set the mask bit for the corresponding
> + * vmcb12 intercept.
> + */
> +void svm_recalc_nested_intercepts_mask(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + memset(svm->nested.nested_intercept_mask, 0,
> + sizeof(svm->nested.nested_intercept_mask));
> +
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
> + set_nested_intercept_mask(&svm->nested, INTERCEPT_RDTSCP);
> +
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SKINIT))
> + set_nested_intercept_mask(&svm->nested, INTERCEPT_SKINIT);
> +
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
> + set_nested_intercept_mask(&svm->nested, INTERCEPT_XSETBV);
> +
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RDPRU))
> + set_nested_intercept_mask(&svm->nested, INTERCEPT_RDPRU);
> +
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID))
> + set_nested_intercept_mask(&svm->nested, INTERCEPT_INVPCID);
> +}
> +
set_nested_intercept_mask() is only used here AFAICT, so maybe just
define it as a static function above
svm_recalc_nested_intercepts_mask()?
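I.e. something like this, just moving the existing helper out of svm.h:
/* In nested.c, above svm_recalc_nested_intercepts_mask(): */
static void set_nested_intercept_mask(struct svm_nested_state *nested, u32 bit)
{
	WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
	__set_bit(bit, (unsigned long *)&nested->nested_intercept_mask);
}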
> /*
> * This array (and its actual size) holds the set of offsets (indexing by chunk
> * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
> @@ -408,10 +435,11 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
> struct vmcb_ctrl_area_cached *to,
> struct vmcb_control_area *from)
> {
> + struct vcpu_svm *svm = to_svm(vcpu);
> unsigned int i;
>
> for (i = 0; i < MAX_INTERCEPT; i++)
> - to->intercepts[i] = from->intercepts[i];
> + to->intercepts[i] = from->intercepts[i] & ~(svm->nested.nested_intercept_mask[i]);
>
> to->iopm_base_pa = from->iopm_base_pa;
> to->msrpm_base_pa = from->msrpm_base_pa;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index f56c2d895011c..dd02a076077d8 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1011,6 +1011,8 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
> svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
> }
> }
> +
> + svm_recalc_nested_intercepts_mask(vcpu);
svm_recalc_nested_intercepts_mask() is also only used here, but I think
there's a general preference to keep nested helpers defined in nested.c,
even when they're only called from svm.c (e.g. nested_svm_check_permissions(),
nested_svm_vmrun()). So I think leave that one as-is.
> }
>
> static void svm_recalc_intercepts(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 9e151dbdef25d..08779d78c0c27 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -217,6 +217,12 @@ struct svm_nested_state {
> * on its side.
> */
> bool force_msr_bitmap_recalc;
> +
> + /*
> + * Reserved bitmask for instruction intercepts that should not be set
> + * by L1 if the feature is not advertised to L1 in guest CPUID.
> + */
> + u32 nested_intercept_mask[MAX_INTERCEPT];
I think the naming of this member (and all the helper functions) should
change to make it clear this is a bitmask of ignored intercepts. So
maybe call this 'ignored_intercepts' (nested is implied by the struct)?
Then we can do:
s/set_nested_intercept_mask/set_nested_ignored_intercept/g
s/svm_recalc_nested_intercepts_mask/svm_recalc_nested_ignored_intercepts/g
...and any needed commentary updates.
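E.g., same body as today, just with the proposed names:
static inline void set_nested_ignored_intercept(struct svm_nested_state *nested, u32 bit)
{
	WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
	__set_bit(bit, (unsigned long *)&nested->ignored_intercepts);
}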
Otherwise, the code looks good to me.
> };
>
> struct vcpu_sev_es_state {
> @@ -478,6 +484,12 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
> recalc_intercepts(svm);
> }
>
> +static inline void set_nested_intercept_mask(struct svm_nested_state *nested, u32 bit)
> +{
> + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
> + __set_bit(bit, (unsigned long *)&nested->nested_intercept_mask);
> +}
> +
> static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
> {
> struct vmcb *vmcb = svm->vmcb01.ptr;
> @@ -746,6 +758,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
> return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
> }
>
> +void svm_recalc_nested_intercepts_mask(struct kvm_vcpu *vcpu);
> +
> int __init nested_svm_init_msrpm_merge_offsets(void);
>
> int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
> --
> 2.52.0.223.gf5cc29aaa4-goog
>