[v1] KVM: VMX: Introduce Intel Mode-Based Execute Control (MBEC)

[RFC PATCH 06/18] KVM: VMX: Wire up Intel MBEC enable/disable logic

Posted by Jon Kohler 11 months ago

Add logic to enable / disable Intel Mode Based Execution Control (MBEC)
based on specific conditions.

MBEC depends on:
- User space exposing secondary execution control bit 22
- Extended Page Tables (EPT)
- The KVM module parameter `enable_pt_guest_exec_control`

If any of these conditions are not met, MBEC will be disabled
accordingly.

Store runtime enablement within `kvm_vcpu_arch.pt_guest_exec_control`.

Signed-off-by: Jon Kohler <jon@nutanix.com>

---
 arch/x86/kvm/vmx/vmx.c | 11 +++++++++++
 arch/x86/kvm/vmx/vmx.h |  7 +++++++
 2 files changed, 18 insertions(+)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 7a98f03ef146..116910159a3f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2694,6 +2694,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 			return -EIO;
 
 		vmx_cap->ept = 0;
+		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
 		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
 	}
 	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
@@ -4641,11 +4642,15 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
 	if (!enable_ept) {
 		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+		exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
 		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
 		enable_unrestricted_guest = 0;
 	}
 	if (!enable_unrestricted_guest)
 		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+	if (!enable_pt_guest_exec_control)
+		exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
+
 	if (kvm_pause_in_guest(vmx->vcpu.kvm))
 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 	if (!kvm_vcpu_apicv_active(vcpu))
@@ -4770,6 +4775,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
 		if (vmx->ve_info)
 			vmcs_write64(VE_INFORMATION_ADDRESS,
 				     __pa(vmx->ve_info));
+
+		vmx->vcpu.arch.pt_guest_exec_control =
+			enable_pt_guest_exec_control && vmx_has_mbec(vmx);
 	}
 
 	if (cpu_has_tertiary_exec_ctrls())
@@ -8472,6 +8480,9 @@ __init int vmx_hardware_setup(void)
 	if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
 		enable_unrestricted_guest = 0;
 
+	if (!cpu_has_vmx_mbec() || !enable_ept)
+		enable_pt_guest_exec_control = false;
+
 	if (!cpu_has_vmx_flexpriority())
 		flexpriority_enabled = 0;
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index d1e537bf50ea..9f4ae3139a90 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -580,6 +580,7 @@ static inline u8 vmx_get_rvi(void)
 	 SECONDARY_EXEC_ENABLE_VMFUNC |					\
 	 SECONDARY_EXEC_BUS_LOCK_DETECTION |				\
 	 SECONDARY_EXEC_NOTIFY_VM_EXITING |				\
+	 SECONDARY_EXEC_MODE_BASED_EPT_EXEC |				\
 	 SECONDARY_EXEC_ENCLS_EXITING |					\
 	 SECONDARY_EXEC_EPT_VIOLATION_VE)
 
@@ -721,6 +722,12 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
 		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
 }
 
+static inline bool vmx_has_mbec(struct vcpu_vmx *vmx)
+{
+	return secondary_exec_controls_get(vmx) &
+		SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
+}
+
 static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
 {
 	if (!enable_ept)
-- 
2.43.0

Re: [RFC PATCH 06/18] KVM: VMX: Wire up Intel MBEC enable/disable logic

Posted by Sean Christopherson 9 months ago

On Thu, Mar 13, 2025, Jon Kohler wrote:
> Add logic to enable / disable Intel Mode Based Execution Control (MBEC)
> based on specific conditions.
> 
> MBEC depends on:
> - User space exposing secondary execution control bit 22
> - Extended Page Tables (EPT)
> - The KVM module parameter `enable_pt_guest_exec_control`
> 
> If any of these conditions are not met, MBEC will be disabled
> accordingly.

Why?  I know why, but I know why despite the changeloge, not because of the
changelog.

> Store runtime enablement within `kvm_vcpu_arch.pt_guest_exec_control`.

Again, why?  If you actually tried to explain this, I think/hope you would realize
why it's wrong.

> Signed-off-by: Jon Kohler <jon@nutanix.com>
> 
> ---
>  arch/x86/kvm/vmx/vmx.c | 11 +++++++++++
>  arch/x86/kvm/vmx/vmx.h |  7 +++++++
>  2 files changed, 18 insertions(+)
> 
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 7a98f03ef146..116910159a3f 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -2694,6 +2694,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  			return -EIO;
>  
>  		vmx_cap->ept = 0;
> +		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
>  		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>  	}
>  	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
> @@ -4641,11 +4642,15 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>  		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
>  	if (!enable_ept) {
>  		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
> +		exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
>  		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>  		enable_unrestricted_guest = 0;
>  	}
>  	if (!enable_unrestricted_guest)
>  		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
> +	if (!enable_pt_guest_exec_control)
> +		exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;

This is wrong and unnecessary.  As mentioned early, the input that matters is
vmcs12.  This flag should *never* be set for vmcs01.

>  	if (kvm_pause_in_guest(vmx->vcpu.kvm))
>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>  	if (!kvm_vcpu_apicv_active(vcpu))
> @@ -4770,6 +4775,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>  		if (vmx->ve_info)
>  			vmcs_write64(VE_INFORMATION_ADDRESS,
>  				     __pa(vmx->ve_info));
> +
> +		vmx->vcpu.arch.pt_guest_exec_control =
> +			enable_pt_guest_exec_control && vmx_has_mbec(vmx);

This should effectively be dead code, because vmx_has_mbec() should never be
true at vCPU creation.

Re: [RFC PATCH 06/18] KVM: VMX: Wire up Intel MBEC enable/disable logic

Posted by Jon Kohler 1 month, 2 weeks ago


> On May 12, 2025, at 2:23 PM, Sean Christopherson <seanjc@google.com> wrote:
> 
> On Thu, Mar 13, 2025, Jon Kohler wrote:
>> Add logic to enable / disable Intel Mode Based Execution Control (MBEC)
>> based on specific conditions.
>> 
>> MBEC depends on:
>> - User space exposing secondary execution control bit 22
>> - Extended Page Tables (EPT)
>> - The KVM module parameter `enable_pt_guest_exec_control`
>> 
>> If any of these conditions are not met, MBEC will be disabled
>> accordingly.
> 
> Why?  I know why, but I know why despite the changeloge, not because of the
> changelog.
> 
>> Store runtime enablement within `kvm_vcpu_arch.pt_guest_exec_control`.
> 
> Again, why?  If you actually tried to explain this, I think/hope you would realize
> why it's wrong.
> 
>> Signed-off-by: Jon Kohler <jon@nutanix.com>
>> 
>> ---
>> arch/x86/kvm/vmx/vmx.c | 11 +++++++++++
>> arch/x86/kvm/vmx/vmx.h |  7 +++++++
>> 2 files changed, 18 insertions(+)
>> 
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 7a98f03ef146..116910159a3f 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -2694,6 +2694,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>> return -EIO;
>> 
>> vmx_cap->ept = 0;
>> + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
>> _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>> }
>> if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
>> @@ -4641,11 +4642,15 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>> exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
>> if (!enable_ept) {
>> exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
>> + exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
>> exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>> enable_unrestricted_guest = 0;
>> }
>> if (!enable_unrestricted_guest)
>> exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
>> + if (!enable_pt_guest_exec_control)
>> + exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
> 
> This is wrong and unnecessary.  As mentioned early, the input that matters is
> vmcs12.  This flag should *never* be set for vmcs01.
> 
>> if (kvm_pause_in_guest(vmx->vcpu.kvm))
>> exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>> if (!kvm_vcpu_apicv_active(vcpu))
>> @@ -4770,6 +4775,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>> if (vmx->ve_info)
>> vmcs_write64(VE_INFORMATION_ADDRESS,
>>     __pa(vmx->ve_info));
>> +
>> + vmx->vcpu.arch.pt_guest_exec_control =
>> + enable_pt_guest_exec_control && vmx_has_mbec(vmx);
> 
> This should effectively be dead code, because vmx_has_mbec() should never be
> true at vCPU creation.

Ack/done - all of this went away in v1 in favor of the advertisement in nested.c

Re: [RFC PATCH 06/18] KVM: VMX: Wire up Intel MBEC enable/disable logic

Posted by Jon Kohler 9 months ago


> On May 12, 2025, at 2:23 PM, Sean Christopherson <seanjc@google.com> wrote:
> 
> !-------------------------------------------------------------------|
>  CAUTION: External Email
> 
> |-------------------------------------------------------------------!
> 
> On Thu, Mar 13, 2025, Jon Kohler wrote:
>> Add logic to enable / disable Intel Mode Based Execution Control (MBEC)
>> based on specific conditions.
>> 
>> MBEC depends on:
>> - User space exposing secondary execution control bit 22
>> - Extended Page Tables (EPT)
>> - The KVM module parameter `enable_pt_guest_exec_control`
>> 
>> If any of these conditions are not met, MBEC will be disabled
>> accordingly.
> 
> Why?  I know why, but I know why despite the changeloge, not because of the
> changelog.
> 
>> Store runtime enablement within `kvm_vcpu_arch.pt_guest_exec_control`.
> 
> Again, why?  If you actually tried to explain this, I think/hope you would realize
> why it's wrong.
> 
>> Signed-off-by: Jon Kohler <jon@nutanix.com>
>> 
>> ---
>> arch/x86/kvm/vmx/vmx.c | 11 +++++++++++
>> arch/x86/kvm/vmx/vmx.h |  7 +++++++
>> 2 files changed, 18 insertions(+)
>> 
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 7a98f03ef146..116910159a3f 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -2694,6 +2694,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>> return -EIO;
>> 
>> vmx_cap->ept = 0;
>> + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
>> _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>> }
>> if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
>> @@ -4641,11 +4642,15 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>> exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
>> if (!enable_ept) {
>> exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
>> + exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
>> exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>> enable_unrestricted_guest = 0;
>> }
>> if (!enable_unrestricted_guest)
>> exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
>> + if (!enable_pt_guest_exec_control)
>> + exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
> 
> This is wrong and unnecessary.  As mentioned early, the input that matters is
> vmcs12.  This flag should *never* be set for vmcs01.

I’ll page this back in, but I’m like 75% sure it didn’t work when I did it that way.

Either way, thanks for the feedback, I’ll chase that do ground.

> 
>> if (kvm_pause_in_guest(vmx->vcpu.kvm))
>> exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>> if (!kvm_vcpu_apicv_active(vcpu))
>> @@ -4770,6 +4775,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>> if (vmx->ve_info)
>> vmcs_write64(VE_INFORMATION_ADDRESS,
>>     __pa(vmx->ve_info));
>> +
>> + vmx->vcpu.arch.pt_guest_exec_control =
>> + enable_pt_guest_exec_control && vmx_has_mbec(vmx);
> 
> This should effectively be dead code, because vmx_has_mbec() should never be
> true at vCPU creation.

Ack, will fix

Re: [RFC PATCH 06/18] KVM: VMX: Wire up Intel MBEC enable/disable logic

Posted by Sean Christopherson 9 months ago

On Tue, May 13, 2025, Jon Kohler wrote:
> > On May 12, 2025, at 2:23 PM, Sean Christopherson <seanjc@google.com> wrote:
> > > On Thu, Mar 13, 2025, Jon Kohler wrote:
> >> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> >> index 7a98f03ef146..116910159a3f 100644
> >> --- a/arch/x86/kvm/vmx/vmx.c
> >> +++ b/arch/x86/kvm/vmx/vmx.c
> >> @@ -2694,6 +2694,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
> >> return -EIO;
> >> 
> >> vmx_cap->ept = 0;
> >> + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
> >> _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
> >> }
> >> if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
> >> @@ -4641,11 +4642,15 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
> >> exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
> >> if (!enable_ept) {
> >> exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
> >> + exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
> >> exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
> >> enable_unrestricted_guest = 0;
> >> }
> >> if (!enable_unrestricted_guest)
> >> exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
> >> + if (!enable_pt_guest_exec_control)
> >> + exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
> > 
> > This is wrong and unnecessary.  As mentioned early, the input that matters is
> > vmcs12.  This flag should *never* be set for vmcs01.
> 
> I’ll page this back in, but I’m like 75% sure it didn’t work when I did it that way.

Then you had other bugs.  The control is per-VMCS and thus needs to be emulated
as such.  Definitely holler if you get stuck, there's no need to develop this in
complete isolation.

Re: [RFC PATCH 06/18] KVM: VMX: Wire up Intel MBEC enable/disable logic

Posted by Shah, Amit 9 months ago

On Tue, 2025-05-13 at 06:28 -0700, Sean Christopherson wrote:
> On Tue, May 13, 2025, Jon Kohler wrote:
> > > On May 12, 2025, at 2:23 PM, Sean Christopherson
> > > <seanjc@google.com> wrote:
> > > > On Thu, Mar 13, 2025, Jon Kohler wrote:
> > > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > > > index 7a98f03ef146..116910159a3f 100644
> > > > --- a/arch/x86/kvm/vmx/vmx.c
> > > > +++ b/arch/x86/kvm/vmx/vmx.c
> > > > @@ -2694,6 +2694,7 @@ static int setup_vmcs_config(struct
> > > > vmcs_config *vmcs_conf,
> > > > return -EIO;
> > > > 
> > > > vmx_cap->ept = 0;
> > > > + _cpu_based_2nd_exec_control &=
> > > > ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
> > > > _cpu_based_2nd_exec_control &=
> > > > ~SECONDARY_EXEC_EPT_VIOLATION_VE;
> > > > }
> > > > if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID)
> > > > &&
> > > > @@ -4641,11 +4642,15 @@ static u32
> > > > vmx_secondary_exec_control(struct vcpu_vmx *vmx)
> > > > exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
> > > > if (!enable_ept) {
> > > > exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
> > > > + exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
> > > > exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
> > > > enable_unrestricted_guest = 0;
> > > > }
> > > > if (!enable_unrestricted_guest)
> > > > exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
> > > > + if (!enable_pt_guest_exec_control)
> > > > + exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
> > > 
> > > This is wrong and unnecessary.  As mentioned early, the input
> > > that matters is
> > > vmcs12.  This flag should *never* be set for vmcs01.
> > 
> > I’ll page this back in, but I’m like 75% sure it didn’t work when I
> > did it that way.
> 
> Then you had other bugs.  The control is per-VMCS and thus needs to
> be emulated
> as such.  Definitely holler if you get stuck, there's no need to
> develop this in
> complete isolation.

Looking at this from the AMD GMET POV, here's how I think support for
this feature for a Windows guest would be implemented:

* Do not enable the GMET feature in vmcb01.  Only the Windows guest (L1
guest) sets this bit for its own guest (L2 guest).  KVM (L0) should see
the bit set in vmcb02 (and vmcb12).  OTOH, pass on the CPUID bit to the
L1 guest.

* KVM needs to propagate the #NPF to Windows (instead of handling
anything itself -- ie no shadow page table adjustments or walks
needed).  Windows spawns an L2 guest that causes the #NPF, and Windows
is the one that needs to consume that fault.

* KVM needs to differentiate an #NPF exit due to GMET or non-GMET
condition -- check the CPL and U/S bits from the exit, and the NX bit
from the PTE that faulted.  If due to GMET, propagate it to the guest.
If not, continue handling it

(btw KVM MMU API question -- from the #NPF, I have the GPA of the L2
guest.  How to go from that guest GPA to look up the NX bit for that
page?  I skimmed and there doesn't seem to be an existing API for it -
so is walking the tables the only solution?)

		Amit

Re: [RFC PATCH 06/18] KVM: VMX: Wire up Intel MBEC enable/disable logic

Posted by Sean Christopherson 9 months ago

On Wed, May 14, 2025, Amit Shah wrote:
> On Tue, 2025-05-13 at 06:28 -0700, Sean Christopherson wrote:
> > On Tue, May 13, 2025, Jon Kohler wrote:
> > > > On May 12, 2025, at 2:23 PM, Sean Christopherson
> > > > This is wrong and unnecessary.  As mentioned early, the input that
> > > > matters is vmcs12.  This flag should *never* be set for vmcs01.
> > > 
> > > I’ll page this back in, but I’m like 75% sure it didn’t work when I
> > > did it that way.
> > 
> > Then you had other bugs.  The control is per-VMCS and thus needs to
> > be emulated
> > as such.  Definitely holler if you get stuck, there's no need to
> > develop this in
> > complete isolation.
> 
> Looking at this from the AMD GMET POV, here's how I think support for
> this feature for a Windows guest would be implemented:
> 
> * Do not enable the GMET feature in vmcb01.  Only the Windows guest (L1
> guest) sets this bit for its own guest (L2 guest).  KVM (L0) should see
> the bit set in vmcb02 (and vmcb12).  OTOH, pass on the CPUID bit to the
> L1 guest.
> 
> * KVM needs to propagate the #NPF to Windows (instead of handling
> anything itself -- ie no shadow page table adjustments or walks
> needed).  Windows spawns an L2 guest that causes the #NPF, and Windows
> is the one that needs to consume that fault.
> 
> * KVM needs to differentiate an #NPF exit due to GMET or non-GMET
> condition -- check the CPL and U/S bits from the exit, and the NX bit
> from the PTE that faulted.  If due to GMET, propagate it to the guest.
> If not, continue handling it

Yes, but no.  KVM shouldn't need to do anything special here other than teaching
update_permission_bitmask() to understand the GMET fault case.  Ditto for MBEC.
I'd type something up, but I would quickly encounter -ENOCOFFE :-)

With the correct mmu->permissions[], permission_fault() will naturally detect
that a #NPF (or EPT Violation) from L2 due to a GMET/MBEC violation is a fault
in the nNPT/nEPT domain and route the exit to L1.

> (btw KVM MMU API question -- from the #NPF, I have the GPA of the L2
> guest.  How to go from that guest GPA to look up the NX bit for that
> page?  I skimmed and there doesn't seem to be an existing API for it -
> so is walking the tables the only solution?)

As above, KVM doesn't manually look up individual bits while handling faults.
The walk of the guest page tables (L1's NPT/EPT for this scenario) performed by
FNAME(walk_addr_generic) will gather the effective permissions in walker->pte_access,
and check for a permission_fault() after the walk is completed.

Re: [RFC PATCH 06/18] KVM: VMX: Wire up Intel MBEC enable/disable logic

Posted by Shah, Amit 7 months, 3 weeks ago

On Wed, 2025-05-14 at 05:55 -0700, Sean Christopherson wrote:
> On Wed, May 14, 2025, Amit Shah wrote:
> > On Tue, 2025-05-13 at 06:28 -0700, Sean Christopherson wrote:
> > > On Tue, May 13, 2025, Jon Kohler wrote:
> > > > > On May 12, 2025, at 2:23 PM, Sean Christopherson
> > > > > This is wrong and unnecessary.  As mentioned early, the input
> > > > > that
> > > > > matters is vmcs12.  This flag should *never* be set for
> > > > > vmcs01.
> > > > 
> > > > I’ll page this back in, but I’m like 75% sure it didn’t work
> > > > when I
> > > > did it that way.
> > > 
> > > Then you had other bugs.  The control is per-VMCS and thus needs
> > > to
> > > be emulated
> > > as such.  Definitely holler if you get stuck, there's no need to
> > > develop this in
> > > complete isolation.
> > 
> > Looking at this from the AMD GMET POV, here's how I think support
> > for
> > this feature for a Windows guest would be implemented:
> > 
> > * Do not enable the GMET feature in vmcb01.  Only the Windows guest
> > (L1
> > guest) sets this bit for its own guest (L2 guest).  KVM (L0) should
> > see
> > the bit set in vmcb02 (and vmcb12).  OTOH, pass on the CPUID bit to
> > the
> > L1 guest.
> > 
> > * KVM needs to propagate the #NPF to Windows (instead of handling
> > anything itself -- ie no shadow page table adjustments or walks
> > needed).  Windows spawns an L2 guest that causes the #NPF, and
> > Windows
> > is the one that needs to consume that fault.
> > 
> > * KVM needs to differentiate an #NPF exit due to GMET or non-GMET
> > condition -- check the CPL and U/S bits from the exit, and the NX
> > bit
> > from the PTE that faulted.  If due to GMET, propagate it to the
> > guest.
> > If not, continue handling it
> 
> Yes, but no.  KVM shouldn't need to do anything special here other
> than teaching
> update_permission_bitmask() to understand the GMET fault case.  Ditto
> for MBEC.
> I'd type something up, but I would quickly encounter -ENOCOFFE :-)
> 
> With the correct mmu->permissions[], permission_fault() will
> naturally detect
> that a #NPF (or EPT Violation) from L2 due to a GMET/MBEC violation
> is a fault
> in the nNPT/nEPT domain and route the exit to L1.
>
> > (btw KVM MMU API question -- from the #NPF, I have the GPA of the
> > L2
> > guest.  How to go from that guest GPA to look up the NX bit for
> > that
> > page?  I skimmed and there doesn't seem to be an existing API for
> > it -
> > so is walking the tables the only solution?)
> 
> As above, KVM doesn't manually look up individual bits while handling
> faults.
> The walk of the guest page tables (L1's NPT/EPT for this scenario)
> performed by
> FNAME(walk_addr_generic) will gather the effective permissions in
> walker->pte_access,
> and check for a permission_fault() after the walk is completed.

Hm, despite the discussions in the PUCK calls since this email, I have
this doubt, which may be fairly basic.  To determine whether the exit
was due to GMET, we have to check the effective U/S and NX bit for the
address that faulted.  That means we have to walk the L2's page tables
to get those bits from the L2's PTEs, and then from the error code in
exitinfo1, confirm why the #NPF happened.  (And even with Paolo's neat
SMEP hack, the exit reason due to GMET can only be confirmed by looking
at the guest's U/S and NX bits.)

And from what I see, currently page table walks only happen on L1's
page tables, and not on L2's page tables, is that right?

I'm sure I'm missing something here, though..


		Amit

Re: [RFC PATCH 06/18] KVM: VMX: Wire up Intel MBEC enable/disable logic

Posted by Sean Christopherson 7 months, 3 weeks ago

On Mon, Jun 16, 2025, Amit Shah wrote:
> On Wed, 2025-05-14 at 05:55 -0700, Sean Christopherson wrote:
> > On Wed, May 14, 2025, Amit Shah wrote:
> > > (btw KVM MMU API question -- from the #NPF, I have the GPA of the L2
> > > guest.  How to go from that guest GPA to look up the NX bit for that
> > > page?  I skimmed and there doesn't seem to be an existing API for it - so
> > > is walking the tables the only solution?)
> > 
> > As above, KVM doesn't manually look up individual bits while handling
> > faults.  The walk of the guest page tables (L1's NPT/EPT for this scenario)
> > performed by FNAME(walk_addr_generic) will gather the effective permissions
> > in walker->pte_access, and check for a permission_fault() after the walk is
> > completed.
> 
> Hm, despite the discussions in the PUCK calls since this email, I have
> this doubt, which may be fairly basic.  To determine whether the exit
> was due to GMET, we have to check the effective U/S and NX bit for the
> address that faulted.  That means we have to walk the L2's page tables
> to get those bits from the L2's PTEs, and then from the error code in
> exitinfo1, confirm why the #NPF happened.  (And even with Paolo's neat
> SMEP hack, the exit reason due to GMET can only be confirmed by looking
> at the guest's U/S and NX bits.)
> 
> And from what I see, currently page table walks only happen on L1's
> page tables, and not on L2's page tables, is that right?

Nit, they aren't _L2's_ page tables, in that (barring crazy paravirt behavior)
L2 does not control the page tables.  In most conversations, that distinction
wouldn't matter, but when talking about which pages KVM walks when running an L2
while L1 is using NPT (or EPT), it's worth being very precise, because KVM may
also need to walk L2's non-nested page tables, i.e. the page table that map L2
GVAs to L2 GPA.

The least awful terminology we've come up with when referring to nested TDP is
to follow KVM's VMCS/VMCB terminology when doing nested virtualization:

  npt12: The NPT page tables controlled by L1 to manage L2 GPAs.  These are
         never referenced by hardware.
  npt02: KVM controlled page tables that shadow npt12, and are consumed by hardware.

> I'm sure I'm missing something here, though..

Heh, yep.  Part of that's my fault for using ambiguous terminology.  When I said
"L1's NPT/EPT" above, what I really meant was npt12.  I.e. this code

  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
  {
	struct guest_walker walker;
	int r;

	WARN_ON_ONCE(fault->is_tdp);

	/*
	 * Look up the guest pte for the faulting address.
	 * If PFEC.RSVD is set, this is a shadow page fault.
	 * The bit needs to be cleared before walking guest page tables.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
			     fault->error_code & ~PFERR_RSVD_MASK);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		if (!fault->prefetch)
			kvm_inject_emulated_page_fault(vcpu, &walker.fault);  <===== GMET #NPF

		return RET_PF_RETRY;
	}

which leads to the aformentioned FNAME(walk_addr_generic) and walker->pte_access
behavior, is walking npt12.  Because the #NPF will have occurred while running
L2, and by virtue of it being an #NPF (as opposed to a "legacy" #PF), KVM knows
the fault is in the context of npt02.

Before doing anything with respect to npt12, KVM needs to do walk_addr() on _npt12_
to determine whether the access is allowed by np12.  E.g. the simplest scenario
to grok is if L2 accesses a (L2) GPA that isn't mapped by npt12, in case KVM needs
to inject a #NPF into L1.

Same thing here.  On a PRESENT+FETCH+USER fault, if the effective protections
in npt12 have U/S=1 and GMET is enabled, then KVM needs to inject a #NPF into
L1.  

Side topic, someone should check with the AMD architects as to whether or not
GMET depends on EFER.NXE=1.  The APM says that all NPT mappings are executable
if EFER.NXE=0 in the host (where the "host" is L1 when dealing with nested NPT).
To me, that implies GMET is effectively ignored if EFER.NXE=0.

  Similarly, if the EFER.NXE bit is cleared for the host, all nested page table
  mappings are executable at the underlying nested level.

Re: [RFC PATCH 06/18] KVM: VMX: Wire up Intel MBEC enable/disable logic

Posted by Shah, Amit 7 months ago

On Tue, 2025-06-17 at 07:13 -0700, Sean Christopherson wrote:


> [snipped nested page walk overview]

Thanks a lot for this!

> 
> Side topic, someone should check with the AMD architects as to
> whether or not
> GMET depends on EFER.NXE=1.  The APM says that all NPT mappings are
> executable
> if EFER.NXE=0 in the host (where the "host" is L1 when dealing with
> nested NPT).
> To me, that implies GMET is effectively ignored if EFER.NXE=0.
> 
>   Similarly, if the EFER.NXE bit is cleared for the host, all nested
> page table
>   mappings are executable at the underlying nested level.

The "effective NX" computation includes EFER.NXE.  If that's 0, GMET is
still active and depends on the U/S bit if enabled, as mentioned in the
APM.

Cheers,

		Amit

Re: [RFC PATCH 06/18] KVM: VMX: Wire up Intel MBEC enable/disable logic

Posted by Chao Gao 9 months, 3 weeks ago

On Thu, Mar 13, 2025 at 01:36:45PM -0700, Jon Kohler wrote:
>Add logic to enable / disable Intel Mode Based Execution Control (MBEC)
>based on specific conditions.
>
>MBEC depends on:
>- User space exposing secondary execution control bit 22

The code below doesn't check this.

>- Extended Page Tables (EPT)
>- The KVM module parameter `enable_pt_guest_exec_control`
>
>If any of these conditions are not met, MBEC will be disabled
>accordingly.
>
>Store runtime enablement within `kvm_vcpu_arch.pt_guest_exec_control`.
>
>Signed-off-by: Jon Kohler <jon@nutanix.com>
>
>---
> arch/x86/kvm/vmx/vmx.c | 11 +++++++++++
> arch/x86/kvm/vmx/vmx.h |  7 +++++++
> 2 files changed, 18 insertions(+)
>
>diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>index 7a98f03ef146..116910159a3f 100644
>--- a/arch/x86/kvm/vmx/vmx.c
>+++ b/arch/x86/kvm/vmx/vmx.c
>@@ -2694,6 +2694,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
> 			return -EIO;
> 
> 		vmx_cap->ept = 0;
>+		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
> 		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
> 	}
> 	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
>@@ -4641,11 +4642,15 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
> 		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
> 	if (!enable_ept) {
> 		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
>+		exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
> 		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
> 		enable_unrestricted_guest = 0;
> 	}
> 	if (!enable_unrestricted_guest)
> 		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
>+	if (!enable_pt_guest_exec_control)
>+		exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
>+
> 	if (kvm_pause_in_guest(vmx->vcpu.kvm))
> 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> 	if (!kvm_vcpu_apicv_active(vcpu))
>@@ -4770,6 +4775,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
> 		if (vmx->ve_info)
> 			vmcs_write64(VE_INFORMATION_ADDRESS,
> 				     __pa(vmx->ve_info));
>+
>+		vmx->vcpu.arch.pt_guest_exec_control =
>+			enable_pt_guest_exec_control && vmx_has_mbec(vmx);

Is it possible for vmx->vcpu.arch.pt_guest_exec_control and
enable_pt_guest_exec_control to differ?

To me, the answer is no. So, why not use enable_pt_guest_exec_control
directly?

> 	}
> 
> 	if (cpu_has_tertiary_exec_ctrls())
>@@ -8472,6 +8480,9 @@ __init int vmx_hardware_setup(void)
> 	if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
> 		enable_unrestricted_guest = 0;
> 
>+	if (!cpu_has_vmx_mbec() || !enable_ept)
>+		enable_pt_guest_exec_control = false;
>+
> 	if (!cpu_has_vmx_flexpriority())
> 		flexpriority_enabled = 0;
> 
>diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
>index d1e537bf50ea..9f4ae3139a90 100644
>--- a/arch/x86/kvm/vmx/vmx.h
>+++ b/arch/x86/kvm/vmx/vmx.h
>@@ -580,6 +580,7 @@ static inline u8 vmx_get_rvi(void)
> 	 SECONDARY_EXEC_ENABLE_VMFUNC |					\
> 	 SECONDARY_EXEC_BUS_LOCK_DETECTION |				\
> 	 SECONDARY_EXEC_NOTIFY_VM_EXITING |				\
>+	 SECONDARY_EXEC_MODE_BASED_EPT_EXEC |				\
> 	 SECONDARY_EXEC_ENCLS_EXITING |					\
> 	 SECONDARY_EXEC_EPT_VIOLATION_VE)
> 
>@@ -721,6 +722,12 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
> 		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
> }
> 
>+static inline bool vmx_has_mbec(struct vcpu_vmx *vmx)
>+{
>+	return secondary_exec_controls_get(vmx) &
>+		SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
>+}
>+
> static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
> {
> 	if (!enable_ept)
>-- 
>2.43.0
>