From: Mickaël Salaün <mic@digikod.net>
Extend is_executable_pte() to distinguish user vs. kernel executable
pages, and plumb kvm_vcpu into the kvm_mmu_set_spte tracepoint so that
it can report the correct execute permissions.
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Co-developed-by: Jon Kohler <jon@nutanix.com>
Signed-off-by: Jon Kohler <jon@nutanix.com>
---
arch/x86/kvm/mmu/mmu.c | 11 ++++++-----
arch/x86/kvm/mmu/mmutrace.h | 15 +++++++++------
arch/x86/kvm/mmu/spte.h | 15 +++++++++++++--
arch/x86/kvm/mmu/tdp_mmu.c | 2 +-
4 files changed, 29 insertions(+), 14 deletions(-)
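For illustration, a trace line in the extended format would look like the
following (per the TP_printk change below; the values are made up, not taken
from a real run):

  kvm_mmu_set_spte: gfn ffe00 spte 6000000000000bf7 (rwXxu) level 1 at 3fd4b8e40

where 'X' reports kernel-mode executability and 'x' user-mode executability.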
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 791413b93589..5127520f01d2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2951,7 +2951,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
ret = RET_PF_SPURIOUS;
} else {
flush |= mmu_spte_update(sptep, spte);
- trace_kvm_mmu_set_spte(level, gfn, sptep);
+ trace_kvm_mmu_set_spte(vcpu, level, gfn, sptep);
}
if (wrprot && write_fault)
@@ -3430,10 +3430,11 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
return true;
}
-static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
+static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte,
+ struct kvm_vcpu *vcpu)
{
if (fault->exec)
- return is_executable_pte(spte);
+ return is_executable_pte(spte, !fault->user, vcpu);
if (fault->write)
return is_writable_pte(spte);
@@ -3514,7 +3515,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* Need not check the access of upper level table entries since
* they are always ACC_ALL.
*/
- if (is_access_allowed(fault, spte)) {
+ if (is_access_allowed(fault, spte, vcpu)) {
ret = RET_PF_SPURIOUS;
break;
}
@@ -3561,7 +3562,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
/* Verify that the fault can be handled in the fast path */
if (new_spte == spte ||
- !is_access_allowed(fault, new_spte))
+ !is_access_allowed(fault, new_spte, vcpu))
break;
/*
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 2511fe64ca01..1067fb7ecd55 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -339,8 +339,8 @@ TRACE_EVENT(
TRACE_EVENT(
kvm_mmu_set_spte,
- TP_PROTO(int level, gfn_t gfn, u64 *sptep),
- TP_ARGS(level, gfn, sptep),
+ TP_PROTO(struct kvm_vcpu *vcpu, int level, gfn_t gfn, u64 *sptep),
+ TP_ARGS(vcpu, level, gfn, sptep),
TP_STRUCT__entry(
__field(u64, gfn)
@@ -349,7 +349,8 @@ TRACE_EVENT(
__field(u8, level)
/* These depend on page entry type, so compute them now. */
__field(bool, r)
- __field(bool, x)
+ __field(bool, kx)
+ __field(bool, ux)
__field(signed char, u)
),
@@ -359,15 +360,17 @@ TRACE_EVENT(
__entry->sptep = virt_to_phys(sptep);
__entry->level = level;
__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
- __entry->x = is_executable_pte(__entry->spte);
+ __entry->kx = is_executable_pte(__entry->spte, true, vcpu);
+ __entry->ux = is_executable_pte(__entry->spte, false, vcpu);
__entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
),
- TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+ TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
__entry->gfn, __entry->spte,
__entry->r ? "r" : "-",
__entry->spte & PT_WRITABLE_MASK ? "w" : "-",
- __entry->x ? "x" : "-",
+ __entry->kx ? "X" : "-",
+ __entry->ux ? "x" : "-",
__entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
__entry->level, __entry->sptep
)
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1f7b388a56aa..fd7e29a0a567 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -346,9 +346,20 @@ static inline bool is_last_spte(u64 pte, int level)
return (level == PG_LEVEL_4K) || is_large_pte(pte);
}
-static inline bool is_executable_pte(u64 spte)
+static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
+ struct kvm_vcpu *vcpu)
{
- return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+ u64 x_mask = shadow_x_mask;
+
+ if (vcpu->arch.pt_guest_exec_control) {
+ x_mask |= shadow_ux_mask;
+ if (for_kernel_mode)
+ x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
+ else
+ x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
+ }
+
+ return (spte & (x_mask | shadow_nx_mask)) == x_mask;
}
static inline kvm_pfn_t spte_to_pfn(u64 pte)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 3b996c1fdaab..6a799ab42687 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1056,7 +1056,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
new_spte);
ret = RET_PF_EMULATE;
} else {
- trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+ trace_kvm_mmu_set_spte(vcpu, iter->level, iter->gfn,
rcu_dereference(iter->sptep));
}
--
2.43.0
On Thu, Mar 13, 2025, Jon Kohler wrote:
> @@ -359,15 +360,17 @@ TRACE_EVENT(
> __entry->sptep = virt_to_phys(sptep);
> __entry->level = level;
> __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
> - __entry->x = is_executable_pte(__entry->spte);
> + __entry->kx = is_executable_pte(__entry->spte, true, vcpu);
> + __entry->ux = is_executable_pte(__entry->spte, false, vcpu);
> __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
> ),
>
> - TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
> + TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
> __entry->gfn, __entry->spte,
> __entry->r ? "r" : "-",
> __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
> - __entry->x ? "x" : "-",
> + __entry->kx ? "X" : "-",
> + __entry->ux ? "x" : "-",
I don't have a better idea, but I do worry that X vs. x will lead to confusion.
But as I said, I don't have a better idea...
> __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
> __entry->level, __entry->sptep
> )
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index 1f7b388a56aa..fd7e29a0a567 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -346,9 +346,20 @@ static inline bool is_last_spte(u64 pte, int level)
> return (level == PG_LEVEL_4K) || is_large_pte(pte);
> }
>
> -static inline bool is_executable_pte(u64 spte)
> +static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
s/for_kernel_mode/is_user_access and invert. A handful of KVM comments describe
supervisor as "kernel mode", but those are quite old and IMO unnecessarily imprecise.
> + struct kvm_vcpu *vcpu)
This needs to be an mmu (or maybe a root role?). Hmm, thinking about the page
role, I don't think one new bit will suffice. Simply adding ACC_USER_EXEC_MASK
won't let KVM differentiate between shadow pages created with ACC_EXEC_MASK for
an MMU without MBEC, and a page created explicitly without ACC_USER_EXEC_MASK
for an MMU *with* MBEC.
What I'm not sure about is if MBEC/GMET support needs to be captured in the base
page role, or if shoving it in kvm_mmu_extended_role will suffice. I'll think
more on this and report back, need to refresh all the shadow paging stuff, again...
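For reference, the existing ACC_* masks, and where a new bit could slot in
(the last define below is hypothetical, it is not in the tree today):

	#define ACC_EXEC_MASK		1
	#define ACC_WRITE_MASK		PT_WRITABLE_MASK
	#define ACC_USER_MASK		PT_USER_MASK
	#define ACC_ALL			(ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

	/* hypothetical, for MBEC-aware MMUs: */
	#define ACC_USER_EXEC_MASK	BIT(3)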
> {
> - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
> + u64 x_mask = shadow_x_mask;
> +
> + if (vcpu->arch.pt_guest_exec_control) {
> + x_mask |= shadow_ux_mask;
> + if (for_kernel_mode)
> + x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
> + else
> + x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
> + }
This is going to get messy when GMET support comes along, because the U/S bit
would need to be inverted to do the right thing for supervisor fetches. Rather
than trying to shoehorn support into the existing code, I think we should prep
for GMET and make the code a wee bit easier to follow in the process. We can
even implement the actual GMET semantics, but guarded with a WARN (emulating
GMET isn't a terrible fallback in the event of a KVM bug).
	if (spte & shadow_nx_mask)
		return false;

	if (!role.has_mode_based_exec)
		return (spte & shadow_x_mask) == shadow_x_mask;

	if (WARN_ON_ONCE(!shadow_x_mask))
		return is_user_access || !(spte & shadow_user_mask);

	return spte & (is_user_access ? shadow_ux_mask : shadow_x_mask);
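And then is_access_allowed() could take the role instead of the vCPU, e.g.
(sketch only, assuming the role gets plumbed through as above):

	static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte,
				      union kvm_mmu_page_role role)
	{
		if (fault->exec)
			return is_executable_pte(spte, fault->user, role);

		if (fault->write)
			return is_writable_pte(spte);

		/* Fault was on Read access */
		return spte & PT_PRESENT_MASK;
	}

with the fast_page_fault() callers passing something like
vcpu->arch.mmu->root_role.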
> On May 12, 2025, at 5:16 PM, Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Mar 13, 2025, Jon Kohler wrote:
>> @@ -359,15 +360,17 @@ TRACE_EVENT(
>> __entry->sptep = virt_to_phys(sptep);
>> __entry->level = level;
>> __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
>> - __entry->x = is_executable_pte(__entry->spte);
>> + __entry->kx = is_executable_pte(__entry->spte, true, vcpu);
>> + __entry->ux = is_executable_pte(__entry->spte, false, vcpu);
>> __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
>> ),
>>
>> - TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
>> + TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
>> __entry->gfn, __entry->spte,
>> __entry->r ? "r" : "-",
>> __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
>> - __entry->x ? "x" : "-",
>> + __entry->kx ? "X" : "-",
>> + __entry->ux ? "x" : "-",
>
> I don't have a better idea, but I do worry that X vs. x will lead to confusion.
> But as I said, I don't have a better idea...
Rampant confusion on this in our internal review, but it was the best we could
come up with on the first go-around here (outside of additional rigor on code
comments, etc) … which certainly don’t help at run/trace time.
>
>> __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
>> __entry->level, __entry->sptep
>> )
>> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
>> index 1f7b388a56aa..fd7e29a0a567 100644
>> --- a/arch/x86/kvm/mmu/spte.h
>> +++ b/arch/x86/kvm/mmu/spte.h
>> @@ -346,9 +346,20 @@ static inline bool is_last_spte(u64 pte, int level)
>> return (level == PG_LEVEL_4K) || is_large_pte(pte);
>> }
>>
>> -static inline bool is_executable_pte(u64 spte)
>> +static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
>
> s/for_kernel_mode/is_user_access and invert. A handful of KVM comments describe
> supervisor as "kernel mode", but those are quite old and IMO unnecessarily imprecise.
>
>> + struct kvm_vcpu *vcpu)
>
> This needs to be an mmu (or maybe a root role?). Hmm, thinking about the page
> role, I don't think one new bit will suffice. Simply adding ACC_USER_EXEC_MASK
> won't let KVM differentiate between shadow pages created with ACC_EXEC_MASK for
> an MMU without MBEC, and a page created explicitly without ACC_USER_EXEC_MASK
> for an MMU *with* MBEC.
>
> What I'm not sure about is if MBEC/GMET support needs to be captured in the base
> page role, or if shoving it in kvm_mmu_extended_role will suffice. I'll think
> more on this and report back, need to refresh all the shadow paging stuff, again...
>
>
>> {
>> - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
>> + u64 x_mask = shadow_x_mask;
>> +
>> + if (vcpu->arch.pt_guest_exec_control) {
>> + x_mask |= shadow_ux_mask;
>> + if (for_kernel_mode)
>> + x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
>> + else
>> + x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
>> + }
>
> This is going to get messy when GMET support comes along, because the U/S bit
> would need to be inverted to do the right thing for supervisor fetches. Rather
> than trying to shoehorn support into the existing code, I think we should prep
> for GMET and make the code a wee bit easier to follow in the process. We can
> even implement the actual GMET semantics, but guarded with a WARN (emulating
> GMET isn't a terrible fallback in the event of a KVM bug).
+Amit
We're on the same page there. In fact, Amit and I have been talking off-list about
GMET with (notionally) this same goal in mind: making sure we do this in such a
way that we don't need to rework the whole thing for GMET.
>
> 	if (spte & shadow_nx_mask)
> 		return false;
>
> 	if (!role.has_mode_based_exec)
> 		return (spte & shadow_x_mask) == shadow_x_mask;
>
> 	if (WARN_ON_ONCE(!shadow_x_mask))
> 		return is_user_access || !(spte & shadow_user_mask);
>
> 	return spte & (is_user_access ? shadow_ux_mask : shadow_x_mask);
Ack, I’ll chew on this.
> On May 12, 2025, at 10:09 PM, Jon Kohler <jon@nutanix.com> wrote:
>
>> On May 12, 2025, at 5:16 PM, Sean Christopherson <seanjc@google.com> wrote:
>>
>> On Thu, Mar 13, 2025, Jon Kohler wrote:
>>> @@ -359,15 +360,17 @@ TRACE_EVENT(
>>> __entry->sptep = virt_to_phys(sptep);
>>> __entry->level = level;
>>> __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
>>> - __entry->x = is_executable_pte(__entry->spte);
>>> + __entry->kx = is_executable_pte(__entry->spte, true, vcpu);
>>> + __entry->ux = is_executable_pte(__entry->spte, false, vcpu);
>>> __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
>>> ),
>>>
>>> - TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
>>> + TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
>>> __entry->gfn, __entry->spte,
>>> __entry->r ? "r" : "-",
>>> __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
>>> - __entry->x ? "x" : "-",
>>> + __entry->kx ? "X" : "-",
>>> + __entry->ux ? "x" : "-",
>>
>> I don't have a better idea, but I do worry that X vs. x will lead to confusion.
>> But as I said, I don't have a better idea...
>
> Rampant confusion on this in our internal review, but it was the best we could
> come up with on the first go-around here (outside of additional rigor on code
> comments, etc) … which certainly don’t help at run/trace time.
I still couldn't come up with something cleaner than BigX/LittleX here for v1,
but I'm open to feedback if anyone's got comments.
>>> __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
>>> __entry->level, __entry->sptep
>>> )
>>> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
>>> index 1f7b388a56aa..fd7e29a0a567 100644
>>> --- a/arch/x86/kvm/mmu/spte.h
>>> +++ b/arch/x86/kvm/mmu/spte.h
>>> @@ -346,9 +346,20 @@ static inline bool is_last_spte(u64 pte, int level)
>>> return (level == PG_LEVEL_4K) || is_large_pte(pte);
>>> }
>>>
>>> -static inline bool is_executable_pte(u64 spte)
>>> +static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
>>
>> s/for_kernel_mode/is_user_access and invert. A handful of KVM comments describe
>> supervisor as "kernel mode", but those are quite old and IMO unnecessarily imprecise.
Ack/done - thanks for the feedback, I've integrated this. See what I did in v1;
I've also added a TDP-aware variant, is_executable_pte_fault(), keyed off
fault->is_tdp, as I *think* we need that to correctly derive user vs. non-user
access. I might be misinterpreting TDP, so I'd appreciate some sanity checking there.
>>> + struct kvm_vcpu *vcpu)
>>
>> This needs to be an mmu (or maybe a root role?). Hmm, thinking about the page
>> role, I don't think one new bit will suffice. Simply adding ACC_USER_EXEC_MASK
>> won't let KVM differentiate between shadow pages created with ACC_EXEC_MASK for
>> an MMU without MBEC, and a page created explicitly without ACC_USER_EXEC_MASK
>> for an MMU *with* MBEC.
>>
>> What I'm not sure about is if MBEC/GMET support needs to be captured in the base
>> page role, or if shoving it in kvm_mmu_extended_role will suffice. I'll think
>> more on this and report back, need to refresh all the shadow paging stuff, again...
Sold! I made this part of kvm_mmu_page_role, both for the access bit and as a new
"has_mbec" bit, which both helped simplify the overall work (thanks!) and made it
cleaner to enable MBEC.
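(Purely illustrative sketch of that direction -- the field placement and
naming below are mine, not final:)

	union kvm_mmu_page_role {
		u32 word;
		struct {
			/* ... existing bits (level, direct, access, etc.) ... */
			unsigned has_mbec:1; /* MMU honors separate user/supervisor exec */
		};
	};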
>>> {
>>> - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
>>> + u64 x_mask = shadow_x_mask;
>>> +
>>> + if (vcpu->arch.pt_guest_exec_control) {
>>> + x_mask |= shadow_ux_mask;
>>> + if (for_kernel_mode)
>>> + x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
>>> + else
>>> + x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
>>> + }
>>
>> This is going to get messy when GMET support comes along, because the U/S bit
>> would need to be inverted to do the right thing for supervisor fetches. Rather
>> than trying to shoehorn support into the existing code, I think we should prep
>> for GMET and make the code a wee bit easier to follow in the process. We can
>> even implement the actual GMET semantics, but guarded with a WARN (emulating
>> GMET isn't a terrible fallback in the event of a KVM bug).
>
> +Amit
>
> We're on the same page there. In fact, Amit and I have been talking off-list about
> GMET with (notionally) this same goal in mind: making sure we do this in such a
> way that we don't need to rework the whole thing for GMET.
>
>>
>> 	if (spte & shadow_nx_mask)
>> 		return false;
>>
>> 	if (!role.has_mode_based_exec)
>> 		return (spte & shadow_x_mask) == shadow_x_mask;
>>
>> 	if (WARN_ON_ONCE(!shadow_x_mask))
>> 		return is_user_access || !(spte & shadow_user_mask);
>>
>> 	return spte & (is_user_access ? shadow_ux_mask : shadow_x_mask);
Thanks for the suggestion on the code side, I've integrated that. I haven't fully
sunk my teeth into GMET, but Amit, I'd appreciate a review of what I've done on the
MMU design side for the v1 series; let me know if there are GMET prep tweaks that
make sense (or we could just punt them to a future GMET-specific series, either way).
>-static inline bool is_executable_pte(u64 spte)
>+static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
>+ struct kvm_vcpu *vcpu)
> {
>- return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
>+ u64 x_mask = shadow_x_mask;
>+
>+ if (vcpu->arch.pt_guest_exec_control) {
>+ x_mask |= shadow_ux_mask;
>+ if (for_kernel_mode)
>+ x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
>+ else
>+ x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
>+ }
Using VMX_EPT_* directly here looks weird. How about:

	u64 x_mask = shadow_x_mask;

	if (/* mbec enabled */ && !for_kernel_mode)
		x_mask = shadow_ux_mask;

	return (spte & (x_mask | shadow_nx_mask)) == x_mask;
>+
>+ return (spte & (x_mask | shadow_nx_mask)) == x_mask;
> }