[RFC PATCH 14/18] KVM: x86/mmu: Extend is_executable_pte to understand MBEC

Posted by Jon Kohler 9 months, 1 week ago
From: Mickaël Salaün <mic@digikod.net>

Extend is_executable_pte() to distinguish between user-mode and
kernel-mode executable pages, and plumb kvm_vcpu into kvm_mmu_set_spte()
so that tracepoints can report the correct execute permissions.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Co-developed-by: Jon Kohler <jon@nutanix.com>
Signed-off-by: Jon Kohler <jon@nutanix.com>

---
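
For reviewers unfamiliar with MBEC: with mode-based execute control
enabled, the single EPT execute permission splits in two, per the Intel
SDM. A sketch of the relevant masks (names as used by this series; see
arch/x86/include/asm/vmx.h):

	/*
	 * EPT bit 2 gates supervisor-mode instruction fetches, EPT bit 10
	 * gates user-mode fetches; without MBEC, bit 2 gates both.
	 */
	#define VMX_EPT_EXECUTABLE_MASK		0x4ull		/* bit 2 */
	#define VMX_EPT_USER_EXECUTABLE_MASK	0x400ull	/* bit 10 */
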
 arch/x86/kvm/mmu/mmu.c      | 11 ++++++-----
 arch/x86/kvm/mmu/mmutrace.h | 15 +++++++++------
 arch/x86/kvm/mmu/spte.h     | 15 +++++++++++++--
 arch/x86/kvm/mmu/tdp_mmu.c  |  2 +-
 4 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 791413b93589..5127520f01d2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2951,7 +2951,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
 		ret = RET_PF_SPURIOUS;
 	} else {
 		flush |= mmu_spte_update(sptep, spte);
-		trace_kvm_mmu_set_spte(level, gfn, sptep);
+		trace_kvm_mmu_set_spte(vcpu, level, gfn, sptep);
 	}
 
 	if (wrprot && write_fault)
@@ -3430,10 +3430,11 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
 	return true;
 }
 
-static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
+static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte,
+			      struct kvm_vcpu *vcpu)
 {
 	if (fault->exec)
-		return is_executable_pte(spte);
+		return is_executable_pte(spte, !fault->user, vcpu);
 
 	if (fault->write)
 		return is_writable_pte(spte);
@@ -3514,7 +3515,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		 * Need not check the access of upper level table entries since
 		 * they are always ACC_ALL.
 		 */
-		if (is_access_allowed(fault, spte)) {
+		if (is_access_allowed(fault, spte, vcpu)) {
 			ret = RET_PF_SPURIOUS;
 			break;
 		}
@@ -3561,7 +3562,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 
 		/* Verify that the fault can be handled in the fast path */
 		if (new_spte == spte ||
-		    !is_access_allowed(fault, new_spte))
+		    !is_access_allowed(fault, new_spte, vcpu))
 			break;
 
 		/*
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 2511fe64ca01..1067fb7ecd55 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -339,8 +339,8 @@ TRACE_EVENT(
 
 TRACE_EVENT(
 	kvm_mmu_set_spte,
-	TP_PROTO(int level, gfn_t gfn, u64 *sptep),
-	TP_ARGS(level, gfn, sptep),
+	TP_PROTO(struct kvm_vcpu *vcpu, int level, gfn_t gfn, u64 *sptep),
+	TP_ARGS(vcpu, level, gfn, sptep),
 
 	TP_STRUCT__entry(
 		__field(u64, gfn)
@@ -349,7 +349,8 @@ TRACE_EVENT(
 		__field(u8, level)
 		/* These depend on page entry type, so compute them now.  */
 		__field(bool, r)
-		__field(bool, x)
+		__field(bool, kx)
+		__field(bool, ux)
 		__field(signed char, u)
 	),
 
@@ -359,15 +360,17 @@ TRACE_EVENT(
 		__entry->sptep = virt_to_phys(sptep);
 		__entry->level = level;
 		__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
-		__entry->x = is_executable_pte(__entry->spte);
+		__entry->kx = is_executable_pte(__entry->spte, true, vcpu);
+		__entry->ux = is_executable_pte(__entry->spte, false, vcpu);
 		__entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
 	),
 
-	TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+	TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
 		  __entry->gfn, __entry->spte,
 		  __entry->r ? "r" : "-",
 		  __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
-		  __entry->x ? "x" : "-",
+		  __entry->kx ? "X" : "-",
+		  __entry->ux ? "x" : "-",
 		  __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
 		  __entry->level, __entry->sptep
 	)
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1f7b388a56aa..fd7e29a0a567 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -346,9 +346,20 @@ static inline bool is_last_spte(u64 pte, int level)
 	return (level == PG_LEVEL_4K) || is_large_pte(pte);
 }
 
-static inline bool is_executable_pte(u64 spte)
+static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
+				     struct kvm_vcpu *vcpu)
 {
-	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+	u64 x_mask = shadow_x_mask;
+
+	if (vcpu->arch.pt_guest_exec_control) {
+		x_mask |= shadow_ux_mask;
+		if (for_kernel_mode)
+			x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
+		else
+			x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
+	}
+
+	return (spte & (x_mask | shadow_nx_mask)) == x_mask;
 }
 
 static inline kvm_pfn_t spte_to_pfn(u64 pte)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 3b996c1fdaab..6a799ab42687 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1056,7 +1056,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 				     new_spte);
 		ret = RET_PF_EMULATE;
 	} else {
-		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+		trace_kvm_mmu_set_spte(vcpu, iter->level, iter->gfn,
 				       rcu_dereference(iter->sptep));
 	}
 
-- 
2.43.0

Re: [RFC PATCH 14/18] KVM: x86/mmu: Extend is_executable_pte to understand MBEC
Posted by Sean Christopherson 7 months, 1 week ago
On Thu, Mar 13, 2025, Jon Kohler wrote:
> @@ -359,15 +360,17 @@ TRACE_EVENT(
>  		__entry->sptep = virt_to_phys(sptep);
>  		__entry->level = level;
>  		__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
> -		__entry->x = is_executable_pte(__entry->spte);
> +		__entry->kx = is_executable_pte(__entry->spte, true, vcpu);
> +		__entry->ux = is_executable_pte(__entry->spte, false, vcpu);
>  		__entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
>  	),
>  
> -	TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
> +	TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
>  		  __entry->gfn, __entry->spte,
>  		  __entry->r ? "r" : "-",
>  		  __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
> -		  __entry->x ? "x" : "-",
> +		  __entry->kx ? "X" : "-",
> +		  __entry->ux ? "x" : "-",

I don't have a better idea, but I do worry that X vs. x will lead to confusion.
But as I said, I don't have a better idea...

>  		  __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
>  		  __entry->level, __entry->sptep
>  	)
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index 1f7b388a56aa..fd7e29a0a567 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -346,9 +346,20 @@ static inline bool is_last_spte(u64 pte, int level)
>  	return (level == PG_LEVEL_4K) || is_large_pte(pte);
>  }
>  
> -static inline bool is_executable_pte(u64 spte)
> +static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,

s/for_kernel_mode/is_user_access and invert.  A handful of KVM comments describe
supervisor as "kernel mode", but those are quite old and IMO unnecessarily imprecise.

> +				     struct kvm_vcpu *vcpu)

This needs to be an mmu (or maybe a root role?).  Hmm, thinking about the page
role, I don't think one new bit will suffice.  Simply adding ACC_USER_EXEC_MASK
won't let KVM differentiate between shadow pages created with ACC_EXEC_MASK for
an MMU without MBEC, and a page created explicitly without ACC_USER_EXEC_MASK
for an MMU *with* MBEC.

What I'm not sure about is if MBEC/GMET support needs to be captured in the base
page role, or if shoving it in kvm_mmu_extended_role will suffice.  I'll think
more on this and report back; I need to refresh all the shadow paging stuff, again...
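
If the extended role does end up sufficing, the shape would be something
like this (bit name purely illustrative):

	union kvm_mmu_extended_role {
		u32 word;
		struct {
			unsigned int valid:1;
			/* ... existing CR0/CR4/EFER tracking bits ... */
			unsigned int mode_based_exec:1;	/* MBEC/GMET active */
		};
	};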


>  {
> -	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
> +	u64 x_mask = shadow_x_mask;
> +
> +	if (vcpu->arch.pt_guest_exec_control) {
> +		x_mask |= shadow_ux_mask;
> +		if (for_kernel_mode)
> +			x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
> +		else
> +			x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
> +	}

This is going to get messy when GMET support comes along, because the U/S bit
would need to be inverted to do the right thing for supervisor fetches.  Rather
than trying to shoehorn support into the existing code, I think we should prep
for GMET and make the code a wee bit easier to follow in the process.  We can
even implement the actual GMET semantics, but guarded with a WARN (emulating
GMET isn't a terrible fallback in the event of a KVM bug).

	/* NX/XD trumps everything, MBEC or not. */
	if (spte & shadow_nx_mask)
		return false;

	/* No MBEC/GMET: a single execute bit covers all fetches. */
	if (!role.has_mode_based_exec)
		return (spte & shadow_x_mask) == shadow_x_mask;

	/*
	 * GMET fallback (no dedicated X bits, i.e. NPT): user fetches are
	 * allowed (NX was checked above), supervisor fetches are allowed
	 * only from supervisor (non-user) pages.
	 */
	if (WARN_ON_ONCE(!shadow_x_mask))
		return is_user_access || !(spte & shadow_user_mask);

	/* MBEC: separate execute bits for user vs. supervisor fetches. */
	return spte & (is_user_access ? shadow_ux_mask : shadow_x_mask);
Re: [RFC PATCH 14/18] KVM: x86/mmu: Extend is_executable_pte to understand MBEC
Posted by Jon Kohler 7 months, 1 week ago

> On May 12, 2025, at 5:16 PM, Sean Christopherson <seanjc@google.com> wrote:
> 
> On Thu, Mar 13, 2025, Jon Kohler wrote:
>> @@ -359,15 +360,17 @@ TRACE_EVENT(
>>  		__entry->sptep = virt_to_phys(sptep);
>>  		__entry->level = level;
>>  		__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
>> -		__entry->x = is_executable_pte(__entry->spte);
>> +		__entry->kx = is_executable_pte(__entry->spte, true, vcpu);
>> +		__entry->ux = is_executable_pte(__entry->spte, false, vcpu);
>>  		__entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
>>  	),
>> 
>> -	TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
>> +	TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
>>  		  __entry->gfn, __entry->spte,
>>  		  __entry->r ? "r" : "-",
>>  		  __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
>> -		  __entry->x ? "x" : "-",
>> +		  __entry->kx ? "X" : "-",
>> +		  __entry->ux ? "x" : "-",
> 
> I don't have a better idea, but I do worry that X vs. x will lead to confusion.
> But as I said, I don't have a better idea...

There was rampant confusion on this in our internal review too, but it was
the best we could come up with on the first go-around, short of additional
rigor in code comments, etc., which certainly doesn't help at run/trace time.

> 
>>  		  __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
>>  		  __entry->level, __entry->sptep
>>  	)
>> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
>> index 1f7b388a56aa..fd7e29a0a567 100644
>> --- a/arch/x86/kvm/mmu/spte.h
>> +++ b/arch/x86/kvm/mmu/spte.h
>> @@ -346,9 +346,20 @@ static inline bool is_last_spte(u64 pte, int level)
>>  	return (level == PG_LEVEL_4K) || is_large_pte(pte);
>>  }
>> 
>> -static inline bool is_executable_pte(u64 spte)
>> +static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
> 
> s/for_kernel_mode/is_user_access and invert.  A handful of KVM comments describe
> supervisor as "kernel mode", but those are quite old and IMO unnecessarily imprecise.
> 
>> +				     struct kvm_vcpu *vcpu)
> 
> This needs to be an mmu (or maybe a root role?).  Hmm, thinking about the page
> role, I don't think one new bit will suffice.  Simply adding ACC_USER_EXEC_MASK
> won't let KVM differentiate between shadow pages created with ACC_EXEC_MASK for
> an MMU without MBEC, and a page created explicitly without ACC_USER_EXEC_MASK
> for an MMU *with* MBEC.
> 
> What I'm not sure about is if MBEC/GMET support needs to be captured in the base
> page role, or if shoving it in kvm_mmu_extended_role will suffice.  I'll think
> more on this and report back; I need to refresh all the shadow paging stuff, again...
> 
> 
>>  {
>> -	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
>> +	u64 x_mask = shadow_x_mask;
>> +
>> +	if (vcpu->arch.pt_guest_exec_control) {
>> +		x_mask |= shadow_ux_mask;
>> +		if (for_kernel_mode)
>> +			x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
>> +		else
>> +			x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
>> +	}
> 
> This is going to get messy when GMET support comes along, because the U/S bit
> would need to be inverted to do the right thing for supervisor fetches.  Rather
> than trying to shoehorn support into the existing code, I think we should prep
> for GMET and make the code a wee bit easier to follow in the process.  We can
> even implement the actual GMET semantics, but guarded with a WARN (emulating
> GMET isn't a terrible fallback in the event of a KVM bug).

+Amit

We're on the same page there. In fact, Amit and I have been talking off-list
about GMET with (notionally) this same goal in mind: making sure we do this in
such a way that we don't need to rework the whole thing for GMET.

> 
> 	if (spte & shadow_nx_mask)
> 		return false;
> 
> 	if (!role.has_mode_based_exec)
> 		return (spte & shadow_x_mask) == shadow_x_mask;
> 
> 	if (WARN_ON_ONCE(!shadow_x_mask))
> 		return is_user_access || !(spte & shadow_user_mask);
> 
> 	return spte & (is_user_access ? shadow_ux_mask : shadow_x_mask);

Ack, I’ll chew on this.

Re: [RFC PATCH 14/18] KVM: x86/mmu: Extend is_executable_pte to understand MBEC
Posted by Chao Gao 7 months, 4 weeks ago
>-static inline bool is_executable_pte(u64 spte)
>+static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
>+				     struct kvm_vcpu *vcpu)
> {
>-	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
>+	u64 x_mask = shadow_x_mask;
>+
>+	if (vcpu->arch.pt_guest_exec_control) {
>+		x_mask |= shadow_ux_mask;
>+		if (for_kernel_mode)
>+			x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
>+		else
>+			x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
>+	}

Using VMX_EPT_* directly here looks weird. How about:

	u64 x_mask = shadow_x_mask;

	if (/* mbec enabled */ && !for_kernel_mode)
		x_mask = shadow_ux_mask;

	return (spte & (x_mask | shadow_nx_mask)) == x_mask;

>+
>+	return (spte & (x_mask | shadow_nx_mask)) == x_mask;
> }
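
I.e., with the placeholder filled in using the series' current predicate
(which, per the discussion above, may instead end up keyed off the MMU
role rather than the vCPU):

	u64 x_mask = shadow_x_mask;

	if (vcpu->arch.pt_guest_exec_control && !for_kernel_mode)
		x_mask = shadow_ux_mask;

	return (spte & (x_mask | shadow_nx_mask)) == x_mask;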