From: Mickaël Salaün <mic@digikod.net>
Extend is_executable_pte() to distinguish user vs kernel executable
pages, and plumb kvm_vcpu into the kvm_mmu_set_spte tracepoint so that
it can report the correct execute permissions.
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Co-developed-by: Jon Kohler <jon@nutanix.com>
Signed-off-by: Jon Kohler <jon@nutanix.com>
---
arch/x86/kvm/mmu/mmu.c | 11 ++++++-----
arch/x86/kvm/mmu/mmutrace.h | 15 +++++++++------
arch/x86/kvm/mmu/spte.h | 15 +++++++++++++--
arch/x86/kvm/mmu/tdp_mmu.c | 2 +-
4 files changed, 29 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 791413b93589..5127520f01d2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2951,7 +2951,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
ret = RET_PF_SPURIOUS;
} else {
flush |= mmu_spte_update(sptep, spte);
- trace_kvm_mmu_set_spte(level, gfn, sptep);
+ trace_kvm_mmu_set_spte(vcpu, level, gfn, sptep);
}
if (wrprot && write_fault)
@@ -3430,10 +3430,11 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
return true;
}
-static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
+static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte,
+ struct kvm_vcpu *vcpu)
{
if (fault->exec)
- return is_executable_pte(spte);
+ return is_executable_pte(spte, !fault->user, vcpu);
if (fault->write)
return is_writable_pte(spte);
@@ -3514,7 +3515,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* Need not check the access of upper level table entries since
* they are always ACC_ALL.
*/
- if (is_access_allowed(fault, spte)) {
+ if (is_access_allowed(fault, spte, vcpu)) {
ret = RET_PF_SPURIOUS;
break;
}
@@ -3561,7 +3562,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
/* Verify that the fault can be handled in the fast path */
if (new_spte == spte ||
- !is_access_allowed(fault, new_spte))
+ !is_access_allowed(fault, new_spte, vcpu))
break;
/*
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 2511fe64ca01..1067fb7ecd55 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -339,8 +339,8 @@ TRACE_EVENT(
TRACE_EVENT(
kvm_mmu_set_spte,
- TP_PROTO(int level, gfn_t gfn, u64 *sptep),
- TP_ARGS(level, gfn, sptep),
+ TP_PROTO(struct kvm_vcpu *vcpu, int level, gfn_t gfn, u64 *sptep),
+ TP_ARGS(vcpu, level, gfn, sptep),
TP_STRUCT__entry(
__field(u64, gfn)
@@ -349,7 +349,8 @@ TRACE_EVENT(
__field(u8, level)
/* These depend on page entry type, so compute them now. */
__field(bool, r)
- __field(bool, x)
+ __field(bool, kx)
+ __field(bool, ux)
__field(signed char, u)
),
@@ -359,15 +360,17 @@ TRACE_EVENT(
__entry->sptep = virt_to_phys(sptep);
__entry->level = level;
__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
- __entry->x = is_executable_pte(__entry->spte);
+ __entry->kx = is_executable_pte(__entry->spte, true, vcpu);
+ __entry->ux = is_executable_pte(__entry->spte, false, vcpu);
__entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
),
- TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+ TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
__entry->gfn, __entry->spte,
__entry->r ? "r" : "-",
__entry->spte & PT_WRITABLE_MASK ? "w" : "-",
- __entry->x ? "x" : "-",
+ __entry->kx ? "X" : "-",
+ __entry->ux ? "x" : "-",
__entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
__entry->level, __entry->sptep
)
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1f7b388a56aa..fd7e29a0a567 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -346,9 +346,20 @@ static inline bool is_last_spte(u64 pte, int level)
return (level == PG_LEVEL_4K) || is_large_pte(pte);
}
-static inline bool is_executable_pte(u64 spte)
+static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
+ struct kvm_vcpu *vcpu)
{
- return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+ u64 x_mask = shadow_x_mask;
+
+ if (vcpu->arch.pt_guest_exec_control) {
+ x_mask |= shadow_ux_mask;
+ if (for_kernel_mode)
+ x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
+ else
+ x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
+ }
+
+ return (spte & (x_mask | shadow_nx_mask)) == x_mask;
}
static inline kvm_pfn_t spte_to_pfn(u64 pte)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 3b996c1fdaab..6a799ab42687 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1056,7 +1056,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
new_spte);
ret = RET_PF_EMULATE;
} else {
- trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+ trace_kvm_mmu_set_spte(vcpu, iter->level, iter->gfn,
rcu_dereference(iter->sptep));
}
--
2.43.0
From: Sean Christopherson <seanjc@google.com>
On Thu, Mar 13, 2025, Jon Kohler wrote:
> @@ -359,15 +360,17 @@ TRACE_EVENT(
> __entry->sptep = virt_to_phys(sptep);
> __entry->level = level;
> __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
> - __entry->x = is_executable_pte(__entry->spte);
> + __entry->kx = is_executable_pte(__entry->spte, true, vcpu);
> + __entry->ux = is_executable_pte(__entry->spte, false, vcpu);
> __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
> ),
>
> - TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
> + TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
> __entry->gfn, __entry->spte,
> __entry->r ? "r" : "-",
> __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
> - __entry->x ? "x" : "-",
> + __entry->kx ? "X" : "-",
> + __entry->ux ? "x" : "-",
I don't have a better idea, but I do worry that X vs. x will lead to confusion.
But as I said, I don't have a better idea...
> __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
> __entry->level, __entry->sptep
> )
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index 1f7b388a56aa..fd7e29a0a567 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -346,9 +346,20 @@ static inline bool is_last_spte(u64 pte, int level)
> return (level == PG_LEVEL_4K) || is_large_pte(pte);
> }
>
> -static inline bool is_executable_pte(u64 spte)
> +static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
s/for_kernel_mode/is_user_access and invert. A handful of KVM comments describe
supervisor as "kernel mode", but those are quite old and IMO unnecessarily imprecise.
> + struct kvm_vcpu *vcpu)
This needs to be an mmu (or maybe a root role?). Hmm, thinking about the page
role, I don't think one new bit will suffice. Simply adding ACC_USER_EXEC_MASK
won't let KVM differentiate between shadow pages created with ACC_EXEC_MASK for
an MMU without MBEC, and a page created explicitly without ACC_USER_EXEC_MASK
for an MMU *with* MBEC.
What I'm not sure about is whether MBEC/GMET support needs to be captured in the base
page role, or if shoving it in kvm_mmu_extended_role will suffice. I'll think
more on this and report back; I need to refresh all the shadow paging stuff, again...
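Purely as a strawman for the extended-role option, the below is roughly what a new
bit could look like; the mbec_enabled name is invented, and whether the extended
role (vs. the base role) is the right home is exactly the open question:

	/* Strawman only: field name is made up, existing bits elided. */
	union kvm_mmu_extended_role {
		u32 word;
		struct {
			unsigned int valid:1;
			/* ... existing bits ... */
			unsigned int mbec_enabled:1;
		};
	};

is_executable_pte() would then take the role (or the mmu) instead of a vCPU pointer.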
> {
> - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
> + u64 x_mask = shadow_x_mask;
> +
> + if (vcpu->arch.pt_guest_exec_control) {
> + x_mask |= shadow_ux_mask;
> + if (for_kernel_mode)
> + x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
> + else
> + x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
> + }
This is going to get messy when GMET support comes along, because the U/S bit
would need to be inverted to do the right thing for supervisor fetches. Rather
than trying to shoehorn support into the existing code, I think we should prep
for GMET and make the code a wee bit easier to follow in the process. We can
even implement the actual GMET semantics, but guarded with a WARN (emulating
GMET isn't a terrible fallback in the event of a KVM bug).
if (spte & shadow_nx_mask)
return false;
if (!role.has_mode_based_exec)
return (spte & shadow_x_mask) == shadow_x_mask;
if (WARN_ON_ONCE(!shadow_x_mask))
return is_user_access || !(spte & shadow_user_mask);
return spte & (is_user_access ? shadow_ux_mask : shadow_x_mask);
From: Jon Kohler <jon@nutanix.com>
> On May 12, 2025, at 5:16 PM, Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Mar 13, 2025, Jon Kohler wrote:
>> @@ -359,15 +360,17 @@ TRACE_EVENT(
>> __entry->sptep = virt_to_phys(sptep);
>> __entry->level = level;
>> __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
>> - __entry->x = is_executable_pte(__entry->spte);
>> + __entry->kx = is_executable_pte(__entry->spte, true, vcpu);
>> + __entry->ux = is_executable_pte(__entry->spte, false, vcpu);
>> __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
>> ),
>>
>> - TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
>> + TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
>> __entry->gfn, __entry->spte,
>> __entry->r ? "r" : "-",
>> __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
>> - __entry->x ? "x" : "-",
>> + __entry->kx ? "X" : "-",
>> + __entry->ux ? "x" : "-",
>
> I don't have a better idea, but I do worry that X vs. x will lead to confusion.
> But as I said, I don't have a better idea...
Rampant confusion on this in our internal review as well, but it was the best we
could come up with on the first go-around (outside of additional rigor in code
comments, etc., which of course doesn’t help at run/trace time).
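For what it's worth, with the format above an SPTE that is executable in both modes
would render along these lines (the values below are made up purely to show the flag
layout; the trailing "u"/"-" flag only appears when shadow_user_mask is non-zero):

	kvm_mmu_set_spte: gfn feb00 spte 6000000123456877 (rwXx) level 1 at 105e3d018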
>
>> __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
>> __entry->level, __entry->sptep
>> )
>> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
>> index 1f7b388a56aa..fd7e29a0a567 100644
>> --- a/arch/x86/kvm/mmu/spte.h
>> +++ b/arch/x86/kvm/mmu/spte.h
>> @@ -346,9 +346,20 @@ static inline bool is_last_spte(u64 pte, int level)
>> return (level == PG_LEVEL_4K) || is_large_pte(pte);
>> }
>>
>> -static inline bool is_executable_pte(u64 spte)
>> +static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
>
> s/for_kernel_mode/is_user_access and invert. A handful of KVM comments describe
> supervisor as "kernel mode", but those are quite old and IMO unnecessarily imprecise.
>
>> + struct kvm_vcpu *vcpu)
>
> This needs to be an mmu (or maybe a root role?). Hmm, thinking about the page
> role, I don't think one new bit will suffice. Simply adding ACC_USER_EXEC_MASK
> won't let KVM differentiate between shadow pages created with ACC_EXEC_MASK for
> an MMU without MBEC, and a page created explicitly without ACC_USER_EXEC_MASK
> for an MMU *with* MBEC.
>
> What I'm not sure about is whether MBEC/GMET support needs to be captured in the base
> page role, or if shoving it in kvm_mmu_extended_role will suffice. I'll think
> more on this and report back; I need to refresh all the shadow paging stuff, again...
>
>
>> {
>> - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
>> + u64 x_mask = shadow_x_mask;
>> +
>> + if (vcpu->arch.pt_guest_exec_control) {
>> + x_mask |= shadow_ux_mask;
>> + if (for_kernel_mode)
>> + x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
>> + else
>> + x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
>> + }
>
> This is going to get messy when GMET support comes along, because the U/S bit
> would need to be inverted to do the right thing for supervisor fetches. Rather
> than trying to shoehorn support into the existing code, I think we should prep
> for GMET and make the code a wee bit easier to follow in the process. We can
> even implement the actual GMET semantics, but guarded with a WARN (emulating
> GMET isn't a terrible fallback in the event of a KVM bug).
+Amit
We’re on the same page there. In fact, Amit and I have been talking off list about
GMET with (notionally) this same goal in mind: making sure we do this in such a way
that we don’t need to rework the whole thing when GMET support lands.
>
> if (spte & shadow_nx_mask)
> return false;
>
> if (!role.has_mode_based_exec)
> return (spte & shadow_x_mask) == shadow_x_mask;
>
> if (WARN_ON_ONCE(!shadow_x_mask))
> return is_user_access || !(spte & shadow_user_mask);
>
> return spte & (is_user_access ? shadow_ux_mask : shadow_x_mask);
Ack, I’ll chew on this.
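Folding that suggestion into the helper, along with the is_user_access rename, comes
out roughly as below; the role argument and its has_mode_based_exec bit are
placeholders until the role question above is settled:

	static inline bool is_executable_pte(u64 spte, bool is_user_access,
					     union kvm_mmu_page_role role)
	{
		/* NX always wins, with or without MBEC/GMET. */
		if (spte & shadow_nx_mask)
			return false;

		/* Legacy case: one execute bit covers both user and supervisor. */
		if (!role.has_mode_based_exec)
			return (spte & shadow_x_mask) == shadow_x_mask;

		/*
		 * GMET-style (no dedicated execute bits): supervisor fetches from
		 * user-accessible pages are blocked.  WARN because KVM doesn't
		 * actually support GMET yet; emulating its semantics is a sane
		 * fallback if this path is hit by mistake.
		 */
		if (WARN_ON_ONCE(!shadow_x_mask))
			return is_user_access || !(spte & shadow_user_mask);

		/* MBEC: separate user vs. supervisor execute bits. */
		return spte & (is_user_access ? shadow_ux_mask : shadow_x_mask);
	}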
>-static inline bool is_executable_pte(u64 spte)
>+static inline bool is_executable_pte(u64 spte, bool for_kernel_mode,
>+ struct kvm_vcpu *vcpu)
> {
>- return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
>+ u64 x_mask = shadow_x_mask;
>+
>+ if (vcpu->arch.pt_guest_exec_control) {
>+ x_mask |= shadow_ux_mask;
>+ if (for_kernel_mode)
>+ x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
>+ else
>+ x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
>+ }
Using VMX_EPT_* directly here looks weird. How about:
u64 x_mask = shadow_x_mask;
if (/* mbec enabled */ && !for_kernel_mode)
x_mask = shadow_ux_mask;
return (spte & (x_mask | shadow_nx_mask)) == x_mask;
>+
>+ return (spte & (x_mask | shadow_nx_mask)) == x_mask;
> }