From: Mickaël Salaün <mic@digikod.net>
Add EPT_VIOLATION_PROT_USER_EXEC (bit 6) to reflect the user-executable
permission of a given address when Intel MBEC is enabled.
Refactor usage of EPT_VIOLATION_RWX_TO_PROT to account for all of the
specific bits that are now possible with MBEC.
Intel SDM 'Exit Qualification for EPT Violations' states the following
for Bit 6.
If the “mode-based execute control” VM-execution control is 0, the
value of this bit is undefined. If that control is 1, this bit is
the logical-AND of bit 10 in the EPT paging-structure entries used
to translate the guest-physical address of the access causing the
EPT violation. In this case, it indicates whether the guest-physical
address was executable for user-mode linear addresses.
Bit 6 is cleared to 0 if (1) the “mode-based execute control”
VM-execution control is 1; and (2) either (a) any of EPT
paging-structure entries used to translate the guest-physical address
of the access causing the EPT violation is not present; or
(b) 4-level EPT is in use and the guest-physical address sets any
bits in the range 51:48.
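For illustration only (this helper is hypothetical and not part of the diff
below), the logical-AND described above amounts to:
	/*
	 * Hypothetical sketch: bit 6 of the exit qualification is the AND of
	 * bit 10 (user-execute, i.e. VMX_EPT_USER_EXECUTABLE_MASK, meaningful
	 * only with MBEC) across every EPT paging-structure entry used to
	 * translate the faulting guest-physical address.
	 */
	static bool ept_walk_user_exec(const u64 *eptes, int nr_entries)
	{
		u64 user_exec = 1ULL << 10;
		int i;

		for (i = 0; i < nr_entries; i++)
			user_exec &= eptes[i];

		return !!user_exec;
	}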
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Co-developed-by: Jon Kohler <jon@nutanix.com>
Signed-off-by: Jon Kohler <jon@nutanix.com>
---
arch/x86/include/asm/vmx.h | 7 ++++---
arch/x86/kvm/mmu/paging_tmpl.h | 15 ++++++++++++---
arch/x86/kvm/vmx/vmx.c | 7 +++++--
3 files changed, 21 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index ffc90d672b5d..84c5be416f5c 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -587,6 +587,7 @@ enum vm_entry_failure_code {
#define EPT_VIOLATION_PROT_READ BIT(3)
#define EPT_VIOLATION_PROT_WRITE BIT(4)
#define EPT_VIOLATION_PROT_EXEC BIT(5)
+#define EPT_VIOLATION_PROT_USER_EXEC BIT(6)
#define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \
EPT_VIOLATION_PROT_WRITE | \
EPT_VIOLATION_PROT_EXEC)
@@ -596,7 +597,7 @@ enum vm_entry_failure_code {
#define EPT_VIOLATION_READ_TO_PROT(__epte) (((__epte) & VMX_EPT_READABLE_MASK) << 3)
#define EPT_VIOLATION_WRITE_TO_PROT(__epte) (((__epte) & VMX_EPT_WRITABLE_MASK) << 3)
#define EPT_VIOLATION_EXEC_TO_PROT(__epte) (((__epte) & VMX_EPT_EXECUTABLE_MASK) << 3)
-#define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3)
+#define EPT_VIOLATION_USER_EXEC_TO_PROT(__epte) (((__epte) & VMX_EPT_USER_EXECUTABLE_MASK) >> 4)
static_assert(EPT_VIOLATION_READ_TO_PROT(VMX_EPT_READABLE_MASK) ==
(EPT_VIOLATION_PROT_READ));
@@ -604,8 +605,8 @@ static_assert(EPT_VIOLATION_WRITE_TO_PROT(VMX_EPT_WRITABLE_MASK) ==
(EPT_VIOLATION_PROT_WRITE));
static_assert(EPT_VIOLATION_EXEC_TO_PROT(VMX_EPT_EXECUTABLE_MASK) ==
(EPT_VIOLATION_PROT_EXEC));
-static_assert(EPT_VIOLATION_RWX_TO_PROT(VMX_EPT_RWX_MASK) ==
- (EPT_VIOLATION_PROT_READ | EPT_VIOLATION_PROT_WRITE | EPT_VIOLATION_PROT_EXEC));
+static_assert(EPT_VIOLATION_USER_EXEC_TO_PROT(VMX_EPT_USER_EXECUTABLE_MASK) ==
+ (EPT_VIOLATION_PROT_USER_EXEC));
/*
* Exit Qualifications for NOTIFY VM EXIT
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 9bc3fc4a238b..a3a5cacda614 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -181,8 +181,9 @@ static inline unsigned FNAME(gpte_access)(u64 gpte)
unsigned access;
#if PTTYPE == PTTYPE_EPT
access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
- ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
- ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
+ ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+ ((gpte & VMX_EPT_USER_EXECUTABLE_MASK) ? ACC_USER_EXEC_MASK : 0) |
+ ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
#else
BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
BUILD_BUG_ON(ACC_EXEC_MASK != 1);
@@ -510,7 +511,15 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
* Note, pte_access holds the raw RWX bits from the EPTE, not
* ACC_*_MASK flags!
*/
- walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
+ walker->fault.exit_qualification |=
+ EPT_VIOLATION_READ_TO_PROT(pte_access);
+ walker->fault.exit_qualification |=
+ EPT_VIOLATION_WRITE_TO_PROT(pte_access);
+ walker->fault.exit_qualification |=
+ EPT_VIOLATION_EXEC_TO_PROT(pte_access);
+ if (vcpu->arch.pt_guest_exec_control)
+ walker->fault.exit_qualification |=
+ EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access);
}
#endif
walker->fault.address = addr;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 116910159a3f..0aadfa924045 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5809,7 +5809,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification;
+ unsigned long exit_qualification, rwx_mask;
gpa_t gpa;
u64 error_code;
@@ -5839,7 +5839,10 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
? PFERR_FETCH_MASK : 0;
/* ept page table entry is present? */
- error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
+ rwx_mask = EPT_VIOLATION_PROT_MASK;
+ if (vcpu->arch.pt_guest_exec_control)
+ rwx_mask |= EPT_VIOLATION_PROT_USER_EXEC;
+ error_code |= (exit_qualification & rwx_mask)
? PFERR_PRESENT_MASK : 0;
if (error_code & EPT_VIOLATION_GVA_IS_VALID)
--
2.43.0
On Thu, Mar 13, 2025, Jon Kohler wrote:
> From: Mickaël Salaün <mic@digikod.net>
>
> Add EPT_VIOLATION_PROT_USER_EXEC (bit 6) to reflect the user-executable
> permission of a given address when Intel MBEC is enabled.
>
> Refactor usage of EPT_VIOLATION_RWX_TO_PROT to account for all of the
> specific bits that are now possible with MBEC.
>
> Intel SDM 'Exit Qualification for EPT Violations' states the following
> for Bit 6.
> If the “mode-based execute control” VM-execution control is 0, the
> value of this bit is undefined. If that control is 1, this bit is
> the logical-AND of bit 10 in the EPT paging-structure entries used
> to translate the guest-physical address of the access causing the
> EPT violation. In this case, it indicates whether the guest-physical
> address was executable for user-mode linear addresses.
>
> Bit 6 is cleared to 0 if (1) the “mode-based execute control”
> VM-execution control is 1; and (2) either (a) any of EPT
> paging-structure entries used to translate the guest-physical address
> of the access causing the EPT violation is not present; or
> (b) 4-level EPT is in use and the guest-physical address sets any
> bits in the range 51:48.
>
> Signed-off-by: Mickaël Salaün <mic@digikod.net>
> Co-developed-by: Jon Kohler <jon@nutanix.com>
> Signed-off-by: Jon Kohler <jon@nutanix.com>
>
> ---
> arch/x86/include/asm/vmx.h | 7 ++++---
> arch/x86/kvm/mmu/paging_tmpl.h | 15 ++++++++++++---
> arch/x86/kvm/vmx/vmx.c | 7 +++++--
> 3 files changed, 21 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index ffc90d672b5d..84c5be416f5c 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -587,6 +587,7 @@ enum vm_entry_failure_code {
> #define EPT_VIOLATION_PROT_READ BIT(3)
> #define EPT_VIOLATION_PROT_WRITE BIT(4)
> #define EPT_VIOLATION_PROT_EXEC BIT(5)
> +#define EPT_VIOLATION_PROT_USER_EXEC BIT(6)
Ugh, TDX added this as EPT_VIOLATION_EXEC_FOR_RING3_LIN (apparently the TDX module
enables MBEC?). I like your name a lot better.
> #define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \
> EPT_VIOLATION_PROT_WRITE | \
> EPT_VIOLATION_PROT_EXEC)
Hmm, so I think EPT_VIOLATION_PROT_MASK should include EPT_VIOLATION_PROT_USER_EXEC.
The existing TDX change does not, because unfortunately the bit is undefined if
MBEC is unsupported, but that's easy to solve by unconditionally clearing the bit
in handle_ept_violation(). And then when nested-EPT MBEC support comes along,
handle_ept_violation() can be modified to conditionally clear the bit based on
whether or not the current MMU supports MBEC.
I'll post a patch to include the bit in EPT_VIOLATION_PROT_MASK, and opportunistically
change the name.
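E.g. something like this (completely untested, and the final name may change
as noted above):

	#define EPT_VIOLATION_PROT_MASK		(EPT_VIOLATION_PROT_READ  | \
						 EPT_VIOLATION_PROT_WRITE | \
						 EPT_VIOLATION_PROT_EXEC  | \
						 EPT_VIOLATION_PROT_USER_EXEC)

with handle_ept_violation() masking off the USER_EXEC bit whenever it's
undefined (see below).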
> @@ -596,7 +597,7 @@ enum vm_entry_failure_code {
> #define EPT_VIOLATION_READ_TO_PROT(__epte) (((__epte) & VMX_EPT_READABLE_MASK) << 3)
> #define EPT_VIOLATION_WRITE_TO_PROT(__epte) (((__epte) & VMX_EPT_WRITABLE_MASK) << 3)
> #define EPT_VIOLATION_EXEC_TO_PROT(__epte) (((__epte) & VMX_EPT_EXECUTABLE_MASK) << 3)
> -#define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3)
Why? There's no escaping the fact that EXEC, a.k.a. X, is doing double duty as
"exec for all" and "kernel exec". And KVM has nearly two decades of history
using EXEC/X to refer to "exec for all". I see no reason to throw all of that
away and discard the intuitive and pervasive RWX logic.
> @@ -510,7 +511,15 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
> * Note, pte_access holds the raw RWX bits from the EPTE, not
> * ACC_*_MASK flags!
> */
> - walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
> + walker->fault.exit_qualification |=
> + EPT_VIOLATION_READ_TO_PROT(pte_access);
> + walker->fault.exit_qualification |=
> + EPT_VIOLATION_WRITE_TO_PROT(pte_access);
> + walker->fault.exit_qualification |=
> + EPT_VIOLATION_EXEC_TO_PROT(pte_access);
IMO, this is a big net negative. I much prefer the existing code, as it highlights
that USER_EXEC is the oddball.
> + if (vcpu->arch.pt_guest_exec_control)
This is wrong on multiple fronts. As mentioned earlier in the series, this is a
property of the MMU (more specifically, the root role), not of the vCPU.
And consulting MBEC support *only* when synthesizing the exit qualification is
wrong, because it means pte_access contains bogus data when consumed by
FNAME(gpte_access). At a glance, FNAME(gpte_access) probably needs to be modified
to take in the page role, e.g. like FNAME(sync_spte) and FNAME(prefetch_gpte)
already adjust the access based on the owning shadow page's access mask.
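Completely untested, "has_mbec" is just a placeholder for whatever role bit ends
up carrying the MBEC state, and this assumes the ACC_USER_EXEC_MASK flag added
earlier in this series, but roughly:

	static inline unsigned FNAME(gpte_access)(u64 gpte,
						  union kvm_mmu_page_role role)
	{
		unsigned access;
	#if PTTYPE == PTTYPE_EPT
		access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
			 ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
			 ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);

		/*
		 * Consume bit 10 if and only if the MMU is shadowing EPT with
		 * MBEC enabled, otherwise the walker (and thus the synthesized
		 * exit qualification) sees phantom permissions.
		 */
		if (role.has_mbec && (gpte & VMX_EPT_USER_EXECUTABLE_MASK))
			access |= ACC_USER_EXEC_MASK;
	#else
		/* non-EPT path unchanged */
		...
	#endif

		return access;
	}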
> + walker->fault.exit_qualification |=
> + EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access);
> }
> #endif
> walker->fault.address = addr;
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 116910159a3f..0aadfa924045 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5809,7 +5809,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
>
> static int handle_ept_violation(struct kvm_vcpu *vcpu)
> {
> - unsigned long exit_qualification;
> + unsigned long exit_qualification, rwx_mask;
> gpa_t gpa;
> u64 error_code;
>
> @@ -5839,7 +5839,10 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
> error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
> ? PFERR_FETCH_MASK : 0;
> /* ept page table entry is present? */
> - error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
> + rwx_mask = EPT_VIOLATION_PROT_MASK;
> + if (vcpu->arch.pt_guest_exec_control)
> + rwx_mask |= EPT_VIOLATION_PROT_USER_EXEC;
> + error_code |= (exit_qualification & rwx_mask)
> ? PFERR_PRESENT_MASK : 0;
As mentioned above, if KVM clears EPT_VIOLATION_PROT_USER_EXEC when it's
undefined, then this can simply use EPT_VIOLATION_PROT_MASK unchanged.
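I.e. something like this (untested sketch):

	/*
	 * Bit 6 (user-exec) is undefined unless MBEC is enabled, and KVM
	 * doesn't yet expose MBEC to L1, so unconditionally clear the bit
	 * rather than consuming garbage.  When nested MBEC support lands,
	 * key this off the current MMU instead.
	 */
	exit_qualification &= ~EPT_VIOLATION_PROT_USER_EXEC;
	...
	error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
		      ? PFERR_PRESENT_MASK : 0;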
>
> if (error_code & EPT_VIOLATION_GVA_IS_VALID)
> --
> 2.43.0
>
> On May 12, 2025, at 2:54 PM, Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Mar 13, 2025, Jon Kohler wrote:
>> From: Mickaël Salaün <mic@digikod.net>
>>
>> Add EPT_VIOLATION_PROT_USER_EXEC (bit 6) to reflect the user-executable
>> permission of a given address when Intel MBEC is enabled.
>>
>> Refactor usage of EPT_VIOLATION_RWX_TO_PROT to account for all of the
>> specific bits that are now possible with MBEC.
>>
>> Intel SDM 'Exit Qualification for EPT Violations' states the following
>> for Bit 6.
>> If the “mode-based execute control” VM-execution control is 0, the
>> value of this bit is undefined. If that control is 1, this bit is
>> the logical-AND of bit 10 in the EPT paging-structure entries used
>> to translate the guest-physical address of the access causing the
>> EPT violation. In this case, it indicates whether the guest-physical
>> address was executable for user-mode linear addresses.
>>
>> Bit 6 is cleared to 0 if (1) the “mode-based execute control”
>> VM-execution control is 1; and (2) either (a) any of EPT
>> paging-structure entries used to translate the guest-physical address
>> of the access causing the EPT violation is not present; or
>> (b) 4-level EPT is in use and the guest-physical address sets any
>> bits in the range 51:48.
>>
>> Signed-off-by: Mickaël Salaün <mic@digikod.net>
>> Co-developed-by: Jon Kohler <jon@nutanix.com>
>> Signed-off-by: Jon Kohler <jon@nutanix.com>
>>
>> ---
>> arch/x86/include/asm/vmx.h | 7 ++++---
>> arch/x86/kvm/mmu/paging_tmpl.h | 15 ++++++++++++---
>> arch/x86/kvm/vmx/vmx.c | 7 +++++--
>> 3 files changed, 21 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index ffc90d672b5d..84c5be416f5c 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -587,6 +587,7 @@ enum vm_entry_failure_code {
>> #define EPT_VIOLATION_PROT_READ BIT(3)
>> #define EPT_VIOLATION_PROT_WRITE BIT(4)
>> #define EPT_VIOLATION_PROT_EXEC BIT(5)
>> +#define EPT_VIOLATION_PROT_USER_EXEC BIT(6)
>
> Ugh, TDX added this as EPT_VIOLATION_EXEC_FOR_RING3_LIN (apparently the TDX module
> enables MBEC?). I like your name a lot better.
>
>> #define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \
>> EPT_VIOLATION_PROT_WRITE | \
>> EPT_VIOLATION_PROT_EXEC)
>
> Hmm, so I think EPT_VIOLATION_PROT_MASK should include EPT_VIOLATION_PROT_USER_EXEC.
> The existing TDX change does not, because unfortunately the bit is undefined if
> MBEC is unsupported, but that's easy to solve by unconditionally clearing the bit
> in handle_ept_violation(). And then when nested-EPT MBEC support comes along,
> handle_ept_violation() can be modified to conditionally clear the bit based on
> whether or not the current MMU supports MBEC.
>
> I'll post a patch to include the bit in EPT_VIOLATION_PROT_MASK, and opportunistically
> change the name.
I didn’t see a patch for this in 6.18, so I added a simple prep patch for this in v1
of my series. Happy to peel it out if needed so we can land that sooner; buyer's choice.
>> @@ -596,7 +597,7 @@ enum vm_entry_failure_code {
>> #define EPT_VIOLATION_READ_TO_PROT(__epte) (((__epte) & VMX_EPT_READABLE_MASK) << 3)
>> #define EPT_VIOLATION_WRITE_TO_PROT(__epte) (((__epte) & VMX_EPT_WRITABLE_MASK) << 3)
>> #define EPT_VIOLATION_EXEC_TO_PROT(__epte) (((__epte) & VMX_EPT_EXECUTABLE_MASK) << 3)
>> -#define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3)
>
> Why? There's no escaping the fact that EXEC, a.k.a. X, is doing double duty as
> "exec for all" and "kernel exec". And KVM has nearly two decades of history
> using EXEC/X to refer to "exec for all". I see no reason to throw all of that
> away and discard the intuitive and pervasive RWX logic.
Yeah, agreed, this was way too confusing in the RFC; I fixed it all in v1 (knock on
wood), so it is much easier on the eyes across the board.
>> @@ -510,7 +511,15 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>> * Note, pte_access holds the raw RWX bits from the EPTE, not
>> * ACC_*_MASK flags!
>> */
>> - walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
>> + walker->fault.exit_qualification |=
>> + EPT_VIOLATION_READ_TO_PROT(pte_access);
>> + walker->fault.exit_qualification |=
>> + EPT_VIOLATION_WRITE_TO_PROT(pte_access);
>> + walker->fault.exit_qualification |=
>> + EPT_VIOLATION_EXEC_TO_PROT(pte_access);
>
> IMO, this is a big net negative. I much prefer the existing code, as it highlights
> that USER_EXEC is the oddball.
Sold, I didn’t love it either. V1 is much smoother in this area.
>
>> + if (vcpu->arch.pt_guest_exec_control)
>
> This is wrong on multiple fronts. As mentioned earlier in the series, this is a
> property of the MMU (more specifically, the root role), not of the vCPU.
>
> And consulting MBEC support *only* when synthesizing the exit qualification is
> wrong, because it means pte_access contains bogus data when consumed by
> FNAME(gpte_access). At a glance, FNAME(gpte_access) probably needs to be modified
> to take in the page role, e.g. like FNAME(sync_spte) and FNAME(prefetch_gpte)
> already adjust the access based on the owning shadow page's access mask.
Ack/done, got rid of all of the vcpu cruft and moved everything to the mmu role, which
is much easier to deal with. Also went and brushed up a few more of these areas.
>> + walker->fault.exit_qualification |=
>> + EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access);
>> }
>> #endif
>> walker->fault.address = addr;
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 116910159a3f..0aadfa924045 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -5809,7 +5809,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
>>
>> static int handle_ept_violation(struct kvm_vcpu *vcpu)
>> {
>> - unsigned long exit_qualification;
>> + unsigned long exit_qualification, rwx_mask;
>> gpa_t gpa;
>> u64 error_code;
>>
>> @@ -5839,7 +5839,10 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
>> error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
>> ? PFERR_FETCH_MASK : 0;
>> /* ept page table entry is present? */
>> - error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
>> + rwx_mask = EPT_VIOLATION_PROT_MASK;
>> + if (vcpu->arch.pt_guest_exec_control)
>> + rwx_mask |= EPT_VIOLATION_PROT_USER_EXEC;
>> + error_code |= (exit_qualification & rwx_mask)
>> ? PFERR_PRESENT_MASK : 0;
>
> As mentioned above, if KVM clears EPT_VIOLATION_PROT_USER_EXEC when it's
> undefined, then this can simply use EPT_VIOLATION_PROT_MASK unchanged.
Sold!
>> if (error_code & EPT_VIOLATION_GVA_IS_VALID)
>> --
>> 2.43.0
>>
>