From: Yang Weijiang <weijiang.yang@intel.com>
Expose CET features to guest if KVM/host can support them, clear CPUID
feature bits if KVM/host cannot support.
Set CPUID feature bits so that CET features are available in guest CPUID.
Add CR4.CET bit support in order to allow guest set CET master control
bit.
Disable KVM CET feature if unrestricted_guest is unsupported/disabled as
KVM does not support emulating CET.
The CET load-bits in VM_ENTRY/VM_EXIT control fields should be set to make
guest CET xstates isolated from host's.
On platforms with VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error
code will fail, and if VMX_BASIC[bit56] == 1, #CP injection with or without
error code is allowed. Disable CET feature bits if the MSR bit is cleared
so that nested VMM can inject #CP if and only if VMX_BASIC[bit56] == 1.
Don't expose CET feature if either of {U,S}_CET xstate bits is cleared
in host XSS or if XSAVES isn't supported.
CET MSRs are reset to 0s after RESET, power-up and INIT, clear guest CET
xsave-area fields so that guest CET MSRs are reset to 0s after the events.
Meanwhile explicitly disable SHSTK and IBT for SVM because CET KVM enabling
for SVM is not ready.
Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
Signed-off-by: Chao Gao <chao.gao@intel.com>
---
v11:
1) Remove IBT CPUID reference to raw CPUID info after discussion.
2) Disable SHSTK/IBT support in SVM explicitly per Sean's comment.
3) Handle GUEST_S_CET field as a common field for SHSTK and IBT.
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/include/asm/vmx.h | 1 +
arch/x86/kvm/cpuid.c | 2 ++
arch/x86/kvm/svm/svm.c | 4 ++++
arch/x86/kvm/vmx/capabilities.h | 5 +++++
arch/x86/kvm/vmx/vmx.c | 30 +++++++++++++++++++++++++++++-
arch/x86/kvm/vmx/vmx.h | 6 ++++--
arch/x86/kvm/x86.c | 22 +++++++++++++++++++---
arch/x86/kvm/x86.h | 3 +++
9 files changed, 68 insertions(+), 7 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 30d9d434c048..2aca91c7ae1b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -142,7 +142,7 @@
| X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
| X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
| X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \
- | X86_CR4_LAM_SUP))
+ | X86_CR4_LAM_SUP | X86_CR4_CET))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index ce10a7e2d3d9..c85c50019523 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -134,6 +134,7 @@
#define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49)
#define VMX_BASIC_INOUT BIT_ULL(54)
#define VMX_BASIC_TRUE_CTLS BIT_ULL(55)
+#define VMX_BASIC_NO_HW_ERROR_CODE_CC BIT_ULL(56)
static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
{
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9b45607f9b37..7007ce792706 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -944,6 +944,7 @@ void kvm_set_cpu_caps(void)
VENDOR_F(WAITPKG),
F(SGX_LC),
F(BUS_LOCK_DETECT),
+ F(SHSTK),
);
/*
@@ -970,6 +971,7 @@ void kvm_set_cpu_caps(void)
F(AMX_INT8),
F(AMX_BF16),
F(FLUSH_L1D),
+ F(IBT),
);
if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) &&
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 803574920e41..6375695ce285 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -5223,6 +5223,10 @@ static __init void svm_set_cpu_caps(void)
kvm_caps.supported_perf_cap = 0;
kvm_caps.supported_xss = 0;
+ /* KVM doesn't yet support CET virtualization for SVM. */
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
kvm_cpu_cap_set(X86_FEATURE_SVM);
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 0f0e1717dc80..09c130d2e595 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -77,6 +77,11 @@ static inline bool cpu_has_vmx_basic_inout(void)
return vmcs_config.basic & VMX_BASIC_INOUT;
}
+static inline bool cpu_has_vmx_basic_no_hw_errcode(void)
+{
+ return vmcs_config.basic & VMX_BASIC_NO_HW_ERROR_CODE_CC;
+}
+
static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS &&
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3d6da3836e6b..d837876e3726 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2598,6 +2598,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
{ VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
{ VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
{ VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
+ { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE },
};
memset(vmcs_conf, 0, sizeof(*vmcs_conf));
@@ -4862,6 +4863,14 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ vmcs_writel(GUEST_SSP, 0);
+ vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
+ }
+ if (kvm_cpu_cap_has(X86_FEATURE_IBT) ||
+ kvm_cpu_cap_has(X86_FEATURE_SHSTK))
+ vmcs_writel(GUEST_S_CET, 0);
+
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
vpid_sync_context(vmx->vpid);
@@ -6310,6 +6319,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
+ if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE)
+ pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+ vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP),
+ vmcs_readl(GUEST_INTR_SSP_TABLE));
pr_err("*** Host State ***\n");
pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
@@ -6340,6 +6353,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
+ if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE)
+ pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+ vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP),
+ vmcs_readl(HOST_INTR_SSP_TABLE));
pr_err("*** Control State ***\n");
pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
@@ -7917,7 +7934,6 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_UMIP);
/* CPUID 0xD.1 */
- kvm_caps.supported_xss = 0;
if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
@@ -7929,6 +7945,18 @@ static __init void vmx_set_cpu_caps(void)
if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
+
+ /*
+ * Disable CET if unrestricted_guest is unsupported as KVM doesn't
+ * enforce CET HW behaviors in emulator. On platforms with
+ * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
+ * fails, so disable CET in this case too.
+ */
+ if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
+ !cpu_has_vmx_basic_no_hw_errcode()) {
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+ }
}
static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 87174d961c85..f93062965a7a 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -482,7 +482,8 @@ static inline u8 vmx_get_rvi(void)
VM_ENTRY_LOAD_IA32_EFER | \
VM_ENTRY_LOAD_BNDCFGS | \
VM_ENTRY_PT_CONCEAL_PIP | \
- VM_ENTRY_LOAD_IA32_RTIT_CTL)
+ VM_ENTRY_LOAD_IA32_RTIT_CTL | \
+ VM_ENTRY_LOAD_CET_STATE)
#define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \
(VM_EXIT_SAVE_DEBUG_CONTROLS | \
@@ -504,7 +505,8 @@ static inline u8 vmx_get_rvi(void)
VM_EXIT_LOAD_IA32_EFER | \
VM_EXIT_CLEAR_BNDCFGS | \
VM_EXIT_PT_CONCEAL_PIP | \
- VM_EXIT_CLEAR_IA32_RTIT_CTL)
+ VM_EXIT_CLEAR_IA32_RTIT_CTL | \
+ VM_EXIT_LOAD_CET_STATE)
#define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \
(PIN_BASED_EXT_INTR_MASK | \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b17a8bf84db3..3b99124f8985 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -223,7 +223,8 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs;
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
-#define KVM_SUPPORTED_XSS 0
+#define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_USER | \
+ XFEATURE_MASK_CET_KERNEL)
bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
@@ -10073,6 +10074,20 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
kvm_caps.supported_xss = 0;
+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+ !kvm_cpu_cap_has(X86_FEATURE_IBT))
+ kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER |
+ XFEATURE_MASK_CET_KERNEL);
+
+ if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER |
+ XFEATURE_MASK_CET_KERNEL)) !=
+ (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) {
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+ kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER |
+ XFEATURE_MASK_CET_KERNEL);
+ }
+
if (kvm_caps.has_tsc_control) {
/*
* Make sure the user can only configure tsc_khz values that
@@ -12729,10 +12744,11 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
/*
* On INIT, only select XSTATE components are zeroed, most components
* are unchanged. Currently, the only components that are zeroed and
- * supported by KVM are MPX related.
+ * supported by KVM are MPX and CET related.
*/
xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) &
- (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+ (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR |
+ XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL);
if (!xfeatures_mask)
return;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 8d2049a1f41b..e54c779e74a3 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -647,6 +647,9 @@ static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
__reserved_bits |= X86_CR4_PCIDE; \
if (!__cpu_has(__c, X86_FEATURE_LAM)) \
__reserved_bits |= X86_CR4_LAM_SUP; \
+ if (!__cpu_has(__c, X86_FEATURE_SHSTK) && \
+ !__cpu_has(__c, X86_FEATURE_IBT)) \
+ __reserved_bits |= X86_CR4_CET; \
__reserved_bits; \
})
--
2.47.1
On Fri, Jul 04, 2025 at 01:49:50AM -0700, Chao Gao wrote: > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > index 803574920e41..6375695ce285 100644 > --- a/arch/x86/kvm/svm/svm.c > +++ b/arch/x86/kvm/svm/svm.c > @@ -5223,6 +5223,10 @@ static __init void svm_set_cpu_caps(void) > kvm_caps.supported_perf_cap = 0; > kvm_caps.supported_xss = 0; > > + /* KVM doesn't yet support CET virtualization for SVM. */ > + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); > + kvm_cpu_cap_clear(X86_FEATURE_IBT); > + Since AMD isn't supporting IBT, not sure if it makes sense to clear IBT here since it doesn't look like we're clearing other features that we don't support in hardware. For compatibility, my series just removes both lines here, but the IBT clearing is probably not needed in this series. Thanks, John
On Wed, Aug 06, 2025, John Allen wrote: > On Fri, Jul 04, 2025 at 01:49:50AM -0700, Chao Gao wrote: > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > > index 803574920e41..6375695ce285 100644 > > --- a/arch/x86/kvm/svm/svm.c > > +++ b/arch/x86/kvm/svm/svm.c > > @@ -5223,6 +5223,10 @@ static __init void svm_set_cpu_caps(void) > > kvm_caps.supported_perf_cap = 0; > > kvm_caps.supported_xss = 0; > > > > + /* KVM doesn't yet support CET virtualization for SVM. */ > > + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); > > + kvm_cpu_cap_clear(X86_FEATURE_IBT); > > + > > Since AMD isn't supporting IBT, Isn't supporting IBT, yet. :-) I totally believe that AMD doesn't have any plans to support IBT, but unless IBT virtualization would Just Work (would it?), we should leave this in, because being paranoid is basically free. > not sure if it makes sense to clear IBT here since it doesn't look like we're > clearing other features that we don't support in hardware. For compatibility, > my series just removes both lines here, but the IBT clearing is probably not > needed in this series.
On 04.07.25 10:49, Chao Gao wrote: > From: Yang Weijiang <weijiang.yang@intel.com> > > Expose CET features to guest if KVM/host can support them, clear CPUID > feature bits if KVM/host cannot support. > [...] Can we please make CR4.CET a guest-owned bit as well (sending a patch in a second)? It's a logical continuation to making CR0.WP a guest-owned bit just that it's even easier this time, as no MMU role bits are involved and it still makes a big difference, at least for grsecurity guest kernels. Using the old test from [1] gives the following numbers (perf stat -r 5 ssdd 10 50000): * grsec guest on linux-6.16-rc5 + cet patches: 2.4647 +- 0.0706 seconds time elapsed ( +- 2.86% ) * grsec guest on linux-6.16-rc5 + cet patches + CR4.CET guest-owned: 1.5648 +- 0.0240 seconds time elapsed ( +- 1.53% ) Not only is it ~35% faster, it's also more stable, less fluctuation due to less VMEXITs, I believe. Thanks, Mathias [1] https://lore.kernel.org/kvm/20230322013731.102955-1-minipli@grsecurity.net/
On Mon, Jul 21, 2025, Mathias Krause wrote: > On 04.07.25 10:49, Chao Gao wrote: > > From: Yang Weijiang <weijiang.yang@intel.com> > > > > Expose CET features to guest if KVM/host can support them, clear CPUID > > feature bits if KVM/host cannot support. > > [...] > > Can we please make CR4.CET a guest-owned bit as well (sending a patch in > a second)? It's a logical continuation to making CR0.WP a guest-owned > bit just that it's even easier this time, as no MMU role bits are > involved and it still makes a big difference, at least for grsecurity > guest kernels. Out of curiosity, what's the use case for toggling CR4.CET at runtime? > Using the old test from [1] gives the following numbers (perf stat -r 5 > ssdd 10 50000): > > * grsec guest on linux-6.16-rc5 + cet patches: > 2.4647 +- 0.0706 seconds time elapsed ( +- 2.86% ) > > * grsec guest on linux-6.16-rc5 + cet patches + CR4.CET guest-owned: > 1.5648 +- 0.0240 seconds time elapsed ( +- 1.53% ) > > Not only is it ~35% faster, it's also more stable, less fluctuation due > to less VMEXITs, I believe. > > Thanks, > Mathias > > [1] > https://lore.kernel.org/kvm/20230322013731.102955-1-minipli@grsecurity.net/ > From 14ef5d8b952744c46c32f16fea3b29184cde3e65 Mon Sep 17 00:00:00 2001 > From: Mathias Krause <minipli@grsecurity.net> > Date: Mon, 21 Jul 2025 13:45:55 +0200 > Subject: [PATCH] KVM: VMX: Make CR4.CET a guest owned bit > > There's no need to intercept changes of CR4.CET, make it a guest-owned > bit where possible. In the changelog, please elaborate on the assertion that CR4.CET doesn't need to be intercepted, and include the motiviation and perf numbers. KVM's "rule" is to disable interception of something if and only if there is a good reason for doing so, because generally speaking intercepting is safer. E.g. KVM bugs are less likely to put the host at risk. "Because we can" isn't not a good reason :-) E.g. at one point CR4.LA57 was a guest-owned bit, and the code was buggy. Fixing things took far more effort than it should have there was no justification for the logic (IIRC, it was done purely on the whims of the original developer). KVM has had many such cases, where some weird behavior was never documented/justified, and I really, really want to avoid committing the same sins that have caused me so much pain :-) > This change is VMX-specific, as SVM has no such fine-grained control > register intercept control. > > Signed-off-by: Mathias Krause <minipli@grsecurity.net> > --- > arch/x86/kvm/kvm_cache_regs.h | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h > index 36a8786db291..8ddb01191d6f 100644 > --- a/arch/x86/kvm/kvm_cache_regs.h > +++ b/arch/x86/kvm/kvm_cache_regs.h > @@ -7,7 +7,8 @@ > #define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP) > #define KVM_POSSIBLE_CR4_GUEST_BITS \ > (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ > - | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) > + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE \ > + | X86_CR4_CET) > > #define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG) > #define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP) > -- > 2.47.2 >
On 21.07.25 19:45, Sean Christopherson wrote: > On Mon, Jul 21, 2025, Mathias Krause wrote: >> Can we please make CR4.CET a guest-owned bit as well (sending a patch in >> a second)? It's a logical continuation to making CR0.WP a guest-owned >> bit just that it's even easier this time, as no MMU role bits are >> involved and it still makes a big difference, at least for grsecurity >> guest kernels. > > Out of curiosity, what's the use case for toggling CR4.CET at runtime? Plain and simple: architectural requirements to be able to toggle CR0.WP. >> Using the old test from [1] gives the following numbers (perf stat -r 5 >> ssdd 10 50000): >> >> * grsec guest on linux-6.16-rc5 + cet patches: >> 2.4647 +- 0.0706 seconds time elapsed ( +- 2.86% ) >> >> * grsec guest on linux-6.16-rc5 + cet patches + CR4.CET guest-owned: >> 1.5648 +- 0.0240 seconds time elapsed ( +- 1.53% ) >> >> Not only is it ~35% faster, it's also more stable, less fluctuation due >> to less VMEXITs, I believe. Above test exercises the "pain path" by single-stepping a process and constantly switching between tracer and tracee. The scheduling part involves a CR0.WP toggle in grsecurity to be able to write to r/o memory, which now requires toggling CR4.CET as well. >> >> Thanks, >> Mathias >> >> [1] >> https://lore.kernel.org/kvm/20230322013731.102955-1-minipli@grsecurity.net/ > >> From 14ef5d8b952744c46c32f16fea3b29184cde3e65 Mon Sep 17 00:00:00 2001 >> From: Mathias Krause <minipli@grsecurity.net> >> Date: Mon, 21 Jul 2025 13:45:55 +0200 >> Subject: [PATCH] KVM: VMX: Make CR4.CET a guest owned bit >> >> There's no need to intercept changes of CR4.CET, make it a guest-owned >> bit where possible. > > In the changelog, please elaborate on the assertion that CR4.CET doesn't need to > be intercepted, and include the motiviation and perf numbers. KVM's "rule" is > to disable interception of something if and only if there is a good reason for > doing so, because generally speaking intercepting is safer. E.g. KVM bugs are > less likely to put the host at risk. "Because we can" isn't not a good reason :-) Understood, will extend the changelog accordingly. > > E.g. at one point CR4.LA57 was a guest-owned bit, and the code was buggy. Fixing > things took far more effort than it should have there was no justification for > the logic (IIRC, it was done purely on the whims of the original developer). > > KVM has had many such cases, where some weird behavior was never documented/justified, > and I really, really want to avoid committing the same sins that have caused me > so much pain :-) I totally understand your reasoning, "just because" shouldn't be the justification. In this case, however, not making it a guest-owned bit has a big performance impact for grsecurity, we would like to address. The defines and asserts regarding KVM_MMU_CR4_ROLE_BITS and KVM_POSSIBLE_CR4_GUEST_BITS (and KVM_MMU_CR0_ROLE_BITS / KVM_POSSIBLE_CR0_GUEST_BITS too) should catch future attempts on involving CR4.CET in any MMU-related decisions and I found no other use of the (nested) guest's CR4.CET value beside for sanity checks to prevent invalid architectural state (CR4.CET=1 but CR0.WP=0). That is, imho, existing documentation regarding the expectations on guest-owned bits and, even better, with BUILD_BUG_ON()s enforcing these. Thanks, Mathias
On Tue, Jul 22, 2025, Mathias Krause wrote: > On 21.07.25 19:45, Sean Christopherson wrote: > > On Mon, Jul 21, 2025, Mathias Krause wrote: > >> Can we please make CR4.CET a guest-owned bit as well (sending a patch in > >> a second)? It's a logical continuation to making CR0.WP a guest-owned > >> bit just that it's even easier this time, as no MMU role bits are > >> involved and it still makes a big difference, at least for grsecurity > >> guest kernels. > > > > Out of curiosity, what's the use case for toggling CR4.CET at runtime? > > Plain and simple: architectural requirements to be able to toggle CR0.WP. Ugh, right. That was less fun than I as expecting :-) > > E.g. at one point CR4.LA57 was a guest-owned bit, and the code was buggy. Fixing > > things took far more effort than it should have there was no justification for > > the logic (IIRC, it was done purely on the whims of the original developer). > > > > KVM has had many such cases, where some weird behavior was never documented/justified, > > and I really, really want to avoid committing the same sins that have caused me > > so much pain :-) > > I totally understand your reasoning, "just because" shouldn't be the > justification. In this case, however, not making it a guest-owned bit > has a big performance impact for grsecurity, we would like to address. Oh, I'm not objecting to the change, at all. I just want to make sure we capture the justification in the changelog.
There's no need to intercept changes to CR4.CET, as it's neither
included in KVM's MMU role bits, nor does KVM specifically care about
the actual value of a (nested) guest's CR4.CET value, beside for
enforcing architectural constraints, i.e. make sure that CR0.WP=1 if
CR4.CET=1.
Intercepting writes to CR4.CET is particularly bad for grsecurity
kernels with KERNEXEC or, even worse, KERNSEAL enabled. These features
heavily make use of read-only kernel objects and use a cpu-local CR0.WP
toggle to override it, when needed. Under a CET-enabled kernel, this
also requires toggling CR4.CET, hence the motivation to make it
guest-owned.
Using the old test from [1] gives the following runtime numbers (perf
stat -r 5 ssdd 10 50000):
* grsec guest on linux-6.16-rc5 + cet patches:
2.4647 +- 0.0706 seconds time elapsed ( +- 2.86% )
* grsec guest on linux-6.16-rc5 + cet patches + CR4.CET guest-owned:
1.5648 +- 0.0240 seconds time elapsed ( +- 1.53% )
Not only makes not intercepting CR4.CET the test run ~35% faster, it's
also more stable, less fluctuation due to less VMEXITs, I believe.
Therefore, make CR4.CET a guest-owned bit where possible.
This change is VMX-specific, as SVM has no such fine-grained control
register intercept control.
If KVM's assumptions regarding MMU role handling wrt. a guest's CR4.CET
value ever change, the BUILD_BUG_ON()s related to KVM_MMU_CR4_ROLE_BITS
and KVM_POSSIBLE_CR4_GUEST_BITS will catch that early.
Link: https://lore.kernel.org/kvm/20230322013731.102955-1-minipli@grsecurity.net/ [1]
Signed-off-by: Mathias Krause <minipli@grsecurity.net>
---
v2:
- provide motivation and performance numbers
arch/x86/kvm/kvm_cache_regs.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 36a8786db291..8ddb01191d6f 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -7,7 +7,8 @@
#define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP)
#define KVM_POSSIBLE_CR4_GUEST_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)
+ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE \
+ | X86_CR4_CET)
#define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG)
#define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP)
--
2.47.2
It is recommended to first state what a patch does before providing the background and motivation. See https://docs.kernel.org/process/maintainer-kvm-x86.html#changelog >There's no need to intercept changes to CR4.CET, as it's neither >included in KVM's MMU role bits, nor does KVM specifically care about >the actual value of a (nested) guest's CR4.CET value, beside for >enforcing architectural constraints, i.e. make sure that CR0.WP=1 if >CR4.CET=1. > >Intercepting writes to CR4.CET is particularly bad for grsecurity >kernels with KERNEXEC or, even worse, KERNSEAL enabled. These features >heavily make use of read-only kernel objects and use a cpu-local CR0.WP >toggle to override it, when needed. Under a CET-enabled kernel, this >also requires toggling CR4.CET, hence the motivation to make it >guest-owned. > >Using the old test from [1] gives the following runtime numbers (perf >stat -r 5 ssdd 10 50000): > >* grsec guest on linux-6.16-rc5 + cet patches: > 2.4647 +- 0.0706 seconds time elapsed ( +- 2.86% ) > >* grsec guest on linux-6.16-rc5 + cet patches + CR4.CET guest-owned: > 1.5648 +- 0.0240 seconds time elapsed ( +- 1.53% ) > >Not only makes not intercepting CR4.CET the test run ~35% faster, it's >also more stable, less fluctuation due to less VMEXITs, I believe. > >Therefore, make CR4.CET a guest-owned bit where possible. > >This change is VMX-specific, as SVM has no such fine-grained control >register intercept control. Ah, that's why the shortlog is "KVM: VMX". I was wondering why the shortlog specifically mentions VMX while the patch actually touches x86 common code. > >If KVM's assumptions regarding MMU role handling wrt. a guest's CR4.CET >value ever change, the BUILD_BUG_ON()s related to KVM_MMU_CR4_ROLE_BITS >and KVM_POSSIBLE_CR4_GUEST_BITS will catch that early. > >Link: https://lore.kernel.org/kvm/20230322013731.102955-1-minipli@grsecurity.net/ [1] >Signed-off-by: Mathias Krause <minipli@grsecurity.net> The patch looks good. So, Reviewed-by: Chao Gao <chao.gao@intel.com>
© 2016 - 2025 Red Hat, Inc.