Syscall hardening (i.e., converting the syscall indirect branch to a
series of direct branches) may cause performance regressions in certain
scenarios. Only use the syscall hardening when indirect branches are
considered unsafe.
Fixes: 1e3ad78334a6 ("x86/syscall: Don't force use of indirect calls for system calls")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
arch/x86/entry/common.c | 15 ++++++++++++---
arch/x86/entry/syscall_32.c | 11 +----------
arch/x86/entry/syscall_64.c | 6 ------
arch/x86/entry/syscall_x32.c | 7 ++++++-
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/syscall.h | 8 +++++++-
arch/x86/kernel/cpu/bugs.c | 31 +++++++++++++++++++++++++++++-
7 files changed, 57 insertions(+), 22 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 6de50b80702e..c0f8116291e4 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -49,7 +49,10 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
if (likely(unr < NR_syscalls)) {
unr = array_index_nospec(unr, NR_syscalls);
- regs->ax = x64_sys_call(regs, unr);
+ if (likely(cpu_feature_enabled(X86_FEATURE_INDIRECT_SAFE)))
+ regs->ax = sys_call_table[unr](regs);
+ else
+ regs->ax = x64_sys_call(regs, unr);
return true;
}
return false;
@@ -66,7 +69,10 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
xnr = array_index_nospec(xnr, X32_NR_syscalls);
- regs->ax = x32_sys_call(regs, xnr);
+ if (likely(cpu_feature_enabled(X86_FEATURE_INDIRECT_SAFE)))
+ regs->ax = x32_sys_call_table[xnr](regs);
+ else
+ regs->ax = x32_sys_call(regs, xnr);
return true;
}
return false;
@@ -162,7 +168,10 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
if (likely(unr < IA32_NR_syscalls)) {
unr = array_index_nospec(unr, IA32_NR_syscalls);
- regs->ax = ia32_sys_call(regs, unr);
+ if (likely(cpu_feature_enabled(X86_FEATURE_INDIRECT_SAFE)))
+ regs->ax = ia32_sys_call_table[unr](regs);
+ else
+ regs->ax = ia32_sys_call(regs, unr);
} else if (nr != -1) {
regs->ax = __ia32_sys_ni_syscall(regs);
}
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index c2235bae17ef..aab31760b4e3 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -14,25 +14,16 @@
#endif
#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
-
#include <asm/syscalls_32.h>
#undef __SYSCALL
-/*
- * The sys_call_table[] is no longer used for system calls, but
- * kernel/trace/trace_syscalls.c still wants to know the system
- * call address.
- */
-#ifdef CONFIG_X86_32
#define __SYSCALL(nr, sym) __ia32_##sym,
-const sys_call_ptr_t sys_call_table[] = {
+const sys_call_ptr_t ia32_sys_call_table[] = {
#include <asm/syscalls_32.h>
};
#undef __SYSCALL
-#endif
#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);
-
long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
{
switch (nr) {
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 33b3f09e6f15..96ea1f8a1d3f 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -11,11 +11,6 @@
#include <asm/syscalls_64.h>
#undef __SYSCALL
-/*
- * The sys_call_table[] is no longer used for system calls, but
- * kernel/trace/trace_syscalls.c still wants to know the system
- * call address.
- */
#define __SYSCALL(nr, sym) __x64_##sym,
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
@@ -23,7 +18,6 @@ const sys_call_ptr_t sys_call_table[] = {
#undef __SYSCALL
#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
switch (nr) {
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index 03de4a932131..5aef4230faca 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -11,8 +11,13 @@
#include <asm/syscalls_x32.h>
#undef __SYSCALL
-#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
+#define __SYSCALL(nr, sym) __x64_##sym,
+const sys_call_ptr_t x32_sys_call_table[] = {
+#include <asm/syscalls_x32.h>
+};
+#undef __SYSCALL
+#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
{
switch (nr) {
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 3c7434329661..7c87fe80c696 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -470,6 +470,7 @@
#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */
#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */
#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */
+#define X86_FEATURE_INDIRECT_SAFE (21*32+ 4) /* "" Indirect branches aren't vulnerable to Spectre v2 */
/*
* BUG word(s)
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 2fc7bc3863ff..dfb59521244c 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -16,14 +16,20 @@
#include <asm/thread_info.h> /* for TS_COMPAT */
#include <asm/unistd.h>
-/* This is used purely for kernel/trace/trace_syscalls.c */
typedef long (*sys_call_ptr_t)(const struct pt_regs *);
extern const sys_call_ptr_t sys_call_table[];
+#if defined(CONFIG_X86_32)
+#define ia32_sys_call_table sys_call_table
+#else
/*
* These may not exist, but still put the prototypes in so we
* can use IS_ENABLED().
*/
+extern const sys_call_ptr_t ia32_sys_call_table[];
+extern const sys_call_ptr_t x32_sys_call_table[];
+#endif
+
extern long ia32_sys_call(const struct pt_regs *, unsigned int nr);
extern long x32_sys_call(const struct pt_regs *, unsigned int nr);
extern long x64_sys_call(const struct pt_regs *, unsigned int nr);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ca295b0c1eee..dcb97cc2758f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1664,6 +1664,15 @@ static void __init bhi_select_mitigation(void)
if (!IS_ENABLED(CONFIG_X86_64))
return;
+ /*
+ * There's no hardware mitigation in place, so mark indirect branches
+ * as unsafe.
+ *
+ * One could argue the SW loop makes indirect branches safe again, but
+ * Linus prefers it this way.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
+
/* Mitigate KVM by default */
setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
@@ -1678,6 +1687,21 @@ static void __init spectre_v2_select_mitigation(void)
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
+ /*
+ * X86_FEATURE_INDIRECT_SAFE indicates whether indirect calls can be
+ * considered safe. That means either:
+ *
+ * - the CPU isn't vulnerable to Spectre v2 or its variants;
+ *
+ * - a hardware mitigation is in place (e.g., IBRS, BHI_DIS_S); or
+ *
+ * - the user turned off mitigations altogether.
+ *
+ * Assume innocence until proven guilty: set the cap bit now, then
+ * clear it later if/when needed.
+ */
+ setup_force_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
+
/*
* If the CPU is not affected and the command line mode is NONE or AUTO
* then nothing to do.
@@ -1764,11 +1788,16 @@ static void __init spectre_v2_select_mitigation(void)
break;
case SPECTRE_V2_LFENCE:
+ setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
+ fallthrough;
case SPECTRE_V2_EIBRS_LFENCE:
setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
- fallthrough;
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+ break;
case SPECTRE_V2_RETPOLINE:
+ setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
+ fallthrough;
case SPECTRE_V2_EIBRS_RETPOLINE:
setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
break;
--
2.44.0
On 12.04.24 г. 21:10 ч., Josh Poimboeuf wrote:
> Syscall hardening (i.e., converting the syscall indirect branch to a
> series of direct branches) may cause performance regressions in certain
> scenarios. Only use the syscall hardening when indirect branches are
> considered unsafe.
>
> Fixes: 1e3ad78334a6 ("x86/syscall: Don't force use of indirect calls for system calls")
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
> arch/x86/entry/common.c | 15 ++++++++++++---
> arch/x86/entry/syscall_32.c | 11 +----------
> arch/x86/entry/syscall_64.c | 6 ------
> arch/x86/entry/syscall_x32.c | 7 ++++++-
> arch/x86/include/asm/cpufeatures.h | 1 +
> arch/x86/include/asm/syscall.h | 8 +++++++-
> arch/x86/kernel/cpu/bugs.c | 31 +++++++++++++++++++++++++++++-
> 7 files changed, 57 insertions(+), 22 deletions(-)
>
To ask again, what do we gain by having this syscall hardening at the
same time as the always on BHB scrubbing sequence?
On Mon, 15 Apr 2024 at 00:37, Nikolay Borisov <nik.borisov@suse.com> wrote:
>
> To ask again, what do we gain by having this syscall hardening at the
> same time as the always on BHB scrubbing sequence?
What happens the next time some indirect call problem comes up?
If we had had *one* hardware bug in this area, that would be one
thing. But this has been going on for a decade now.
Linus
On 15.04.24 г. 18:16 ч., Linus Torvalds wrote: > On Mon, 15 Apr 2024 at 00:37, Nikolay Borisov <nik.borisov@suse.com> wrote: >> >> To ask again, what do we gain by having this syscall hardening at the >> same time as the always on BHB scrubbing sequence? > > What happens the next time some indirect call problem comes up? Same as with every issue - assess the problem and develop fixes. Let's be honest, the indirect branches in the syscall handler aren't the biggest problem, it's the stacked LSMs. And even if those get fixes chances are the security people will likely find some other avenue of attack, I think even now the attack is somewhat hard to pull off. So in any case this could have been a completely independent patch of the BHI series. > > If we had had *one* hardware bug in this area, that would be one > thing. But this has been going on for a decade now. > > Linus
On Mon, 15 Apr 2024 at 08:27, Nikolay Borisov <nik.borisov@suse.com> wrote:
>
> Same as with every issue - assess the problem and develop fixes.
No. Let's have at least all the infrastructure in place to be a bit proactive.
> Let's be honest, the indirect branches in the syscall handler aren't the
> biggest problem
Oh, they have been.
> it's the stacked LSMs.
Hopefully those will get fixed too.
There's a few other fairly reachable ones (the timer indirection ones
are much too close, and VFS file ops aren't entirely out of reach).
But maybe some day we'll be in a situation where it's actually fairly
hard to reach indirect kernel calls from untrusted user space.
The system call ones are pretty much always the first ones, though.
> And even if those get fixes
> chances are the security people will likely find some other avenue of
> attack, I think even now the attack is somewhat hard to pull off.
No disagreement about that. I think outright sw bugs are still the
99.9% thing. But let's learn from history instead of "assess the
problem" every time anew.
Linus
On Fri, Apr 12, 2024 at 11:10:32AM -0700, Josh Poimboeuf wrote: > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h > index 3c7434329661..7c87fe80c696 100644 > --- a/arch/x86/include/asm/cpufeatures.h > +++ b/arch/x86/include/asm/cpufeatures.h > @@ -470,6 +470,7 @@ > #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ > #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ > #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ > +#define X86_FEATURE_INDIRECT_SAFE (21*32+ 4) /* "" Indirect branches aren't vulnerable to Spectre v2 */ This should be (21*32+ 5). Other than that: Reviewed-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
On Fri, Apr 12, 2024 at 03:42:32PM -0700, Pawan Gupta wrote: > On Fri, Apr 12, 2024 at 11:10:32AM -0700, Josh Poimboeuf wrote: > > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h > > index 3c7434329661..7c87fe80c696 100644 > > --- a/arch/x86/include/asm/cpufeatures.h > > +++ b/arch/x86/include/asm/cpufeatures.h > > @@ -470,6 +470,7 @@ > > #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ > > #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ > > #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ > > +#define X86_FEATURE_INDIRECT_SAFE (21*32+ 4) /* "" Indirect branches aren't vulnerable to Spectre v2 */ > > This should be (21*32+ 5). Argh :-/ > Other than that: > > Reviewed-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com> Thanks! -- Josh
© 2016 - 2025 Red Hat, Inc.