[PATCH 5/7] x86/bugs: Only harden syscalls when needed

Josh Poimboeuf posted 7 patches 1 year, 10 months ago
[PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Josh Poimboeuf 1 year, 10 months ago
Syscall hardening (i.e., converting the syscall indirect branch to a
series of direct branches) may cause performance regressions in certain
scenarios.  Only use the syscall hardening when indirect branches are
considered unsafe.

Fixes: 1e3ad78334a6 ("x86/syscall: Don't force use of indirect calls for system calls")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 arch/x86/entry/common.c            | 30 +++++++++++++++++++++++++---
 arch/x86/entry/syscall_32.c        | 11 +---------
 arch/x86/entry/syscall_64.c        |  8 +-------
 arch/x86/entry/syscall_x32.c       |  7 ++++++-
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/syscall.h     |  8 +++++++-
 arch/x86/kernel/cpu/bugs.c         | 32 +++++++++++++++++++++++++++++-
 7 files changed, 74 insertions(+), 23 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 6de50b80702e..80d432d2fe44 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -39,6 +39,28 @@
 
 #ifdef CONFIG_X86_64
 
+/*
+ * Do either a direct or an indirect call, depending on whether indirect calls
+ * are considered safe.
+ */
+#define __do_syscall(table, func_direct, nr, regs)			\
+({									\
+	unsigned long __rax, __rdi, __rsi;				\
+									\
+	asm_inline volatile(						\
+		ALTERNATIVE("call " __stringify(func_direct) "\n\t",	\
+			    ANNOTATE_RETPOLINE_SAFE			\
+			    "call *%[func_ptr]\n\t",			\
+			    X86_FEATURE_INDIRECT_SAFE)			\
+		: "=D" (__rdi), "=S" (__rsi), "=a" (__rax),		\
+		  ASM_CALL_CONSTRAINT					\
+		: "0" (regs), "1" (nr), [func_ptr] "r" (table[nr])	\
+		: "rdx", "rcx", "r8", "r9", "r10", "r11",		\
+		  "cc", "memory");					\
+									\
+	__rax;								\
+})
+
 static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
 {
 	/*
@@ -49,7 +71,7 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
 
 	if (likely(unr < NR_syscalls)) {
 		unr = array_index_nospec(unr, NR_syscalls);
-		regs->ax = x64_sys_call(regs, unr);
+		regs->ax = __do_syscall(sys_call_table, x64_sys_call, unr, regs);
 		return true;
 	}
 	return false;
@@ -66,7 +88,7 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
 
 	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
 		xnr = array_index_nospec(xnr, X32_NR_syscalls);
-		regs->ax = x32_sys_call(regs, xnr);
+		regs->ax = __do_syscall(x32_sys_call_table, x32_sys_call, xnr, regs);
 		return true;
 	}
 	return false;
@@ -147,6 +169,8 @@ static int ia32_emulation_override_cmdline(char *arg)
 	return kstrtobool(arg, &__ia32_enabled);
 }
 early_param("ia32_emulation", ia32_emulation_override_cmdline);
+#else
+#define __do_syscall(table, func_direct, nr, regs) table[nr](regs)
 #endif
 
 /*
@@ -162,7 +186,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
 
 	if (likely(unr < IA32_NR_syscalls)) {
 		unr = array_index_nospec(unr, IA32_NR_syscalls);
-		regs->ax = ia32_sys_call(regs, unr);
+		regs->ax = __do_syscall(ia32_sys_call_table, ia32_sys_call, unr, regs);
 	} else if (nr != -1) {
 		regs->ax = __ia32_sys_ni_syscall(regs);
 	}
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index c2235bae17ef..9185870a3ab3 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -14,25 +14,16 @@
 #endif
 
 #define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
-
 #include <asm/syscalls_32.h>
 #undef __SYSCALL
 
-/*
- * The sys_call_table[] is no longer used for system calls, but
- * kernel/trace/trace_syscalls.c still wants to know the system
- * call address.
- */
-#ifdef CONFIG_X86_32
 #define __SYSCALL(nr, sym) __ia32_##sym,
-const sys_call_ptr_t sys_call_table[] = {
+__visible const sys_call_ptr_t ia32_sys_call_table[] = {
 #include <asm/syscalls_32.h>
 };
 #undef __SYSCALL
-#endif
 
 #define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);
-
 long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
 {
 	switch (nr) {
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 33b3f09e6f15..c368048efa41 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -11,19 +11,13 @@
 #include <asm/syscalls_64.h>
 #undef __SYSCALL
 
-/*
- * The sys_call_table[] is no longer used for system calls, but
- * kernel/trace/trace_syscalls.c still wants to know the system
- * call address.
- */
 #define __SYSCALL(nr, sym) __x64_##sym,
-const sys_call_ptr_t sys_call_table[] = {
+asmlinkage const sys_call_ptr_t sys_call_table[] = {
 #include <asm/syscalls_64.h>
 };
 #undef __SYSCALL
 
 #define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-
 long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
 {
 	switch (nr) {
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index 03de4a932131..89a717267fab 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -11,8 +11,13 @@
 #include <asm/syscalls_x32.h>
 #undef __SYSCALL
 
-#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
+#define __SYSCALL(nr, sym) __x64_##sym,
+asmlinkage const sys_call_ptr_t x32_sys_call_table[] = {
+#include <asm/syscalls_x32.h>
+};
+#undef __SYSCALL
 
+#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
 long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
 {
 	switch (nr) {
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 3c7434329661..7c87fe80c696 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -470,6 +470,7 @@
 #define X86_FEATURE_BHI_CTRL		(21*32+ 2) /* "" BHI_DIS_S HW control available */
 #define X86_FEATURE_CLEAR_BHB_HW	(21*32+ 3) /* "" BHI_DIS_S HW control enabled */
 #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */
+#define X86_FEATURE_INDIRECT_SAFE	(21*32+ 5) /* "" Indirect branches aren't vulnerable to Spectre v2 */
 
 /*
  * BUG word(s)
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 2fc7bc3863ff..dfb59521244c 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -16,14 +16,20 @@
 #include <asm/thread_info.h>	/* for TS_COMPAT */
 #include <asm/unistd.h>
 
-/* This is used purely for kernel/trace/trace_syscalls.c */
 typedef long (*sys_call_ptr_t)(const struct pt_regs *);
 extern const sys_call_ptr_t sys_call_table[];
 
+#if defined(CONFIG_X86_32)
+#define ia32_sys_call_table sys_call_table
+#else
 /*
  * These may not exist, but still put the prototypes in so we
  * can use IS_ENABLED().
  */
+extern const sys_call_ptr_t ia32_sys_call_table[];
+extern const sys_call_ptr_t x32_sys_call_table[];
+#endif
+
 extern long ia32_sys_call(const struct pt_regs *, unsigned int nr);
 extern long x32_sys_call(const struct pt_regs *, unsigned int nr);
 extern long x64_sys_call(const struct pt_regs *, unsigned int nr);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index a65c70709bb5..efffd87381b1 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1669,6 +1669,15 @@ static void __init bhi_select_mitigation(void)
 	if (!IS_ENABLED(CONFIG_X86_64))
 		return;
 
+	/*
+	 * There's no hardware mitigation in place, so mark indirect branches
+	 * as unsafe.
+	 *
+	 * One could argue the SW loop makes indirect branches safe again, but
+	 * Linus prefers it this way.
+	 */
+	setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
+
 	/* Mitigate KVM by default */
 	setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
 	pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
@@ -1686,6 +1695,21 @@ static void __init spectre_v2_select_mitigation(void)
 	enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
 	enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
 
+	/*
+	 * X86_FEATURE_INDIRECT_SAFE indicates whether indirect calls can be
+	 * considered safe.  That means either:
+	 *
+	 *   - the CPU isn't vulnerable to Spectre v2 or its variants;
+	 *
+	 *   - a hardware mitigation is in place (e.g., IBRS, BHI_DIS_S); or
+	 *
+	 *   - the user turned off mitigations altogether.
+	 *
+	 * Assume innocence until proven guilty: set the cap bit now, then
+	 * clear it later if/when needed.
+	 */
+	setup_force_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
+
 	/*
 	 * If the CPU is not affected and the command line mode is NONE or AUTO
 	 * then nothing to do.
@@ -1720,6 +1744,7 @@ static void __init spectre_v2_select_mitigation(void)
 
 	case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
 		pr_err(SPECTRE_V2_LFENCE_MSG);
+		setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
 		mode = SPECTRE_V2_LFENCE;
 		break;
 
@@ -1772,11 +1797,16 @@ static void __init spectre_v2_select_mitigation(void)
 		break;
 
 	case SPECTRE_V2_LFENCE:
+		setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
+		fallthrough;
 	case SPECTRE_V2_EIBRS_LFENCE:
 		setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
-		fallthrough;
+		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+		break;
 
 	case SPECTRE_V2_RETPOLINE:
+		setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
+		fallthrough;
 	case SPECTRE_V2_EIBRS_RETPOLINE:
 		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
 		break;
-- 
2.44.0
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Pawan Gupta 1 year, 10 months ago
On Wed, Apr 10, 2024 at 10:40:49PM -0700, Josh Poimboeuf wrote:
> Syscall hardening (i.e., converting the syscall indirect branch to a
> series of direct branches) may cause performance regressions in certain
> scenarios.  Only use the syscall hardening when indirect branches are
> considered unsafe.
> 
> Fixes: 1e3ad78334a6 ("x86/syscall: Don't force use of indirect calls for system calls")
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
>  arch/x86/entry/common.c            | 30 +++++++++++++++++++++++++---
>  arch/x86/entry/syscall_32.c        | 11 +---------
>  arch/x86/entry/syscall_64.c        |  8 +-------
>  arch/x86/entry/syscall_x32.c       |  7 ++++++-
>  arch/x86/include/asm/cpufeatures.h |  1 +
>  arch/x86/include/asm/syscall.h     |  8 +++++++-
>  arch/x86/kernel/cpu/bugs.c         | 32 +++++++++++++++++++++++++++++-
>  7 files changed, 74 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 6de50b80702e..80d432d2fe44 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -39,6 +39,28 @@
>  
>  #ifdef CONFIG_X86_64
>  
> +/*
> + * Do either a direct or an indirect call, depending on whether indirect calls
> + * are considered safe.
> + */
> +#define __do_syscall(table, func_direct, nr, regs)			\
> +({									\
> +	unsigned long __rax, __rdi, __rsi;				\
> +									\
> +	asm_inline volatile(						\
> +		ALTERNATIVE("call " __stringify(func_direct) "\n\t",	\
> +			    ANNOTATE_RETPOLINE_SAFE			\
> +			    "call *%[func_ptr]\n\t",			\
> +			    X86_FEATURE_INDIRECT_SAFE)			\
> +		: "=D" (__rdi), "=S" (__rsi), "=a" (__rax),		\
> +		  ASM_CALL_CONSTRAINT					\
> +		: "0" (regs), "1" (nr), [func_ptr] "r" (table[nr])	\
> +		: "rdx", "rcx", "r8", "r9", "r10", "r11",		\
> +		  "cc", "memory");					\
> +									\
> +	__rax;								\
> +})

This is a nice implementation, but I think we can avoid the complexity
by using cpu_feature_enabled(). As cpu_feature_enabled() is also runtime
patched, at least the likely() path should be comparable to this. Please
let me know if you have any concerns with this approach.

---
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 6de50b80702e..7c5332b83246 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -49,7 +49,11 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
 
 	if (likely(unr < NR_syscalls)) {
 		unr = array_index_nospec(unr, NR_syscalls);
-		regs->ax = x64_sys_call(regs, unr);
+		if (likely(cpu_feature_enabled(X86_FEATURE_INDIRECT_SAFE)))
+			regs->ax = sys_call_table[unr](regs);
+		else
+			regs->ax = x64_sys_call(regs, unr);
+
 		return true;
 	}
 	return false;
@@ -66,7 +70,11 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
 
 	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
 		xnr = array_index_nospec(xnr, X32_NR_syscalls);
-		regs->ax = x32_sys_call(regs, xnr);
+		if (likely(cpu_feature_enabled(X86_FEATURE_INDIRECT_SAFE)))
+			regs->ax = x32_sys_call_table[xnr](regs);
+		else
+			regs->ax = x32_sys_call(regs, xnr);
+
 		return true;
 	}
 	return false;
@@ -162,7 +170,10 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
 
 	if (likely(unr < IA32_NR_syscalls)) {
 		unr = array_index_nospec(unr, IA32_NR_syscalls);
-		regs->ax = ia32_sys_call(regs, unr);
+		if (likely(cpu_feature_enabled(X86_FEATURE_INDIRECT_SAFE)))
+			regs->ax = ia32_sys_call_table[unr](regs);
+		else
+			regs->ax = ia32_sys_call(regs, unr);
 	} else if (nr != -1) {
 		regs->ax = __ia32_sys_ni_syscall(regs);
 	}
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Pawan Gupta 1 year, 10 months ago
On Thu, Apr 11, 2024 at 11:28:54PM -0700, Pawan Gupta wrote:
> > +#define __do_syscall(table, func_direct, nr, regs)			\
> > +({									\
> > +	unsigned long __rax, __rdi, __rsi;				\
> > +									\
> > +	asm_inline volatile(						\
> > +		ALTERNATIVE("call " __stringify(func_direct) "\n\t",	\
> > +			    ANNOTATE_RETPOLINE_SAFE			\
> > +			    "call *%[func_ptr]\n\t",			\
> > +			    X86_FEATURE_INDIRECT_SAFE)			\
> > +		: "=D" (__rdi), "=S" (__rsi), "=a" (__rax),		\
> > +		  ASM_CALL_CONSTRAINT					\
> > +		: "0" (regs), "1" (nr), [func_ptr] "r" (table[nr])	\
> > +		: "rdx", "rcx", "r8", "r9", "r10", "r11",		\
> > +		  "cc", "memory");					\
> > +									\
> > +	__rax;								\
> > +})
> 
> This is a nice implementation, but I think we can avoid the complexity
> by using cpu_feature_enabled(). As cpu_feature_enabled() is also runtime
> patched, at least the likely() path should be comparable to this. Please
> let me know if you have any concerns with this approach.
> 
> ---
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 6de50b80702e..7c5332b83246 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -49,7 +49,11 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
>  
>  	if (likely(unr < NR_syscalls)) {
>  		unr = array_index_nospec(unr, NR_syscalls);
> -		regs->ax = x64_sys_call(regs, unr);
> +		if (likely(cpu_feature_enabled(X86_FEATURE_INDIRECT_SAFE)))
> +			regs->ax = sys_call_table[unr](regs);
> +		else
> +			regs->ax = x64_sys_call(regs, unr);

BTW, this also solves the missing lfence case before the indirect call.
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Pawan Gupta 1 year, 10 months ago
On Wed, Apr 10, 2024 at 10:40:49PM -0700, Josh Poimboeuf wrote:
> Syscall hardening (i.e., converting the syscall indirect branch to a
> series of direct branches) may cause performance regressions in certain
> scenarios.  Only use the syscall hardening when indirect branches are
> considered unsafe.
> 
> Fixes: 1e3ad78334a6 ("x86/syscall: Don't force use of indirect calls for system calls")
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
>  arch/x86/entry/common.c            | 30 +++++++++++++++++++++++++---
>  arch/x86/entry/syscall_32.c        | 11 +---------
>  arch/x86/entry/syscall_64.c        |  8 +-------
>  arch/x86/entry/syscall_x32.c       |  7 ++++++-
>  arch/x86/include/asm/cpufeatures.h |  1 +
>  arch/x86/include/asm/syscall.h     |  8 +++++++-
>  arch/x86/kernel/cpu/bugs.c         | 32 +++++++++++++++++++++++++++++-
>  7 files changed, 74 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 6de50b80702e..80d432d2fe44 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -39,6 +39,28 @@
>  
>  #ifdef CONFIG_X86_64
>  
> +/*
> + * Do either a direct or an indirect call, depending on whether indirect calls
> + * are considered safe.
> + */
> +#define __do_syscall(table, func_direct, nr, regs)			\
> +({									\
> +	unsigned long __rax, __rdi, __rsi;				\
> +									\
> +	asm_inline volatile(						\
> +		ALTERNATIVE("call " __stringify(func_direct) "\n\t",	\
> +			    ANNOTATE_RETPOLINE_SAFE			\
> +			    "call *%[func_ptr]\n\t",			\

This will likely not insert the lfence before the indirect call in
spectre_v2=eibrs,lfence mode. As X86_FEATURE_INDIRECT_SAFE is not
cleared when eIBRS is enabled, this will not be converted to direct
call.

[...]
> @@ -1720,6 +1744,7 @@ static void __init spectre_v2_select_mitigation(void)
>  
>  	case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
>  		pr_err(SPECTRE_V2_LFENCE_MSG);
> +		setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);

I don't know if it intentional, this seems to be the duplicate of
X86_FEATURE_INDIRECT_SAFE clear later in SPECTRE_V2_LFENCE mode. Also it
seems a bit odd to do this here in SPECTRE_V2_CMD handling.

>  		mode = SPECTRE_V2_LFENCE;
>  		break;
>  
> @@ -1772,11 +1797,16 @@ static void __init spectre_v2_select_mitigation(void)
>  		break;
>  
>  	case SPECTRE_V2_LFENCE:
> +		setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
> +		fallthrough;
>  	case SPECTRE_V2_EIBRS_LFENCE:
>  		setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
> -		fallthrough;
> +		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
> +		break;
>  
>  	case SPECTRE_V2_RETPOLINE:
> +		setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
> +		fallthrough;
>  	case SPECTRE_V2_EIBRS_RETPOLINE:
>  		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
>  		break;
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Josh Poimboeuf 1 year, 10 months ago
On Thu, Apr 11, 2024 at 05:15:22PM -0700, Pawan Gupta wrote:
> > + * Do either a direct or an indirect call, depending on whether indirect calls
> > + * are considered safe.
> > + */
> > +#define __do_syscall(table, func_direct, nr, regs)			\
> > +({									\
> > +	unsigned long __rax, __rdi, __rsi;				\
> > +									\
> > +	asm_inline volatile(						\
> > +		ALTERNATIVE("call " __stringify(func_direct) "\n\t",	\
> > +			    ANNOTATE_RETPOLINE_SAFE			\
> > +			    "call *%[func_ptr]\n\t",			\
> 
> This will likely not insert the lfence before the indirect call in
> spectre_v2=eibrs,lfence mode. As X86_FEATURE_INDIRECT_SAFE is not
> cleared when eIBRS is enabled, this will not be converted to direct
> call.

Hm, I think the problem here is that SPECTRE_V2_EIBRS_LFENCE confusingly
sets X86_FEATURE_RETPOLINE.  So the following bit unintentionally takes
effect:

	/* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */
	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
		spec_ctrl_disable_kernel_rrsba();
		if (rrsba_disabled)
			return;
	}

If RRSBA gets disabled (which is likely), bhi_select_mitigation()
returns early and X86_FEATURE_INDIRECT_SAFE doesn't get cleared.

"LFENCE; CALL" is most definitely not a retpoline, so it's weird for
SPECTRE_V2_EIBRS_LFENCE to be setting X86_FEATURE_RETPOLINE.  We should
fix that.

Honestly, I think SPECTRE_V2_EIBRS_LFENCE is obsolete anyway.  It was
originally intended to be a BHI mitigation, but the real-world
benchmarks I've seen are showing it to be quite a bit slower than the
actual BHI mitigations.

Plus it's only a partial fix because the speculative window after the
branch can still be big enough to do multiple loads.

For similar reasons I'm thinking we should also remove the non-eIBRS
version (SPECTRE_V2_LFENCE).

I'll make some patches to do that, with warnings printed if somebody
tries to use them.  They can just fall back to the (more secure and
generally faster) defaults.

> [...]
> > @@ -1720,6 +1744,7 @@ static void __init spectre_v2_select_mitigation(void)
> >  
> >  	case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
> >  		pr_err(SPECTRE_V2_LFENCE_MSG);
> > +		setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
> 
> I don't know if it intentional, this seems to be the duplicate of
> X86_FEATURE_INDIRECT_SAFE clear later in SPECTRE_V2_LFENCE mode. Also it
> seems a bit odd to do this here in SPECTRE_V2_CMD handling.

Yeah, I accidentally left that in from an earlier implementation.  It's
harmless but I'll clean that up too with a new patch unless Ingo wants
to remove that line.

-- 
Josh
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Ingo Molnar 1 year, 10 months ago
* Josh Poimboeuf <jpoimboe@kernel.org> wrote:

> > [...]
> > > @@ -1720,6 +1744,7 @@ static void __init spectre_v2_select_mitigation(void)
> > >  
> > >  	case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
> > >  		pr_err(SPECTRE_V2_LFENCE_MSG);
> > > +		setup_clear_cpu_cap(X86_FEATURE_INDIRECT_SAFE);
> > 
> > I don't know if it intentional, this seems to be the duplicate of
> > X86_FEATURE_INDIRECT_SAFE clear later in SPECTRE_V2_LFENCE mode. Also it
> > seems a bit odd to do this here in SPECTRE_V2_CMD handling.
> 
> Yeah, I accidentally left that in from an earlier implementation.  It's 
> harmless but I'll clean that up too with a new patch unless Ingo wants to 
> remove that line.

Lemme remove it entirely from x86/urgent, so that you can submit an updated 
patch with all feedback included.

In addition to the above line, Pawan's suggestion of doing it in C via 
cpu_feature_enabled() looks quite a bit simpler and easier to read & argue 
about, right?

Thanks,

	Ingo
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Pawan Gupta 1 year, 10 months ago
On Thu, Apr 11, 2024 at 08:57:40PM -0700, Josh Poimboeuf wrote:
> On Thu, Apr 11, 2024 at 05:15:22PM -0700, Pawan Gupta wrote:
> > > + * Do either a direct or an indirect call, depending on whether indirect calls
> > > + * are considered safe.
> > > + */
> > > +#define __do_syscall(table, func_direct, nr, regs)			\
> > > +({									\
> > > +	unsigned long __rax, __rdi, __rsi;				\
> > > +									\
> > > +	asm_inline volatile(						\
> > > +		ALTERNATIVE("call " __stringify(func_direct) "\n\t",	\
> > > +			    ANNOTATE_RETPOLINE_SAFE			\
> > > +			    "call *%[func_ptr]\n\t",			\
> > 
> > This will likely not insert the lfence before the indirect call in
> > spectre_v2=eibrs,lfence mode. As X86_FEATURE_INDIRECT_SAFE is not
> > cleared when eIBRS is enabled, this will not be converted to direct
> > call.
> 
> Hm, I think the problem here is that SPECTRE_V2_EIBRS_LFENCE confusingly
> sets X86_FEATURE_RETPOLINE.  So the following bit unintentionally takes

I think it is intentional, more on it below.

> effect:
> 
> 	/* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */
> 	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
> 		spec_ctrl_disable_kernel_rrsba();
> 		if (rrsba_disabled)
> 			return;
> 	}
> 
> If RRSBA gets disabled (which is likely), bhi_select_mitigation()
> returns early and X86_FEATURE_INDIRECT_SAFE doesn't get cleared.
> 
> "LFENCE; CALL" is most definitely not a retpoline, so it's weird for
> SPECTRE_V2_EIBRS_LFENCE to be setting X86_FEATURE_RETPOLINE.  We should
> fix that.

I could be completely wrong here, but my guess is, it is needed because
the thunk call inserted by the compiler with X86_FEATURE_RETPOLINE
provides room for adding the extra lfence.

In order to prefix lfence(3 bytes) indirect call is first converted to
call __x86_indirect_thunk_reg, which has a 5 byte opcode. At runtime,
thunk call is patched to "lfence;call *reg", which is also 3+2=5 bytes.

Thunk call is anyways needed because, there are indirect
calls opcodes that are 3 byte long e.g. call *%r8. So, wherever possible
lfence+call* is inlined, otherwise lfence is executed in a call to thunk,
which then does jmp *%reg.

> Honestly, I think SPECTRE_V2_EIBRS_LFENCE is obsolete anyway.  It was
> originally intended to be a BHI mitigation, but the real-world
> benchmarks I've seen are showing it to be quite a bit slower than the
> actual BHI mitigations.
> 
> Plus it's only a partial fix because the speculative window after the
> branch can still be big enough to do multiple loads.

Thats fair.
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Josh Poimboeuf 1 year, 10 months ago
On Thu, Apr 11, 2024 at 08:57:42PM -0700, Josh Poimboeuf wrote:
> For similar reasons I'm thinking we should also remove the non-eIBRS
> version (SPECTRE_V2_LFENCE).

Actually I guess that's still the default mitigation for AMD so I'll
leave that one in.

-- 
Josh
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Josh Poimboeuf 1 year, 10 months ago
On Thu, Apr 11, 2024 at 09:17:27PM -0700, Josh Poimboeuf wrote:
> On Thu, Apr 11, 2024 at 08:57:42PM -0700, Josh Poimboeuf wrote:
> > For similar reasons I'm thinking we should also remove the non-eIBRS
> > version (SPECTRE_V2_LFENCE).
> 
> Actually I guess that's still the default mitigation for AMD so I'll
> leave that one in.

Never mind, I forgot that got deprecated for AMD.

-- 
Josh
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Andrew Cooper 1 year, 10 months ago
On 12/04/2024 6:20 am, Josh Poimboeuf wrote:
> On Thu, Apr 11, 2024 at 09:17:27PM -0700, Josh Poimboeuf wrote:
>> On Thu, Apr 11, 2024 at 08:57:42PM -0700, Josh Poimboeuf wrote:
>>> For similar reasons I'm thinking we should also remove the non-eIBRS
>>> version (SPECTRE_V2_LFENCE).
>> Actually I guess that's still the default mitigation for AMD so I'll
>> leave that one in.
> Never mind, I forgot that got deprecated for AMD.

And then became necessary on two Atoms, although I can't for the life
of me find Intel's footnote about this in the maze of speculation docs...

~Andrew
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Josh Poimboeuf 1 year, 10 months ago
On Fri, Apr 12, 2024 at 11:36:04AM +0100, Andrew Cooper wrote:
> On 12/04/2024 6:20 am, Josh Poimboeuf wrote:
> > On Thu, Apr 11, 2024 at 09:17:27PM -0700, Josh Poimboeuf wrote:
> >> On Thu, Apr 11, 2024 at 08:57:42PM -0700, Josh Poimboeuf wrote:
> >>> For similar reasons I'm thinking we should also remove the non-eIBRS
> >>> version (SPECTRE_V2_LFENCE).
> >> Actually I guess that's still the default mitigation for AMD so I'll
> >> leave that one in.
> > Never mind, I forgot that got deprecated for AMD.
> 
> And then became necessary on two Atoms, although I can't for the life
> of me find Intel's footnote about this in the maze of speculation docs...

Found it on this page [1] but it doesn't seem to be a very confident
endorsement. And Linux doesn't seem to enable it for those parts
regardless.

  Intel® Atom Goldmont Plus and Tremont Mitigation

  Retpoline may not be a fully effective branch target injection
  mitigation on processors which are based on Intel Atom
  microarchitectures code named Goldmont Plus and Tremont, as documented
  in our existing guidance. On such processors, an LFENCE;JMP sequence may
  be an alternative for retpoline, although this is not architecturally
  guaranteed. Instructions may still be speculatively executed at the
  predicted near JMP target, which can allow some forms of shallow gadgets
  (for example, revealing register values) to be transiently executed.

  Intel is not currently evaluating LFENCE;JMP as an option other than for
  processors based on Goldmont Plus and Tremont microarchitectures, given
  the possibility of a sufficiently large transient window to execute a
  disclosure gadget.

https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/branch-history-injection.html

-- 
Josh
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Andrew Cooper 1 year, 10 months ago
On 11/04/2024 6:40 am, Josh Poimboeuf wrote:
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 6de50b80702e..80d432d2fe44 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -39,6 +39,28 @@
>  
>  #ifdef CONFIG_X86_64
>  
> +/*
> + * Do either a direct or an indirect call, depending on whether indirect calls
> + * are considered safe.
> + */
> +#define __do_syscall(table, func_direct, nr, regs)			\
> +({									\
> +	unsigned long __rax, __rdi, __rsi;				\
> +									\
> +	asm_inline volatile(						\
> +		ALTERNATIVE("call " __stringify(func_direct) "\n\t",	\
> +			    ANNOTATE_RETPOLINE_SAFE			\
> +			    "call *%[func_ptr]\n\t",			\

This wants to be a plain maybe-thunk'd indirect call, and without the
ANNOTATE_RETPOLINE_SAFE.

Or you're going to get into cases where some combinations of command
line options do unexpected things e.g. retpolining everything except the
syscall dispatch.

~Andrew
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Josh Poimboeuf 1 year, 10 months ago
On Thu, Apr 11, 2024 at 11:06:37AM +0100, Andrew Cooper wrote:
> > +#define __do_syscall(table, func_direct, nr, regs)			\
> > +({									\
> > +	unsigned long __rax, __rdi, __rsi;				\
> > +									\
> > +	asm_inline volatile(						\
> > +		ALTERNATIVE("call " __stringify(func_direct) "\n\t",	\
> > +			    ANNOTATE_RETPOLINE_SAFE			\
> > +			    "call *%[func_ptr]\n\t",			\
> 
> This wants to be a plain maybe-thunk'd indirect call, and without the
> ANNOTATE_RETPOLINE_SAFE.
> 
> Or you're going to get into cases where some combinations of command
> line options do unexpected things e.g. retpolining everything except the
> syscall dispatch.

In that case won't X86_FEATURE_INDIRECT_SAFE get cleared, resulting in
the above using a direct call?  Or did I miss something?

-- 
Josh
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Andrew Cooper 1 year, 10 months ago
On 11/04/2024 4:38 pm, Josh Poimboeuf wrote:
> On Thu, Apr 11, 2024 at 11:06:37AM +0100, Andrew Cooper wrote:
>>> +#define __do_syscall(table, func_direct, nr, regs)			\
>>> +({									\
>>> +	unsigned long __rax, __rdi, __rsi;				\
>>> +									\
>>> +	asm_inline volatile(						\
>>> +		ALTERNATIVE("call " __stringify(func_direct) "\n\t",	\
>>> +			    ANNOTATE_RETPOLINE_SAFE			\
>>> +			    "call *%[func_ptr]\n\t",			\
>> This wants to be a plain maybe-thunk'd indirect call, and without the
>> ANNOTATE_RETPOLINE_SAFE.
>>
>> Or you're going to get into cases where some combinations of command
>> line options do unexpected things e.g. retpolining everything except the
>> syscall dispatch.
> In that case won't X86_FEATURE_INDIRECT_SAFE get cleared, resulting in
> the above using a direct call?  Or did I miss something?

That works until the next time anyone touches the logic.  Then it's
latent vulnerability, or an incorrect trial of a non-default option.

I guarantee you'll save someone (probably someone on this CC list) a
headache in the future by not introducing an unnecessary special case here.

~Andrew
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Nikolay Borisov 1 year, 10 months ago

On 11.04.24 г. 8:40 ч., Josh Poimboeuf wrote:
> Syscall hardening (i.e., converting the syscall indirect branch to a
> series of direct branches) may cause performance regressions in certain
> scenarios.  Only use the syscall hardening when indirect branches are
> considered unsafe.
> 
> Fixes: 1e3ad78334a6 ("x86/syscall: Don't force use of indirect calls for system calls")
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>

Why fiddle with syscall mechanism if the bhb scrubbing sequence 
mitigates bhb? AFAIU (correct me if I'm wrong) the original idea was to 
use syscall hardening instead of the BHB sequence but since it 
became clear that's not sufficient bhb scrubbing completely subsumes the 
direct branch approach in the syscall handler?

<snip>
Re: [PATCH 5/7] x86/bugs: Only harden syscalls when needed
Posted by Josh Poimboeuf 1 year, 10 months ago
On Thu, Apr 11, 2024 at 09:20:17AM +0300, Nikolay Borisov wrote:
> On 11.04.24 г. 8:40 ч., Josh Poimboeuf wrote:
> > Syscall hardening (i.e., converting the syscall indirect branch to a
> > series of direct branches) may cause performance regressions in certain
> > scenarios.  Only use the syscall hardening when indirect branches are
> > considered unsafe.
> > 
> > Fixes: 1e3ad78334a6 ("x86/syscall: Don't force use of indirect calls for system calls")
> > Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> 
> Why fiddle with syscall mechanism if the bhb scrubbing sequence mitigates
> bhb? AFAIU (correct me if I'm wrong) the original idea was to use
> syscall hardening instead of the BHB sequence but since it became clear
> that's not sufficient bhb scrubbing completely subsumes the direct branch
> approach in the syscall handler?

I agree, but I think Linus wanted it for some reason.  I might not have
gotten the X86_FEATURE_INDIRECT_SAFE conditions right, maybe Linus can
clarify.

I'm going to experiment with having objtool find all indirect branches
reachable 66 branches from syscall entry.  If we converted all those to
direct branches then the SW loop wouldn't be needed.

But until then I don't see much point in the syscall direct branches.
We could just disable it completely until if/when it's really needed.

-- 
Josh