[PATCH RFC 1/2] x86/entry_64: Add a separate unmitigated entry/exit path

Pawan Gupta posted 2 patches 1 year, 4 months ago
[PATCH RFC 1/2] x86/entry_64: Add a separate unmitigated entry/exit path
Posted by Pawan Gupta 1 year, 4 months ago
CPU mitigations are deployed system-wide, but usually not all of
userspace is malicious. Yet, all of userspace pays the performance
cost of the mitigations. This all-or-nothing approach is due to the
lack of a way for the kernel to know which userspace can be trusted
and which cannot.

For scenarios where an admin can decide which processes to trust, an
interface to tell the kernel to possibly skip the mitigations would be
useful.

In preparation for the kernel being able to selectively apply
mitigations per-process, add a separate kernel entry/exit path that
skips the mitigations.
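
For illustration only, a follow-up could select the entry point when
switching to a task by rewriting MSR_LSTAR, roughly along the lines of
the sketch below. The helper name and the TIF_SKIP_MITIGATIONS flag are
made up here; the actual per-process interface is out of scope for this
patch.

  #include <linux/sched.h>
  #include <asm/msr.h>
  #include <asm/proto.h>

  /* Sketch only -- not part of this patch. */
  static void update_syscall_entry(struct task_struct *next)
  {
  	/* Default to the fully mitigated SYSCALL entry point. */
  	unsigned long entry = (unsigned long)entry_SYSCALL_64_mitigated;

  	/* Tasks marked as trusted take the path that skips mitigations. */
  	if (test_tsk_thread_flag(next, TIF_SKIP_MITIGATIONS))
  		entry = (unsigned long)entry_SYSCALL_64_unmitigated;

  	/* SYSCALL now dispatches to the chosen entry point. */
  	wrmsrl(MSR_LSTAR, entry);
  }

Because both entry paths are generated from the same macro, no
per-syscall conditional is needed at runtime; which path runs is
decided purely by the address SYSCALL dispatches to.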

Originally-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
---
 arch/x86/entry/entry_64.S     | 66 +++++++++++++++++++++++++++++++++++--------
 arch/x86/include/asm/proto.h  | 15 +++++++---
 arch/x86/include/asm/ptrace.h | 15 +++++++---
 arch/x86/kernel/cpu/common.c  |  2 +-
 4 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1b5be07f8669..eeaf4226d09c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -84,7 +84,7 @@
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
-SYM_CODE_START(entry_SYSCALL_64)
+.macro __entry_SYSCALL_64 mitigated=0
 	UNWIND_HINT_ENTRY
 	ENDBR
 
@@ -94,7 +94,12 @@ SYM_CODE_START(entry_SYSCALL_64)
 	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
 	movq	PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
 
-SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
+.if \mitigated
+SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack_mitigated, SYM_L_GLOBAL)
+.else
+SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack_unmitigated, SYM_L_GLOBAL)
+.endif
+
 	ANNOTATE_NOENDBR
 
 	/* Construct struct pt_regs on stack */
@@ -103,7 +108,11 @@ SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
 	pushq	%r11					/* pt_regs->flags */
 	pushq	$__USER_CS				/* pt_regs->cs */
 	pushq	%rcx					/* pt_regs->ip */
+
+.if \mitigated
 SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
+.endif
+
 	pushq	%rax					/* pt_regs->orig_ax */
 
 	PUSH_AND_CLEAR_REGS rax=$-ENOSYS
@@ -113,10 +122,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
 	/* Sign extend the lower 32bit as syscall numbers are treated as int */
 	movslq	%eax, %rsi
 
+.if \mitigated
 	/* clobbers %rax, make sure it is after saving the syscall nr */
 	IBRS_ENTER
 	UNTRAIN_RET
 	CLEAR_BRANCH_HISTORY
+.endif
 
 	call	do_syscall_64		/* returns with IRQs disabled */
 
@@ -127,15 +138,26 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
 	 * In the Xen PV case we must use iret anyway.
 	 */
 
-	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
-		"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+.if \mitigated
+	push %rax
+	IBRS_EXIT
+	CLEAR_CPU_BUFFERS
+	pop %rax
+.endif
+
+	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode_from_syscall", \
+		"jmp swapgs_restore_regs_and_return_to_usermode_from_syscall", X86_FEATURE_XENPV
 
 	/*
 	 * We win! This label is here just for ease of understanding
 	 * perf profiles. Nothing jumps here.
 	 */
-syscall_return_via_sysret:
-	IBRS_EXIT
+.if \mitigated
+syscall_return_via_sysret_mitigated:
+.else
+syscall_return_via_sysret_unmitigated:
+.endif
+
 	POP_REGS pop_rdi=0
 
 	/*
@@ -159,15 +181,36 @@ syscall_return_via_sysret:
 
 	popq	%rdi
 	popq	%rsp
-SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
+
+.if \mitigated
+SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack_mitigated, SYM_L_GLOBAL)
+.else
+SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack_unmitigated, SYM_L_GLOBAL)
+.endif
+
 	ANNOTATE_NOENDBR
 	swapgs
-	CLEAR_CPU_BUFFERS
+
+.if \mitigated
+SYM_INNER_LABEL(entry_SYSRETQ_end_mitigated, SYM_L_GLOBAL)
+.else
+SYM_INNER_LABEL(entry_SYSRETQ_end_unmitigated, SYM_L_GLOBAL)
+.endif
 	sysretq
-SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
+
+.endm /* __entry_SYSCALL_64 */
+
+SYM_CODE_START(entry_SYSCALL_64_unmitigated)
+	__entry_SYSCALL_64 mitigated=0
 	ANNOTATE_NOENDBR
 	int3
-SYM_CODE_END(entry_SYSCALL_64)
+SYM_CODE_END(entry_SYSCALL_64_unmitigated)
+
+SYM_CODE_START(entry_SYSCALL_64_mitigated)
+	__entry_SYSCALL_64 mitigated=1
+	ANNOTATE_NOENDBR
+	int3
+SYM_CODE_END(entry_SYSCALL_64_mitigated)
 
 /*
  * %rdi: prev task
@@ -559,6 +602,8 @@ __irqentry_text_end:
 SYM_CODE_START_LOCAL(common_interrupt_return)
 SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
 	IBRS_EXIT
+	CLEAR_CPU_BUFFERS
+SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode_from_syscall, SYM_L_GLOBAL)
 #ifdef CONFIG_XEN_PV
 	ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
 #endif
@@ -573,7 +618,6 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
 
 .Lswapgs_and_iret:
 	swapgs
-	CLEAR_CPU_BUFFERS
 	/* Assert that the IRET frame indicates user mode. */
 	testb	$3, 8(%rsp)
 	jnz	.Lnative_iret
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 484f4f0131a5..0936e0e70659 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -11,10 +11,17 @@ struct task_struct;
 void syscall_init(void);
 
 #ifdef CONFIG_X86_64
-void entry_SYSCALL_64(void);
-void entry_SYSCALL_64_safe_stack(void);
-void entry_SYSRETQ_unsafe_stack(void);
-void entry_SYSRETQ_end(void);
+
+void entry_SYSCALL_64_unmitigated(void);
+void entry_SYSCALL_64_safe_stack_unmitigated(void);
+void entry_SYSRETQ_unsafe_stack_unmitigated(void);
+void entry_SYSRETQ_end_unmitigated(void);
+
+void entry_SYSCALL_64_mitigated(void);
+void entry_SYSCALL_64_safe_stack_mitigated(void);
+void entry_SYSRETQ_unsafe_stack_mitigated(void);
+void entry_SYSRETQ_end_mitigated(void);
+
 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
 #endif
 
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5a83fbd9bc0b..74a13c76d241 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -261,11 +261,18 @@ static inline bool any_64bit_mode(struct pt_regs *regs)
 
 static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
 {
-	bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
-		    regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack);
+	bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64_unmitigated &&
+		    regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack_unmitigated);
+
+	ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack_unmitigated &&
+		      regs->ip <  (unsigned long)entry_SYSRETQ_end_unmitigated);
+
+	ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_64_mitigated &&
+		      regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack_mitigated);
+
+	ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack_mitigated &&
+		      regs->ip <  (unsigned long)entry_SYSRETQ_end_mitigated);
 
-	ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack &&
-		      regs->ip <  (unsigned long)entry_SYSRETQ_end);
 #ifdef CONFIG_IA32_EMULATION
 	ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
 		      regs->ip <  (unsigned long)entry_SYSCALL_compat_safe_stack);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d4e539d4e158..e72c37f3a437 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2026,7 +2026,7 @@ static void wrmsrl_cstar(unsigned long val)
 
 static inline void idt_syscall_init(void)
 {
-	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64_unmitigated);
 
 	if (ia32_enabled()) {
 		wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);

-- 
2.34.1
Re: [PATCH RFC 1/2] x86/entry_64: Add a separate unmitigated entry/exit path
Posted by Waiman Long 1 year, 4 months ago
On 9/19/24 17:52, Pawan Gupta wrote:
> CPU mitigations are deployed system-wide, but usually not all of
> userspace is malicious. Yet, all of userspace pays the performance
> cost of the mitigations. This all-or-nothing approach is due to the
> lack of a way for the kernel to know which userspace can be trusted
> and which cannot.
>
> For scenarios where an admin can decide which processes to trust, an
> interface to tell the kernel to possibly skip the mitigations would be
> useful.
>
> In preparation for the kernel being able to selectively apply
> mitigations per-process, add a separate kernel entry/exit path that
> skips the mitigations.
>
> Originally-by: Josh Poimboeuf <jpoimboe@kernel.org>
> Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>

For the current patch, not all x86 CPU vulnerability mitigations can be 
disabled. Maybe we should list the subset of mitigations that can be 
disabled.

Cheers,
Longman
Re: [PATCH RFC 1/2] x86/entry_64: Add a separate unmitigated entry/exit path
Posted by Pawan Gupta 1 year, 4 months ago
On Fri, Sep 20, 2024 at 02:57:34AM -0400, Waiman Long wrote:
> 
> On 9/19/24 17:52, Pawan Gupta wrote:
> > CPU mitigations are deployed system-wide, but usually not all of
> > userspace is malicious. Yet, all of userspace pays the performance
> > cost of the mitigations. This all-or-nothing approach is due to the
> > lack of a way for the kernel to know which userspace can be trusted
> > and which cannot.
> > 
> > For scenarios where an admin can decide which processes to trust, an
> > interface to tell the kernel to possibly skip the mitigations would be
> > useful.
> > 
> > In preparation for the kernel being able to selectively apply
> > mitigations per-process, add a separate kernel entry/exit path that
> > skips the mitigations.
> > 
> > Originally-by: Josh Poimboeuf <jpoimboe@kernel.org>
> > Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> 
> For the current patch, not all x86 CPU vulnerability mitigations can be
> disabled. Maybe we should list the subset of mitigations that can be
> disabled.

Yes, will update. The mitigations that can be bypassed are BHI, VERW,
Retbleed-IBRS, Retbleed-unret and IBPB.

Meltdown, Spectre-v1, eIBRS, GDS, SRBDS, retpoline and rethunk stay
enabled.