Mostly generated combinatorial stubs used to poison function
argument pointers.

Notably, since this targets eIBRS parts, which do not suffer from
retbleed, the stubs use normal return instructions to save some space.

In total: 6c1 + 6c2 + 6c3 + 6c4 + 1 = 6 + 15 + 20 + 15 + 1 = 57 stubs.

Note: Scott said only 0.6% of the kernel functions take 5 or more
pointer arguments; if any of those turns out to be performance
critical, we can add more stubs.

Note: the nested for loops are horrid and should be fixed.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/x86/include/asm/cfi.h | 11
arch/x86/kernel/alternative.c | 94 ++++++
arch/x86/lib/Makefile | 1
arch/x86/lib/bhi.S | 602 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 708 insertions(+)
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -101,6 +101,17 @@ enum cfi_mode {
extern enum cfi_mode cfi_mode;
+typedef u8 bhi_thunk_8[8];
+typedef u8 bhi_thunk_16[16];
+typedef u8 bhi_thunk_32[32];
+
+extern bhi_thunk_8 __bhi_args_6c1[];
+extern bhi_thunk_16 __bhi_args_6c2[];
+extern bhi_thunk_16 __bhi_args_6c3[];
+extern bhi_thunk_32 __bhi_args_6c4[];
+
+extern u8 __bhi_args_all[];
+
struct pt_regs;
#ifdef CONFIG_CFI_CLANG
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1039,10 +1039,104 @@ u32 cfi_get_func_hash(void *func)
return hash;
}
+
#endif
#ifdef CONFIG_FINEIBT
+static void *bhi_args_1(u8 args, void *addr)
+{
+ u8 bytes[5];
+
+ for (int i = 0; i < 6; i++) {
+ if (args != BIT(i))
+ continue;
+
+ bytes[0] = 0x2e;
+ memcpy(&bytes[1], &__bhi_args_6c1[i], 4);
+
+ text_poke_early(addr, bytes, 5);
+
+ return NULL;
+ }
+
+ return __bhi_args_all;
+}
+
+static void *bhi_args_2(u8 args, void *addr)
+{
+ int x = 0;
+
+ for (int i = 0; i < 6; i++) {
+ for (int j = i + 1; j < 6; j++) {
+ if (args == (BIT(i) | BIT(j)))
+ return &__bhi_args_6c2[x];
+ x++;
+ }
+ }
+
+ return __bhi_args_all;
+}
+
+static void *bhi_args_3(u8 args, void *addr)
+{
+ int x = 0;
+
+ for (int i = 0; i < 6; i++) {
+ for (int j = i + 1; j < 6; j++) {
+ for (int k = j + 1; k < 6; k++) {
+ if (args == (BIT(i) | BIT(j) | BIT(k)))
+ return &__bhi_args_6c3[x];
+ x++;
+ }
+ }
+ }
+
+ return __bhi_args_all;
+}
+
+static void *bhi_args_4(u8 args, void *addr)
+{
+ int x = 0;
+
+ for (int i = 0; i < 6; i++) {
+ for (int j = i + 1; j < 6; j++) {
+ for (int k = j + 1; k < 6; k++) {
+ for (int l = k + 1; l < 6; l++) {
+ if (args == (BIT(i) | BIT(j) | BIT(k) | BIT(l)))
+ return &__bhi_args_6c4[x];
+ x++;
+ }
+ }
+ }
+ }
+
+ return __bhi_args_all;
+}
+
+static void bhi_args(u8 args, void *addr)
+{
+ void *dest = __bhi_args_all;
+
+ if (WARN_ON_ONCE(!args))
+ return;
+
+ switch(hweight8(args)) {
+ case 1: if (bhi_args_1(args, addr) == dest)
+ break;
+ return;
+
+ case 2: dest = bhi_args_2(args, addr); break;
+ case 3: dest = bhi_args_3(args, addr); break;
+ case 4: dest = bhi_args_4(args, addr); break;
+
+ default:
+ break;
+ }
+
+ text_poke_early(addr, text_gen_insn(CALL_INSN_OPCODE, addr, dest), 5);
+}
+
static bool cfi_rand __ro_after_init = true;
static u32 cfi_seed __ro_after_init;
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -60,4 +60,5 @@ endif
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o copy_user_uncached_64.o
lib-y += cmpxchg16b_emu.o
+ lib-y += bhi.o
endif
--- /dev/null
+++ b/arch/x86/lib/bhi.S
@@ -0,0 +1,602 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/unwind_hints.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * At the function start, launder function arguments that are a pointer through
+ * CMOVcc, this will create a write dependency in the speculation flow.
+ *
+ * Notably, the CFI preambles calling these will have ZF set and r10 zero.
+ */
+
+.pushsection .noinstr.text, "ax"
+
+ .align 8
+SYM_CODE_START(__bhi_args_6c1)
+ ANNOTATE_NOENDBR
+ .align 8
+SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 8
+SYM_INNER_LABEL(__bhi_args_1, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 8
+SYM_INNER_LABEL(__bhi_args_2, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 8
+SYM_INNER_LABEL(__bhi_args_3, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rcx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 8
+SYM_INNER_LABEL(__bhi_args_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 8
+SYM_INNER_LABEL(__bhi_args_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+SYM_CODE_END(__bhi_args_6c1)
+
+
+ .align 16
+SYM_CODE_START(__bhi_args_6c2)
+ ANNOTATE_NOENDBR
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_1, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_2, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rdx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_3, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rcx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_1_2, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_1_3, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %rcx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_1_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_1_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_2_3, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_2_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_2_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdx
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_3_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_3_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rcx
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_4_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+SYM_CODE_END(__bhi_args_6c2)
+
+
+ .align 16
+SYM_CODE_START(__bhi_args_6c3)
+ ANNOTATE_NOENDBR
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_1_2, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_1_3, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rcx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_1_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_1_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_2_3, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_2_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rdx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_2_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rdx
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_3_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_3_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rcx
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_4_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_1_2_3, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_1_2_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_1_2_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_1_3_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_1_3_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %rcx
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_1_4_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_2_3_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_2_3_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_2_4_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 16
+SYM_INNER_LABEL(__bhi_args_3_4_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+SYM_CODE_END(__bhi_args_6c3)
+
+
+ .align 32
+SYM_CODE_START(__bhi_args_6c4)
+ ANNOTATE_NOENDBR
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0_1_2_3, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0_1_2_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0_1_2_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0_1_3_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0_1_3_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rcx
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0_1_4_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0_2_3_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0_2_3_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0_2_4_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rdx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0_3_4_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_1_2_3_4, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_1_2_3_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_1_2_4_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_1_3_4_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rsi
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_2_3_4_5, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+SYM_CODE_END(__bhi_args_6c4)
+
+SYM_CODE_START(__bhi_args_all)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ cmovne %r10, %rdx
+ cmovne %r10, %rcx
+ cmovne %r10, %r8
+ cmovne %r10, %r9
+ cmovne %r10, %rsp
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_CODE_END(__bhi_args_all)
+
+.popsection
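
For reference, a user-space sketch of one way the nested loops in
bhi_args_2/3/4() could collapse into a single ranking helper -- an
assumption about how the "horrid" loops noted above might be fixed, not
something the patch does. choose() and bhi_comb_rank() are made-up names,
and the check in main() assumes the same lexicographic ordering the
patch's loops walk:

#include <stdio.h>

/* C(n, k) for the tiny values needed here. */
static unsigned int choose(unsigned int n, unsigned int k)
{
	unsigned int r = 1;

	if (k > n)
		return 0;
	for (unsigned int i = 0; i < k; i++)
		r = r * (n - i) / (i + 1);
	return r;
}

/*
 * Lexicographic rank of the set bits of @args among all subsets of
 * {0..5} with the same popcount -- the same order the nested loops in
 * bhi_args_2/3/4() generate, so the rank could index directly into
 * __bhi_args_6c2/3/4[].
 */
static unsigned int bhi_comb_rank(unsigned char args)
{
	unsigned int rank = 0, k = __builtin_popcount(args), seen = 0;
	int prev = -1;

	for (int bit = 0; bit < 6; bit++) {
		if (!(args & (1u << bit)))
			continue;
		/* combinations skipped before picking 'bit' at this position */
		for (int j = prev + 1; j < bit; j++)
			rank += choose(6 - 1 - j, k - 1 - seen);
		prev = bit;
		seen++;
	}
	return rank;
}

int main(void)
{
	unsigned int x = 0;

	/* cross-check against the two-argument loop ordering above */
	for (int i = 0; i < 6; i++) {
		for (int j = i + 1; j < 6; j++) {
			if (bhi_comb_rank((1u << i) | (1u << j)) != x++)
				return 1;
		}
	}
	printf("ranks match the nested-loop ordering\n");
	return 0;
}
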
On Fri, Sep 27, 2024 at 09:49:09PM +0200, Peter Zijlstra wrote:
> +/*
> + * At the function start, launder function arguments that are a pointer through
> + * CMOVcc, this will create a write dependency in the speculation flow.
> + *
> + * Notably, the CFI preambles calling these will have ZF set and r10 zero.
> + */
> +
> +.pushsection .noinstr.text, "ax"
> +
> + .align 8
> +SYM_CODE_START(__bhi_args_6c1)
> + ANNOTATE_NOENDBR
> + .align 8
> +SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
> + UNWIND_HINT_FUNC
> + cmovne %r10, %rdi

IIUC, this works because if the "jz" in the CFI preamble mispredicts to
the __bhi_args_* code, "cmovne" will zero out the speculative value of
rdi.

Why use %r10 instead of a literal $0? Also how do you know %r10 is 0?

--
Josh
On 30/09/2024 10:30 pm, Josh Poimboeuf wrote:
> On Fri, Sep 27, 2024 at 09:49:09PM +0200, Peter Zijlstra wrote:
>> +/*
>> + * At the function start, launder function arguments that are a pointer through
>> + * CMOVcc, this will create a write dependency in the speculation flow.
>> + *
>> + * Notably, the CFI preambles calling these will have ZF set and r10 zero.
>> + */
>> +
>> +.pushsection .noinstr.text, "ax"
>> +
>> + .align 8
>> +SYM_CODE_START(__bhi_args_6c1)
>> + ANNOTATE_NOENDBR
>> + .align 8
>> +SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
>> + UNWIND_HINT_FUNC
>> + cmovne %r10, %rdi
> IIUC, this works because if the "jz" in the CFI preamble mispredicts to
> the __bhi_args_* code, "cmovne" will zero out the speculative value of
> rdi.
>
> Why use %r10 instead of a literal $0? Also how do you know %r10 is 0?

There's no encoding for CMOVcc which takes an $imm.

%r10 is guaranteed zero after the FineIBT prologue, but I don't see
anything in patch 11 which makes this true in the !FineIBT case.

~Andrew
On Mon, Sep 30, 2024 at 11:23:38PM +0100, Andrew Cooper wrote:
> On 30/09/2024 10:30 pm, Josh Poimboeuf wrote:
> > On Fri, Sep 27, 2024 at 09:49:09PM +0200, Peter Zijlstra wrote:
> >> +SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
> >> + UNWIND_HINT_FUNC
> >> + cmovne %r10, %rdi
> > IIUC, this works because if the "jz" in the CFI preamble mispredicts to
> > the __bhi_args_* code, "cmovne" will zero out the speculative value of
> > rdi.
> >
> > Why use %r10 instead of a literal $0? Also how do you know %r10 is 0?
>
> There's no encoding for CMOVcc which takes an $imm.

Ah.

> %r10 is guaranteed zero after the FineIBT prologue

If the "jz" in the FineIBT prologue mispredicts, isn't %r10 non-zero by
definition?

> , but I don't see
> anything in patch 11 which makes this true in the !FineIBT case.

I thought this code is only used by FineIBT?

--
Josh
On Mon, Sep 30, 2024 at 03:38:48PM -0700, Josh Poimboeuf wrote:
> On Mon, Sep 30, 2024 at 11:23:38PM +0100, Andrew Cooper wrote:
> > On 30/09/2024 10:30 pm, Josh Poimboeuf wrote:
> > > On Fri, Sep 27, 2024 at 09:49:09PM +0200, Peter Zijlstra wrote:
> > >> +SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
> > >> + UNWIND_HINT_FUNC
> > >> + cmovne %r10, %rdi
> > > IIUC, this works because if the "jz" in the CFI preamble mispredicts to
> > > the __bhi_args_* code, "cmovne" will zero out the speculative value of
> > > rdi.
> > >
> > > Why use %r10 instead of a literal $0? Also how do you know %r10 is 0?
> >
> > There's no encoding for CMOVcc which takes an $imm.
>
> Ah.
>
> > %r10 is guaranteed zero after the FineIBT prologue
>
> If the "jz" in the FineIBT prologue mispredicts, isn't %r10 non-zero by
> definition?

Since I just wrote the comment...

* FineIBT-BHI:
*
* __cfi_foo:
* endbr64
* subl 0x12345678, %r10d
* jz foo-1
* ud2
* foo-1:
* call __bhi_args_XXX
* foo+4:
* ... code here ...
* ret
*
* direct caller:
* call foo+4
*
* indirect caller:
* lea foo(%rip), %r11
* ...
* movl $0x12345678, %r10d
* subl $16, %r11
* nop4
* call *%r11

And lets take a random bhi function:

+ .align 16
+SYM_INNER_LABEL(__bhi_args_0_1, SYM_L_LOCAL)
+ UNWIND_HINT_FUNC
+ cmovne %r10, %rdi
+ cmovne %r10, %rsi
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3

So the case you worry about is SUBL does *not* result in 0, but we
speculate JZ true and end up in CALL, and do CMOVne.

Since we speculated Z, we must then also not do the CMOV, so the value
of R10 is irrelevant, it will not be used. The thing however is that
CMOV will unconditionally put a store dependency on the target register
(RDI, RSI in the above sequence) and as such any further speculative
code trying to use those registers will stall.

> > , but I don't see
> > anything in patch 11 which makes this true in the !FineIBT case.
>
> I thought this code is only used by FineIBT?

Right, so I do have me a patch that adds it to regular KCFI as well, but
I dropped it for now, since I don't have a strong rationale for it and
it requires yet more compiler tinkering.
On 01/10/2024 12:03 pm, Peter Zijlstra wrote:
> On Mon, Sep 30, 2024 at 03:38:48PM -0700, Josh Poimboeuf wrote:
>> On Mon, Sep 30, 2024 at 11:23:38PM +0100, Andrew Cooper wrote:
>>> On 30/09/2024 10:30 pm, Josh Poimboeuf wrote:
>>>> On Fri, Sep 27, 2024 at 09:49:09PM +0200, Peter Zijlstra wrote:
>>>>> +SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
>>>>> + UNWIND_HINT_FUNC
>>>>> + cmovne %r10, %rdi
>>>> IIUC, this works because if the "jz" in the CFI preamble mispredicts to
>>>> the __bhi_args_* code, "cmovne" will zero out the speculative value of
>>>> rdi.
>>>>
>>>> Why use %r10 instead of a literal $0? Also how do you know %r10 is 0?
>>> There's no encoding for CMOVcc which takes an $imm.
>> Ah.
>>
>>> %r10 is guaranteed zero after the FineIBT prologue
>> If the "jz" in the FineIBT prologue mispredicts, isn't %r10 non-zero by
>> definition?
> Since I just wrote the comment...
>
> * FineIBT-BHI:
> *
> * __cfi_foo:
> * endbr64
> * subl 0x12345678, %r10d
> * jz foo-1
> * ud2
> * foo-1:
> * call __bhi_args_XXX
> * foo+4:
> * ... code here ...
> * ret
> *
> * direct caller:
> * call foo+4
> *
> * indirect caller:
> * lea foo(%rip), %r11
> * ...
> * movl $0x12345678, %r10d
> * subl $16, %r11

A compiler would normally spell this:

lea foo-16(%rip), %r11

> * nop4
> * call *%r11
>
> And lets take a random bhi function:
>
> + .align 16
> +SYM_INNER_LABEL(__bhi_args_0_1, SYM_L_LOCAL)
> + UNWIND_HINT_FUNC
> + cmovne %r10, %rdi
> + cmovne %r10, %rsi
> + ANNOTATE_UNRET_SAFE
> + ret
> + int3
>
> So the case you worry about is SUBL does *not* result in 0, but we
> speculate JZ true and end up in CALL, and do CMOVne.
>
> Since we speculated Z, we must then also not do the CMOV, so the value
> of R10 is irrelevant, it will not be used. The thing however is that
> CMOV will unconditionally put a store dependency on the target register
> (RDI, RSI in the above sequence) and as such any further speculative
> code trying to use those registers will stall.

How does that help?

The write dependency doesn't stop a dependent load from executing in the
shadow of a mispredicted branch.

This is why SLH forces the pointer to 0 on the bad speculation path.

~Andrew
On Tue, Oct 01, 2024 at 12:20:02PM +0100, Andrew Cooper wrote:
> On 01/10/2024 12:03 pm, Peter Zijlstra wrote:
> > On Mon, Sep 30, 2024 at 03:38:48PM -0700, Josh Poimboeuf wrote:
> >> On Mon, Sep 30, 2024 at 11:23:38PM +0100, Andrew Cooper wrote:
> >>> On 30/09/2024 10:30 pm, Josh Poimboeuf wrote:
> >>>> On Fri, Sep 27, 2024 at 09:49:09PM +0200, Peter Zijlstra wrote:
> >>>>> +SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
> >>>>> + UNWIND_HINT_FUNC
> >>>>> + cmovne %r10, %rdi
> >>>> IIUC, this works because if the "jz" in the CFI preamble mispredicts to
> >>>> the __bhi_args_* code, "cmovne" will zero out the speculative value of
> >>>> rdi.
> >>>>
> >>>> Why use %r10 instead of a literal $0? Also how do you know %r10 is 0?
> >>> There's no encoding for CMOVcc which takes an $imm.
> >> Ah.
> >>
> >>> %r10 is guaranteed zero after the FineIBT prologue
> >> If the "jz" in the FineIBT prologue mispredicts, isn't %r10 non-zero by
> >> definition?
> > Since I just wrote the comment...
> >
> > * FineIBT-BHI:
> > *
> > * __cfi_foo:
> > * endbr64
> > * subl 0x12345678, %r10d
> > * jz foo-1
> > * ud2
> > * foo-1:
> > * call __bhi_args_XXX
> > * foo+4:
> > * ... code here ...
> > * ret
> > *
> > * direct caller:
> > * call foo+4
> > *
> > * indirect caller:
> > * lea foo(%rip), %r11
> > * ...
> > * movl $0x12345678, %r10d
> > * subl $16, %r11
>
> A compiler would normally spell this:
>
> lea foo-16(%rip), %r11

Yeah, but the original lea is outside of the code we control, the kcfi
caller section we're rewriting gets to have %r11 as is.

> > * nop4
> > * call *%r11
> >
> > And lets take a random bhi function:
> >
> > + .align 16
> > +SYM_INNER_LABEL(__bhi_args_0_1, SYM_L_LOCAL)
> > + UNWIND_HINT_FUNC
> > + cmovne %r10, %rdi
> > + cmovne %r10, %rsi
> > + ANNOTATE_UNRET_SAFE
> > + ret
> > + int3
> >
> > So the case you worry about is SUBL does *not* result in 0, but we
> > speculate JZ true and end up in CALL, and do CMOVne.
> >
> > Since we speculated Z, we must then also not do the CMOV, so the value
> > of R10 is irrelevant, it will not be used. The thing however is that
> > CMOV will unconditionally put a store dependency on the target register
> > (RDI, RSI in the above sequence) and as such any further speculative
> > code trying to use those registers will stall.
>
> How does that help?
>
> The write dependency doesn't stop a dependent load from executing in the
> shadow of a mispredicted branch.

I've been given to understand CMOVcc will kill any further speculation
using the target register. So by 'poisoning' all argument registers that
are involved with loads, we avoid any such load from happening during
speculation.
On 03/10/2024 1:17 pm, Peter Zijlstra wrote:
> On Tue, Oct 01, 2024 at 12:20:02PM +0100, Andrew Cooper wrote:
>> On 01/10/2024 12:03 pm, Peter Zijlstra wrote:
>>> * nop4
>>> * call *%r11
>>>
>>> And lets take a random bhi function:
>>>
>>> + .align 16
>>> +SYM_INNER_LABEL(__bhi_args_0_1, SYM_L_LOCAL)
>>> + UNWIND_HINT_FUNC
>>> + cmovne %r10, %rdi
>>> + cmovne %r10, %rsi
>>> + ANNOTATE_UNRET_SAFE
>>> + ret
>>> + int3
>>>
>>> So the case you worry about is SUBL does *not* result in 0, but we
>>> speculate JZ true and end up in CALL, and do CMOVne.
>>>
>>> Since we speculated Z, we must then also not do the CMOV, so the value
>>> of R10 is irrelevant, it will not be used. The thing however is that
>>> CMOV will unconditionally put a store dependency on the target register
>>> (RDI, RSI in the above sequence) and as such any further speculative
>>> code trying to use those registers will stall.
>> How does that help?
>>
>> The write dependency doesn't stop a dependent load from executing in the
>> shadow of a mispredicted branch.
> I've been given to understand CMOVcc will kill any further speculation
> using the target register. So by 'poisoning' all argument registers that
> are involved with loads, we avoid any such load from happening during
> speculation.

IANAPA (I am not a pipeline architect), but AIUI, CMOVcc establishes a
data dependency between flags and the destination register that doesn't
exist in the pipeline if you'd used a conditional branch instead.

It does prevent a dependent load from executing before the CMOVcc has
executed. But it does not stop that load from executing speculatively
eventually.

So, given the following case:
* SUB is/will results nonzero (ZF=0, %r10=nonzero)
* JZ predicted taken, despite (ZF=0)
we call __bhi_args_XXX wherein:
* CMOVNZ blocks until SUB executes (flags dependency)
* CMOVNZ eventually executes, and because ZF=0, it really does write
%r10 over the target registers
and then we enter the function with all pointers containing the nonzero
residual from the hash check.

Now, because it's a SUBL, the result is < 2^32, a straight dereference
of one of these pointers will be blocked by SMAP (noone cares about
32bit, or pre-SMAP hardware, right?)

Forward references from the pointers will be safe (assuming SIB doesn't
reach the canonical boundary), but backward references may wrap around
back into the kernel space. These will not be blocked by SMAP and will
spill their secrets if suitably provoked.

~Andrew
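
For illustration only, with made-up hash values: a tiny user-space
sketch of the wrap-around Andrew describes. The residual that SUBL
leaves in %r10d is zero-extended and thus below 2^32, so a small forward
offset stays in the low, SMAP-covered half of the address space, while a
large negative offset wraps the 64-bit address computation back into the
kernel half:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Illustrative hashes only; real FineIBT hashes are arbitrary 32-bit values. */
	uint32_t caller_hash = 0x12345678, target_hash = 0x9abcdef0;

	/* What SUBL leaves in %r10d (zero-extended into %r10) on the bad path. */
	uint64_t poison = (uint32_t)(caller_hash - target_hash);

	/* Forward access: still < 2^32, i.e. the user half, where SMAP applies. */
	uint64_t forward = poison + 0x100;

	/* Backward access: a large negative offset wraps into the kernel half. */
	uint64_t backward = poison - 0x200000000ULL;

	printf("poison   = %#llx\n", (unsigned long long)poison);
	printf("forward  = %#llx\n", (unsigned long long)forward);
	printf("backward = %#llx\n", (unsigned long long)backward);
	return 0;
}
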
Hello Andrew,

Your observation is valid. If we assume that the hashing function used by FineIBT is uniformly distributed, then the distribution of hashes at the call site and at the call target is [0,2^32-1]. The difference of the two hashes computed in R10 will have the same distribution because of wrap-around, and the mean of this distribution is 2^31-1. Therefore, to reasonably bypass the proposed mitigation, I believe an attacker would need the hardened pointer to be added/subtracted to/from an attacker-controlled 64-bit value, or an attacker-controlled 32-bit value scaled by 2, 4, or 8. Therefore, I think it would be reasonable to additionally apply the CMOV hardening to any 32-/64-bit integral parameters, including enums. I scanned the kernel (Ubuntu noble 6.8 config) and found that 77% of parameters to indirect call targets are pointers (which we already harden) and less than 20% are 32-/64-bit integrals and enums.

I think that this proposal would also address some other potential corner cases, such as:
- an attacker-controlled 32-/64-bit integral parameter is used to index into a fixed-address array
- an attacker-controlled 64-bit integral parameter is cast into a pointer

Does this proposal address your concern?

Thanks and regards,

Scott Constable

>On 03/10/2024 1:17 pm, Peter Zijlstra wrote:
>> On Tue, Oct 01, 2024 at 12:20:02PM +0100, Andrew Cooper wrote:
>>> On 01/10/2024 12:03 pm, Peter Zijlstra wrote:
>>>> * nop4
>>>> * call *%r11
>>>>
>>>> And lets take a random bhi function:
>>>>
>>>> + .align 16
>>>> +SYM_INNER_LABEL(__bhi_args_0_1, SYM_L_LOCAL)
>>>> + UNWIND_HINT_FUNC
>>>> + cmovne %r10, %rdi
>>>> + cmovne %r10, %rsi
>>>> + ANNOTATE_UNRET_SAFE
>>>> + ret
>>>> + int3
>>>>
>>>> So the case you worry about is SUBL does *not* result in 0, but we
>>>> speculate JZ true and end up in CALL, and do CMOVne.
>>>>
>>>> Since we speculated Z, we must then also not do the CMOV, so the
>>>> value of R10 is irrelevant, it will not be used. The thing however
>>>> is that CMOV will unconditionally put a store dependency on the
>>>> target register (RDI, RSI in the above sequence) and as such any
>>>> further speculative code trying to use those registers will stall.
>>> How does that help?
>>>
>>> The write dependency doesn't stop a dependent load from executing in
>>> the shadow of a mispredicted branch.
>> I've been given to understand CMOVcc will kill any further speculation
>> using the target register. So by 'poisoning' all argument registers
>> that are involved with loads, we avoid any such load from happening
>> during speculation.
> IANAPA (I am not a pipeline architect), but AIUI,
> CMOVcc establishes a data dependency between flags and the destination register that doesn't exist in the pipeline if you'd used a conditional branch instead.
> It does prevent a dependent load from executing before the CMOVcc has executed. But it does not stop that load from executing speculatively eventually.
> So, given the following case:
> * SUB is/will results nonzero (ZF=0, %r10=nonzero)
> * JZ predicted taken, despite (ZF=0)
> we call __bhi_args_XXX wherein:
> * CMOVNZ blocks until SUB executes (flags dependency)
> * CMOVNZ eventually executes, and because ZF=0, it really does write
> %r10 over the target registers
> and then we enter the function with all pointers containing the nonzero residual from the hash check.
> Now, because it's a SUBL, the result is < 2^32, a straight dereference of one of these pointers will be blocked by SMAP (noone cares about 32bit, or pre-SMAP hardware, right?)
> Forward references from the pointers will be safe (assuming SIB doesn't reach the canonical boundary), but backward references may wrap around back into the kernel space. These will not be blocked by SMAP and will spill their secrets if suitably provoked.
> ~Andrew
On 14/10/2024 6:50 pm, Constable, Scott D wrote:
> Hello Andrew,
>
> Your observation is valid. If we assume that the hashing function used by FineIBT is uniformly distributed, then the distribution of hashes at the call site and at the call target is [0,2^32-1]. The difference of the two hashes computed in R10 will have the same distribution because of wrap-around, and the mean of this distribution is 2^31-1. Therefore, to reasonably bypass the proposed mitigation, I believe an attacker would need the hardened pointer to be added/subtracted to/from an attacker-controlled 64-bit value, or an attacker-controlled 32-bit value scaled by 2, 4, or 8. Therefore, I think it would be reasonable to additionally apply the CMOV hardening to any 32-/64-bit integral parameters, including enums. I scanned the kernel (Ubuntu noble 6.8 config) and found that 77% of parameters to indirect call targets are pointers (which we already harden) and less than 20% are 32-/64-bit integrals and enums.
>
> I think that this proposal would also address some other potential corner cases, such as:
> - an attacker-controlled 32-/64-bit integral parameter is used to index into a fixed-address array
> - an attacker-controlled 64-bit integral parameter is cast into a pointer
>
> Does this proposal address your concern?

Hello,

Thankyou for the analysis, and I'm glad I'm not just clutching at straws.

However, I'm not sure if extending this to other cases works very well.
While the second point is probably easy for the compiler to figure out,
the former is looking a little bit more like a halting problem.

One key aspect is "how far can speculation continue beyond a
mispredicted Jcc", but it occurs to me since the last email that there
is no answer that Intel will give here. It is uarch dependent and
expected to increase on future parts, so safety wise we must assume
infinite.

And infinite is no good, so we must reason about "good enough".

My gut feeling is that blindly using the residual from the hash check
isn't good enough. 7 years of speculation fixes have shown that the
researchers are constantly proving "this will be good enough" wrong.

So, instead of simply using the residual, why don't we explicitly set
%r10 to a known value?

Because we need to preserve flags from original hash check, we can't
use any of the simple zeroing idioms, but we could use MOV $0, %r10
before the CMOVs targetting the pointer parameters.

But, if we're using a long-ish encoding anyway, why not MOV $GB(2)-1,
%r10 ?

This way, in the bad speculation path we'll set all pointers to 2G,
which removes most of the risk with backwards references, and makes the
behaviour invariant of the hash residual (which itself reduces the
opportunities to leak the hash value).

So I suppose the real question is whether one extra MOV is acceptable,
and is it good enough? My gut feeling is yes to both.

To the extra cases, they can of course be added if the compiler support
isn't too horrible, independently of the extra MOV. But, if 77% of
parameters to indirect functions are pointers anyway, isn't it worth
considering CMOV-ing all parameter registers and turning the 57 stubs
into just 6, and improve I$ locality?

~Andrew
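
A sketch, not something proposed verbatim in the thread: what the
two-argument stub might look like with Andrew's extra MOV folded in. It
assumes %r10 is dead after the FineIBT preamble (it only carries the
hash and is caller-saved) and relies on the fact that a MOV of an
immediate does not modify the flags the CMOVs test; the 0x7fffffff
spelling of GB(2)-1 is illustrative.

	.align 16
SYM_INNER_LABEL(__bhi_args_0_1, SYM_L_LOCAL)
	UNWIND_HINT_FUNC
	mov $0x7fffffff, %r10	/* GB(2)-1; does not clobber flags */
	cmovne %r10, %rdi
	cmovne %r10, %rsi
	ANNOTATE_UNRET_SAFE
	ret
	int3
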
Hello Andrew,

My understanding of the FineIBT approach is that the hash values are not intended to be secret, and therefore leaking these hash values would not expose a new risk. Joao is also on this thread and knows a lot more about FineIBT than I do, so he can chime in if my reasoning is unsound.

Our intent in using the hash check residual as the poison is to overwrite a potentially attacker-controllable value with a fixed value that the attacker cannot control. But the points you raise have made us revisit this assumption. The residual from the hash check does give an attacker a small degree of control, because the attacker may be able to trigger a speculative mis-prediction from several different indirect call sites to a given indirect call target. If those call sites have different hash values, then the residual will differ accordingly. But this does not amount to arbitrary control over the residual. For example, if the attacker wants to poison the target's registers with a specific value (e.g., 0xdeadbeef) that will cause an out-of-bounds read to a desired location in kernel memory (e.g., fixed_addr+0xdeadbeef), then the attacker will need to identify an indirect call site with a 32-bit hash H1 such that the target's hash H2 satisfies H1-H2=0xdeadbeef. This scenario does not seem to be more hazardous than other existing Spectre v1 gadgets that likely exist throughout the kernel, which may leak some data at some specific addresses but without giving an attacker arbitrary control to leak gobs of kernel memory.

We do agree with your suggestion that all of the arguments should be poisoned, not just the pointers. For example, there might be some indirect call targets that look like:

void foo(unsigned short i)
{
	unsigned long x = fixed_address_u64_array[i];
	// code that leaks x
}

In a previous email I had suggested that it might not be necessary to poison some arguments, including 16-bit values. Then code such as this (if it exists) could expose up to 512K of kernel memory above fixed_address_u64_array. So your concern about poisoning a proper subset of the arguments is valid, and we appreciate that your suggestion would also simplify the implementation by requiring fewer stubs.

Regards,

Scott Constable

> Hello,
>
> Thankyou for the analysis, and I'm glad I'm not just clutching at straws.
>
> However, I'm not sure if extending this to other cases works very well. While the second point is probably easy for the compiler to figure out, the former is looking a little bit more like a halting problem.
>
> One key aspect is "how far can speculation continue beyond a mispredicted Jcc", but it occurs to me since the last email that there is no answer that Intel will give here. It is uarch dependent and expected to increase on future parts, so safety wise we must assume infinite.
>
> And infinite is no good, so we must reason about "good enough".
>
> My gut feeling is that blindly using the residual from the hash check isn't good enough. 7 years of speculation fixes have shown that the researchers are constantly proving "this will be good enough" wrong.
>
> So, instead of simply using the residual, why don't we explicitly set
> %r10 to a known value?
>
> Because we need to preserve flags from original hash check, we can't use any of the simple zeroing idioms, but we could use MOV $0, %r10 before the CMOVs targetting the pointer parameters.
>
> But, if we're using a long-ish encoding anyway, why not MOV $GB(2)-1, %r10 ?
>
> This way, in the bad speculation path we'll set all pointers to 2G, which removes most of the risk with backwards references, and makes the behaviour invariant of the hash residual (which itself reduces the opportunities to leak the hash value).
>
> So I suppose the real question is whether one extra MOV is acceptable, and is it good enough? My gut feeling is yes to both.
>
>
> To the extra cases, they can of course be added if the compiler support isn't too horrible, independently of the extra MOV. But, if 77% of parameters to indirect functions are pointers anyway, isn't it worth considering CMOV-ing all parameter registers and turning the 57 stubs into just 6, and improve I$ locality?
>
> ~Andrew
On Mon, Oct 21, 2024 at 8:06 AM Constable, Scott D
<scott.d.constable@intel.com> wrote:
>
> Hello Andrew,
>
> My understanding of the FineIBT approach is that the hash values are not intended to be secret, and therefore leaking these hash values would not expose a new risk. Joao is also on this thread and knows a lot more about FineIBT than I do, so he can chime in if my reasoning is unsound.

Technically the hashes are not a secret indeed. With that said, I
think it was Kees who submitted a patch that randomizes the hash
values at boot time, as a security in depth / probabilistic measure
against an attacker being able to generate executable valid targets.

Tks,
Joao
On 30/09/2024 11:38 pm, Josh Poimboeuf wrote:
> On Mon, Sep 30, 2024 at 11:23:38PM +0100, Andrew Cooper wrote:
>> On 30/09/2024 10:30 pm, Josh Poimboeuf wrote:
>>> On Fri, Sep 27, 2024 at 09:49:09PM +0200, Peter Zijlstra wrote:
>>>> +SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
>>>> + UNWIND_HINT_FUNC
>>>> + cmovne %r10, %rdi
>>> IIUC, this works because if the "jz" in the CFI preamble mispredicts to
>>> the __bhi_args_* code, "cmovne" will zero out the speculative value of
>>> rdi.
>>>
>>> Why use %r10 instead of a literal $0? Also how do you know %r10 is 0?
>> There's no encoding for CMOVcc which takes an $imm.
> Ah.
>
>> %r10 is guaranteed zero after the FineIBT prologue
> If the "jz" in the FineIBT prologue mispredicts, isn't %r10 non-zero by
> definition?

FineIBT uses SUB (and not CMP) to destroy the hash in %r10.

This makes it marginally harder to leak an unknown hash; you can't
trivially dereference it, but there is a linear function if you know
the hash of the caller side.

In the bad speculation path, you're still overwriting pointers with a
number that is < 2^32, so will be stalled by SMAP if dereferenced.

>
>> , but I don't see
>> anything in patch 11 which makes this true in the !FineIBT case.
> I thought this code is only used by FineIBT?

Erm pass. Peter?

~Andrew
On Mon, Sep 30, 2024 at 02:30:32PM -0700, Josh Poimboeuf wrote:
> On Fri, Sep 27, 2024 at 09:49:09PM +0200, Peter Zijlstra wrote:
> > +/*
> > + * At the function start, launder function arguments that are a pointer through
> > + * CMOVcc, this will create a write dependency in the speculation flow.
> > + *
> > + * Notably, the CFI preambles calling these will have ZF set and r10 zero.
> > + */
> > +
> > +.pushsection .noinstr.text, "ax"
> > +
> > + .align 8
> > +SYM_CODE_START(__bhi_args_6c1)
> > + ANNOTATE_NOENDBR
> > + .align 8
> > +SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
> > + UNWIND_HINT_FUNC
> > + cmovne %r10, %rdi
>
> IIUC, this works because if the "jz" in the CFI preamble mispredicts to
> the __bhi_args_* code, "cmovne" will zero out the speculative value of
> rdi.
>
> Why use %r10 instead of a literal $0? Also how do you know %r10 is 0?

BTW, this "speculative pointer clearing" feature is broader than just
BHI so I wouldn't call it that. It's really just a logical extension of
FineIBT IMO.

--
Josh
On Fri, Sep 27, 2024 at 09:49:09PM +0200, Peter Zijlstra wrote:
> +static void *bhi_args_1(u8 args, void *addr)
> +{
> + u8 bytes[5];
> +
> + for (int i = 0; i < 6; i++) {
> + if (args != BIT(i))
> + continue;
> +
> + bytes[0] = 0x2e;
> + memcpy(&bytes[1], &__bhi_args_6c1[i], 4);
> +
> + text_poke_early(addr, bytes, 5);
> +
> + return NULL;

I assume there's some good reason this doesn't return a pointer to the
code like the others?

--
Josh
On Fri, Sep 27, 2024 at 06:37:36PM -0700, Josh Poimboeuf wrote:
> On Fri, Sep 27, 2024 at 09:49:09PM +0200, Peter Zijlstra wrote:
> > +static void *bhi_args_1(u8 args, void *addr)
> > +{
> > + u8 bytes[5];
> > +
> > + for (int i = 0; i < 6; i++) {
> > + if (args != BIT(i))
> > + continue;
> > +
> > + bytes[0] = 0x2e;
> > + memcpy(&bytes[1], &__bhi_args_6c1[i], 4);
> > +
> > + text_poke_early(addr, bytes, 5);
> > +
> > + return NULL;
>
> I assume there's some good reason this doesn't return a pointer to the
> code like the others?

The 1 bit case is different in that it does in-place CMOVcc while the
others do a CALL to an external stub.

Not saying this is the best way to do it, but it's what I ended up with
back then.
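
An aside on why the in-place variant fits, assuming the standard
instruction sizes: a register-to-register CMOVcc encodes in 4 bytes, so
the single 0x2e prefix (a cs segment override, which has no effect on
the instruction here) pads it to the same 5 bytes as the call emitted
for the multi-argument cases; that is what the bytes[0] = 0x2e plus the
4-byte memcpy in bhi_args_1() builds. For example, for argument 0:

	2e 49 0f 45 fa	cs cmovne %r10, %rdi	/* 5 bytes, poked in place */
	e8 xx xx xx xx	call __bhi_args_...	/* 5 bytes, the stub-call cases */

The displacement bytes of the call are shown as xx because they depend
on the poke address.
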