arch/x86/include/asm/bug.h | 9 +- arch/x86/include/asm/cfi.h | 4 +- arch/x86/kernel/alternative.c | 193 ++++++++++++++++++++++++------------------ arch/x86/kernel/cet.c | 2 +- arch/x86/kernel/traps.c | 8 +- arch/x86/lib/bhi.S | 58 ++++++------- arch/x86/lib/retpoline.S | 4 +- arch/x86/net/bpf_jit_comp.c | 7 +- 8 files changed, 158 insertions(+), 127 deletions(-)
Hi!
A while ago FineIBT started using the instruction 0xEA to generate #UD.
All existing parts will generate #UD in 64bit mode on that instruction.
However; Intel/AMD have not blessed using this instruction, it is on
their 'reserved' list for future use.
Peter Anvin worked the committees and got use of 0xD6 blessed, and it
will be called UDB (per the next SDM or so).
Reworking the FineIBT code to use UDB wasn't entirely trivial, and I've
had to switch the hash register to EAX in order to free up some bytes.
Per the x86_64 ABI, EAX is used to pass the number of vector registers
for varargs -- something that should not happen in the kernel. Moreover,
we build with -mskip-rax-setup, which should leave EAX completely unused
in the calling convention.
The code boots and passes the LKDTM CFI_FORWARD_PROTO test for various
combinations (non-exhaustive so far).
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/x86/include/asm/bug.h | 9 +-
arch/x86/include/asm/cfi.h | 4 +-
arch/x86/kernel/alternative.c | 193 ++++++++++++++++++++++++------------------
arch/x86/kernel/cet.c | 2 +-
arch/x86/kernel/traps.c | 8 +-
arch/x86/lib/bhi.S | 58 ++++++-------
arch/x86/lib/retpoline.S | 4 +-
arch/x86/net/bpf_jit_comp.c | 7 +-
8 files changed, 158 insertions(+), 127 deletions(-)
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index e427a3f7b751..70711183a020 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -5,14 +5,19 @@
#include <linux/stringify.h>
#include <linux/instrumentation.h>
#include <linux/objtool.h>
+#include <asm/asm.h>
/*
* Despite that some emulators terminate on UD2, we use it for WARN().
*/
-#define ASM_UD2 ".byte 0x0f, 0x0b"
+#define ASM_UD2 _ASM_BYTES(0x0f, 0x0b)
#define INSN_UD2 0x0b0f
#define LEN_UD2 2
+#define ASM_UDB _ASM_BYTES(0xd6)
+#define INSN_UDB 0xd6
+#define LEN_UDB 1
+
/*
* In clang we have UD1s reporting UBSAN failures on X86, 64 and 32bit.
*/
@@ -26,7 +31,7 @@
#define BUG_UD2 0xfffe
#define BUG_UD1 0xfffd
#define BUG_UD1_UBSAN 0xfffc
-#define BUG_EA 0xffea
+#define BUG_UDB 0xffd6
#define BUG_LOCK 0xfff0
#ifdef CONFIG_GENERIC_BUG
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 1751f1eb95ef..e7060328bc28 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -71,7 +71,7 @@
*
* __cfi_foo:
* endbr64
- * subl 0x12345678, %r10d
+ * subl 0x12345678, %eax
* jz foo
* ud2
* nop
@@ -86,7 +86,7 @@
* indirect caller:
* lea foo(%rip), %r11
* ...
- * movl $0x12345678, %r10d
+ * movl $0x12345678, %eax
* subl $16, %r11
* nop4
* call *%r11
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7bde68247b5f..e4f4b7dce4ec 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -147,10 +147,10 @@ static void *its_init_thunk(void *thunk, int reg)
/*
* When ITS uses indirect branch thunk the fineibt_paranoid
* caller sequence doesn't fit in the caller site. So put the
- * remaining part of the sequence (<ea> + JNE) into the ITS
+ * remaining part of the sequence (UDB + JNE) into the ITS
* thunk.
*/
- bytes[i++] = 0xea; /* invalid instruction */
+ bytes[i++] = 0xd6; /* UDB */
bytes[i++] = 0x75; /* JNE */
bytes[i++] = 0xfd;
@@ -163,7 +163,7 @@ static void *its_init_thunk(void *thunk, int reg)
reg -= 8;
}
bytes[i++] = 0xff;
- bytes[i++] = 0xe0 + reg; /* jmp *reg */
+ bytes[i++] = 0xe0 + reg; /* JMP *reg */
bytes[i++] = 0xcc;
return thunk + offset;
@@ -970,7 +970,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
case JMP32_INSN_OPCODE:
/* Check for cfi_paranoid + ITS */
dest = addr + insn.length + insn.immediate.value;
- if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) {
+ if (dest[-1] == 0xd6 && (dest[0] & 0xf0) == 0x70) {
WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
continue;
}
@@ -1300,9 +1300,9 @@ early_param("cfi", cfi_parse_cmdline);
*
* __cfi_\func: __cfi_\func:
* movl $0x12345678,%eax // 5 endbr64 // 4
- * nop subl $0x12345678,%r10d // 7
- * nop jne __cfi_\func+6 // 2
- * nop nop3 // 3
+ * nop subl $0x12345678,%eax // 5
+ * nop nopl -42(%eax) // 5
+ * nop jne __cfi_\func+13 // 2
* nop
* nop
* nop
@@ -1314,9 +1314,9 @@ early_param("cfi", cfi_parse_cmdline);
*
*
* caller: caller:
- * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
+ * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%eax // 5
* addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4
- * je 1f // 2 nop4 // 4
+ * je 1f // 2 nop5 // 5
* ud2 // 2
* 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6
*
@@ -1325,20 +1325,20 @@ early_param("cfi", cfi_parse_cmdline);
/*
* <fineibt_preamble_start>:
* 0: f3 0f 1e fa endbr64
- * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d
- * b: 75 f9 jne 6 <fineibt_preamble_start+0x6>
- * d: 0f 1f 00 nopl (%rax)
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * b: 67 0f 1f 40 <d6> nopl -42(%eax)
+ * e: 75 fd jne 13 <fineibt_preamble_start+0xd>
*
- * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as
- * (bad) on x86_64 and raises #UD.
+ * Note that the JNE target is the 0xD6 byte inside the NOPL, this decodes as
+ * UDB on x86_64 and raises #UD.
*/
asm( ".pushsection .rodata \n"
"fineibt_preamble_start: \n"
" endbr64 \n"
- " subl $0x12345678, %r10d \n"
+ " subl $0x12345678, %eax \n"
"fineibt_preamble_bhi: \n"
- " jne fineibt_preamble_start+6 \n"
- ASM_NOP3
+ " nopl -42(%eax) \n"
+ " jne fineibt_preamble_start+13 \n"
"fineibt_preamble_end: \n"
".popsection\n"
);
@@ -1349,20 +1349,20 @@ extern u8 fineibt_preamble_end[];
#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start)
-#define fineibt_preamble_ud 6
-#define fineibt_preamble_hash 7
+#define fineibt_preamble_ud 13
+#define fineibt_preamble_hash 5
/*
* <fineibt_caller_start>:
- * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
- * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11
- * a: 0f 1f 40 00 nopl 0x0(%rax)
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 4d 8d 5b f0 lea -0x10(%r11), %r11
+ * 9: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
*/
asm( ".pushsection .rodata \n"
"fineibt_caller_start: \n"
- " movl $0x12345678, %r10d \n"
+ " movl $0x12345678, %eax \n"
" lea -0x10(%r11), %r11 \n"
- ASM_NOP4
+ ASM_NOP5
"fineibt_caller_end: \n"
".popsection \n"
);
@@ -1371,7 +1371,7 @@ extern u8 fineibt_caller_start[];
extern u8 fineibt_caller_end[];
#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
-#define fineibt_caller_hash 2
+#define fineibt_caller_hash 1
#define fineibt_caller_jmp (fineibt_caller_size - 2)
@@ -1388,9 +1388,9 @@ extern u8 fineibt_caller_end[];
* of adding a load.
*
* <fineibt_paranoid_start>:
- * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
- * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d
- * a: 4d 8d 5b <f0> lea -0x10(%r11), %r11
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 41 3b 43 f5 cmp -0x11(%r11), %eax
+ * 9: 2e 4d 8d 5b <f0> cs lea -0x10(%r11), %r11
* e: 75 fd jne d <fineibt_paranoid_start+0xd>
* 10: 41 ff d3 call *%r11
* 13: 90 nop
@@ -1402,9 +1402,10 @@ extern u8 fineibt_caller_end[];
*/
asm( ".pushsection .rodata \n"
"fineibt_paranoid_start: \n"
- " movl $0x12345678, %r10d \n"
- " cmpl -9(%r11), %r10d \n"
- " lea -0x10(%r11), %r11 \n"
+ " mov $0x12345678, %eax \n"
+ " cmpl -11(%r11), %eax \n"
+ " cs lea -0x10(%r11), %r11 \n"
+ "#fineibt_caller_size: \n"
" jne fineibt_paranoid_start+0xd \n"
"fineibt_paranoid_ind: \n"
" call *%r11 \n"
@@ -1520,57 +1521,76 @@ static int cfi_rand_preamble(s32 *start, s32 *end)
return 0;
}
+/*
+ * Inline the bhi-arity 1 case:
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 2e 49 0f 45 fa cs cmovne %rax, %rdi
+ * e: 75 03 jne foo+0x3
+ *
+ * foo:
+ * 10: 0f 1f 40 <d6> nopl -42(%rax)
+ *
+ * Notably, this scheme is incompatible with permissive CFI
+ * because the CMOVcc is unconditional and RDI will have been
+ * clobbered.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_bhi1_start: \n"
+ " cs cmovne %rax, %rdi \n"
+ " jne fineibt_bhi1_func + 0x3 \n"
+ "fineibt_bhi1_func: \n"
+ " nopl -42(%rax) \n"
+ "fineibt_bhi1_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_bhi1_start[];
+extern u8 fineibt_bhi1_end[];
+
+#define fineibt_bhi1_size (fineibt_bhi1_end - fineibt_bhi1_start)
+#define fineibt_bhi1_ud 0x13
+
static void cfi_fineibt_bhi_preamble(void *addr, int arity)
{
+ u8 bytes[MAX_INSN_SIZE];
+
if (!arity)
return;
if (!cfi_warn && arity == 1) {
- /*
- * Crazy scheme to allow arity-1 inline:
- *
- * __cfi_foo:
- * 0: f3 0f 1e fa endbr64
- * 4: 41 81 <ea> 78 56 34 12 sub 0x12345678, %r10d
- * b: 49 0f 45 fa cmovne %r10, %rdi
- * f: 75 f5 jne __cfi_foo+6
- * 11: 0f 1f 00 nopl (%rax)
- *
- * Code that direct calls to foo()+0, decodes the tail end as:
- *
- * foo:
- * 0: f5 cmc
- * 1: 0f 1f 00 nopl (%rax)
- *
- * which clobbers CF, but does not affect anything ABI
- * wise.
- *
- * Notably, this scheme is incompatible with permissive CFI
- * because the CMOVcc is unconditional and RDI will have been
- * clobbered.
- */
- const u8 magic[9] = {
- 0x49, 0x0f, 0x45, 0xfa,
- 0x75, 0xf5,
- BYTES_NOP3,
- };
-
- text_poke_early(addr + fineibt_preamble_bhi, magic, 9);
-
+ text_poke_early(addr + fineibt_preamble_bhi,
+ fineibt_bhi1_start, fineibt_bhi1_size);
return;
}
- text_poke_early(addr + fineibt_preamble_bhi,
- text_gen_insn(CALL_INSN_OPCODE,
- addr + fineibt_preamble_bhi,
- __bhi_args[arity]),
- CALL_INSN_SIZE);
+ /*
+ * Replace the bytes at fineibt_preamble_bhi with a CALL instruction
+ * that lines up exactly with the end of the preamble, such that the
+ * return address will be foo+0.
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 2e 2e e8 DD DD DD DD cs cs call __bhi_args[arity]
+ */
+ bytes[0] = 0x2e;
+ bytes[1] = 0x2e;
+ __text_gen_insn(bytes + 2, CALL_INSN_OPCODE,
+ addr + fineibt_preamble_bhi + 2,
+ __bhi_args[arity], CALL_INSN_SIZE);
+
+ text_poke_early(addr + fineibt_preamble_bhi, bytes, 7);
}
static int cfi_rewrite_preamble(s32 *start, s32 *end)
{
s32 *s;
+ BUG_ON(fineibt_bhi1_size != 11);
+
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
int arity;
@@ -1655,6 +1675,7 @@ static int cfi_rewrite_callers(s32 *start, s32 *end)
{
s32 *s;
+ BUG_ON(fineibt_caller_size != 14);
BUG_ON(fineibt_paranoid_size != 20);
for (s = start; s < end; s++) {
@@ -1824,10 +1845,9 @@ static void poison_cfi(void *addr)
/*
* __cfi_\func:
* osp nopl (%rax)
- * subl $0, %r10d
- * jz 1f
- * ud2
- * 1: nop
+ * sub $0, %rax
+ * nopl -42(%eax)
+ * jne __cfi_\func+0xd
*/
poison_endbr(addr);
poison_hash(addr + fineibt_preamble_hash);
@@ -1854,24 +1874,30 @@ static void poison_cfi(void *addr)
}
/*
- * When regs->ip points to a 0xEA byte in the FineIBT preamble,
+ * When regs->ip points to a 0xD6 byte in the FineIBT preamble,
* return true and fill out target and type.
*
* We check the preamble by checking for the ENDBR instruction relative to the
- * 0xEA instruction.
+ * UDB instruction.
*/
static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type)
{
unsigned long addr = regs->ip - fineibt_preamble_ud;
u32 hash;
- if (!exact_endbr((void *)addr))
+again:
+ if (!exact_endbr((void *)addr)) {
+ if (cfi_bhi && addr == regs->ip - fineibt_preamble_ud) {
+ addr = regs->ip - fineibt_bhi1_ud;
+ goto again;
+ }
return false;
+ }
*target = addr + fineibt_preamble_size;
__get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
- *type = (u32)regs->r10 + hash;
+ *type = (u32)regs->ax + hash;
/*
* Since regs->ip points to the middle of an instruction; it cannot
@@ -1914,7 +1940,7 @@ static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32
return false;
__get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
- *type = (u32)regs->r10 + hash;
+ *type = (u32)regs->ax + hash;
/*
* The UD2 sites are constructed with a RET immediately following,
@@ -1931,7 +1957,7 @@ static bool is_paranoid_thunk(unsigned long addr)
u32 thunk;
__get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault);
- return (thunk & 0x00FFFFFF) == 0xfd75ea;
+ return (thunk & 0x00FFFFFF) == 0xfd75d6;
Efault:
return false;
@@ -1939,8 +1965,7 @@ static bool is_paranoid_thunk(unsigned long addr)
/*
* regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
- * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS
- * thunk.
+ * sequence, or to UDB + Jcc.d8 for cfi_paranoid + ITS thunk.
*/
static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
{
@@ -1951,7 +1976,7 @@ static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target,
if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) {
*target = regs->r11 + fineibt_preamble_size;
- *type = regs->r10;
+ *type = regs->ax;
/*
* Since the trapping instruction is the exact, but LOCK prefixed,
@@ -1963,14 +1988,14 @@ static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target,
/*
* The cfi_paranoid + ITS thunk combination results in:
*
- * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
- * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d
+ * 0: 2e b8 78 56 34 12 cs mov $0x12345678, %eax
+ * 6: 41 3b 43 f7 cmp -0x9(%r11), %eax
* a: 4d 8d 5b f0 lea -0x10(%r11), %r11
* e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11
*
* Where the paranoid_thunk looks like:
*
- * 1d: <ea> (bad)
+ * 1d: <d6> udb
* __x86_indirect_paranoid_thunk_r11:
* 1e: 75 fd jne 1d
* __x86_indirect_its_thunk_r11:
@@ -1980,7 +2005,7 @@ static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target,
*/
if (is_paranoid_thunk(regs->ip)) {
*target = regs->r11 + fineibt_preamble_size;
- *type = regs->r10;
+ *type = regs->ax;
regs->ip = *target;
return true;
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index 99444409c026..59f8bf022ec5 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -123,7 +123,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code)
return;
}
- pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs));
+ early_printk("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs));
if (!ibt_fatal) {
printk(KERN_DEFAULT CUT_HERE);
__warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 36354b470590..6b22611e69cc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -97,7 +97,7 @@ __always_inline int is_valid_bugaddr(unsigned long addr)
* Check for UD1 or UD2, accounting for Address Size Override Prefixes.
* If it's a UD1, further decode to determine its use:
*
- * FineIBT: ea (bad)
+ * FineIBT: d6 udb
* FineIBT: f0 75 f9 lock jne . - 6
* UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax
* UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax
@@ -130,9 +130,9 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
WARN_ON_ONCE(!lock);
return BUG_LOCK;
- case 0xea:
+ case 0xd6:
*len = addr - start;
- return BUG_EA;
+ return BUG_UDB;
case OPCODE_ESCAPE:
break;
@@ -341,7 +341,7 @@ static noinstr bool handle_bug(struct pt_regs *regs)
}
fallthrough;
- case BUG_EA:
+ case BUG_UDB:
case BUG_LOCK:
if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
handled = true;
diff --git a/arch/x86/lib/bhi.S b/arch/x86/lib/bhi.S
index 58891681261b..aad1e5839202 100644
--- a/arch/x86/lib/bhi.S
+++ b/arch/x86/lib/bhi.S
@@ -5,7 +5,7 @@
#include <asm/nospec-branch.h>
/*
- * Notably, the FineIBT preamble calling these will have ZF set and r10 zero.
+ * Notably, the FineIBT preamble calling these will have ZF set and eax zero.
*
* The very last element is in fact larger than 32 bytes, but since its the
* last element, this does not matter,
@@ -36,7 +36,7 @@ SYM_INNER_LABEL(__bhi_args_1, SYM_L_LOCAL)
ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
jne .Lud_1
- cmovne %r10, %rdi
+ cmovne %rax, %rdi
ANNOTATE_UNRET_SAFE
ret
int3
@@ -53,8 +53,8 @@ SYM_INNER_LABEL(__bhi_args_2, SYM_L_LOCAL)
ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
jne .Lud_1
- cmovne %r10, %rdi
- cmovne %r10, %rsi
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
ANNOTATE_UNRET_SAFE
ret
int3
@@ -64,9 +64,9 @@ SYM_INNER_LABEL(__bhi_args_3, SYM_L_LOCAL)
ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
jne .Lud_1
- cmovne %r10, %rdi
- cmovne %r10, %rsi
- cmovne %r10, %rdx
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
+ cmovne %rax, %rdx
ANNOTATE_UNRET_SAFE
ret
int3
@@ -76,10 +76,10 @@ SYM_INNER_LABEL(__bhi_args_4, SYM_L_LOCAL)
ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
jne .Lud_2
- cmovne %r10, %rdi
- cmovne %r10, %rsi
- cmovne %r10, %rdx
- cmovne %r10, %rcx
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
+ cmovne %rax, %rdx
+ cmovne %rax, %rcx
ANNOTATE_UNRET_SAFE
ret
int3
@@ -89,11 +89,11 @@ SYM_INNER_LABEL(__bhi_args_5, SYM_L_LOCAL)
ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
jne .Lud_2
- cmovne %r10, %rdi
- cmovne %r10, %rsi
- cmovne %r10, %rdx
- cmovne %r10, %rcx
- cmovne %r10, %r8
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
+ cmovne %rax, %rdx
+ cmovne %rax, %rcx
+ cmovne %rax, %r8
ANNOTATE_UNRET_SAFE
ret
int3
@@ -110,12 +110,12 @@ SYM_INNER_LABEL(__bhi_args_6, SYM_L_LOCAL)
ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
jne .Lud_2
- cmovne %r10, %rdi
- cmovne %r10, %rsi
- cmovne %r10, %rdx
- cmovne %r10, %rcx
- cmovne %r10, %r8
- cmovne %r10, %r9
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
+ cmovne %rax, %rdx
+ cmovne %rax, %rcx
+ cmovne %rax, %r8
+ cmovne %rax, %r9
ANNOTATE_UNRET_SAFE
ret
int3
@@ -125,13 +125,13 @@ SYM_INNER_LABEL(__bhi_args_7, SYM_L_LOCAL)
ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
jne .Lud_2
- cmovne %r10, %rdi
- cmovne %r10, %rsi
- cmovne %r10, %rdx
- cmovne %r10, %rcx
- cmovne %r10, %r8
- cmovne %r10, %r9
- cmovne %r10, %rsp
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
+ cmovne %rax, %rdx
+ cmovne %rax, %rcx
+ cmovne %rax, %r8
+ cmovne %rax, %r9
+ cmovne %rax, %rsp
ANNOTATE_UNRET_SAFE
ret
int3
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index d78d769a02bd..24b7aca454ec 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -373,10 +373,10 @@ SYM_FUNC_END(call_depth_return_thunk)
.macro ITS_THUNK reg
/*
- * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b)
+ * If CFI paranoid is used then the ITS thunk starts with opcodes (1: udb; jne 1b)
* that complete the fineibt_paranoid caller sequence.
*/
-1: .byte 0xea
+1: ASM_UDB
SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL)
UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 7e3fca164620..f54353ef3e51 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -419,12 +419,13 @@ static void emit_fineibt(u8 **pprog, u8 *ip, u32 hash, int arity)
u8 *prog = *pprog;
EMIT_ENDBR();
- EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */
+ EMIT1_off32(0x2d, hash); /* subl $hash, %eax */
if (cfi_bhi) {
+ EMIT2(0x2e, 0x2e); /* cs cs */
emit_call(&prog, __bhi_args[arity], ip + 11);
} else {
- EMIT2(0x75, 0xf9); /* jne.d8 .-7 */
- EMIT3(0x0f, 0x1f, 0x00); /* nop3 */
+ EMIT5(0x67, 0x0f, 0x1f, 0x40, 0xd6); /* nopl -42(%eax) */
+ EMIT2(0x75, 0xfd); /* jne.d8 .-3 */
}
EMIT_ENDBR_POISON();
On Thu, Aug 14, 2025 at 2:17 PM Peter Zijlstra <peterz@infradead.org> wrote: > > Hi! > > A while ago FineIBT started using the instruction 0xEA to generate #UD. > All existing parts will generate #UD in 64bit mode on that instruction. > > However; Intel/AMD have not blessed using this instruction, it is on > their 'reserved' list for future use. > > Peter Anvin worked the committees and got use of 0xD6 blessed, and it > will be called UDB (per the next SDM or so). > > Reworking the FineIBT code to use UDB wasn't entirely trivial, and I've > had to switch the hash register to EAX in order to free up some bytes. > > Per the x86_64 ABI, EAX is used to pass the number of vector registers > for varargs -- something that should not happen in the kernel. More so, > we build with -mskip-rax-setup, which should leave EAX completely unused > in the calling convention. rax is used to pass tail_call count. See diagram in commit log: https://lore.kernel.org/all/20240714123902.32305-2-hffilwlqm@gmail.com/ Before that commit rax was used differently. Bottom line rax was used for a long time to support bpf_tail_calls. I'm traveling atm. So cc-ing folks for follow ups.
On Fri, Aug 15, 2025 at 08:42:39AM +0300, Alexei Starovoitov wrote: > On Thu, Aug 14, 2025 at 2:17 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > Hi! > > > > A while ago FineIBT started using the instruction 0xEA to generate #UD. > > All existing parts will generate #UD in 64bit mode on that instruction. > > > > However; Intel/AMD have not blessed using this instruction, it is on > > their 'reserved' list for future use. > > > > Peter Anvin worked the committees and got use of 0xD6 blessed, and it > > will be called UDB (per the next SDM or so). > > > > Reworking the FineIBT code to use UDB wasn't entirely trivial, and I've > > had to switch the hash register to EAX in order to free up some bytes. > > > > Per the x86_64 ABI, EAX is used to pass the number of vector registers > > for varargs -- something that should not happen in the kernel. More so, > > we build with -mskip-rax-setup, which should leave EAX completely unused > > in the calling convention. > > rax is used to pass tail_call count. > See diagram in commit log: > https://lore.kernel.org/all/20240714123902.32305-2-hffilwlqm@gmail.com/ > Before that commit rax was used differently. > Bottom line rax was used for a long time to support bpf_tail_calls. > I'm traveling atm. So cc-ing folks for follow ups. IIRC the bpf2bpf tailcall doesn't use CFI at the moment. But let me double check. So emit_cfi() is called at the very start of emit_prologue() and __arch_prepare_bpf_trampoline() in the BPF_TRAMP_F_INDIRECT case. Now, emit_prologue() starts with the CFI bits, but the tailcall lands at X86_TAIL_CALL_OFFSET, at which spot we only have EMIT_ENDBR(), nothing else. So RAX should be unaffected at that point. So, AFAICT, we're good on that point. It is just the C level indirect function call ABI that is affected, BPF internal conventions are unaffected.
On 15/8/25 15:57, Peter Zijlstra wrote: > On Fri, Aug 15, 2025 at 08:42:39AM +0300, Alexei Starovoitov wrote: >> On Thu, Aug 14, 2025 at 2:17 PM Peter Zijlstra <peterz@infradead.org> wrote: >>> >>> Hi! >>> >>> A while ago FineIBT started using the instruction 0xEA to generate #UD. >>> All existing parts will generate #UD in 64bit mode on that instruction. >>> >>> However; Intel/AMD have not blessed using this instruction, it is on >>> their 'reserved' list for future use. >>> >>> Peter Anvin worked the committees and got use of 0xD6 blessed, and it >>> will be called UDB (per the next SDM or so). >>> >>> Reworking the FineIBT code to use UDB wasn't entirely trivial, and I've >>> had to switch the hash register to EAX in order to free up some bytes. >>> >>> Per the x86_64 ABI, EAX is used to pass the number of vector registers >>> for varargs -- something that should not happen in the kernel. More so, >>> we build with -mskip-rax-setup, which should leave EAX completely unused >>> in the calling convention. >> >> rax is used to pass tail_call count. >> See diagram in commit log: >> https://lore.kernel.org/all/20240714123902.32305-2-hffilwlqm@gmail.com/ >> Before that commit rax was used differently. >> Bottom line rax was used for a long time to support bpf_tail_calls. >> I'm traveling atm. So cc-ing folks for follow ups. > > IIRC the bpf2bpf tailcall doesn't use CFI at the moment. But let me > double check. > > So emit_cfi() is called at the very start of emit_prologue() and > __arch_prepare_bpf_trampoline() in the BPF_TRAMP_F_INDIRECT case. > > Now, emit_prologue() starts with the CFI bits, but the tailcall lands at > X86_TAIL_CALL_OFFSET, at which spot we only have EMIT_ENDBR(), nothing > else. So RAX should be unaffected at that point. > > So, AFAICT, we're good on that point. It is just the C level indirect > function call ABI that is affected, BPF internal conventions are > unaffected. 
> RAX is used for propagating tail_call_cnt_ptr from caller to callee for bpf2bpf+tailcall on x86_64. Before the aforementioned commit, RAX is used for propagating tail_call_cnt from caller to callee for the case. Thanks, Leon
On 2025-08-14 04:17, Peter Zijlstra wrote: > Hi! > > A while ago FineIBT started using the instruction 0xEA to generate #UD. > All existing parts will generate #UD in 64bit mode on that instruction. > > However; Intel/AMD have not blessed using this instruction, it is on > their 'reserved' list for future use. > > Peter Anvin worked the committees and got use of 0xD6 blessed, and it > will be called UDB (per the next SDM or so). > > Reworking the FineIBT code to use UDB wasn't entirely trivial, and I've > had to switch the hash register to EAX in order to free up some bytes. > > Per the x86_64 ABI, EAX is used to pass the number of vector registers > for varargs -- something that should not happen in the kernel. More so, > we build with -mskip-rax-setup, which should leave EAX completely unused > in the calling convention. > > The code boots and passes the LKDTM CFI_FORWARD_PROTO test for various > combinations (non exhaustive so far). > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Looks good to me (and using %eax will save one byte per call site as well), but as per our IRC discussion, *my understanding* is that the best possible performance (least branch predictor impact across implementations) is to use a forward branch with a 2E prefix (jcc,pn in GAS syntax) rather than a reverse branch, if space allows. -hpa
On Thu, Aug 14, 2025 at 06:27:44PM -0700, H. Peter Anvin wrote: > On 2025-08-14 04:17, Peter Zijlstra wrote: > > Hi! > > > > A while ago FineIBT started using the instruction 0xEA to generate #UD. > > All existing parts will generate #UD in 64bit mode on that instruction. > > > > However; Intel/AMD have not blessed using this instruction, it is on > > their 'reserved' list for future use. > > > > Peter Anvin worked the committees and got use of 0xD6 blessed, and it > > will be called UDB (per the next SDM or so). > > > > Reworking the FineIBT code to use UDB wasn't entirely trivial, and I've > > had to switch the hash register to EAX in order to free up some bytes. > > > > Per the x86_64 ABI, EAX is used to pass the number of vector registers > > for varargs -- something that should not happen in the kernel. More so, > > we build with -mskip-rax-setup, which should leave EAX completely unused > > in the calling convention. > > > > The code boots and passes the LKDTM CFI_FORWARD_PROTO test for various > > combinations (non exhaustive so far). > > > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> > > Looks good to me (and using %eax will save one byte per call site as > well), but as per our IRC discussion, *my understanding* is that the > best possible performance (least branch predictor impact across > implementations) is to use a forward branch with a 2E prefix (jcc,pn in > GAS syntax) rather than a reverse branch, if space allows. Oh right. I did see that comment on IRC and them promptly forgot about it again :/ I'll have a poke. Scott, do you agree? You being responsible for the backward jump and such.
On Fri, Aug 15, 2025 at 09:49:39AM +0200, Peter Zijlstra wrote: > On Thu, Aug 14, 2025 at 06:27:44PM -0700, H. Peter Anvin wrote: > > On 2025-08-14 04:17, Peter Zijlstra wrote: > > > Hi! > > > > > > A while ago FineIBT started using the instruction 0xEA to generate #UD. > > > All existing parts will generate #UD in 64bit mode on that instruction. > > > > > > However; Intel/AMD have not blessed using this instruction, it is on > > > their 'reserved' list for future use. > > > > > > Peter Anvin worked the committees and got use of 0xD6 blessed, and it > > > will be called UDB (per the next SDM or so). > > > > > > Reworking the FineIBT code to use UDB wasn't entirely trivial, and I've > > > had to switch the hash register to EAX in order to free up some bytes. > > > > > > Per the x86_64 ABI, EAX is used to pass the number of vector registers > > > for varargs -- something that should not happen in the kernel. More so, > > > we build with -mskip-rax-setup, which should leave EAX completely unused > > > in the calling convention. > > > > > > The code boots and passes the LKDTM CFI_FORWARD_PROTO test for various > > > combinations (non exhaustive so far). > > > > > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> > > > > Looks good to me (and using %eax will save one byte per call site as > > well), but as per our IRC discussion, *my understanding* is that the > > best possible performance (least branch predictor impact across > > implementations) is to use a forward branch with a 2E prefix (jcc,pn in > > GAS syntax) rather than a reverse branch, if space allows. > > Oh right. I did see that comment on IRC and them promptly forgot about > it again :/ I'll have a poke. Scott, do you agree? You being responsible > for the backward jump and such. On top of the other, to show the delta. If we want a fwd branch, we can stick the D6 in the endbr poison nop. 
That frees up more bytes again, but also that matches what I already did for the bhi1 case, so less special cases is more better. I've had to use cs prefixed jcc.d32, because our older toolchains don't like the ,pn notation. --- arch/x86/include/asm/cfi.h | 5 ++--- arch/x86/include/asm/ibt.h | 10 +++------ arch/x86/kernel/alternative.c | 50 +++++++++++++++++++++---------------------- arch/x86/net/bpf_jit_comp.c | 3 +-- 4 files changed, 30 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 9256908b5134..3fcfdd996962 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -72,10 +72,9 @@ * __cfi_foo: * endbr64 * subl 0x12345678, %eax - * nopl -42(%eax) - * jne __cfi_foo+13 + * jne.32,pn foo+3 * foo: - * osp nop3 # was endbr64 + * nopl -42(%rax) # was endbr64 * ... code here ... * ret * diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index 28d845257303..5e45d6424722 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -59,10 +59,10 @@ static __always_inline __attribute_const__ u32 gen_endbr(void) static __always_inline __attribute_const__ u32 gen_endbr_poison(void) { /* - * 4 byte NOP that isn't NOP4 (in fact it is OSP NOP3), such that it - * will be unique to (former) ENDBR sites. + * 4 byte NOP that isn't NOP4, such that it will be unique to (former) + * ENDBR sites. Additionally it carries UDB as immediate. 
*/ - return 0x001f0f66; /* osp nopl (%rax) */ + return 0xd6401f0f; /* nopl -42(%rax) */ } static inline bool __is_endbr(u32 val) @@ -70,10 +70,6 @@ static inline bool __is_endbr(u32 val) if (val == gen_endbr_poison()) return true; - /* See cfi_fineibt_bhi_preamble() */ - if (IS_ENABLED(CONFIG_FINEIBT_BHI) && val == 0x001f0ff5) - return true; - val &= ~0x01000000U; /* ENDBR32 -> ENDBR64 */ return val == gen_endbr(); } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 66e26025b2d4..3fc0d1edab70 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1301,8 +1301,7 @@ early_param("cfi", cfi_parse_cmdline); * __cfi_\func: __cfi_\func: * movl $0x12345678,%eax // 5 endbr64 // 4 * nop subl $0x12345678,%eax // 5 - * nop nopl -42(%eax) // 5 - * nop jne __cfi_\func+13 // 2 + * nop jne.d32,pn \func+3 // 7 * nop * nop * nop @@ -1311,6 +1310,9 @@ early_param("cfi", cfi_parse_cmdline); * nop * nop * nop + * nop + * \func: \func: + * endbr64 nopl -42(%rax) * * * caller: caller: @@ -1326,8 +1328,8 @@ early_param("cfi", cfi_parse_cmdline); * <fineibt_preamble_start>: * 0: f3 0f 1e fa endbr64 * 4: 2d 78 56 34 12 sub $0x12345678, %eax - * b: 67 0f 1f 40 <d6> nopl -42(%eax) - * e: 75 fd jne 13 <fineibt_preamble_start+0xd> + * 9: 2e 0f 85 03 00 00 00 jne,pn 13 <fineibt_preamble_start+0x13> + * 10: 0f 1f 40 d6 nopl -0x2a(%rax) * * Note that the JNE target is the 0xD6 byte inside the NOPL, this decodes as * UDB on x86_64 and raises #UD. 
@@ -1337,8 +1339,9 @@ asm( ".pushsection .rodata \n" " endbr64 \n" " subl $0x12345678, %eax \n" "fineibt_preamble_bhi: \n" - " nopl -42(%eax) \n" - " jne fineibt_preamble_start+13 \n" + " cs jne.d32 fineibt_preamble_start+0x13 \n" + "#fineibt_func: \n" + " nopl -42(%rax) \n" "fineibt_preamble_end: \n" ".popsection\n" ); @@ -1349,7 +1352,7 @@ extern u8 fineibt_preamble_end[]; #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) #define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start) -#define fineibt_preamble_ud 13 +#define fineibt_preamble_ud 0x13 #define fineibt_preamble_hash 5 /* @@ -1551,7 +1554,6 @@ extern u8 fineibt_bhi1_start[]; extern u8 fineibt_bhi1_end[]; #define fineibt_bhi1_size (fineibt_bhi1_end - fineibt_bhi1_start) -#define fineibt_bhi1_ud 0x13 static void cfi_fineibt_bhi_preamble(void *addr, int arity) { @@ -1589,8 +1591,6 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; - BUG_ON(fineibt_bhi1_size != 11); - for (s = start; s < end; s++) { void *addr = (void *)s + *s; int arity; @@ -1675,9 +1675,6 @@ static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; - BUG_ON(fineibt_caller_size != 14); - BUG_ON(fineibt_paranoid_size != 20); - for (s = start; s < end; s++) { void *addr = (void *)s + *s; struct insn insn; @@ -1728,13 +1725,18 @@ static int cfi_rewrite_callers(s32 *start, s32 *end) return 0; } +#define FINEIBT_WARN(_f, _v) \ + WARN_ONCE((_f) != (_v), "FineIBT: " #_f " %ld != %d\n", _f, _v) + static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, bool builtin) { int ret; - if (WARN_ONCE(fineibt_preamble_size != 16, - "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) + if (FINEIBT_WARN(fineibt_preamble_size, 20) || + FINEIBT_WARN(fineibt_preamble_bhi + fineibt_bhi1_size, 20) || + FINEIBT_WARN(fineibt_caller_size, 14) || + FINEIBT_WARN(fineibt_paranoid_size, 20)) return; if (cfi_mode == CFI_AUTO) { @@ -1873,6 +1875,8 @@ static 
void poison_cfi(void *addr) } } +#define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE) + /* * When regs->ip points to a 0xD6 byte in the FineIBT preamble, * return true and fill out target and type. @@ -1885,16 +1889,10 @@ static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, unsigned long addr = regs->ip - fineibt_preamble_ud; u32 hash; -again: - if (!exact_endbr((void *)addr)) { - if (cfi_bhi && addr == regs->ip - fineibt_preamble_ud) { - addr = regs->ip - fineibt_bhi1_ud; - goto again; - } + if (!exact_endbr((void *)addr)) return false; - } - *target = addr + fineibt_preamble_size; + *target = addr + fineibt_prefix_size; __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); *type = (u32)regs->ax + hash; @@ -1935,7 +1933,7 @@ static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault); *target = addr; - addr -= fineibt_preamble_size; + addr -= fineibt_prefix_size; if (!exact_endbr((void *)addr)) return false; @@ -1975,7 +1973,7 @@ static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, return false; if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) { - *target = regs->r11 + fineibt_preamble_size; + *target = regs->r11 + fineibt_prefix_size; *type = regs->ax; /* @@ -2004,7 +2002,7 @@ static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, * */ if (is_paranoid_thunk(regs->ip)) { - *target = regs->r11 + fineibt_preamble_size; + *target = regs->r11 + fineibt_prefix_size; *type = regs->ax; regs->ip = *target; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f54353ef3e51..5178ef1aa5c7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -424,8 +424,7 @@ static void emit_fineibt(u8 **pprog, u8 *ip, u32 hash, int arity) EMIT2(0x2e, 0x2e); /* cs cs */ emit_call(&prog, __bhi_args[arity], ip + 11); } else { - EMIT5(0x67, 0x0f, 
0x1f, 0x40, 0xd6); /* nopl -42(%eax) */ - EMIT2(0x75, 0xfd); /* jne.d8 .-3 */ + EMIT3_off32(0x2e, 0x0f, 0x85, 3); /* jne.d32,pn 3 */ } EMIT_ENDBR_POISON();
On Fri, Aug 15, 2025 at 12:28:39PM +0200, Peter Zijlstra wrote: > On Fri, Aug 15, 2025 at 09:49:39AM +0200, Peter Zijlstra wrote: > > On Thu, Aug 14, 2025 at 06:27:44PM -0700, H. Peter Anvin wrote: > > > On 2025-08-14 04:17, Peter Zijlstra wrote: > > > > Hi! > > > > > > > > A while ago FineIBT started using the instruction 0xEA to generate #UD. > > > > All existing parts will generate #UD in 64bit mode on that instruction. > > > > > > > > However; Intel/AMD have not blessed using this instruction, it is on > > > > their 'reserved' list for future use. > > > > > > > > Peter Anvin worked the committees and got use of 0xD6 blessed, and it > > > > will be called UDB (per the next SDM or so). > > > > > > > > Reworking the FineIBT code to use UDB wasn't entirely trivial, and I've > > > > had to switch the hash register to EAX in order to free up some bytes. > > > > > > > > Per the x86_64 ABI, EAX is used to pass the number of vector registers > > > > for varargs -- something that should not happen in the kernel. More so, > > > > we build with -mskip-rax-setup, which should leave EAX completely unused > > > > in the calling convention. > > > > > > > > The code boots and passes the LKDTM CFI_FORWARD_PROTO test for various > > > > combinations (non exhaustive so far). > > > > > > > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> > > > > > > Looks good to me (and using %eax will save one byte per call site as > > > well), but as per our IRC discussion, *my understanding* is that the > > > best possible performance (least branch predictor impact across > > > implementations) is to use a forward branch with a 2E prefix (jcc,pn in > > > GAS syntax) rather than a reverse branch, if space allows. > > > > Oh right. I did see that comment on IRC and them promptly forgot about > > it again :/ I'll have a poke. Scott, do you agree? You being responsible > > for the backward jump and such. > > On top of the other, to show the delta. 
> > If we want a fwd branch, we can stick the D6 in the endbr poison nop. > That frees up more bytes again, but also that matches what I already did > for the bhi1 case, so less special cases is more better. > > I've had to use cs prefixed jcc.d32, because our older toolchains don't > like the ,pn notation. And then I forgot to move that cs prefix around in the bhi1 case... fixed that.
On 15/08/2025 11:30 am, Peter Zijlstra wrote: > On Fri, Aug 15, 2025 at 12:28:39PM +0200, Peter Zijlstra wrote: >> On Fri, Aug 15, 2025 at 09:49:39AM +0200, Peter Zijlstra wrote: >>> On Thu, Aug 14, 2025 at 06:27:44PM -0700, H. Peter Anvin wrote: >>>> On 2025-08-14 04:17, Peter Zijlstra wrote: >>>>> Hi! >>>>> >>>>> A while ago FineIBT started using the instruction 0xEA to generate #UD. >>>>> All existing parts will generate #UD in 64bit mode on that instruction. >>>>> >>>>> However; Intel/AMD have not blessed using this instruction, it is on >>>>> their 'reserved' list for future use. >>>>> >>>>> Peter Anvin worked the committees and got use of 0xD6 blessed, and it >>>>> will be called UDB (per the next SDM or so). >>>>> >>>>> Reworking the FineIBT code to use UDB wasn't entirely trivial, and I've >>>>> had to switch the hash register to EAX in order to free up some bytes. >>>>> >>>>> Per the x86_64 ABI, EAX is used to pass the number of vector registers >>>>> for varargs -- something that should not happen in the kernel. More so, >>>>> we build with -mskip-rax-setup, which should leave EAX completely unused >>>>> in the calling convention. >>>>> >>>>> The code boots and passes the LKDTM CFI_FORWARD_PROTO test for various >>>>> combinations (non exhaustive so far). >>>>> >>>>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> >>>> Looks good to me (and using %eax will save one byte per call site as >>>> well), but as per our IRC discussion, *my understanding* is that the >>>> best possible performance (least branch predictor impact across >>>> implementations) is to use a forward branch with a 2E prefix (jcc,pn in >>>> GAS syntax) rather than a reverse branch, if space allows. >>> Oh right. I did see that comment on IRC and them promptly forgot about >>> it again :/ I'll have a poke. Scott, do you agree? You being responsible >>> for the backward jump and such. >> On top of the other, to show the delta. 
>> >> If we want a fwd branch, we can stick the D6 in the endbr poison nop. >> That frees up more bytes again, but also that matches what I already did >> for the bhi1 case, so less special cases is more better. >> >> I've had to use cs prefixed jcc.d32, because our older toolchains don't >> like the ,pn notation. > And then I forgot to move that cs prefix around in the bhi1 case... > fixed that. Dare I ask what ,pn notation is? It's not only the older toolchains that don't know it :) ~Andrew
On Fri, Aug 15, 2025 at 11:43:11AM +0100, Andrew Cooper wrote: > >> I've had to use cs prefixed jcc.d32, because our older toolchains don't > >> like the ,pn notation. > > And then I forgot to move that cs prefix around in the bhi1 case... > > fixed that. > > Dare I ask what ,pn notation is? It's not only the older toolchains > that don't know it :) Some fancy notation for the CS/DS branch hints. CS Jcc, decodes to Jcc,pn for non-taken DS Jcc, decodes to Jcc,pt for taken
On 15/08/2025 11:59 am, Peter Zijlstra wrote: > On Fri, Aug 15, 2025 at 11:43:11AM +0100, Andrew Cooper wrote: > >>>> I've had to use cs prefixed jcc.d32, because our older toolchains don't >>>> like the ,pn notation. >>> And then I forgot to move that cs prefix around in the bhi1 case... >>> fixed that. >> Dare I ask what ,pn notation is? It's not only the older toolchains >> that don't know it :) > Some fancy notation for the CS/DS branch hints. > > CS Jcc, decodes to Jcc,pn for non-taken > DS Jcc, decodes to Jcc,pt for taken Ah, thanks. I was looking at the hex in one of the comments and still couldn't figure it out. So with this notation, we also have the dual meaning of ,pt between the P4 and LNC. At least the encoding is the same. ~Andrew
On 2025-08-15 04:19, Andrew Cooper wrote: >> >> CS Jcc, decodes to Jcc,pn for non-taken >> DS Jcc, decodes to Jcc,pt for taken > > Ah, thanks. I was looking at the hex in one of the comments and still > couldn't figure it out. > > So with this notation, we also have the dual meaning of ,pt between the > P4 and LNC. At least the encoding is the same. > What "dual meaning?" -hpa
On 15/08/2025 6:14 pm, H. Peter Anvin wrote: > On 2025-08-15 04:19, Andrew Cooper wrote: >>> CS Jcc, decodes to Jcc,pn for non-taken >>> DS Jcc, decodes to Jcc,pt for taken >> Ah, thanks. I was looking at the hex in one of the comments and still >> couldn't figure it out. >> >> So with this notation, we also have the dual meaning of ,pt between the >> P4 and LNC. At least the encoding is the same. >> > What "dual meaning?" Well, it has different semantics now it's been reintroduced in LNC. (Only has any effect on a prediction miss, and causes a proactive decode resteer). Sure, it's "just perf" so can be argued as "compatible behaviour", but people caring about the ,pt / ,pn properties do need to know the difference. ~Andrew
On August 15, 2025 10:21:48 AM PDT, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >On 15/08/2025 6:14 pm, H. Peter Anvin wrote: >> On 2025-08-15 04:19, Andrew Cooper wrote: >>>> CS Jcc, decodes to Jcc,pn for non-taken >>>> DS Jcc, decodes to Jcc,pt for taken >>> Ah, thanks. I was looking at the hex in one of the comments and still >>> couldn't figure it out. >>> >>> So with this notation, we also have the dual meaning of ,pt between the >>> P4 and LNC. At least the encoding is the same. >>> >> What "dual meaning?" > >Well, it has different semantics now it's been reintroduced in LNC. >(Only has any effect on a prediction miss, and causes a proactive decode >resteer). > >Sure, it's "just perf" so can be argued as "compatible behaviour", but >people caring about the ,pt / ,pn properties do need to know the different. > >~Andrew The architectural semantics are "hint to the microarchitecture that this branch is strongly biased in one specific direction", and that has been true since the P4. There are no other semantic guarantees, including, of course, that the hardware will do anything at all with it. Performance tuning at the uarch level changes over time and will continue to do so, so if you don't distinguish between architectural behavior and performance you have to define pretty much every instruction as semantically different in each generation.
On Thu, Aug 14, 2025 at 01:17:32PM +0200, Peter Zijlstra wrote: > Hi! > > A while ago FineIBT started using the instruction 0xEA to generate #UD. > All existing parts will generate #UD in 64bit mode on that instruction. > > However; Intel/AMD have not blessed using this instruction, it is on > their 'reserved' list for future use. > > Peter Anvin worked the committees and got use of 0xD6 blessed, and it > will be called UDB (per the next SDM or so). > > Reworking the FineIBT code to use UDB wasn't entirely trivial, and I've > had to switch the hash register to EAX in order to free up some bytes. > > Per the x86_64 ABI, EAX is used to pass the number of vector registers > for varargs -- something that should not happen in the kernel. More so, > we build with -mskip-rax-setup, which should leave EAX completely unused > in the calling convention. > > The code boots and passes the LKDTM CFI_FORWARD_PROTO test for various > combinations (non exhaustive so far). Must be perfect :P on my hardware: cfi=auto (i.e. cfi=fineibt,paranoid): ``` [ +0.000861] SMP alternatives: Using paranoid FineIBT CFI ... [Aug14 11:33] lkdtm: Performing direct entry CFI_FORWARD_PROTO [ +0.000005] lkdtm: Calling matched prototype ... [ +0.000000] lkdtm: Calling mismatched prototype ... 
[ +0.000014] CFI failure at lkdtm_indirect_call+0x1c/0x30 [lkdtm] (target: lkdtm_increment_int+0x0/0x30 [lkdtm]; expected type: 0x4d3fe584) [ +0.000027] Oops: invalid opcode: 0000 [#1] SMP NOPTI [ +0.000004] CPU: 0 UID: 0 PID: 3232 Comm: cat Not tainted 6.17.0-rc1-debug-next-20250814-02528-g3aebcf0edb14 #1 PREEMPT(full) 724b1db7f580ddb34e14b43820ff16d9fd75d36c [ +0.000004] Hardware name: AZW MINI S/MINI S, BIOS ADLNV106 05/12/2024 [ +0.000002] RIP: 0010:lkdtm_indirect_call+0x1c/0x30 [lkdtm] [ +0.000008] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 49 89 fb 48 c7 c7 30 49 fd c1 b8 84 e5 3f 4d 41 3b 43 f5 2e 4d 8d 5b <f0> 75 fd 41 ff e3 90 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 f3 0f [ +0.000002] RSP: 0018:ffffceb847387d38 EFLAGS: 00010216 [ +0.000002] RAX: 000000004d3fe584 RBX: 0000000000000001 RCX: 332c9ba94c7b7c00 [ +0.000002] RDX: 0000000000000002 RSI: 00000000ffffefff RDI: ffffffffc1fd4930 [ +0.000002] RBP: 0000000000000002 R08: 0000000000000fff R09: ffffffffb025b2a0 [ +0.000001] R10: 0000000000002ffd R11: ffffffffc218e0d0 R12: 0000000000000000 [ +0.000002] R13: 0000000000000006 R14: ffff8bc3c3d3b000 R15: ffffffffc1fd47c0 [ +0.000001] FS: 00007f025f833740(0000) GS:ffff8bc6febb8000(0000) knlGS:0000000000000000 [ +0.000002] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ +0.000001] CR2: 00007f025f5bf000 CR3: 000000017036a002 CR4: 0000000000f72ef0 [ +0.000002] PKRU: 55555554 [ +0.000001] Call Trace: [ +0.000002] <TASK> [ +0.000002] lkdtm_CFI_FORWARD_PROTO+0x39/0x60 [lkdtm 57ec71482c5cb110c2f23373ae7338671243d293] [ +0.000008] lkdtm_do_action+0x26/0x40 [lkdtm 57ec71482c5cb110c2f23373ae7338671243d293] [ +0.000009] direct_entry+0x147/0x160 [lkdtm 57ec71482c5cb110c2f23373ae7338671243d293] [ +0.000009] full_proxy_write+0x9e/0x140 [ +0.000005] vfs_write+0x11b/0x3f0 [ +0.000004] ksys_write+0x79/0x100 [ +0.000002] do_syscall_64+0x85/0x920 [ +0.000003] ? 
do_user_addr_fault+0x271/0x680 [ +0.000003] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ +0.000002] RIP: 0033:0x7f025f6931ce [ +0.000036] Code: 4d 89 d8 e8 64 be 00 00 4c 8b 5d f8 41 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 11 c9 c3 0f 1f 80 00 00 00 00 48 8b 45 10 0f 05 <c9> c3 83 e2 39 83 fa 08 75 e7 e8 13 ff ff ff 0f 1f 00 f3 0f 1e fa [ +0.000001] RSP: 002b:00007ffd001e8930 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [ +0.000003] RAX: ffffffffffffffda RBX: 0000000000000012 RCX: 00007f025f6931ce [ +0.000001] RDX: 0000000000000012 RSI: 00007f025f5bf000 RDI: 0000000000000001 [ +0.000001] RBP: 00007ffd001e8940 R08: 0000000000000000 R09: 0000000000000000 [ +0.000002] R10: 0000000000000000 R11: 0000000000000202 R12: 00007f025f5bf000 [ +0.000001] R13: 0000000000000012 R14: 0000000000000000 R15: 0000000000000000 [ +0.000002] </TASK> [ +0.000001] Modules linked in: lkdtm ... [ +0.000047] hid_logitech_dj hid_razer nvme nvme_core nvme_keyring spi_intel_pci nvme_auth spi_intel [ +0.000006] ---[ end trace 0000000000000000 ]--- [ +0.000002] RIP: 0010:lkdtm_indirect_call+0x1c/0x30 [lkdtm] [ +0.000008] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 49 89 fb 48 c7 c7 30 49 fd c1 b8 84 e5 3f 4d 41 3b 43 f5 2e 4d 8d 5b <f0> 75 fd 41 ff e3 90 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 f3 0f [ +0.000002] RSP: 0018:ffffceb847387d38 EFLAGS: 00010216 [ +0.000002] RAX: 000000004d3fe584 RBX: 0000000000000001 RCX: 332c9ba94c7b7c00 [ +0.000001] RDX: 0000000000000002 RSI: 00000000ffffefff RDI: ffffffffc1fd4930 [ +0.000001] RBP: 0000000000000002 R08: 0000000000000fff R09: ffffffffb025b2a0 [ +0.000001] R10: 0000000000002ffd R11: ffffffffc218e0d0 R12: 0000000000000000 [ +0.000001] R13: 0000000000000006 R14: ffff8bc3c3d3b000 R15: ffffffffc1fd47c0 [ +0.000001] FS: 00007f025f833740(0000) GS:ffff8bc6febb8000(0000) knlGS:0000000000000000 [ +0.000002] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ +0.000001] CR2: 00007f025f5bf000 CR3: 000000017036a002 CR4: 0000000000f72ef0 [ +0.000001] 
PKRU: 55555554 ``` cfi=fineibt,paranoid,bhi ``` [Aug14 12:27] Linux version 6.17.0-rc1-debug-next-20250814-02528-g3aebcf0edb14 (nathan@ax162) (ClangBuiltLinux clang version 22.0.0git (https://github.com/llvm/llvm-project.git 9f96e3f80f5d25e50ca8423d4eb9063989b85f0b), ClangBuiltLinux LLD 22.0.0 (https://github.com/llvm/llvm-project.git 9f96e3f80f5d25e50ca8423d4eb9063989b85f0b)) #1 SMP PREEMPT_DYNAMIC Thu Aug 14 11:11:44 MST 2025 ... [ +0.000860] SMP alternatives: Using paranoid FineIBT+BHI CFI ... [ +1.406475] lkdtm: Performing direct entry CFI_FORWARD_PROTO [ +0.000005] lkdtm: Calling matched prototype ... [ +0.000001] lkdtm: Calling mismatched prototype ... [ +0.000012] CFI failure at lkdtm_indirect_call+0x1c/0x30 [lkdtm] (target: lkdtm_increment_int+0x0/0x30 [lkdtm]; expected type: 0x0b71a5dc) [ +0.000027] Oops: invalid opcode: 0000 [#1] SMP NOPTI [ +0.000004] CPU: 3 UID: 0 PID: 2100 Comm: cat Tainted: G W 6.17.0-rc1-debug-next-20250814-02528-g3aebcf0edb14 #1 PREEMPT(full) 724b1db7f580ddb34e14b43820ff16d9fd75d36c [ +0.000004] Tainted: [W]=WARN [ +0.000001] Hardware name: AZW MINI S/MINI S, BIOS ADLNV106 05/12/2024 [ +0.000002] RIP: 0010:lkdtm_indirect_call+0x1c/0x30 [lkdtm] [ +0.000008] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 49 89 fb 48 c7 c7 30 89 46 c2 b8 dc a5 71 0b 41 3b 43 f5 2e 4d 8d 5b <f0> 75 fd 41 ff e3 90 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 f3 0f [ +0.000002] RSP: 0018:ffffcf77c4507c68 EFLAGS: 00010283 [ +0.000002] RAX: 000000000b71a5dc RBX: 0000000000000001 RCX: f242256082c3a100 [ +0.000002] RDX: 0000000000000002 RSI: 00000000ffffefff RDI: ffffffffc2468930 [ +0.000002] RBP: 0000000000000002 R08: 0000000000000fff R09: ffffffff9a85b2a0 [ +0.000001] R10: 0000000000002ffd R11: ffffffffc19e40d0 R12: 0000000000000000 [ +0.000001] R13: 0000000000000006 R14: ffff8ed41e1e6000 R15: ffffffffc24687c0 [ +0.000002] FS: 00007fda4f785740(0000) GS:ffff8ed7d4738000(0000) knlGS:0000000000000000 [ +0.000002] CS: 0010 DS: 0000 ES: 0000 CR0: 
0000000080050033 [ +0.000001] CR2: 00007fda4f744000 CR3: 000000016856f005 CR4: 0000000000f72ef0 [ +0.000002] PKRU: 55555554 [ +0.000001] Call Trace: [ +0.000003] <TASK> [ +0.000001] lkdtm_CFI_FORWARD_PROTO+0x39/0x60 [lkdtm 57ec71482c5cb110c2f23373ae7338671243d293] [ +0.000009] lkdtm_do_action+0x26/0x40 [lkdtm 57ec71482c5cb110c2f23373ae7338671243d293] [ +0.000008] direct_entry+0x147/0x160 [lkdtm 57ec71482c5cb110c2f23373ae7338671243d293] [ +0.000009] full_proxy_write+0x9e/0x140 [ +0.000004] vfs_write+0x11b/0x3f0 [ +0.000004] ? __lruvec_stat_mod_folio+0x7c/0xc0 [ +0.000002] ? set_ptes+0x12/0xa0 [ +0.000008] ? do_pte_missing+0xb8c/0xf00 [ +0.000002] ksys_write+0x79/0x100 [ +0.000003] do_syscall_64+0x85/0x920 [ +0.000004] ? do_user_addr_fault+0x271/0x680 [ +0.000002] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ +0.000002] RIP: 0033:0x7fda4f4931ce [ +0.000027] Code: 4d 89 d8 e8 64 be 00 00 4c 8b 5d f8 41 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 11 c9 c3 0f 1f 80 00 00 00 00 48 8b 45 10 0f 05 <c9> c3 83 e2 39 83 fa 08 75 e7 e8 13 ff ff ff 0f 1f 00 f3 0f 1e fa [ +0.000001] RSP: 002b:00007fffe1b7c780 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [ +0.000002] RAX: ffffffffffffffda RBX: 0000000000000012 RCX: 00007fda4f4931ce [ +0.000002] RDX: 0000000000000012 RSI: 00007fda4f744000 RDI: 0000000000000001 [ +0.000001] RBP: 00007fffe1b7c790 R08: 0000000000000000 R09: 0000000000000000 [ +0.000001] R10: 0000000000000000 R11: 0000000000000202 R12: 00007fda4f744000 [ +0.000001] R13: 0000000000000012 R14: 0000000000000000 R15: 0000000000000000 [ +0.000002] </TASK> [ +0.000000] Modules linked in: lkdtm ... 
[ +0.000044] hid_logitech_dj hid_razer nvme nvme_core nvme_keyring spi_intel_pci nvme_auth spi_intel [ +0.000016] ---[ end trace 0000000000000000 ]--- [ +0.000002] RIP: 0010:lkdtm_indirect_call+0x1c/0x30 [lkdtm] [ +0.000009] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 49 89 fb 48 c7 c7 30 89 46 c2 b8 dc a5 71 0b 41 3b 43 f5 2e 4d 8d 5b <f0> 75 fd 41 ff e3 90 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 f3 0f [ +0.000001] RSP: 0018:ffffcf77c4507c68 EFLAGS: 00010283 [ +0.000002] RAX: 000000000b71a5dc RBX: 0000000000000001 RCX: f242256082c3a100 [ +0.000001] RDX: 0000000000000002 RSI: 00000000ffffefff RDI: ffffffffc2468930 [ +0.000001] RBP: 0000000000000002 R08: 0000000000000fff R09: ffffffff9a85b2a0 [ +0.000001] R10: 0000000000002ffd R11: ffffffffc19e40d0 R12: 0000000000000000 [ +0.000001] R13: 0000000000000006 R14: ffff8ed41e1e6000 R15: ffffffffc24687c0 [ +0.000001] FS: 00007fda4f785740(0000) GS:ffff8ed7d4738000(0000) knlGS:0000000000000000 [ +0.000002] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ +0.000001] CR2: 00007fda4f744000 CR3: 000000016856f005 CR4: 0000000000f72ef0 [ +0.000001] PKRU: 55555554 ``` I also tested cfi=fineibt,bhi cfi=fineibt,bhi,warn cfi=fineibt cfi=kcfi which showed no additional errors. Cheers, Nathan
On Thu, Aug 14, 2025 at 01:17:33PM +0200, Peter Zijlstra wrote: > diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c > index 99444409c026..59f8bf022ec5 100644 > --- a/arch/x86/kernel/cet.c > +++ b/arch/x86/kernel/cet.c > @@ -123,7 +123,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) > return; > } > > - pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); > + early_printk("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); > if (!ibt_fatal) { > printk(KERN_DEFAULT CUT_HERE); > __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); Obviously this is a debugging remnant and will go away.
© 2016 - 2025 Red Hat, Inc.