Scott notes that non-taken branches are faster. Abuse overlapping code
that traps instead of explicit UD2 instructions.
And LEA does not modify flags and will have fewer dependencies.
Suggested-by: Scott Constable <scott.d.constable@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/x86/kernel/alternative.c | 58 ++++++++++++++++++++++++++----------------
arch/x86/net/bpf_jit_comp.c | 5 +--
2 files changed, 39 insertions(+), 24 deletions(-)
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1054,9 +1054,9 @@ early_param("cfi", cfi_parse_cmdline);
* __cfi_\func: __cfi_\func:
* movl $0x12345678,%eax // 5 endbr64 // 4
* nop subl $0x12345678,%r10d // 7
- * nop jz 1f // 2
- * nop ud2 // 2
- * nop 1: nop // 1
+ * nop jne __cfi_\func+6 // 2
+ * nop nop3 // 3
+ * nop
* nop
* nop
* nop
@@ -1068,37 +1068,47 @@ early_param("cfi", cfi_parse_cmdline);
*
* caller: caller:
* movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
- * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4
+ * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4
* je 1f // 2 nop4 // 4
* ud2 // 2
- * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5
+ * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6
*
*/
-asm( ".pushsection .rodata \n"
- "fineibt_preamble_start: \n"
- " endbr64 \n"
- " subl $0x12345678, %r10d \n"
- " je fineibt_preamble_end \n"
- "fineibt_preamble_ud2: \n"
- " ud2 \n"
- " nop \n"
- "fineibt_preamble_end: \n"
+/*
+ * <fineibt_preamble_start>:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d
+ * b: 75 f9 jne 6 <fineibt_preamble_start+0x6>
+ * d: 0f 1f 00 nopl (%rax)
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_preamble_start: \n"
+ " endbr64 \n"
+ " subl $0x12345678, %r10d \n"
+ " jne fineibt_preamble_start+6 \n"
+ ASM_NOP3
+ "fineibt_preamble_end: \n"
".popsection\n"
);
extern u8 fineibt_preamble_start[];
-extern u8 fineibt_preamble_ud2[];
extern u8 fineibt_preamble_end[];
#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
-#define fineibt_preamble_ud2 (fineibt_preamble_ud2 - fineibt_preamble_start)
+#define fineibt_preamble_ud 6
#define fineibt_preamble_hash 7
+/*
+ * <fineibt_caller_start>:
+ * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
+ * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11
+ * a: 0f 1f 40 00 nopl 0x0(%rax)
+ */
asm( ".pushsection .rodata \n"
"fineibt_caller_start: \n"
" movl $0x12345678, %r10d \n"
- " sub $16, %r11 \n"
+ " lea -0x10(%r11), %r11 \n"
ASM_NOP4
"fineibt_caller_end: \n"
".popsection \n"
@@ -1429,15 +1439,15 @@ static void poison_cfi(void *addr)
}
/*
- * regs->ip points to a UD2 instruction, return true and fill out target and
- * type when this UD2 is from a FineIBT preamble.
+ * When regs->ip points to a 0xEA byte in the FineIBT preamble,
+ * return true and fill out target and type.
*
* We check the preamble by checking for the ENDBR instruction relative to the
- * UD2 instruction.
+ * 0xEA instruction.
*/
bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
{
- unsigned long addr = regs->ip - fineibt_preamble_ud2;
+ unsigned long addr = regs->ip - fineibt_preamble_ud;
u32 hash;
if (!exact_endbr((void *)addr))
@@ -1448,6 +1458,12 @@ bool decode_fineibt_insn(struct pt_regs
__get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
*type = (u32)regs->r10 + hash;
+ /*
+ * Since regs->ip points to the middle of an instruction; it cannot
+ * continue with the normal fixup.
+ */
+ regs->ip = *target;
+
return true;
Efault:
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -417,9 +417,8 @@ static void emit_fineibt(u8 **pprog, u32
EMIT_ENDBR();
EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */
- EMIT2(0x74, 0x07); /* jz.d8 +7 */
- EMIT2(0x0f, 0x0b); /* ud2 */
- EMIT1(0x90); /* nop */
+ EMIT2(0x75, 0xf9); /* jne.d8 .-7 */
+ EMIT3(0x0f, 0x1f, 0x00); /* nop3 */
EMIT_ENDBR_POISON();
*pprog = prog;
On Wed, Feb 19, 2025 at 05:21:12PM +0100, Peter Zijlstra wrote:
> Scott notes that non-taken branches are faster. Abuse overlapping code
> that traps instead of explicit UD2 instructions.
Some kind of commenting is needed in here to explicitly call out the
embedded EA in the "subl" instruction. There is a tiny hint of it in the
disassembly dump of fineibt_preamble_start, but it's very subtle for
someone trying to understand this fresh.
> And LEA does not modify flags and will have less dependencies.
>
> Suggested-by: Scott Constable <scott.d.constable@intel.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
But it works!
Reviewed-by: Kees Cook <kees@kernel.org>
> ---
> arch/x86/kernel/alternative.c | 58 ++++++++++++++++++++++++++----------------
> arch/x86/net/bpf_jit_comp.c | 5 +--
> 2 files changed, 39 insertions(+), 24 deletions(-)
>
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -1054,9 +1054,9 @@ early_param("cfi", cfi_parse_cmdline);
> * __cfi_\func: __cfi_\func:
> * movl $0x12345678,%eax // 5 endbr64 // 4
> * nop subl $0x12345678,%r10d // 7
> - * nop jz 1f // 2
> - * nop ud2 // 2
> - * nop 1: nop // 1
> + * nop jne __cfi_\func+6 // 2
> + * nop nop3 // 3
> + * nop
> * nop
> * nop
> * nop
> @@ -1068,37 +1068,47 @@ early_param("cfi", cfi_parse_cmdline);
> *
> * caller: caller:
> * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
> - * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4
> + * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4
> * je 1f // 2 nop4 // 4
> * ud2 // 2
> - * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5
> + * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6
> *
> */
>
> -asm( ".pushsection .rodata \n"
> - "fineibt_preamble_start: \n"
> - " endbr64 \n"
> - " subl $0x12345678, %r10d \n"
> - " je fineibt_preamble_end \n"
> - "fineibt_preamble_ud2: \n"
> - " ud2 \n"
> - " nop \n"
> - "fineibt_preamble_end: \n"
> +/*
> + * <fineibt_preamble_start>:
> + * 0: f3 0f 1e fa endbr64
> + * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d
> + * b: 75 f9 jne 6 <fineibt_preamble_start+0x6>
> + * d: 0f 1f 00 nopl (%rax)
> + */
> +asm( ".pushsection .rodata \n"
> + "fineibt_preamble_start: \n"
> + " endbr64 \n"
> + " subl $0x12345678, %r10d \n"
> + " jne fineibt_preamble_start+6 \n"
> + ASM_NOP3
> + "fineibt_preamble_end: \n"
> ".popsection\n"
> );
>
> extern u8 fineibt_preamble_start[];
> -extern u8 fineibt_preamble_ud2[];
> extern u8 fineibt_preamble_end[];
>
> #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
> -#define fineibt_preamble_ud2 (fineibt_preamble_ud2 - fineibt_preamble_start)
> +#define fineibt_preamble_ud 6
> #define fineibt_preamble_hash 7
>
> +/*
> + * <fineibt_caller_start>:
> + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
> + * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11
> + * a: 0f 1f 40 00 nopl 0x0(%rax)
> + */
> asm( ".pushsection .rodata \n"
> "fineibt_caller_start: \n"
> " movl $0x12345678, %r10d \n"
> - " sub $16, %r11 \n"
> + " lea -0x10(%r11), %r11 \n"
> ASM_NOP4
> "fineibt_caller_end: \n"
> ".popsection \n"
> @@ -1429,15 +1439,15 @@ static void poison_cfi(void *addr)
> }
>
> /*
> - * regs->ip points to a UD2 instruction, return true and fill out target and
> - * type when this UD2 is from a FineIBT preamble.
> + * When regs->ip points to a 0xEA byte in the FineIBT preamble,
> + * return true and fill out target and type.
> *
> * We check the preamble by checking for the ENDBR instruction relative to the
> - * UD2 instruction.
> + * 0xEA instruction.
> */
> bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
> {
> - unsigned long addr = regs->ip - fineibt_preamble_ud2;
> + unsigned long addr = regs->ip - fineibt_preamble_ud;
> u32 hash;
>
> if (!exact_endbr((void *)addr))
> @@ -1448,6 +1458,12 @@ bool decode_fineibt_insn(struct pt_regs
> __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
> *type = (u32)regs->r10 + hash;
>
> + /*
> + * Since regs->ip points to the middle of an instruction; it cannot
> + * continue with the normal fixup.
> + */
> + regs->ip = *target;
> +
> return true;
>
> Efault:
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -417,9 +417,8 @@ static void emit_fineibt(u8 **pprog, u32
>
> EMIT_ENDBR();
> EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */
> - EMIT2(0x74, 0x07); /* jz.d8 +7 */
> - EMIT2(0x0f, 0x0b); /* ud2 */
> - EMIT1(0x90); /* nop */
> + EMIT2(0x75, 0xf9); /* jne.d8 .-7 */
> + EMIT3(0x0f, 0x1f, 0x00); /* nop3 */
> EMIT_ENDBR_POISON();
>
> *pprog = prog;
>
>
--
Kees Cook
On Wed, Feb 19, 2025 at 10:01:15AM -0800, Kees Cook wrote:
> On Wed, Feb 19, 2025 at 05:21:12PM +0100, Peter Zijlstra wrote:
> > Scott notes that non-taken branches are faster. Abuse overlapping code
> > that traps instead of explicit UD2 instructions.
>
> Some kind of commenting is needed in here to explicitly call out the
> embedded EA in the "subl" instruction. There is a tiny hint of it in the
> disassembly dump of fineibt_preamble_start, but it's very subtle for
> someone trying to understand this fresh.
Ah, but you found my clue :-)
How's this?
---
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1080,6 +1080,9 @@ early_param("cfi", cfi_parse_cmdline);
* 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d
* b: 75 f9 jne 6 <fineibt_preamble_start+0x6>
* d: 0f 1f 00 nopl (%rax)
+ *
+ * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as
+ * (bad) on x86_64 and raises #UD.
*/
asm( ".pushsection .rodata \n"
"fineibt_preamble_start: \n"
On Wed, Feb 19, 2025 at 07:18:33PM +0100, Peter Zijlstra wrote:
> On Wed, Feb 19, 2025 at 10:01:15AM -0800, Kees Cook wrote:
> > On Wed, Feb 19, 2025 at 05:21:12PM +0100, Peter Zijlstra wrote:
> > > Scott notes that non-taken branches are faster. Abuse overlapping code
> > > that traps instead of explicit UD2 instructions.
> >
> > Some kind of commenting is needed in here to explicitly call out the
> > embedded EA in the "subl" instruction. There is a tiny hint of it in the
> > disassembly dump of fineibt_preamble_start, but it's very subtle for
> > someone trying to understand this fresh.
>
> Ah, but you found my clue :-)
>
> How's this?
>
> ---
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -1080,6 +1080,9 @@ early_param("cfi", cfi_parse_cmdline);
> * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d
> * b: 75 f9 jne 6 <fineibt_preamble_start+0x6>
> * d: 0f 1f 00 nopl (%rax)
> + *
> + * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as
> + * (bad) on x86_64 and raises #UD.
> */
> asm( ".pushsection .rodata \n"
> "fineibt_preamble_start: \n"
Better! Thank you. :)
--
Kees Cook
On 19/02/2025 4:21 pm, Peter Zijlstra wrote: > Scott notes that non-taken branches are faster. Abuse overlapping code > that traps instead of explicit UD2 instructions. > > And LEA does not modify flags and will have less dependencies. > > Suggested-by: Scott Constable <scott.d.constable@intel.com> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Can we get a bit more info on this "non-taken branches are faster" ? For modern cores which have branch prediction pre-decode, a branch unknown to the predictor will behave as non-taken until the Jcc executes[1]. Something the size of Linux is surely going to exceed the branch predictor capacity, so it's perhaps fair to say that there's a reasonable chance to miss in the predictor. But, for a branch known to the predictor, taken branches ought to be bubble-less these days. At least, this is what the marketing material claims. And, this doesn't account for branches which alias in the predictor and end up with a wrong prediction. ~Andrew [1] Yes, I know RWC has the reintroduced 0xee prefix with the decode resteer.
Hi Andrew, I can elaborate, if only "a bit." Your intuition about branches is pretty accurate, and the difference between taken vs. not-taken should, on average, be marginal. I can quote from Intel's software optimization manual: "Conditional branches that are never taken do not consume BTB resources." Additionally, there are some more subtle reasons that not-taken branches can be preferable--these vary by microarchitecture. Regards, Scott Constable -----Original Message----- From: Andrew Cooper <andrew.cooper3@citrix.com> Sent: Wednesday, February 19, 2025 9:15 AM To: Peter Zijlstra <peterz@infradead.org>; x86@kernel.org Cc: linux-kernel@vger.kernel.org; Milburn, Alyssa <alyssa.milburn@intel.com>; Constable, Scott D <scott.d.constable@intel.com>; joao@overdrivepizza.com; jpoimboe@kernel.org; jose.marchesi@oracle.com; hjl.tools@gmail.com; ndesaulniers@google.com; samitolvanen@google.com; nathan@kernel.org; ojeda@kernel.org; kees@kernel.org; alexei.starovoitov@gmail.com; mhiramat@kernel.org; jmill@asu.edu Subject: Re: [PATCH v3 05/10] x86/ibt: Optimize FineIBT sequence On 19/02/2025 4:21 pm, Peter Zijlstra wrote: > Scott notes that non-taken branches are faster. Abuse overlapping code > that traps instead of explicit UD2 instructions. > > And LEA does not modify flags and will have less dependencies. > > Suggested-by: Scott Constable <scott.d.constable@intel.com> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Can we get a bit more info on this "non-taken branches are faster" ? For modern cores which have branch prediction pre-decode, a branch unknown to the predictor will behave as non-taken until the Jcc executes[1]. Something size of Linux is surely going to exceed the branch predictor capacity, so it's perhaps fair to say that there's a reasonable chance to miss in the predictor. But, for a branch known to the predictor, taken branches ought to be bubble-less these days. At least, this is what the marketing material claims. 
And, this doesn't account for branches which alias in the predictor and end up with a wrong prediction. ~Andrew [1] Yes, I know RWC has the reintroduced 0xee prefix with the decode resteer.
© 2016 - 2025 Red Hat, Inc.