[PATCH v3 05/10] x86/ibt: Optimize FineIBT sequence

Posted by Peter Zijlstra 10 months ago
Scott notes that non-taken branches are faster. Abuse overlapping code
that traps instead of explicit UD2 instructions.

And LEA does not modify flags and has fewer dependencies.
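
To illustrate the overlap (a sketch; the bytes match the
fineibt_preamble_start dump below): on a hash mismatch the JNE jumps
backwards into the middle of the SUB, landing on its 0xEA ModRM byte.
0xEA is an undefined opcode in 64-bit mode, so execution raises #UD
right there and no explicit UD2 is needed:

   0:  f3 0f 1e fa              endbr64
   4:  41 81 ea 78 56 34 12     subl   $0x12345678, %r10d
   b:  75 f9                    jne    0x6     # 0xd + (-7) = 0x6
   d:  0f 1f 00                 nopl   (%rax)

   6:  ea                       (bad)          # raises #UD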

Suggested-by: Scott Constable <scott.d.constable@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/alternative.c |   58 ++++++++++++++++++++++++++----------------
 arch/x86/net/bpf_jit_comp.c   |    5 +--
 2 files changed, 39 insertions(+), 24 deletions(-)

--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1054,9 +1054,9 @@ early_param("cfi", cfi_parse_cmdline);
  * __cfi_\func:					__cfi_\func:
  *	movl   $0x12345678,%eax		// 5	     endbr64			// 4
  *	nop					     subl   $0x12345678,%r10d   // 7
- *	nop					     jz     1f			// 2
- *	nop					     ud2			// 2
- *	nop					1:   nop			// 1
+ *	nop					     jne    __cfi_\func+6	// 2
+ *	nop					     nop3			// 3
+ *	nop
  *	nop
  *	nop
  *	nop
@@ -1068,37 +1068,47 @@ early_param("cfi", cfi_parse_cmdline);
  *
  * caller:					caller:
  *	movl	$(-0x12345678),%r10d	 // 6	     movl   $0x12345678,%r10d	// 6
- *	addl	$-15(%r11),%r10d	 // 4	     sub    $16,%r11		// 4
+ *	addl	$-15(%r11),%r10d	 // 4	     lea    -0x10(%r11),%r11	// 4
  *	je	1f			 // 2	     nop4			// 4
  *	ud2				 // 2
- * 1:	call	__x86_indirect_thunk_r11 // 5	     call   *%r11; nop2;	// 5
+ * 1:	cs call	__x86_indirect_thunk_r11 // 6	     call   *%r11; nop3;	// 6
  *
  */
 
-asm(	".pushsection .rodata			\n"
-	"fineibt_preamble_start:		\n"
-	"	endbr64				\n"
-	"	subl	$0x12345678, %r10d	\n"
-	"	je	fineibt_preamble_end	\n"
-	"fineibt_preamble_ud2:			\n"
-	"	ud2				\n"
-	"	nop				\n"
-	"fineibt_preamble_end:			\n"
+/*
+ * <fineibt_preamble_start>:
+ *  0:   f3 0f 1e fa             endbr64
+ *  4:   41 81 <ea> 78 56 34 12  sub    $0x12345678, %r10d
+ *  b:   75 f9                   jne    6 <fineibt_preamble_start+0x6>
+ *  d:   0f 1f 00                nopl   (%rax)
+ */
+asm(	".pushsection .rodata				\n"
+	"fineibt_preamble_start:			\n"
+	"	endbr64					\n"
+	"	subl	$0x12345678, %r10d		\n"
+	"	jne	fineibt_preamble_start+6	\n"
+	ASM_NOP3
+	"fineibt_preamble_end:				\n"
 	".popsection\n"
 );
 
 extern u8 fineibt_preamble_start[];
-extern u8 fineibt_preamble_ud2[];
 extern u8 fineibt_preamble_end[];
 
 #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
-#define fineibt_preamble_ud2  (fineibt_preamble_ud2 - fineibt_preamble_start)
+#define fineibt_preamble_ud   6
 #define fineibt_preamble_hash 7
 
+/*
+ * <fineibt_caller_start>:
+ *  0:   41 ba 78 56 34 12       mov    $0x12345678, %r10d
+ *  6:   4d 8d 5b f0             lea    -0x10(%r11), %r11
+ *  a:   0f 1f 40 00             nopl   0x0(%rax)
+ */
 asm(	".pushsection .rodata			\n"
 	"fineibt_caller_start:			\n"
 	"	movl	$0x12345678, %r10d	\n"
-	"	sub	$16, %r11		\n"
+	"	lea	-0x10(%r11), %r11	\n"
 	ASM_NOP4
 	"fineibt_caller_end:			\n"
 	".popsection				\n"
@@ -1429,15 +1439,15 @@ static void poison_cfi(void *addr)
 }
 
 /*
- * regs->ip points to a UD2 instruction, return true and fill out target and
- * type when this UD2 is from a FineIBT preamble.
+ * When regs->ip points to a 0xEA byte in the FineIBT preamble,
+ * return true and fill out target and type.
  *
  * We check the preamble by checking for the ENDBR instruction relative to the
- * UD2 instruction.
+ * 0xEA instruction.
  */
 bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
 {
-	unsigned long addr = regs->ip - fineibt_preamble_ud2;
+	unsigned long addr = regs->ip - fineibt_preamble_ud;
 	u32 hash;
 
 	if (!exact_endbr((void *)addr))
@@ -1448,6 +1458,12 @@ bool decode_fineibt_insn(struct pt_regs
 	__get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
 	*type = (u32)regs->r10 + hash;
 
+	/*
+	 * Since regs->ip points to the middle of an instruction, it cannot
+	 * continue with the normal fixup.
+	 */
+	regs->ip = *target;
+
 	return true;
 
 Efault:
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -417,9 +417,8 @@ static void emit_fineibt(u8 **pprog, u32
 
 	EMIT_ENDBR();
 	EMIT3_off32(0x41, 0x81, 0xea, hash);		/* subl $hash, %r10d	*/
-	EMIT2(0x74, 0x07);				/* jz.d8 +7		*/
-	EMIT2(0x0f, 0x0b);				/* ud2			*/
-	EMIT1(0x90);					/* nop			*/
+	EMIT2(0x75, 0xf9);				/* jne.d8 .-7		*/
+	EMIT3(0x0f, 0x1f, 0x00);			/* nop3			*/
 	EMIT_ENDBR_POISON();
 
 	*pprog = prog;
Re: [PATCH v3 05/10] x86/ibt: Optimize FineIBT sequence
Posted by Kees Cook 10 months ago
On Wed, Feb 19, 2025 at 05:21:12PM +0100, Peter Zijlstra wrote:
> Scott notes that non-taken branches are faster. Abuse overlapping code
> that traps instead of explicit UD2 instructions.

Some kind of commenting is needed in here to explicitly call out the
embedded EA in the "subl" instruction. There is a tiny hint of it in the
disassembly dump of fineibt_preamble_start, but it's very subtle for
someone trying to understand this fresh.

> And LEA does not modify flags and has fewer dependencies.
> 
> Suggested-by: Scott Constable <scott.d.constable@intel.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

But it works!

Reviewed-by: Kees Cook <kees@kernel.org>

> [...]

-- 
Kees Cook
Re: [PATCH v3 05/10] x86/ibt: Optimize FineIBT sequence
Posted by Peter Zijlstra 10 months ago
On Wed, Feb 19, 2025 at 10:01:15AM -0800, Kees Cook wrote:
> On Wed, Feb 19, 2025 at 05:21:12PM +0100, Peter Zijlstra wrote:
> > Scott notes that non-taken branches are faster. Abuse overlapping code
> > that traps instead of explicit UD2 instructions.
> 
> Some kind of commenting is needed in here to explicitly call out the
> embedded EA in the "subl" instruction. There is a tiny hint of it in the
> disassembly dump of fineibt_preamble_start, but it's very subtle for
> someone trying to understand this fresh.

Ah, but you found my clue :-)

How's this?

---
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1080,6 +1080,9 @@ early_param("cfi", cfi_parse_cmdline);
  *  4:   41 81 <ea> 78 56 34 12  sub    $0x12345678, %r10d
  *  b:   75 f9                   jne    6 <fineibt_preamble_start+0x6>
  *  d:   0f 1f 00                nopl   (%rax)
+ *
+ * Note that the JNE target is the 0xEA byte inside the SUB; this decodes as
+ * (bad) on x86_64 and raises #UD.
  */
 asm(	".pushsection .rodata				\n"
 	"fineibt_preamble_start:			\n"
Re: [PATCH v3 05/10] x86/ibt: Optimize FineIBT sequence
Posted by Kees Cook 10 months ago
On Wed, Feb 19, 2025 at 07:18:33PM +0100, Peter Zijlstra wrote:
> On Wed, Feb 19, 2025 at 10:01:15AM -0800, Kees Cook wrote:
> > On Wed, Feb 19, 2025 at 05:21:12PM +0100, Peter Zijlstra wrote:
> > > Scott notes that non-taken branches are faster. Abuse overlapping code
> > > that traps instead of explicit UD2 instructions.
> > 
> > Some kind of commenting is needed in here to explicitly call out the
> > embedded EA in the "subl" instruction. There is a tiny hint of it in the
> > disassembly dump of fineibt_preamble_start, but it's very subtle for
> > someone trying to understand this fresh.
> 
> Ah, but you found my clue :-)
> 
> How's this?
> 
> ---
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -1080,6 +1080,9 @@ early_param("cfi", cfi_parse_cmdline);
>   *  4:   41 81 <ea> 78 56 34 12  sub    $0x12345678, %r10d
>   *  b:   75 f9                   jne    6 <fineibt_preamble_start+0x6>
>   *  d:   0f 1f 00                nopl   (%rax)
> + *
> + * Note that the JNE target is the 0xEA byte inside the SUB; this decodes as
> + * (bad) on x86_64 and raises #UD.
>   */
>  asm(	".pushsection .rodata				\n"
>  	"fineibt_preamble_start:			\n"

Better! Thank you. :)

-- 
Kees Cook
Re: [PATCH v3 05/10] x86/ibt: Optimize FineIBT sequence
Posted by Andrew Cooper 10 months ago
On 19/02/2025 4:21 pm, Peter Zijlstra wrote:
> Scott notes that non-taken branches are faster. Abuse overlapping code
> that traps instead of explicit UD2 instructions.
>
> And LEA does not modify flags and has fewer dependencies.
>
> Suggested-by: Scott Constable <scott.d.constable@intel.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Can we get a bit more info on this "non-taken branches are faster"?

For modern cores which have branch prediction pre-decode, a branch
unknown to the predictor will behave as non-taken until the Jcc executes[1].

Something the size of Linux is surely going to exceed the branch predictor
capacity, so it's perhaps fair to say that there's a reasonable chance
to miss in the predictor.

But, for a branch known to the predictor, taken branches ought to be
bubble-less these days.  At least, this is what the marketing material
claims.

And, this doesn't account for branches which alias in the predictor and
end up with a wrong prediction.

~Andrew

[1] Yes, I know RWC has the reintroduced 0xee prefix with the decode
resteer.
RE: [PATCH v3 05/10] x86/ibt: Optimize FineIBT sequence
Posted by Constable, Scott D 10 months ago
Hi Andrew,

I can elaborate, if only "a bit." Your intuition about branches is pretty accurate, and the difference between taken vs. not-taken should, on average, be marginal. I can quote from Intel's software optimization manual: "Conditional branches that are never taken do not consume BTB resources." Additionally, there are some more subtle reasons that not-taken branches can be preferable--these vary by microarchitecture.
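
If anyone wants to poke at the difference, below is a rough userspace
sketch (illustrative only, not from the optimization manual; results
are heavily core- and alignment-dependent) that times a perfectly
predicted branch as taken vs. not-taken. Both arms of the JZ land on
the same label, so only the taken-ness differs:

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>	/* __rdtsc() */

#define ITERS 100000000L

static uint64_t time_branch(long take)
{
	uint64_t t0 = __rdtsc();
	long i;

	for (i = 0; i < ITERS; i++) {
		/*
		 * 'take' is loop-invariant, so the branch predicts
		 * perfectly either way; take != 0 leaves ZF clear and
		 * the JZ falls through, take == 0 makes it taken.
		 */
		asm volatile("test %0, %0\n\t"
			     "jz 1f\n"
			     "1:\n\t"
			     : : "r" (take));
	}
	return __rdtsc() - t0;
}

int main(void)
{
	printf("not-taken: %llu cycles\n",
	       (unsigned long long)time_branch(1));
	printf("taken:     %llu cycles\n",
	       (unsigned long long)time_branch(0));
	return 0;
}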

Regards,

Scott Constable
