[RFC][PATCH] x86,nospec: Simplify {JMP,CALL}_NOSPEC

Peter Zijlstra posted 1 patch 3 years, 9 months ago
There is a newer version of this series
arch/x86/include/asm/nospec-branch.h | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)
[RFC][PATCH] x86,nospec: Simplify {JMP,CALL}_NOSPEC
Posted by Peter Zijlstra 3 years, 9 months ago
On Tue, Jul 19, 2022 at 03:19:26PM +0200, Maciej S. Szmigiero wrote:
> On 4.12.2021 14:43, Peter Zijlstra wrote:
> > Make use of an upcoming GCC feature to mitigate
> > straight-line-speculation for x86:
> > 
> >    https://gcc.gnu.org/g:53a643f8568067d7700a9f2facc8ba39974973d3
> >    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102952
> >    https://bugs.llvm.org/show_bug.cgi?id=52323
> > 
> > It's built tested on x86_64-allyesconfig using GCC-12 and GCC-11.
> > 
> > Maintenance overhead of this should be fairly low due to objtool
> > validation.
> > 
> > Size overhead of all these additional int3 instructions comes to:
> > 
> >     text	   data	    bss	    dec	    hex	filename
> > 22267751	6933356	2011368	31212475	1dc43bb	defconfig-build/vmlinux
> > 22804126	6933356	1470696	31208178	1dc32f2	defconfig-build/vmlinux.sls
> > 
> > Or roughly 2.4% additional text.
> > 
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> (..)
> > --- a/arch/x86/lib/retpoline.S
> > +++ b/arch/x86/lib/retpoline.S
> > @@ -34,7 +34,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\re
> >   	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
> >   		      __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
> > -		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
> > +		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_AMD
> >   .endm
> 
> Looking at this __x86_indirect_thunk_* change makes me wonder why there is
> no similar int3 SLS protection in the X86_FEATURE_RETPOLINE_LFENCE case of
> JMP_NOSPEC in arch/x86/include/asm/nospec-branch.h:
> > .macro JMP_NOSPEC reg:req
> > #ifdef CONFIG_RETPOLINE
> > 	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
> > 		      __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
> > 		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
> > #else
> 
> JMP_NOSPEC users seem to have no explicit trailing int3 instructions
> either.
> 
> Or am I missing something here?

Probably just forgot about those. I'm thinking we ought to do something
like this...

---
Subject: x86,nospec: Simplify {JMP,CALL}_NOSPEC

Have {JMP,CALL}_NOSPEC generate the same code GCC does for indirect
calls and rely on the objtool retpoline patching infrastructure.

There's no reason these should be alternatives while the vast bulk of
compiler generated retpolines are not.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/nospec-branch.h | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 10a3bfc1eb23..7bb319d2932c 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -93,6 +93,19 @@
 #endif
 .endm
 
+/*
+ * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call
+ * to the retpoline thunk with a CS prefix when the register requires
+ * a REX prefix byte to encode. Also see apply_alternatives().
+ */
+.macro __CS_PREFIX reg:req
+	.irp rs,r8,r9,r10,r11,r12,r13,r14,r15
+	.ifc \reg,\rs
+	.byte 0x2e
+	.endif
+	.endr
+.endm
+
 /*
  * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
  * indirect jmp/call which may be susceptible to the Spectre variant 2
@@ -100,19 +113,18 @@
  */
 .macro JMP_NOSPEC reg:req
 #ifdef CONFIG_RETPOLINE
-	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
-		      __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
-		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
+	__CS_PREFIX \reg
+	jmp	__x86_indirect_thunk_\reg
 #else
 	jmp	*%\reg
+	int3
 #endif
 .endm
 
 .macro CALL_NOSPEC reg:req
 #ifdef CONFIG_RETPOLINE
-	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \
-		      __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
-		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_LFENCE
+	__CS_PREFIX \reg
+	call	__x86_indirect_thunk_\reg
 #else
 	call	*%\reg
 #endif
Re: [RFC][PATCH] x86,nospec: Simplify {JMP,CALL}_NOSPEC
Posted by Peter Zijlstra 3 years, 9 months ago
On Tue, Jul 19, 2022 at 11:23:07PM +0200, Peter Zijlstra wrote:
> Subject: x86,nospec: Simplify {JMP,CALL}_NOSPEC
> 
> Have {JMP,CALL}_NOSPEC generate the same code GCC does for indirect
> calls and rely on the objtool retpoline patching infrastructure.
> 
> There's no reason these should be alternatives while the vast bulk of
> compiler generated retpolines are not.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/include/asm/nospec-branch.h | 24 ++++++++++++++++++------
>  1 file changed, 18 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> index 10a3bfc1eb23..7bb319d2932c 100644
> --- a/arch/x86/include/asm/nospec-branch.h
> +++ b/arch/x86/include/asm/nospec-branch.h
> @@ -93,6 +93,19 @@
>  #endif
>  .endm
>  
> +/*
> + * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call
> + * to the retpoline thunk with a CS prefix when the register requires
> + * a REX prefix byte to encode. Also see apply_alternatives().

Obviously I meant: apply_retpolines() ...

> + */
> +.macro __CS_PREFIX reg:req
> +	.irp rs,r8,r9,r10,r11,r12,r13,r14,r15
> +	.ifc \reg,\rs
> +	.byte 0x2e
> +	.endif
> +	.endr
> +.endm
> +
>  /*
>   * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
>   * indirect jmp/call which may be susceptible to the Spectre variant 2
> @@ -100,19 +113,18 @@
>   */
>  .macro JMP_NOSPEC reg:req
>  #ifdef CONFIG_RETPOLINE
> -	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
> -		      __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
> -		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
> +	__CS_PREFIX \reg
> +	jmp	__x86_indirect_thunk_\reg
>  #else
>  	jmp	*%\reg
> +	int3
>  #endif
>  .endm
>  
>  .macro CALL_NOSPEC reg:req
>  #ifdef CONFIG_RETPOLINE
> -	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \
> -		      __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
> -		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_LFENCE
> +	__CS_PREFIX \reg
> +	call	__x86_indirect_thunk_\reg
>  #else
>  	call	*%\reg
>  #endif
Re: [RFC][PATCH] x86,nospec: Simplify {JMP,CALL}_NOSPEC
Posted by Maciej S. Szmigiero 3 years, 9 months ago
On 19.07.2022 23:33, Peter Zijlstra wrote:
> On Tue, Jul 19, 2022 at 11:23:07PM +0200, Peter Zijlstra wrote:
>> Subject: x86,nospec: Simplify {JMP,CALL}_NOSPEC
>>
>> Have {JMP,CALL}_NOSPEC generate the same code GCC does for indirect
>> calls and rely on the objtool retpoline patching infrastructure.
>>
>> There's no reason these should be alternatives while the vast bulk of
>> compiler generated retpolines are not.
>>
>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> ---
>>   arch/x86/include/asm/nospec-branch.h | 24 ++++++++++++++++++------
>>   1 file changed, 18 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
>> index 10a3bfc1eb23..7bb319d2932c 100644
>> --- a/arch/x86/include/asm/nospec-branch.h
>> +++ b/arch/x86/include/asm/nospec-branch.h
>> @@ -93,6 +93,19 @@
>>   #endif
>>   .endm
>>   
>> +/*
>> + * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call
>> + * to the retpoline thunk with a CS prefix when the register requires
>> + * a REX prefix byte to encode. Also see apply_alternatives().
> 
> Obviously I meant: apply_retpolines() ...

Will apply_retpolines() actually patch in that trailing int3 in
the X86_FEATURE_RETPOLINE_LFENCE case?

Looking at its code it uses just ordinary NOPs as fill:
> 	/*
> 	 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
> 	 */
> 	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
> 		bytes[i++] = 0x0f;
> 		bytes[i++] = 0xae;
> 		bytes[i++] = 0xe8; /* LFENCE */
> 	}
> 
> 	ret = emit_indirect(op, reg, bytes + i);
> 	if (ret < 0)
> 		return ret;
> 	i += ret;
> 
> 	for (; i < insn->length;)
> 		bytes[i++] = BYTES_NOP1;

BYTES_NOP1 is 0x90.

>> + */
>> +.macro __CS_PREFIX reg:req
>> +	.irp rs,r8,r9,r10,r11,r12,r13,r14,r15
>> +	.ifc \reg,\rs
>> +	.byte 0x2e
>> +	.endif
>> +	.endr
>> +.endm
>> +
>>   /*
>>    * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
>>    * indirect jmp/call which may be susceptible to the Spectre variant 2
>> @@ -100,19 +113,18 @@
>>    */
>>   .macro JMP_NOSPEC reg:req
>>   #ifdef CONFIG_RETPOLINE
>> -	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
>> -		      __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
>> -		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
>> +	__CS_PREFIX \reg
>> +	jmp	__x86_indirect_thunk_\reg
>>   #else
>>   	jmp	*%\reg
>> +	int3
>>   #endif

Perhaps that int3 should be here to be common to both
"#ifdef CONFIG_RETPOLINE" branches?

>>   .endm
>>   

Thanks,
Maciej
Re: [RFC][PATCH] x86,nospec: Simplify {JMP,CALL}_NOSPEC
Posted by Peter Zijlstra 3 years, 9 months ago
On Wed, Jul 20, 2022 at 02:01:39AM +0200, Maciej S. Szmigiero wrote:
> > Obviously I meant: apply_retpolines() ...
> 
> Will apply_retpolines() actually patch in that trailing int3 in
> the X86_FEATURE_RETPOLINE_LFENCE case?
> 
> Looking at its code it uses just ordinary NOPs as fill:
> > 	/*
> > 	 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
> > 	 */
> > 	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
> > 		bytes[i++] = 0x0f;
> > 		bytes[i++] = 0xae;
> > 		bytes[i++] = 0xe8; /* LFENCE */
> > 	}
> > 
> > 	ret = emit_indirect(op, reg, bytes + i);
> > 	if (ret < 0)
> > 		return ret;
> > 	i += ret;
> > 
> > 	for (; i < insn->length;)
> > 		bytes[i++] = BYTES_NOP1;

There is no space for int3 in that case. You get 3 bytes for LFENCE and
{2,3} bytes for 'jmp *%reg', which fully consumes the {5,6} bytes
available.

There will be no nops added.

But this is what all regular retpolines get to look like.

The plan was -- and that's still pending -- to get the INT3 from the AMD BTC
mitigation, which adds INT3 after regular JMPs, but those compiler patches
still need to happen, I think.