[PATCH RFC 4/4] x86/srso: Use CALL-based return thunks to reduce overhead

Andrew Cooper posted 4 patches 2 years, 3 months ago
Posted by Andrew Cooper 2 years, 3 months ago
The SRSO safety depends on having a CALL to an {ADD,LEA}/RET sequence which
has been made safe in the BTB.  Specifically, there must be no perturbation
of the RAS between a correctly predicted CALL and the subsequent RET.

Use the new infrastructure to CALL to a return thunk.  Remove the
srso_fam1?_safe_ret() symbols and point straight at srso_fam1?_return_thunk()
instead.

This removes one taken branch from every function return, which will reduce
the overhead of the mitigation.  It also removes one of three moving pieces
from the SRSO mess.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org
CC: Borislav Petkov <bp@alien8.de>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Josh Poimboeuf <jpoimboe@kernel.org>
CC: Babu Moger <babu.moger@amd.com>
CC: David.Kaplan@amd.com
CC: Nikolay Borisov <nik.borisov@suse.com>
CC: gregkh@linuxfoundation.org
CC: Thomas Gleixner <tglx@linutronix.de>

RFC:

  vmlinux.o: warning: objtool: srso_fam17_return_thunk(): can't find starting instruction

Any objtool whisperers know what's going on, and particularly why
srso_fam19_return_thunk() appears to be happy?

Also, depends on the resolution of the RFC in the previous patch.
---
 arch/x86/kernel/cpu/bugs.c    |  4 ++-
 arch/x86/kernel/vmlinux.lds.S |  6 ++---
 arch/x86/lib/retpoline.S      | 47 ++++++++++++++---------------------
 3 files changed, 25 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index de2f84aa526f..c4d580b485a7 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -2458,8 +2458,10 @@ static void __init srso_select_mitigation(void)
 		if (IS_ENABLED(CONFIG_CPU_SRSO)) {
 			/*
 			 * Enable the return thunk for generated code
-			 * like ftrace, static_call, etc.
+			 * like ftrace, static_call, etc.  These
+			 * ret-thunks need to call to their target.
 			 */
+			x86_return_thunk_use_call = true;
 			setup_force_cpu_cap(X86_FEATURE_RETHUNK);
 			setup_force_cpu_cap(X86_FEATURE_UNRET);
 
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 127ccdbf6d95..ed7d4020c2b4 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -522,7 +522,7 @@ INIT_PER_CPU(irq_stack_backing_store);
 
 #ifdef CONFIG_RETHUNK
 . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
-. = ASSERT((srso_fam17_safe_ret & 0x3f) == 0, "srso_fam17_safe_ret not cacheline-aligned");
+. = ASSERT((srso_fam17_return_thunk & 0x3f) == 0, "srso_fam17_return_thunk not cacheline-aligned");
 #endif
 
 #ifdef CONFIG_CPU_SRSO
@@ -536,8 +536,8 @@ INIT_PER_CPU(irq_stack_backing_store);
  * Instead do: (A | B) - (A & B) in order to compute the XOR
  * of the two function addresses:
  */
-. = ASSERT(((ABSOLUTE(srso_fam19_untrain_ret) | srso_fam19_safe_ret) -
-		(ABSOLUTE(srso_fam19_untrain_ret) & srso_fam19_safe_ret)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)),
+. = ASSERT(((ABSOLUTE(srso_fam19_untrain_ret) | srso_fam19_return_thunk) -
+		(ABSOLUTE(srso_fam19_untrain_ret) & srso_fam19_return_thunk)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)),
 		"SRSO function pair won't alias");
 #endif
 
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index d8732ae21122..2b1c92632158 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -133,11 +133,11 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
 #ifdef CONFIG_RETHUNK
 
 /*
- * srso_fam19_untrain_ret() and srso_fam19_safe_ret() are placed at
+ * srso_fam19_untrain_ret() and srso_fam19_return_thunk() are placed at
  * special addresses:
  *
  * - srso_fam19_untrain_ret() is 2M aligned
- * - srso_fam19_safe_ret() is also in the same 2M page but bits 2, 8, 14
+ * - srso_fam19_return_thunk() is also in the same 2M page but bits 2, 8, 14
  * and 20 in its virtual address are set (while those bits in the
  * srso_fam19_untrain_ret() function are cleared).
  *
@@ -145,7 +145,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
  * target buffer of Zen3/4 generations, leading to any potential
  * poisoned entries at that BTB slot to get evicted.
  *
- * As a result, srso_fam19_safe_ret() becomes a safe return.
+ * As a result, srso_fam19_return_thunk() becomes a safe return.
  */
 #ifdef CONFIG_CPU_SRSO
 	.section .text..__x86.rethunk_untrain
@@ -155,7 +155,8 @@ SYM_START(srso_fam19_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
 	ANNOTATE_NOENDBR
 	ASM_NOP2
 	lfence
-	jmp srso_fam19_return_thunk
+	call srso_fam19_return_thunk
+	ud2
 SYM_FUNC_END(srso_fam19_untrain_ret)
 __EXPORT_THUNK(srso_fam19_untrain_ret)
 
@@ -169,23 +170,17 @@ SYM_START(srso_fam19_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
 SYM_FUNC_END(srso_fam19_untrain_ret)
 #endif
 
-SYM_START(srso_fam19_safe_ret, SYM_L_GLOBAL, SYM_A_NONE)
-	lea 8(%_ASM_SP), %_ASM_SP
+SYM_START(srso_fam19_return_thunk, SYM_L_GLOBAL, SYM_A_NONE)
 	UNWIND_HINT_FUNC
+	ANNOTATE_NOENDBR
+	lea 8(%_ASM_SP), %_ASM_SP
 	ANNOTATE_UNRET_SAFE
 	ret
 	int3
-SYM_FUNC_END(srso_fam19_safe_ret)
+SYM_FUNC_END(srso_fam19_return_thunk)
 
 	.section .text..__x86.return_thunk
 
-SYM_CODE_START(srso_fam19_return_thunk)
-	UNWIND_HINT_FUNC
-	ANNOTATE_NOENDBR
-	call srso_fam19_safe_ret
-	ud2
-SYM_CODE_END(srso_fam19_return_thunk)
-
 /*
  * Some generic notes on the untraining sequences:
  *
@@ -194,13 +189,13 @@ SYM_CODE_END(srso_fam19_return_thunk)
  *
  * The SRSO Zen1/2 (MOVABS) untraining sequence is longer than the
  * Retbleed sequence because the return sequence done there
- * (srso_fam17_safe_ret()) is longer and the return sequence must fully nest
+ * (srso_fam17_return_thunk()) is longer and the return sequence must fully nest
  * (end before) the untraining sequence. Therefore, the untraining
  * sequence must fully overlap the return sequence.
  *
  * Regarding alignment - the instructions which need to be untrained,
  * must all start at a cacheline boundary for Zen1/2 generations. That
- * is, instruction sequences starting at srso_fam17_safe_ret() and
+ * is, instruction sequences starting at srso_fam17_return_thunk() and
  * the respective instruction sequences at retbleed_return_thunk()
  * must start at a cacheline boundary.
  */
@@ -272,12 +267,12 @@ __EXPORT_THUNK(retbleed_untrain_ret)
  *
  * movabs $0xccccc30824648d48,%rax
  *
- * and when the return thunk executes the inner label srso_fam17_safe_ret()
+ * and when the return thunk executes the inner label srso_fam17_return_thunk()
  * later, it is a stack manipulation and a RET which is mispredicted and
  * thus a "safe" one to use.
  */
 	.align 64
-	.skip 64 - (srso_fam17_safe_ret - srso_fam17_untrain_ret), 0xcc
+	.skip 64 - (srso_fam17_return_thunk - srso_fam17_untrain_ret), 0xcc
 SYM_START(srso_fam17_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
 	ANNOTATE_NOENDBR
 	.byte 0x48, 0xb8
@@ -288,26 +283,22 @@ SYM_START(srso_fam17_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
  * and execution will continue at the return site read from the top of
  * the stack.
  */
-SYM_INNER_LABEL(srso_fam17_safe_ret, SYM_L_GLOBAL)
+SYM_INNER_LABEL(srso_fam17_return_thunk, SYM_L_GLOBAL)
+	UNWIND_HINT_FUNC
+	ANNOTATE_NOENDBR
 	lea 8(%_ASM_SP), %_ASM_SP
+	ANNOTATE_UNRET_SAFE
 	ret
 	int3
 	int3
 	/* end of movabs */
 	lfence
-	call srso_fam17_safe_ret
+	call srso_fam17_return_thunk
 	ud2
-SYM_CODE_END(srso_fam17_safe_ret)
+SYM_CODE_END(srso_fam17_return_thunk)
 SYM_FUNC_END(srso_fam17_untrain_ret)
 __EXPORT_THUNK(srso_fam17_untrain_ret)
 
-SYM_CODE_START(srso_fam17_return_thunk)
-	UNWIND_HINT_FUNC
-	ANNOTATE_NOENDBR
-	call srso_fam17_safe_ret
-	ud2
-SYM_CODE_END(srso_fam17_return_thunk)
-
 SYM_FUNC_START(entry_untrain_ret)
 	ALTERNATIVE_2 "jmp retbleed_untrain_ret", \
 		      "jmp srso_fam17_untrain_ret", X86_FEATURE_SRSO, \
-- 
2.30.2
Re: [PATCH RFC 4/4] x86/srso: Use CALL-based return thunks to reduce overhead
Posted by Peter Zijlstra 2 years, 3 months ago
On Mon, Aug 21, 2023 at 12:27:23PM +0100, Andrew Cooper wrote:

> 
>   vmlinux.o: warning: objtool: srso_fam17_return_thunk(): can't find starting instruction
> 

> @@ -288,26 +283,22 @@ SYM_START(srso_fam17_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
>   * and execution will continue at the return site read from the top of
>   * the stack.
>   */
> -SYM_INNER_LABEL(srso_fam17_safe_ret, SYM_L_GLOBAL)
> +SYM_INNER_LABEL(srso_fam17_return_thunk, SYM_L_GLOBAL)

The srso_safe_ret -> srso_fam17_safe_ret rename you forgot in the last patch
is then renamed yet again here, to srso_fam17_return_thunk.

And there is your objtool splat.

> +	UNWIND_HINT_FUNC
> +	ANNOTATE_NOENDBR
>  	lea 8(%_ASM_SP), %_ASM_SP
> +	ANNOTATE_UNRET_SAFE
>  	ret
>  	int3
>  	int3
>  	/* end of movabs */
>  	lfence
> -	call srso_fam17_safe_ret
> +	call srso_fam17_return_thunk
>  	ud2
> -SYM_CODE_END(srso_fam17_safe_ret)
> +SYM_CODE_END(srso_fam17_return_thunk)
>  SYM_FUNC_END(srso_fam17_untrain_ret)
>  __EXPORT_THUNK(srso_fam17_untrain_ret)
>
Re: [PATCH RFC 4/4] x86/srso: Use CALL-based return thunks to reduce overhead
Posted by Josh Poimboeuf 2 years, 3 months ago
On Mon, Aug 21, 2023 at 12:27:23PM +0100, Andrew Cooper wrote:
> The SRSO safety depends on having a CALL to an {ADD,LEA}/RET sequence which
> has been made safe in the BTB.  Specifically, there needs to be no pertubance
> to the RAS between a correctly predicted CALL and the subsequent RET.
> 
> Use the new infrastructure to CALL to a return thunk.  Remove
> srso_fam1?_safe_ret() symbols and point srso_fam1?_return_thunk().
> 
> This removes one taken branch from every function return, which will reduce
> the overhead of the mitigation.  It also removes one of three moving pieces
> from the SRSO mess.

So, the address of whatever instruction comes after the 'CALL
srso_*_return_thunk' is added to the RSB/RAS, and that might be
speculated to when the thunk returns.  Is that a concern?

> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: x86@kernel.org
> CC: linux-kernel@vger.kernel.org
> CC: Borislav Petkov <bp@alien8.de>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Josh Poimboeuf <jpoimboe@kernel.org>
> CC: Babu Moger <babu.moger@amd.com>
> CC: David.Kaplan@amd.com
> CC: Nikolay Borisov <nik.borisov@suse.com>
> CC: gregkh@linuxfoundation.org
> CC: Thomas Gleixner <tglx@linutronix.de>
> 
> RFC:
> 
>   vmlinux.o: warning: objtool: srso_fam17_return_thunk(): can't find starting instruction
> 
> Any objtool whisperers know what's going on, and particularly why
> srso_fam19_return_thunk() appears to be happy?
> 
> Also, depends on the resolution of the RFC in the previous patch.

I can take a look.

-- 
Josh
Re: [PATCH RFC 4/4] x86/srso: Use CALL-based return thunks to reduce overhead
Posted by Andrew Cooper 2 years, 3 months ago
On 21/08/2023 4:16 pm, Josh Poimboeuf wrote:
> On Mon, Aug 21, 2023 at 12:27:23PM +0100, Andrew Cooper wrote:
>> The SRSO safety depends on having a CALL to an {ADD,LEA}/RET sequence which
>> has been made safe in the BTB.  Specifically, there needs to be no pertubance
>> to the RAS between a correctly predicted CALL and the subsequent RET.
>>
>> Use the new infrastructure to CALL to a return thunk.  Remove
>> srso_fam1?_safe_ret() symbols and point srso_fam1?_return_thunk().
>>
>> This removes one taken branch from every function return, which will reduce
>> the overhead of the mitigation.  It also removes one of three moving pieces
>> from the SRSO mess.
> So, the address of whatever instruction comes after the 'CALL
> srso_*_return_thunk' is added to the RSB/RAS, and that might be
> speculated to when the thunk returns.  Is that a concern?

That is very intentional, and key to the safety.

Replacing a RET with a CALL/{ADD,LEA}/RET sequence is a form of
retpoline thunk.  The only difference with regular retpolines is that
the intended target is already on the stack, and not in a GPR.


If the CALL mispredicts, it doesn't matter.  When decode catches up
(allegedly either instantaneously on Fam19h, or a few cycles late on
Fam17h), the top of the RAS is corrected and will point at the INT3
following the CALL instruction.

When the CALL is corrected, speculation continues at the real
destination (the {ADD,LEA}/RET sequence) where the {ADD,LEA} pops the
"wrong" return address off the stack and lets the RET take the next
address up the stack.

The RET predicts to the INT3 following the call (which is safe), and
eventually gets corrected to the parent return address on the stack,
which is the real intended destination.

Therefore, rogue RET speculation is always safely contained at the INT3
until the RET uop can execute, notice the mispredict, and correct to
what the stack says is correct.

Of course, all of this relies on the {ADD,LEA}+RET sequence not having
poison in the BTB, which is what the UNTRAIN_RET sequence is trying to
achieve by microarchitectural means.
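
As a rough sketch of the shape (the caller name is just a stand-in, the
thunk body is the Fam19h one from the patch, and I'm assuming the patched
callsite carries a trailing INT3):

    some_function:
            ...
            call    srso_fam19_return_thunk # RAS now holds the address of the INT3
            int3                            # only ever reached speculatively

    srso_fam19_return_thunk:
            lea     8(%rsp), %rsp           # discard the CALL's return address
            ret                             # return to some_function's real caller
            int3

Any stale prediction of that RET resolves to the INT3 after the CALL,
which goes nowhere, until the RET retires and redirects to the address
actually on the stack.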

>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> ---
>> CC: x86@kernel.org
>> CC: linux-kernel@vger.kernel.org
>> CC: Borislav Petkov <bp@alien8.de>
>> CC: Peter Zijlstra <peterz@infradead.org>
>> CC: Josh Poimboeuf <jpoimboe@kernel.org>
>> CC: Babu Moger <babu.moger@amd.com>
>> CC: David.Kaplan@amd.com
>> CC: Nikolay Borisov <nik.borisov@suse.com>
>> CC: gregkh@linuxfoundation.org
>> CC: Thomas Gleixner <tglx@linutronix.de>
>>
>> RFC:
>>
>>   vmlinux.o: warning: objtool: srso_fam17_return_thunk(): can't find starting instruction
>>
>> Any objtool whisperers know what's going on, and particularly why
>> srso_fam19_return_thunk() appears to be happy?
>>
>> Also, depends on the resolution of the RFC in the previous patch.
> I can take a look.

Thanks.

~Andrew
Re: [PATCH RFC 4/4] x86/srso: Use CALL-based return thunks to reduce overhead
Posted by Josh Poimboeuf 2 years, 3 months ago
On Tue, Aug 22, 2023 at 12:01:29AM +0100, Andrew Cooper wrote:
> On 21/08/2023 4:16 pm, Josh Poimboeuf wrote:
> > On Mon, Aug 21, 2023 at 12:27:23PM +0100, Andrew Cooper wrote:
> >> The SRSO safety depends on having a CALL to an {ADD,LEA}/RET sequence which
> >> has been made safe in the BTB.  Specifically, there needs to be no pertubance
> >> to the RAS between a correctly predicted CALL and the subsequent RET.
> >>
> >> Use the new infrastructure to CALL to a return thunk.  Remove
> >> srso_fam1?_safe_ret() symbols and point srso_fam1?_return_thunk().
> >>
> >> This removes one taken branch from every function return, which will reduce
> >> the overhead of the mitigation.  It also removes one of three moving pieces
> >> from the SRSO mess.
> > So, the address of whatever instruction comes after the 'CALL
> > srso_*_return_thunk' is added to the RSB/RAS, and that might be
> > speculated to when the thunk returns.  Is that a concern?
> 
> That is very intentional, and key to the safety.
> 
> Replacing a RET with a CALL/{ADD,LEA}/RET sequence is a form of
> retpoline thunk.  The only difference with regular retpolines is that
> the intended target is already on the stack, and not in a GPR.
> 
> 
> If the CALL mispredicts, it doesn't matter.  When decode catches up
> (allegedly either instantaneously on Fam19h, or a few cycles late on
> Fam17h), the top of the RAS is corrected will point at the INT3
> following the CALL instruction.

That's the thing though, at least with my kernel/compiler combo there's
no INT3 after the JMP __x86_return_thunk, and there's no room to patch
one in after the CALL, as the JMP and CALL are both 5 bytes.
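
Concretely, with the usual rel32 encodings (srso_fam19_return_thunk just
as an example target):

    e9 xx xx xx xx      jmp  __x86_return_thunk         <- what the compiler emits
    e8 xx xx xx xx      call srso_fam19_return_thunk    <- what we'd want to patch in

Both are 5 bytes, so rewriting the JMP as a CALL leaves no spare byte for
a trailing INT3 unless the compiler already put one there.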

-- 
Josh
Re: [PATCH RFC 4/4] x86/srso: Use CALL-based return thunks to reduce overhead
Posted by Nikolay Borisov 2 years, 3 months ago

On 22.08.23 г. 5:22 ч., Josh Poimboeuf wrote:
> On Tue, Aug 22, 2023 at 12:01:29AM +0100, Andrew Cooper wrote:
>> On 21/08/2023 4:16 pm, Josh Poimboeuf wrote:
>>> On Mon, Aug 21, 2023 at 12:27:23PM +0100, Andrew Cooper wrote:
>>>> The SRSO safety depends on having a CALL to an {ADD,LEA}/RET sequence which
>>>> has been made safe in the BTB.  Specifically, there needs to be no pertubance
>>>> to the RAS between a correctly predicted CALL and the subsequent RET.
>>>>
>>>> Use the new infrastructure to CALL to a return thunk.  Remove
>>>> srso_fam1?_safe_ret() symbols and point srso_fam1?_return_thunk().
>>>>
>>>> This removes one taken branch from every function return, which will reduce
>>>> the overhead of the mitigation.  It also removes one of three moving pieces
>>>> from the SRSO mess.
>>> So, the address of whatever instruction comes after the 'CALL
>>> srso_*_return_thunk' is added to the RSB/RAS, and that might be
>>> speculated to when the thunk returns.  Is that a concern?
>>
>> That is very intentional, and key to the safety.
>>
>> Replacing a RET with a CALL/{ADD,LEA}/RET sequence is a form of
>> retpoline thunk.  The only difference with regular retpolines is that
>> the intended target is already on the stack, and not in a GPR.
>>
>>
>> If the CALL mispredicts, it doesn't matter.  When decode catches up
>> (allegedly either instantaneously on Fam19h, or a few cycles late on
>> Fam17h), the top of the RAS is corrected will point at the INT3
>> following the CALL instruction.
> 
> That's the thing though, at least with my kernel/compiler combo there's
> no INT3 after the JMP __x86_return_thunk, and there's no room to patch
> one in after the CALL, as the JMP and CALL are both 5 bytes.

FWIW gcc's -mfunction-return=thunk-extern only ever generates a jmp;
thunk/thunk-inline OTOH generate a "full fledged" thunk with all the
necessary speculation-catching tricks.

For reference:

https://godbolt.org/z/M1avYc63b
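
Roughly (from memory, so the exact shape/labels may differ from what gcc
actually emits):

    # -mfunction-return=thunk-extern: each 'ret' becomes just
            jmp     __x86_return_thunk

    # -mfunction-return=thunk / thunk-inline: gcc also emits (or inlines)
    # a retpoline-style body along the lines of
    __x86_return_thunk:
            call    1f
    0:      pause
            lfence
            jmp     0b
    1:      lea     8(%rsp), %rsp
            ret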
> 
Re: [PATCH RFC 4/4] x86/srso: Use CALL-based return thunks to reduce overhead
Posted by Josh Poimboeuf 2 years, 3 months ago
On Tue, Aug 22, 2023 at 09:45:07AM +0300, Nikolay Borisov wrote:
> 
> 
> On 22.08.23 г. 5:22 ч., Josh Poimboeuf wrote:
> > On Tue, Aug 22, 2023 at 12:01:29AM +0100, Andrew Cooper wrote:
> > > On 21/08/2023 4:16 pm, Josh Poimboeuf wrote:
> > > > On Mon, Aug 21, 2023 at 12:27:23PM +0100, Andrew Cooper wrote:
> > > > > The SRSO safety depends on having a CALL to an {ADD,LEA}/RET sequence which
> > > > > has been made safe in the BTB.  Specifically, there needs to be no pertubance
> > > > > to the RAS between a correctly predicted CALL and the subsequent RET.
> > > > > 
> > > > > Use the new infrastructure to CALL to a return thunk.  Remove
> > > > > srso_fam1?_safe_ret() symbols and point srso_fam1?_return_thunk().
> > > > > 
> > > > > This removes one taken branch from every function return, which will reduce
> > > > > the overhead of the mitigation.  It also removes one of three moving pieces
> > > > > from the SRSO mess.
> > > > So, the address of whatever instruction comes after the 'CALL
> > > > srso_*_return_thunk' is added to the RSB/RAS, and that might be
> > > > speculated to when the thunk returns.  Is that a concern?
> > > 
> > > That is very intentional, and key to the safety.
> > > 
> > > Replacing a RET with a CALL/{ADD,LEA}/RET sequence is a form of
> > > retpoline thunk.  The only difference with regular retpolines is that
> > > the intended target is already on the stack, and not in a GPR.
> > > 
> > > 
> > > If the CALL mispredicts, it doesn't matter.  When decode catches up
> > > (allegedly either instantaneously on Fam19h, or a few cycles late on
> > > Fam17h), the top of the RAS is corrected will point at the INT3
> > > following the CALL instruction.
> > 
> > That's the thing though, at least with my kernel/compiler combo there's
> > no INT3 after the JMP __x86_return_thunk, and there's no room to patch
> > one in after the CALL, as the JMP and CALL are both 5 bytes.
> 
> FWIW gcc's mfunction-return=thunk-return only ever generates a jmp,
> thunk/thunk-inline OTOH generates a "full fledged" thunk with all the
> necessary speculation catching tricks.
> 
> For reference:
> 
> https://godbolt.org/z/M1avYc63b

The problem is the call-site, not the thunk.  Ideally we'd have an
option which adds an INT3 after the 'JMP __x86_return_thunk'.

-- 
Josh
Re: [PATCH RFC 4/4] x86/srso: Use CALL-based return thunks to reduce overhead
Posted by Peter Zijlstra 2 years, 3 months ago
On Tue, Aug 22, 2023 at 03:18:28PM -0700, Josh Poimboeuf wrote:

> The problem is the call-site, not the thunk.  Ideally we'd have an
> option which adds an INT3 after the 'JMP __x86_return_thunk'.

The -mharden-sls=all option *SHOULD* be extended to unconditionally emit
INT3 after every JMP instruction -- including the one used for
-mfunction-return=thunk-extern.

This is a known missing mitigation for an AMD SLS issue.

Due to the whole branch-type-confusion thing, AMD CPUs can predict the
JMP as 'not-a-branch' and continue to the next instruction.
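
I.e. something like (sketch, not actual compiler output):

    # today, with -mharden-sls=all + -mfunction-return=thunk-extern:
    foo:
            ...
            jmp     __x86_return_thunk      # nothing after the JMP

    # what's wanted:
    foo:
            ...
            jmp     __x86_return_thunk
            int3                            # SLS stopgap; also the byte a
                                            # CALL-based rewrite needs to land on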

I'm sure Andrew has the proper name and CVE stashed away somewhere. IIRC
he even has a GCC bugzilla entry for it.
Re: [PATCH RFC 4/4] x86/srso: Use CALL-based return thunks to reduce overhead
Posted by Nikolay Borisov 2 years, 3 months ago

On 23.08.23 г. 1:18 ч., Josh Poimboeuf wrote:
> On Tue, Aug 22, 2023 at 09:45:07AM +0300, Nikolay Borisov wrote:
>>
>>
>> On 22.08.23 г. 5:22 ч., Josh Poimboeuf wrote:
>>> On Tue, Aug 22, 2023 at 12:01:29AM +0100, Andrew Cooper wrote:
>>>> On 21/08/2023 4:16 pm, Josh Poimboeuf wrote:
>>>>> On Mon, Aug 21, 2023 at 12:27:23PM +0100, Andrew Cooper wrote:
>>>>>> The SRSO safety depends on having a CALL to an {ADD,LEA}/RET sequence which
>>>>>> has been made safe in the BTB.  Specifically, there needs to be no pertubance
>>>>>> to the RAS between a correctly predicted CALL and the subsequent RET.
>>>>>>
>>>>>> Use the new infrastructure to CALL to a return thunk.  Remove
>>>>>> srso_fam1?_safe_ret() symbols and point srso_fam1?_return_thunk().
>>>>>>
>>>>>> This removes one taken branch from every function return, which will reduce
>>>>>> the overhead of the mitigation.  It also removes one of three moving pieces
>>>>>> from the SRSO mess.
>>>>> So, the address of whatever instruction comes after the 'CALL
>>>>> srso_*_return_thunk' is added to the RSB/RAS, and that might be
>>>>> speculated to when the thunk returns.  Is that a concern?
>>>>
>>>> That is very intentional, and key to the safety.
>>>>
>>>> Replacing a RET with a CALL/{ADD,LEA}/RET sequence is a form of
>>>> retpoline thunk.  The only difference with regular retpolines is that
>>>> the intended target is already on the stack, and not in a GPR.
>>>>
>>>>
>>>> If the CALL mispredicts, it doesn't matter.  When decode catches up
>>>> (allegedly either instantaneously on Fam19h, or a few cycles late on
>>>> Fam17h), the top of the RAS is corrected will point at the INT3
>>>> following the CALL instruction.
>>>
>>> That's the thing though, at least with my kernel/compiler combo there's
>>> no INT3 after the JMP __x86_return_thunk, and there's no room to patch
>>> one in after the CALL, as the JMP and CALL are both 5 bytes.
>>
>> FWIW gcc's mfunction-return=thunk-return only ever generates a jmp,
>> thunk/thunk-inline OTOH generates a "full fledged" thunk with all the
>> necessary speculation catching tricks.
>>
>> For reference:
>>
>> https://godbolt.org/z/M1avYc63b
> 
> The problem is the call-site, not the thunk.  Ideally we'd have an
> option which adds an INT3 after the 'JMP __x86_return_thunk'.

The way I see it, the int3/ud2 (or whatever) sequence belongs to the
thunk and not the call site (what you said).  However, Andrew's solution
depends on the call site effectively being part of the thunk.

It seems something like that has already been done for the indirect
thunk but not for the return thunk:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102952
>