[PATCH] x86/retpoline: Don't clobber RFLAGS during srso_safe_ret()

Posted by Sean Christopherson 2 years, 4 months ago
Use 'lea' instead of 'add' when adjusting %rsp in srso_safe_ret() so as to
avoid clobbering flags.  Drop one of the INT3 instructions to account for
the LEA consuming one more byte than the ADD.
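
As a quick sanity check of that claim, here is a minimal user-space sketch
(plain C with inline asm, not kernel code) of the difference that matters:
ADD rewrites the arithmetic flags as a side effect of bumping the register,
while LEA computes the same sum through the address-generation path and
leaves RFLAGS untouched.

  #include <stdio.h>

  int main(void)
  {
          unsigned long cf_after_add, cf_after_lea, reg = 0;

          /* Set CF as a marker, adjust "reg" with ADD, then read RFLAGS. */
          asm volatile("stc; add $8, %[reg]; pushf; pop %[flags]"
                       : [flags] "=r" (cf_after_add), [reg] "+r" (reg)
                       : : "cc");

          /* Same marker, but adjust "reg" with LEA instead. */
          asm volatile("stc; lea 8(%[reg]), %[reg]; pushf; pop %[flags]"
                       : [flags] "=r" (cf_after_lea), [reg] "+r" (reg)
                       : : "cc");

          /* ADD clears the CF marker, LEA leaves it set. */
          printf("CF after add: %lu, CF after lea: %lu\n",
                 cf_after_add & 1, cf_after_lea & 1);
          return 0;
  }

Built with plain gcc on x86-64 this prints 0 for the ADD case and 1 for the
LEA case (the PUSHF/POP pair briefly dips into the red zone; harmless in
this toy, or build with -mno-red-zone to be pedantic).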

KVM's emulator makes indirect calls into a jump table of sorts, where
the destination of each call is a small blob of code that performs fast
emulation by executing the target instruction with fixed operands.

E.g. to emulate ADC, fastop() invokes adcb_al_dl():

  adcb_al_dl:
      0xffffffff8105f5f0 <+0>:  adc    %dl,%al
      0xffffffff8105f5f2 <+2>:  jmp    0xffffffff81a39270 <__x86_return_thunk>

A major motivation for doing fast emulation is to leverage the CPU to
handle consumption and manipulation of arithmetic flags, i.e. RFLAGS is
both an input and output to the target of the call.  fastop() collects
the RFLAGS result by pushing RFLAGS onto the stack and popping them back
into a variable (held in RDI in this case):

  asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"

      0xffffffff81062be7 <+71>: mov    0xc0(%r8),%rdx
      0xffffffff81062bee <+78>: mov    0x100(%r8),%rcx
      0xffffffff81062bf5 <+85>: push   %rdi
      0xffffffff81062bf6 <+86>: popf
      0xffffffff81062bf7 <+87>: call   *%rsi
      0xffffffff81062bf9 <+89>: nop
      0xffffffff81062bfa <+90>: nop
      0xffffffff81062bfb <+91>: nop
      0xffffffff81062bfc <+92>: pushf
      0xffffffff81062bfd <+93>: pop    %rdi

and then propagating the arithmetic flags into the vCPU's emulator state:

    ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);

      0xffffffff81062be0 <+64>:  and    $0xfffffffffffff72a,%r9
      0xffffffff81062bfe <+94>:  and    $0x8d5,%edi
      0xffffffff81062c0d <+109>: or     %rdi,%r9
      0xffffffff81062c1a <+122>: mov    %r9,0x10(%r8)

The failures can be most easily reproduced by running the "emulator" test
in KVM-Unit-Tests.
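
For illustration only, here is a stripped-down user-space sketch of the
pattern described above; it is not the actual KVM code, and emulate_adc()
plus its constants are stand-ins.  RFLAGS goes in via POPF, the emulated
instruction runs, and the result comes back out via PUSHF, so anything
executed between the target instruction and that PUSHF which rewrites
flags, such as the ADD in srso_safe_ret(), corrupts what the caller reads
back.

  #include <stdio.h>

  #define EFLAGS_MASK 0x8d5UL  /* OF|SF|ZF|AF|PF|CF, per the masks above */

  /* Emulate "adc %dl, %al" with caller-supplied RFLAGS, fastop() style. */
  static unsigned long emulate_adc(unsigned char *al, unsigned char dl,
                                   unsigned long flags)
  {
          asm volatile("push %[flags]; popf\n\t"  /* load input flags         */
                       "adc %[dl], %[al]\n\t"     /* the emulated instruction */
                       "pushf; pop %[flags]"      /* collect result flags     */
                       : [flags] "+r" (flags), [al] "+q" (*al)
                       : [dl] "q" (dl)
                       : "cc");
          return flags & EFLAGS_MASK;
  }

  int main(void)
  {
          unsigned char al = 0xff;
          unsigned long flags = emulate_adc(&al, 1, 1 /* CF set on input */);

          /* 0xff + 1 + CF = 0x101: AL wraps to 0x01 and CF is set on output. */
          printf("AL=%#x flags=%#lx\n", al, flags);
          return 0;
  }

In the kernel, each fastop stub ends with a jump to the return thunk (see
the adcb_al_dl disassembly above), which is exactly where the
flag-clobbering ADD sat before this patch.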

If you're feeling a bit of deja vu, see commit b63f20a778c8
("x86/retpoline: Don't clobber RFLAGS during CALL_NOSPEC on i386").

Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation")
Reported-by: Srikanth Aithal <sraithal@amd.com>
Closes: https://lore.kernel.org/all/de474347-122d-54cd-eabf-9dcc95ab9eae@amd.com
Cc: stable@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---

Those that fail to learn from history are doomed to repeat it. :-D

 arch/x86/lib/retpoline.S | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 2cff585f22f2..132cedbf9e57 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -164,7 +164,7 @@ __EXPORT_THUNK(srso_untrain_ret_alias)
 /* Needs a definition for the __x86_return_thunk alternative below. */
 SYM_START(srso_safe_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
 #ifdef CONFIG_CPU_SRSO
-	add $8, %_ASM_SP
+	lea 8(%_ASM_SP), %_ASM_SP
 	UNWIND_HINT_FUNC
 #endif
 	ANNOTATE_UNRET_SAFE
@@ -239,7 +239,7 @@ __EXPORT_THUNK(zen_untrain_ret)
  * SRSO untraining sequence for Zen1/2, similar to zen_untrain_ret()
  * above. On kernel entry, srso_untrain_ret() is executed which is a
  *
- * movabs $0xccccccc308c48348,%rax
+ * movabs $0xccccc30824648d48,%rax
  *
  * and when the return thunk executes the inner label srso_safe_ret()
  * later, it is a stack manipulation and a RET which is mispredicted and
@@ -252,11 +252,10 @@ SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
 	.byte 0x48, 0xb8
 
 SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
-	add $8, %_ASM_SP
+	lea 8(%_ASM_SP), %_ASM_SP
 	ret
 	int3
 	int3
-	int3
 	lfence
 	call srso_safe_ret
 	int3

base-commit: 25aa0bebba72b318e71fe205bfd1236550cc9534
-- 
2.41.0.694.ge786442a9b-goog
Re: [PATCH] x86/retpoline: Don't clobber RFLAGS during srso_safe_ret()
Posted by Mika Penttilä 2 years, 4 months ago
On 8/11/23 18:52, Sean Christopherson wrote:
> Use 'lea' instead of 'add' when adjusting %rsp in srso_safe_ret() so as to
> avoid clobbering flags.  Drop one of the INT3 instructions to account for
> the LEA consuming one more byte than the ADD.
>
> KVM's emulator makes indirect calls into a jump table of sorts, where
> the destination of each call is a small blob of code that performs fast
> emulation by executing the target instruction with fixed operands.
>
> E.g. to emulate ADC, fastop() invokes adcb_al_dl():
>
>    adcb_al_dl:
>        0xffffffff8105f5f0 <+0>:  adc    %dl,%al
>        0xffffffff8105f5f2 <+2>:  jmp    0xffffffff81a39270 <__x86_return_thunk>
>
> A major motivation for doing fast emulation is to leverage the CPU to
> handle consumption and manipulation of arithmetic flags, i.e. RFLAGS is
> both an input and output to the target of the call.  fastop() collects
> the RFLAGS result by pushing RFLAGS onto the stack and popping them back
> into a variable (held in RDI in this case)
>
>    asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
>
>        0xffffffff81062be7 <+71>: mov    0xc0(%r8),%rdx
>        0xffffffff81062bee <+78>: mov    0x100(%r8),%rcx
>        0xffffffff81062bf5 <+85>: push   %rdi
>        0xffffffff81062bf6 <+86>: popf
>        0xffffffff81062bf7 <+87>: call   *%rsi
>        0xffffffff81062bf9 <+89>: nop
>        0xffffffff81062bfa <+90>: nop
>        0xffffffff81062bfb <+91>: nop
>        0xffffffff81062bfc <+92>: pushf
>        0xffffffff81062bfd <+93>: pop    %rdi
>
> and then propagating the arithmetic flags into the vCPU's emulator state:
>
>      ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
>
>        0xffffffff81062be0 <+64>:  and    $0xfffffffffffff72a,%r9
>        0xffffffff81062bfe <+94>:  and    $0x8d5,%edi
>        0xffffffff81062c0d <+109>: or     %rdi,%r9
>        0xffffffff81062c1a <+122>: mov    %r9,0x10(%r8)
>
> The failures can be most easily reproduced by running the "emulator" test
> in KVM-Unit-Tests.
>
> If you're feeling a bit of deja vu, see commit b63f20a778c8
> ("x86/retpoline: Don't clobber RFLAGS during CALL_NOSPEC on i386").
>
> Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation")
> Reported-by: Srikanth Aithal <sraithal@amd.com>
> Closes: https://lore.kernel.org/all/de474347-122d-54cd-eabf-9dcc95ab9eae@amd.com
> Cc: stable@vger.kernel.org
> Cc: kvm@vger.kernel.org
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>
> Those that fail to learn from history are doomed to repeat it. :-D
>
>   arch/x86/lib/retpoline.S | 7 +++----
>   1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
> index 2cff585f22f2..132cedbf9e57 100644
> --- a/arch/x86/lib/retpoline.S
> +++ b/arch/x86/lib/retpoline.S
> @@ -164,7 +164,7 @@ __EXPORT_THUNK(srso_untrain_ret_alias)
>   /* Needs a definition for the __x86_return_thunk alternative below. */
>   SYM_START(srso_safe_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
>   #ifdef CONFIG_CPU_SRSO
> -	add $8, %_ASM_SP
> +	lea 8(%_ASM_SP), %_ASM_SP
>   	UNWIND_HINT_FUNC
>   #endif
>   	ANNOTATE_UNRET_SAFE
> @@ -239,7 +239,7 @@ __EXPORT_THUNK(zen_untrain_ret)
>    * SRSO untraining sequence for Zen1/2, similar to zen_untrain_ret()
>    * above. On kernel entry, srso_untrain_ret() is executed which is a
>    *
> - * movabs $0xccccccc308c48348,%rax
> + * movabs $0xccccc30824648d48,%rax
>    *
>    * and when the return thunk executes the inner label srso_safe_ret()
>    * later, it is a stack manipulation and a RET which is mispredicted and
> @@ -252,11 +252,10 @@ SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
>   	.byte 0x48, 0xb8
>   
>   SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
> -	add $8, %_ASM_SP
> +	lea 8(%_ASM_SP), %_ASM_SP
>   	ret
>   	int3
>   	int3
> -	int3
>   	lfence
>   	call srso_safe_ret
>   	int3
>
> base-commit: 25aa0bebba72b318e71fe205bfd1236550cc9534

Don't we have the same kind of problems with __x86_return_skl?

--Mika
Re: [PATCH] x86/retpoline: Don't clobber RFLAGS during srso_safe_ret()
Posted by Sean Christopherson 2 years, 4 months ago
On Fri, Aug 11, 2023, Mika Penttilä wrote:
> > @@ -252,11 +252,10 @@ SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
> >   	.byte 0x48, 0xb8
> >   SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
> > -	add $8, %_ASM_SP
> > +	lea 8(%_ASM_SP), %_ASM_SP
> >   	ret
> >   	int3
> >   	int3
> > -	int3
> >   	lfence
> >   	call srso_safe_ret
> >   	int3
> > 
> > base-commit: 25aa0bebba72b318e71fe205bfd1236550cc9534
> 
> Don't we have the same kind of problems with __x86_return_skl ?

Yep, forcing that path via "retbleed=force retbleed=stuff spectre_v2=retpoline,generic"
yields the same failures.  I have no idea how to go about cleanly fixing that.
The logic effectively requires modifying flags, the only thing I can think of is
to save/restore flags across the thunk, which seems beyond gross.

Given that no one has complained about this, I think I'd vote to simply disable
KVM if call depth tracking is being used.
Re: [PATCH] x86/retpoline: Don't clobber RFLAGS during srso_safe_ret()
Posted by Borislav Petkov 2 years, 4 months ago
On Fri, Aug 11, 2023 at 08:52:55AM -0700, Sean Christopherson wrote:
> A major motivation for doing fast emulation is to leverage the CPU to
> handle consumption and manipulation of arithmetic flags, i.e. RFLAGS is
> both an input and output to the target of the call.  fastop() collects
> the RFLAGS result by pushing RFLAGS onto the stack and popping them back
> into a variable (held in RDI in this case)
> 
>   asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"

Right, and I've tested this countless times with a gcc-built host and
guest.

But Nathan's case, where the host is built with gcc but the guest with
clang, would trigger this.  And as he confirms, this fixes it, so I
wonder what the difference in code generation is that makes this rFLAGS
corruption noticeable in that particular configuration.

Oh well, later when the fires are put out.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH] x86/retpoline: Don't clobber RFLAGS during srso_safe_ret()
Posted by Sean Christopherson 2 years, 4 months ago
On Fri, Aug 11, 2023, Borislav Petkov wrote:
> On Fri, Aug 11, 2023 at 08:52:55AM -0700, Sean Christopherson wrote:
> > A major motivation for doing fast emulation is to leverage the CPU to
> > handle consumption and manipulation of arithmetic flags, i.e. RFLAGS is
> > both an input and output to the target of the call.  fastop() collects
> > the RFLAGS result by pushing RFLAGS onto the stack and popping them back
> > into a variable (held in RDI in this case)
> > 
> >   asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
> 
> Right, and I've tested this countless times with gcc-built host and
> guest.
> 
> But Nathan's case where the host is built with gcc but the guest with
> clang, would trigger this. And as he confirms, that fixes it so I wonder
> what is the difference in code generation to make this rFLAGS corruption
> noticeable in that particular configuration.

Might be I/O APIC accesses?  Unless things have changed, the I/O APIC code uses
a struct overlay to access the I/O APIC, i.e. when doing emulated MMIO accesses.
If clang generates an ADD or whatever and consumes flags, e.g. instead of a
straight MOV, that would explain the problems.
Re: [PATCH] x86/retpoline: Don't clobber RFLAGS during srso_safe_ret()
Posted by Nathan Chancellor 2 years, 4 months ago
On Fri, Aug 11, 2023 at 08:52:55AM -0700, Sean Christopherson wrote:
> Use 'lea' instead of 'add' when adjusting %rsp in srso_safe_ret() so as to
> avoid clobbering flags.  Drop one of the INT3 instructions to account for
> the LEA consuming one more byte than the ADD.
> 
> KVM's emulator makes indirect calls into a jump table of sorts, where
> the destination of each call is a small blob of code that performs fast
> emulation by executing the target instruction with fixed operands.
> 
> E.g. to emulate ADC, fastop() invokes adcb_al_dl():
> 
>   adcb_al_dl:
>       0xffffffff8105f5f0 <+0>:  adc    %dl,%al
>       0xffffffff8105f5f2 <+2>:  jmp    0xffffffff81a39270 <__x86_return_thunk>
> 
> A major motivation for doing fast emulation is to leverage the CPU to
> handle consumption and manipulation of arithmetic flags, i.e. RFLAGS is
> both an input and output to the target of the call.  fastop() collects
> the RFLAGS result by pushing RFLAGS onto the stack and popping them back
> into a variable (held in RDI in this case)
> 
>   asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
> 
>       0xffffffff81062be7 <+71>: mov    0xc0(%r8),%rdx
>       0xffffffff81062bee <+78>: mov    0x100(%r8),%rcx
>       0xffffffff81062bf5 <+85>: push   %rdi
>       0xffffffff81062bf6 <+86>: popf
>       0xffffffff81062bf7 <+87>: call   *%rsi
>       0xffffffff81062bf9 <+89>: nop
>       0xffffffff81062bfa <+90>: nop
>       0xffffffff81062bfb <+91>: nop
>       0xffffffff81062bfc <+92>: pushf
>       0xffffffff81062bfd <+93>: pop    %rdi
> 
> and then propagating the arithmetic flags into the vCPU's emulator state:
> 
>     ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
> 
>       0xffffffff81062be0 <+64>:  and    $0xfffffffffffff72a,%r9
>       0xffffffff81062bfe <+94>:  and    $0x8d5,%edi
>       0xffffffff81062c0d <+109>: or     %rdi,%r9
>       0xffffffff81062c1a <+122>: mov    %r9,0x10(%r8)
> 
> The failures can be most easily reproduced by running the "emulator" test
> in KVM-Unit-Tests.
> 
> If you're feeling a bit of deja vu, see commit b63f20a778c8
> ("x86/retpoline: Don't clobber RFLAGS during CALL_NOSPEC on i386").
> 
> Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation")
> Reported-by: Srikanth Aithal <sraithal@amd.com>
> Closes: https://lore.kernel.org/all/de474347-122d-54cd-eabf-9dcc95ab9eae@amd.com
> Cc: stable@vger.kernel.org
> Cc: kvm@vger.kernel.org
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

This resolves the issue I reported at [1].

Tested-by: Nathan Chancellor <nathan@kernel.org>

[1]: https://lore.kernel.org/20230810013334.GA5354@dev-arch.thelio-3990X/

> ---
> 
> Those that fail to learn from history are doomed to repeat it. :-D
> 
>  arch/x86/lib/retpoline.S | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
> index 2cff585f22f2..132cedbf9e57 100644
> --- a/arch/x86/lib/retpoline.S
> +++ b/arch/x86/lib/retpoline.S
> @@ -164,7 +164,7 @@ __EXPORT_THUNK(srso_untrain_ret_alias)
>  /* Needs a definition for the __x86_return_thunk alternative below. */
>  SYM_START(srso_safe_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
>  #ifdef CONFIG_CPU_SRSO
> -	add $8, %_ASM_SP
> +	lea 8(%_ASM_SP), %_ASM_SP
>  	UNWIND_HINT_FUNC
>  #endif
>  	ANNOTATE_UNRET_SAFE
> @@ -239,7 +239,7 @@ __EXPORT_THUNK(zen_untrain_ret)
>   * SRSO untraining sequence for Zen1/2, similar to zen_untrain_ret()
>   * above. On kernel entry, srso_untrain_ret() is executed which is a
>   *
> - * movabs $0xccccccc308c48348,%rax
> + * movabs $0xccccc30824648d48,%rax
>   *
>   * and when the return thunk executes the inner label srso_safe_ret()
>   * later, it is a stack manipulation and a RET which is mispredicted and
> @@ -252,11 +252,10 @@ SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
>  	.byte 0x48, 0xb8
>  
>  SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
> -	add $8, %_ASM_SP
> +	lea 8(%_ASM_SP), %_ASM_SP
>  	ret
>  	int3
>  	int3
> -	int3
>  	lfence
>  	call srso_safe_ret
>  	int3
> 
> base-commit: 25aa0bebba72b318e71fe205bfd1236550cc9534
> -- 
> 2.41.0.694.ge786442a9b-goog
>
[tip: x86/urgent] x86/retpoline: Don't clobber RFLAGS during srso_safe_ret()
Posted by tip-bot2 for Sean Christopherson 2 years, 4 months ago
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID:     ba5ca5e5e6a1d55923e88b4a83da452166f5560e
Gitweb:        https://git.kernel.org/tip/ba5ca5e5e6a1d55923e88b4a83da452166f5560e
Author:        Sean Christopherson <seanjc@google.com>
AuthorDate:    Fri, 11 Aug 2023 08:52:55 -07:00
Committer:     Borislav Petkov (AMD) <bp@alien8.de>
CommitterDate: Mon, 14 Aug 2023 10:47:55 +02:00

x86/retpoline: Don't clobber RFLAGS during srso_safe_ret()

Use LEA instead of ADD when adjusting %rsp in srso_safe_ret{,_alias}()
so as to avoid clobbering flags.  Drop one of the INT3 instructions to
account for the LEA consuming one more byte than the ADD.

KVM's emulator makes indirect calls into a jump table of sorts, where
the destination of each call is a small blob of code that performs fast
emulation by executing the target instruction with fixed operands.

E.g. to emulate ADC, fastop() invokes adcb_al_dl():

  adcb_al_dl:
    <+0>:  adc    %dl,%al
    <+2>:  jmp    <__x86_return_thunk>

A major motivation for doing fast emulation is to leverage the CPU to
handle consumption and manipulation of arithmetic flags, i.e. RFLAGS is
both an input and output to the target of the call.  fastop() collects
the RFLAGS result by pushing RFLAGS onto the stack and popping them back
into a variable (held in %rdi in this case):

  asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"

  <+71>: mov    0xc0(%r8),%rdx
  <+78>: mov    0x100(%r8),%rcx
  <+85>: push   %rdi
  <+86>: popf
  <+87>: call   *%rsi
  <+89>: nop
  <+90>: nop
  <+91>: nop
  <+92>: pushf
  <+93>: pop    %rdi

and then propagating the arithmetic flags into the vCPU's emulator state:

  ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);

  <+64>:  and    $0xfffffffffffff72a,%r9
  <+94>:  and    $0x8d5,%edi
  <+109>: or     %rdi,%r9
  <+122>: mov    %r9,0x10(%r8)

The failures can be most easily reproduced by running the "emulator"
test in KVM-Unit-Tests.

If you're feeling a bit of deja vu, see commit b63f20a778c8
("x86/retpoline: Don't clobber RFLAGS during CALL_NOSPEC on i386").

In addition, this breaks booting of a clang-compiled guest on a
gcc-compiled host where the host contains the %rsp-modifying SRSO
mitigations.

  [ bp: Massage commit message, extend, remove addresses. ]

Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation")
Closes: https://lore.kernel.org/all/de474347-122d-54cd-eabf-9dcc95ab9eae@amd.com
Reported-by: Srikanth Aithal <sraithal@amd.com>
Reported-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/20230810013334.GA5354@dev-arch.thelio-3990X/
Link: https://lore.kernel.org/r/20230811155255.250835-1-seanjc@google.com
---
 arch/x86/lib/retpoline.S | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 2cff585..132cedb 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -164,7 +164,7 @@ __EXPORT_THUNK(srso_untrain_ret_alias)
 /* Needs a definition for the __x86_return_thunk alternative below. */
 SYM_START(srso_safe_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
 #ifdef CONFIG_CPU_SRSO
-	add $8, %_ASM_SP
+	lea 8(%_ASM_SP), %_ASM_SP
 	UNWIND_HINT_FUNC
 #endif
 	ANNOTATE_UNRET_SAFE
@@ -239,7 +239,7 @@ __EXPORT_THUNK(zen_untrain_ret)
  * SRSO untraining sequence for Zen1/2, similar to zen_untrain_ret()
  * above. On kernel entry, srso_untrain_ret() is executed which is a
  *
- * movabs $0xccccccc308c48348,%rax
+ * movabs $0xccccc30824648d48,%rax
  *
  * and when the return thunk executes the inner label srso_safe_ret()
  * later, it is a stack manipulation and a RET which is mispredicted and
@@ -252,11 +252,10 @@ SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
 	.byte 0x48, 0xb8
 
 SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
-	add $8, %_ASM_SP
+	lea 8(%_ASM_SP), %_ASM_SP
 	ret
 	int3
 	int3
-	int3
 	lfence
 	call srso_safe_ret
 	int3