[PATCH v3 14/16] x86/fred: Play nice with invoking asm_fred_entry_from_kvm() on non-FRED hardware

Peter Zijlstra posted 16 patches 2 months, 3 weeks ago
[PATCH v3 14/16] x86/fred: Play nice with invoking asm_fred_entry_from_kvm() on non-FRED hardware
Posted by Peter Zijlstra 2 months, 3 weeks ago
From: Josh Poimboeuf <jpoimboe@kernel.org>

Modify asm_fred_entry_from_kvm() to allow it to be invoked by KVM even
when FRED isn't fully enabled, e.g. when running with
CONFIG_X86_FRED=y on non-FRED hardware.  This will allow forcing KVM
to always use the FRED entry points for 64-bit kernels, which in turn
will eliminate a rather gross non-CFI indirect call that KVM uses to
trampoline IRQs by doing IDT lookups.

The point of asm_fred_entry_from_kvm() is to bridge between C
(vmx:handle_external_interrupt_irqoff()) and more C
(__fred_entry_from_kvm()) while changing the calling context to appear
like an interrupt (pt_regs). Making the whole thing bound by C ABI.

All that remains for non-FRED hardware is to restore RSP (to undo the
redzone and alignment). However the trivial change would result in
code like:

  push %rbp
  mov %rsp, %rbp

  sub $REDZONE, %rsp
  and $MASK, %rsp

  PUSH_AND_CLEAR_REGS
   push %rbp

  POP_REGS
   pop %rbp <-- *objtool fail*

  mov %rbp, %rsp
  pop %rbp
  ret

And this will confuse objtool something wicked -- it gets confused by
the extra pop %rbp, not realizing the push and pop preserve the value.

Rather than trying to each objtool about this, recognise that since
the code is bound by C ABI on both ends and interrupts are not allowed
to change pt_regs (only exceptions are) it is sufficient to PUSH_REGS
in order to create pt_regs, but there is no reason to POP_REGS --
provided the callee-saved registers are preserved.

So avoid clearing callee-saved regs and skip POP_REGS.

[Original patch by Sean; much of this version by Josh; Changelog,
comments and final form by Peterz]

Originally-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/calling.h       |   11 +++++------
 arch/x86/entry/entry_64_fred.S |   33 ++++++++++++++++++++++++++-------
 arch/x86/kernel/asm-offsets.c  |    1 +
 3 files changed, 32 insertions(+), 13 deletions(-)

--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -99,7 +99,7 @@ For 32-bit we have the following convent
 	.endif
 .endm
 
-.macro CLEAR_REGS clear_bp=1
+.macro CLEAR_REGS clear_callee=1
 	/*
 	 * Sanitize registers of values that a speculation attack might
 	 * otherwise want to exploit. The lower registers are likely clobbered
@@ -113,20 +113,19 @@ For 32-bit we have the following convent
 	xorl	%r9d,  %r9d	/* nospec r9  */
 	xorl	%r10d, %r10d	/* nospec r10 */
 	xorl	%r11d, %r11d	/* nospec r11 */
+	.if \clear_callee
 	xorl	%ebx,  %ebx	/* nospec rbx */
-	.if \clear_bp
 	xorl	%ebp,  %ebp	/* nospec rbp */
-	.endif
 	xorl	%r12d, %r12d	/* nospec r12 */
 	xorl	%r13d, %r13d	/* nospec r13 */
 	xorl	%r14d, %r14d	/* nospec r14 */
 	xorl	%r15d, %r15d	/* nospec r15 */
-
+	.endif
 .endm
 
-.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_bp=1 unwind_hint=1
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_callee=1 unwind_hint=1
 	PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint
-	CLEAR_REGS clear_bp=\clear_bp
+	CLEAR_REGS clear_callee=\clear_callee
 .endm
 
 .macro POP_REGS pop_rdi=1
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -112,18 +112,37 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
 	push %rax				/* Return RIP */
 	push $0					/* Error code, 0 for IRQ/NMI */
 
-	PUSH_AND_CLEAR_REGS clear_bp=0 unwind_hint=0
+	PUSH_AND_CLEAR_REGS clear_callee=0 unwind_hint=0
+
 	movq %rsp, %rdi				/* %rdi -> pt_regs */
+	/*
+	 * At this point: {rdi, rsi, rdx, rcx, r8, r9}, {r10, r11}, {rax, rdx}
+	 * are clobbered, which corresponds to: arguments, extra caller-saved
+	 * and return. All registers a C function is allowed to clobber.
+	 *
+	 * Notably, the callee-saved registers: {rbx, r12, r13, r14, r15}
+	 * are untouched, with the exception of rbp, which carries the stack
+	 * frame and will be restored before exit.
+	 *
+	 * Further calling another C function will not alter this state.
+	 */
 	call __fred_entry_from_kvm		/* Call the C entry point */
-	POP_REGS
-	ERETS
-1:
+
+1:	/*
+	 * When FRED, use ERETS to potentially clear NMIs, otherwise simply
+	 * restore the stack pointer.
+	 */
+	ALTERNATIVE "nop; nop; mov %rbp, %rsp", \
+	            __stringify(add $C_PTREGS_SIZE, %rsp; ERETS), \
+		    X86_FEATURE_FRED
+
 	/*
-	 * Objtool doesn't understand what ERETS does, this hint tells it that
-	 * yes, we'll reach here and with what stack state. A save/restore pair
-	 * isn't strictly needed, but it's the simplest form.
+	 * Objtool doesn't understand ERETS, and the cfi register state is
+	 * different from initial_func_cfi due to PUSH_REGS. Tell it the state
+	 * is similar to where UNWIND_HINT_SAVE is.
 	 */
 	UNWIND_HINT_RESTORE
+
 	pop %rbp
 	RET
 
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -102,6 +102,7 @@ static void __used common(void)
 
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+	OFFSET(C_PTREGS_SIZE, pt_regs, orig_ax);
 
 	/* TLB state for the entry code */
 	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
Re: [PATCH v3 14/16] x86/fred: Play nice with invoking asm_fred_entry_from_kvm() on non-FRED hardware
Posted by Xin Li 2 months, 1 week ago
On 7/14/2025 3:20 AM, Peter Zijlstra wrote:
>   	call __fred_entry_from_kvm		/* Call the C entry point */
> -	POP_REGS
> -	ERETS
> -1:
> +
> +1:	/*

The symbol "1" is misplaced; it needs to be put after the ERETS
instruction.

> +	 * When FRED, use ERETS to potentially clear NMIs, otherwise simply
> +	 * restore the stack pointer.
> +	 */
> +	ALTERNATIVE "nop; nop; mov %rbp, %rsp", \

Why explicitly add two nops here?

ALTERNATIVE will still pad three-byte nop after the MOV instruction.

> +	            __stringify(add $C_PTREGS_SIZE, %rsp; ERETS), \
> +		    X86_FEATURE_FRED
> +
>   	/*
> -	 * Objtool doesn't understand what ERETS does, this hint tells it that
> -	 * yes, we'll reach here and with what stack state. A save/restore pair
> -	 * isn't strictly needed, but it's the simplest form.
> +	 * Objtool doesn't understand ERETS, and the cfi register state is
> +	 * different from initial_func_cfi due to PUSH_REGS. Tell it the state
> +	 * is similar to where UNWIND_HINT_SAVE is.
>   	 */
>   	UNWIND_HINT_RESTORE
> +
>   	pop %rbp
>   	RET
>
Re: [PATCH v3 14/16] x86/fred: Play nice with invoking asm_fred_entry_from_kvm() on non-FRED hardware
Posted by Peter Zijlstra 1 month, 2 weeks ago
On Fri, Jul 25, 2025 at 09:54:32PM -0700, Xin Li wrote:
> On 7/14/2025 3:20 AM, Peter Zijlstra wrote:
> >   	call __fred_entry_from_kvm		/* Call the C entry point */
> > -	POP_REGS
> > -	ERETS
> > -1:
> > +
> > +1:	/*
> 
> The symbol "1" is misplaced; it needs to be put after the ERETS
> instruction.

Doh, fixed.

> > +	 * When FRED, use ERETS to potentially clear NMIs, otherwise simply
> > +	 * restore the stack pointer.
> > +	 */
> > +	ALTERNATIVE "nop; nop; mov %rbp, %rsp", \
> 
> Why explicitly add two nops here?

Because the CFI information for all alternative code flows must be the
same. So by playing games with instruction offsets you can have
conflicting CFI inside the alternative.

Specifically, we have:

0:	90        nop
1:	90        nop
2:	48 89 ec  mov %rbp, %rsp


0:	48 83 c4 0c  add $12, %rsp
4:      f2 0f 01 ca  erets

This gets us CFI updates on 0, 2 and 4, without conflicts.


> ALTERNATIVE will still pad three-byte nop after the MOV instruction.
> 
> > +	            __stringify(add $C_PTREGS_SIZE, %rsp; ERETS), \
> > +		    X86_FEATURE_FRED
> > +
> >   	/*
> > -	 * Objtool doesn't understand what ERETS does, this hint tells it that
> > -	 * yes, we'll reach here and with what stack state. A save/restore pair
> > -	 * isn't strictly needed, but it's the simplest form.
> > +	 * Objtool doesn't understand ERETS, and the cfi register state is
> > +	 * different from initial_func_cfi due to PUSH_REGS. Tell it the state
> > +	 * is similar to where UNWIND_HINT_SAVE is.
> >   	 */
> >   	UNWIND_HINT_RESTORE
> > +
> >   	pop %rbp
> >   	RET
>
[tip: x86/core] x86/fred: Play nice with invoking asm_fred_entry_from_kvm() on non-FRED hardware
Posted by tip-bot2 for Josh Poimboeuf 1 month, 2 weeks ago
The following commit has been merged into the x86/core branch of tip:

Commit-ID:     deed19b9b28724bd32e85063c60718c0a6803906
Gitweb:        https://git.kernel.org/tip/deed19b9b28724bd32e85063c60718c0a6803906
Author:        Josh Poimboeuf <jpoimboe@kernel.org>
AuthorDate:    Fri, 06 Jun 2025 13:04:25 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 18 Aug 2025 14:23:08 +02:00

x86/fred: Play nice with invoking asm_fred_entry_from_kvm() on non-FRED hardware

Modify asm_fred_entry_from_kvm() to allow it to be invoked by KVM even
when FRED isn't fully enabled, e.g. when running with
CONFIG_X86_FRED=y on non-FRED hardware.  This will allow forcing KVM
to always use the FRED entry points for 64-bit kernels, which in turn
will eliminate a rather gross non-CFI indirect call that KVM uses to
trampoline IRQs by doing IDT lookups.

The point of asm_fred_entry_from_kvm() is to bridge between C
(vmx:handle_external_interrupt_irqoff()) and more C
(__fred_entry_from_kvm()) while changing the calling context to appear
like an interrupt (pt_regs). Making the whole thing bound by C ABI.

All that remains for non-FRED hardware is to restore RSP (to undo the
redzone and alignment). However the trivial change would result in
code like:

  push %rbp
  mov %rsp, %rbp

  sub $REDZONE, %rsp
  and $MASK, %rsp

  PUSH_AND_CLEAR_REGS
   push %rbp

  POP_REGS
   pop %rbp <-- *objtool fail*

  mov %rbp, %rsp
  pop %rbp
  ret

And this will confuse objtool something wicked -- it gets confused by
the extra pop %rbp, not realizing the push and pop preserve the value.

Rather than trying to each objtool about this, recognise that since
the code is bound by C ABI on both ends and interrupts are not allowed
to change pt_regs (only exceptions are) it is sufficient to PUSH_REGS
in order to create pt_regs, but there is no reason to POP_REGS --
provided the callee-saved registers are preserved.

So avoid clearing callee-saved regs and skip POP_REGS.

[Original patch by Sean; much of this version by Josh; Changelog,
comments and final form by Peterz]

Originally-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Sean Christopherson <seanjc@google.com>
Link: https://lkml.kernel.org/r/20250714103441.245417052@infradead.org
---
 arch/x86/entry/calling.h       | 11 +++++------
 arch/x86/entry/entry_64_fred.S | 33 ++++++++++++++++++++++++++-------
 arch/x86/kernel/asm-offsets.c  |  1 +-
 3 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9451968..77e2d92 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -99,7 +99,7 @@ For 32-bit we have the following conventions - kernel is built with
 	.endif
 .endm
 
-.macro CLEAR_REGS clear_bp=1
+.macro CLEAR_REGS clear_callee=1
 	/*
 	 * Sanitize registers of values that a speculation attack might
 	 * otherwise want to exploit. The lower registers are likely clobbered
@@ -113,20 +113,19 @@ For 32-bit we have the following conventions - kernel is built with
 	xorl	%r9d,  %r9d	/* nospec r9  */
 	xorl	%r10d, %r10d	/* nospec r10 */
 	xorl	%r11d, %r11d	/* nospec r11 */
+	.if \clear_callee
 	xorl	%ebx,  %ebx	/* nospec rbx */
-	.if \clear_bp
 	xorl	%ebp,  %ebp	/* nospec rbp */
-	.endif
 	xorl	%r12d, %r12d	/* nospec r12 */
 	xorl	%r13d, %r13d	/* nospec r13 */
 	xorl	%r14d, %r14d	/* nospec r14 */
 	xorl	%r15d, %r15d	/* nospec r15 */
-
+	.endif
 .endm
 
-.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_bp=1 unwind_hint=1
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_callee=1 unwind_hint=1
 	PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint
-	CLEAR_REGS clear_bp=\clear_bp
+	CLEAR_REGS clear_callee=\clear_callee
 .endm
 
 .macro POP_REGS pop_rdi=1
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index 29c5c32..0d00ec8 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -112,18 +112,37 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
 	push %rax				/* Return RIP */
 	push $0					/* Error code, 0 for IRQ/NMI */
 
-	PUSH_AND_CLEAR_REGS clear_bp=0 unwind_hint=0
+	PUSH_AND_CLEAR_REGS clear_callee=0 unwind_hint=0
+
 	movq %rsp, %rdi				/* %rdi -> pt_regs */
+	/*
+	 * At this point: {rdi, rsi, rdx, rcx, r8, r9}, {r10, r11}, {rax, rdx}
+	 * are clobbered, which corresponds to: arguments, extra caller-saved
+	 * and return. All registers a C function is allowed to clobber.
+	 *
+	 * Notably, the callee-saved registers: {rbx, r12, r13, r14, r15}
+	 * are untouched, with the exception of rbp, which carries the stack
+	 * frame and will be restored before exit.
+	 *
+	 * Further calling another C function will not alter this state.
+	 */
 	call __fred_entry_from_kvm		/* Call the C entry point */
-	POP_REGS
-	ERETS
-1:
+
 	/*
-	 * Objtool doesn't understand what ERETS does, this hint tells it that
-	 * yes, we'll reach here and with what stack state. A save/restore pair
-	 * isn't strictly needed, but it's the simplest form.
+	 * When FRED, use ERETS to potentially clear NMIs, otherwise simply
+	 * restore the stack pointer.
+	 */
+	ALTERNATIVE "nop; nop; mov %rbp, %rsp", \
+	            __stringify(add $C_PTREGS_SIZE, %rsp; ERETS), \
+		    X86_FEATURE_FRED
+
+1:	/*
+	 * Objtool doesn't understand ERETS, and the cfi register state is
+	 * different from initial_func_cfi due to PUSH_REGS. Tell it the state
+	 * is similar to where UNWIND_HINT_SAVE is.
 	 */
 	UNWIND_HINT_RESTORE
+
 	pop %rbp
 	RET
 
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 6259b47..32ba599 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -102,6 +102,7 @@ static void __used common(void)
 
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+	OFFSET(C_PTREGS_SIZE, pt_regs, orig_ax);
 
 	/* TLB state for the entry code */
 	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);