Under FRED, entry_from_pv() handles everything. To start with, implement
exception handling in the same manner as entry_from_xen(), although we can
unconditionally enable interrupts after the async/fatal events.
After entry_from_pv() returns, test_all_events needs to run to perform
exception and interrupt injection. Split entry_FRED_R3() into two and
introduce eretu_exit_to_guest() as the latter half, reached unilaterally
from restore_all_guest().
For all of this, there is a slightly complicated relationship with CONFIG_PV.
entry_FRED_R3() must exist irrespective of CONFIG_PV, because it's the
entrypoint registered with hardware. For simplicity, entry_from_pv() is
always called, but it collapses into fatal_trap() in the !PV case.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
v2:
* New
---
xen/arch/x86/traps.c | 76 +++++++++++++++++++++++++++++++-
xen/arch/x86/x86_64/entry-fred.S | 13 +++++-
xen/arch/x86/x86_64/entry.S | 4 +-
3 files changed, 90 insertions(+), 3 deletions(-)
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 67763bec0dc5..72df446a6a78 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2265,9 +2265,83 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
{
+ struct fred_info *fi = cpu_regs_fred_info(regs);
+ uint8_t type = regs->fred_ss.type;
+ uint8_t vec = regs->fred_ss.vector;
+
/* Copy fred_ss.vector into entry_vector as IDT delivery would have done. */
- regs->entry_vector = regs->fred_ss.vector;
+ regs->entry_vector = vec;
+
+ if ( !IS_ENABLED(CONFIG_PV) )
+ goto fatal;
+
+ /*
+ * First, handle the asynchronous or fatal events. These are either
+ * unrelated to the interrupted context, or may not have valid context
+ * recorded, and all have special rules on how/whether to re-enable IRQs.
+ */
+ switch ( type )
+ {
+ case X86_ET_EXT_INTR:
+ return do_IRQ(regs);
+ case X86_ET_NMI:
+ return do_nmi(regs);
+
+ case X86_ET_HW_EXC:
+ switch ( vec )
+ {
+ case X86_EXC_DF: return do_double_fault(regs);
+ case X86_EXC_MC: return do_machine_check(regs);
+ }
+ break;
+ }
+
+ /*
+ * With the asynchronous events handled, what remains are the synchronous
+ * ones. Guest context always had interrupts enabled.
+ */
+ local_irq_enable();
+
+ switch ( type )
+ {
+ case X86_ET_HW_EXC:
+ case X86_ET_PRIV_SW_EXC:
+ case X86_ET_SW_EXC:
+ switch ( vec )
+ {
+ case X86_EXC_PF: handle_PF(regs, fi->edata); break;
+ case X86_EXC_GP: do_general_protection(regs); break;
+ case X86_EXC_UD: do_invalid_op(regs); break;
+ case X86_EXC_NM: do_device_not_available(regs); break;
+ case X86_EXC_BP: do_int3(regs); break;
+ case X86_EXC_DB: handle_DB(regs, fi->edata); break;
+
+ case X86_EXC_DE:
+ case X86_EXC_OF:
+ case X86_EXC_BR:
+ case X86_EXC_NP:
+ case X86_EXC_SS:
+ case X86_EXC_MF:
+ case X86_EXC_AC:
+ case X86_EXC_XM:
+ do_trap(regs);
+ break;
+
+ case X86_EXC_CP: do_entry_CP(regs); break;
+
+ default:
+ goto fatal;
+ }
+ break;
+
+ default:
+ goto fatal;
+ }
+
+ return;
+
+ fatal:
fatal_trap(regs, false);
}
diff --git a/xen/arch/x86/x86_64/entry-fred.S b/xen/arch/x86/x86_64/entry-fred.S
index 3c3320df22cb..07684f38a078 100644
--- a/xen/arch/x86/x86_64/entry-fred.S
+++ b/xen/arch/x86/x86_64/entry-fred.S
@@ -15,9 +15,20 @@ FUNC(entry_FRED_R3, 4096)
mov %rsp, %rdi
call entry_from_pv
+#ifndef CONFIG_PV
+ BUG /* Not Reached */
+#else
+ GET_STACK_END(14)
+ movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
+
+ jmp test_all_events
+#endif
+END(entry_FRED_R3)
+
+FUNC(eretu_exit_to_guest)
POP_GPRS
eretu
-END(entry_FRED_R3)
+END(eretu_exit_to_guest)
/* The Ring0 entrypoint is at Ring3 + 0x100. */
.org entry_FRED_R3 + 0x100, 0xcc
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index ca446c6ff0ce..0692163faa44 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -63,7 +63,7 @@ UNLIKELY_END(syscall_no_callback)
/* Conditionally clear DF */
and %esi, UREGS_eflags(%rsp)
/* %rbx: struct vcpu */
-test_all_events:
+LABEL(test_all_events, 0)
ASSERT_NOT_IN_ATOMIC
cli # tests must not race interrupts
/*test_softirqs:*/
@@ -152,6 +152,8 @@ END(switch_to_kernel)
FUNC_LOCAL(restore_all_guest)
ASSERT_INTERRUPTS_DISABLED
+ ALTERNATIVE "", "jmp eretu_exit_to_guest", X86_FEATURE_XEN_FRED
+
/* Stash guest SPEC_CTRL value while we can read struct vcpu. */
mov VCPU_arch_msrs(%rbx), %rdx
mov VCPUMSR_spec_ctrl_raw(%rdx), %r15d
--
2.39.5
On 28.08.2025 17:04, Andrew Cooper wrote:
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -2265,9 +2265,83 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
>
> void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
> {
> + struct fred_info *fi = cpu_regs_fred_info(regs);
> + uint8_t type = regs->fred_ss.type;
> + uint8_t vec = regs->fred_ss.vector;
> +
> /* Copy fred_ss.vector into entry_vector as IDT delivery would have done. */
> - regs->entry_vector = regs->fred_ss.vector;
> + regs->entry_vector = vec;
> +
> + if ( !IS_ENABLED(CONFIG_PV) )
> + goto fatal;
> +
> + /*
> + * First, handle the asynchronous or fatal events. These are either
> + * unrelated to the interrupted context, or may not have valid context
> + * recorded, and all have special rules on how/whether to re-enable IRQs.
> + */
> + switch ( type )
> + {
> + case X86_ET_EXT_INTR:
> + return do_IRQ(regs);
>
> + case X86_ET_NMI:
> + return do_nmi(regs);
> +
> + case X86_ET_HW_EXC:
> + switch ( vec )
> + {
> + case X86_EXC_DF: return do_double_fault(regs);
> + case X86_EXC_MC: return do_machine_check(regs);
Looking at patch 21, I came to wonder where it is that we're moving back to
SL0 in the #MC case (which may not be fatal), for ERETU to not fault.
Jan
On 01/09/2025 1:57 pm, Jan Beulich wrote:
> On 28.08.2025 17:04, Andrew Cooper wrote:
>> --- a/xen/arch/x86/traps.c
>> +++ b/xen/arch/x86/traps.c
>> @@ -2265,9 +2265,83 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
>>
>> void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
>> {
>> + struct fred_info *fi = cpu_regs_fred_info(regs);
>> + uint8_t type = regs->fred_ss.type;
>> + uint8_t vec = regs->fred_ss.vector;
>> +
>> /* Copy fred_ss.vector into entry_vector as IDT delivery would have done. */
>> - regs->entry_vector = regs->fred_ss.vector;
>> + regs->entry_vector = vec;
>> +
>> + if ( !IS_ENABLED(CONFIG_PV) )
>> + goto fatal;
>> +
>> + /*
>> + * First, handle the asynchronous or fatal events. These are either
>> + * unrelated to the interrupted context, or may not have valid context
>> + * recorded, and all have special rules on how/whether to re-enable IRQs.
>> + */
>> + switch ( type )
>> + {
>> + case X86_ET_EXT_INTR:
>> + return do_IRQ(regs);
>>
>> + case X86_ET_NMI:
>> + return do_nmi(regs);
>> +
>> + case X86_ET_HW_EXC:
>> + switch ( vec )
>> + {
>> + case X86_EXC_DF: return do_double_fault(regs);
>> + case X86_EXC_MC: return do_machine_check(regs);
> Looking at patch 21, I came to wonder where it is that we're moving back to
> SL0 in the #MC case (which may not be fatal), for ERETU to not fault.
(Almost) any event taken in Ring3 enters Ring0 at SL0, even those with
custom STK_LVLS configuration.
See 5.1.2 Determining the New Values for Stack Level, RSP, and SSP
Nested exceptions (i.e. a contributory fault) and #DF can end up at SL > 0
with a Ring 3 frame. In principle you'd need to do recovery based on
regs->fred_ss.nested, but Xen doesn't have any contributory exceptions
configured like this.
Under FRED, there are far fewer ways to take a contributory fault:
pagetable corruption affecting the entrypoint or stack, or hitting a stack
guard page. Hitting the guard page will #DF; the others will triple fault.
~Andrew
On 01.09.2025 15:27, Andrew Cooper wrote:
> On 01/09/2025 1:57 pm, Jan Beulich wrote:
>> On 28.08.2025 17:04, Andrew Cooper wrote:
>>> --- a/xen/arch/x86/traps.c
>>> +++ b/xen/arch/x86/traps.c
>>> @@ -2265,9 +2265,83 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
>>>
>>> void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
>>> {
>>> + struct fred_info *fi = cpu_regs_fred_info(regs);
>>> + uint8_t type = regs->fred_ss.type;
>>> + uint8_t vec = regs->fred_ss.vector;
>>> +
>>> /* Copy fred_ss.vector into entry_vector as IDT delivery would have done. */
>>> - regs->entry_vector = regs->fred_ss.vector;
>>> + regs->entry_vector = vec;
>>> +
>>> + if ( !IS_ENABLED(CONFIG_PV) )
>>> + goto fatal;
>>> +
>>> + /*
>>> + * First, handle the asynchronous or fatal events. These are either
>>> + * unrelated to the interrupted context, or may not have valid context
>>> + * recorded, and all have special rules on how/whether to re-enable IRQs.
>>> + */
>>> + switch ( type )
>>> + {
>>> + case X86_ET_EXT_INTR:
>>> + return do_IRQ(regs);
>>>
>>> + case X86_ET_NMI:
>>> + return do_nmi(regs);
>>> +
>>> + case X86_ET_HW_EXC:
>>> + switch ( vec )
>>> + {
>>> + case X86_EXC_DF: return do_double_fault(regs);
>>> + case X86_EXC_MC: return do_machine_check(regs);
>> Looking at patch 21, I came to wonder where it is that we're moving back to
>> SL0 in the #MC case (which may not be fatal), for ERETU to not fault.
>
> (Almost) any event taken in Ring3 enters Ring0 at SL0, even those with
> custom STK_LVLS configuration.
>
> See 5.1.2 Determining the New Values for Stack Level, RSP, and SSP
Oh, right - that's something I still need to properly settle in a corner of
my brain.
Jan
On 28.08.2025 17:04, Andrew Cooper wrote:
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -2265,9 +2265,83 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
>
> void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
> {
> + struct fred_info *fi = cpu_regs_fred_info(regs);
> + uint8_t type = regs->fred_ss.type;
> + uint8_t vec = regs->fred_ss.vector;
> +
> /* Copy fred_ss.vector into entry_vector as IDT delivery would have done. */
> - regs->entry_vector = regs->fred_ss.vector;
> + regs->entry_vector = vec;
> +
> + if ( !IS_ENABLED(CONFIG_PV) )
> + goto fatal;
> +
> + /*
> + * First, handle the asynchronous or fatal events. These are either
> + * unrelated to the interrupted context, or may not have valid context
> + * recorded, and all have special rules on how/whether to re-enable IRQs.
> + */
> + switch ( type )
> + {
> + case X86_ET_EXT_INTR:
> + return do_IRQ(regs);
>
> + case X86_ET_NMI:
> + return do_nmi(regs);
> +
> + case X86_ET_HW_EXC:
> + switch ( vec )
> + {
> + case X86_EXC_DF: return do_double_fault(regs);
> + case X86_EXC_MC: return do_machine_check(regs);
> + }
> + break;
> + }
This switch() is identical to entry_from_xen()'s. Fold into a helper?
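Just to illustrate the shape I have in mind (the helper name and the
bool-return are purely a sketch, not a concrete request):

/*
 * Illustrative only: the asynchronous/fatal event dispatch, as shared with
 * entry_from_xen().  Returns true if the event was fully handled here.
 */
static bool dispatch_async_event(struct cpu_user_regs *regs,
                                 uint8_t type, uint8_t vec)
{
    switch ( type )
    {
    case X86_ET_EXT_INTR:
        do_IRQ(regs);
        return true;

    case X86_ET_NMI:
        do_nmi(regs);
        return true;

    case X86_ET_HW_EXC:
        switch ( vec )
        {
        case X86_EXC_DF: do_double_fault(regs); return true;
        case X86_EXC_MC: do_machine_check(regs); return true;
        }
        break;
    }

    return false;
}

with both entrypoints then reducing to

    if ( dispatch_async_event(regs, type, vec) )
        return;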
> + /*
> + * With the asynchronous events handled, what remains are the synchronous
> + * ones. Guest context always had interrupts enabled.
> + */
> + local_irq_enable();
In the comment, maybe s/Guest/PV guest/?
> + switch ( type )
> + {
> + case X86_ET_HW_EXC:
> + case X86_ET_PRIV_SW_EXC:
> + case X86_ET_SW_EXC:
> + switch ( vec )
> + {
> + case X86_EXC_PF: handle_PF(regs, fi->edata); break;
> + case X86_EXC_GP: do_general_protection(regs); break;
> + case X86_EXC_UD: do_invalid_op(regs); break;
> + case X86_EXC_NM: do_device_not_available(regs); break;
> + case X86_EXC_BP: do_int3(regs); break;
> + case X86_EXC_DB: handle_DB(regs, fi->edata); break;
> +
> + case X86_EXC_DE:
> + case X86_EXC_OF:
> + case X86_EXC_BR:
> + case X86_EXC_NP:
> + case X86_EXC_SS:
> + case X86_EXC_MF:
> + case X86_EXC_AC:
> + case X86_EXC_XM:
> + do_trap(regs);
> + break;
> +
> + case X86_EXC_CP: do_entry_CP(regs); break;
> +
> + default:
> + goto fatal;
> + }
> + break;
This again looks identical to what entry_from_xen() has. Maybe, instead of
a helper for each switch(), we could have a common always-inline function
(with all necessary parametrization) that both invoke?
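As a rough illustration of what I mean (names made up, and confining the
sketch to this second switch(); the asynchronous dispatch above could be
folded in the same way, with the differing IRQ-enable policy becoming the
parametrization, while the fatal fallback stays with the callers):

/*
 * Illustrative only: the synchronous exception dispatch, identical between
 * entry_from_xen() and entry_from_pv().  Returns false for anything the
 * caller should treat as fatal.
 */
static always_inline bool dispatch_sync_exception(struct cpu_user_regs *regs,
                                                  const struct fred_info *fi,
                                                  uint8_t vec)
{
    switch ( vec )
    {
    case X86_EXC_PF: handle_PF(regs, fi->edata); break;
    case X86_EXC_GP: do_general_protection(regs); break;
    case X86_EXC_UD: do_invalid_op(regs); break;
    case X86_EXC_NM: do_device_not_available(regs); break;
    case X86_EXC_BP: do_int3(regs); break;
    case X86_EXC_DB: handle_DB(regs, fi->edata); break;

    case X86_EXC_DE:
    case X86_EXC_OF:
    case X86_EXC_BR:
    case X86_EXC_NP:
    case X86_EXC_SS:
    case X86_EXC_MF:
    case X86_EXC_AC:
    case X86_EXC_XM:
        do_trap(regs);
        break;

    case X86_EXC_CP: do_entry_CP(regs); break;

    default:
        return false;
    }

    return true;
}

entry_from_pv()'s tail then becomes

    if ( !dispatch_sync_exception(regs, fi, vec) )
        goto fatal;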
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -63,7 +63,7 @@ UNLIKELY_END(syscall_no_callback)
> /* Conditionally clear DF */
> and %esi, UREGS_eflags(%rsp)
> /* %rbx: struct vcpu */
> -test_all_events:
> +LABEL(test_all_events, 0)
> ASSERT_NOT_IN_ATOMIC
> cli # tests must not race interrupts
> /*test_softirqs:*/
> @@ -152,6 +152,8 @@ END(switch_to_kernel)
> FUNC_LOCAL(restore_all_guest)
> ASSERT_INTERRUPTS_DISABLED
>
> + ALTERNATIVE "", "jmp eretu_exit_to_guest", X86_FEATURE_XEN_FRED
> +
> /* Stash guest SPEC_CTRL value while we can read struct vcpu. */
> mov VCPU_arch_msrs(%rbx), %rdx
I assume it's deliberate that you don't "consume" this insn into the
alternative, but without the description saying anything it's not quite
clear why.
Jan