[PATCH v2 20/23] x86/pv: Exception handling in FRED mode

Andrew Cooper posted 23 patches 2 months ago
There is a newer version of this series
[PATCH v2 20/23] x86/pv: Exception handling in FRED mode
Posted by Andrew Cooper 2 months ago
Under FRED, entry_from_pv() handles everything.  To start with, implement
exception handling in the same manner as entry_from_xen(), although we can
unconditionally enable interrupts after the async/fatal events.

After entry_from_pv() returns, test_all_events() needs to run to perform
exception and interrupt injection.  Split entry_FRED_R3() into two and
introduce eretu_exit_to_guest() as the latter half, reached unconditionally
from restore_all_guest().

For all of this, there is a slightly complicated relationship with CONFIG_PV.
entry_FRED_R3() must exist irrespective of CONFIG_PV, because it's the
entrypoint registered with hardware.  For simplicity, entry_from_pv() is
always called, but it collapses into fatal_trap() in the !PV case.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>

v2:
 * New
---
 xen/arch/x86/traps.c             | 76 +++++++++++++++++++++++++++++++-
 xen/arch/x86/x86_64/entry-fred.S | 13 +++++-
 xen/arch/x86/x86_64/entry.S      |  4 +-
 3 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 67763bec0dc5..72df446a6a78 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2265,9 +2265,83 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
 
 void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
 {
+    struct fred_info *fi = cpu_regs_fred_info(regs);
+    uint8_t type = regs->fred_ss.type;
+    uint8_t vec = regs->fred_ss.vector;
+
     /* Copy fred_ss.vector into entry_vector as IDT delivery would have done. */
-    regs->entry_vector = regs->fred_ss.vector;
+    regs->entry_vector = vec;
+
+    if ( !IS_ENABLED(CONFIG_PV) )
+        goto fatal;
+
+    /*
+     * First, handle the asynchronous or fatal events.  These are either
+     * unrelated to the interrupted context, or may not have valid context
+     * recorded, and all have special rules on how/whether to re-enable IRQs.
+     */
+    switch ( type )
+    {
+    case X86_ET_EXT_INTR:
+        return do_IRQ(regs);
 
+    case X86_ET_NMI:
+        return do_nmi(regs);
+
+    case X86_ET_HW_EXC:
+        switch ( vec )
+        {
+        case X86_EXC_DF: return do_double_fault(regs);
+        case X86_EXC_MC: return do_machine_check(regs);
+        }
+        break;
+    }
+
+    /*
+     * With the asynchronous events handled, what remains are the synchronous
+     * ones.  Guest context always had interrupts enabled.
+     */
+    local_irq_enable();
+
+    switch ( type )
+    {
+    case X86_ET_HW_EXC:
+    case X86_ET_PRIV_SW_EXC:
+    case X86_ET_SW_EXC:
+        switch ( vec )
+        {
+        case X86_EXC_PF:  handle_PF(regs, fi->edata); break;
+        case X86_EXC_GP:  do_general_protection(regs); break;
+        case X86_EXC_UD:  do_invalid_op(regs); break;
+        case X86_EXC_NM:  do_device_not_available(regs); break;
+        case X86_EXC_BP:  do_int3(regs); break;
+        case X86_EXC_DB:  handle_DB(regs, fi->edata); break;
+
+        case X86_EXC_DE:
+        case X86_EXC_OF:
+        case X86_EXC_BR:
+        case X86_EXC_NP:
+        case X86_EXC_SS:
+        case X86_EXC_MF:
+        case X86_EXC_AC:
+        case X86_EXC_XM:
+            do_trap(regs);
+            break;
+
+        case X86_EXC_CP:  do_entry_CP(regs); break;
+
+        default:
+            goto fatal;
+        }
+        break;
+
+    default:
+        goto fatal;
+    }
+
+    return;
+
+ fatal:
     fatal_trap(regs, false);
 }
 
diff --git a/xen/arch/x86/x86_64/entry-fred.S b/xen/arch/x86/x86_64/entry-fred.S
index 3c3320df22cb..07684f38a078 100644
--- a/xen/arch/x86/x86_64/entry-fred.S
+++ b/xen/arch/x86/x86_64/entry-fred.S
@@ -15,9 +15,20 @@ FUNC(entry_FRED_R3, 4096)
         mov     %rsp, %rdi
         call    entry_from_pv
 
+#ifndef CONFIG_PV
+        BUG     /* Not Reached */
+#else
+        GET_STACK_END(14)
+        movq    STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
+
+        jmp     test_all_events
+#endif
+END(entry_FRED_R3)
+
+FUNC(eretu_exit_to_guest)
         POP_GPRS
         eretu
-END(entry_FRED_R3)
+END(eretu_exit_to_guest)
 
         /* The Ring0 entrypoint is at Ring3 + 0x100. */
         .org entry_FRED_R3 + 0x100, 0xcc
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index ca446c6ff0ce..0692163faa44 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -63,7 +63,7 @@ UNLIKELY_END(syscall_no_callback)
         /* Conditionally clear DF */
         and   %esi, UREGS_eflags(%rsp)
 /* %rbx: struct vcpu */
-test_all_events:
+LABEL(test_all_events, 0)
         ASSERT_NOT_IN_ATOMIC
         cli                             # tests must not race interrupts
 /*test_softirqs:*/
@@ -152,6 +152,8 @@ END(switch_to_kernel)
 FUNC_LOCAL(restore_all_guest)
         ASSERT_INTERRUPTS_DISABLED
 
+        ALTERNATIVE "", "jmp eretu_exit_to_guest", X86_FEATURE_XEN_FRED
+
         /* Stash guest SPEC_CTRL value while we can read struct vcpu. */
         mov VCPU_arch_msrs(%rbx), %rdx
         mov VCPUMSR_spec_ctrl_raw(%rdx), %r15d
-- 
2.39.5


Re: [PATCH v2 20/23] x86/pv: Exception handling in FRED mode
Posted by Jan Beulich 1 month, 4 weeks ago
On 28.08.2025 17:04, Andrew Cooper wrote:
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -2265,9 +2265,83 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
>  
>  void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
>  {
> +    struct fred_info *fi = cpu_regs_fred_info(regs);
> +    uint8_t type = regs->fred_ss.type;
> +    uint8_t vec = regs->fred_ss.vector;
> +
>      /* Copy fred_ss.vector into entry_vector as IDT delivery would have done. */
> -    regs->entry_vector = regs->fred_ss.vector;
> +    regs->entry_vector = vec;
> +
> +    if ( !IS_ENABLED(CONFIG_PV) )
> +        goto fatal;
> +
> +    /*
> +     * First, handle the asynchronous or fatal events.  These are either
> +     * unrelated to the interrupted context, or may not have valid context
> +     * recorded, and all have special rules on how/whether to re-enable IRQs.
> +     */
> +    switch ( type )
> +    {
> +    case X86_ET_EXT_INTR:
> +        return do_IRQ(regs);
>  
> +    case X86_ET_NMI:
> +        return do_nmi(regs);
> +
> +    case X86_ET_HW_EXC:
> +        switch ( vec )
> +        {
> +        case X86_EXC_DF: return do_double_fault(regs);
> +        case X86_EXC_MC: return do_machine_check(regs);

Looking at patch 21, I came to wonder where it is that we're moving back to
SL0 in the #MC case (which may not be fatal), for ERETU to not fault.

Jan
Re: [PATCH v2 20/23] x86/pv: Exception handling in FRED mode
Posted by Andrew Cooper 1 month, 4 weeks ago
On 01/09/2025 1:57 pm, Jan Beulich wrote:
> On 28.08.2025 17:04, Andrew Cooper wrote:
>> --- a/xen/arch/x86/traps.c
>> +++ b/xen/arch/x86/traps.c
>> @@ -2265,9 +2265,83 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
>>  
>>  void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
>>  {
>> +    struct fred_info *fi = cpu_regs_fred_info(regs);
>> +    uint8_t type = regs->fred_ss.type;
>> +    uint8_t vec = regs->fred_ss.vector;
>> +
>>      /* Copy fred_ss.vector into entry_vector as IDT delivery would have done. */
>> -    regs->entry_vector = regs->fred_ss.vector;
>> +    regs->entry_vector = vec;
>> +
>> +    if ( !IS_ENABLED(CONFIG_PV) )
>> +        goto fatal;
>> +
>> +    /*
>> +     * First, handle the asynchronous or fatal events.  These are either
>> +     * unrelated to the interrupted context, or may not have valid context
>> +     * recorded, and all have special rules on how/whether to re-enable IRQs.
>> +     */
>> +    switch ( type )
>> +    {
>> +    case X86_ET_EXT_INTR:
>> +        return do_IRQ(regs);
>>  
>> +    case X86_ET_NMI:
>> +        return do_nmi(regs);
>> +
>> +    case X86_ET_HW_EXC:
>> +        switch ( vec )
>> +        {
>> +        case X86_EXC_DF: return do_double_fault(regs);
>> +        case X86_EXC_MC: return do_machine_check(regs);
> Looking at patch 21, I came to wonder where it is that we're moving back to
> SL0 in the #MC case (which may not be fatal), for ERETU to not fault.

(Almost) any event taken in Ring3 enters Ring0 at SL0, even those with
custom STK_LVLS configuration.

See 5.1.2 Determining the New Values for Stack Level, RSP, and SSP

Nested exceptions (i.e contributory fault) and #DF can end up at SL > 0
with a Ring 3 frame.  In principle you'd need to do recovery based on
regs->fred_ss.nested but Xen doesn't have any contributory exceptions
configured like this.

Under FRED, there are far fewer ways to take a contributory fault. 
Pagetable corruption for the entrypoint or stack, or hitting a stack
guard page.  Hitting the guard page will #DF; others will triple fault.

~Andrew

Re: [PATCH v2 20/23] x86/pv: Exception handling in FRED mode
Posted by Jan Beulich 1 month, 4 weeks ago
On 01.09.2025 15:27, Andrew Cooper wrote:
> On 01/09/2025 1:57 pm, Jan Beulich wrote:
>> On 28.08.2025 17:04, Andrew Cooper wrote:
>>> --- a/xen/arch/x86/traps.c
>>> +++ b/xen/arch/x86/traps.c
>>> @@ -2265,9 +2265,83 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
>>>  
>>>  void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
>>>  {
>>> +    struct fred_info *fi = cpu_regs_fred_info(regs);
>>> +    uint8_t type = regs->fred_ss.type;
>>> +    uint8_t vec = regs->fred_ss.vector;
>>> +
>>>      /* Copy fred_ss.vector into entry_vector as IDT delivery would have done. */
>>> -    regs->entry_vector = regs->fred_ss.vector;
>>> +    regs->entry_vector = vec;
>>> +
>>> +    if ( !IS_ENABLED(CONFIG_PV) )
>>> +        goto fatal;
>>> +
>>> +    /*
>>> +     * First, handle the asynchronous or fatal events.  These are either
>>> +     * unrelated to the interrupted context, or may not have valid context
>>> +     * recorded, and all have special rules on how/whether to re-enable IRQs.
>>> +     */
>>> +    switch ( type )
>>> +    {
>>> +    case X86_ET_EXT_INTR:
>>> +        return do_IRQ(regs);
>>>  
>>> +    case X86_ET_NMI:
>>> +        return do_nmi(regs);
>>> +
>>> +    case X86_ET_HW_EXC:
>>> +        switch ( vec )
>>> +        {
>>> +        case X86_EXC_DF: return do_double_fault(regs);
>>> +        case X86_EXC_MC: return do_machine_check(regs);
>> Looking at patch 21, I came to wonder where it is that we're moving back to
>> SL0 in the #MC case (which may not be fatal), for ERETU to not fault.
> 
> (Almost) any event taken in Ring3 enters Ring0 at SL0, even those with
> custom STK_LVLS configuration.
> 
> See 5.1.2 Determining the New Values for Stack Level, RSP, and SSP

Oh, right - that's something I still need to properly settle in a corner of
my brain.

Jan
Re: [PATCH v2 20/23] x86/pv: Exception handling in FRED mode
Posted by Jan Beulich 1 month, 4 weeks ago
On 28.08.2025 17:04, Andrew Cooper wrote:
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -2265,9 +2265,83 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
>  
>  void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
>  {
> +    struct fred_info *fi = cpu_regs_fred_info(regs);
> +    uint8_t type = regs->fred_ss.type;
> +    uint8_t vec = regs->fred_ss.vector;
> +
>      /* Copy fred_ss.vector into entry_vector as IDT delivery would have done. */
> -    regs->entry_vector = regs->fred_ss.vector;
> +    regs->entry_vector = vec;
> +
> +    if ( !IS_ENABLED(CONFIG_PV) )
> +        goto fatal;
> +
> +    /*
> +     * First, handle the asynchronous or fatal events.  These are either
> +     * unrelated to the interrupted context, or may not have valid context
> +     * recorded, and all have special rules on how/whether to re-enable IRQs.
> +     */
> +    switch ( type )
> +    {
> +    case X86_ET_EXT_INTR:
> +        return do_IRQ(regs);
>  
> +    case X86_ET_NMI:
> +        return do_nmi(regs);
> +
> +    case X86_ET_HW_EXC:
> +        switch ( vec )
> +        {
> +        case X86_EXC_DF: return do_double_fault(regs);
> +        case X86_EXC_MC: return do_machine_check(regs);
> +        }
> +        break;
> +    }

This switch() is identical to entry_from_xen()'s. Fold into a helper?

> +    /*
> +     * With the asynchronous events handled, what remains are the synchronous
> +     * ones.  Guest context always had interrupts enabled.
> +     */
> +    local_irq_enable();

In the comment, maybe s/Guest/PV guest/?

> +    switch ( type )
> +    {
> +    case X86_ET_HW_EXC:
> +    case X86_ET_PRIV_SW_EXC:
> +    case X86_ET_SW_EXC:
> +        switch ( vec )
> +        {
> +        case X86_EXC_PF:  handle_PF(regs, fi->edata); break;
> +        case X86_EXC_GP:  do_general_protection(regs); break;
> +        case X86_EXC_UD:  do_invalid_op(regs); break;
> +        case X86_EXC_NM:  do_device_not_available(regs); break;
> +        case X86_EXC_BP:  do_int3(regs); break;
> +        case X86_EXC_DB:  handle_DB(regs, fi->edata); break;
> +
> +        case X86_EXC_DE:
> +        case X86_EXC_OF:
> +        case X86_EXC_BR:
> +        case X86_EXC_NP:
> +        case X86_EXC_SS:
> +        case X86_EXC_MF:
> +        case X86_EXC_AC:
> +        case X86_EXC_XM:
> +            do_trap(regs);
> +            break;
> +
> +        case X86_EXC_CP:  do_entry_CP(regs); break;
> +
> +        default:
> +            goto fatal;
> +        }
> +        break;

This again looks identical to when entry_from_xen() has. Maybe, instead of
a helper for each switch(), we could have a common always-inline function
(with all necessary parametrization) that both invoke?

> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -63,7 +63,7 @@ UNLIKELY_END(syscall_no_callback)
>          /* Conditionally clear DF */
>          and   %esi, UREGS_eflags(%rsp)
>  /* %rbx: struct vcpu */
> -test_all_events:
> +LABEL(test_all_events, 0)
>          ASSERT_NOT_IN_ATOMIC
>          cli                             # tests must not race interrupts
>  /*test_softirqs:*/
> @@ -152,6 +152,8 @@ END(switch_to_kernel)
>  FUNC_LOCAL(restore_all_guest)
>          ASSERT_INTERRUPTS_DISABLED
>  
> +        ALTERNATIVE "", "jmp eretu_exit_to_guest", X86_FEATURE_XEN_FRED
> +
>          /* Stash guest SPEC_CTRL value while we can read struct vcpu. */
>          mov VCPU_arch_msrs(%rbx), %rdx

I assume it's deliberate that you don't "consume" this insn into the
alternative, but without the description saying anything it's not quite
clear why.

Jan