[PATCH] x86/fred: Send an EVENT_CHECK IPI on exit from NMI

Andrew Cooper posted 1 patch 1 week, 3 days ago
Patches applied successfully (tree, apply log)
git fetch https://gitlab.com/xen-project/patchew/xen tags/patchew/20260624142338.653064-1-andrew.cooper3@citrix.com
xen/arch/x86/x86_64/entry-fred.S |  6 ++++
xen/arch/x86/x86_64/entry.S      | 49 +++++++++++++++++++-------------
2 files changed, 36 insertions(+), 19 deletions(-)
[PATCH] x86/fred: Send an EVENT_CHECK IPI on exit from NMI
Posted by Andrew Cooper 1 week, 3 days ago
Returning from an NMI which hits guest context needs special casing in FRED
mode just like it does in IDT mode.

Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in
entry_FRED_R3() also.

Expand the comment a little, and invert the conditional jump to
compat_restore_all_guest() to avoid needing an #else clause for CONFIG_PV32.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <jbeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Teddy Astie <teddy.astie@vates.tech>
CC: Oleksii Kurochko <oleksii.kurochko@gmail.com>

Slightly RFC, not tested yet.  (My AMD system takes an eternity to reboot)

For 4.22.  Found during testing of FRED.  The consqeuence is that we can end
up scheduling while still in NMI context, after which things like the watchdog
and other diagnostics don't work properly.
---
 xen/arch/x86/x86_64/entry-fred.S |  6 ++++
 xen/arch/x86/x86_64/entry.S      | 49 +++++++++++++++++++-------------
 2 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/xen/arch/x86/x86_64/entry-fred.S b/xen/arch/x86/x86_64/entry-fred.S
index e9c84423dacd..1ad9694a043b 100644
--- a/xen/arch/x86/x86_64/entry-fred.S
+++ b/xen/arch/x86/x86_64/entry-fred.S
@@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096)
         GET_STACK_END(14)
         movq    STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
 
+        /* NMIs need special handling on return to guest. */
+        movzbl  UREGS_ss + 6(%rsp), %eax
+        and     $0xf, %eax
+        cmp     $X86_ET_NMI, %al
+        je      nmi_exit_to_guest
+
         jmp     test_all_events
 #else
         BUG     /* Not Reached */
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 17ca6a493906..de5d854f5533 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -146,6 +146,35 @@ process_trap:
         jmp  test_all_events
 END(switch_to_kernel)
 
+/*
+ * When returning to guest from an NMI, we must execute an IRET/ERETU to
+ * re-enable NMIs, and must not process softirqs which can e.g. schedule
+ * rather than returning to guest context.
+ *
+ * If a softirq is pending, send ourselves an EVENT_CHECK IPI to compensate.
+ * This will cause softirq processing to occur upon leaving NMI context.
+ *
+ * %rbx: struct vcpu, %r14 stack_end
+ */
+FUNC(nmi_exit_to_guest)
+        mov     STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
+        shl     $IRQSTAT_shift, %eax
+        lea     irq_stat + IRQSTAT_softirq_pending(%rip), %rcx
+        cmpl    $0, (%rcx, %rax, 1)
+        je      1f
+        mov     $EVENT_CHECK_VECTOR, %edi
+        call    send_IPI_self
+1:
+        /* For restore_all_guest. */
+        mov     STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
+#ifdef CONFIG_PV32
+        mov     VCPU_domain(%rbx), %rax
+        cmpb    $0, DOMAIN_is_32bit_pv(%rax)
+        jne     compat_restore_all_guest
+#endif
+        jmp     restore_all_guest
+END(nmi_exit_to_guest)
+
         .section .text.entry, "ax", @progbits
 
 /* %rbx: struct vcpu, interrupts disabled */
@@ -1209,25 +1238,7 @@ FUNC(handle_ist_exception)
 #ifdef CONFIG_PV
         testb $3,UREGS_cs(%rsp)
         jz    restore_all_xen
-        /* Send an IPI to ourselves to cover for the lack of event checking. */
-        mov   STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
-        shll  $IRQSTAT_shift,%eax
-        leaq  irq_stat+IRQSTAT_softirq_pending(%rip),%rcx
-        cmpl  $0,(%rcx,%rax,1)
-        je    1f
-        movl  $EVENT_CHECK_VECTOR,%edi
-        call  send_IPI_self
-1:
-        /* For restore_all_guest. */
-        mov   STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
-#ifdef CONFIG_PV32
-        movq  VCPU_domain(%rbx),%rax
-        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
-        je    restore_all_guest
-        jmp   compat_restore_all_guest
-#else
-        jmp   restore_all_guest
-#endif
+        jmp   nmi_exit_to_guest
 #else
         ASSERT_CONTEXT_IS_XEN
         jmp   restore_all_xen

base-commit: d42ace60290a4b4184ee2133b245b134fdf96fed
-- 
2.39.5


Re: [PATCH] x86/fred: Send an EVENT_CHECK IPI on exit from NMI
Posted by Oleksii Kurochko 1 week, 2 days ago

On 6/24/26 4:23 PM, Andrew Cooper wrote:
> Returning from an NMI which hits guest context needs special casing in FRED
> mode just like it does in IDT mode.
> 
> Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in
> entry_FRED_R3() also.
> 
> Expand the comment a little, and invert the conditional jump to
> compat_restore_all_guest() to avoid needing an #else clause for CONFIG_PV32.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Jan Beulich <jbeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> CC: Teddy Astie <teddy.astie@vates.tech>
> CC: Oleksii Kurochko <oleksii.kurochko@gmail.com>
> 
> Slightly RFC, not tested yet.  (My AMD system takes an eternity to reboot)

I would like to have a test on hardware to verify that it doesn't break 
something else. With that:

> 
> For 4.22.  Found during testing of FRED.  The consqeuence is that we can end
> up scheduling while still in NMI context, after which things like the watchdog
> and other diagnostics don't work properly.
> ---

  Relase-Acked-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>

Thanks.

~ Oleksii

Re: [PATCH] x86/fred: Send an EVENT_CHECK IPI on exit from NMI
Posted by Andrew Cooper 1 week, 2 days ago
On 25/06/2026 10:53 am, Oleksii Kurochko wrote:
>
>
> On 6/24/26 4:23 PM, Andrew Cooper wrote:
>> Returning from an NMI which hits guest context needs special casing
>> in FRED
>> mode just like it does in IDT mode.
>>
>> Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in
>> entry_FRED_R3() also.
>>
>> Expand the comment a little, and invert the conditional jump to
>> compat_restore_all_guest() to avoid needing an #else clause for
>> CONFIG_PV32.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> ---
>> CC: Jan Beulich <jbeulich@suse.com>
>> CC: Roger Pau Monné <roger.pau@citrix.com>
>> CC: Teddy Astie <teddy.astie@vates.tech>
>> CC: Oleksii Kurochko <oleksii.kurochko@gmail.com>
>>
>> Slightly RFC, not tested yet.  (My AMD system takes an eternity to
>> reboot)
>
> I would like to have a test on hardware to verify that it doesn't
> break something else.

Yes, just confirmed elsewhere on this thread.

>  With that:
>
>>
>> For 4.22.  Found during testing of FRED.  The consqeuence is that we
>> can end
>> up scheduling while still in NMI context, after which things like the
>> watchdog
>> and other diagnostics don't work properly.
>> ---
>
>  Relase-Acked-by: Oleksii Kurochko <oleksii.kurochko@gmail.com> 

Thankyou.

~Andrew

Re: [PATCH] x86/fred: Send an EVENT_CHECK IPI on exit from NMI
Posted by Jan Beulich 1 week, 3 days ago
On 24.06.2026 16:23, Andrew Cooper wrote:
> --- a/xen/arch/x86/x86_64/entry-fred.S
> +++ b/xen/arch/x86/x86_64/entry-fred.S
> @@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096)
>          GET_STACK_END(14)
>          movq    STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>  
> +        /* NMIs need special handling on return to guest. */
> +        movzbl  UREGS_ss + 6(%rsp), %eax
> +        and     $0xf, %eax
> +        cmp     $X86_ET_NMI, %al
> +        je      nmi_exit_to_guest
> +
>          jmp     test_all_events

Actually, how about shrinking this to just

        test    %al, %al
        jnz     nmi_exit_to_guest

by having entry_from_pv() return a boolean?

Jan
Re: [PATCH] x86/fred: Send an EVENT_CHECK IPI on exit from NMI
Posted by Andrew Cooper 1 week, 3 days ago
On 24/06/2026 3:47 pm, Jan Beulich wrote:
> On 24.06.2026 16:23, Andrew Cooper wrote:
>> --- a/xen/arch/x86/x86_64/entry-fred.S
>> +++ b/xen/arch/x86/x86_64/entry-fred.S
>> @@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096)
>>          GET_STACK_END(14)
>>          movq    STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>>  
>> +        /* NMIs need special handling on return to guest. */
>> +        movzbl  UREGS_ss + 6(%rsp), %eax
>> +        and     $0xf, %eax
>> +        cmp     $X86_ET_NMI, %al
>> +        je      nmi_exit_to_guest
>> +
>>          jmp     test_all_events
> Actually, how about shrinking this to just
>
>         test    %al, %al
>         jnz     nmi_exit_to_guest
>
> by having entry_from_pv() return a boolean?

I considered that, and dismissed it.

It involve changing large chunks of traps.c (simply to compile) and puts
far more than 4 instructions of logic onto the common path.

~Andrew
Re: [PATCH] x86/fred: Send an EVENT_CHECK IPI on exit from NMI
Posted by Jan Beulich 1 week, 3 days ago
On 24.06.2026 17:09, Andrew Cooper wrote:
> On 24/06/2026 3:47 pm, Jan Beulich wrote:
>> On 24.06.2026 16:23, Andrew Cooper wrote:
>>> --- a/xen/arch/x86/x86_64/entry-fred.S
>>> +++ b/xen/arch/x86/x86_64/entry-fred.S
>>> @@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096)
>>>          GET_STACK_END(14)
>>>          movq    STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>>>  
>>> +        /* NMIs need special handling on return to guest. */
>>> +        movzbl  UREGS_ss + 6(%rsp), %eax
>>> +        and     $0xf, %eax
>>> +        cmp     $X86_ET_NMI, %al
>>> +        je      nmi_exit_to_guest
>>> +
>>>          jmp     test_all_events
>> Actually, how about shrinking this to just
>>
>>         test    %al, %al
>>         jnz     nmi_exit_to_guest
>>
>> by having entry_from_pv() return a boolean?
> 
> I considered that, and dismissed it.
> 
> It involve changing large chunks of traps.c (simply to compile) and puts
> far more than 4 instructions of logic onto the common path.

Does it? Code size overall may grow, but since every return there is that
of a constant (false in all but one case), every individual path would go
down from 4 insns (with a memory access, admittedly a cache-hot one) to
just 3 (the two here and the MOV or XOR on the return paths).

Jan
Re: [PATCH] x86/fred: Send an EVENT_CHECK IPI on exit from NMI
Posted by Jan Beulich 1 week, 3 days ago
On 24.06.2026 16:23, Andrew Cooper wrote:
> Returning from an NMI which hits guest context needs special casing in FRED
> mode just like it does in IDT mode.
> 
> Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in
> entry_FRED_R3() also.
> 
> Expand the comment a little, and invert the conditional jump to
> compat_restore_all_guest() to avoid needing an #else clause for CONFIG_PV32.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Reviewed-by: Jan Beulich <jbeulich@suse.com>
provided of course ...

> Slightly RFC, not tested yet.  (My AMD system takes an eternity to reboot)

... the results of this won't prove it wrong.

> For 4.22.  Found during testing of FRED.  The consqeuence is that we can end
> up scheduling while still in NMI context, after which things like the watchdog
> and other diagnostics don't work properly.

May therefore want a Fixes: tag (it'll also want backporting aiui).

> --- a/xen/arch/x86/x86_64/entry-fred.S
> +++ b/xen/arch/x86/x86_64/entry-fred.S
> @@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096)
>          GET_STACK_END(14)
>          movq    STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>  
> +        /* NMIs need special handling on return to guest. */
> +        movzbl  UREGS_ss + 6(%rsp), %eax
> +        and     $0xf, %eax

As you may be aware, I'm not overly happy with such literal numbers. But
well, alternatives look a little involved. So just a remark, not a request
to consider any kind of adjustment.

> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -146,6 +146,35 @@ process_trap:
>          jmp  test_all_events
>  END(switch_to_kernel)
>  
> +/*
> + * When returning to guest from an NMI, we must execute an IRET/ERETU to
> + * re-enable NMIs, and must not process softirqs which can e.g. schedule
> + * rather than returning to guest context.
> + *
> + * If a softirq is pending, send ourselves an EVENT_CHECK IPI to compensate.
> + * This will cause softirq processing to occur upon leaving NMI context.
> + *
> + * %rbx: struct vcpu, %r14 stack_end
> + */
> +FUNC(nmi_exit_to_guest)
> +        mov     STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
> +        shl     $IRQSTAT_shift, %eax
> +        lea     irq_stat + IRQSTAT_softirq_pending(%rip), %rcx
> +        cmpl    $0, (%rcx, %rax, 1)
> +        je      1f
> +        mov     $EVENT_CHECK_VECTOR, %edi
> +        call    send_IPI_self
> +1:
> +        /* For restore_all_guest. */
> +        mov     STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
> +#ifdef CONFIG_PV32
> +        mov     VCPU_domain(%rbx), %rax
> +        cmpb    $0, DOMAIN_is_32bit_pv(%rax)

Would you be open to a little bit of trickery here while you move the code?
The low 12 bits of %rbx are clear, so instead of $0 we could use %bl here.

> +        jne     compat_restore_all_guest
> +#endif
> +        jmp     restore_all_guest
> +END(nmi_exit_to_guest)

Much like you flipped the Jcc/JMP here, ...

> @@ -1209,25 +1238,7 @@ FUNC(handle_ist_exception)
>  #ifdef CONFIG_PV
>          testb $3,UREGS_cs(%rsp)
>          jz    restore_all_xen

... how about also making this plus ...

> -        /* Send an IPI to ourselves to cover for the lack of event checking. */
> -        mov   STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
> -        shll  $IRQSTAT_shift,%eax
> -        leaq  irq_stat+IRQSTAT_softirq_pending(%rip),%rcx
> -        cmpl  $0,(%rcx,%rax,1)
> -        je    1f
> -        movl  $EVENT_CHECK_VECTOR,%edi
> -        call  send_IPI_self
> -1:
> -        /* For restore_all_guest. */
> -        mov   STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
> -#ifdef CONFIG_PV32
> -        movq  VCPU_domain(%rbx),%rax
> -        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
> -        je    restore_all_guest
> -        jmp   compat_restore_all_guest
> -#else
> -        jmp   restore_all_guest
> -#endif
> +        jmp   nmi_exit_to_guest

... this

        jnz   nmi_exit_to_guest
        jmp   restore_all_xen

then allowing to fold with ...

>  #else
>          ASSERT_CONTEXT_IS_XEN
>          jmp   restore_all_xen

... this?

Jan
Re: [PATCH for-4.22] x86/fred: Send an EVENT_CHECK IPI on exit from NMI
Posted by Andrew Cooper 1 week, 2 days ago
On 24/06/2026 3:43 pm, Jan Beulich wrote:
> On 24.06.2026 16:23, Andrew Cooper wrote:
>> Returning from an NMI which hits guest context needs special casing in FRED
>> mode just like it does in IDT mode.
>>
>> Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in
>> entry_FRED_R3() also.
>>
>> Expand the comment a little, and invert the conditional jump to
>> compat_restore_all_guest() to avoid needing an #else clause for CONFIG_PV32.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> provided of course ...
>
>> Slightly RFC, not tested yet.  (My AMD system takes an eternity to reboot)
> ... the results of this won't prove it wrong.

Testing is good.  We now get core register state on all CPUs, rather
than missing the ones which were in guest context.

~Andrew


Re: [PATCH] x86/fred: Send an EVENT_CHECK IPI on exit from NMI
Posted by Andrew Cooper 1 week, 3 days ago
On 24/06/2026 3:43 pm, Jan Beulich wrote:
> On 24.06.2026 16:23, Andrew Cooper wrote:
>> Returning from an NMI which hits guest context needs special casing in FRED
>> mode just like it does in IDT mode.
>>
>> Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in
>> entry_FRED_R3() also.
>>
>> Expand the comment a little, and invert the conditional jump to
>> compat_restore_all_guest() to avoid needing an #else clause for CONFIG_PV32.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> provided of course ...
>
>> Slightly RFC, not tested yet.  (My AMD system takes an eternity to reboot)
> ... the results of this won't prove it wrong.
>
>> For 4.22.  Found during testing of FRED.  The consqeuence is that we can end
>> up scheduling while still in NMI context, after which things like the watchdog
>> and other diagnostics don't work properly.
> May therefore want a Fixes: tag (it'll also want backporting aiui).

Ah yes, I'd meant to set one, but forgot.

Fixes: 87cfcbe9f0b5 ("x86/pv: Guest exception handling in FRED mode")

>
>> --- a/xen/arch/x86/x86_64/entry-fred.S
>> +++ b/xen/arch/x86/x86_64/entry-fred.S
>> @@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096)
>>          GET_STACK_END(14)
>>          movq    STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>>  
>> +        /* NMIs need special handling on return to guest. */
>> +        movzbl  UREGS_ss + 6(%rsp), %eax
>> +        and     $0xf, %eax
> As you may be aware, I'm not overly happy with such literal numbers. But
> well, alternatives look a little involved. So just a remark, not a request
> to consider any kind of adjustment.

The 0xf cannot usefully be anything else.  It's the width of the event
type field in a FRED frame, but you need to visually see it's less than
0xff or the switch from %eax to %al looks wrong.

The +6 can't be generated by asm-offsets because the infrastructure
doesn't work on bitfields.

>
>> --- a/xen/arch/x86/x86_64/entry.S
>> +++ b/xen/arch/x86/x86_64/entry.S
>> @@ -146,6 +146,35 @@ process_trap:
>>          jmp  test_all_events
>>  END(switch_to_kernel)
>>  
>> +/*
>> + * When returning to guest from an NMI, we must execute an IRET/ERETU to
>> + * re-enable NMIs, and must not process softirqs which can e.g. schedule
>> + * rather than returning to guest context.
>> + *
>> + * If a softirq is pending, send ourselves an EVENT_CHECK IPI to compensate.
>> + * This will cause softirq processing to occur upon leaving NMI context.
>> + *
>> + * %rbx: struct vcpu, %r14 stack_end
>> + */
>> +FUNC(nmi_exit_to_guest)
>> +        mov     STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
>> +        shl     $IRQSTAT_shift, %eax
>> +        lea     irq_stat + IRQSTAT_softirq_pending(%rip), %rcx
>> +        cmpl    $0, (%rcx, %rax, 1)
>> +        je      1f
>> +        mov     $EVENT_CHECK_VECTOR, %edi
>> +        call    send_IPI_self
>> +1:
>> +        /* For restore_all_guest. */
>> +        mov     STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>> +#ifdef CONFIG_PV32
>> +        mov     VCPU_domain(%rbx), %rax
>> +        cmpb    $0, DOMAIN_is_32bit_pv(%rax)
> Would you be open to a little bit of trickery here while you move the code?
> The low 12 bits of %rbx are clear, so instead of $0 we could use %bl here.

struct vcpu being page aligned is a convenience not a requirement.  It's
hard alignment requirements are 32b and even then only with CONFIG_SHADOW.



>
>> +        jne     compat_restore_all_guest
>> +#endif
>> +        jmp     restore_all_guest
>> +END(nmi_exit_to_guest)
> Much like you flipped the Jcc/JMP here, ...
>
>> @@ -1209,25 +1238,7 @@ FUNC(handle_ist_exception)
>>  #ifdef CONFIG_PV
>>          testb $3,UREGS_cs(%rsp)
>>          jz    restore_all_xen
> ... how about also making this plus ...
>
>> -        /* Send an IPI to ourselves to cover for the lack of event checking. */
>> -        mov   STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
>> -        shll  $IRQSTAT_shift,%eax
>> -        leaq  irq_stat+IRQSTAT_softirq_pending(%rip),%rcx
>> -        cmpl  $0,(%rcx,%rax,1)
>> -        je    1f
>> -        movl  $EVENT_CHECK_VECTOR,%edi
>> -        call  send_IPI_self
>> -1:
>> -        /* For restore_all_guest. */
>> -        mov   STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>> -#ifdef CONFIG_PV32
>> -        movq  VCPU_domain(%rbx),%rax
>> -        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
>> -        je    restore_all_guest
>> -        jmp   compat_restore_all_guest
>> -#else
>> -        jmp   restore_all_guest
>> -#endif
>> +        jmp   nmi_exit_to_guest
> ... this
>
>         jnz   nmi_exit_to_guest
>         jmp   restore_all_xen
>
> then allowing to fold with ...
>
>>  #else
>>          ASSERT_CONTEXT_IS_XEN
>>          jmp   restore_all_xen
> ... this?

This makes the diff rather less legible (and specifically, far less
obviously a "break out"), and changes the configurations that the ASSERT
lives in.

Perhaps as a followup, but not in this patch.

~Andrew