xen/arch/x86/x86_64/entry-fred.S | 6 ++++ xen/arch/x86/x86_64/entry.S | 49 +++++++++++++++++++------------- 2 files changed, 36 insertions(+), 19 deletions(-)
Returning from an NMI which hits guest context needs special casing in FRED
mode just like it does in IDT mode.
Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in
entry_FRED_R3() also.
Expand the comment a little, and invert the conditional jump to
compat_restore_all_guest() to avoid needing an #else clause for CONFIG_PV32.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <jbeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Teddy Astie <teddy.astie@vates.tech>
CC: Oleksii Kurochko <oleksii.kurochko@gmail.com>
Slightly RFC, not tested yet. (My AMD system takes an eternity to reboot)
For 4.22. Found during testing of FRED. The consqeuence is that we can end
up scheduling while still in NMI context, after which things like the watchdog
and other diagnostics don't work properly.
---
xen/arch/x86/x86_64/entry-fred.S | 6 ++++
xen/arch/x86/x86_64/entry.S | 49 +++++++++++++++++++-------------
2 files changed, 36 insertions(+), 19 deletions(-)
diff --git a/xen/arch/x86/x86_64/entry-fred.S b/xen/arch/x86/x86_64/entry-fred.S
index e9c84423dacd..1ad9694a043b 100644
--- a/xen/arch/x86/x86_64/entry-fred.S
+++ b/xen/arch/x86/x86_64/entry-fred.S
@@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096)
GET_STACK_END(14)
movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
+ /* NMIs need special handling on return to guest. */
+ movzbl UREGS_ss + 6(%rsp), %eax
+ and $0xf, %eax
+ cmp $X86_ET_NMI, %al
+ je nmi_exit_to_guest
+
jmp test_all_events
#else
BUG /* Not Reached */
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 17ca6a493906..de5d854f5533 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -146,6 +146,35 @@ process_trap:
jmp test_all_events
END(switch_to_kernel)
+/*
+ * When returning to guest from an NMI, we must execute an IRET/ERETU to
+ * re-enable NMIs, and must not process softirqs which can e.g. schedule
+ * rather than returning to guest context.
+ *
+ * If a softirq is pending, send ourselves an EVENT_CHECK IPI to compensate.
+ * This will cause softirq processing to occur upon leaving NMI context.
+ *
+ * %rbx: struct vcpu, %r14 stack_end
+ */
+FUNC(nmi_exit_to_guest)
+ mov STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
+ shl $IRQSTAT_shift, %eax
+ lea irq_stat + IRQSTAT_softirq_pending(%rip), %rcx
+ cmpl $0, (%rcx, %rax, 1)
+ je 1f
+ mov $EVENT_CHECK_VECTOR, %edi
+ call send_IPI_self
+1:
+ /* For restore_all_guest. */
+ mov STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
+#ifdef CONFIG_PV32
+ mov VCPU_domain(%rbx), %rax
+ cmpb $0, DOMAIN_is_32bit_pv(%rax)
+ jne compat_restore_all_guest
+#endif
+ jmp restore_all_guest
+END(nmi_exit_to_guest)
+
.section .text.entry, "ax", @progbits
/* %rbx: struct vcpu, interrupts disabled */
@@ -1209,25 +1238,7 @@ FUNC(handle_ist_exception)
#ifdef CONFIG_PV
testb $3,UREGS_cs(%rsp)
jz restore_all_xen
- /* Send an IPI to ourselves to cover for the lack of event checking. */
- mov STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
- shll $IRQSTAT_shift,%eax
- leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx
- cmpl $0,(%rcx,%rax,1)
- je 1f
- movl $EVENT_CHECK_VECTOR,%edi
- call send_IPI_self
-1:
- /* For restore_all_guest. */
- mov STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
-#ifdef CONFIG_PV32
- movq VCPU_domain(%rbx),%rax
- cmpb $0,DOMAIN_is_32bit_pv(%rax)
- je restore_all_guest
- jmp compat_restore_all_guest
-#else
- jmp restore_all_guest
-#endif
+ jmp nmi_exit_to_guest
#else
ASSERT_CONTEXT_IS_XEN
jmp restore_all_xen
base-commit: d42ace60290a4b4184ee2133b245b134fdf96fed
--
2.39.5
On 6/24/26 4:23 PM, Andrew Cooper wrote: > Returning from an NMI which hits guest context needs special casing in FRED > mode just like it does in IDT mode. > > Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in > entry_FRED_R3() also. > > Expand the comment a little, and invert the conditional jump to > compat_restore_all_guest() to avoid needing an #else clause for CONFIG_PV32. > > Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> > --- > CC: Jan Beulich <jbeulich@suse.com> > CC: Roger Pau Monné <roger.pau@citrix.com> > CC: Teddy Astie <teddy.astie@vates.tech> > CC: Oleksii Kurochko <oleksii.kurochko@gmail.com> > > Slightly RFC, not tested yet. (My AMD system takes an eternity to reboot) I would like to have a test on hardware to verify that it doesn't break something else. With that: > > For 4.22. Found during testing of FRED. The consqeuence is that we can end > up scheduling while still in NMI context, after which things like the watchdog > and other diagnostics don't work properly. > --- Relase-Acked-by: Oleksii Kurochko <oleksii.kurochko@gmail.com> Thanks. ~ Oleksii
On 25/06/2026 10:53 am, Oleksii Kurochko wrote: > > > On 6/24/26 4:23 PM, Andrew Cooper wrote: >> Returning from an NMI which hits guest context needs special casing >> in FRED >> mode just like it does in IDT mode. >> >> Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in >> entry_FRED_R3() also. >> >> Expand the comment a little, and invert the conditional jump to >> compat_restore_all_guest() to avoid needing an #else clause for >> CONFIG_PV32. >> >> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> >> --- >> CC: Jan Beulich <jbeulich@suse.com> >> CC: Roger Pau Monné <roger.pau@citrix.com> >> CC: Teddy Astie <teddy.astie@vates.tech> >> CC: Oleksii Kurochko <oleksii.kurochko@gmail.com> >> >> Slightly RFC, not tested yet. (My AMD system takes an eternity to >> reboot) > > I would like to have a test on hardware to verify that it doesn't > break something else. Yes, just confirmed elsewhere on this thread. > With that: > >> >> For 4.22. Found during testing of FRED. The consqeuence is that we >> can end >> up scheduling while still in NMI context, after which things like the >> watchdog >> and other diagnostics don't work properly. >> --- > > Relase-Acked-by: Oleksii Kurochko <oleksii.kurochko@gmail.com> Thankyou. ~Andrew
On 24.06.2026 16:23, Andrew Cooper wrote:
> --- a/xen/arch/x86/x86_64/entry-fred.S
> +++ b/xen/arch/x86/x86_64/entry-fred.S
> @@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096)
> GET_STACK_END(14)
> movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>
> + /* NMIs need special handling on return to guest. */
> + movzbl UREGS_ss + 6(%rsp), %eax
> + and $0xf, %eax
> + cmp $X86_ET_NMI, %al
> + je nmi_exit_to_guest
> +
> jmp test_all_events
Actually, how about shrinking this to just
test %al, %al
jnz nmi_exit_to_guest
by having entry_from_pv() return a boolean?
Jan
On 24/06/2026 3:47 pm, Jan Beulich wrote: > On 24.06.2026 16:23, Andrew Cooper wrote: >> --- a/xen/arch/x86/x86_64/entry-fred.S >> +++ b/xen/arch/x86/x86_64/entry-fred.S >> @@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096) >> GET_STACK_END(14) >> movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx >> >> + /* NMIs need special handling on return to guest. */ >> + movzbl UREGS_ss + 6(%rsp), %eax >> + and $0xf, %eax >> + cmp $X86_ET_NMI, %al >> + je nmi_exit_to_guest >> + >> jmp test_all_events > Actually, how about shrinking this to just > > test %al, %al > jnz nmi_exit_to_guest > > by having entry_from_pv() return a boolean? I considered that, and dismissed it. It involve changing large chunks of traps.c (simply to compile) and puts far more than 4 instructions of logic onto the common path. ~Andrew
On 24.06.2026 17:09, Andrew Cooper wrote: > On 24/06/2026 3:47 pm, Jan Beulich wrote: >> On 24.06.2026 16:23, Andrew Cooper wrote: >>> --- a/xen/arch/x86/x86_64/entry-fred.S >>> +++ b/xen/arch/x86/x86_64/entry-fred.S >>> @@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096) >>> GET_STACK_END(14) >>> movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx >>> >>> + /* NMIs need special handling on return to guest. */ >>> + movzbl UREGS_ss + 6(%rsp), %eax >>> + and $0xf, %eax >>> + cmp $X86_ET_NMI, %al >>> + je nmi_exit_to_guest >>> + >>> jmp test_all_events >> Actually, how about shrinking this to just >> >> test %al, %al >> jnz nmi_exit_to_guest >> >> by having entry_from_pv() return a boolean? > > I considered that, and dismissed it. > > It involve changing large chunks of traps.c (simply to compile) and puts > far more than 4 instructions of logic onto the common path. Does it? Code size overall may grow, but since every return there is that of a constant (false in all but one case), every individual path would go down from 4 insns (with a memory access, admittedly a cache-hot one) to just 3 (the two here and the MOV or XOR on the return paths). Jan
On 24.06.2026 16:23, Andrew Cooper wrote:
> Returning from an NMI which hits guest context needs special casing in FRED
> mode just like it does in IDT mode.
>
> Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in
> entry_FRED_R3() also.
>
> Expand the comment a little, and invert the conditional jump to
> compat_restore_all_guest() to avoid needing an #else clause for CONFIG_PV32.
>
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
provided of course ...
> Slightly RFC, not tested yet. (My AMD system takes an eternity to reboot)
... the results of this won't prove it wrong.
> For 4.22. Found during testing of FRED. The consqeuence is that we can end
> up scheduling while still in NMI context, after which things like the watchdog
> and other diagnostics don't work properly.
May therefore want a Fixes: tag (it'll also want backporting aiui).
> --- a/xen/arch/x86/x86_64/entry-fred.S
> +++ b/xen/arch/x86/x86_64/entry-fred.S
> @@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096)
> GET_STACK_END(14)
> movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>
> + /* NMIs need special handling on return to guest. */
> + movzbl UREGS_ss + 6(%rsp), %eax
> + and $0xf, %eax
As you may be aware, I'm not overly happy with such literal numbers. But
well, alternatives look a little involved. So just a remark, not a request
to consider any kind of adjustment.
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -146,6 +146,35 @@ process_trap:
> jmp test_all_events
> END(switch_to_kernel)
>
> +/*
> + * When returning to guest from an NMI, we must execute an IRET/ERETU to
> + * re-enable NMIs, and must not process softirqs which can e.g. schedule
> + * rather than returning to guest context.
> + *
> + * If a softirq is pending, send ourselves an EVENT_CHECK IPI to compensate.
> + * This will cause softirq processing to occur upon leaving NMI context.
> + *
> + * %rbx: struct vcpu, %r14 stack_end
> + */
> +FUNC(nmi_exit_to_guest)
> + mov STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
> + shl $IRQSTAT_shift, %eax
> + lea irq_stat + IRQSTAT_softirq_pending(%rip), %rcx
> + cmpl $0, (%rcx, %rax, 1)
> + je 1f
> + mov $EVENT_CHECK_VECTOR, %edi
> + call send_IPI_self
> +1:
> + /* For restore_all_guest. */
> + mov STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
> +#ifdef CONFIG_PV32
> + mov VCPU_domain(%rbx), %rax
> + cmpb $0, DOMAIN_is_32bit_pv(%rax)
Would you be open to a little bit of trickery here while you move the code?
The low 12 bits of %rbx are clear, so instead of $0 we could use %bl here.
> + jne compat_restore_all_guest
> +#endif
> + jmp restore_all_guest
> +END(nmi_exit_to_guest)
Much like you flipped the Jcc/JMP here, ...
> @@ -1209,25 +1238,7 @@ FUNC(handle_ist_exception)
> #ifdef CONFIG_PV
> testb $3,UREGS_cs(%rsp)
> jz restore_all_xen
... how about also making this plus ...
> - /* Send an IPI to ourselves to cover for the lack of event checking. */
> - mov STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
> - shll $IRQSTAT_shift,%eax
> - leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx
> - cmpl $0,(%rcx,%rax,1)
> - je 1f
> - movl $EVENT_CHECK_VECTOR,%edi
> - call send_IPI_self
> -1:
> - /* For restore_all_guest. */
> - mov STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
> -#ifdef CONFIG_PV32
> - movq VCPU_domain(%rbx),%rax
> - cmpb $0,DOMAIN_is_32bit_pv(%rax)
> - je restore_all_guest
> - jmp compat_restore_all_guest
> -#else
> - jmp restore_all_guest
> -#endif
> + jmp nmi_exit_to_guest
... this
jnz nmi_exit_to_guest
jmp restore_all_xen
then allowing to fold with ...
> #else
> ASSERT_CONTEXT_IS_XEN
> jmp restore_all_xen
... this?
Jan
On 24/06/2026 3:43 pm, Jan Beulich wrote: > On 24.06.2026 16:23, Andrew Cooper wrote: >> Returning from an NMI which hits guest context needs special casing in FRED >> mode just like it does in IDT mode. >> >> Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in >> entry_FRED_R3() also. >> >> Expand the comment a little, and invert the conditional jump to >> compat_restore_all_guest() to avoid needing an #else clause for CONFIG_PV32. >> >> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> > Reviewed-by: Jan Beulich <jbeulich@suse.com> > provided of course ... > >> Slightly RFC, not tested yet. (My AMD system takes an eternity to reboot) > ... the results of this won't prove it wrong. Testing is good. We now get core register state on all CPUs, rather than missing the ones which were in guest context. ~Andrew
On 24/06/2026 3:43 pm, Jan Beulich wrote:
> On 24.06.2026 16:23, Andrew Cooper wrote:
>> Returning from an NMI which hits guest context needs special casing in FRED
>> mode just like it does in IDT mode.
>>
>> Break nmi_exit_to_guest() out of handle_ist_exception(), and use it in
>> entry_FRED_R3() also.
>>
>> Expand the comment a little, and invert the conditional jump to
>> compat_restore_all_guest() to avoid needing an #else clause for CONFIG_PV32.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> provided of course ...
>
>> Slightly RFC, not tested yet. (My AMD system takes an eternity to reboot)
> ... the results of this won't prove it wrong.
>
>> For 4.22. Found during testing of FRED. The consqeuence is that we can end
>> up scheduling while still in NMI context, after which things like the watchdog
>> and other diagnostics don't work properly.
> May therefore want a Fixes: tag (it'll also want backporting aiui).
Ah yes, I'd meant to set one, but forgot.
Fixes: 87cfcbe9f0b5 ("x86/pv: Guest exception handling in FRED mode")
>
>> --- a/xen/arch/x86/x86_64/entry-fred.S
>> +++ b/xen/arch/x86/x86_64/entry-fred.S
>> @@ -20,6 +20,12 @@ FUNC(entry_FRED_R3, 4096)
>> GET_STACK_END(14)
>> movq STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>>
>> + /* NMIs need special handling on return to guest. */
>> + movzbl UREGS_ss + 6(%rsp), %eax
>> + and $0xf, %eax
> As you may be aware, I'm not overly happy with such literal numbers. But
> well, alternatives look a little involved. So just a remark, not a request
> to consider any kind of adjustment.
The 0xf cannot usefully be anything else. It's the width of the event
type field in a FRED frame, but you need to visually see it's less than
0xff or the switch from %eax to %al looks wrong.
The +6 can't be generated by asm-offsets because the infrastructure
doesn't work on bitfields.
>
>> --- a/xen/arch/x86/x86_64/entry.S
>> +++ b/xen/arch/x86/x86_64/entry.S
>> @@ -146,6 +146,35 @@ process_trap:
>> jmp test_all_events
>> END(switch_to_kernel)
>>
>> +/*
>> + * When returning to guest from an NMI, we must execute an IRET/ERETU to
>> + * re-enable NMIs, and must not process softirqs which can e.g. schedule
>> + * rather than returning to guest context.
>> + *
>> + * If a softirq is pending, send ourselves an EVENT_CHECK IPI to compensate.
>> + * This will cause softirq processing to occur upon leaving NMI context.
>> + *
>> + * %rbx: struct vcpu, %r14 stack_end
>> + */
>> +FUNC(nmi_exit_to_guest)
>> + mov STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
>> + shl $IRQSTAT_shift, %eax
>> + lea irq_stat + IRQSTAT_softirq_pending(%rip), %rcx
>> + cmpl $0, (%rcx, %rax, 1)
>> + je 1f
>> + mov $EVENT_CHECK_VECTOR, %edi
>> + call send_IPI_self
>> +1:
>> + /* For restore_all_guest. */
>> + mov STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>> +#ifdef CONFIG_PV32
>> + mov VCPU_domain(%rbx), %rax
>> + cmpb $0, DOMAIN_is_32bit_pv(%rax)
> Would you be open to a little bit of trickery here while you move the code?
> The low 12 bits of %rbx are clear, so instead of $0 we could use %bl here.
struct vcpu being page aligned is a convenience not a requirement. It's
hard alignment requirements are 32b and even then only with CONFIG_SHADOW.
>
>> + jne compat_restore_all_guest
>> +#endif
>> + jmp restore_all_guest
>> +END(nmi_exit_to_guest)
> Much like you flipped the Jcc/JMP here, ...
>
>> @@ -1209,25 +1238,7 @@ FUNC(handle_ist_exception)
>> #ifdef CONFIG_PV
>> testb $3,UREGS_cs(%rsp)
>> jz restore_all_xen
> ... how about also making this plus ...
>
>> - /* Send an IPI to ourselves to cover for the lack of event checking. */
>> - mov STACK_CPUINFO_FIELD(processor_id)(%r14), %eax
>> - shll $IRQSTAT_shift,%eax
>> - leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx
>> - cmpl $0,(%rcx,%rax,1)
>> - je 1f
>> - movl $EVENT_CHECK_VECTOR,%edi
>> - call send_IPI_self
>> -1:
>> - /* For restore_all_guest. */
>> - mov STACK_CPUINFO_FIELD(current_vcpu)(%r14), %rbx
>> -#ifdef CONFIG_PV32
>> - movq VCPU_domain(%rbx),%rax
>> - cmpb $0,DOMAIN_is_32bit_pv(%rax)
>> - je restore_all_guest
>> - jmp compat_restore_all_guest
>> -#else
>> - jmp restore_all_guest
>> -#endif
>> + jmp nmi_exit_to_guest
> ... this
>
> jnz nmi_exit_to_guest
> jmp restore_all_xen
>
> then allowing to fold with ...
>
>> #else
>> ASSERT_CONTEXT_IS_XEN
>> jmp restore_all_xen
> ... this?
This makes the diff rather less legible (and specifically, far less
obviously a "break out"), and changes the configurations that the ASSERT
lives in.
Perhaps as a followup, but not in this patch.
~Andrew
© 2016 - 2026 Red Hat, Inc.