With the current AP bring up code, Xen can get stuck indefinitely if an AP
freezes during boot after the 'callin' step.  Introduce a 10s timeout while
waiting for APs to finish startup.
On failure of an AP to complete startup, send an NMI to trigger the printing
of a stack backtrace on the stuck AP and panic on the BSP.
The sending of the NMI re-uses the code already present in fatal_trap(), by
moving it to a separate function.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/include/asm/processor.h |  1 +
 xen/arch/x86/smpboot.c               |  8 ++++
 xen/arch/x86/traps.c                 | 66 +++++++++++++++++-----------
 3 files changed, 49 insertions(+), 26 deletions(-)
diff --git a/xen/arch/x86/include/asm/processor.h b/xen/arch/x86/include/asm/processor.h
index eacd425c5350..10d8078cc1ca 100644
--- a/xen/arch/x86/include/asm/processor.h
+++ b/xen/arch/x86/include/asm/processor.h
@@ -371,6 +371,7 @@ void show_registers(const struct cpu_user_regs *regs);
 #define dump_execution_state() run_in_exception_handler(show_execution_state)
 void show_page_walk(unsigned long addr);
 void noreturn fatal_trap(const struct cpu_user_regs *regs, bool show_remote);
+void show_execution_state_nmi(const cpumask_t *mask, bool show_all);
 
 extern void mtrr_ap_init(void);
 extern void mtrr_bp_init(void);
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 48ce996ba414..77dce3e3e22b 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -1370,6 +1370,7 @@ int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t pxm)
 int __cpu_up(unsigned int cpu)
 {
     int apicid, ret;
+    s_time_t start;
 
     if ( (apicid = x86_cpu_to_apicid[cpu]) == BAD_APICID )
         return -ENODEV;
@@ -1388,10 +1389,17 @@ int __cpu_up(unsigned int cpu)
     time_latch_stamps();
 
     set_cpu_state(CPU_STATE_ONLINE);
+    start = NOW();
     while ( !cpu_online(cpu) )
     {
         cpu_relax();
         process_pending_softirqs();
+        if ( NOW() > start + SECONDS(10) )
+        {
+            /* AP is stuck, send NMI and panic. */
+            show_execution_state_nmi(cpumask_of(cpu), true);
+            panic("CPU%u: Stuck while starting up\n", cpu);
+        }
     }
 
     return 0;
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index c94779b4ad4f..9b9e3726e2fb 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -714,13 +714,15 @@ static cpumask_t show_state_mask;
 static bool opt_show_all;
 boolean_param("async-show-all", opt_show_all);
 
+static bool force_show_all;
+
 static int cf_check nmi_show_execution_state(
     const struct cpu_user_regs *regs, int cpu)
 {
     if ( !cpumask_test_cpu(cpu, &show_state_mask) )
         return 0;
 
-    if ( opt_show_all )
+    if ( opt_show_all || force_show_all )
         show_execution_state(regs);
     else if ( guest_mode(regs) )
         printk(XENLOG_ERR "CPU%d\t%pv\t%04x:%p in guest\n",
@@ -734,6 +736,40 @@ static int cf_check nmi_show_execution_state(
     return 1;
 }
 
+void show_execution_state_nmi(const cpumask_t *mask, bool show_all)
+{
+    unsigned int msecs, pending;
+
+    force_show_all = show_all;
+
+    watchdog_disable();
+    console_start_sync();
+
+    cpumask_copy(&show_state_mask, mask);
+    set_nmi_callback(nmi_show_execution_state);
+    /* Ensure new callback is set before sending out the NMI. */
+    smp_wmb();
+    send_IPI_mask(mask, APIC_DM_NMI);
+
+    /* Wait at most 10ms for some other CPU to respond. */
+    msecs = 10;
+    pending = cpumask_weight(&show_state_mask);
+    while ( pending && msecs-- )
+    {
+        unsigned int left;
+
+        mdelay(1);
+        left = cpumask_weight(&show_state_mask);
+        if ( left < pending )
+        {
+            pending = left;
+            msecs = 10;
+        }
+    }
+    if ( pending )
+        printk("Non-responding CPUs: {%*pbl}\n", CPUMASK_PR(&show_state_mask));
+}
+
 const char *vector_name(unsigned int vec)
 {
     static const char names[][4] = {
@@ -780,33 +816,11 @@ void fatal_trap(const struct cpu_user_regs *regs, bool show_remote)
 
         if ( show_remote )
         {
-            unsigned int msecs, pending;
+            cpumask_t *scratch = this_cpu(scratch_cpumask);
 
-            cpumask_andnot(&show_state_mask, &cpu_online_map,
+            cpumask_andnot(scratch, &cpu_online_map,
                            cpumask_of(smp_processor_id()));
-            set_nmi_callback(nmi_show_execution_state);
-            /* Ensure new callback is set before sending out the NMI. */
-            smp_wmb();
-            smp_send_nmi_allbutself();
-
-            /* Wait at most 10ms for some other CPU to respond. */
-            msecs = 10;
-            pending = cpumask_weight(&show_state_mask);
-            while ( pending && msecs-- )
-            {
-                unsigned int left;
-
-                mdelay(1);
-                left = cpumask_weight(&show_state_mask);
-                if ( left < pending )
-                {
-                    pending = left;
-                    msecs = 10;
-                }
-            }
-            if ( pending )
-                printk("Non-responding CPUs: {%*pbl}\n",
-                       CPUMASK_PR(&show_state_mask));
+            show_execution_state_nmi(scratch, false);
         }
     }
 
-- 
2.49.0
On 21.05.2025 18:55, Roger Pau Monne wrote:
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -714,13 +714,15 @@ static cpumask_t show_state_mask;
>  static bool opt_show_all;
>  boolean_param("async-show-all", opt_show_all);
>  
> +static bool force_show_all;
> +
>  static int cf_check nmi_show_execution_state(
>      const struct cpu_user_regs *regs, int cpu)
>  {
>      if ( !cpumask_test_cpu(cpu, &show_state_mask) )
>          return 0;
>  
> -    if ( opt_show_all )
> +    if ( opt_show_all || force_show_all )
>          show_execution_state(regs);
>      else if ( guest_mode(regs) )
>          printk(XENLOG_ERR "CPU%d\t%pv\t%04x:%p in guest\n",
> @@ -734,6 +736,40 @@ static int cf_check nmi_show_execution_state(
>      return 1;
>  }
>  
> +void show_execution_state_nmi(const cpumask_t *mask, bool show_all)
> +{
> +    unsigned int msecs, pending;
> +
> +    force_show_all = show_all;
Both forms of the call can, aiui, in principle race with one another.
I think you want to avoid setting the static to false once it was set
to true.
Furthermore, as long as all calls here with the 2nd argument being
true are followed by panic() or alike, I see no reason why you couldn't
simply re-use opt_show_all, setting that one to true. (Or else there
would then also be some resetting of the new static.)
Jan
                
            On Thu, May 22, 2025 at 09:18:57AM +0200, Jan Beulich wrote:
> On 21.05.2025 18:55, Roger Pau Monne wrote:
> > --- a/xen/arch/x86/traps.c
> > +++ b/xen/arch/x86/traps.c
> > @@ -714,13 +714,15 @@ static cpumask_t show_state_mask;
> >  static bool opt_show_all;
> >  boolean_param("async-show-all", opt_show_all);
> >  
> > +static bool force_show_all;
> > +
> >  static int cf_check nmi_show_execution_state(
> >      const struct cpu_user_regs *regs, int cpu)
> >  {
> >      if ( !cpumask_test_cpu(cpu, &show_state_mask) )
> >          return 0;
> >  
> > -    if ( opt_show_all )
> > +    if ( opt_show_all || force_show_all )
> >          show_execution_state(regs);
> >      else if ( guest_mode(regs) )
> >          printk(XENLOG_ERR "CPU%d\t%pv\t%04x:%p in guest\n",
> > @@ -734,6 +736,40 @@ static int cf_check nmi_show_execution_state(
> >      return 1;
> >  }
> >  
> > +void show_execution_state_nmi(const cpumask_t *mask, bool show_all)
> > +{
> > +    unsigned int msecs, pending;
> > +
> > +    force_show_all = show_all;
Sorry, I did send v2 before seeing your comments.
> Both forms of the call can, aiui, in principle race with one another.
> I think you want to avoid setting the static to false once it was set
> to true.
> 
> Furthermore, as long as all calls here with the 2nd argument being
> true are followed by panic() or alike, I see no reason why you couldn't
> simply re-use opt_show_all, setting that one to true. (Or else there
> would then also be some resetting of the new static.)
So basically do something like:
if ( show_all )
    opt_show_all = true;
And only overwrite opt_show_all when the caller requests full traces?
Thanks, Roger.
                
            On 22.05.2025 16:11, Roger Pau Monné wrote:
> On Thu, May 22, 2025 at 09:18:57AM +0200, Jan Beulich wrote:
>> On 21.05.2025 18:55, Roger Pau Monne wrote:
>>> --- a/xen/arch/x86/traps.c
>>> +++ b/xen/arch/x86/traps.c
>>> @@ -714,13 +714,15 @@ static cpumask_t show_state_mask;
>>>  static bool opt_show_all;
>>>  boolean_param("async-show-all", opt_show_all);
>>>  
>>> +static bool force_show_all;
>>> +
>>>  static int cf_check nmi_show_execution_state(
>>>      const struct cpu_user_regs *regs, int cpu)
>>>  {
>>>      if ( !cpumask_test_cpu(cpu, &show_state_mask) )
>>>          return 0;
>>>  
>>> -    if ( opt_show_all )
>>> +    if ( opt_show_all || force_show_all )
>>>          show_execution_state(regs);
>>>      else if ( guest_mode(regs) )
>>>          printk(XENLOG_ERR "CPU%d\t%pv\t%04x:%p in guest\n",
>>> @@ -734,6 +736,40 @@ static int cf_check nmi_show_execution_state(
>>>      return 1;
>>>  }
>>>  
>>> +void show_execution_state_nmi(const cpumask_t *mask, bool show_all)
>>> +{
>>> +    unsigned int msecs, pending;
>>> +
>>> +    force_show_all = show_all;
> 
> Sorry, I did send v2 before seeing your comments.
> 
>> Both forms of the call can, aiui, in principle race with one another.
>> I think you want to avoid setting the static to false once it was set
>> to true.
>>
>> Furthermore, as long as all calls here with the 2nd argument being
>> true are followed by panic() or alike, I see no reason why you couldn't
>> simply re-use opt_show_all, setting that one to true. (Or else there
>> would then also be some resetting of the new static.)
> 
> So basically do something like:
> 
> if ( show_all )
>     opt_show_all = true;
> 
> And only overwrite opt_show_all when the caller requests full traces?
Yes, that's what I think it boils down to.
Jan
                
            On 21/05/2025 5:55 pm, Roger Pau Monne wrote:
> With the current AP bring up code Xen can get stuck indefinitely if an AP
You want a comma between "code, Xen" to make the sentence easier to parse.
> freezes during boot after the 'callin' step.  Introduce a 10s timeout while
> waiting for APs to finish startup.
5s is the timeout used in other parts of AP bringup.  I'd suggest being
consistent here.
> On failure of an AP to complete startup send an NMI to trigger the printing
Again, a comma between "startup, send" would go a long way.
> of a stack backtrace on the stuck AP and panic on the BSP.
>
> The sending of the NMI re-uses the code already present in fatal_trap(), by
> moving it to a separate function.
I'd be tempted to split the patch in two.
>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
It may be worth noting that this came from the ICX143 investigation,
even if it wasn't relevant in the end?
> diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
> index 48ce996ba414..77dce3e3e22b 100644
> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -1388,10 +1389,17 @@ int __cpu_up(unsigned int cpu)
>      time_latch_stamps();
>  
>      set_cpu_state(CPU_STATE_ONLINE);
> +    start = NOW();
>      while ( !cpu_online(cpu) )
>      {
>          cpu_relax();
>          process_pending_softirqs();
> +        if ( NOW() > start + SECONDS(10) )
(NOW() - start) > SECONDS(10)
It has one fewer boundary condition, even if it is rather unlikely that
start + 10s will overflow.
> diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
> index c94779b4ad4f..9b9e3726e2fb 100644
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -734,6 +736,40 @@ static int cf_check nmi_show_execution_state(
>      return 1;
>  }
>  
> +void show_execution_state_nmi(const cpumask_t *mask, bool show_all)
> +{
> +    unsigned int msecs, pending;
> +
> +    force_show_all = show_all;
> +
> +    watchdog_disable();
> +    console_start_sync();
> +
> +    cpumask_copy(&show_state_mask, mask);
> +    set_nmi_callback(nmi_show_execution_state);
> +    /* Ensure new callback is set before sending out the NMI. */
> +    smp_wmb();
I know this is only moving code, but this is wrong.  So is the smp_mb()
in the x2apic drivers.
It would only be correct in principle for xAPIC (which is an MMIO
store), except it's UC and is strongly ordered anyway.  Furthermore, the
sequence point for the send_IPI_mask() prevents the compiler from
reordering this unsafely.
The x2APIC drivers need LFENCE;MFENCE on Intel, and just MFENCE on AMD,
and this (critically) is not smp_mb(), which is now just a locked operation.
I bet these aren't the only examples of incorrect barriers WRT IPIs.  I
guess we should fix those separately.
~Andrew
                
            On Wed, May 21, 2025 at 07:16:18PM +0100, Andrew Cooper wrote:
> On 21/05/2025 5:55 pm, Roger Pau Monne wrote:
> > With the current AP bring up code Xen can get stuck indefinitely if an AP
> 
> You want a comma between "code, Xen" to make the sentence easier to parse.
> 
> > freezes during boot after the 'callin' step.  Introduce a 10s timeout while
> > waiting for APs to finish startup.
> 
> 5s is the timeout used in other parts of AP bringup.  I'd suggest being
> consistent here.
> 
> 
> > On failure of an AP to complete startup send an NMI to trigger the printing
> 
> Again, a comma between "startup, send" would go a long way.
> 
> > of a stack backtrace on the stuck AP and panic on the BSP.
> >
> > The sending of the NMI re-uses the code already present in fatal_trap(), by
> > moving it to a separate function.
> 
> I'd be tempted to split the patch in two.
> 
> >
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > ---
> 
> It may be worth nothing that this came from the ICX143 investigation,
> even if it wasn't relevant in the end?
> 
> > diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
> > index 48ce996ba414..77dce3e3e22b 100644
> > --- a/xen/arch/x86/smpboot.c
> > +++ b/xen/arch/x86/smpboot.c
> > @@ -1388,10 +1389,17 @@ int __cpu_up(unsigned int cpu)
> >      time_latch_stamps();
> >  
> >      set_cpu_state(CPU_STATE_ONLINE);
> > +    start = NOW();
> >      while ( !cpu_online(cpu) )
> >      {
> >          cpu_relax();
> >          process_pending_softirqs();
> > +        if ( NOW() > start + SECONDS(10) )
> 
> (NOW() - start) > SECONDS(10)
> 
> It has one fewer boundary conditions, even if it is rather unlikely that
> start + 10s will overflow.
> 
> > diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
> > index c94779b4ad4f..9b9e3726e2fb 100644
> > --- a/xen/arch/x86/traps.c
> > +++ b/xen/arch/x86/traps.c
> > @@ -734,6 +736,40 @@ static int cf_check nmi_show_execution_state(
> >      return 1;
> >  }
> >  
> > +void show_execution_state_nmi(const cpumask_t *mask, bool show_all)
> > +{
> > +    unsigned int msecs, pending;
> > +
> > +    force_show_all = show_all;
> > +
> > +    watchdog_disable();
> > +    console_start_sync();
> > +
> > +    cpumask_copy(&show_state_mask, mask);
> > +    set_nmi_callback(nmi_show_execution_state);
> > +    /* Ensure new callback is set before sending out the NMI. */
> > +    smp_wmb();
> 
> I know this is only moving code, but this is wrong.  So is the smp_mb()
> in the x2apic drivers.
> 
> It would only be correct in principle for xAPIC (which is an MMIO
> store), except it's UC and is strongly ordered anyway.  Furthermore, the
> sequence point for the send_IPI_mask() prevents the compiler from
> reordering this unsafely.
> 
> The x2APIC drivers need LFENCE;MFENCE on Intel, and just MFENCE on AMD,
> and this (critically) is not smp_mb(), which is now just a locked operation.
> 
> I bet these aren't the only examples of incorrect barriers WRT IPIs.  I
> guess we should fix those separately.
Thanks, I will remove the smp_wmb() ahead of moving the code, but I
will leave the other instances in the APIC drivers for a different
series, as I don't want to delay the work here on those fixes.
Regards, Roger.
                
            © 2016 - 2025 Red Hat, Inc.