[PATCH 04/16] xen/cpu: prevent disable_nonboot_cpus crash on ARM64

Mykola Kvach posted 16 patches 4 days, 12 hours ago
[PATCH 04/16] xen/cpu: prevent disable_nonboot_cpus crash on ARM64
Posted by Mykola Kvach 4 days, 12 hours ago
If we call disable_nonboot_cpus on ARM64 with system_state set
to SYS_STATE_suspend, the following assertion will be triggered:

```
(XEN) [   25.582712] Disabling non-boot CPUs ...
(XEN) [   25.587032] Assertion '!in_irq() && (local_irq_is_enabled() || num_online_cpus() <= 1)' failed at common/xmalloc_tlsf.c:714
[...]
(XEN) [   25.975069] Xen call trace:
(XEN) [   25.978353]    [<00000a000022e098>] xfree+0x130/0x1a4 (PC)
(XEN) [   25.984314]    [<00000a000022e08c>] xfree+0x124/0x1a4 (LR)
(XEN) [   25.990276]    [<00000a00002747d4>] release_irq+0xe4/0xe8
(XEN) [   25.996152]    [<00000a0000278588>] time.c#cpu_time_callback+0x44/0x60
(XEN) [   26.003150]    [<00000a000021d678>] notifier_call_chain+0x7c/0xa0
(XEN) [   26.009717]    [<00000a00002018e0>] cpu.c#cpu_notifier_call_chain+0x24/0x48
(XEN) [   26.017148]    [<00000a000020192c>] cpu.c#_take_cpu_down+0x28/0x34
(XEN) [   26.023801]    [<00000a0000201944>] cpu.c#take_cpu_down+0xc/0x18
(XEN) [   26.030281]    [<00000a0000225c5c>] stop_machine.c#stopmachine_action+0xbc/0xe4
(XEN) [   26.038057]    [<00000a00002264bc>] tasklet.c#do_tasklet_work+0xb8/0x100
(XEN) [   26.045229]    [<00000a00002268a4>] do_tasklet+0x68/0xb0
(XEN) [   26.051018]    [<00000a000026e120>] domain.c#idle_loop+0x7c/0x194
(XEN) [   26.057585]    [<00000a0000277e30>] start_secondary+0x21c/0x220
(XEN) [   26.063978]    [<00000a0000361258>] 00000a0000361258
```

This happens because before invoking take_cpu_down via the stop_machine_run
function on the target CPU, stop_machine_run requests
the STOPMACHINE_DISABLE_IRQ state on that CPU. Releasing memory in
the release_irq function then triggers the assertion:

/*
 * Heap allocations may need TLB flushes which may require IRQs to be
 * enabled (except when only 1 PCPU is online).
 */
#define ASSERT_ALLOC_CONTEXT()

This patch introduces a new tasklet to perform the CPU_DYING call chain for
a particular CPU. However, we cannot call take_cpu_down from the tasklet
because the __cpu_disable function disables local IRQs, causing the system
to crash inside spin_lock_irq, which is called after the tasklet function
invocation inside do_tasklet_work:

void _spin_lock_irq(spinlock_t *lock)
{
    ASSERT(local_irq_is_enabled());

To resolve this, take_cpu_down is split into two parts. The first part triggers
the CPU_DYING call chain, while the second part, __cpu_disable, is invoked from
stop_machine_run.

Signed-off-by: Mykola Kvach <mykola_kvach@epam.com>
---
This patch was introduced in patch series V3.
---
 xen/common/cpu.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/xen/common/cpu.c b/xen/common/cpu.c
index f09af0444b..99d4c0c579 100644
--- a/xen/common/cpu.c
+++ b/xen/common/cpu.c
@@ -48,6 +48,10 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
 
 static DEFINE_RWLOCK(cpu_add_remove_lock);
 
+#ifdef CONFIG_ARM_64
+static DEFINE_PER_CPU(struct tasklet, cpu_down_tasklet);
+#endif
+
 bool get_cpu_maps(void)
 {
     return read_trylock(&cpu_add_remove_lock);
@@ -101,6 +105,14 @@ static void cf_check _take_cpu_down(void *unused)
     __cpu_disable();
 }
 
+#ifdef CONFIG_ARM_64
+static int cf_check cpu_disable_stop_machine(void *unused)
+{
+    __cpu_disable();
+    return 0;
+}
+#endif
+
 static int cf_check take_cpu_down(void *arg)
 {
     _take_cpu_down(arg);
@@ -128,6 +140,14 @@ int cpu_down(unsigned int cpu)
 
     if ( system_state < SYS_STATE_active || system_state == SYS_STATE_resume )
         on_selected_cpus(cpumask_of(cpu), _take_cpu_down, NULL, true);
+#ifdef CONFIG_ARM_64
+    else if ( system_state == SYS_STATE_suspend )
+    {
+        tasklet_schedule_on_cpu(&per_cpu(cpu_down_tasklet, cpu), cpu);
+        if ( (err = stop_machine_run(cpu_disable_stop_machine, NULL, cpu)) < 0 )
+            goto fail;
+    }
+#endif
     else if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 )
         goto fail;
 
@@ -247,3 +267,26 @@ void enable_nonboot_cpus(void)
 
     cpumask_clear(&frozen_cpus);
 }
+
+#ifdef CONFIG_ARM_64
+
+static void cf_check cpu_down_t_action(void *unused)
+{
+    cpu_notifier_call_chain(smp_processor_id(), CPU_DYING, NULL, true);
+}
+
+static int __init init_cpu_down_tasklet(void)
+{
+    unsigned int cpu;
+
+    for_each_possible_cpu(cpu) {
+        struct tasklet *t = &per_cpu(cpu_down_tasklet, cpu);
+
+        tasklet_init(t, cpu_down_t_action, NULL);
+    }
+
+    return 0;
+}
+__initcall(init_cpu_down_tasklet);
+
+#endif /* CONFIG_ARM_64 */
-- 
2.43.0