With large NR_CPUS on-stack cpumask_t variables are problematic. Now that
the IRQ handler can't be invoked in a nested manner anymore, we can
instead use a per-CPU variable. While we can't use scratch_cpumask in code
invoked from IRQ handlers, simply amend that one with a HPET-special form.
(Note that only one of the two IRQ handling functions can come into play
at any one time.)
Fixes: 996576b965cc ("xen: allow up to 16383 cpus")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
---
While doing this I noticed that this and all pre-existing uses of
DEFINE_PER_CPU_READ_MOSTLY() aren't quite right: When the type is
cpumask_var_t, the variable is read-mostly only when NR_CPUS <=
2 * BITS_PER_LONG. That'll want sorting separately, though.
It is important for this to not be moved ahead of "x86/HPET: use single,
global, low-priority vector for broadcast IRQ", as the original call here
from set_channel_irq_affinity() may not use the new variable (it would
need to use scratch_cpumask instead).
---
v2: New.
--- a/xen/arch/x86/hpet.c
+++ b/xen/arch/x86/hpet.c
@@ -196,7 +196,7 @@ static void evt_do_broadcast(cpumask_t *
static void cf_check handle_hpet_broadcast(struct hpet_event_channel *ch)
{
- cpumask_t mask;
+ cpumask_t *scratch = this_cpu(hpet_scratch_cpumask);
s_time_t now, next_event;
unsigned int cpu;
unsigned long flags;
@@ -209,7 +209,7 @@ again:
spin_unlock_irqrestore(&ch->lock, flags);
next_event = STIME_MAX;
- cpumask_clear(&mask);
+ cpumask_clear(scratch);
now = NOW();
/* find all expired events */
@@ -218,13 +218,13 @@ again:
s_time_t deadline = ACCESS_ONCE(per_cpu(timer_deadline, cpu));
if ( deadline <= now )
- __cpumask_set_cpu(cpu, &mask);
+ __cpumask_set_cpu(cpu, scratch);
else if ( deadline < next_event )
next_event = deadline;
}
/* wakeup the cpus which have an expired event. */
- evt_do_broadcast(&mask);
+ evt_do_broadcast(scratch);
if ( next_event != STIME_MAX )
{
--- a/xen/arch/x86/include/asm/smp.h
+++ b/xen/arch/x86/include/asm/smp.h
@@ -22,6 +22,7 @@
DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
+DECLARE_PER_CPU(cpumask_var_t, hpet_scratch_cpumask);
DECLARE_PER_CPU(cpumask_var_t, send_ipi_cpumask);
/*
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -55,6 +55,9 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
static cpumask_t scratch_cpu0mask;
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, hpet_scratch_cpumask);
+static cpumask_t hpet_scratch_cpu0mask;
+
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, send_ipi_cpumask);
static cpumask_t send_ipi_cpu0mask;
@@ -976,6 +979,8 @@ static void cpu_smpboot_free(unsigned in
FREE_CPUMASK_VAR(per_cpu(cpu_core_mask, cpu));
if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
FREE_CPUMASK_VAR(per_cpu(scratch_cpumask, cpu));
+ if ( per_cpu(hpet_scratch_cpumask, cpu) != &hpet_scratch_cpu0mask )
+ FREE_CPUMASK_VAR(per_cpu(hpet_scratch_cpumask, cpu));
if ( per_cpu(send_ipi_cpumask, cpu) != &send_ipi_cpu0mask )
FREE_CPUMASK_VAR(per_cpu(send_ipi_cpumask, cpu));
}
@@ -1112,6 +1117,7 @@ static int cpu_smpboot_alloc(unsigned in
if ( !(cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) &&
+ cond_alloc_cpumask_var(&per_cpu(hpet_scratch_cpumask, cpu)) &&
cond_alloc_cpumask_var(&per_cpu(send_ipi_cpumask, cpu))) )
goto out;
@@ -1239,6 +1245,7 @@ void __init smp_prepare_boot_cpu(void)
cpumask_set_cpu(cpu, &cpu_present_map);
#if NR_CPUS > 2 * BITS_PER_LONG
per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask;
+ per_cpu(hpet_scratch_cpumask, cpu) = &hpet_scratch_cpu0mask;
per_cpu(send_ipi_cpumask, cpu) = &send_ipi_cpu0mask;
#endif