[PATCH v2 for-4.21 3/9] x86/HPET: replace handle_hpet_broadcast()'s on-stack cpumask_t

Jan Beulich posted 9 patches 1 week, 3 days ago
There is a newer version of this series
[PATCH v2 for-4.21 3/9] x86/HPET: replace handle_hpet_broadcast()'s on-stack cpumask_t
Posted by Jan Beulich 1 week, 3 days ago
With large NR_CPUS on-stack cpumask_t variables are problematic. Now that
the IRQ handler can't be invoked in a nested manner anymore, we can
instead use a per-CPU variable. While we can't use scratch_cpumask in code
invoked from IRQ handlers, simply amend that one with a HPET-special form.
(Note that only one of the two IRQ handling functions can come into play
at any one time.)

Fixes: 996576b965cc ("xen: allow up to 16383 cpus")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
While doing this I noticed that this and all pre-existing uses of
DEFINE_PER_CPU_READ_MOSTLY() aren't quite right: When the type is
cpumask_var_t, the variable is read-mostly only when NR_CPUS <=
2 * BITS_PER_LONG. That'll want sorting separately, though.

It is important for this to not be moved ahead of "x86/HPET: use single,
global, low-priority vector for broadcast IRQ", as the original call here
from set_channel_irq_affinity() may not use the new variable (it would
need to use scratch_cpumask instead).
---
v2: New.

--- a/xen/arch/x86/hpet.c
+++ b/xen/arch/x86/hpet.c
@@ -196,7 +196,7 @@ static void evt_do_broadcast(cpumask_t *
 
 static void cf_check handle_hpet_broadcast(struct hpet_event_channel *ch)
 {
-    cpumask_t mask;
+    cpumask_t *scratch = this_cpu(hpet_scratch_cpumask);
     s_time_t now, next_event;
     unsigned int cpu;
     unsigned long flags;
@@ -209,7 +209,7 @@ again:
     spin_unlock_irqrestore(&ch->lock, flags);
 
     next_event = STIME_MAX;
-    cpumask_clear(&mask);
+    cpumask_clear(scratch);
     now = NOW();
 
     /* find all expired events */
@@ -218,13 +218,13 @@ again:
         s_time_t deadline = ACCESS_ONCE(per_cpu(timer_deadline, cpu));
 
         if ( deadline <= now )
-            __cpumask_set_cpu(cpu, &mask);
+            __cpumask_set_cpu(cpu, scratch);
         else if ( deadline < next_event )
             next_event = deadline;
     }
 
     /* wakeup the cpus which have an expired event. */
-    evt_do_broadcast(&mask);
+    evt_do_broadcast(scratch);
 
     if ( next_event != STIME_MAX )
     {
--- a/xen/arch/x86/include/asm/smp.h
+++ b/xen/arch/x86/include/asm/smp.h
@@ -22,6 +22,7 @@
 DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
 DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
 DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
+DECLARE_PER_CPU(cpumask_var_t, hpet_scratch_cpumask);
 DECLARE_PER_CPU(cpumask_var_t, send_ipi_cpumask);
 
 /*
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -55,6 +55,9 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
 static cpumask_t scratch_cpu0mask;
 
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, hpet_scratch_cpumask);
+static cpumask_t hpet_scratch_cpu0mask;
+
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, send_ipi_cpumask);
 static cpumask_t send_ipi_cpu0mask;
 
@@ -976,6 +979,8 @@ static void cpu_smpboot_free(unsigned in
         FREE_CPUMASK_VAR(per_cpu(cpu_core_mask, cpu));
         if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
             FREE_CPUMASK_VAR(per_cpu(scratch_cpumask, cpu));
+        if ( per_cpu(hpet_scratch_cpumask, cpu) != &hpet_scratch_cpu0mask )
+            FREE_CPUMASK_VAR(per_cpu(hpet_scratch_cpumask, cpu));
         if ( per_cpu(send_ipi_cpumask, cpu) != &send_ipi_cpu0mask )
             FREE_CPUMASK_VAR(per_cpu(send_ipi_cpumask, cpu));
     }
@@ -1112,6 +1117,7 @@ static int cpu_smpboot_alloc(unsigned in
     if ( !(cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
            cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
            cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) &&
+           cond_alloc_cpumask_var(&per_cpu(hpet_scratch_cpumask, cpu)) &&
            cond_alloc_cpumask_var(&per_cpu(send_ipi_cpumask, cpu))) )
         goto out;
 
@@ -1239,6 +1245,7 @@ void __init smp_prepare_boot_cpu(void)
     cpumask_set_cpu(cpu, &cpu_present_map);
 #if NR_CPUS > 2 * BITS_PER_LONG
     per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask;
+    per_cpu(hpet_scratch_cpumask, cpu) = &hpet_scratch_cpu0mask;
     per_cpu(send_ipi_cpumask, cpu) = &send_ipi_cpu0mask;
 #endif
Re: [PATCH v2 for-4.21 3/9] x86/HPET: replace handle_hpet_broadcast()'s on-stack cpumask_t
Posted by Roger Pau Monné 1 week, 2 days ago
On Mon, Oct 20, 2025 at 01:19:20PM +0200, Jan Beulich wrote:
> With large NR_CPUS on-stack cpumask_t variables are problematic. Now that
> the IRQ handler can't be invoked in a nested manner anymore, we can
> instead use a per-CPU variable. While we can't use scratch_cpumask in code
> invoked from IRQ handlers, simply amend that one with a HPET-special form.
> (Note that only one of the two IRQ handling functions can come into play
> at any one time.)
> 
> Fixes: 996576b965cc ("xen: allow up to 16383 cpus")
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

> ---
> While doing this I noticed that this and all pre-existing uses of
> DEFINE_PER_CPU_READ_MOSTLY() aren't quite right: When the type is
> cpumask_var_t, the variable is read-mostly only when NR_CPUS <=
> 2 * BITS_PER_LONG. That'll want sorting separately, though.
> 
> It is important for this to not be moved ahead of "x86/HPET: use single,
> global, low-priority vector for broadcast IRQ", as the original call here
> from set_channel_irq_affinity() may not use the new variable (it would
> need to use scratch_cpumask instead).
> ---
> v2: New.
> 
> --- a/xen/arch/x86/hpet.c
> +++ b/xen/arch/x86/hpet.c
> @@ -196,7 +196,7 @@ static void evt_do_broadcast(cpumask_t *
>  
>  static void cf_check handle_hpet_broadcast(struct hpet_event_channel *ch)
>  {
> -    cpumask_t mask;
> +    cpumask_t *scratch = this_cpu(hpet_scratch_cpumask);
>      s_time_t now, next_event;
>      unsigned int cpu;
>      unsigned long flags;
> @@ -209,7 +209,7 @@ again:
>      spin_unlock_irqrestore(&ch->lock, flags);
>  
>      next_event = STIME_MAX;
> -    cpumask_clear(&mask);
> +    cpumask_clear(scratch);
>      now = NOW();
>  
>      /* find all expired events */
> @@ -218,13 +218,13 @@ again:
>          s_time_t deadline = ACCESS_ONCE(per_cpu(timer_deadline, cpu));
>  
>          if ( deadline <= now )
> -            __cpumask_set_cpu(cpu, &mask);
> +            __cpumask_set_cpu(cpu, scratch);
>          else if ( deadline < next_event )
>              next_event = deadline;
>      }
>  
>      /* wakeup the cpus which have an expired event. */
> -    evt_do_broadcast(&mask);
> +    evt_do_broadcast(scratch);
>  
>      if ( next_event != STIME_MAX )
>      {
> --- a/xen/arch/x86/include/asm/smp.h
> +++ b/xen/arch/x86/include/asm/smp.h
> @@ -22,6 +22,7 @@
>  DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
>  DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
>  DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
> +DECLARE_PER_CPU(cpumask_var_t, hpet_scratch_cpumask);

Should this be declared in the hpet.h header?

>  DECLARE_PER_CPU(cpumask_var_t, send_ipi_cpumask);
>  
>  /*
> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -55,6 +55,9 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t
>  DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
>  static cpumask_t scratch_cpu0mask;
>  
> +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, hpet_scratch_cpumask);
> +static cpumask_t hpet_scratch_cpu0mask;

And then this defined in hpet.c.

Just thinking whether someone would like to introduce support for
build time disabling HPET in the future.

Can always be moved at a later time anyway.

Thanks, Roger.

Re: [PATCH v2 for-4.21 3/9] x86/HPET: replace handle_hpet_broadcast()'s on-stack cpumask_t
Posted by Jan Beulich 1 week, 2 days ago
On 21.10.2025 16:08, Roger Pau Monné wrote:
> On Mon, Oct 20, 2025 at 01:19:20PM +0200, Jan Beulich wrote:
>> With large NR_CPUS on-stack cpumask_t variables are problematic. Now that
>> the IRQ handler can't be invoked in a nested manner anymore, we can
>> instead use a per-CPU variable. While we can't use scratch_cpumask in code
>> invoked from IRQ handlers, simply amend that one with a HPET-special form.
>> (Note that only one of the two IRQ handling functions can come into play
>> at any one time.)
>>
>> Fixes: 996576b965cc ("xen: allow up to 16383 cpus")
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.

>> --- a/xen/arch/x86/include/asm/smp.h
>> +++ b/xen/arch/x86/include/asm/smp.h
>> @@ -22,6 +22,7 @@
>>  DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
>>  DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
>>  DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
>> +DECLARE_PER_CPU(cpumask_var_t, hpet_scratch_cpumask);
> 
> Should this be declared in the hpet.h header?

Imo not without also ...

>> --- a/xen/arch/x86/smpboot.c
>> +++ b/xen/arch/x86/smpboot.c
>> @@ -55,6 +55,9 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t
>>  DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
>>  static cpumask_t scratch_cpu0mask;
>>  
>> +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, hpet_scratch_cpumask);
>> +static cpumask_t hpet_scratch_cpu0mask;
> 
> And then this defined in hpet.c.

... moving this. Which in turn I specifically avoided putting in hpet.c,
as otherwise the change would further grow, as would the risk of things
going out of sync.

Jan

Re: [PATCH v2 for-4.21 3/9] x86/HPET: replace handle_hpet_broadcast()'s on-stack cpumask_t
Posted by Jan Beulich 1 week, 3 days ago
On 20.10.2025 13:19, Jan Beulich wrote:
> With large NR_CPUS on-stack cpumask_t variables are problematic. Now that
> the IRQ handler can't be invoked in a nested manner anymore, we can
> instead use a per-CPU variable. While we can't use scratch_cpumask in code
> invoked from IRQ handlers, simply amend that one with a HPET-special form.
> (Note that only one of the two IRQ handling functions can come into play
> at any one time.)
> 
> Fixes: 996576b965cc ("xen: allow up to 16383 cpus")
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

While the cover letter (intentionally) says otherwise, I decided to leave the
4.21 tag in place here: We may want to consider this one, as making the
original problem less severe (leaving patch 2 out of the picture).

Jan