[PATCH for-4.21 03/10] x86/HPET: use single, global, low-priority vector for broadcast IRQ

Posted by Jan Beulich 2 weeks ago
Using dynamically allocated / maintained vectors has several downsides:
- possible nesting of IRQs due to the effects of IRQ migration,
- reduction of vectors available for devices,
- IRQs not moving as intended if there's shortage of vectors,
- higher runtime overhead.

As the vector also doesn't need to be of any particular priority (first and
foremost it really shouldn't be of higher or the same priority as the timer
IRQ, as that raises TIMER_SOFTIRQ anyway), avoid any "ordinary" vectors
altogether
and use a vector from the 0x10...0x1f exception vector space. Exception vs
interrupt can easily be distinguished by checking for the presence of an
error code.

Fixes: 996576b965cc ("xen: allow up to 16383 cpus")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
This is an alternative proposal to
https://lists.xen.org/archives/html/xen-devel/2014-03/msg00399.html.

The Fixes: tag indicates where the problem got significantly worse; in
principle it was there already before (crashing at perhaps 6 or 7 levels
of nested IRQs).
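
Editorial note, since the mechanism is subtle: the entry.S hunk at the end of
the diff tells the two cases apart purely by stack parity. In 64-bit mode the
CPU aligns RSP to 16 bytes before pushing the IRET frame, so an exception with
an error code has pushed 48 bytes by the time the stub runs, while an
interrupt has pushed only 40. A minimal standalone sketch of that check
(illustration only, not part of the patch; the function name is made up):

    #include <stdbool.h>
    #include <stdint.h>

    static bool frame_has_error_code(uint64_t rsp_at_entry)
    {
        /*
         * RSP ending in 0x0 -> 48 bytes pushed, i.e. #AC with its error code;
         * RSP ending in 0x8 -> 40 bytes pushed, i.e. the HPET broadcast
         * interrupt, for which the stub pushes a zero "error code" itself.
         */
        return (rsp_at_entry & 8) == 0;
    }

This is what the "test $8, %spl" / conditional "push $0" sequence in
entry_AC() implements, with the no-error-code case then branching to
common_interrupt.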

--- a/xen/arch/x86/hpet.c
+++ b/xen/arch/x86/hpet.c
@@ -9,17 +9,19 @@
 #include <xen/timer.h>
 #include <xen/smp.h>
 #include <xen/softirq.h>
+#include <xen/cpuidle.h>
 #include <xen/irq.h>
 #include <xen/numa.h>
 #include <xen/param.h>
 #include <xen/sched.h>
 
 #include <asm/apic.h>
-#include <asm/fixmap.h>
 #include <asm/div64.h>
+#include <asm/fixmap.h>
+#include <asm/genapic.h>
 #include <asm/hpet.h>
+#include <asm/irq-vectors.h>
 #include <asm/msi.h>
-#include <xen/cpuidle.h>
 
 #define MAX_DELTA_NS MILLISECS(10*1000)
 #define MIN_DELTA_NS MICROSECS(20)
@@ -307,15 +309,13 @@ static void cf_check hpet_msi_set_affini
     struct hpet_event_channel *ch = desc->action->dev_id;
     struct msi_msg msg = ch->msi.msg;
 
-    msg.dest32 = set_desc_affinity(desc, mask);
-    if ( msg.dest32 == BAD_APICID )
-        return;
+    /* This really is only for dump_irqs(). */
+    cpumask_copy(desc->arch.cpu_mask, mask);
 
-    msg.data &= ~MSI_DATA_VECTOR_MASK;
-    msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
+    msg.dest32 = cpu_mask_to_apicid(mask);
     msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
     msg.address_lo |= MSI_ADDR_DEST_ID(msg.dest32);
-    if ( msg.data != ch->msi.msg.data || msg.dest32 != ch->msi.msg.dest32 )
+    if ( msg.dest32 != ch->msi.msg.dest32 )
         hpet_msi_write(ch, &msg);
 }
 
@@ -328,7 +328,7 @@ static hw_irq_controller hpet_msi_type =
     .shutdown   = hpet_msi_shutdown,
     .enable	    = hpet_msi_unmask,
     .disable    = hpet_msi_mask,
-    .ack        = ack_nonmaskable_msi_irq,
+    .ack        = irq_actor_none,
     .end        = end_nonmaskable_irq,
     .set_affinity   = hpet_msi_set_affinity,
 };
@@ -347,6 +347,12 @@ static int __init hpet_setup_msi_irq(str
     u32 cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
     irq_desc_t *desc = irq_to_desc(ch->msi.irq);
 
+    clear_irq_vector(ch->msi.irq);
+    ret = bind_irq_vector(ch->msi.irq, HPET_BROADCAST_VECTOR, &cpu_online_map);
+    if ( ret )
+        return ret;
+    cpumask_setall(desc->affinity);
+
     if ( iommu_intremap != iommu_intremap_off )
     {
         ch->msi.hpet_id = hpet_blockid;
@@ -457,7 +463,7 @@ static struct hpet_event_channel *hpet_g
     /*
      * Try the least recently used channel first.  It may still have its IRQ's
      * affinity set to the desired CPU.  This way we also limit having multiple
-     * of our IRQs raised on the same CPU, in possibly a nested manner.
+     * of our IRQs raised on the same CPU.
      */
     ch = per_cpu(lru_channel, cpu);
     if ( ch && !test_and_set_bit(HPET_EVT_USED_BIT, &ch->flags) )
@@ -497,6 +503,7 @@ static void set_channel_irq_affinity(str
     spin_lock(&desc->lock);
     hpet_msi_mask(desc);
     hpet_msi_set_affinity(desc, cpumask_of(ch->cpu));
+    per_cpu(vector_irq, ch->cpu)[HPET_BROADCAST_VECTOR] = ch->msi.irq;
     hpet_msi_unmask(desc);
     spin_unlock(&desc->lock);
 
--- a/xen/arch/x86/include/asm/irq-vectors.h
+++ b/xen/arch/x86/include/asm/irq-vectors.h
@@ -18,6 +18,15 @@
 /* IRQ0 (timer) is statically allocated but must be high priority. */
 #define IRQ0_VECTOR             0xf0
 
+/*
+ * Low-priority (for now statically allocated) vectors, sharing entry
+ * points with exceptions in the 0x10 ... 0x1f range, as long as the
+ * respective exception has an error code.
+ */
+#define FIRST_LOPRIORITY_VECTOR 0x10
+#define HPET_BROADCAST_VECTOR   X86_EXC_AC
+#define LAST_LOPRIORITY_VECTOR  0x1f
+
 /* Legacy PIC uses vectors 0x20-0x2f. */
 #define FIRST_LEGACY_VECTOR     FIRST_DYNAMIC_VECTOR
 #define LAST_LEGACY_VECTOR      (FIRST_LEGACY_VECTOR + 0xf)
@@ -40,7 +49,7 @@
 /* There's no IRQ2 at the PIC. */
 #define IRQ_MOVE_CLEANUP_VECTOR (FIRST_LEGACY_VECTOR + 2)
 
-#define FIRST_IRQ_VECTOR        FIRST_DYNAMIC_VECTOR
+#define FIRST_IRQ_VECTOR        FIRST_LOPRIORITY_VECTOR
 #define LAST_IRQ_VECTOR         LAST_HIPRIORITY_VECTOR
 
 #endif /* _ASM_IRQ_VECTORS_H */
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -755,8 +755,9 @@ void setup_vector_irq(unsigned int cpu)
         if ( !irq_desc_initialized(desc) )
             continue;
         vector = irq_to_vector(irq);
-        if ( vector >= FIRST_HIPRIORITY_VECTOR &&
-             vector <= LAST_HIPRIORITY_VECTOR )
+        if ( vector <= (vector >= FIRST_HIPRIORITY_VECTOR
+                        ? LAST_HIPRIORITY_VECTOR
+                        : LAST_LOPRIORITY_VECTOR) )
             cpumask_set_cpu(cpu, desc->arch.cpu_mask);
         else if ( !cpumask_test_cpu(cpu, desc->arch.cpu_mask) )
             continue;
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -158,7 +158,7 @@ void msi_compose_msg(unsigned vector, co
 {
     memset(msg, 0, sizeof(*msg));
 
-    if ( vector < FIRST_DYNAMIC_VECTOR )
+    if ( vector < FIRST_LOPRIORITY_VECTOR )
         return;
 
     if ( cpu_mask )
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -1045,7 +1045,13 @@ END(entry_GP)
 
 FUNC(entry_AC)
         ENDBR64
+        /* #AC shares its entry point with the HPET broadcast interrupt. */
+        test  $8, %spl
+        jz    .Lac
+        push  $0
+.Lac:
         movb  $X86_EXC_AC, EFRAME_entry_vector(%rsp)
+        jnz   common_interrupt
         jmp   handle_exception
 END(entry_AC)
Re: [PATCH for-4.21 03/10] x86/HPET: use single, global, low-priority vector for broadcast IRQ
Posted by Andrew Cooper 2 weeks ago
On 16/10/2025 8:32 am, Jan Beulich wrote:
> Using dynamically allocated / maintained vectors has several downsides:
> - possible nesting of IRQs due to the effects of IRQ migration,
> - reduction of vectors available for devices,
> - IRQs not moving as intended if there's shortage of vectors,
> - higher runtime overhead.
>
> As the vector also doesn't need to be of any priority (first and foremost
> it really shouldn't be of higher or same priority as the timer IRQ, as
> that raises TIMER_SOFTIRQ anyway), avoid any "ordinary" vectors altogether
> and use a vector from the 0x10...0x1f exception vector space. Exception vs
> interrupt can easily be distinguished by checking for the presence of an
> error code.
>
> Fixes: 996576b965cc ("xen: allow up to 16383 cpus")
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

This is so cunning that it took me a while to figure out how this even
functioned, given no apparent change to TPR.

Having this behaviour under FRED is easy.  In fact, allowing the use of
vectors 0x10-0x1f under FRED is one "extra" I haven't gotten around to
doing yet.

But, the problem it introduces under IDT is that for all the other
reserved exceptions, we'll panic if we see them.  That was the point of
setting TPR to 0x10 originally.

~Andrew

Re: [PATCH for-4.21 03/10] x86/HPET: use single, global, low-priority vector for broadcast IRQ
Posted by Jan Beulich 1 week, 6 days ago
On 16.10.2025 19:01, Andrew Cooper wrote:
> On 16/10/2025 8:32 am, Jan Beulich wrote:
>> Using dynamically allocated / maintained vectors has several downsides:
>> - possible nesting of IRQs due to the effects of IRQ migration,
>> - reduction of vectors available for devices,
>> - IRQs not moving as intended if there's shortage of vectors,
>> - higher runtime overhead.
>>
>> As the vector also doesn't need to be of any priority (first and foremost
>> it really shouldn't be of higher or same priority as the timer IRQ, as
>> that raises TIMER_SOFTIRQ anyway), avoid any "ordinary" vectors altogether
>> and use a vector from the 0x10...0x1f exception vector space. Exception vs
>> interrupt can easily be distinguished by checking for the presence of an
>> error code.
>>
>> Fixes: 996576b965cc ("xen: allow up to 16383 cpus")
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> This is so cunning that it took me a while to figure out how this even
> functioned, given no apparent change to TPR.

Yeah, the TPR part took me a while to figure out. I was scratching my head
as to why I wasn't seeing any interrupts, until I figured I need to make
that adjustment to FIRST_IRQ_VECTOR.
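
For context on the TPR point (an editorial aside): architecturally the local
APIC only delivers a fixed interrupt whose priority class, i.e. bits 7:4 of
the vector, is strictly greater than the class held in TPR. A standalone model
of that rule (illustration only; it ignores PPR/ISR, and assumes, as this
sub-thread indicates, that Xen derives its TPR value from FIRST_IRQ_VECTOR):

    #include <stdbool.h>
    #include <stdint.h>

    static bool apic_would_deliver(uint8_t vector, uint8_t tpr)
    {
        /* Vectors 0x10-0x1f are class 1: blocked while TPR is 0x10 or above. */
        return (vector >> 4) > (tpr >> 4);
    }

With TPR effectively left at 0x10, the 0x10 ... 0x1f vectors could never be
delivered, which is why lowering FIRST_IRQ_VECTOR to FIRST_LOPRIORITY_VECTOR
is part of the patch.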

> Having this behaviour under FRED is easy.  In fact, allowing the use of
> vectors 0x10-0x1f under FRED is one "extra" I haven't gotten around to
> doing yet.

Ah yes.

> But, the problem it introduces under IDT is that for all the other
> reserved exceptions, we'll panic if we see them.  That was the point of
> setting TPR to 0x10 originally.

Which would be a clear indication of something being wrong, rather than
things misbehaving (likely) entirely silently. So more of a benefit than
a downside?

Of course, if we were afraid of other vectors getting signaled, other
entry points may need amending similarly to what I do to entry_AC(). It
then isn't quite clear to me what, if anything, to do to entry points of
exceptions coming without error code.

Jan

Re: [PATCH for-4.21 03/10] x86/HPET: use single, global, low-priority vector for broadcast IRQ
Posted by Roger Pau Monné 2 weeks ago
On Thu, Oct 16, 2025 at 09:32:04AM +0200, Jan Beulich wrote:
> Using dynamically allocated / maintained vectors has several downsides:
> - possible nesting of IRQs due to the effects of IRQ migration,
> - reduction of vectors available for devices,
> - IRQs not moving as intended if there's shortage of vectors,
> - higher runtime overhead.
> 
> As the vector also doesn't need to be of any priority (first and foremost
> it really shouldn't be of higher or same priority as the timer IRQ, as
> that raises TIMER_SOFTIRQ anyway), avoid any "ordinary" vectors altogether
> and use a vector from the 0x10...0x1f exception vector space. Exception vs
> interrupt can easily be distinguished by checking for the presence of an
> error code.
> 
> Fixes: 996576b965cc ("xen: allow up to 16383 cpus")
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> This is an alternative proposal to
> https://lists.xen.org/archives/html/xen-devel/2014-03/msg00399.html.
> 
> The Fixes: tag indicates where the problem got significantly worse; in
> principle it was there already before (crashing at perhaps 6 or 7 levels
> of nested IRQs).
> 
> --- a/xen/arch/x86/hpet.c
> +++ b/xen/arch/x86/hpet.c
> @@ -9,17 +9,19 @@
>  #include <xen/timer.h>
>  #include <xen/smp.h>
>  #include <xen/softirq.h>
> +#include <xen/cpuidle.h>
>  #include <xen/irq.h>
>  #include <xen/numa.h>
>  #include <xen/param.h>
>  #include <xen/sched.h>
>  
>  #include <asm/apic.h>
> -#include <asm/fixmap.h>
>  #include <asm/div64.h>
> +#include <asm/fixmap.h>
> +#include <asm/genapic.h>
>  #include <asm/hpet.h>
> +#include <asm/irq-vectors.h>
>  #include <asm/msi.h>
> -#include <xen/cpuidle.h>
>  
>  #define MAX_DELTA_NS MILLISECS(10*1000)
>  #define MIN_DELTA_NS MICROSECS(20)
> @@ -307,15 +309,13 @@ static void cf_check hpet_msi_set_affini
>      struct hpet_event_channel *ch = desc->action->dev_id;
>      struct msi_msg msg = ch->msi.msg;
>  
> -    msg.dest32 = set_desc_affinity(desc, mask);
> -    if ( msg.dest32 == BAD_APICID )
> -        return;
> +    /* This really is only for dump_irqs(). */
> +    cpumask_copy(desc->arch.cpu_mask, mask);

If you no longer call set_desc_affinity(), could you adjust the second
parameter of hpet_msi_set_affinity() to be unsigned int cpu instead of
a cpumask?

And here just clear desc->arch.cpu_mask and set the passed CPU.

>  
> -    msg.data &= ~MSI_DATA_VECTOR_MASK;
> -    msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
> +    msg.dest32 = cpu_mask_to_apicid(mask);

And here you can just use cpu_physical_id().

>      msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
>      msg.address_lo |= MSI_ADDR_DEST_ID(msg.dest32);
> -    if ( msg.data != ch->msi.msg.data || msg.dest32 != ch->msi.msg.dest32 )
> +    if ( msg.dest32 != ch->msi.msg.dest32 )
>          hpet_msi_write(ch, &msg);

A further note here, which ties to my comment on the previous patch
about losing the interrupt during the masked window.  If the vector
is the same across all CPUs, we no longer need to update the MSI data
field, just the address one, which can be done atomically.  We also
have signaling from the IOMMU whether the MSI fields need writing.

We can avoid the masking, and the possible drop of interrupts.
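
A rough sketch of this idea (editorial illustration, not Xen code): per the
HPET spec a timer's 64-bit FSB route register carries the MSI data in its low
half and the MSI address in its high half, so with a fixed vector only the
address half needs rewriting, which is a single naturally-atomic 32-bit store
(xAPIC physical-destination address format assumed). The struct and function
below are stand-ins, not actual Xen interfaces:

    #include <stdint.h>

    /* Stand-in for a timer's 64-bit FSB interrupt route register. */
    struct fsb_route {
        uint32_t data;     /* low half: MSI data, i.e. the (now fixed) vector */
        uint32_t address;  /* high half: MSI address, i.e. the destination */
    };

    static void retarget(volatile struct fsb_route *route, uint32_t dest_apic_id)
    {
        /* Only the destination moves: one 32-bit write, no mask window needed. */
        route->address = 0xfee00000u | ((dest_apic_id & 0xff) << 12);
    }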

>  }
>  
> @@ -328,7 +328,7 @@ static hw_irq_controller hpet_msi_type =
>      .shutdown   = hpet_msi_shutdown,
>      .enable	    = hpet_msi_unmask,
>      .disable    = hpet_msi_mask,
> -    .ack        = ack_nonmaskable_msi_irq,
> +    .ack        = irq_actor_none,
>      .end        = end_nonmaskable_irq,
>      .set_affinity   = hpet_msi_set_affinity,
>  };
> @@ -347,6 +347,12 @@ static int __init hpet_setup_msi_irq(str
>      u32 cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
>      irq_desc_t *desc = irq_to_desc(ch->msi.irq);
>  
> +    clear_irq_vector(ch->msi.irq);
> +    ret = bind_irq_vector(ch->msi.irq, HPET_BROADCAST_VECTOR, &cpu_online_map);
> +    if ( ret )
> +        return ret;
> +    cpumask_setall(desc->affinity);
> +
>      if ( iommu_intremap != iommu_intremap_off )
>      {
>          ch->msi.hpet_id = hpet_blockid;
> @@ -457,7 +463,7 @@ static struct hpet_event_channel *hpet_g
>      /*
>       * Try the least recently used channel first.  It may still have its IRQ's
>       * affinity set to the desired CPU.  This way we also limit having multiple
> -     * of our IRQs raised on the same CPU, in possibly a nested manner.
> +     * of our IRQs raised on the same CPU.
>       */
>      ch = per_cpu(lru_channel, cpu);
>      if ( ch && !test_and_set_bit(HPET_EVT_USED_BIT, &ch->flags) )
> @@ -497,6 +503,7 @@ static void set_channel_irq_affinity(str
>      spin_lock(&desc->lock);
>      hpet_msi_mask(desc);
>      hpet_msi_set_affinity(desc, cpumask_of(ch->cpu));
> +    per_cpu(vector_irq, ch->cpu)[HPET_BROADCAST_VECTOR] = ch->msi.irq;

I would set the vector table ahead of setting the affinity, in case we
can drop the mask calls around this block of code.

I also wonder, do you really need the bind_irq_vector() if you
manually set the affinity afterwards, and the vector table plus
desc->arch.cpu_mask are also set here?

>      hpet_msi_unmask(desc);
>      spin_unlock(&desc->lock);
>  
> --- a/xen/arch/x86/include/asm/irq-vectors.h
> +++ b/xen/arch/x86/include/asm/irq-vectors.h
> @@ -18,6 +18,15 @@
>  /* IRQ0 (timer) is statically allocated but must be high priority. */
>  #define IRQ0_VECTOR             0xf0
>  
> +/*
> + * Low-priority (for now statically allocated) vectors, sharing entry
> + * points with exceptions in the 0x10 ... 0x1f range, as long as the
> + * respective exception has an error code.
> + */
> +#define FIRST_LOPRIORITY_VECTOR 0x10
> +#define HPET_BROADCAST_VECTOR   X86_EXC_AC
> +#define LAST_LOPRIORITY_VECTOR  0x1f

I wonder if it won't be clearer to simply reserve a vector if the HPET
is used, instead of hijacking the AC one.  It's one vector less, but
arguably now that we unconditionally use physical destination mode our
pool of vectors has expanded considerably.

> +
>  /* Legacy PIC uses vectors 0x20-0x2f. */
>  #define FIRST_LEGACY_VECTOR     FIRST_DYNAMIC_VECTOR
>  #define LAST_LEGACY_VECTOR      (FIRST_LEGACY_VECTOR + 0xf)
> @@ -40,7 +49,7 @@
>  /* There's no IRQ2 at the PIC. */
>  #define IRQ_MOVE_CLEANUP_VECTOR (FIRST_LEGACY_VECTOR + 2)
>  
> -#define FIRST_IRQ_VECTOR        FIRST_DYNAMIC_VECTOR
> +#define FIRST_IRQ_VECTOR        FIRST_LOPRIORITY_VECTOR
>  #define LAST_IRQ_VECTOR         LAST_HIPRIORITY_VECTOR
>  
>  #endif /* _ASM_IRQ_VECTORS_H */
> --- a/xen/arch/x86/irq.c
> +++ b/xen/arch/x86/irq.c
> @@ -755,8 +755,9 @@ void setup_vector_irq(unsigned int cpu)
>          if ( !irq_desc_initialized(desc) )
>              continue;
>          vector = irq_to_vector(irq);
> -        if ( vector >= FIRST_HIPRIORITY_VECTOR &&
> -             vector <= LAST_HIPRIORITY_VECTOR )
> +        if ( vector <= (vector >= FIRST_HIPRIORITY_VECTOR
> +                        ? LAST_HIPRIORITY_VECTOR
> +                        : LAST_LOPRIORITY_VECTOR) )
>              cpumask_set_cpu(cpu, desc->arch.cpu_mask);

I think this is wrong.  The low priority vector used by the HPET will
only target a single CPU at a time, and hence adding extra CPUs to
that mask as part of AP bringup is not correct.
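
For readers parsing the quoted condition: it accepts either a high-priority
vector within its range, or anything at/below LAST_LOPRIORITY_VECTOR. A
standalone expansion (editorial model only; the HIPRIORITY values below are
placeholders, the real ones live in irq-vectors.h, and the function name is
made up):

    #include <stdbool.h>

    #define LAST_LOPRIORITY_VECTOR  0x1f
    #define FIRST_HIPRIORITY_VECTOR 0xf1   /* placeholder for illustration */
    #define LAST_HIPRIORITY_VECTOR  0xfe   /* placeholder for illustration */

    static bool statically_bound(unsigned int vector)
    {
        /* Same meaning as the ternary in the hunk, just spelled out. */
        return vector >= FIRST_HIPRIORITY_VECTOR
               ? vector <= LAST_HIPRIORITY_VECTOR
               : vector <= LAST_LOPRIORITY_VECTOR;
    }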

Thanks, Roger.
Re: [PATCH for-4.21 03/10] x86/HPET: use single, global, low-priority vector for broadcast IRQ
Posted by Jan Beulich 1 week, 6 days ago
On 16.10.2025 18:27, Roger Pau Monné wrote:
> On Thu, Oct 16, 2025 at 09:32:04AM +0200, Jan Beulich wrote:
>> @@ -307,15 +309,13 @@ static void cf_check hpet_msi_set_affini
>>      struct hpet_event_channel *ch = desc->action->dev_id;
>>      struct msi_msg msg = ch->msi.msg;
>>  
>> -    msg.dest32 = set_desc_affinity(desc, mask);
>> -    if ( msg.dest32 == BAD_APICID )
>> -        return;
>> +    /* This really is only for dump_irqs(). */
>> +    cpumask_copy(desc->arch.cpu_mask, mask);
> 
> If you no longer call set_desc_affinity(), could you adjust the second
> parameter of hpet_msi_set_affinity() to be unsigned int cpu instead of
> a cpumask?

Looks like I could, yes. But then we need to split the function, as it's
also used as the .set_affinity hook.

> And here just clear desc->arch.cpu_mask and set the passed CPU.

Which would still better be a cpumask_copy(), just given cpumask_of(cpu)
as input.

>> -    msg.data &= ~MSI_DATA_VECTOR_MASK;
>> -    msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
>> +    msg.dest32 = cpu_mask_to_apicid(mask);
> 
> And here you can just use cpu_physical_id().

Right. All of which (up to here; but see below) perhaps better a separate,
follow-on cleanup change.

>>      msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
>>      msg.address_lo |= MSI_ADDR_DEST_ID(msg.dest32);
>> -    if ( msg.data != ch->msi.msg.data || msg.dest32 != ch->msi.msg.dest32 )
>> +    if ( msg.dest32 != ch->msi.msg.dest32 )
>>          hpet_msi_write(ch, &msg);
> 
> A further note here, which ties to my comment on the previous patch
> about losing the interrupt during the masked window.  If the vector
> is the same across all CPUs, we no longer need to update the MSI data
> field, just the address one, which can be done atomically.  We also
> have signaling from the IOMMU whether the MSI fields need writing.

Hmm, yes, we can leverage that, as long as we're willing to make assumptions
here about what exactly iommu_update_ire_from_msi() does: We'd then rely on
not only the original (untranslated) msg->data not changing, but also the
translated one. That looks to hold for both Intel and AMD, but it's still
something we want to be sure we actually want to make the code dependent
upon. (I'm intending to at least add an assertion to that effect.)

> We can avoid the masking, and the possible drop of interrupts.

Hmm, right. There's nothing wrong with the caller relying on the write
being atomic now. (Really, continuing to use hpet_msi_write() wouldn't
be a problem, as re-writing the low half of HPET_Tn_ROUTE() with the
same value is going to be benign. Unless of course that write was the
source of the extra IRQs I'm seeing.)

Taking together with what you said further up, having
set_channel_irq_affinity() no longer use hpet_msi_set_affinity() as it
is to ...

>> @@ -328,7 +328,7 @@ static hw_irq_controller hpet_msi_type =
>>      .shutdown   = hpet_msi_shutdown,
>>      .enable	    = hpet_msi_unmask,
>>      .disable    = hpet_msi_mask,
>> -    .ack        = ack_nonmaskable_msi_irq,
>> +    .ack        = irq_actor_none,
>>      .end        = end_nonmaskable_irq,
>>      .set_affinity   = hpet_msi_set_affinity,

... satisfy the use here would then probably be desirable right away.
The little bit that's left of hpet_msi_set_affinity() would then be
open-coded in set_channel_irq_affinity().

Getting rid of the masking would (hopefully) also get rid of the stray
IRQs that I'm observing, assuming my guessing towards the reason there
is correct.

>> @@ -497,6 +503,7 @@ static void set_channel_irq_affinity(str
>>      spin_lock(&desc->lock);
>>      hpet_msi_mask(desc);
>>      hpet_msi_set_affinity(desc, cpumask_of(ch->cpu));
>> +    per_cpu(vector_irq, ch->cpu)[HPET_BROADCAST_VECTOR] = ch->msi.irq;
> 
> I would set the vector table ahead of setting the affinity, in case we
> can drop the mask calls around this block of code.

Isn't there a problematic window either way round? I can make the change,
but I don't see that addressing anything. The new comparator value will
be written later anyway, and interrupts up to that point aren't of any
interest anyway. I.e. it doesn't matter which of the CPUs gets to handle
them.

> I also wonder, do you really need the bind_irq_vector() if you
> manually set the affinity afterwards, and the vector table plus
> desc->arch.cpu_mask are also set here?

At the very least I'd then also need to open-code the setting of
desc->arch.vector and desc->arch.used. Possibly also the setting of the
bit in desc->arch.used_vectors. And strictly speaking also the
trace_irq_mask() invocation.

>> --- a/xen/arch/x86/include/asm/irq-vectors.h
>> +++ b/xen/arch/x86/include/asm/irq-vectors.h
>> @@ -18,6 +18,15 @@
>>  /* IRQ0 (timer) is statically allocated but must be high priority. */
>>  #define IRQ0_VECTOR             0xf0
>>  
>> +/*
>> + * Low-priority (for now statically allocated) vectors, sharing entry
>> + * points with exceptions in the 0x10 ... 0x1f range, as long as the
>> + * respective exception has an error code.
>> + */
>> +#define FIRST_LOPRIORITY_VECTOR 0x10
>> +#define HPET_BROADCAST_VECTOR   X86_EXC_AC
>> +#define LAST_LOPRIORITY_VECTOR  0x1f
> 
> I wonder if it won't be clearer to simply reserve a vector if the HPET
> is used, instead of hijacking the AC one.  It's one vector less, but
> arguably now that we unconditionally use physical destination mode our
> pool of vectors has expanded considerably.

Well, I'd really like to avoid consuming an otherwise usable vector, if
at all possible (as per Andrew's FRED plans, that won't be possible
there anymore then).

>> --- a/xen/arch/x86/irq.c
>> +++ b/xen/arch/x86/irq.c
>> @@ -755,8 +755,9 @@ void setup_vector_irq(unsigned int cpu)
>>          if ( !irq_desc_initialized(desc) )
>>              continue;
>>          vector = irq_to_vector(irq);
>> -        if ( vector >= FIRST_HIPRIORITY_VECTOR &&
>> -             vector <= LAST_HIPRIORITY_VECTOR )
>> +        if ( vector <= (vector >= FIRST_HIPRIORITY_VECTOR
>> +                        ? LAST_HIPRIORITY_VECTOR
>> +                        : LAST_LOPRIORITY_VECTOR) )
>>              cpumask_set_cpu(cpu, desc->arch.cpu_mask);
> 
> I think this is wrong.  The low priority vector used by the HPET will
> only target a single CPU at a time, and hence adding extra CPUs to
> that mask as part of AP bringup is not correct.

I'm not sure about "wrong". It's not strictly necessary for the HPET one,
I expect, but it's generally what would be necessary. For the HPET one,
hpet_msi_set_affinity() replaces the value anyway. (I can add a sentence
to this effect to the description, if that helps.)

Jan

Re: [PATCH for-4.21 03/10] x86/HPET: use single, global, low-priority vector for broadcast IRQ
Posted by Roger Pau Monné 1 week, 6 days ago
On Fri, Oct 17, 2025 at 09:15:08AM +0200, Jan Beulich wrote:
> On 16.10.2025 18:27, Roger Pau Monné wrote:
> > On Thu, Oct 16, 2025 at 09:32:04AM +0200, Jan Beulich wrote:
> >> @@ -307,15 +309,13 @@ static void cf_check hpet_msi_set_affini
> >>      struct hpet_event_channel *ch = desc->action->dev_id;
> >>      struct msi_msg msg = ch->msi.msg;
> >>  
> >> -    msg.dest32 = set_desc_affinity(desc, mask);
> >> -    if ( msg.dest32 == BAD_APICID )
> >> -        return;
> >> +    /* This really is only for dump_irqs(). */
> >> +    cpumask_copy(desc->arch.cpu_mask, mask);
> > 
> > If you no longer call set_desc_affinity(), could you adjust the second
> > parameter of hpet_msi_set_affinity() to be unsigned int cpu instead of
> > a cpumask?
> 
> Looks like I could, yes. But then we need to split the function, as it's
> also used as the .set_affinity hook.

I see, I wasn't taking that into account.

> > And here just clear desc->arch.cpu_mask and set the passed CPU.
> 
> Which would still better be a cpumask_copy(), just given cpumask_of(cpu)
> as input.

As is it, yes.

> >> -    msg.data &= ~MSI_DATA_VECTOR_MASK;
> >> -    msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
> >> +    msg.dest32 = cpu_mask_to_apicid(mask);
> > 
> > And here you can just use cpu_physical_id().
> 
> Right. All of which (up to here; but see below) perhaps better a separate,
> follow-on cleanup change.

Yes, it's too much fuss, and I also have plans in that area to deal
with it myself anyway.  Just wanted to avoid changing this now to be
changed again.  But it's too unrelated to put in this change.

> >>      msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
> >>      msg.address_lo |= MSI_ADDR_DEST_ID(msg.dest32);
> >> -    if ( msg.data != ch->msi.msg.data || msg.dest32 != ch->msi.msg.dest32 )
> >> +    if ( msg.dest32 != ch->msi.msg.dest32 )
> >>          hpet_msi_write(ch, &msg);
> > 
> > A further note here, which ties to my comment on the previous patch
> > about losing the interrupt during the masked window.  If the vector
> > is the same across all CPUs, we no longer need to update the MSI data
> > field, just the address one, which can be done atomically.  We also
> > have signaling from the IOMMU whether the MSI fields need writing.
> 
> Hmm, yes, we can leverage that, as long as we're willing to make assumptions
> here about what exactly iommu_update_ire_from_msi() does: We'd then rely on
> not only the original (untranslated) msg->data not changing, but also the
> translated one. That looks to hold for both Intel and AMD, but it's still
> something we want to be sure we actually want to make the code dependent
> upon. (I'm intending to at least add an assertion to that effect.)

We could still mask when needed, but the masking would be
conditionally done in hpet_msi_write().

It seems however this might be better done as a followup change.

> > We can avoid the masking, and the possible drop of interrupts.
> 
> Hmm, right. There's nothing wrong with the caller relying on the write
> being atomic now. (Really, continuing to use hpet_msi_write() wouldn't
> be a problem, as re-writing the low half of HPET_Tn_ROUTE() with the
> same value is going to be benign. Unless of course that write was the
> source of the extra IRQs I'm seeing.)

Oh, yes, that's right, we don't even need to avoid the write.

> Taking together with what you said further up, having
> set_channel_irq_affinity() no longer use hpet_msi_set_affinity() as it
> is to ...
> 
> >> @@ -328,7 +328,7 @@ static hw_irq_controller hpet_msi_type =
> >>      .shutdown   = hpet_msi_shutdown,
> >>      .enable	    = hpet_msi_unmask,
> >>      .disable    = hpet_msi_mask,
> >> -    .ack        = ack_nonmaskable_msi_irq,
> >> +    .ack        = irq_actor_none,
> >>      .end        = end_nonmaskable_irq,
> >>      .set_affinity   = hpet_msi_set_affinity,
> 
> ... satisfy the use here would then probably be desirable right away.
> The little bit that's left of hpet_msi_set_affinity() would then be
> open-coded in set_channel_irq_affinity().

As you see fit, I'm not going to insist if the changes become too
unrelated to the fix itself.  Can always be done as a followup patch,
especially taking into account we are in hard code freeze.

> Getting rid of the masking would (hopefully) also get rid of the stray
> IRQs that I'm observing, assuming my guessing towards the reason there
> is correct.
> 
> >> @@ -497,6 +503,7 @@ static void set_channel_irq_affinity(str
> >>      spin_lock(&desc->lock);
> >>      hpet_msi_mask(desc);
> >>      hpet_msi_set_affinity(desc, cpumask_of(ch->cpu));
> >> +    per_cpu(vector_irq, ch->cpu)[HPET_BROADCAST_VECTOR] = ch->msi.irq;
> > 
> > I would set the vector table ahead of setting the affinity, in case we
> > can drop the mask calls around this block of code.
> 
> Isn't there a problematic window either way round? I can make the change,
> but I don't see that addressing anything. The new comparator value will
> be written later anyway, and interrupts up to that point aren't of any
> interest anyway. I.e. it doesn't matter which of the CPUs gets to handle
> them.

It's preferable to get a silent stray interrupt (if the per-cpu vector
table is correctly set up), rather than to get a message from Xen that
an unknown vector has been received?

If a vector is injected ahead of vector_irq being set, Xen would
complain in do_IRQ() that there's no handler for such a vector.

> > I also wonder, do you really need the bind_irq_vector() if you
> > manually set the affinity afterwards, and the vector table plus
> > desc->arch.cpu_mask are also set here?
> 
> At the very least I'd then also need to open-code the setting of
> desc->arch.vector and desc->arch.used. Possibly also the setting of the
> bit in desc->arch.used_vectors. And strictly speaking also the
> trace_irq_mask() invocation.

Let's keep it as-is.

> >> --- a/xen/arch/x86/include/asm/irq-vectors.h
> >> +++ b/xen/arch/x86/include/asm/irq-vectors.h
> >> @@ -18,6 +18,15 @@
> >>  /* IRQ0 (timer) is statically allocated but must be high priority. */
> >>  #define IRQ0_VECTOR             0xf0
> >>  
> >> +/*
> >> + * Low-priority (for now statically allocated) vectors, sharing entry
> >> + * points with exceptions in the 0x10 ... 0x1f range, as long as the
> >> + * respective exception has an error code.
> >> + */
> >> +#define FIRST_LOPRIORITY_VECTOR 0x10
> >> +#define HPET_BROADCAST_VECTOR   X86_EXC_AC
> >> +#define LAST_LOPRIORITY_VECTOR  0x1f
> > 
> > I wonder if it won't be clearer to simply reserve a vector if the HPET
> > is used, instead of hijacking the AC one.  It's one vector less, but
> > arguably now that we unconditionally use physical destination mode our
> > pool of vectors has expanded considerably.
> 
> Well, I'd really like to avoid consuming an otherwise usable vector, if
> at all possible (as per Andrew's FRED plans, that won't be possible
> there anymore then).

If re-using the AC vector is not possible with FRED we might want to
do this uniformly and always consume a vector then?

> >> --- a/xen/arch/x86/irq.c
> >> +++ b/xen/arch/x86/irq.c
> >> @@ -755,8 +755,9 @@ void setup_vector_irq(unsigned int cpu)
> >>          if ( !irq_desc_initialized(desc) )
> >>              continue;
> >>          vector = irq_to_vector(irq);
> >> -        if ( vector >= FIRST_HIPRIORITY_VECTOR &&
> >> -             vector <= LAST_HIPRIORITY_VECTOR )
> >> +        if ( vector <= (vector >= FIRST_HIPRIORITY_VECTOR
> >> +                        ? LAST_HIPRIORITY_VECTOR
> >> +                        : LAST_LOPRIORITY_VECTOR) )
> >>              cpumask_set_cpu(cpu, desc->arch.cpu_mask);
> > 
> > I think this is wrong.  The low priority vector used by the HPET will
> > only target a single CPU at a time, and hence adding extra CPUs to
> > that mask as part of AP bringup is not correct.
> 
> I'm not sure about "wrong". It's not strictly necessary for the HPET one,
> I expect, but it's generally what would be necessary. For the HPET one,
> hpet_msi_set_affinity() replaces the value anyway. (I can add a sentence
> to this effect to the description, if that helps.)

I do think it's wrong, it's just not harmful per-se apart from showing
up in the output of dump_irqs().  The value in desc->arch.cpu_mask
should be the CPU that's the destination of the interrupt.  In this
case, the HPET interrupt does have a single destination at a given
time, and adding another one will make the output of dump_irqs() show
two destinations, when the interrupt will target a single CPU.

If anything you should add the CPU to the affinity set
(desc->affinity), but that's not needed since you already init the
affinity mask with cpumask_setall().

FWIW, I'm working on tentatively getting rid of the
desc->arch.{cpu,old_cpu,pending}_mask fields and converting them to
plain unsigned ints after we have dropped logical interrupt delivery
for external interrupts.

Thanks, Roger.

Re: [PATCH for-4.21 03/10] x86/HPET: use single, global, low-priority vector for broadcast IRQ
Posted by Jan Beulich 1 week, 3 days ago
On 17.10.2025 10:20, Roger Pau Monné wrote:
> On Fri, Oct 17, 2025 at 09:15:08AM +0200, Jan Beulich wrote:
>> On 16.10.2025 18:27, Roger Pau Monné wrote:
>>> On Thu, Oct 16, 2025 at 09:32:04AM +0200, Jan Beulich wrote:
>>>> @@ -497,6 +503,7 @@ static void set_channel_irq_affinity(str
>>>>      spin_lock(&desc->lock);
>>>>      hpet_msi_mask(desc);
>>>>      hpet_msi_set_affinity(desc, cpumask_of(ch->cpu));
>>>> +    per_cpu(vector_irq, ch->cpu)[HPET_BROADCAST_VECTOR] = ch->msi.irq;
>>>
>>> I would set the vector table ahead of setting the affinity, in case we
>>> can drop the mask calls around this block of code.
>>
>> Isn't there a problematic window either way round? I can make the change,
>> but I don't see that addressing anything. The new comparator value will
>> be written later anyway, and interrupts up to that point aren't of any
>> interest anyway. I.e. it doesn't matter which of the CPUs gets to handle
>> them.
> 
> It's preferable to get a silent stray interrupt (if the per-cpu vector
> table is correctly setup), rather than to get a message from Xen that
> an unknown vector has been received?
> 
> If a vector is injected ahead of vector_irq being set Xen would
> complain in do_IRQ() that that's no handler for such vector.

As of now, setup_vector_irq() makes sure the field isn't uninitialized
(i.e. left at INT_MIN). With that change dropped (see below), there
would indeed be such a risk (on the first instance on each CPU).

>>>> --- a/xen/arch/x86/include/asm/irq-vectors.h
>>>> +++ b/xen/arch/x86/include/asm/irq-vectors.h
>>>> @@ -18,6 +18,15 @@
>>>>  /* IRQ0 (timer) is statically allocated but must be high priority. */
>>>>  #define IRQ0_VECTOR             0xf0
>>>>  
>>>> +/*
>>>> + * Low-priority (for now statically allocated) vectors, sharing entry
>>>> + * points with exceptions in the 0x10 ... 0x1f range, as long as the
>>>> + * respective exception has an error code.
>>>> + */
>>>> +#define FIRST_LOPRIORITY_VECTOR 0x10
>>>> +#define HPET_BROADCAST_VECTOR   X86_EXC_AC
>>>> +#define LAST_LOPRIORITY_VECTOR  0x1f
>>>
>>> I wonder if it won't be clearer to simply reserve a vector if the HPET
>>> is used, instead of hijacking the AC one.  It's one vector less, but
>>> arguably now that we unconditionally use physical destination mode our
>>> pool of vectors has expanded considerably.
>>
>> Well, I'd really like to avoid consuming an otherwise usable vector, if
>> at all possible (as per Andrew's FRED plans, that won't be possible
>> there anymore then).
> 
> If re-using the AC vector is not possible with FRED we might want to
> do this uniformly and always consume a vector then?

Right now it saves us a vector. I'd leave the FRED side for when that's
going to be implemented. Which - aiui - isn't going to be straightforward
anyway, due to (at least) requirements around IRQ_MOVE_CLEANUP_VECTOR.

>>>> --- a/xen/arch/x86/irq.c
>>>> +++ b/xen/arch/x86/irq.c
>>>> @@ -755,8 +755,9 @@ void setup_vector_irq(unsigned int cpu)
>>>>          if ( !irq_desc_initialized(desc) )
>>>>              continue;
>>>>          vector = irq_to_vector(irq);
>>>> -        if ( vector >= FIRST_HIPRIORITY_VECTOR &&
>>>> -             vector <= LAST_HIPRIORITY_VECTOR )
>>>> +        if ( vector <= (vector >= FIRST_HIPRIORITY_VECTOR
>>>> +                        ? LAST_HIPRIORITY_VECTOR
>>>> +                        : LAST_LOPRIORITY_VECTOR) )
>>>>              cpumask_set_cpu(cpu, desc->arch.cpu_mask);
>>>
>>> I think this is wrong.  The low priority vector used by the HPET will
>>> only target a single CPU at a time, and hence adding extra CPUs to
>>> that mask as part of AP bringup is not correct.
>>
>> I'm not sure about "wrong". It's not strictly necessary for the HPET one,
>> I expect, but it's generally what would be necessary. For the HPET one,
>> hpet_msi_set_affinity() replaces the value anyway. (I can add a sentence
>> to this effect to the description, if that helps.)
> 
> I do think it's wrong, it's just not harmful per-se apart from showing
> up in the output of dump_irqs().  The value in desc->arch.cpu_mask
> should be the CPU that's the destination of the interrupt.  In this
> case, the HPET interrupt does have a single destination at a give
> time, and adding another one will make the output of dump_irqs() show
> two destinations, when the interrupt will target a single interrupt.

Just that as soon as the interrupt is actually in use, what is done
here doesn't matter anymore.

I continue to think the change is correct for the general case: I'd
expect these special vectors to normally (just not here) be used as
"direct APIC vectors", in which case the IRQ does have multiple
destinations.

Problem is - if I don't make this change, I still expect I ought to
make _some_ change here, as the following "else if()" might be getting
in the way. Then again the vector_irq[] assignment also isn't strictly
needed this early, as set_channel_irq_affinity() deals with that
anyway.

Bottom line - I guess I'll drop this change, realizing that adding
something here later on may then be harder to understand.

> If anything you should add the CPU to the affinity set
> (desc->affinity), but that's not needed since you already init the
> affinity mask with cpumask_setall().

Indeed.

> FWIW, I'm working on tentatively getting rid of the
> desc->arch.{cpu,old_cpu,pending}_mask fields and converting them to
> plain unsigned ints after we have dropped logical interrupt delivery
> for external interrupts.

I'm aware, yes. And I realize this change - for the HPET case - would
be getting in the way (to some degree).

Jan

Re: [PATCH for-4.21 03/10] x86/HPET: use single, global, low-priority vector for broadcast IRQ
Posted by Roger Pau Monné 1 week, 3 days ago
On Mon, Oct 20, 2025 at 07:53:51AM +0200, Jan Beulich wrote:
> On 17.10.2025 10:20, Roger Pau Monné wrote:
> > On Fri, Oct 17, 2025 at 09:15:08AM +0200, Jan Beulich wrote:
> >> On 16.10.2025 18:27, Roger Pau Monné wrote:
> >>> On Thu, Oct 16, 2025 at 09:32:04AM +0200, Jan Beulich wrote:
> >>>> @@ -497,6 +503,7 @@ static void set_channel_irq_affinity(str
> >>>>      spin_lock(&desc->lock);
> >>>>      hpet_msi_mask(desc);
> >>>>      hpet_msi_set_affinity(desc, cpumask_of(ch->cpu));
> >>>> +    per_cpu(vector_irq, ch->cpu)[HPET_BROADCAST_VECTOR] = ch->msi.irq;
> >>>
> >>> I would set the vector table ahead of setting the affinity, in case we
> >>> can drop the mask calls around this block of code.
> >>
> >> Isn't there a problematic window either way round? I can make the change,
> >> but I don't see that addressing anything. The new comparator value will
> >> be written later anyway, and interrupts up to that point aren't of any
> >> interest anyway. I.e. it doesn't matter which of the CPUs gets to handle
> >> them.
> > 
> > It's preferable to get a silent stray interrupt (if the per-cpu vector
> > table is correctly setup), rather than to get a message from Xen that
> > an unknown vector has been received?
> > 
> > If a vector is injected ahead of vector_irq being set Xen would
> > complain in do_IRQ() that that's no handler for such vector.
> 
> As of now, setup_vector_irq() makes sure the field isn't uninitialized
> (i.e. left at INT_MIN). With that change dropped (see below), there
> would indeed be such a risk (on the first instance on each CPU).
> 
> >>>> --- a/xen/arch/x86/irq.c
> >>>> +++ b/xen/arch/x86/irq.c
> >>>> @@ -755,8 +755,9 @@ void setup_vector_irq(unsigned int cpu)
> >>>>          if ( !irq_desc_initialized(desc) )
> >>>>              continue;
> >>>>          vector = irq_to_vector(irq);
> >>>> -        if ( vector >= FIRST_HIPRIORITY_VECTOR &&
> >>>> -             vector <= LAST_HIPRIORITY_VECTOR )
> >>>> +        if ( vector <= (vector >= FIRST_HIPRIORITY_VECTOR
> >>>> +                        ? LAST_HIPRIORITY_VECTOR
> >>>> +                        : LAST_LOPRIORITY_VECTOR) )
> >>>>              cpumask_set_cpu(cpu, desc->arch.cpu_mask);
> >>>
> >>> I think this is wrong.  The low priority vector used by the HPET will
> >>> only target a single CPU at a time, and hence adding extra CPUs to
> >>> that mask as part of AP bringup is not correct.
> >>
> >> I'm not sure about "wrong". It's not strictly necessary for the HPET one,
> >> I expect, but it's generally what would be necessary. For the HPET one,
> >> hpet_msi_set_affinity() replaces the value anyway. (I can add a sentence
> >> to this effect to the description, if that helps.)
> > 
> > I do think it's wrong, it's just not harmful per-se apart from showing
> > up in the output of dump_irqs().  The value in desc->arch.cpu_mask
> > should be the CPU that's the destination of the interrupt.  In this
> > case, the HPET interrupt does have a single destination at a give
> > time, and adding another one will make the output of dump_irqs() show
> > two destinations, when the interrupt will target a single interrupt.
> 
> Just that as soon as the interrupt is actually in use, what is done
> here doesn't matter anymore.
> 
> I continue to think the change is correct for the general case: I'd
> expect these special vectors to normally (just not here) be used as
> "direct APIC vectors", in which case the IRQ does have multiple
> destinations.

I think it depends on the usage of the vector.  There are indeed
vectors that are active on all CPUs at the same time (like the current
hi priority ones).  However, that's not the case for the HPET vector:
it targets a single CPU specifically.

I think it would be best if vectors that are used on all CPUs at the
same time are initialized using cpumask_all or cpumask_setall(), and
avoid having to add a new bit every time a CPU is started.  It's fine
for cpu_mask to contain offline CPUs.

> Problem is - if I don't make this change, I still expect I ought to
> make _some_ change here, as the following "else if()" might be getting
> in the way. Then again the vector_irq[] assignment also isn't strictly
> needed this early, as set_channel_irq_affinity() deals with that
> anyway.
> 
> Bottom line - I guess I'll drop this change, realizing that adding
> something here later on may then be harder to understand.

I have plans to change this soonish hopefully, which I expect will
make this clearer.

> > If anything you should add the CPU to the affinity set
> > (desc->affinity), but that's not needed since you already init the
> > affinity mask with cpumask_setall().
> 
> Indeed.
> 
> > FWIW, I'm working on tentatively getting rid of the
> > desc->arch.{cpu,old_cpu,pending}_mask fields and converting them to
> > plain unsigned ints after we have dropped logical interrupt delivery
> > for external interrupts.
> 
> I'm aware, yes. And I realize this change - for the HPET case - would
> be getting in the way (to some degree).

Thanks, Roger.

Re: [PATCH for-4.21 03/10] x86/HPET: use single, global, low-priority vector for broadcast IRQ
Posted by Jan Beulich 1 week, 3 days ago
On 20.10.2025 17:49, Roger Pau Monné wrote:
> On Mon, Oct 20, 2025 at 07:53:51AM +0200, Jan Beulich wrote:
>> On 17.10.2025 10:20, Roger Pau Monné wrote:
>>> On Fri, Oct 17, 2025 at 09:15:08AM +0200, Jan Beulich wrote:
>>>> On 16.10.2025 18:27, Roger Pau Monné wrote:
>>>>> On Thu, Oct 16, 2025 at 09:32:04AM +0200, Jan Beulich wrote:
>>>>>> @@ -497,6 +503,7 @@ static void set_channel_irq_affinity(str
>>>>>>      spin_lock(&desc->lock);
>>>>>>      hpet_msi_mask(desc);
>>>>>>      hpet_msi_set_affinity(desc, cpumask_of(ch->cpu));
>>>>>> +    per_cpu(vector_irq, ch->cpu)[HPET_BROADCAST_VECTOR] = ch->msi.irq;
>>>>>
>>>>> I would set the vector table ahead of setting the affinity, in case we
>>>>> can drop the mask calls around this block of code.
>>>>
>>>> Isn't there a problematic window either way round? I can make the change,
>>>> but I don't see that addressing anything. The new comparator value will
>>>> be written later anyway, and interrupts up to that point aren't of any
>>>> interest anyway. I.e. it doesn't matter which of the CPUs gets to handle
>>>> them.
>>>
>>> It's preferable to get a silent stray interrupt (if the per-cpu vector
>>> table is correctly setup), rather than to get a message from Xen that
>>> an unknown vector has been received?
>>>
>>> If a vector is injected ahead of vector_irq being set Xen would
>>> complain in do_IRQ() that that's no handler for such vector.
>>
>> As of now, setup_vector_irq() makes sure the field isn't uninitialized
>> (i.e. left at INT_MIN). With that change dropped (see below), there
>> would indeed be such a risk (on the first instance on each CPU).
>>
>>>>>> --- a/xen/arch/x86/irq.c
>>>>>> +++ b/xen/arch/x86/irq.c
>>>>>> @@ -755,8 +755,9 @@ void setup_vector_irq(unsigned int cpu)
>>>>>>          if ( !irq_desc_initialized(desc) )
>>>>>>              continue;
>>>>>>          vector = irq_to_vector(irq);
>>>>>> -        if ( vector >= FIRST_HIPRIORITY_VECTOR &&
>>>>>> -             vector <= LAST_HIPRIORITY_VECTOR )
>>>>>> +        if ( vector <= (vector >= FIRST_HIPRIORITY_VECTOR
>>>>>> +                        ? LAST_HIPRIORITY_VECTOR
>>>>>> +                        : LAST_LOPRIORITY_VECTOR) )
>>>>>>              cpumask_set_cpu(cpu, desc->arch.cpu_mask);
>>>>>
>>>>> I think this is wrong.  The low priority vector used by the HPET will
>>>>> only target a single CPU at a time, and hence adding extra CPUs to
>>>>> that mask as part of AP bringup is not correct.
>>>>
>>>> I'm not sure about "wrong". It's not strictly necessary for the HPET one,
>>>> I expect, but it's generally what would be necessary. For the HPET one,
>>>> hpet_msi_set_affinity() replaces the value anyway. (I can add a sentence
>>>> to this effect to the description, if that helps.)
>>>
>>> I do think it's wrong, it's just not harmful per-se apart from showing
>>> up in the output of dump_irqs().  The value in desc->arch.cpu_mask
>>> should be the CPU that's the destination of the interrupt.  In this
>>> case, the HPET interrupt does have a single destination at a give
>>> time, and adding another one will make the output of dump_irqs() show
>>> two destinations, when the interrupt will target a single interrupt.
>>
>> Just that as soon as the interrupt is actually in use, what is done
>> here doesn't matter anymore.
>>
>> I continue to think the change is correct for the general case: I'd
>> expect these special vectors to normally (just not here) be used as
>> "direct APIC vectors", in which case the IRQ does have multiple
>> destinations.
> 
> I think it depends on the usage of the vector.  There are indeed
> vectors that are active on all CPUs at the same time (like the current
> hi priority ones).  However in the case of the HPET vector that's not
> the case, it targets a single CPU specifically.
> 
> I think it would be best if vectors that are used on all CPUs at the
> same time are initialized using cpumask_all or cpumask_setall(), and
> avoid having to add a new bit every time a CPU is started.  It's fine
> for cpu_mask to contain offline CPUs.

I don't think so. There may be fewer dependencies now, but look at e.g.
the check in _bind_irq_vector(). Or this loop

            for_each_cpu(cpu, desc->arch.cpu_mask)
                per_cpu(vector_irq, cpu)[desc->arch.vector] = irq;

in _assign_irq_vector() (that may be fine because of how the mask is
set just before the loop, but the loop itself very much assumes no
offline CPUs in there). The most problematic example may be in
fixup_irqs(), where cpumask_any(desc->arch.cpu_mask) is used.

Jan

Re: [PATCH for-4.21 03/10] x86/HPET: use single, global, low-priority vector for broadcast IRQ
Posted by Roger Pau Monné 1 week, 2 days ago
On Mon, Oct 20, 2025 at 06:05:04PM +0200, Jan Beulich wrote:
> On 20.10.2025 17:49, Roger Pau Monné wrote:
> > On Mon, Oct 20, 2025 at 07:53:51AM +0200, Jan Beulich wrote:
> >> On 17.10.2025 10:20, Roger Pau Monné wrote:
> >>> On Fri, Oct 17, 2025 at 09:15:08AM +0200, Jan Beulich wrote:
> >>>> On 16.10.2025 18:27, Roger Pau Monné wrote:
> >>>>> On Thu, Oct 16, 2025 at 09:32:04AM +0200, Jan Beulich wrote:
> >>>>>> @@ -497,6 +503,7 @@ static void set_channel_irq_affinity(str
> >>>>>>      spin_lock(&desc->lock);
> >>>>>>      hpet_msi_mask(desc);
> >>>>>>      hpet_msi_set_affinity(desc, cpumask_of(ch->cpu));
> >>>>>> +    per_cpu(vector_irq, ch->cpu)[HPET_BROADCAST_VECTOR] = ch->msi.irq;
> >>>>>
> >>>>> I would set the vector table ahead of setting the affinity, in case we
> >>>>> can drop the mask calls around this block of code.
> >>>>
> >>>> Isn't there a problematic window either way round? I can make the change,
> >>>> but I don't see that addressing anything. The new comparator value will
> >>>> be written later anyway, and interrupts up to that point aren't of any
> >>>> interest anyway. I.e. it doesn't matter which of the CPUs gets to handle
> >>>> them.
> >>>
> >>> It's preferable to get a silent stray interrupt (if the per-cpu vector
> >>> table is correctly setup), rather than to get a message from Xen that
> >>> an unknown vector has been received?
> >>>
> >>> If a vector is injected ahead of vector_irq being set Xen would
> >>> complain in do_IRQ() that that's no handler for such vector.
> >>
> >> As of now, setup_vector_irq() makes sure the field isn't uninitialized
> >> (i.e. left at INT_MIN). With that change dropped (see below), there
> >> would indeed be such a risk (on the first instance on each CPU).
> >>
> >>>>>> --- a/xen/arch/x86/irq.c
> >>>>>> +++ b/xen/arch/x86/irq.c
> >>>>>> @@ -755,8 +755,9 @@ void setup_vector_irq(unsigned int cpu)
> >>>>>>          if ( !irq_desc_initialized(desc) )
> >>>>>>              continue;
> >>>>>>          vector = irq_to_vector(irq);
> >>>>>> -        if ( vector >= FIRST_HIPRIORITY_VECTOR &&
> >>>>>> -             vector <= LAST_HIPRIORITY_VECTOR )
> >>>>>> +        if ( vector <= (vector >= FIRST_HIPRIORITY_VECTOR
> >>>>>> +                        ? LAST_HIPRIORITY_VECTOR
> >>>>>> +                        : LAST_LOPRIORITY_VECTOR) )
> >>>>>>              cpumask_set_cpu(cpu, desc->arch.cpu_mask);
> >>>>>
> >>>>> I think this is wrong.  The low priority vector used by the HPET will
> >>>>> only target a single CPU at a time, and hence adding extra CPUs to
> >>>>> that mask as part of AP bringup is not correct.
> >>>>
> >>>> I'm not sure about "wrong". It's not strictly necessary for the HPET one,
> >>>> I expect, but it's generally what would be necessary. For the HPET one,
> >>>> hpet_msi_set_affinity() replaces the value anyway. (I can add a sentence
> >>>> to this effect to the description, if that helps.)
> >>>
> >>> I do think it's wrong, it's just not harmful per-se apart from showing
> >>> up in the output of dump_irqs().  The value in desc->arch.cpu_mask
> >>> should be the CPU that's the destination of the interrupt.  In this
> >>> case, the HPET interrupt does have a single destination at a give
> >>> time, and adding another one will make the output of dump_irqs() show
> >>> two destinations, when the interrupt will target a single interrupt.
> >>
> >> Just that as soon as the interrupt is actually in use, what is done
> >> here doesn't matter anymore.
> >>
> >> I continue to think the change is correct for the general case: I'd
> >> expect these special vectors to normally (just not here) be used as
> >> "direct APIC vectors", in which case the IRQ does have multiple
> >> destinations.
> > 
> > I think it depends on the usage of the vector.  There are indeed
> > vectors that are active on all CPUs at the same time (like the current
> > hi priority ones).  However in the case of the HPET vector that's not
> > the case, it targets a single CPU specifically.
> > 
> > I think it would be best if vectors that are used on all CPUs at the
> > same time are initialized using cpumask_all or cpumask_setall(), and
> > avoid having to add a new bit every time a CPU is started.  It's fine
> > for cpu_mask to contain offline CPUs.
> 
> I don't think so. There may be less dependencies now, but look at e.g.
> the check in _bind_irq_vector(). Or this loop
> 
>             for_each_cpu(cpu, desc->arch.cpu_mask)
>                 per_cpu(vector_irq, cpu)[desc->arch.vector] = irq;
> 
> in _assign_irq_vector() (that may be fine because of how the mask is
> set just before the loop, but the loop itself very much assumes no
> offline CPUs in there). The most problematic example may be in
> fixup_irqs(), where cpumask_any(desc->arch.cpu_mask) is used.

Then it looks like the comment ahead of the field declaration in irq.h
is wrong:

        /*
         * Except for high priority interrupts @cpu_mask may have bits set for
         * offline CPUs.  Consumers need to be careful to mask this down to
         * online ones as necessary.  There is supposed to always be a non-
         * empty intersection with cpu_online_map.
         */

I realize now the comment says "Except for high priority", but we
don't seem to make such a differentiation in most of the code (like
fixup_irqs()).

Hopefully this will be way more simple if I can get rid of the
cpumasks in arch_irq_desc.

Thanks, Roger.