In order to elide IPIs, we must be able to identify whether a target CPU is in
MWAIT at the point it is woken up, i.e. the store to wake it up must also
identify the state.
Create a new in_mwait variable beside __softirq_pending, so we can use a
CMPXCHG to set the softirq while also observing the status safely. Implement
an x86 version of arch_pend_softirq() which does this.
In mwait_idle_with_hints(), advertise in_mwait, with an explanation of
precisely what it means. X86_BUG_MONITOR can be accounted for simply by not
advertising in_mwait.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Anthony PERARD <anthony.perard@vates.tech>
CC: Michal Orzel <michal.orzel@amd.com>
CC: Julien Grall <julien@xen.org>
CC: Stefano Stabellini <sstabellini@kernel.org>
This is modelled after Linux's TIF_NEED_RESCHED (single bit equivalent of all
of __softirq_pending), and TIF_POLLING_NRFLAG (arch-neutral "in_mwait").
In Linux, they're both in the same flags field, which adds complexity. In
Xen, __softirq_pending is already unsigned long for everything other than x86,
so adding an arch-neutral "in_mwait" would need wider changes.
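For context, arch_pend_softirq() is consumed from the common softirq-raising
path; the generic side is not part of this patch.  A minimal sketch of the
expected caller shape (existing details such as batching omitted):

    void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
    {
        if ( arch_pend_softirq(nr, cpu) )
            /* Pending bit set and the target will notice it; skip the IPI. */
            return;

        smp_send_event_check_cpu(cpu);
    }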
---
xen/arch/x86/acpi/cpu_idle.c | 20 +++++++++++++++++-
xen/arch/x86/include/asm/hardirq.h | 14 +++++++++++-
xen/arch/x86/include/asm/softirq.h | 34 ++++++++++++++++++++++++++++++
3 files changed, 66 insertions(+), 2 deletions(-)
diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index caa0fef0b3b1..13040ef467a0 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -439,7 +439,21 @@ __initcall(cpu_idle_key_init);
void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
{
unsigned int cpu = smp_processor_id();
- const unsigned int *this_softirq_pending = &softirq_pending(cpu);
+ irq_cpustat_t *stat = &irq_stat[cpu];
+ const unsigned int *this_softirq_pending = &stat->__softirq_pending;
+
+ /*
+ * By setting in_mwait, we promise to other CPUs that we'll notice changes
+ * to __softirq_pending without being sent an IPI. We achieve this by
+ * either not going to sleep, or by having hardware notice on our behalf.
+ *
+ * Some errata exist where MONITOR doesn't work properly, and the
+ * workaround is to force the use of an IPI. Cause this to happen by
+ * simply not advertising outselves as being in_mwait.
+ */
+ alternative_io("movb $1, %[in_mwait]",
+ "", X86_BUG_MONITOR,
+ [in_mwait] "=m" (stat->in_mwait));
monitor(this_softirq_pending, 0, 0);
smp_mb();
@@ -452,6 +466,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
mwait(eax, ecx);
spec_ctrl_exit_idle(info);
}
+
+ alternative_io("movb $0, %[in_mwait]",
+ "", X86_BUG_MONITOR,
+ [in_mwait] "=m" (stat->in_mwait));
}
static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
diff --git a/xen/arch/x86/include/asm/hardirq.h b/xen/arch/x86/include/asm/hardirq.h
index f3e93cc9b507..1647cff04dc8 100644
--- a/xen/arch/x86/include/asm/hardirq.h
+++ b/xen/arch/x86/include/asm/hardirq.h
@@ -5,7 +5,19 @@
#include <xen/types.h>
typedef struct {
- unsigned int __softirq_pending;
+ /*
+ * The layout is important. Any CPU can set bits in __softirq_pending,
+ * but in_mwait is a status bit owned by the CPU. softirq_mwait_raw must
+ * cover both, and must be in a single cacheline.
+ */
+ union {
+ struct {
+ unsigned int __softirq_pending;
+ bool in_mwait;
+ };
+ uint64_t softirq_mwait_raw;
+ };
+
unsigned int __local_irq_count;
unsigned int nmi_count;
unsigned int mce_count;
diff --git a/xen/arch/x86/include/asm/softirq.h b/xen/arch/x86/include/asm/softirq.h
index e4b194f069fb..069e5716a68d 100644
--- a/xen/arch/x86/include/asm/softirq.h
+++ b/xen/arch/x86/include/asm/softirq.h
@@ -1,6 +1,8 @@
#ifndef __ASM_SOFTIRQ_H__
#define __ASM_SOFTIRQ_H__
+#include <asm/system.h>
+
#define NMI_SOFTIRQ (NR_COMMON_SOFTIRQS + 0)
#define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
#define VCPU_KICK_SOFTIRQ (NR_COMMON_SOFTIRQS + 2)
@@ -9,4 +11,36 @@
#define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
#define NR_ARCH_SOFTIRQS 5
+/*
+ * Ensure softirq @nr is pending on @cpu. Return true if an IPI can be
+ * skipped, false if the IPI cannot be skipped.
+ *
+ * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
+ * set softirq @nr while also observing in_mwait in a race-free way.
+ */
+static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int cpu)
+{
+ uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
+ uint64_t old, new;
+ unsigned int softirq = 1U << nr;
+
+ old = ACCESS_ONCE(*ptr);
+
+ do {
+ if ( old & softirq )
+ /* Softirq already pending, nothing to do. */
+ return true;
+
+ new = old | softirq;
+
+ } while ( (old = cmpxchg(ptr, old, new)) != new );
+
+ /*
+ * We have caused the softirq to become pending. If in_mwait was set, the
+ * target CPU will notice the modification and act on it.
+ */
+ return new & (1UL << 32);
+}
+#define arch_pend_softirq arch_pend_softirq
+
#endif /* __ASM_SOFTIRQ_H__ */
--
2.39.5
On Wed, Jul 02, 2025 at 03:41:19PM +0100, Andrew Cooper wrote:
> diff --git a/xen/arch/x86/include/asm/softirq.h b/xen/arch/x86/include/asm/softirq.h
> index e4b194f069fb..069e5716a68d 100644
> --- a/xen/arch/x86/include/asm/softirq.h
> +++ b/xen/arch/x86/include/asm/softirq.h
> @@ -1,6 +1,8 @@
> #ifndef __ASM_SOFTIRQ_H__
> #define __ASM_SOFTIRQ_H__
>
> +#include <asm/system.h>
> +
> #define NMI_SOFTIRQ (NR_COMMON_SOFTIRQS + 0)
> #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
> #define VCPU_KICK_SOFTIRQ (NR_COMMON_SOFTIRQS + 2)
> @@ -9,4 +11,36 @@
> #define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
> #define NR_ARCH_SOFTIRQS 5
>
> +/*
> + * Ensure softirq @nr is pending on @cpu. Return true if an IPI can be
> + * skipped, false if the IPI cannot be skipped.
> + *
> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
> + * set softirq @nr while also observing in_mwait in a race-free way.
> + */
> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int cpu)
> +{
> + uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
> + uint64_t old, new;
> + unsigned int softirq = 1U << nr;
> +
> + old = ACCESS_ONCE(*ptr);
> +
> + do {
> + if ( old & softirq )
> + /* Softirq already pending, nothing to do. */
> + return true;
> +
> + new = old | softirq;
> +
> + } while ( (old = cmpxchg(ptr, old, new)) != new );
> +
> + /*
> + * We have caused the softirq to become pending. If in_mwait was set, the
> + * target CPU will notice the modification and act on it.
> + */
> + return new & (1UL << 32);
Maybe I haven't got enough coffee yet, but if we do the cmpxchg()
won't it be enough to send the IPI when softirq is first set, but not
necessarily for each extra softirq that's set if there's already one
enabled? IOW:
    uint64_t new, softirq = 1U << nr;
    uint64_t old = ACCESS_ONCE(*ptr);

    do {
        if ( old & softirq )
            /* Softirq already pending, nothing to do. */
            return true;

        new = old | softirq;
    } while ( (old = cmpxchg(ptr, old, new)) != (new & ~softirq) );

    /*
     * We have caused the softirq to become pending when it was
     * previously unset. If in_mwait was set, the target CPU will
     * notice the modification and act on it.
     *
     * Reduce the logic to simply check whether the old value was
     * different than 0: it will either be the in_mwait field or any
     * already pending softirqs.
     */
    return old;
Thanks, Roger.
On 04/07/2025 8:52 am, Roger Pau Monné wrote:
> On Wed, Jul 02, 2025 at 03:41:19PM +0100, Andrew Cooper wrote:
>> diff --git a/xen/arch/x86/include/asm/softirq.h b/xen/arch/x86/include/asm/softirq.h
>> index e4b194f069fb..069e5716a68d 100644
>> --- a/xen/arch/x86/include/asm/softirq.h
>> +++ b/xen/arch/x86/include/asm/softirq.h
>> @@ -1,6 +1,8 @@
>> #ifndef __ASM_SOFTIRQ_H__
>> #define __ASM_SOFTIRQ_H__
>>
>> +#include <asm/system.h>
>> +
>> #define NMI_SOFTIRQ (NR_COMMON_SOFTIRQS + 0)
>> #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
>> #define VCPU_KICK_SOFTIRQ (NR_COMMON_SOFTIRQS + 2)
>> @@ -9,4 +11,36 @@
>> #define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
>> #define NR_ARCH_SOFTIRQS 5
>>
>> +/*
>> + * Ensure softirq @nr is pending on @cpu. Return true if an IPI can be
>> + * skipped, false if the IPI cannot be skipped.
>> + *
>> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
>> + * set softirq @nr while also observing in_mwait in a race-free way.
>> + */
>> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int cpu)
>> +{
>> + uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
>> + uint64_t old, new;
>> + unsigned int softirq = 1U << nr;
>> +
>> + old = ACCESS_ONCE(*ptr);
>> +
>> + do {
>> + if ( old & softirq )
>> + /* Softirq already pending, nothing to do. */
>> + return true;
>> +
>> + new = old | softirq;
>> +
>> + } while ( (old = cmpxchg(ptr, old, new)) != new );
>> +
>> + /*
>> + * We have caused the softirq to become pending. If in_mwait was set, the
>> + * target CPU will notice the modification and act on it.
>> + */
>> + return new & (1UL << 32);
> Maybe I haven't got enough coffee yet, but if we do the cmpxchg()
> won't it be enough to send the IPI when softirq is first set, but not
> necessarily for each extra softirq that's set if there's already one
> enabled?
It was me who didn't have enough coffee when writing the code. Already
fixed locally.
~Andrew
On Wed, Jul 02, 2025 at 03:41:19PM +0100, Andrew Cooper wrote:
> In order elide IPIs, we must be able to identify whether a target CPU is in
> MWAIT at the point it is woken up. i.e. the store to wake it up must also
> identify the state.
>
> Create a new in_mwait variable beside __softirq_pending, so we can use a
> CMPXCHG to set the softirq while also observing the status safely. Implement
> an x86 version of arch_pend_softirq() which does this.
>
> In mwait_idle_with_hints(), advertise in_mwait, with an explanation of
> precisely what it means. X86_BUG_MONITOR can be accounted for simply by not
> advertising in_mwait.
>
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> CC: Anthony PERARD <anthony.perard@vates.tech>
> CC: Michal Orzel <michal.orzel@amd.com>
> CC: Julien Grall <julien@xen.org>
> CC: Stefano Stabellini <sstabellini@kernel.org>
>
> This is modelled after Linux's TIF_NEED_RESCHED (single bit equivelent of all
> of __softirq_pending), and TIF_POLLING_NRFLAG (arch-neutral "in_mwait").
>
> In Linux, they're both in the same flags field, which adds complexity. In
> Xen, __softirq_pending is already unsigned long for everything other than x86,
> so adding an arch-neutral "in_mwait" would need wider changes.
> ---
> xen/arch/x86/acpi/cpu_idle.c | 20 +++++++++++++++++-
> xen/arch/x86/include/asm/hardirq.h | 14 +++++++++++-
> xen/arch/x86/include/asm/softirq.h | 34 ++++++++++++++++++++++++++++++
> 3 files changed, 66 insertions(+), 2 deletions(-)
>
> diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
> index caa0fef0b3b1..13040ef467a0 100644
> --- a/xen/arch/x86/acpi/cpu_idle.c
> +++ b/xen/arch/x86/acpi/cpu_idle.c
> @@ -439,7 +439,21 @@ __initcall(cpu_idle_key_init);
> void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
> {
> unsigned int cpu = smp_processor_id();
> - const unsigned int *this_softirq_pending = &softirq_pending(cpu);
> + irq_cpustat_t *stat = &irq_stat[cpu];
> + const unsigned int *this_softirq_pending = &stat->__softirq_pending;
> +
> + /*
> + * By setting in_mwait, we promise to other CPUs that we'll notice changes
> + * to __softirq_pending without being sent an IPI. We achieve this by
> + * either not going to sleep, or by having hardware notice on our behalf.
> + *
> + * Some errata exist where MONITOR doesn't work properly, and the
> + * workaround is to force the use of an IPI. Cause this to happen by
> + * simply not advertising outselves as being in_mwait.
> + */
> + alternative_io("movb $1, %[in_mwait]",
> + "", X86_BUG_MONITOR,
> + [in_mwait] "=m" (stat->in_mwait));
>
> monitor(this_softirq_pending, 0, 0);
> smp_mb();
> @@ -452,6 +466,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
> mwait(eax, ecx);
> spec_ctrl_exit_idle(info);
> }
> +
> + alternative_io("movb $0, %[in_mwait]",
> + "", X86_BUG_MONITOR,
> + [in_mwait] "=m" (stat->in_mwait));
Isn't it a bit overkill to use alternatives here when you could have a
conditional based on a global variable to decide whether to set/clear
in_mwait?
> }
>
> static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
> diff --git a/xen/arch/x86/include/asm/hardirq.h b/xen/arch/x86/include/asm/hardirq.h
> index f3e93cc9b507..1647cff04dc8 100644
> --- a/xen/arch/x86/include/asm/hardirq.h
> +++ b/xen/arch/x86/include/asm/hardirq.h
> @@ -5,7 +5,19 @@
> #include <xen/types.h>
>
> typedef struct {
> - unsigned int __softirq_pending;
> + /*
> + * The layout is important. Any CPU can set bits in __softirq_pending,
> + * but in_mwait is a status bit owned by the CPU. softirq_mwait_raw must
> + * cover both, and must be in a single cacheline.
> + */
> + union {
> + struct {
> + unsigned int __softirq_pending;
> + bool in_mwait;
Given the usage in assembly where you store 0 and 1, this might better
be a uint8_t then?
> + };
> + uint64_t softirq_mwait_raw;
> + };
This could be a named union type ...
> +
> unsigned int __local_irq_count;
> unsigned int nmi_count;
> unsigned int mce_count;
> diff --git a/xen/arch/x86/include/asm/softirq.h b/xen/arch/x86/include/asm/softirq.h
> index e4b194f069fb..069e5716a68d 100644
> --- a/xen/arch/x86/include/asm/softirq.h
> +++ b/xen/arch/x86/include/asm/softirq.h
> @@ -1,6 +1,8 @@
> #ifndef __ASM_SOFTIRQ_H__
> #define __ASM_SOFTIRQ_H__
>
> +#include <asm/system.h>
> +
> #define NMI_SOFTIRQ (NR_COMMON_SOFTIRQS + 0)
> #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
> #define VCPU_KICK_SOFTIRQ (NR_COMMON_SOFTIRQS + 2)
> @@ -9,4 +11,36 @@
> #define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
> #define NR_ARCH_SOFTIRQS 5
>
> +/*
> + * Ensure softirq @nr is pending on @cpu. Return true if an IPI can be
> + * skipped, false if the IPI cannot be skipped.
> + *
> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
> + * set softirq @nr while also observing in_mwait in a race-free way.
> + */
> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int cpu)
> +{
> + uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
> + uint64_t old, new;
... so that you also use it here? And the return check below could become
new.in_mwait instead of having to use a bitmask?
Thanks, Roger.
On 03/07/2025 5:36 pm, Roger Pau Monné wrote:
> On Wed, Jul 02, 2025 at 03:41:19PM +0100, Andrew Cooper wrote:
>> In order elide IPIs, we must be able to identify whether a target CPU is in
>> MWAIT at the point it is woken up. i.e. the store to wake it up must also
>> identify the state.
>>
>> Create a new in_mwait variable beside __softirq_pending, so we can use a
>> CMPXCHG to set the softirq while also observing the status safely. Implement
>> an x86 version of arch_pend_softirq() which does this.
>>
>> In mwait_idle_with_hints(), advertise in_mwait, with an explanation of
>> precisely what it means. X86_BUG_MONITOR can be accounted for simply by not
>> advertising in_mwait.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> ---
>> CC: Jan Beulich <JBeulich@suse.com>
>> CC: Roger Pau Monné <roger.pau@citrix.com>
>> CC: Anthony PERARD <anthony.perard@vates.tech>
>> CC: Michal Orzel <michal.orzel@amd.com>
>> CC: Julien Grall <julien@xen.org>
>> CC: Stefano Stabellini <sstabellini@kernel.org>
>>
>> This is modelled after Linux's TIF_NEED_RESCHED (single bit equivelent of all
>> of __softirq_pending), and TIF_POLLING_NRFLAG (arch-neutral "in_mwait").
>>
>> In Linux, they're both in the same flags field, which adds complexity. In
>> Xen, __softirq_pending is already unsigned long for everything other than x86,
>> so adding an arch-neutral "in_mwait" would need wider changes.
>> ---
>> xen/arch/x86/acpi/cpu_idle.c | 20 +++++++++++++++++-
>> xen/arch/x86/include/asm/hardirq.h | 14 +++++++++++-
>> xen/arch/x86/include/asm/softirq.h | 34 ++++++++++++++++++++++++++++++
>> 3 files changed, 66 insertions(+), 2 deletions(-)
>>
>> diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
>> index caa0fef0b3b1..13040ef467a0 100644
>> --- a/xen/arch/x86/acpi/cpu_idle.c
>> +++ b/xen/arch/x86/acpi/cpu_idle.c
>> @@ -439,7 +439,21 @@ __initcall(cpu_idle_key_init);
>> void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>> {
>> unsigned int cpu = smp_processor_id();
>> - const unsigned int *this_softirq_pending = &softirq_pending(cpu);
>> + irq_cpustat_t *stat = &irq_stat[cpu];
>> + const unsigned int *this_softirq_pending = &stat->__softirq_pending;
>> +
>> + /*
>> + * By setting in_mwait, we promise to other CPUs that we'll notice changes
>> + * to __softirq_pending without being sent an IPI. We achieve this by
>> + * either not going to sleep, or by having hardware notice on our behalf.
>> + *
>> + * Some errata exist where MONITOR doesn't work properly, and the
>> + * workaround is to force the use of an IPI. Cause this to happen by
>> + * simply not advertising outselves as being in_mwait.
>> + */
>> + alternative_io("movb $1, %[in_mwait]",
>> + "", X86_BUG_MONITOR,
>> + [in_mwait] "=m" (stat->in_mwait));
>>
>> monitor(this_softirq_pending, 0, 0);
>> smp_mb();
>> @@ -452,6 +466,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>> mwait(eax, ecx);
>> spec_ctrl_exit_idle(info);
>> }
>> +
>> + alternative_io("movb $0, %[in_mwait]",
>> + "", X86_BUG_MONITOR,
>> + [in_mwait] "=m" (stat->in_mwait));
> Isn't it a bit overkill to use alternatives here when you could have a
> conditional based on a global variable to decide whether to set/clear
> in_mwait?
I view it differently. Why should the common case suffer overhead
(extra memory read and conditional branch) to work around 3 buggy pieces
of hardware in a common path?
>> }
>>
>> static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
>> diff --git a/xen/arch/x86/include/asm/hardirq.h b/xen/arch/x86/include/asm/hardirq.h
>> index f3e93cc9b507..1647cff04dc8 100644
>> --- a/xen/arch/x86/include/asm/hardirq.h
>> +++ b/xen/arch/x86/include/asm/hardirq.h
>> @@ -5,7 +5,19 @@
>> #include <xen/types.h>
>>
>> typedef struct {
>> - unsigned int __softirq_pending;
>> + /*
>> + * The layout is important. Any CPU can set bits in __softirq_pending,
>> + * but in_mwait is a status bit owned by the CPU. softirq_mwait_raw must
>> + * cover both, and must be in a single cacheline.
>> + */
>> + union {
>> + struct {
>> + unsigned int __softirq_pending;
>> + bool in_mwait;
> Given the usage in assembly where you store 0 and 1, this might better
> be a uint8_t then?
We have loads of asm code which accesses bools like this. C guarantees
that sizeof(bool) >= sizeof(char).
>
>> + };
>> + uint64_t softirq_mwait_raw;
>> + };
> This could be a named union type ...
>
>> +
>> unsigned int __local_irq_count;
>> unsigned int nmi_count;
>> unsigned int mce_count;
>> diff --git a/xen/arch/x86/include/asm/softirq.h b/xen/arch/x86/include/asm/softirq.h
>> index e4b194f069fb..069e5716a68d 100644
>> --- a/xen/arch/x86/include/asm/softirq.h
>> +++ b/xen/arch/x86/include/asm/softirq.h
>> @@ -1,6 +1,8 @@
>> #ifndef __ASM_SOFTIRQ_H__
>> #define __ASM_SOFTIRQ_H__
>>
>> +#include <asm/system.h>
>> +
>> #define NMI_SOFTIRQ (NR_COMMON_SOFTIRQS + 0)
>> #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
>> #define VCPU_KICK_SOFTIRQ (NR_COMMON_SOFTIRQS + 2)
>> @@ -9,4 +11,36 @@
>> #define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
>> #define NR_ARCH_SOFTIRQS 5
>>
>> +/*
>> + * Ensure softirq @nr is pending on @cpu. Return true if an IPI can be
>> + * skipped, false if the IPI cannot be skipped.
>> + *
>> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
>> + * set softirq @nr while also observing in_mwait in a race-free way.
>> + */
>> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int cpu)
>> +{
>> + uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
>> + uint64_t old, new;
> ... so that you also use it here?
No, it can't. The definition of softirq_pending() in common code requires no
intermediate field names, and I'm not untangling that mess in a series
wanting backporting.
~Andrew
On Thu, Jul 03, 2025 at 06:48:23PM +0100, Andrew Cooper wrote:
> On 03/07/2025 5:36 pm, Roger Pau Monné wrote:
> > On Wed, Jul 02, 2025 at 03:41:19PM +0100, Andrew Cooper wrote:
> >> In order elide IPIs, we must be able to identify whether a target CPU is in
> >> MWAIT at the point it is woken up. i.e. the store to wake it up must also
> >> identify the state.
> >>
> >> Create a new in_mwait variable beside __softirq_pending, so we can use a
> >> CMPXCHG to set the softirq while also observing the status safely. Implement
> >> an x86 version of arch_pend_softirq() which does this.
> >>
> >> In mwait_idle_with_hints(), advertise in_mwait, with an explanation of
> >> precisely what it means. X86_BUG_MONITOR can be accounted for simply by not
> >> advertising in_mwait.
> >>
> >> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> >> ---
> >> CC: Jan Beulich <JBeulich@suse.com>
> >> CC: Roger Pau Monné <roger.pau@citrix.com>
> >> CC: Anthony PERARD <anthony.perard@vates.tech>
> >> CC: Michal Orzel <michal.orzel@amd.com>
> >> CC: Julien Grall <julien@xen.org>
> >> CC: Stefano Stabellini <sstabellini@kernel.org>
> >>
> >> This is modelled after Linux's TIF_NEED_RESCHED (single bit equivelent of all
> >> of __softirq_pending), and TIF_POLLING_NRFLAG (arch-neutral "in_mwait").
> >>
> >> In Linux, they're both in the same flags field, which adds complexity. In
> >> Xen, __softirq_pending is already unsigned long for everything other than x86,
> >> so adding an arch-neutral "in_mwait" would need wider changes.
> >> ---
> >> xen/arch/x86/acpi/cpu_idle.c | 20 +++++++++++++++++-
> >> xen/arch/x86/include/asm/hardirq.h | 14 +++++++++++-
> >> xen/arch/x86/include/asm/softirq.h | 34 ++++++++++++++++++++++++++++++
> >> 3 files changed, 66 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
> >> index caa0fef0b3b1..13040ef467a0 100644
> >> --- a/xen/arch/x86/acpi/cpu_idle.c
> >> +++ b/xen/arch/x86/acpi/cpu_idle.c
> >> @@ -439,7 +439,21 @@ __initcall(cpu_idle_key_init);
> >> void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
> >> {
> >> unsigned int cpu = smp_processor_id();
> >> - const unsigned int *this_softirq_pending = &softirq_pending(cpu);
> >> + irq_cpustat_t *stat = &irq_stat[cpu];
> >> + const unsigned int *this_softirq_pending = &stat->__softirq_pending;
> >> +
> >> + /*
> >> + * By setting in_mwait, we promise to other CPUs that we'll notice changes
> >> + * to __softirq_pending without being sent an IPI. We achieve this by
> >> + * either not going to sleep, or by having hardware notice on our behalf.
> >> + *
> >> + * Some errata exist where MONITOR doesn't work properly, and the
> >> + * workaround is to force the use of an IPI. Cause this to happen by
> >> + * simply not advertising outselves as being in_mwait.
> >> + */
> >> + alternative_io("movb $1, %[in_mwait]",
> >> + "", X86_BUG_MONITOR,
> >> + [in_mwait] "=m" (stat->in_mwait));
> >>
> >> monitor(this_softirq_pending, 0, 0);
> >> smp_mb();
> >> @@ -452,6 +466,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
> >> mwait(eax, ecx);
> >> spec_ctrl_exit_idle(info);
> >> }
> >> +
> >> + alternative_io("movb $0, %[in_mwait]",
> >> + "", X86_BUG_MONITOR,
> >> + [in_mwait] "=m" (stat->in_mwait));
> > Isn't it a bit overkill to use alternatives here when you could have a
> > conditional based on a global variable to decide whether to set/clear
> > in_mwait?
>
> I view it differently. Why should the common case suffer overhead
> (extra memory read and conditional branch) to work around 3 buggy pieces
> of hardware in a common path?
TBH I don't think the extra read and conditional would make any
difference in this specific path performance wise. Either the CPU is
going to sleep and has nothing to do, or the cost of getting back from
idle will shadow the overhead of an extra read and conditional.
It's all a question of taste I guess. If it was me I would set/clear
stat->in_mwait unconditionally in mwait_idle_with_hints(), and then in
arch_pend_softirq() I would return:
return new & (1UL << 32) || force_mwait_ipi_wakeup;
Or AFAICT you could possibly skip the cmpxchg() in the
force_mwait_ipi_wakeup case in arch_pend_softirq(), and just do:
    if ( force_mwait_ipi_wakeup )
        return test_and_set_bit(nr, &softirq_pending(cpu));
Because in that case Xen doesn't care about the in_mwait status. It
would be an optimization for the buggy hardware that already has to
deal with the extra cost of always sending an IPI.
> >> + };
> >> + uint64_t softirq_mwait_raw;
> >> + };
> > This could be a named union type ...
> >
> >> +
> >> unsigned int __local_irq_count;
> >> unsigned int nmi_count;
> >> unsigned int mce_count;
> >> diff --git a/xen/arch/x86/include/asm/softirq.h b/xen/arch/x86/include/asm/softirq.h
> >> index e4b194f069fb..069e5716a68d 100644
> >> --- a/xen/arch/x86/include/asm/softirq.h
> >> +++ b/xen/arch/x86/include/asm/softirq.h
> >> @@ -1,6 +1,8 @@
> >> #ifndef __ASM_SOFTIRQ_H__
> >> #define __ASM_SOFTIRQ_H__
> >>
> >> +#include <asm/system.h>
> >> +
> >> #define NMI_SOFTIRQ (NR_COMMON_SOFTIRQS + 0)
> >> #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
> >> #define VCPU_KICK_SOFTIRQ (NR_COMMON_SOFTIRQS + 2)
> >> @@ -9,4 +11,36 @@
> >> #define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
> >> #define NR_ARCH_SOFTIRQS 5
> >>
> >> +/*
> >> + * Ensure softirq @nr is pending on @cpu. Return true if an IPI can be
> >> + * skipped, false if the IPI cannot be skipped.
> >> + *
> >> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
> >> + * set softirq @nr while also observing in_mwait in a race-free way.
> >> + */
> >> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int cpu)
> >> +{
> >> + uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
> >> + uint64_t old, new;
> > ... so that you also use it here?
>
> No, it cant. The of softirq_pending() in common code requires no
> intermediate field names, and I'm not untangling that mess in a series
> wanting backporting.
Oh, I see. Never mind then, something for a later cleanup if
anything.
Thanks, Roger.
On 04/07/2025 8:24 am, Roger Pau Monné wrote:
> On Thu, Jul 03, 2025 at 06:48:23PM +0100, Andrew Cooper wrote:
>> On 03/07/2025 5:36 pm, Roger Pau Monné wrote:
>>> On Wed, Jul 02, 2025 at 03:41:19PM +0100, Andrew Cooper wrote:
>>>> In order elide IPIs, we must be able to identify whether a target CPU is in
>>>> MWAIT at the point it is woken up. i.e. the store to wake it up must also
>>>> identify the state.
>>>>
>>>> Create a new in_mwait variable beside __softirq_pending, so we can use a
>>>> CMPXCHG to set the softirq while also observing the status safely. Implement
>>>> an x86 version of arch_pend_softirq() which does this.
>>>>
>>>> In mwait_idle_with_hints(), advertise in_mwait, with an explanation of
>>>> precisely what it means. X86_BUG_MONITOR can be accounted for simply by not
>>>> advertising in_mwait.
>>>>
>>>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>>> ---
>>>> CC: Jan Beulich <JBeulich@suse.com>
>>>> CC: Roger Pau Monné <roger.pau@citrix.com>
>>>> CC: Anthony PERARD <anthony.perard@vates.tech>
>>>> CC: Michal Orzel <michal.orzel@amd.com>
>>>> CC: Julien Grall <julien@xen.org>
>>>> CC: Stefano Stabellini <sstabellini@kernel.org>
>>>>
>>>> This is modelled after Linux's TIF_NEED_RESCHED (single bit equivelent of all
>>>> of __softirq_pending), and TIF_POLLING_NRFLAG (arch-neutral "in_mwait").
>>>>
>>>> In Linux, they're both in the same flags field, which adds complexity. In
>>>> Xen, __softirq_pending is already unsigned long for everything other than x86,
>>>> so adding an arch-neutral "in_mwait" would need wider changes.
>>>> ---
>>>> xen/arch/x86/acpi/cpu_idle.c | 20 +++++++++++++++++-
>>>> xen/arch/x86/include/asm/hardirq.h | 14 +++++++++++-
>>>> xen/arch/x86/include/asm/softirq.h | 34 ++++++++++++++++++++++++++++++
>>>> 3 files changed, 66 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
>>>> index caa0fef0b3b1..13040ef467a0 100644
>>>> --- a/xen/arch/x86/acpi/cpu_idle.c
>>>> +++ b/xen/arch/x86/acpi/cpu_idle.c
>>>> @@ -439,7 +439,21 @@ __initcall(cpu_idle_key_init);
>>>> void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>>>> {
>>>> unsigned int cpu = smp_processor_id();
>>>> - const unsigned int *this_softirq_pending = &softirq_pending(cpu);
>>>> + irq_cpustat_t *stat = &irq_stat[cpu];
>>>> + const unsigned int *this_softirq_pending = &stat->__softirq_pending;
>>>> +
>>>> + /*
>>>> + * By setting in_mwait, we promise to other CPUs that we'll notice changes
>>>> + * to __softirq_pending without being sent an IPI. We achieve this by
>>>> + * either not going to sleep, or by having hardware notice on our behalf.
>>>> + *
>>>> + * Some errata exist where MONITOR doesn't work properly, and the
>>>> + * workaround is to force the use of an IPI. Cause this to happen by
>>>> + * simply not advertising outselves as being in_mwait.
>>>> + */
>>>> + alternative_io("movb $1, %[in_mwait]",
>>>> + "", X86_BUG_MONITOR,
>>>> + [in_mwait] "=m" (stat->in_mwait));
>>>>
>>>> monitor(this_softirq_pending, 0, 0);
>>>> smp_mb();
>>>> @@ -452,6 +466,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>>>> mwait(eax, ecx);
>>>> spec_ctrl_exit_idle(info);
>>>> }
>>>> +
>>>> + alternative_io("movb $0, %[in_mwait]",
>>>> + "", X86_BUG_MONITOR,
>>>> + [in_mwait] "=m" (stat->in_mwait));
>>> Isn't it a bit overkill to use alternatives here when you could have a
>>> conditional based on a global variable to decide whether to set/clear
>>> in_mwait?
>> I view it differently. Why should the common case suffer overhead
>> (extra memory read and conditional branch) to work around 3 buggy pieces
>> of hardware in a common path?
> TBH I don't think the extra read and conditional would make any
> difference in this specific path performance wise. Either the CPU is
> going to sleep and has nothing to do, or the cost of getting back from
> idle will shadow the overhead of an extra read and conditional.
>
> It's all a question of taste I guess. If it was me I would set/clear
> stat->in_mwait unconditionally in mwait_idle_with_hints(), and then in
> arch_pend_softirq() I would return:
>
> return new & (1UL << 32) || force_mwait_ipi_wakeup;
>
> Or AFAICT you could possibly skip the cmpxchg() in the
> force_mwait_ipi_wakeup case in arch_pend_softirq(), and just do:
>
> if ( force_mwait_ipi_wakeup )
> return test_and_set_bit(nr, &softirq_pending(cpu));
>
> Because in that case Xen doesn't care about the in_mwait status. It
> would be an optimization for the buggy hardware that already has to
> deal with the extra cost of always sending an IPI.
These will both function, but they're both poor options.
They're in a loop over a cpumask, and because of the full barriers in
the atomic operations, the read cannot be hoisted or the loop split,
because the compiler has been told "the value may change on each loop
iteration".
This was a code-gen/perf note I had on your original errata fix, which I
said "let's fix later". Later is now.
The other part of the fix is that arch_pend_softirq() is static inline, and
not out-of-line in a separate TU.
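As an illustrative sketch only (names simplified), the caller shape in
question is roughly:

    /* cpumask_raise_softirq()-like loop, simplified. */
    for_each_cpu ( cpu, mask )
        if ( !arch_pend_softirq(nr, cpu) )
            __cpumask_set_cpu(cpu, &send);

A read of force_mwait_ipi_wakeup inside arch_pend_softirq() is separated
from the previous iteration's cmpxchg() by a full barrier, so the compiler
must reload it on every iteration rather than hoisting it out of the loop.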
>>>> + };
>>>> + uint64_t softirq_mwait_raw;
>>>> + };
>>> This could be a named union type ...
>>>
>>>> +
>>>> unsigned int __local_irq_count;
>>>> unsigned int nmi_count;
>>>> unsigned int mce_count;
>>>> diff --git a/xen/arch/x86/include/asm/softirq.h b/xen/arch/x86/include/asm/softirq.h
>>>> index e4b194f069fb..069e5716a68d 100644
>>>> --- a/xen/arch/x86/include/asm/softirq.h
>>>> +++ b/xen/arch/x86/include/asm/softirq.h
>>>> @@ -1,6 +1,8 @@
>>>> #ifndef __ASM_SOFTIRQ_H__
>>>> #define __ASM_SOFTIRQ_H__
>>>>
>>>> +#include <asm/system.h>
>>>> +
>>>> #define NMI_SOFTIRQ (NR_COMMON_SOFTIRQS + 0)
>>>> #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
>>>> #define VCPU_KICK_SOFTIRQ (NR_COMMON_SOFTIRQS + 2)
>>>> @@ -9,4 +11,36 @@
>>>> #define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
>>>> #define NR_ARCH_SOFTIRQS 5
>>>>
>>>> +/*
>>>> + * Ensure softirq @nr is pending on @cpu. Return true if an IPI can be
>>>> + * skipped, false if the IPI cannot be skipped.
>>>> + *
>>>> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
>>>> + * set softirq @nr while also observing in_mwait in a race-free way.
>>>> + */
>>>> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int cpu)
>>>> +{
>>>> + uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
>>>> + uint64_t old, new;
>>> ... so that you also use it here?
>> No, it cant. The of softirq_pending() in common code requires no
>> intermediate field names, and I'm not untangling that mess in a series
>> wanting backporting.
> Oh, I see. Never mind then, something for a later cleanup if
> anything.
Yeah, I've got further cleanup pending, although I don't have a good fix
for this specifically.
~Andrew
On 02.07.2025 16:41, Andrew Cooper wrote:
> In order elide IPIs, we must be able to identify whether a target CPU is in
> MWAIT at the point it is woken up. i.e. the store to wake it up must also
> identify the state.
>
> Create a new in_mwait variable beside __softirq_pending, so we can use a
> CMPXCHG to set the softirq while also observing the status safely. Implement
> an x86 version of arch_pend_softirq() which does this.
>
> In mwait_idle_with_hints(), advertise in_mwait, with an explanation of
> precisely what it means. X86_BUG_MONITOR can be accounted for simply by not
> advertising in_mwait.
>
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> CC: Anthony PERARD <anthony.perard@vates.tech>
> CC: Michal Orzel <michal.orzel@amd.com>
> CC: Julien Grall <julien@xen.org>
> CC: Stefano Stabellini <sstabellini@kernel.org>
>
> This is modelled after Linux's TIF_NEED_RESCHED (single bit equivelent of all
> of __softirq_pending), and TIF_POLLING_NRFLAG (arch-neutral "in_mwait").
>
> In Linux, they're both in the same flags field, which adds complexity. In
> Xen, __softirq_pending is already unsigned long for everything other than x86,
> so adding an arch-neutral "in_mwait" would need wider changes.
Why would the flag need to be arch-neutral? A pretty straightforward way to
achieve what you want would seem to be to define an x86-only MWAIT_SOFTIRQ,
which we'd never raise or open, but which instead we'd advertise to common
code as an always-ignore mask (to be OR-ed in by __do_softirq()). However,
setting and clearing such a bit would of course require LOCK-ed insns,
which clearly is less desirable than the simple MOVB you're using.
> --- a/xen/arch/x86/acpi/cpu_idle.c
> +++ b/xen/arch/x86/acpi/cpu_idle.c
> @@ -439,7 +439,21 @@ __initcall(cpu_idle_key_init);
> void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
> {
> unsigned int cpu = smp_processor_id();
> - const unsigned int *this_softirq_pending = &softirq_pending(cpu);
> + irq_cpustat_t *stat = &irq_stat[cpu];
> + const unsigned int *this_softirq_pending = &stat->__softirq_pending;
> +
> + /*
> + * By setting in_mwait, we promise to other CPUs that we'll notice changes
> + * to __softirq_pending without being sent an IPI. We achieve this by
> + * either not going to sleep, or by having hardware notice on our behalf.
> + *
> + * Some errata exist where MONITOR doesn't work properly, and the
> + * workaround is to force the use of an IPI. Cause this to happen by
> + * simply not advertising outselves as being in_mwait.
Nit: ourselves
> + */
> + alternative_io("movb $1, %[in_mwait]",
> + "", X86_BUG_MONITOR,
> + [in_mwait] "=m" (stat->in_mwait));
>
> monitor(this_softirq_pending, 0, 0);
> smp_mb();
Unlike alternative(), alternative_io() has no memory clobber. To the compiler
the two resulting asm() therefore have no dependency operand-wise[1]. The
sole ordering criteria then is that they're both volatile asm()-s. It not
being explicitly said (anywhere that I could find) that volatile guarantees
such ordering, I wonder if we wouldn't better have an operand-based
dependency between the two. Otoh it may well be that we rely on such ordering
elsewhere already, in which case having one more instance probably would be
okay(ish).
[1] It's actually worse than that, I think: monitor() also doesn't specify
the location as a (memory) input.
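(As a sketch only, addressing the footnote: assuming monitor() wraps a plain
asm volatile, adding the monitored location as a memory input would at least
give the compiler an operand-level dependency to respect, e.g.

    asm volatile ( "monitor"
                   :: "a" (addr), "c" (ecx), "d" (edx),
                      "m" (*(const unsigned int *)addr) );

rather than relying purely on volatile-asm ordering.)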
> @@ -452,6 +466,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
> mwait(eax, ecx);
> spec_ctrl_exit_idle(info);
> }
> +
> + alternative_io("movb $0, %[in_mwait]",
> + "", X86_BUG_MONITOR,
> + [in_mwait] "=m" (stat->in_mwait));
This doesn't strictly need to be an alternative, does it? We can save a
memory write in the buggy case, but that likely makes at most a marginal
difference.
> --- a/xen/arch/x86/include/asm/hardirq.h
> +++ b/xen/arch/x86/include/asm/hardirq.h
> @@ -5,7 +5,19 @@
> #include <xen/types.h>
>
> typedef struct {
> - unsigned int __softirq_pending;
> + /*
> + * The layout is important. Any CPU can set bits in __softirq_pending,
> + * but in_mwait is a status bit owned by the CPU. softirq_mwait_raw must
> + * cover both, and must be in a single cacheline.
> + */
> + union {
> + struct {
> + unsigned int __softirq_pending;
> + bool in_mwait;
> + };
> + uint64_t softirq_mwait_raw;
> + };
To guard against someone changing e.g. __softirq_pending to unsigned long
while ignoring the comment, how about adding a BUILD_BUG_ON() checking both
parts of the union are the same size (which of course would require naming
either the union field within the containing struct or its struct member)?
> @@ -9,4 +11,36 @@
> #define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
> #define NR_ARCH_SOFTIRQS 5
>
> +/*
> + * Ensure softirq @nr is pending on @cpu. Return true if an IPI can be
> + * skipped, false if the IPI cannot be skipped.
> + *
> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
> + * set softirq @nr while also observing in_mwait in a race-free way.
> + */
> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int cpu)
> +{
> + uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
> + uint64_t old, new;
> + unsigned int softirq = 1U << nr;
> +
> + old = ACCESS_ONCE(*ptr);
> +
> + do {
> + if ( old & softirq )
> + /* Softirq already pending, nothing to do. */
> + return true;
> +
> + new = old | softirq;
> +
> + } while ( (old = cmpxchg(ptr, old, new)) != new );
Don't you mean
} while ( (new = cmpxchg(ptr, old, new)) != old );
here (with new latched ahead of the loop and old set from new inside it),
like we have it elsewhere when we avoid the use of yet another variable,
e.g. in inc_linear_{entries,uses}()?
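Spelled out as a sketch, to make the intended shape explicit:

    new = ACCESS_ONCE(*ptr);

    do {
        old = new;

        if ( old & softirq )
            /* Softirq already pending, nothing to do. */
            return true;

        new |= softirq;
    } while ( (new = cmpxchg(ptr, old, new)) != old );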
> + /*
> + * We have caused the softirq to become pending. If in_mwait was set, the
> + * target CPU will notice the modification and act on it.
> + */
> + return new & (1UL << 32);
Hmm, this requires the layout to allow for even less re-arrangement than the
comment ahead of the union says: You strictly require in_mwait to follow
__softirq_pending immediately, and the (assembly) write to strictly write 1
into the field. Could I talk you into at least
return new & (1UL << (offsetof(..., in_mwait) * 8));
? This way the field being inspected would also be mentioned in the access
itself (i.e. become grep-able beyond the mentioning in the comment). And I
sincerely dislike hard-coded literal numbers when they (relatively) easily
can be expressed another way.
Jan
On 03/07/2025 10:01 am, Jan Beulich wrote:
> On 02.07.2025 16:41, Andrew Cooper wrote:
>> In order elide IPIs, we must be able to identify whether a target CPU is in
>> MWAIT at the point it is woken up. i.e. the store to wake it up must also
>> identify the state.
>>
>> Create a new in_mwait variable beside __softirq_pending, so we can use a
>> CMPXCHG to set the softirq while also observing the status safely. Implement
>> an x86 version of arch_pend_softirq() which does this.
>>
>> In mwait_idle_with_hints(), advertise in_mwait, with an explanation of
>> precisely what it means. X86_BUG_MONITOR can be accounted for simply by not
>> advertising in_mwait.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> ---
>> CC: Jan Beulich <JBeulich@suse.com>
>> CC: Roger Pau Monné <roger.pau@citrix.com>
>> CC: Anthony PERARD <anthony.perard@vates.tech>
>> CC: Michal Orzel <michal.orzel@amd.com>
>> CC: Julien Grall <julien@xen.org>
>> CC: Stefano Stabellini <sstabellini@kernel.org>
>>
>> This is modelled after Linux's TIF_NEED_RESCHED (single bit equivelent of all
>> of __softirq_pending), and TIF_POLLING_NRFLAG (arch-neutral "in_mwait").
>>
>> In Linux, they're both in the same flags field, which adds complexity. In
>> Xen, __softirq_pending is already unsigned long for everything other than x86,
>> so adding an arch-neutral "in_mwait" would need wider changes.
> Why would the flag need to be arch-neutral?
Because it's not about mwait; it's about the ability to notice the
rising edge of TIF_NEED_RESCHED, and is implemented by more than just
x86 in Linux.
>> + */
>> + alternative_io("movb $1, %[in_mwait]",
>> + "", X86_BUG_MONITOR,
>> + [in_mwait] "=m" (stat->in_mwait));
>>
>> monitor(this_softirq_pending, 0, 0);
>> smp_mb();
> Unlike alternative(), alternative_io() has no memory clobber. To the compiler
> the two resulting asm() therefore have no dependency operand-wise[1]. The
> sole ordering criteria then is that they're both volatile asm()-s. It not
> being explicitly said (anywhere that I could find) that volatile guarantees
> such ordering, I wonder if we wouldn't better have an operand-based
> dependency between the two. Otoh it may well be that we rely on such ordering
> elsewhere already, in which case having one more instance probably would be
> okay(ish).
>
> [1] It's actually worse than that, I think: monitor() also doesn't specify
> the location as a (memory) input.
The GCC bugzilla has plenty of statements that volatiles (which have
survived optimisation passes) can't be reordered.
>
>> @@ -452,6 +466,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>> mwait(eax, ecx);
>> spec_ctrl_exit_idle(info);
>> }
>> +
>> + alternative_io("movb $0, %[in_mwait]",
>> + "", X86_BUG_MONITOR,
>> + [in_mwait] "=m" (stat->in_mwait));
> This doesn't strictly need to be an alternative, does it? We can save a
> memory write in the buggy case, but that likely makes at most a marginal
> difference.
It doesn't *need* to be an alternative. It could be:
    if ( !cpu_bug_monitor )
        ACCESS_ONCE(stat->in_mwait) = true;
but getting rid of the branch is an advantage too.
>> --- a/xen/arch/x86/include/asm/hardirq.h
>> +++ b/xen/arch/x86/include/asm/hardirq.h
>> @@ -5,7 +5,19 @@
>> #include <xen/types.h>
>>
>> typedef struct {
>> - unsigned int __softirq_pending;
>> + /*
>> + * The layout is important. Any CPU can set bits in __softirq_pending,
>> + * but in_mwait is a status bit owned by the CPU. softirq_mwait_raw must
>> + * cover both, and must be in a single cacheline.
>> + */
>> + union {
>> + struct {
>> + unsigned int __softirq_pending;
>> + bool in_mwait;
>> + };
>> + uint64_t softirq_mwait_raw;
>> + };
> To guard against someone changing e.g. __softirq_pending to unsigned long
> while ignoring the comment, how about adding a BUILD_BUG_ON() checking both
> parts of the union are the same size (which of course would require naming
> either the union field within the containing struct or its struct member)?
That turns out to be very hard because of the definition of
softirq_pending() being common. More below.
>
>> @@ -9,4 +11,36 @@
>> #define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
>> #define NR_ARCH_SOFTIRQS 5
>>
>> +/*
>> + * Ensure softirq @nr is pending on @cpu. Return true if an IPI can be
>> + * skipped, false if the IPI cannot be skipped.
>> + *
>> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
>> + * set softirq @nr while also observing in_mwait in a race-free way.
>> + */
>> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int cpu)
>> +{
>> + uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
>> + uint64_t old, new;
>> + unsigned int softirq = 1U << nr;
>> +
>> + old = ACCESS_ONCE(*ptr);
>> +
>> + do {
>> + if ( old & softirq )
>> + /* Softirq already pending, nothing to do. */
>> + return true;
>> +
>> + new = old | softirq;
>> +
>> + } while ( (old = cmpxchg(ptr, old, new)) != new );
> Don't you mean
>
> } while ( (new = cmpxchg(ptr, old, new)) != old );
>
> here
No. I'm pretty sure I don't.
> (with new latched ahead of the loop and old set from new inside it),
> like we have it elsewhere when we avoid the use of yet another variable,
> e.g. in inc_linear_{entries,uses}()?
Eww, no. Having new and old backwards like that is borderline
obfuscation, and is unnecessary cognitive complexity for the reader.
>
>> + /*
>> + * We have caused the softirq to become pending. If in_mwait was set, the
>> + * target CPU will notice the modification and act on it.
>> + */
>> + return new & (1UL << 32);
> Hmm, this requires the layout to allow for even less re-arrangement than the
> comment ahead of the union says: You strictly require in_mwait to follow
> __softirq_pending immediately, and the (assembly) write to strictly write 1
> into the field. Could I talk you into at least
>
> return new & (1UL << (offsetof(..., in_mwait) * 8));
>
> ? This way the field being inspected would also be mentioned in the access
> itself (i.e. become grep-able beyond the mentioning in the comment). And I
> sincerely dislike hard-coded literal numbers when they (relatively) easily
> can be expressed another way.
The nice way to do this would be a named union and a proper field, but
that doesn't work because of how softirq_pending() is declared.
I suppose I can use an offsetof() expression.
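(For reference, the common declaration is roughly of this shape, give or
take the exact spelling:

    #define __IRQ_STAT(cpu, field)  (irq_stat[cpu].field)
    #define softirq_pending(cpu)    __IRQ_STAT((cpu), __softirq_pending)

so __softirq_pending has to remain addressable directly as a member of
irq_cpustat_t, without naming an intermediate union or struct.)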
~Andrew
On 03.07.2025 13:59, Andrew Cooper wrote:
> On 03/07/2025 10:01 am, Jan Beulich wrote:
>> On 02.07.2025 16:41, Andrew Cooper wrote:
>>> @@ -452,6 +466,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>>> mwait(eax, ecx);
>>> spec_ctrl_exit_idle(info);
>>> }
>>> +
>>> + alternative_io("movb $0, %[in_mwait]",
>>> + "", X86_BUG_MONITOR,
>>> + [in_mwait] "=m" (stat->in_mwait));
>> This doesn't strictly need to be an alternative, does it? We can save a
>> memory write in the buggy case, but that likely makes at most a marginal
>> difference.
>
> It doesn't *need* to be an alternative. It could be:
>
> if ( !cpu_bug_monitor )
> ACCESS_ONCE(stat->in_mwait) = true;
>
> but getting rid of the branch is an advantage too.
That's the other alternative blob. What I mean is that here it could simply
be

    ACCESS_ONCE(stat->in_mwait) = false;

without any conditional.
>>> --- a/xen/arch/x86/include/asm/hardirq.h
>>> +++ b/xen/arch/x86/include/asm/hardirq.h
>>> @@ -5,7 +5,19 @@
>>> #include <xen/types.h>
>>>
>>> typedef struct {
>>> - unsigned int __softirq_pending;
>>> + /*
>>> + * The layout is important. Any CPU can set bits in __softirq_pending,
>>> + * but in_mwait is a status bit owned by the CPU. softirq_mwait_raw must
>>> + * cover both, and must be in a single cacheline.
>>> + */
>>> + union {
>>> + struct {
>>> + unsigned int __softirq_pending;
>>> + bool in_mwait;
>>> + };
>>> + uint64_t softirq_mwait_raw;
>>> + };
>> To guard against someone changing e.g. __softirq_pending to unsigned long
>> while ignoring the comment, how about adding a BUILD_BUG_ON() checking both
>> parts of the union are the same size (which of course would require naming
>> either the union field within the containing struct or its struct member)?
>
> That turns out to be very hard because of the definition of
> softirq_pending() being common. More below.
Hmm, yes, I see. Then maybe
    BUILD_BUG_ON(offsetof(irq_cpustat_t, in_mwait) +
                 sizeof(irq_stat[0].in_mwait) >
                 offsetof(irq_cpustat_t, softirq_mwait_raw) +
                 sizeof(irq_stat[0].softirq_mwait_raw));
(or something substantially similar using e.g. typeof())?
>>> @@ -9,4 +11,36 @@
>>> #define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
>>> #define NR_ARCH_SOFTIRQS 5
>>>
>>> +/*
>>> + * Ensure softirq @nr is pending on @cpu. Return true if an IPI can be
>>> + * skipped, false if the IPI cannot be skipped.
>>> + *
>>> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
>>> + * set softirq @nr while also observing in_mwait in a race-free way.
>>> + */
>>> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int cpu)
>>> +{
>>> + uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
>>> + uint64_t old, new;
>>> + unsigned int softirq = 1U << nr;
>>> +
>>> + old = ACCESS_ONCE(*ptr);
>>> +
>>> + do {
>>> + if ( old & softirq )
>>> + /* Softirq already pending, nothing to do. */
>>> + return true;
>>> +
>>> + new = old | softirq;
>>> +
>>> + } while ( (old = cmpxchg(ptr, old, new)) != new );
>> Don't you mean
>>
>> } while ( (new = cmpxchg(ptr, old, new)) != old );
>>
>> here
>
> No. I'm pretty sure I don't.
>
>> (with new latched ahead of the loop and old set from new inside it),
>> like we have it elsewhere when we avoid the use of yet another variable,
>> e.g. in inc_linear_{entries,uses}()?
>
> Eww, no. Having new and old backwards like that is borderline
> obfucation, and is unnecessary cognitive complexity for the reader.
Why backwards? You want to compare the (original) value read against the
expected old value. The way you wrote it you'll do at least one extra
loop iteration, as you wait for the fetched (original) value to equal
"new".
Jan
On 03/07/2025 3:07 pm, Jan Beulich wrote:
> On 03.07.2025 13:59, Andrew Cooper wrote:
>> On 03/07/2025 10:01 am, Jan Beulich wrote:
>>> On 02.07.2025 16:41, Andrew Cooper wrote:
>>>> @@ -452,6 +466,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>>>> mwait(eax, ecx);
>>>> spec_ctrl_exit_idle(info);
>>>> }
>>>> +
>>>> + alternative_io("movb $0, %[in_mwait]",
>>>> + "", X86_BUG_MONITOR,
>>>> + [in_mwait] "=m" (stat->in_mwait));
>>> This doesn't strictly need to be an alternative, does it? We can save a
>>> memory write in the buggy case, but that likely makes at most a marginal
>>> difference.
>> It doesn't *need* to be an alternative. It could be:
>>
>> if ( !cpu_bug_monitor )
>> ACCESS_ONCE(stat->in_mwait) = true;
>>
>> but getting rid of the branch is an advantage too.
> That's the other alternative blob. What I mean that here it could simply
> be
>
> ACCESS_ONCE(stat->in_mwait) = false;
>
> without any conditional.
It could. Or it could be consistent with its other half.
>
>>>> --- a/xen/arch/x86/include/asm/hardirq.h
>>>> +++ b/xen/arch/x86/include/asm/hardirq.h
>>>> @@ -5,7 +5,19 @@
>>>> #include <xen/types.h>
>>>>
>>>> typedef struct {
>>>> - unsigned int __softirq_pending;
>>>> + /*
>>>> + * The layout is important. Any CPU can set bits in __softirq_pending,
>>>> + * but in_mwait is a status bit owned by the CPU. softirq_mwait_raw must
>>>> + * cover both, and must be in a single cacheline.
>>>> + */
>>>> + union {
>>>> + struct {
>>>> + unsigned int __softirq_pending;
>>>> + bool in_mwait;
>>>> + };
>>>> + uint64_t softirq_mwait_raw;
>>>> + };
>>> To guard against someone changing e.g. __softirq_pending to unsigned long
>>> while ignoring the comment, how about adding a BUILD_BUG_ON() checking both
>>> parts of the union are the same size (which of course would require naming
>>> either the union field within the containing struct or its struct member)?
>> That turns out to be very hard because of the definition of
>> softirq_pending() being common. More below.
> Hmm, yes, I see. Then maybe
>
> BUILD_BUG_ON(offsetof(irq_cpustat_t, in_mwait) +
> sizeof(irq_stat[0].in_mwait) >
> offsetof(irq_cpustat_t, softirq_mwait_raw) +
> sizeof(irq_stat[0].softirq_mwait_raw));
>
> (or something substantially similar using e.g. typeof())?
>
>>>> @@ -9,4 +11,36 @@
>>>> #define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
>>>> #define NR_ARCH_SOFTIRQS 5
>>>>
>>>> +/*
>>>> + * Ensure softirq @nr is pending on @cpu. Return true if an IPI can be
>>>> + * skipped, false if the IPI cannot be skipped.
>>>> + *
>>>> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
>>>> + * set softirq @nr while also observing in_mwait in a race-free way.
>>>> + */
>>>> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int cpu)
>>>> +{
>>>> + uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
>>>> + uint64_t old, new;
>>>> + unsigned int softirq = 1U << nr;
>>>> +
>>>> + old = ACCESS_ONCE(*ptr);
>>>> +
>>>> + do {
>>>> + if ( old & softirq )
>>>> + /* Softirq already pending, nothing to do. */
>>>> + return true;
>>>> +
>>>> + new = old | softirq;
>>>> +
>>>> + } while ( (old = cmpxchg(ptr, old, new)) != new );
>>> Don't you mean
>>>
>>> } while ( (new = cmpxchg(ptr, old, new)) != old );
>>>
>>> here
>> No. I'm pretty sure I don't.
>>
>>> (with new latched ahead of the loop and old set from new inside it),
>>> like we have it elsewhere when we avoid the use of yet another variable,
>>> e.g. in inc_linear_{entries,uses}()?
>> Eww, no. Having new and old backwards like that is borderline
>> obfucation, and is unnecessary cognitive complexity for the reader.
> Why backwards? You want to compare the (original) value read against the
> expected old value. The way you wrote it you'll do at least one extra
> loop iteration, as you wait for the fetched (original) value to equal
> "new".
Hmm, yes, that's not quite what I intended, but I'm also not happy
writing anything of the form "new = cmpxchg()". It's plain wrong for
the semantics of cmpxchg.
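E.g. a form which keeps the cmpxchg() return value in a separate variable
(a sketch only, not necessarily the form of the local fix):

    uint64_t prev;

    for ( ;; )
    {
        if ( old & softirq )
            /* Softirq already pending, nothing to do. */
            return true;

        new = old | softirq;

        prev = cmpxchg(ptr, old, new);
        if ( prev == old )
            break;

        old = prev;
    }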
~Andrew