Introduce a helper function to allow offlined CPUs to enter FFh idle
states with a specific MWAIT hint. The new helper will be used in
subsequent patches by the acpi_idle and intel_idle drivers.
No functional change intended.
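For illustration only (not part of this patch), a rough sketch of how a
driver-side :enter_dead() callback might hand a state-specific hint to
the new helper in one of the follow-up patches; the callback name, its
signature and the hardcoded hint value below are assumptions, not code
from this series:

  #include <linux/cpuidle.h>
  #include <asm/smp.h>

  /* Hypothetical :enter_dead() handler -- names and hint are made up. */
  static int example_enter_dead(struct cpuidle_device *dev, int index)
  {
          /*
           * Example hint only: C-state field 2, sub-state 0.  A real
           * driver would derive this from its per-state data.
           */
          mwait_play_dead(0x20);

          return 0;       /* unreachable on SMP; the !SMP stub just returns */
  }

In the actual acpi_idle/intel_idle patches the hint would presumably
come from the idle-state table rather than a constant.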
Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
---
arch/x86/include/asm/smp.h | 3 ++
arch/x86/kernel/smpboot.c | 90 ++++++++++++++++++++------------------
2 files changed, 51 insertions(+), 42 deletions(-)
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index ca073f40698f..dfd09a1e09bf 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
void smp_kick_mwait_play_dead(void);
+void mwait_play_dead(unsigned int hint);
void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
@@ -164,6 +165,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
{
return (struct cpumask *)cpumask_of(0);
}
+
+static inline void mwait_play_dead(unsigned int eax_hint) { }
#endif /* CONFIG_SMP */
#ifdef CONFIG_DEBUG_NMI_SELFTEST
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b5a8f0891135..8a3545c2cae9 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1272,13 +1272,57 @@ void play_dead_common(void)
local_irq_disable();
}
+void __noreturn mwait_play_dead(unsigned int eax_hint)
+{
+ struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
+
+ /* Set up state for the kexec() hack below */
+ md->status = CPUDEAD_MWAIT_WAIT;
+ md->control = CPUDEAD_MWAIT_WAIT;
+
+ wbinvd();
+
+ while (1) {
+ /*
+ * The CLFLUSH is a workaround for erratum AAI65 for
+ * the Xeon 7400 series. It's not clear it is actually
+ * needed, but it should be harmless in either case.
+ * The WBINVD is insufficient due to the spurious-wakeup
+ * case where we return around the loop.
+ */
+ mb();
+ clflush(md);
+ mb();
+ __monitor(md, 0, 0);
+ mb();
+ __mwait(eax_hint, 0);
+
+ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+ /*
+ * Kexec is about to happen. Don't go back into mwait() as
+ * the kexec kernel might overwrite text and data including
+ * page tables and stack. So mwait() would resume when the
+ * monitor cache line is written to and then the CPU goes
+ * south due to overwritten text, page tables and stack.
+ *
+ * Note: This does _NOT_ protect against a stray MCE, NMI,
+ * SMI. They will resume execution at the instruction
+ * following the HLT instruction and run into the problem
+ * which this is trying to prevent.
+ */
+ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+ while(1)
+ native_halt();
+ }
+ }
+}
+
/*
* We need to flush the caches before going to sleep, lest we have
* dirty data in our caches when we come back up.
*/
-static inline void mwait_play_dead(void)
+static inline void mwait_play_dead_cpuid_hint(void)
{
- struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
unsigned int eax, ebx, ecx, edx;
unsigned int highest_cstate = 0;
unsigned int highest_subcstate = 0;
@@ -1316,45 +1360,7 @@ static inline void mwait_play_dead(void)
(highest_subcstate - 1);
}
- /* Set up state for the kexec() hack below */
- md->status = CPUDEAD_MWAIT_WAIT;
- md->control = CPUDEAD_MWAIT_WAIT;
-
- wbinvd();
-
- while (1) {
- /*
- * The CLFLUSH is a workaround for erratum AAI65 for
- * the Xeon 7400 series. It's not clear it is actually
- * needed, but it should be harmless in either case.
- * The WBINVD is insufficient due to the spurious-wakeup
- * case where we return around the loop.
- */
- mb();
- clflush(md);
- mb();
- __monitor(md, 0, 0);
- mb();
- __mwait(eax, 0);
-
- if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
- /*
- * Kexec is about to happen. Don't go back into mwait() as
- * the kexec kernel might overwrite text and data including
- * page tables and stack. So mwait() would resume when the
- * monitor cache line is written to and then the CPU goes
- * south due to overwritten text, page tables and stack.
- *
- * Note: This does _NOT_ protect against a stray MCE, NMI,
- * SMI. They will resume execution at the instruction
- * following the HLT instruction and run into the problem
- * which this is trying to prevent.
- */
- WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
- while(1)
- native_halt();
- }
- }
+ mwait_play_dead(eax);
}
/*
@@ -1407,7 +1413,7 @@ void native_play_dead(void)
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
- mwait_play_dead();
+ mwait_play_dead_cpuid_hint();
if (cpuidle_play_dead())
hlt_play_dead();
}
--
2.47.1
First, I would change the subject to something like "x86/smp: Add hint
parameter to mwait_play_dead()"
On Fri, Nov 29, 2024 at 7:22 PM Patryk Wlazlyn
<patryk.wlazlyn@linux.intel.com> wrote:
>
> Introduce a helper function to allow offlined CPUs to enter FFh idle
> states with a specific MWAIT hint. The new helper will be used in
> subsequent patches by the acpi_idle and intel_idle drivers.
And the above would become
"Change mwait_play_dead() into a helper function allowing CPUs going
offline to enter idle states via MWAIT with a specific hint passed to
it as an argument.
Add mwait_play_dead_cpuid_hint() as a wrapper around mwait_play_dead()
implementing the existing behavior of the code.
Subsequently, the new helper will also be used by the acpi_idle and
intel_idle drivers in idle-state-specific :enter_dead() callbacks."
> No functional change intended.
>
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
> ---
> arch/x86/include/asm/smp.h | 3 ++
> arch/x86/kernel/smpboot.c | 90 ++++++++++++++++++++------------------
> 2 files changed, 51 insertions(+), 42 deletions(-)
>
> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index ca073f40698f..dfd09a1e09bf 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu);
> int wbinvd_on_all_cpus(void);
>
> void smp_kick_mwait_play_dead(void);
> +void mwait_play_dead(unsigned int hint);
>
> void native_smp_send_reschedule(int cpu);
> void native_send_call_func_ipi(const struct cpumask *mask);
> @@ -164,6 +165,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
> {
> return (struct cpumask *)cpumask_of(0);
> }
> +
> +static inline void mwait_play_dead(unsigned int eax_hint) { }
> #endif /* CONFIG_SMP */
>
> #ifdef CONFIG_DEBUG_NMI_SELFTEST
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index b5a8f0891135..8a3545c2cae9 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -1272,13 +1272,57 @@ void play_dead_common(void)
> local_irq_disable();
> }
>
> +void __noreturn mwait_play_dead(unsigned int eax_hint)
> +{
> + struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
> +
> + /* Set up state for the kexec() hack below */
> + md->status = CPUDEAD_MWAIT_WAIT;
> + md->control = CPUDEAD_MWAIT_WAIT;
> +
> + wbinvd();
> +
> + while (1) {
> + /*
> + * The CLFLUSH is a workaround for erratum AAI65 for
> + * the Xeon 7400 series. It's not clear it is actually
> + * needed, but it should be harmless in either case.
> + * The WBINVD is insufficient due to the spurious-wakeup
> + * case where we return around the loop.
> + */
> + mb();
> + clflush(md);
> + mb();
> + __monitor(md, 0, 0);
> + mb();
> + __mwait(eax_hint, 0);
> +
> + if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
> + /*
> + * Kexec is about to happen. Don't go back into mwait() as
> + * the kexec kernel might overwrite text and data including
> + * page tables and stack. So mwait() would resume when the
> + * monitor cache line is written to and then the CPU goes
> + * south due to overwritten text, page tables and stack.
> + *
> + * Note: This does _NOT_ protect against a stray MCE, NMI,
> + * SMI. They will resume execution at the instruction
> + * following the HLT instruction and run into the problem
> + * which this is trying to prevent.
> + */
> + WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
> + while(1)
> + native_halt();
> + }
> + }
> +}
> +
> /*
> * We need to flush the caches before going to sleep, lest we have
> * dirty data in our caches when we come back up.
> */
> -static inline void mwait_play_dead(void)
> +static inline void mwait_play_dead_cpuid_hint(void)
> {
> - struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
> unsigned int eax, ebx, ecx, edx;
> unsigned int highest_cstate = 0;
> unsigned int highest_subcstate = 0;
> @@ -1316,45 +1360,7 @@ static inline void mwait_play_dead(void)
> (highest_subcstate - 1);
> }
>
> - /* Set up state for the kexec() hack below */
> - md->status = CPUDEAD_MWAIT_WAIT;
> - md->control = CPUDEAD_MWAIT_WAIT;
> -
> - wbinvd();
> -
> - while (1) {
> - /*
> - * The CLFLUSH is a workaround for erratum AAI65 for
> - * the Xeon 7400 series. It's not clear it is actually
> - * needed, but it should be harmless in either case.
> - * The WBINVD is insufficient due to the spurious-wakeup
> - * case where we return around the loop.
> - */
> - mb();
> - clflush(md);
> - mb();
> - __monitor(md, 0, 0);
> - mb();
> - __mwait(eax, 0);
> -
> - if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
> - /*
> - * Kexec is about to happen. Don't go back into mwait() as
> - * the kexec kernel might overwrite text and data including
> - * page tables and stack. So mwait() would resume when the
> - * monitor cache line is written to and then the CPU goes
> - * south due to overwritten text, page tables and stack.
> - *
> - * Note: This does _NOT_ protect against a stray MCE, NMI,
> - * SMI. They will resume execution at the instruction
> - * following the HLT instruction and run into the problem
> - * which this is trying to prevent.
> - */
> - WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
> - while(1)
> - native_halt();
> - }
> - }
> + mwait_play_dead(eax);
> }
>
> /*
> @@ -1407,7 +1413,7 @@ void native_play_dead(void)
> play_dead_common();
> tboot_shutdown(TB_SHUTDOWN_WFS);
>
> - mwait_play_dead();
> + mwait_play_dead_cpuid_hint();
> if (cpuidle_play_dead())
> hlt_play_dead();
> }
> --
And honestly I'm wondering why adding a parameter to mwait_play_dead()
is better than introducing mwait_play_dead_with_hint(), in analogy
with the existing mwait_idle_with_hints()?
The latter option would allow you to avoid introducing a function that
is deleted in the same patch series (in patch 4).
> And honestly I'm wondering why adding a parameter to mwait_play_dead()
> is better than introducing mwait_play_dead_with_hint(), in analogy
> with the existing mwait_idle_with_hints()?
>
> The latter option would allow you to avoid introducing a function that
> is deleted in the same patch series (in patch 4).

We need to be able to call part of the old mwait_play_dead() code, but
without the hint calculation.

mwait_idle_with_hints() doesn't have the "kexec hack" logic.

We also need to leave the old code working and on top of that introduce
the acpi_idle and intel_idle patches that use the new API. Now the old
code is there and the new one. The only thing left is to remove the old
code.

I did it that way because of the comments earlier indicating that I
should not be breaking code in between.

Let me know if I answered your question or if I misunderstood something
now or earlier. I'll apply your changelog suggestions when we agree on
the implementation.
On Tue, Dec 17, 2024 at 9:09 PM Patryk Wlazlyn
<patryk.wlazlyn@linux.intel.com> wrote:
>
> > And honestly I'm wondering why adding a parameter to mwait_play_dead()
> > is better than introducing mwait_play_dead_with_hint(), in analogy
> > with the existing mwait_idle_with_hints()?
> >
> > The latter option would allow you to avoid introducing a function that
> > is deleted in the same patch series (in patch 4).
>
> We need to be able to call part of the old mwait_play_dead() code,
> but without the hint calculation.
>
> mwait_idle_with_hints() doesn't have the "kexec hack" logic.

Well, "in analogy" doesn't mean to use mwait_idle_with_hints() instead
of the new function. Just the name of the new function could be similar
to mwait_idle_with_hints() (which is the name of an existing function),
that is mwait_play_dead_with_hint().

> We also need to leave the old code working and on top of that introduce
> the acpi_idle and intel_idle patches that use the new API.

Sure. If the name of the new function is mwait_play_dead_with_hint(),
that will still work.
>>> And honestly I'm wondering why adding a parameter to mwait_play_dead()
>>> is better than introducing mwait_play_dead_with_hint(), in analogy
>>> with the existing mwait_idle_with_hints()?

Well.. Maybe that wasn't that good of an idea. I've given the rationale
in the 0/4:

> Changes since v6:
> * Renamed mwait_play_dead to mwait_play_dead_cpuid_hint in 1/1, so that
>   mwait_play_dead name can be reused for the function that takes the
>   MWAIT hint as an argument. This leaves the comments around the
>   smpboot.c file that reference the old mwait_play_dead() unchanged.

It makes the patches simpler, in a sense that I don't have to update
the comments each patch when moving things around and renaming.
On Fri, Nov 29, 2024 at 07:22:29PM +0100, Patryk Wlazlyn wrote:
> Introduce a helper function to allow offlined CPUs to enter FFh idle
> states with a specific MWAIT hint. The new helper will be used in
> subsequent patches by the acpi_idle and intel_idle drivers.
>
> No functional change intended.
>
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
This looks good to me.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
--
Thanks and Regards
gautham.