[PATCH v7 1/4] x86/smp: Allow calling mwait_play_dead with an arbitrary hint

Patryk Wlazlyn posted 4 patches 3 weeks, 5 days ago
There is a newer version of this series
[PATCH v7 1/4] x86/smp: Allow calling mwait_play_dead with an arbitrary hint
Posted by Patryk Wlazlyn 3 weeks, 5 days ago
Introduce a helper function to allow offlined CPUs to enter FFh idle
states with a specific MWAIT hint. The new helper will be used in
subsequent patches by the acpi_idle and intel_idle drivers.

No functional change intended.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
---
 arch/x86/include/asm/smp.h |  3 ++
 arch/x86/kernel/smpboot.c  | 90 ++++++++++++++++++++------------------
 2 files changed, 51 insertions(+), 42 deletions(-)

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index ca073f40698f..dfd09a1e09bf 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu);
 int wbinvd_on_all_cpus(void);
 
 void smp_kick_mwait_play_dead(void);
+void mwait_play_dead(unsigned int hint);
 
 void native_smp_send_reschedule(int cpu);
 void native_send_call_func_ipi(const struct cpumask *mask);
@@ -164,6 +165,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 {
 	return (struct cpumask *)cpumask_of(0);
 }
+
+static inline void mwait_play_dead(unsigned int eax_hint) { }
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_DEBUG_NMI_SELFTEST
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b5a8f0891135..8a3545c2cae9 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1272,13 +1272,57 @@ void play_dead_common(void)
 	local_irq_disable();
 }
 
+void __noreturn mwait_play_dead(unsigned int eax_hint)
+{
+	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
+
+	/* Set up state for the kexec() hack below */
+	md->status = CPUDEAD_MWAIT_WAIT;
+	md->control = CPUDEAD_MWAIT_WAIT;
+
+	wbinvd();
+
+	while (1) {
+		/*
+		 * The CLFLUSH is a workaround for erratum AAI65 for
+		 * the Xeon 7400 series.  It's not clear it is actually
+		 * needed, but it should be harmless in either case.
+		 * The WBINVD is insufficient due to the spurious-wakeup
+		 * case where we return around the loop.
+		 */
+		mb();
+		clflush(md);
+		mb();
+		__monitor(md, 0, 0);
+		mb();
+		__mwait(eax_hint, 0);
+
+		if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+			/*
+			 * Kexec is about to happen. Don't go back into mwait() as
+			 * the kexec kernel might overwrite text and data including
+			 * page tables and stack. So mwait() would resume when the
+			 * monitor cache line is written to and then the CPU goes
+			 * south due to overwritten text, page tables and stack.
+			 *
+			 * Note: This does _NOT_ protect against a stray MCE, NMI,
+			 * SMI. They will resume execution at the instruction
+			 * following the HLT instruction and run into the problem
+			 * which this is trying to prevent.
+			 */
+			WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+			while(1)
+				native_halt();
+		}
+	}
+}
+
 /*
  * We need to flush the caches before going to sleep, lest we have
  * dirty data in our caches when we come back up.
  */
-static inline void mwait_play_dead(void)
+static inline void mwait_play_dead_cpuid_hint(void)
 {
-	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
 	unsigned int eax, ebx, ecx, edx;
 	unsigned int highest_cstate = 0;
 	unsigned int highest_subcstate = 0;
@@ -1316,45 +1360,7 @@ static inline void mwait_play_dead(void)
 			(highest_subcstate - 1);
 	}
 
-	/* Set up state for the kexec() hack below */
-	md->status = CPUDEAD_MWAIT_WAIT;
-	md->control = CPUDEAD_MWAIT_WAIT;
-
-	wbinvd();
-
-	while (1) {
-		/*
-		 * The CLFLUSH is a workaround for erratum AAI65 for
-		 * the Xeon 7400 series.  It's not clear it is actually
-		 * needed, but it should be harmless in either case.
-		 * The WBINVD is insufficient due to the spurious-wakeup
-		 * case where we return around the loop.
-		 */
-		mb();
-		clflush(md);
-		mb();
-		__monitor(md, 0, 0);
-		mb();
-		__mwait(eax, 0);
-
-		if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
-			/*
-			 * Kexec is about to happen. Don't go back into mwait() as
-			 * the kexec kernel might overwrite text and data including
-			 * page tables and stack. So mwait() would resume when the
-			 * monitor cache line is written to and then the CPU goes
-			 * south due to overwritten text, page tables and stack.
-			 *
-			 * Note: This does _NOT_ protect against a stray MCE, NMI,
-			 * SMI. They will resume execution at the instruction
-			 * following the HLT instruction and run into the problem
-			 * which this is trying to prevent.
-			 */
-			WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
-			while(1)
-				native_halt();
-		}
-	}
+	mwait_play_dead(eax);
 }
 
 /*
@@ -1407,7 +1413,7 @@ void native_play_dead(void)
 	play_dead_common();
 	tboot_shutdown(TB_SHUTDOWN_WFS);
 
-	mwait_play_dead();
+	mwait_play_dead_cpuid_hint();
 	if (cpuidle_play_dead())
 		hlt_play_dead();
 }
-- 
2.47.1
Re: [PATCH v7 1/4] x86/smp: Allow calling mwait_play_dead with an arbitrary hint
Posted by Rafael J. Wysocki 2 weeks, 1 day ago
First, I would change the subject to something like "x86/smp: Add hint
parameter to mwait_play_dead()".

On Fri, Nov 29, 2024 at 7:22 PM Patryk Wlazlyn
<patryk.wlazlyn@linux.intel.com> wrote:
>
> Introduce a helper function to allow offlined CPUs to enter FFh idle
> states with a specific MWAIT hint. The new helper will be used in
> subsequent patches by the acpi_idle and intel_idle drivers.

And the above would become

"Change mwait_play_dead() into a helper function allowing CPUs going
offline to enter idle states via MWAIT with a specific hint passed to
it as an argument.

Add mwait_play_dead_cpuid_hint() as a wrapper around mwait_play_dead()
implementing the existing behavior of the code.

Subsequently, the new helper will also be used by the acpi_idle and
intel_idle drivers in idle-state-specific :enter_dead() callbacks."

> No functional change intended.
>
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
> ---
>  arch/x86/include/asm/smp.h |  3 ++
>  arch/x86/kernel/smpboot.c  | 90 ++++++++++++++++++++------------------
>  2 files changed, 51 insertions(+), 42 deletions(-)
>
> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index ca073f40698f..dfd09a1e09bf 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu);
>  int wbinvd_on_all_cpus(void);
>
>  void smp_kick_mwait_play_dead(void);
> +void mwait_play_dead(unsigned int hint);
>
>  void native_smp_send_reschedule(int cpu);
>  void native_send_call_func_ipi(const struct cpumask *mask);
> @@ -164,6 +165,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
>  {
>         return (struct cpumask *)cpumask_of(0);
>  }
> +
> +static inline void mwait_play_dead(unsigned int eax_hint) { }
>  #endif /* CONFIG_SMP */
>
>  #ifdef CONFIG_DEBUG_NMI_SELFTEST
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index b5a8f0891135..8a3545c2cae9 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -1272,13 +1272,57 @@ void play_dead_common(void)
>         local_irq_disable();
>  }
>
> +void __noreturn mwait_play_dead(unsigned int eax_hint)
> +{
> +       struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
> +
> +       /* Set up state for the kexec() hack below */
> +       md->status = CPUDEAD_MWAIT_WAIT;
> +       md->control = CPUDEAD_MWAIT_WAIT;
> +
> +       wbinvd();
> +
> +       while (1) {
> +               /*
> +                * The CLFLUSH is a workaround for erratum AAI65 for
> +                * the Xeon 7400 series.  It's not clear it is actually
> +                * needed, but it should be harmless in either case.
> +                * The WBINVD is insufficient due to the spurious-wakeup
> +                * case where we return around the loop.
> +                */
> +               mb();
> +               clflush(md);
> +               mb();
> +               __monitor(md, 0, 0);
> +               mb();
> +               __mwait(eax_hint, 0);
> +
> +               if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
> +                       /*
> +                        * Kexec is about to happen. Don't go back into mwait() as
> +                        * the kexec kernel might overwrite text and data including
> +                        * page tables and stack. So mwait() would resume when the
> +                        * monitor cache line is written to and then the CPU goes
> +                        * south due to overwritten text, page tables and stack.
> +                        *
> +                        * Note: This does _NOT_ protect against a stray MCE, NMI,
> +                        * SMI. They will resume execution at the instruction
> +                        * following the HLT instruction and run into the problem
> +                        * which this is trying to prevent.
> +                        */
> +                       WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
> +                       while(1)
> +                               native_halt();
> +               }
> +       }
> +}
> +
>  /*
>   * We need to flush the caches before going to sleep, lest we have
>   * dirty data in our caches when we come back up.
>   */
> -static inline void mwait_play_dead(void)
> +static inline void mwait_play_dead_cpuid_hint(void)
>  {
> -       struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
>         unsigned int eax, ebx, ecx, edx;
>         unsigned int highest_cstate = 0;
>         unsigned int highest_subcstate = 0;
> @@ -1316,45 +1360,7 @@ static inline void mwait_play_dead(void)
>                         (highest_subcstate - 1);
>         }
>
> -       /* Set up state for the kexec() hack below */
> -       md->status = CPUDEAD_MWAIT_WAIT;
> -       md->control = CPUDEAD_MWAIT_WAIT;
> -
> -       wbinvd();
> -
> -       while (1) {
> -               /*
> -                * The CLFLUSH is a workaround for erratum AAI65 for
> -                * the Xeon 7400 series.  It's not clear it is actually
> -                * needed, but it should be harmless in either case.
> -                * The WBINVD is insufficient due to the spurious-wakeup
> -                * case where we return around the loop.
> -                */
> -               mb();
> -               clflush(md);
> -               mb();
> -               __monitor(md, 0, 0);
> -               mb();
> -               __mwait(eax, 0);
> -
> -               if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
> -                       /*
> -                        * Kexec is about to happen. Don't go back into mwait() as
> -                        * the kexec kernel might overwrite text and data including
> -                        * page tables and stack. So mwait() would resume when the
> -                        * monitor cache line is written to and then the CPU goes
> -                        * south due to overwritten text, page tables and stack.
> -                        *
> -                        * Note: This does _NOT_ protect against a stray MCE, NMI,
> -                        * SMI. They will resume execution at the instruction
> -                        * following the HLT instruction and run into the problem
> -                        * which this is trying to prevent.
> -                        */
> -                       WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
> -                       while(1)
> -                               native_halt();
> -               }
> -       }
> +       mwait_play_dead(eax);
>  }
>
>  /*
> @@ -1407,7 +1413,7 @@ void native_play_dead(void)
>         play_dead_common();
>         tboot_shutdown(TB_SHUTDOWN_WFS);
>
> -       mwait_play_dead();
> +       mwait_play_dead_cpuid_hint();
>         if (cpuidle_play_dead())
>                 hlt_play_dead();
>  }
> --

And honestly I'm wondering why adding a parameter to mwait_play_dead()
is better than introducing mwait_play_dead_with_hint(), in analogy
with the existing mwait_idle_with_hints()?

The latter option would allow you to avoid introducing a function that
is deleted in the same patch series (in patch 4).
Re: [PATCH v7 1/4] x86/smp: Allow calling mwait_play_dead with an arbitrary hint
Posted by Patryk Wlazlyn 1 week, 1 day ago
> And honestly I'm wondering why adding a parameter to mwait_play_dead()
> is better than introducing mwait_play_dead_with_hint(), in analogy
> with the existing mwait_idle_with_hints()?
>
> The latter option would allow you to avoid introducing a function that
> is deleted in the same patch series (in patch 4).

We need to be able to call part of the old mwait_play_dead() code,
but without the hint calculation.

mwait_idle_with_hints() doesn't have the "kexec hack" logic.

We also need to leave the old code working and on top of that introduce
the acpi_idle and intel_idle patches that use the new API.

Now both the old code and the new code are there. The only thing left is to
remove the old code. I did it that way because of the earlier comments
indicating that I should not break the code in between patches.

Let me know if I answered your question or if I misunderstood something
now or earlier.

I'll apply your changelog suggestions when we agree on the implementation.
Re: [PATCH v7 1/4] x86/smp: Allow calling mwait_play_dead with an arbitrary hint
Posted by Rafael J. Wysocki 1 week, 1 day ago
On Tue, Dec 17, 2024 at 9:09 PM Patryk Wlazlyn
<patryk.wlazlyn@linux.intel.com> wrote:
>
> > And honestly I'm wondering why adding a parameter to mwait_play_dead()
> > is better than introducing mwait_play_dead_with_hint(), in analogy
> > with the existing mwait_idle_with_hints()?
> >
> > The latter option would allow you to avoid introducing a function that
> > is deleted in the same patch series (in patch 4).
>
> We need to be able to call part of the old mwait_play_dead() code,
> but without the hint calculation.
>
> mwait_idle_with_hints() doesn't have the "kexec hack" logic.

Well, "in analogy" doesn't mean using mwait_idle_with_hints() instead
of the new function.

Just the name of the new function could be similar to
mwait_idle_with_hints() (which is the name of an existing function),
that is mwait_play_dead_with_hint().

> We also need to leave the old code working and on top of that introduce
> the acpi_idle and intel_idle patches that use the new API.

Sure.  If the name of the new function is mwait_play_dead_with_hint(),
that will still work.
Re: [PATCH v7 1/4] x86/smp: Allow calling mwait_play_dead with an arbitrary hint
Posted by Gautham R. Shenoy 3 weeks, 2 days ago
On Fri, Nov 29, 2024 at 07:22:29PM +0100, Patryk Wlazlyn wrote:
> Introduce a helper function to allow offlined CPUs to enter FFh idle
> states with a specific MWAIT hint. The new helper will be used in
> subsequent patches by the acpi_idle and intel_idle drivers.
> 
> No functional change intended.
> 
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>

This looks good to me.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>

--
Thanks and Regards
gautham.

> ---
>  arch/x86/include/asm/smp.h |  3 ++
>  arch/x86/kernel/smpboot.c  | 90 ++++++++++++++++++++------------------
>  2 files changed, 51 insertions(+), 42 deletions(-)
> 
> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index ca073f40698f..dfd09a1e09bf 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu);
>  int wbinvd_on_all_cpus(void);
>  
>  void smp_kick_mwait_play_dead(void);
> +void mwait_play_dead(unsigned int hint);
>  
>  void native_smp_send_reschedule(int cpu);
>  void native_send_call_func_ipi(const struct cpumask *mask);
> @@ -164,6 +165,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
>  {
>  	return (struct cpumask *)cpumask_of(0);
>  }
> +
> +static inline void mwait_play_dead(unsigned int eax_hint) { }
>  #endif /* CONFIG_SMP */
>  
>  #ifdef CONFIG_DEBUG_NMI_SELFTEST
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index b5a8f0891135..8a3545c2cae9 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -1272,13 +1272,57 @@ void play_dead_common(void)
>  	local_irq_disable();
>  }
>  
> +void __noreturn mwait_play_dead(unsigned int eax_hint)
> +{
> +	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
> +
> +	/* Set up state for the kexec() hack below */
> +	md->status = CPUDEAD_MWAIT_WAIT;
> +	md->control = CPUDEAD_MWAIT_WAIT;
> +
> +	wbinvd();
> +
> +	while (1) {
> +		/*
> +		 * The CLFLUSH is a workaround for erratum AAI65 for
> +		 * the Xeon 7400 series.  It's not clear it is actually
> +		 * needed, but it should be harmless in either case.
> +		 * The WBINVD is insufficient due to the spurious-wakeup
> +		 * case where we return around the loop.
> +		 */
> +		mb();
> +		clflush(md);
> +		mb();
> +		__monitor(md, 0, 0);
> +		mb();
> +		__mwait(eax_hint, 0);
> +
> +		if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
> +			/*
> +			 * Kexec is about to happen. Don't go back into mwait() as
> +			 * the kexec kernel might overwrite text and data including
> +			 * page tables and stack. So mwait() would resume when the
> +			 * monitor cache line is written to and then the CPU goes
> +			 * south due to overwritten text, page tables and stack.
> +			 *
> +			 * Note: This does _NOT_ protect against a stray MCE, NMI,
> +			 * SMI. They will resume execution at the instruction
> +			 * following the HLT instruction and run into the problem
> +			 * which this is trying to prevent.
> +			 */
> +			WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
> +			while(1)
> +				native_halt();
> +		}
> +	}
> +}
> +
>  /*
>   * We need to flush the caches before going to sleep, lest we have
>   * dirty data in our caches when we come back up.
>   */
> -static inline void mwait_play_dead(void)
> +static inline void mwait_play_dead_cpuid_hint(void)
>  {
> -	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
>  	unsigned int eax, ebx, ecx, edx;
>  	unsigned int highest_cstate = 0;
>  	unsigned int highest_subcstate = 0;
> @@ -1316,45 +1360,7 @@ static inline void mwait_play_dead(void)
>  			(highest_subcstate - 1);
>  	}
>  
> -	/* Set up state for the kexec() hack below */
> -	md->status = CPUDEAD_MWAIT_WAIT;
> -	md->control = CPUDEAD_MWAIT_WAIT;
> -
> -	wbinvd();
> -
> -	while (1) {
> -		/*
> -		 * The CLFLUSH is a workaround for erratum AAI65 for
> -		 * the Xeon 7400 series.  It's not clear it is actually
> -		 * needed, but it should be harmless in either case.
> -		 * The WBINVD is insufficient due to the spurious-wakeup
> -		 * case where we return around the loop.
> -		 */
> -		mb();
> -		clflush(md);
> -		mb();
> -		__monitor(md, 0, 0);
> -		mb();
> -		__mwait(eax, 0);
> -
> -		if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
> -			/*
> -			 * Kexec is about to happen. Don't go back into mwait() as
> -			 * the kexec kernel might overwrite text and data including
> -			 * page tables and stack. So mwait() would resume when the
> -			 * monitor cache line is written to and then the CPU goes
> -			 * south due to overwritten text, page tables and stack.
> -			 *
> -			 * Note: This does _NOT_ protect against a stray MCE, NMI,
> -			 * SMI. They will resume execution at the instruction
> -			 * following the HLT instruction and run into the problem
> -			 * which this is trying to prevent.
> -			 */
> -			WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
> -			while(1)
> -				native_halt();
> -		}
> -	}
> +	mwait_play_dead(eax);
>  }
>  
>  /*
> @@ -1407,7 +1413,7 @@ void native_play_dead(void)
>  	play_dead_common();
>  	tboot_shutdown(TB_SHUTDOWN_WFS);
>  
> -	mwait_play_dead();
> +	mwait_play_dead_cpuid_hint();
>  	if (cpuidle_play_dead())
>  		hlt_play_dead();
>  }
> -- 
> 2.47.1
>