[PATCH v7 4/4] x86/smp native_play_dead: Prefer cpuidle_play_dead() over mwait_play_dead()

Patryk Wlazlyn posted 4 patches 3 weeks, 5 days ago
There is a newer version of this series
[PATCH v7 4/4] x86/smp native_play_dead: Prefer cpuidle_play_dead() over mwait_play_dead()
Posted by Patryk Wlazlyn 3 weeks, 5 days ago
The current algorithm* for looking up the mwait hint for the deepest
cstate, in mwait_play_dead() code works by inspecting CPUID leaf 0x5 and
calculates the mwait hint based on the number of reported substates.
This approach depends on the hints associated with them to be continuous
in the range [0, NUM_SUBSTATES-1]. This continuity is not documented and
is not met on the recent Intel platforms.

 * The current algorithm is implemented in the for loop inspecting edx
   in mwait_play_dead().

For example, Intel's Sierra Forest report two cstates with two substates
each in cpuid leaf 0x5:

  Name*   target cstate    target subcstate (mwait hint)
  ===========================================================
  C1      0x00             0x00
  C1E     0x00             0x01

  --      0x10             ----

  C6S     0x20             0x22
  C6P     0x20             0x23

  --      0x30             ----

  /* No more (sub)states all the way down to the end. */
  ===========================================================

   * Names of the cstates are not included in the CPUID leaf 0x5, they are
     taken from the product specific documentation.

Notice that hints 0x20 and 0x21 are skipped entirely for the target
cstate 0x20 (C6), being a cause of the problem for the current cpuid
leaf 0x5 algorithm.

Remove the old implementation of play_dead MWAIT hint calculation based
on the CPUID leaf 0x5 in mwait_play_dead() and delegate calling of the
mwait_play_dead_with_hint() to the idle driver.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
---
 arch/x86/kernel/smpboot.c | 56 +++++----------------------------------
 1 file changed, 7 insertions(+), 49 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8a3545c2cae9..82801137486d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1272,6 +1272,10 @@ void play_dead_common(void)
 	local_irq_disable();
 }
 
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
 void __noreturn mwait_play_dead(unsigned int eax_hint)
 {
 	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
@@ -1317,52 +1321,6 @@ void __noreturn mwait_play_dead(unsigned int eax_hint)
 	}
 }
 
-/*
- * We need to flush the caches before going to sleep, lest we have
- * dirty data in our caches when we come back up.
- */
-static inline void mwait_play_dead_cpuid_hint(void)
-{
-	unsigned int eax, ebx, ecx, edx;
-	unsigned int highest_cstate = 0;
-	unsigned int highest_subcstate = 0;
-	int i;
-
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
-	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
-		return;
-	if (!this_cpu_has(X86_FEATURE_MWAIT))
-		return;
-	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
-		return;
-	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
-		return;
-
-	eax = CPUID_MWAIT_LEAF;
-	ecx = 0;
-	native_cpuid(&eax, &ebx, &ecx, &edx);
-
-	/*
-	 * eax will be 0 if EDX enumeration is not valid.
-	 * Initialized below to cstate, sub_cstate value when EDX is valid.
-	 */
-	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
-		eax = 0;
-	} else {
-		edx >>= MWAIT_SUBSTATE_SIZE;
-		for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
-			if (edx & MWAIT_SUBSTATE_MASK) {
-				highest_cstate = i;
-				highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
-			}
-		}
-		eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
-			(highest_subcstate - 1);
-	}
-
-	mwait_play_dead(eax);
-}
-
 /*
  * Kick all "offline" CPUs out of mwait on kexec(). See comment in
  * mwait_play_dead().
@@ -1413,9 +1371,9 @@ void native_play_dead(void)
 	play_dead_common();
 	tboot_shutdown(TB_SHUTDOWN_WFS);
 
-	mwait_play_dead_cpuid_hint();
-	if (cpuidle_play_dead())
-		hlt_play_dead();
+	/* Below returns only on error. */
+	cpuidle_play_dead();
+	hlt_play_dead();
 }
 
 #else /* ... !CONFIG_HOTPLUG_CPU */
-- 
2.47.1
Re: [PATCH v7 4/4] x86/smp native_play_dead: Prefer cpuidle_play_dead() over mwait_play_dead()
Posted by Gautham R. Shenoy 3 weeks, 2 days ago
Hello Patryk,

On Fri, Nov 29, 2024 at 07:22:32PM +0100, Patryk Wlazlyn wrote:

A couple of minor nits in the commit log:

> The current algorithm* for looking up the mwait hint for the deepest
> cstate, in mwait_play_dead() code works by inspecting CPUID leaf 0x5 and
             ^^^^^^^^^^^^^^^^^
	     mwait_play_dead_cpuid_hint() after the rename in Patch 1.
	     

> calculates the mwait hint based on the number of reported substates.
> This approach depends on the hints associated with them to be continuous
> in the range [0, NUM_SUBSTATES-1]. This continuity is not documented and
> is not met on the recent Intel platforms.
> 
>  * The current algorithm is implemented in the for loop inspecting edx
>    in mwait_play_dead().
> 
> For example, Intel's Sierra Forest report two cstates with two substates
> each in cpuid leaf 0x5:
> 
>   Name*   target cstate    target subcstate (mwait hint)
>   ===========================================================
>   C1      0x00             0x00
>   C1E     0x00             0x01
> 
>   --      0x10             ----
> 
>   C6S     0x20             0x22
>   C6P     0x20             0x23
> 
>   --      0x30             ----
> 
>   /* No more (sub)states all the way down to the end. */
>   ===========================================================
> 
>    * Names of the cstates are not included in the CPUID leaf 0x5, they are
>      taken from the product specific documentation.
> 
> Notice that hints 0x20 and 0x21 are skipped entirely for the target
> cstate 0x20 (C6), being a cause of the problem for the current cpuid
> leaf 0x5 algorithm.
> 
> Remove the old implementation of play_dead MWAIT hint calculation based
> on the CPUID leaf 0x5 in mwait_play_dead() and delegate calling of the
                           ^^^^^^^^^^^^^^^^^
                           mwait_play_dead_cpuid_hint()
			   
> mwait_play_dead_with_hint() to the idle driver.
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  mwait_play_dead()


This should work since both acpi-idle and intel-idle drivers now
support the .enter_dead() callbacks for FFh idle states.


Otherwise, the patch looks good to me. Feel free to add the following
tag in the next version.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>

--
Thanks and Regards
gautham.

> 
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
> ---
>  arch/x86/kernel/smpboot.c | 56 +++++----------------------------------
>  1 file changed, 7 insertions(+), 49 deletions(-)
> 
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 8a3545c2cae9..82801137486d 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -1272,6 +1272,10 @@ void play_dead_common(void)
>  	local_irq_disable();
>  }
>  
> +/*
> + * We need to flush the caches before going to sleep, lest we have
> + * dirty data in our caches when we come back up.
> + */
>  void __noreturn mwait_play_dead(unsigned int eax_hint)
>  {
>  	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
> @@ -1317,52 +1321,6 @@ void __noreturn mwait_play_dead(unsigned int eax_hint)
>  	}
>  }
>  
> -/*
> - * We need to flush the caches before going to sleep, lest we have
> - * dirty data in our caches when we come back up.
> - */
> -static inline void mwait_play_dead_cpuid_hint(void)
> -{
> -	unsigned int eax, ebx, ecx, edx;
> -	unsigned int highest_cstate = 0;
> -	unsigned int highest_subcstate = 0;
> -	int i;
> -
> -	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
> -	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
> -		return;
> -	if (!this_cpu_has(X86_FEATURE_MWAIT))
> -		return;
> -	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
> -		return;
> -	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
> -		return;
> -
> -	eax = CPUID_MWAIT_LEAF;
> -	ecx = 0;
> -	native_cpuid(&eax, &ebx, &ecx, &edx);
> -
> -	/*
> -	 * eax will be 0 if EDX enumeration is not valid.
> -	 * Initialized below to cstate, sub_cstate value when EDX is valid.
> -	 */
> -	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
> -		eax = 0;
> -	} else {
> -		edx >>= MWAIT_SUBSTATE_SIZE;
> -		for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
> -			if (edx & MWAIT_SUBSTATE_MASK) {
> -				highest_cstate = i;
> -				highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
> -			}
> -		}
> -		eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
> -			(highest_subcstate - 1);
> -	}
> -
> -	mwait_play_dead(eax);
> -}
> -
>  /*
>   * Kick all "offline" CPUs out of mwait on kexec(). See comment in
>   * mwait_play_dead().
> @@ -1413,9 +1371,9 @@ void native_play_dead(void)
>  	play_dead_common();
>  	tboot_shutdown(TB_SHUTDOWN_WFS);
>  
> -	mwait_play_dead_cpuid_hint();
> -	if (cpuidle_play_dead())
> -		hlt_play_dead();
> +	/* Below returns only on error. */
> +	cpuidle_play_dead();
> +	hlt_play_dead();
>  }
>  
>  #else /* ... !CONFIG_HOTPLUG_CPU */
> -- 
> 2.47.1
>