Along with Zen2 (which doesn't expose ERMS), both families reportedly
suffer from sub-optimal aliasing detection when deciding whether REP MOVSB
can actually be carried out the accelerated way. Therefore we want to
avoid its use in the common case (memset(), copy_page_hot()).
Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Question is whether merely avoiding REP MOVSB (but not REP MOVSQ) is going
to be good enough.
--- a/xen/arch/x86/copy_page.S
+++ b/xen/arch/x86/copy_page.S
@@ -57,6 +57,6 @@ END(copy_page_cold)
.endm
FUNC(copy_page_hot)
- ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_ERMS
+ ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_XEN_REP_MOVSB
RET
END(copy_page_hot)
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -1386,6 +1386,10 @@ static void cf_check init_amd(struct cpu
check_syscfg_dram_mod_en();
+ if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS)
+ && c->family != 0x19 /* Zen3/4 */)
+ setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB);
+
amd_log_freq(c);
}
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -684,6 +684,9 @@ static void cf_check init_intel(struct c
*/
if (c == &boot_cpu_data && c->vfm == INTEL_SKYLAKE_X)
setup_clear_cpu_cap(X86_FEATURE_CLWB);
+
+ if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS))
+ setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB);
}
const struct cpu_dev __initconst_cf_clobber intel_cpu_dev = {
--- a/xen/arch/x86/include/asm/cpufeatures.h
+++ b/xen/arch/x86/include/asm/cpufeatures.h
@@ -7,7 +7,7 @@
#define FSCAPINTS FEATURESET_NR_ENTRIES
/* Synthetic words follow the featureset words. */
-#define X86_NR_SYNTH 1
+#define X86_NR_SYNTH 2
#define X86_SYNTH(x) (FSCAPINTS * 32 + (x))
/* Synthetic features */
@@ -43,6 +43,7 @@ XEN_CPUFEATURE(IBPB_ENTRY_PV, X86_SY
XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for HVM */
XEN_CPUFEATURE(USE_VMCALL, X86_SYNTH(30)) /* Use VMCALL instead of VMMCALL */
XEN_CPUFEATURE(PDX_COMPRESSION, X86_SYNTH(31)) /* PDX compression */
+XEN_CPUFEATURE(XEN_REP_MOVSB, X86_SYNTH(32)) /* REP MOVSB used for memcpy() and alike. */
/* Bug words follow the synthetic words. */
#define X86_NR_BUG 1
--- a/xen/arch/x86/memcpy.S
+++ b/xen/arch/x86/memcpy.S
@@ -10,7 +10,7 @@ FUNC(memcpy)
* precautions were taken).
*/
ALTERNATIVE "and $7, %edx; shr $3, %rcx", \
- STR(rep movsb; RET), X86_FEATURE_ERMS
+ STR(rep movsb; RET), X86_FEATURE_XEN_REP_MOVSB
rep movsq
or %edx, %ecx
jz 1f
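[Editorial note: for readers unfamiliar with the assembly above, the non-ERMS memcpy() path splits the length into quadwords plus a byte tail, which is what "and $7, %edx; shr $3, %rcx" prepares for REP MOVSQ. A purely illustrative C sketch of that split (our naming, not Xen code):]

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

/*
 * Illustrative sketch only (names are ours, not Xen's): the split that
 * the non-ERMS path performs -- copy len/8 quadwords, then len%8 bytes.
 */
static void *memcpy_movsq_style(void *dst, const void *src, size_t len)
{
    size_t qwords = len >> 3;            /* shr $3, %rcx */
    size_t bytes = len & 7;              /* and $7, %edx */
    unsigned long *d = dst;
    const unsigned long *s = src;

    while (qwords--)                     /* stands in for REP MOVSQ */
        *d++ = *s++;

    unsigned char *db = (unsigned char *)d;
    const unsigned char *sb = (const unsigned char *)s;

    while (bytes--)                      /* byte tail */
        *db++ = *sb++;

    return dst;
}
```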
On 25/09/2025 11:46 am, Jan Beulich wrote:
> Along with Zen2 (which doesn't expose ERMS), both families reportedly
> suffer from sub-optimal aliasing detection when deciding whether REP MOVSB
> can actually be carried out the accelerated way. Therefore we want to
> avoid its use in the common case (memset(), copy_page_hot()).
>
> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Question is whether merely avoiding REP MOVSB (but not REP MOVSQ) is going
> to be good enough.

In the problem case, MOVSQ is 8 times less bad than MOVSB, but they're
both slower than alternative algorithms.

>
> --- a/xen/arch/x86/copy_page.S
> +++ b/xen/arch/x86/copy_page.S
> @@ -57,6 +57,6 @@ END(copy_page_cold)
> .endm
>
> FUNC(copy_page_hot)
> - ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_ERMS
> + ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_XEN_REP_MOVSB
> RET
> END(copy_page_hot)

Hmm.

Overall I think this patch is an improvement.

But, for any copy_page variants, we know both pointers are 4k aligned,
so will not tickle the problem case.

This does mess with the naming of the synthetic feature.

> --- a/xen/arch/x86/cpu/amd.c
> +++ b/xen/arch/x86/cpu/amd.c
> @@ -1386,6 +1386,10 @@ static void cf_check init_amd(struct cpu
>
> check_syscfg_dram_mod_en();
>
> + if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS)
> + && c->family != 0x19 /* Zen3/4 */)

Even if this is Linux style, && on the previous line please.

~Andrew
On 08.10.2025 18:33, Andrew Cooper wrote:
> On 25/09/2025 11:46 am, Jan Beulich wrote:
>> --- a/xen/arch/x86/copy_page.S
>> +++ b/xen/arch/x86/copy_page.S
>> @@ -57,6 +57,6 @@ END(copy_page_cold)
>> .endm
>>
>> FUNC(copy_page_hot)
>> - ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_ERMS
>> + ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_XEN_REP_MOVSB
>> RET
>> END(copy_page_hot)
>
> Hmm.
>
> Overall I think this patch is an improvement.
>
> But, for any copy_page variants, we know both pointers are 4k aligned,
> so will not tickle the problem case.

Then I fear I still haven't understood the bad "may overlap" condition.
I thought that with the low 12 bits all identical in a page-copy, this
case would _specifically_ trigger the bad behavior.

> This does mess with the naming of the synthetic feature.

Short of better naming suggestions, I would keep it as is.

>> --- a/xen/arch/x86/cpu/amd.c
>> +++ b/xen/arch/x86/cpu/amd.c
>> @@ -1386,6 +1386,10 @@ static void cf_check init_amd(struct cpu
>>
>> check_syscfg_dram_mod_en();
>>
>> + if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS)
>> + && c->family != 0x19 /* Zen3/4 */)
>
> Even if this is Linux style, && on the previous line please.

No, precisely because it is Linux style. If and when we change the file
to Xen style (which probably we should), such operators would move.

Jan
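[Editorial note: the condition being debated above — low 12 bits of source and destination identical, i.e. the two pointers congruent modulo 4 KiB, which is always true for a page-to-page copy — can be written as a one-line check. Helper name is ours, purely illustrative:]

```c
#include <assert.h>
#include <stdint.h>

/*
 * Illustrative only: true when src and dst share their low 12 address
 * bits (congruent modulo 4 KiB) -- the case Jan suspects trips the
 * hardware's aliasing heuristic, and which every page-copy satisfies.
 */
static int low12_alias(const void *dst, const void *src)
{
    return (((uintptr_t)dst ^ (uintptr_t)src) & 0xfff) == 0;
}
```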
Le 25/09/2025 à 12:48, Jan Beulich a écrit :
> Along with Zen2 (which doesn't expose ERMS), both families reportedly
> suffer from sub-optimal aliasing detection when deciding whether REP MOVSB
> can actually be carried out the accelerated way. Therefore we want to
> avoid its use in the common case (memset(), copy_page_hot()).
s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC)
>
> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Question is whether merely avoiding REP MOVSB (but not REP MOVSQ) is going
> to be good enough.
>
This probably wants to be checked with benchmarks of rep movsb vs rep
movsq+b (current non-ERMS algorithm). If the issue also occurs with rep
movsq, it may be preferable to keep rep movsb even considering this issue.
> --- a/xen/arch/x86/copy_page.S
> +++ b/xen/arch/x86/copy_page.S
> @@ -57,6 +57,6 @@ END(copy_page_cold)
> .endm
>
> FUNC(copy_page_hot)
> - ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_ERMS
> + ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_XEN_REP_MOVSB
> RET
> END(copy_page_hot)
> --- a/xen/arch/x86/cpu/amd.c
> +++ b/xen/arch/x86/cpu/amd.c
> @@ -1386,6 +1386,10 @@ static void cf_check init_amd(struct cpu
>
> check_syscfg_dram_mod_en();
>
> + if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS)
> + && c->family != 0x19 /* Zen3/4 */)
> + setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB);
> +
May it be fixed through a (future ?) microcode update, especially since
rep movs is microcoded on these archs ?
> amd_log_freq(c);
> }
>
> --- a/xen/arch/x86/cpu/intel.c
> +++ b/xen/arch/x86/cpu/intel.c
> @@ -684,6 +684,9 @@ static void cf_check init_intel(struct c
> */
> if (c == &boot_cpu_data && c->vfm == INTEL_SKYLAKE_X)
> setup_clear_cpu_cap(X86_FEATURE_CLWB);
> +
> + if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS))
> + setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB);
> }
>
> const struct cpu_dev __initconst_cf_clobber intel_cpu_dev = {
> --- a/xen/arch/x86/include/asm/cpufeatures.h
> +++ b/xen/arch/x86/include/asm/cpufeatures.h
> @@ -7,7 +7,7 @@
> #define FSCAPINTS FEATURESET_NR_ENTRIES
>
> /* Synthetic words follow the featureset words. */
> -#define X86_NR_SYNTH 1
> +#define X86_NR_SYNTH 2
> #define X86_SYNTH(x) (FSCAPINTS * 32 + (x))
>
> /* Synthetic features */
> @@ -43,6 +43,7 @@ XEN_CPUFEATURE(IBPB_ENTRY_PV, X86_SY
> XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for HVM */
> XEN_CPUFEATURE(USE_VMCALL, X86_SYNTH(30)) /* Use VMCALL instead of VMMCALL */
> XEN_CPUFEATURE(PDX_COMPRESSION, X86_SYNTH(31)) /* PDX compression */
> +XEN_CPUFEATURE(XEN_REP_MOVSB, X86_SYNTH(32)) /* REP MOVSB used for memcpy() and alike. */
>
> /* Bug words follow the synthetic words. */
> #define X86_NR_BUG 1
> --- a/xen/arch/x86/memcpy.S
> +++ b/xen/arch/x86/memcpy.S
> @@ -10,7 +10,7 @@ FUNC(memcpy)
> * precautions were taken).
> */
> ALTERNATIVE "and $7, %edx; shr $3, %rcx", \
> - STR(rep movsb; RET), X86_FEATURE_ERMS
> + STR(rep movsb; RET), X86_FEATURE_XEN_REP_MOVSB
> rep movsq
> or %edx, %ecx
> jz 1f
>
>
Teddy
--
Teddy Astie | Vates XCP-ng Developer
XCP-ng & Xen Orchestra - Vates solutions
web: https://vates.tech
On 2025-09-25 08:18, Teddy Astie wrote:
> Le 25/09/2025 à 12:48, Jan Beulich a écrit :
>> Along with Zen2 (which doesn't expose ERMS), both families reportedly
>> suffer from sub-optimal aliasing detection when deciding whether REP MOVSB
>> can actually be carried out the accelerated way. Therefore we want to
>> avoid its use in the common case (memset(), copy_page_hot()).
>
> s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC)
>
>>
>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>

With Teddy's suggested change:

Reviewed-by: Jason Andryuk <jason.andryuk@amd.com>

Thanks,
Jason
On Mon, Sep 29, 2025 at 07:35:53PM -0400, Jason Andryuk wrote:
> On 2025-09-25 08:18, Teddy Astie wrote:
> > Le 25/09/2025 à 12:48, Jan Beulich a écrit :
> > > Along with Zen2 (which doesn't expose ERMS), both families reportedly
> > > suffer from sub-optimal aliasing detection when deciding whether REP MOVSB
> > > can actually be carried out the accelerated way. Therefore we want to
> > > avoid its use in the common case (memset(), copy_page_hot()).
> >
> > s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC)
> >
> > >
> > > Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
> > > Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> With Teddy's suggested change:
>
> Reviewed-by: Jason Andryuk <jason.andryuk@amd.com>

Acked-by: Roger Pau Monné <roger.pau@citrix.com>

It would be nice to have some actual figures whether this makes any
difference though.

Teddy, I think Vates had been doing some testing in this regard, do you
think you could measure whether the patch makes any noticeable difference
in PV network traffic for example? (as that's a heavy user of grant copy).

Thanks, Roger.
On 08.10.2025 13:20, Roger Pau Monné wrote:
> On Mon, Sep 29, 2025 at 07:35:53PM -0400, Jason Andryuk wrote:
>> On 2025-09-25 08:18, Teddy Astie wrote:
>>> Le 25/09/2025 à 12:48, Jan Beulich a écrit :
>>>> Along with Zen2 (which doesn't expose ERMS), both families reportedly
>>>> suffer from sub-optimal aliasing detection when deciding whether REP MOVSB
>>>> can actually be carried out the accelerated way. Therefore we want to
>>>> avoid its use in the common case (memset(), copy_page_hot()).
>>>
>>> s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC)
>>>
>>>>
>>>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>> With Teddy's suggested change:
>>
>> Reviewed-by: Jason Andryuk <jason.andryuk@amd.com>
>
> Acked-by: Roger Pau Monné <roger.pau@citrix.com>

May I ask for a release-ack here, seeing that it alters behavior that went in
close before the freeze?

Jan
On 10/8/25 6:06 PM, Jan Beulich wrote:
> On 08.10.2025 13:20, Roger Pau Monné wrote:
>> On Mon, Sep 29, 2025 at 07:35:53PM -0400, Jason Andryuk wrote:
>>> On 2025-09-25 08:18, Teddy Astie wrote:
>>>> Le 25/09/2025 à 12:48, Jan Beulich a écrit :
>>>>> Along with Zen2 (which doesn't expose ERMS), both families reportedly
>>>>> suffer from sub-optimal aliasing detection when deciding whether REP MOVSB
>>>>> can actually be carried out the accelerated way. Therefore we want to
>>>>> avoid its use in the common case (memset(), copy_page_hot()).
>>>> s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC)
>>>>
>>>>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> With Teddy's suggested change:
>>>
>>> Reviewed-by: Jason Andryuk <jason.andryuk@amd.com>
>> Acked-by: Roger Pau Monné <roger.pau@citrix.com>
> May I ask for a release-ack here, seeing that it alters behavior that went in
> close before the freeze?

Release-Acked-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>

~ Oleksii
On 25.09.2025 14:18, Teddy Astie wrote:
> Le 25/09/2025 à 12:48, Jan Beulich a écrit :
>> Along with Zen2 (which doesn't expose ERMS), both families reportedly
>> suffer from sub-optimal aliasing detection when deciding whether REP MOVSB
>> can actually be carried out the accelerated way. Therefore we want to
>> avoid its use in the common case (memset(), copy_page_hot()).
>
> s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC)

Oops, yes.

>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> Question is whether merely avoiding REP MOVSB (but not REP MOVSQ) is going
>> to be good enough.
>
> This probably wants to be checked with benchmarks of rep movsb vs rep
> movsq+b (current non-ERMS algorithm). If the issue also occurs with rep
> movsq, it may be preferable to keep rep movsb even considering this issue.

Why? Then REP MOVSB is 8 times slower than REP MOVSQ.

>> --- a/xen/arch/x86/cpu/amd.c
>> +++ b/xen/arch/x86/cpu/amd.c
>> @@ -1386,6 +1386,10 @@ static void cf_check init_amd(struct cpu
>>
>> check_syscfg_dram_mod_en();
>>
>> + if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS)
>> + && c->family != 0x19 /* Zen3/4 */)
>> + setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB);
>> +
>
> May it be fixed through a (future ?) microcode update, especially since
> rep movs is microcoded on these archs ?

I don't know, but I also don't expect that to happen.

Jan
Le 25/09/2025 à 15:02, Jan Beulich a écrit :
> On 25.09.2025 14:18, Teddy Astie wrote:
>> Le 25/09/2025 à 12:48, Jan Beulich a écrit :
>>> Along with Zen2 (which doesn't expose ERMS), both families reportedly
>>> suffer from sub-optimal aliasing detection when deciding whether REP MOVSB
>>> can actually be carried out the accelerated way. Therefore we want to
>>> avoid its use in the common case (memset(), copy_page_hot()).
>>
>> s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC)
>
> Oops, yes.
>
>>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> ---
>>> Question is whether merely avoiding REP MOVSB (but not REP MOVSQ) is going
>>> to be good enough.
>>
>> This probably wants to be checked with benchmarks of rep movsb vs rep
>> movsq+b (current non-ERMS algorithm). If the issue also occurs with rep
>> movsq, it may be preferable to keep rep movsb even considering this issue.
>
> Why? Then REP MOVSB is 8 times slower than REP MOVSQ.

It doesn't match my observations while quickly benching rep movsb vs rep
movsq+b (fallback) with varying alignments/sizes on Zen3/4 (Ryzen and EPYC).

It's very sensitive to size and alignment, but in many (though not all)
cases, rep movsb is significantly faster than rep movsq+b. The worst
cases (the mentioned bug) are much slower in both variants, though rep
movsq+b tends to perform better there.

So unfortunately it's not as simple as rep movsb being (almost) always
slower, especially with the varied copy sizes and alignments that
grant_copy produces. That's why I would prefer having more data, to get
a better picture.

>>> --- a/xen/arch/x86/cpu/amd.c
>>> +++ b/xen/arch/x86/cpu/amd.c
>>> @@ -1386,6 +1386,10 @@ static void cf_check init_amd(struct cpu
>>>
>>> check_syscfg_dram_mod_en();
>>>
>>> + if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS)
>>> + && c->family != 0x19 /* Zen3/4 */)
>>> + setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB);
>>> +
>>
>> May it be fixed through a (future ?) microcode update, especially since
>> rep movs is microcoded on these archs ?
>
> I don't know, but I also don't expect that to happen.

Teddy

--
Teddy Astie | Vates XCP-ng Developer
XCP-ng & Xen Orchestra - Vates solutions
web: https://vates.tech
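[Editorial note: the comparison Teddy describes can be sketched as below, assuming an x86-64 compiler with GNU inline asm. Helper names are ours and the timing harness is omitted; this only shows the two copy variants being compared — plain REP MOVSB vs the REP MOVSQ-plus-byte-tail fallback:]

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Illustrative sketch (x86-64, GNU inline asm; names are ours, not Xen's). */

/* ERMS-style copy: one REP MOVSB over the whole length. */
static void copy_movsb(void *dst, const void *src, size_t len)
{
    __asm__ volatile("rep movsb"
                     : "+D"(dst), "+S"(src), "+c"(len)
                     :
                     : "memory");
}

/* Fallback-style copy: REP MOVSQ for len/8 quadwords, byte tail after. */
static void copy_movsq_b(void *dst, const void *src, size_t len)
{
    size_t q = len >> 3, b = len & 7;

    __asm__ volatile("rep movsq"
                     : "+D"(dst), "+S"(src), "+c"(q)
                     :
                     : "memory");
    /* dst/src were advanced by the first asm via the "+D"/"+S" operands. */
    __asm__ volatile("rep movsb"
                     : "+D"(dst), "+S"(src), "+c"(b)
                     :
                     : "memory");
}
```

A real benchmark would wrap each call in an RDTSC (or clock_gettime) pair and sweep sizes and the low 12 bits of both pointers, since the pathological case depends on the pointers' in-page offsets.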
On 30.09.2025 15:03, Teddy Astie wrote:
> Le 25/09/2025 à 15:02, Jan Beulich a écrit :
>> On 25.09.2025 14:18, Teddy Astie wrote:
>>> Le 25/09/2025 à 12:48, Jan Beulich a écrit :
>>>> Along with Zen2 (which doesn't expose ERMS), both families reportedly
>>>> suffer from sub-optimal aliasing detection when deciding whether REP MOVSB
>>>> can actually be carried out the accelerated way. Therefore we want to
>>>> avoid its use in the common case (memset(), copy_page_hot()).
>>>
>>> s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC)
>>
>> Oops, yes.
>>
>>>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> ---
>>>> Question is whether merely avoiding REP MOVSB (but not REP MOVSQ) is going
>>>> to be good enough.
>>>
>>> This probably wants to be checked with benchmarks of rep movsb vs rep
>>> movsq+b (current non-ERMS algorithm). If the issue also occurs with rep
>>> movsq, it may be preferable to keep rep movsb even considering this issue.
>>
>> Why? Then REP MOVSB is 8 times slower than REP MOVSQ.
>
> It doesn't match my observations while quickly benching rep movsb vs rep
> movsq+b (fallback) with varying alignments/sizes on Zen3/4 (Ryzen and EPYC).
>
> It's very sensitive to size and alignment, but in many (though not all)
> cases, rep movsb is significantly faster than rep movsq+b. The worst
> cases (the mentioned bug) are much slower in both variants, though rep
> movsq+b tends to perform better there.

Which is what the patch here is trying to address.

> So unfortunately it's not as simple as rep movsb being (almost) always
> slower, especially with the varied copy sizes and alignments that
> grant_copy produces. That's why I would prefer having more data, to get
> a better picture.

Well, what I would have preferred is some actual written down description
of the aliasing issue. I'm unaware of such; the patch is solely based on
what Andrew has been telling me verbally (piecemeal). I've tried to
reflect this in how the description is written.

What you suggest would, aiui, entail more complicated decision logic in
the memcpy() implementation, which (at least for now) we'd like to avoid.

Jan