Along with Zen2 (which doesn't expose ERMS), both families reportedly
suffer from sub-optimal aliasing detection when deciding whether REP MOVSB
can actually be carried out the accelerated way. Therefore we want to
avoid its use in the common case (memcpy(), copy_page_hot()).
Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Question is whether merely avoiding REP MOVSB (but not REP MOVSQ) is going
to be good enough.
--- a/xen/arch/x86/copy_page.S
+++ b/xen/arch/x86/copy_page.S
@@ -57,6 +57,6 @@ END(copy_page_cold)
.endm
FUNC(copy_page_hot)
- ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_ERMS
+ ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_XEN_REP_MOVSB
RET
END(copy_page_hot)
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -1386,6 +1386,10 @@ static void cf_check init_amd(struct cpu
check_syscfg_dram_mod_en();
+ if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS)
+ && c->family != 0x19 /* Zen3/4 */)
+ setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB);
+
amd_log_freq(c);
}
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -684,6 +684,9 @@ static void cf_check init_intel(struct c
*/
if (c == &boot_cpu_data && c->vfm == INTEL_SKYLAKE_X)
setup_clear_cpu_cap(X86_FEATURE_CLWB);
+
+ if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS))
+ setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB);
}
const struct cpu_dev __initconst_cf_clobber intel_cpu_dev = {
--- a/xen/arch/x86/include/asm/cpufeatures.h
+++ b/xen/arch/x86/include/asm/cpufeatures.h
@@ -7,7 +7,7 @@
#define FSCAPINTS FEATURESET_NR_ENTRIES
/* Synthetic words follow the featureset words. */
-#define X86_NR_SYNTH 1
+#define X86_NR_SYNTH 2
#define X86_SYNTH(x) (FSCAPINTS * 32 + (x))
/* Synthetic features */
@@ -43,6 +43,7 @@ XEN_CPUFEATURE(IBPB_ENTRY_PV, X86_SY
XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for HVM */
XEN_CPUFEATURE(USE_VMCALL, X86_SYNTH(30)) /* Use VMCALL instead of VMMCALL */
XEN_CPUFEATURE(PDX_COMPRESSION, X86_SYNTH(31)) /* PDX compression */
+XEN_CPUFEATURE(XEN_REP_MOVSB, X86_SYNTH(32)) /* REP MOVSB used for memcpy() and alike. */
/* Bug words follow the synthetic words. */
#define X86_NR_BUG 1
--- a/xen/arch/x86/memcpy.S
+++ b/xen/arch/x86/memcpy.S
@@ -10,7 +10,7 @@ FUNC(memcpy)
* precautions were taken).
*/
ALTERNATIVE "and $7, %edx; shr $3, %rcx", \
- STR(rep movsb; RET), X86_FEATURE_ERMS
+ STR(rep movsb; RET), X86_FEATURE_XEN_REP_MOVSB
rep movsq
or %edx, %ecx
jz 1f
Le 25/09/2025 à 12:48, Jan Beulich a écrit : > Along with Zen2 (which doesn't expose ERMS), both families reportedly > suffer from sub-optimal aliasing detection when deciding whether REP MOVSB > can actually be carried out the accelerated way. Therefore we want to > avoid its use in the common case (memset(), copy_page_hot()). s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC) > > Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> > Signed-off-by: Jan Beulich <jbeulich@suse.com> > --- > Question is whether merely avoiding REP MOVSB (but not REP MOVSQ) is going > to be good enough. > This probably wants to be checked with benchmarks of rep movsb vs rep movsq+b (current non-ERMS algorithm). If the issue also occurs with rep movsq, it may be preferable to keep rep movsb even considering this issue. > --- a/xen/arch/x86/copy_page.S > +++ b/xen/arch/x86/copy_page.S > @@ -57,6 +57,6 @@ END(copy_page_cold) > .endm > > FUNC(copy_page_hot) > - ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_ERMS > + ALTERNATIVE copy_page_movsq, copy_page_movsb, X86_FEATURE_XEN_REP_MOVSB > RET > END(copy_page_hot) > --- a/xen/arch/x86/cpu/amd.c > +++ b/xen/arch/x86/cpu/amd.c > @@ -1386,6 +1386,10 @@ static void cf_check init_amd(struct cpu > > check_syscfg_dram_mod_en(); > > + if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS) > + && c->family != 0x19 /* Zen3/4 */) > + setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB); > + May it be fixed through a (future ?) microcode update, especially since rep movs is microcoded on these archs ? 
> amd_log_freq(c); > } > > --- a/xen/arch/x86/cpu/intel.c > +++ b/xen/arch/x86/cpu/intel.c > @@ -684,6 +684,9 @@ static void cf_check init_intel(struct c > */ > if (c == &boot_cpu_data && c->vfm == INTEL_SKYLAKE_X) > setup_clear_cpu_cap(X86_FEATURE_CLWB); > + > + if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS)) > + setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB); > } > > const struct cpu_dev __initconst_cf_clobber intel_cpu_dev = { > --- a/xen/arch/x86/include/asm/cpufeatures.h > +++ b/xen/arch/x86/include/asm/cpufeatures.h > @@ -7,7 +7,7 @@ > #define FSCAPINTS FEATURESET_NR_ENTRIES > > /* Synthetic words follow the featureset words. */ > -#define X86_NR_SYNTH 1 > +#define X86_NR_SYNTH 2 > #define X86_SYNTH(x) (FSCAPINTS * 32 + (x)) > > /* Synthetic features */ > @@ -43,6 +43,7 @@ XEN_CPUFEATURE(IBPB_ENTRY_PV, X86_SY > XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for HVM */ > XEN_CPUFEATURE(USE_VMCALL, X86_SYNTH(30)) /* Use VMCALL instead of VMMCALL */ > XEN_CPUFEATURE(PDX_COMPRESSION, X86_SYNTH(31)) /* PDX compression */ > +XEN_CPUFEATURE(XEN_REP_MOVSB, X86_SYNTH(32)) /* REP MOVSB used for memcpy() and alike. */ > > /* Bug words follow the synthetic words. */ > #define X86_NR_BUG 1 > --- a/xen/arch/x86/memcpy.S > +++ b/xen/arch/x86/memcpy.S > @@ -10,7 +10,7 @@ FUNC(memcpy) > * precautions were taken). > */ > ALTERNATIVE "and $7, %edx; shr $3, %rcx", \ > - STR(rep movsb; RET), X86_FEATURE_ERMS > + STR(rep movsb; RET), X86_FEATURE_XEN_REP_MOVSB > rep movsq > or %edx, %ecx > jz 1f > > Teddy -- Teddy Astie | Vates XCP-ng Developer XCP-ng & Xen Orchestra - Vates solutions web: https://vates.tech
On 2025-09-25 08:18, Teddy Astie wrote: > Le 25/09/2025 à 12:48, Jan Beulich a écrit : >> Along with Zen2 (which doesn't expose ERMS), both families reportedly >> suffer from sub-optimal aliasing detection when deciding whether REP MOVSB >> can actually be carried out the accelerated way. Therefore we want to >> avoid its use in the common case (memset(), copy_page_hot()). > > s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC) > >> >> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> With Teddy's suggested change: Reviewed-by: Jason Andryuk <jason.andryuk@amd.com> Thanks, Jason
On 25.09.2025 14:18, Teddy Astie wrote: > Le 25/09/2025 à 12:48, Jan Beulich a écrit : >> Along with Zen2 (which doesn't expose ERMS), both families reportedly >> suffer from sub-optimal aliasing detection when deciding whether REP MOVSB >> can actually be carried out the accelerated way. Therefore we want to >> avoid its use in the common case (memset(), copy_page_hot()). > > s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC) Oops, yes. >> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> >> --- >> Question is whether merely avoiding REP MOVSB (but not REP MOVSQ) is going >> to be good enough. > > This probably wants to be checked with benchmarks of rep movsb vs rep > movsq+b (current non-ERMS algorithm). If the issue also occurs with rep > movsq, it may be preferable to keep rep movsb even considering this issue. Why? Then REP MOVSB is 8 times slower than REP MOVSQ. >> --- a/xen/arch/x86/cpu/amd.c >> +++ b/xen/arch/x86/cpu/amd.c >> @@ -1386,6 +1386,10 @@ static void cf_check init_amd(struct cpu >> >> check_syscfg_dram_mod_en(); >> >> + if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS) >> + && c->family != 0x19 /* Zen3/4 */) >> + setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB); >> + > > May it be fixed through a (future ?) microcode update, especially since > rep movs is microcoded on these archs ? I don't know, but I also don't expect that to happen. Jan
Le 25/09/2025 à 15:02, Jan Beulich a écrit : > On 25.09.2025 14:18, Teddy Astie wrote: >> Le 25/09/2025 à 12:48, Jan Beulich a écrit : >>> Along with Zen2 (which doesn't expose ERMS), both families reportedly >>> suffer from sub-optimal aliasing detection when deciding whether REP MOVSB >>> can actually be carried out the accelerated way. Therefore we want to >>> avoid its use in the common case (memset(), copy_page_hot()). >> >> s/memset/memcpy (memset probably uses rep stosb which is not affected IIUC) > > Oops, yes. > >>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> >>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >>> --- >>> Question is whether merely avoiding REP MOVSB (but not REP MOVSQ) is going >>> to be good enough. >> >> This probably wants to be checked with benchmarks of rep movsb vs rep >> movsq+b (current non-ERMS algorithm). If the issue also occurs with rep >> movsq, it may be preferable to keep rep movsb even considering this issue. > > Why? Then REP MOVSB is 8 times slower than REP MOVSQ. > It doesn't match my observations while quickly benching rep movsb vs rep movsq+b (fallback) with varying alignments/sizes on Zen3/4 (Ryzen and EPYC). It's very sensitive to size and alignment, but in many (but not all) cases, rep movsb is significantly faster than rep movsq+b. The worst cases (mentioned bug) are much slower in both cases, though rep movsq+b tends to perform better in these cases. So unfortunately it's not as simple as rep movsb being (almost) always slower, especially with the varied copy sizes and alignments that grant_copy does. That's why I would prefer having more data to get a better picture. 
>>> --- a/xen/arch/x86/cpu/amd.c >>> +++ b/xen/arch/x86/cpu/amd.c >>> @@ -1386,6 +1386,10 @@ static void cf_check init_amd(struct cpu >>> >>> check_syscfg_dram_mod_en(); >>> >>> + if (c == &boot_cpu_data && cpu_has(c, X86_FEATURE_ERMS) >>> + && c->family != 0x19 /* Zen3/4 */) >>> + setup_force_cpu_cap(X86_FEATURE_XEN_REP_MOVSB); >>> + >> >> May it be fixed through a (future ?) microcode update, especially since >> rep movs is microcoded on these archs ? > > I don't know, but I also don't expect that to happen. > > Jan > Teddy -- Teddy Astie | Vates XCP-ng Developer XCP-ng & Xen Orchestra - Vates solutions web: https://vates.tech
© 2016 - 2025 Red Hat, Inc.