0) Background:
We found that AmpereOne benefits from aggressive prefetches when
using 4K page size.
1) This patch:
1.1) adds new WORKAROUND_AMPERE_AC03_PREFETCH capability.
1.2) uses MIDR_AMPERE1 to filter the processor.
1.3) uses alternative_if to alternative the code
for AmpereOne.
1.4) adds software prefetches for the specific loop.
Also add a macro add_prefetch.
2) Test result:
In hugetlb or tmpfs, We can get big seqential read performance improvement
up to 1.3x ~ 1.4x.
Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
---
arch/arm64/Kconfig.platforms | 7 +++++++
arch/arm64/kernel/cpu_errata.c | 9 +++++++++
arch/arm64/lib/copy_template.S | 31 +++++++++++++++++++++++++++++++
arch/arm64/tools/cpucaps | 1 +
4 files changed, 48 insertions(+)
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index 6069120199bb..74ab8bea0019 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -8,6 +8,13 @@ config ARCH_ACTIONS
help
This enables support for the Actions Semiconductor S900 SoC family.
+config ARCH_AMPEREONE
+ bool "AmpereOne Platforms"
+ help
+ This enables support for the ARMv8 based AmpereOne chipsets.
+ AmpereOne is the next generation of Cloud Native Processors from
+ Ampere.
+
config ARCH_SUNXI
bool "Allwinner sunxi 64-bit SoC Family"
select ARCH_HAS_RESET_CONTROLLER
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 5706e74c5578..c0060d3086d0 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -744,6 +744,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
.capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38,
ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1),
},
+#endif
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+ {
+ .desc = "Optimization for AmpereOne chip",
+ .capability = ARM64_WORKAROUND_AMPERE_AC03_PREFETCH,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = is_affected_midr_range,
+ .midr_range = MIDR_ALL_VERSIONS(MIDR_AMPERE1)
+ },
#endif
{
}
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 79b32569260c..b707c3ec6820 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -41,6 +41,18 @@
b.ne .Ltail63
.endm
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+.macro add_prefetch
+ /*
+ * Add prefetch two cache lines by prfm to optimize the
+ * performance. The 2K offset is the best offset which
+ * we get from the tests.
+ */
+ prfm pldl2keep, [src, #2048]
+ prfm pldl2keep, [src, #2112]
+.endm
+#endif
+
/*
* Copy a buffer from src to dest (alignment handled by the hardware)
*
@@ -156,6 +168,13 @@ D_h .req x14
b .Lexitfunc
.Lcpy_over64:
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+alternative_if ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
+ cmp count, #PAGE_SIZE
+ b.ge .Lcpy_over_pagesize
+alternative_else_nop_endif
+#endif
+
subs count, count, #128
b.ge .Lcpy_body_large
/*
@@ -182,4 +201,16 @@ D_h .req x14
.p2align L1_CACHE_SHIFT
.Lcpy_body_large:
loop_for_copy_128_bytes
+
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+ b .Lexitfunc
+
+ .p2align L1_CACHE_SHIFT
+.Lcpy_over_pagesize:
+alternative_if ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
+ subs count, count, #128
+ loop_for_copy_128_bytes add_prefetch
+alternative_else_nop_endif
+#endif
+
.Lexitfunc:
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index dea3dc89234b..13e197abf249 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -100,3 +100,4 @@ WORKAROUND_NVIDIA_CARMEL_CNP
WORKAROUND_QCOM_FALKOR_E1003
WORKAROUND_REPEAT_TLBI
WORKAROUND_SPECULATIVE_AT
+WORKAROUND_AMPERE_AC03_PREFETCH
--
2.40.1
On 2023-11-22 9:28 am, Huang Shijie wrote:
> 0) Background:
> We found that AmpereOne benefits from aggressive prefetches when
> using 4K page size.
>
> 1) This patch:
> 1.1) adds new WORKAROUND_AMPERE_AC03_PREFETCH capability.
> 1.2) uses MIDR_AMPERE1 to filter the processor.
> 1.3) uses alternative_if to alternative the code
> for AmpereOne.
> 1.4) adds software prefetches for the specific loop.
> Also add a macro add_prefetch.
>
> 2) Test result:
> In hugetlb or tmpfs, We can get big seqential read performance improvement
> up to 1.3x ~ 1.4x.
Frankly the copy_template code is pretty terrible anyway, so the fact
that you're not touching anything *else* (memcpy(), copy_page(), etc.)
makes me wonder whether you'd benefit from just a better baseline to
begin with (unless the underlying concern really is something more
specific like the hardware prefetcher failing to recognise LDTR/STTR).
The last attempt to improve this derailed into questioning the usercopy
API semantics themselves, but for reference that would be my original
patches at [0] (more optimised, but some copy_to_user() fault fixups are
buggy), and/or Mark's follow-up at [1] (less aggressive but still better
than the current code, and doesn't touch copy_from_user()).
Thanks,
Robin.
[0]
https://lore.kernel.org/linux-arm-kernel/cover.1664363162.git.robin.murphy@arm.com/
[1]
https://lore.kernel.org/linux-arch/20230321122514.1743889-1-mark.rutland@arm.com/
> Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
> ---
> arch/arm64/Kconfig.platforms | 7 +++++++
> arch/arm64/kernel/cpu_errata.c | 9 +++++++++
> arch/arm64/lib/copy_template.S | 31 +++++++++++++++++++++++++++++++
> arch/arm64/tools/cpucaps | 1 +
> 4 files changed, 48 insertions(+)
>
> diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
> index 6069120199bb..74ab8bea0019 100644
> --- a/arch/arm64/Kconfig.platforms
> +++ b/arch/arm64/Kconfig.platforms
> @@ -8,6 +8,13 @@ config ARCH_ACTIONS
> help
> This enables support for the Actions Semiconductor S900 SoC family.
>
> +config ARCH_AMPEREONE
> + bool "AmpereOne Platforms"
> + help
> + This enables support for the ARMv8 based AmpereOne chipsets.
> + AmpereOne is the next generation of Cloud Native Processors from
> + Ampere.
> +
> config ARCH_SUNXI
> bool "Allwinner sunxi 64-bit SoC Family"
> select ARCH_HAS_RESET_CONTROLLER
> diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
> index 5706e74c5578..c0060d3086d0 100644
> --- a/arch/arm64/kernel/cpu_errata.c
> +++ b/arch/arm64/kernel/cpu_errata.c
> @@ -744,6 +744,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
> .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38,
> ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1),
> },
> +#endif
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> + {
> + .desc = "Optimization for AmpereOne chip",
> + .capability = ARM64_WORKAROUND_AMPERE_AC03_PREFETCH,
> + .type = ARM64_CPUCAP_SYSTEM_FEATURE,
> + .matches = is_affected_midr_range,
> + .midr_range = MIDR_ALL_VERSIONS(MIDR_AMPERE1)
> + },
> #endif
> {
> }
> diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
> index 79b32569260c..b707c3ec6820 100644
> --- a/arch/arm64/lib/copy_template.S
> +++ b/arch/arm64/lib/copy_template.S
> @@ -41,6 +41,18 @@
> b.ne .Ltail63
> .endm
>
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> +.macro add_prefetch
> + /*
> + * Add prefetch two cache lines by prfm to optimize the
> + * performance. The 2K offset is the best offset which
> + * we get from the tests.
> + */
> + prfm pldl2keep, [src, #2048]
> + prfm pldl2keep, [src, #2112]
> +.endm
> +#endif
> +
> /*
> * Copy a buffer from src to dest (alignment handled by the hardware)
> *
> @@ -156,6 +168,13 @@ D_h .req x14
> b .Lexitfunc
>
> .Lcpy_over64:
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> +alternative_if ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
> + cmp count, #PAGE_SIZE
> + b.ge .Lcpy_over_pagesize
> +alternative_else_nop_endif
> +#endif
> +
> subs count, count, #128
> b.ge .Lcpy_body_large
> /*
> @@ -182,4 +201,16 @@ D_h .req x14
> .p2align L1_CACHE_SHIFT
> .Lcpy_body_large:
> loop_for_copy_128_bytes
> +
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> + b .Lexitfunc
> +
> + .p2align L1_CACHE_SHIFT
> +.Lcpy_over_pagesize:
> +alternative_if ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
> + subs count, count, #128
> + loop_for_copy_128_bytes add_prefetch
> +alternative_else_nop_endif
> +#endif
> +
> .Lexitfunc:
> diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
> index dea3dc89234b..13e197abf249 100644
> --- a/arch/arm64/tools/cpucaps
> +++ b/arch/arm64/tools/cpucaps
> @@ -100,3 +100,4 @@ WORKAROUND_NVIDIA_CARMEL_CNP
> WORKAROUND_QCOM_FALKOR_E1003
> WORKAROUND_REPEAT_TLBI
> WORKAROUND_SPECULATIVE_AT
> +WORKAROUND_AMPERE_AC03_PREFETCH
© 2016 - 2025 Red Hat, Inc.