Based on Ben Niu's "Faster Arm64 __arch_copy_from_user and
__arch_copy_to_user" patch [1], this implementation further optimizes
and simplifies user space copies by:
1. Limiting the optimization scope to copies of >= 128 bytes, where PAN
   state matters. Copies shorter than 128 bytes use non-privileged
   instructions uniformly, which simplifies the code and reduces
   maintenance cost.
2. Adding "arm64.nopan" cmdline support using the standard idreg-override
framework, allowing runtime PAN disable without building separate
CONFIG_ARM64_PAN=y/n kernels as required by Ben Niu's version.
The implementation maintains separate paths for PAN-enabled (using
unprivileged ldtr/sttr) and PAN-disabled (using standard ldp/stp), with
runtime selection via ALTERNATIVE() at the large copy loop entry.
3. Retaining the critical-path optimization from the original patch:
   pointer updates are batched manually, so each 64-byte iteration needs
   only one pair of add instructions instead of one add per access (see
   the sketch after this list).
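To make items 2 and 3 concrete, here is a condensed sketch of the
large-copy entry (excerpted and simplified from the copy_template.S hunk
below; the dst side, the load/store interleaving, and the loop control
are elided):

.Lcpy_body_large:
	/*
	 * Patched by the alternatives framework at boot: the first
	 * branch is the default and runs when ARM64_HAS_PAN is not
	 * detected, e.g. after booting with "arm64.nopan".
	 */
	ALTERNATIVE("b .Llarge_pan_disabled", "b .Llarge_pan_enabled", ARM64_HAS_PAN)

.Llarge_pan_disabled:
	/*
	 * Fixed-offset accesses replace the old per-access
	 * post-increments, so each 64-byte block costs one add per
	 * pointer instead of four. The PAN-enabled path has the same
	 * shape but uses the unprivileged ldtr/sttr-based macros.
	 */
	ldp_priv	A_l, A_h, src, #0
	ldp_priv	B_l, B_h, src, #16
	ldp_priv	C_l, C_h, src, #32
	ldp_priv	D_l, D_h, src, #48
	add	src, src, #64

With "arm64.nopan" on the command line, the override zeroes the
sanitised ID_AA64MMFR1_EL1.PAN field, ARM64_HAS_PAN is never set, and
the ldp/stp path above is what runs on PAN-capable hardware.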
Performance improvements measured on Kunpeng 920 with PAN disabled:
The ku_copy microbenchmark [2] (a kernel module that measures
copy_to/from_user throughput across various sizes by copying 1GB of
data in each test):
copy_to_user throughput change (positive = improvement):
128B: +0.9% 256B: +10.3% 512B: +23.3% 1024B: +38.1%
2048B: +56.2% 4096B: +68.5% 8192B: +74.8% 16384B: +79.7%
32768B: +80.7% 65536B: +81.3% 131072B: +77.3% 262144B: +77.9%
copy_from_user throughput change:
128B: +2.0% 256B: +7.5% 512B: +20.3% 1024B: +28.4%
2048B: +38.1% 4096B: +39.6% 8192B: +41.5% 16384B: +42.3%
32768B: +42.2% 65536B: +44.8% 131072B: +70.3% 262144B: +71.0%
Real-world workloads:
- RocksDB read-write mixed workload:
Overall throughput improved by 2%.
copy_to_user hotspot reduced from 3.3% to 2.7% of total CPU cycles.
copy_from_user hotspot reduced from 2.25% to 0.85% of total CPU cycles.
- BRPC rdma_performance (server side, baidu_std protocol over TCP):
copy_to_user accounts for ~11.5% of total CPU cycles.
  After optimization, server CPU utilization dropped from 64% to 62%
  (2 percentage points, equivalent to a ~17% reduction in
  copy_to_user overhead).
[1] https://lore.kernel.org/all/20251018052237.1368504-2-benniu@meta.com/
[2] https://github.com/mcfi/benchmark/tree/main/ku_copy
Co-developed-by: Ben Niu <benniu@meta.com>
Signed-off-by: Ben Niu <benniu@meta.com>
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
Signed-off-by: Qi Xi <xiqi2@huawei.com>
---
Changes in v3:
- Limit the optimization scope to copies of >= 128 bytes.
- Use idreg-override for PAN runtime selection with "arm64.nopan" cmdline.
---
arch/arm64/include/asm/asm-uaccess.h | 22 ++----
arch/arm64/kernel/pi/idreg-override.c | 2 +
arch/arm64/lib/copy_from_user.S | 17 +++-
arch/arm64/lib/copy_template.S | 108 +++++++++++++++++++-------
arch/arm64/lib/copy_to_user.S | 17 +++-
5 files changed, 114 insertions(+), 52 deletions(-)
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 9148f5a31968..198a05d478fc 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -70,27 +70,21 @@ alternative_else_nop_endif
* This is complicated as there is no post-increment or pair versions of the
* unprivileged instructions, and USER() only works for single instructions.
*/
- .macro user_ldp l, reg1, reg2, addr, post_inc
-8888: ldtr \reg1, [\addr];
-8889: ldtr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
+ .macro user_ldst l, inst, reg, addr, post_inc
+8888: \inst \reg, [\addr];
+ add \addr, \addr, \post_inc;
_asm_extable_uaccess 8888b, \l;
- _asm_extable_uaccess 8889b, \l;
.endm
- .macro user_stp l, reg1, reg2, addr, post_inc
-8888: sttr \reg1, [\addr];
-8889: sttr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
+ .macro user_ldst_index l, inst, reg, addr, val
+8888: \inst \reg, [\addr, \val];
- _asm_extable_uaccess 8888b,\l;
- _asm_extable_uaccess 8889b,\l;
+ _asm_extable_uaccess 8888b, \l;
.endm
- .macro user_ldst l, inst, reg, addr, post_inc
-8888: \inst \reg, [\addr];
- add \addr, \addr, \post_inc;
+ .macro user_ldst_pair_index l, inst, reg1, reg2, addr, val
+8888: \inst \reg1, \reg2, [\addr, \val];
_asm_extable_uaccess 8888b, \l;
.endm
diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c
index bc57b290e5e7..ac26f1f3aad4 100644
--- a/arch/arm64/kernel/pi/idreg-override.c
+++ b/arch/arm64/kernel/pi/idreg-override.c
@@ -64,6 +64,7 @@ static const struct ftr_set_desc mmfr1 __prel64_initconst = {
.override = &id_aa64mmfr1_override,
.fields = {
FIELD("vh", ID_AA64MMFR1_EL1_VH_SHIFT, mmfr1_vh_filter),
+ FIELD("pan", ID_AA64MMFR1_EL1_PAN_SHIFT, NULL),
{}
},
};
@@ -249,6 +250,7 @@ static const struct {
{ "arm64.nolva", "id_aa64mmfr2.varange=0" },
{ "arm64.no32bit_el0", "id_aa64pfr0.el0=1" },
{ "arm64.nompam", "id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" },
+ { "arm64.nopan", "id_aa64mmfr1.pan=0" },
};
static int __init parse_hexdigit(const char *p, u64 *v)
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 400057d607ec..1f578c4d0ae6 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -44,12 +44,21 @@
str \reg, [\ptr], \val
.endm
- .macro ldp1 reg1, reg2, ptr, val
- user_ldp 9997f, \reg1, \reg2, \ptr, \val
+ .macro ldp_unpriv reg1, reg2, ptr, val
+ user_ldst_index 9997f, ldtr, \reg1, \ptr, \val
+ user_ldst_index 9997f, ldtr, \reg2, \ptr, \val + 8
.endm
- .macro stp1 reg1, reg2, ptr, val
- stp \reg1, \reg2, [\ptr], \val
+ .macro stp_unpriv reg1, reg2, ptr, val
+ stp \reg1, \reg2, [\ptr, \val]
+ .endm
+
+ .macro ldp_priv reg1, reg2, ptr, val
+ user_ldst_pair_index 9997f, ldp, \reg1, \reg2, \ptr, \val
+ .endm
+
+ .macro stp_priv reg1, reg2, ptr, val
+ stp \reg1, \reg2, [\ptr, \val]
.endm
.macro cpy1 dst, src, count
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 7f2f5a0e2fb9..5ef6dc9bf7d8 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -97,14 +97,20 @@ alternative_else_nop_endif
cmp tmp1w, #0x20
b.eq 1f
b.lt 2f
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
+ ldp_unpriv A_l, A_h, src, #0
+ stp_unpriv A_l, A_h, dst, #0
+ add src, src, #16
+ add dst, dst, #16
1:
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
+ ldp_unpriv A_l, A_h, src, #0
+ stp_unpriv A_l, A_h, dst, #0
+ add src, src, #16
+ add dst, dst, #16
2:
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
+ ldp_unpriv A_l, A_h, src, #0
+ stp_unpriv A_l, A_h, dst, #0
+ add src, src, #16
+ add dst, dst, #16
.Ltiny15:
/*
* Prefer to break one ldp/stp into several load/store to access
@@ -142,14 +148,16 @@ alternative_else_nop_endif
* Less than 128 bytes to copy, so handle 64 here and then jump
* to the tail.
*/
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- ldp1 D_l, D_h, src, #16
- stp1 D_l, D_h, dst, #16
+ ldp_unpriv A_l, A_h, src, #0
+ stp_unpriv A_l, A_h, dst, #0
+ ldp_unpriv B_l, B_h, src, #16
+ ldp_unpriv C_l, C_h, src, #32
+ stp_unpriv B_l, B_h, dst, #16
+ stp_unpriv C_l, C_h, dst, #32
+ ldp_unpriv D_l, D_h, src, #48
+ stp_unpriv D_l, D_h, dst, #48
+ add src, src, #64
+ add dst, dst, #64
tst count, #0x3f
b.ne .Ltail63
@@ -161,30 +169,70 @@ alternative_else_nop_endif
*/
.p2align L1_CACHE_SHIFT
.Lcpy_body_large:
+ /* Runtime PAN decision for large copies */
+ ALTERNATIVE("b .Llarge_pan_disabled", "b .Llarge_pan_enabled", ARM64_HAS_PAN)
+
+.Llarge_pan_enabled:
+ /* PAN enabled version - use unprivileged loads (ldp_unpriv) */
/* pre-get 64 bytes data. */
- ldp1 A_l, A_h, src, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- ldp1 D_l, D_h, src, #16
+ ldp_unpriv A_l, A_h, src, #0
+ ldp_unpriv B_l, B_h, src, #16
+ ldp_unpriv C_l, C_h, src, #32
+ ldp_unpriv D_l, D_h, src, #48
+ add src, src, #64
+1:
+ /*
+ * interlace the load of next 64 bytes data block with store of the last
+ * loaded 64 bytes data.
+ */
+ stp_unpriv A_l, A_h, dst, #0
+ ldp_unpriv A_l, A_h, src, #0
+ stp_unpriv B_l, B_h, dst, #16
+ ldp_unpriv B_l, B_h, src, #16
+ stp_unpriv C_l, C_h, dst, #32
+ ldp_unpriv C_l, C_h, src, #32
+ stp_unpriv D_l, D_h, dst, #48
+ ldp_unpriv D_l, D_h, src, #48
+ add dst, dst, #64
+ add src, src, #64
+ subs count, count, #64
+ b.ge 1b
+ b .Llarge_done
+
+.Llarge_pan_disabled:
+ /* PAN disabled version - use normal loads without post-increment */
+ /* pre-get 64 bytes data using normal loads */
+ ldp_priv A_l, A_h, src, #0
+ ldp_priv B_l, B_h, src, #16
+ ldp_priv C_l, C_h, src, #32
+ ldp_priv D_l, D_h, src, #48
+ add src, src, #64
1:
/*
* interlace the load of next 64 bytes data block with store of the last
* loaded 64 bytes data.
*/
- stp1 A_l, A_h, dst, #16
- ldp1 A_l, A_h, src, #16
- stp1 B_l, B_h, dst, #16
- ldp1 B_l, B_h, src, #16
- stp1 C_l, C_h, dst, #16
- ldp1 C_l, C_h, src, #16
- stp1 D_l, D_h, dst, #16
- ldp1 D_l, D_h, src, #16
+ stp_priv A_l, A_h, dst, #0
+ ldp_priv A_l, A_h, src, #0
+ stp_priv B_l, B_h, dst, #16
+ ldp_priv B_l, B_h, src, #16
+ stp_priv C_l, C_h, dst, #32
+ ldp_priv C_l, C_h, src, #32
+ stp_priv D_l, D_h, dst, #48
+ ldp_priv D_l, D_h, src, #48
+ add dst, dst, #64
+ add src, src, #64
subs count, count, #64
b.ge 1b
- stp1 A_l, A_h, dst, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- stp1 D_l, D_h, dst, #16
+
+.Llarge_done:
+ /* Post-loop: store the last block of data using stp_unpriv */
+ /* (without post-increment) */
+ stp_unpriv A_l, A_h, dst, #0
+ stp_unpriv B_l, B_h, dst, #16
+ stp_unpriv C_l, C_h, dst, #32
+ stp_unpriv D_l, D_h, dst, #48
+ add dst, dst, #64
tst count, #0x3f
b.ne .Ltail63
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 819f2e3fc7a9..9738ae96c823 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -43,12 +43,21 @@
user_ldst 9997f, sttr, \reg, \ptr, \val
.endm
- .macro ldp1 reg1, reg2, ptr, val
- ldp \reg1, \reg2, [\ptr], \val
+ .macro ldp_unpriv reg1, reg2, ptr, val
+ ldp \reg1, \reg2, [\ptr, \val]
.endm
- .macro stp1 reg1, reg2, ptr, val
- user_stp 9997f, \reg1, \reg2, \ptr, \val
+ .macro stp_unpriv reg1, reg2, ptr, val
+ user_ldst_index 9997f, sttr, \reg1, \ptr, \val
+ user_ldst_index 9997f, sttr, \reg2, \ptr, \val + 8
+ .endm
+
+ .macro ldp_priv reg1, reg2, ptr, val
+ ldp \reg1, \reg2, [\ptr, \val]
+ .endm
+
+ .macro stp_priv reg1, reg2, ptr, val
+ user_ldst_pair_index 9997f, stp, \reg1, \reg2, \ptr, \val
.endm
.macro cpy1 dst, src, count
--
2.33.0
Kindly ping.
Also adding Robin Murphy to CC.
On 16/03/2026 20:31, Qi Xi wrote:
> [snip]