Do not use memcpy() to extract syscall arguments from struct pt_regs
but rather just perform direct assignments.
With the Generic Entry patch[1] applied and audit enabled, the
perf bench basic syscall benchmark on kunpeng920 shows roughly a 1%
performance uplift. The change also aligns the implementation with
x86 and RISC-V.
| Metric | W/O this patch | With this patch | Change |
| ---------- | -------------- | --------------- | --------- |
| Total time | 2.241 [sec] | 2.211 [sec] | ↓1.36% |
| usecs/op | 0.224157 | 0.221146 | ↓1.36% |
| ops/sec | 4,461,157 | 4,501,409 | ↑0.9% |
Before:
<syscall_get_arguments.constprop.0>:
aa0103e2 mov x2, x1
91002003 add x3, x0, #0x8
f9408804 ldr x4, [x0, #272]
f8008444 str x4, [x2], #8
a9409404 ldp x4, x5, [x0, #8]
a9009424 stp x4, x5, [x1, #8]
a9418400 ldp x0, x1, [x0, #24]
a9010440 stp x0, x1, [x2, #16]
f9401060 ldr x0, [x3, #32]
f9001040 str x0, [x2, #32]
d65f03c0 ret
d503201f nop
After:
a9408e82 ldp x2, x3, [x20, #8]
2a1603e0 mov w0, w22
f9400e84 ldr x4, [x20, #24]
f9408a81 ldr x1, [x20, #272]
9401c4ba bl ffff800080215ca8 <__audit_syscall_entry>
[1]: https://lore.kernel.org/all/20251126071446.3234218-1-ruanjinjie@huawei.com/
Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
---
arch/arm64/include/asm/syscall.h | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h
index f3853047c28e..f3564ba97f7e 100644
--- a/arch/arm64/include/asm/syscall.h
+++ b/arch/arm64/include/asm/syscall.h
@@ -82,9 +82,11 @@ static inline void syscall_get_arguments(struct task_struct *task,
unsigned long *args)
{
args[0] = regs->orig_x0;
- args++;
-
- memcpy(args, &regs->regs[1], 5 * sizeof(args[0]));
+ args[1] = regs->regs[1];
+ args[2] = regs->regs[2];
+ args[3] = regs->regs[3];
+ args[4] = regs->regs[4];
+ args[5] = regs->regs[5];
}
static inline void syscall_set_arguments(struct task_struct *task,
--
2.34.1
On Thu, Nov 27, 2025 at 08:36:30PM +0800, Jinjie Ruan wrote:
> Do not use memcpy() to extract syscall arguments from struct pt_regs
> but rather just perform direct assignments.
>
> The performance benchmarks with Generic Entry patch[1] with audit on
> from perf bench basic syscall on kunpeng920 gives roughly a 1%
> performance uplift and also aligns the implementation with
> x86 and RISC-V.
>
> | Metric | W/O this patch | With this patch | Change |
> | ---------- | -------------- | --------------- | --------- |
> | Total time | 2.241 [sec] | 2.211 [sec] | ↓1.36% |
> | usecs/op | 0.224157 | 0.221146 | ↓1.36% |
> | ops/sec | 4,461,157 | 4,501,409 | ↑0.9% |
>
> Before:
> <syscall_get_arguments.constprop.0>:
> aa0103e2 mov x2, x1
> 91002003 add x3, x0, #0x8
> f9408804 ldr x4, [x0, #272]
> f8008444 str x4, [x2], #8
> a9409404 ldp x4, x5, [x0, #8]
> a9009424 stp x4, x5, [x1, #8]
> a9418400 ldp x0, x1, [x0, #24]
> a9010440 stp x0, x1, [x2, #16]
> f9401060 ldr x0, [x3, #32]
> f9001040 str x0, [x2, #32]
> d65f03c0 ret
> d503201f nop
>
> After:
> a9408e82 ldp x2, x3, [x20, #8]
> 2a1603e0 mov w0, w22
> f9400e84 ldr x4, [x20, #24]
> f9408a81 ldr x1, [x20, #272]
> 9401c4ba bl ffff800080215ca8 <__audit_syscall_entry>
It's probably worth noting that __audit_syscall_entry() only takes 4
syscall arguments, and hence the compiler has elided the copy of
regs->regs[4] and regs->regs[5], which it apparently couldn't manage
before.
> [1]: https://lore.kernel.org/all/20251126071446.3234218-1-ruanjinjie@huawei.com/
> Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
> ---
> arch/arm64/include/asm/syscall.h | 8 +++++---
> 1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h
> index f3853047c28e..f3564ba97f7e 100644
> --- a/arch/arm64/include/asm/syscall.h
> +++ b/arch/arm64/include/asm/syscall.h
> @@ -82,9 +82,11 @@ static inline void syscall_get_arguments(struct task_struct *task,
> unsigned long *args)
> {
> args[0] = regs->orig_x0;
> - args++;
> -
> - memcpy(args, &regs->regs[1], 5 * sizeof(args[0]));
> + args[1] = regs->regs[1];
> + args[2] = regs->regs[2];
> + args[3] = regs->regs[3];
> + args[4] = regs->regs[4];
> + args[5] = regs->regs[5];
> }
FWIW, I think this is clearer than the 'args++' and the memcpy(), so I'm
happy with this regardless of the performance concern.
However, as Dmitry says, we should keep this structurally the same as
syscall_set_arguments(), and so we should update that in the same way.
Mark.
On Mon, 1 Dec 2025 10:13:54 +0000
Mark Rutland <mark.rutland@arm.com> wrote:
> On Thu, Nov 27, 2025 at 08:36:30PM +0800, Jinjie Ruan wrote:
> > Do not use memcpy() to extract syscall arguments from struct pt_regs
> > but rather just perform direct assignments.
> >
> > The performance benchmarks with Generic Entry patch[1] with audit on
> > from perf bench basic syscall on kunpeng920 gives roughly a 1%
> > performance uplift and also aligns the implementation with
> > x86 and RISC-V.
> >
> > | Metric | W/O this patch | With this patch | Change |
> > | ---------- | -------------- | --------------- | --------- |
> > | Total time | 2.241 [sec] | 2.211 [sec] | ↓1.36% |
> > | usecs/op | 0.224157 | 0.221146 | ↓1.36% |
> > | ops/sec | 4,461,157 | 4,501,409 | ↑0.9% |
> >
> > Before:
> > <syscall_get_arguments.constprop.0>:
> > aa0103e2 mov x2, x1
> > 91002003 add x3, x0, #0x8
> > f9408804 ldr x4, [x0, #272]
> > f8008444 str x4, [x2], #8
> > a9409404 ldp x4, x5, [x0, #8]
> > a9009424 stp x4, x5, [x1, #8]
> > a9418400 ldp x0, x1, [x0, #24]
> > a9010440 stp x0, x1, [x2, #16]
> > f9401060 ldr x0, [x3, #32]
> > f9001040 str x0, [x2, #32]
> > d65f03c0 ret
> > d503201f nop
> >
> > After:
> > a9408e82 ldp x2, x3, [x20, #8]
> > 2a1603e0 mov w0, w22
> > f9400e84 ldr x4, [x20, #24]
> > f9408a81 ldr x1, [x20, #272]
> > 9401c4ba bl ffff800080215ca8 <__audit_syscall_entry>
>
> It's probably worth noting that __audit_syscall_entry() only takes 4
> syscall arguments, and hence the compiler has elided the copy of
> regs->regs[4] and regs->regs[5], which it apparently couldn't manage
> before.
Hasn't it actually inlined it and completely optimised away the regs[] array?
It looks (from the asm) as though syscall_get_arguments() is followed by:
fn(regs[0], regs[1], regs[2], regs[3])
David
>
> > [1]: https://lore.kernel.org/all/20251126071446.3234218-1-ruanjinjie@huawei.com/
> > Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
> > ---
> > arch/arm64/include/asm/syscall.h | 8 +++++---
> > 1 file changed, 5 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h
> > index f3853047c28e..f3564ba97f7e 100644
> > --- a/arch/arm64/include/asm/syscall.h
> > +++ b/arch/arm64/include/asm/syscall.h
> > @@ -82,9 +82,11 @@ static inline void syscall_get_arguments(struct task_struct *task,
> > unsigned long *args)
> > {
> > args[0] = regs->orig_x0;
> > - args++;
> > -
> > - memcpy(args, &regs->regs[1], 5 * sizeof(args[0]));
> > + args[1] = regs->regs[1];
> > + args[2] = regs->regs[2];
> > + args[3] = regs->regs[3];
> > + args[4] = regs->regs[4];
> > + args[5] = regs->regs[5];
> > }
>
> FWIW, I think this is clearer than the 'args++' and the memcpy(), so I'm
> happy with this regardless of the performance concern.
>
> However, as Dmitry says, we should keep this structurally the same as
> syscall_set_arguments(), and so we should update that in the same way.
>
> Mark.
>
On Mon, Dec 01, 2025 at 10:26:33AM +0000, david laight wrote:
> On Mon, 1 Dec 2025 10:13:54 +0000
> Mark Rutland <mark.rutland@arm.com> wrote:
> > On Thu, Nov 27, 2025 at 08:36:30PM +0800, Jinjie Ruan wrote:
> > > Before:
> > > <syscall_get_arguments.constprop.0>:
> > > aa0103e2 mov x2, x1
> > > 91002003 add x3, x0, #0x8
> > > f9408804 ldr x4, [x0, #272]
> > > f8008444 str x4, [x2], #8
> > > a9409404 ldp x4, x5, [x0, #8]
> > > a9009424 stp x4, x5, [x1, #8]
> > > a9418400 ldp x0, x1, [x0, #24]
> > > a9010440 stp x0, x1, [x2, #16]
> > > f9401060 ldr x0, [x3, #32]
> > > f9001040 str x0, [x2, #32]
> > > d65f03c0 ret
> > > d503201f nop
> > >
> > > After:
> > > a9408e82 ldp x2, x3, [x20, #8]
> > > 2a1603e0 mov w0, w22
> > > f9400e84 ldr x4, [x20, #24]
> > > f9408a81 ldr x1, [x20, #272]
> > > 9401c4ba bl ffff800080215ca8 <__audit_syscall_entry>
> >
> > It's probably worth noting that __audit_syscall_entry() only takes 4
> > syscall arguments, and hence the compiler has elided the copy of
> > regs->regs[4] and regs->regs[5], which it apparently couldn't manage
> > before.
>
> Hasn't it actually inlined it and completely optimised away the regs[] array?
> It looks (from the asm) as though syscall_get_arguments() is followed by:
> fn(regs[0], regs[1], regs[2], regs[3])
Yes; I was assuming that people could infer that. I was pointing out that
the elision of copies/loads of regs->regs[4] and regs->regs[5] in
particular was not a bug.
Mark.
On Thu, Nov 27, 2025 at 08:36:30PM +0800, Jinjie Ruan wrote:
> Do not use memcpy() to extract syscall arguments from struct pt_regs
> but rather just perform direct assignments.
>
> The performance benchmarks with Generic Entry patch[1] with audit on
> from perf bench basic syscall on kunpeng920 gives roughly a 1%
> performance uplift and also aligns the implementation with
> x86 and RISC-V.
>
> | Metric | W/O this patch | With this patch | Change |
> | ---------- | -------------- | --------------- | --------- |
> | Total time | 2.241 [sec] | 2.211 [sec] | ↓1.36% |
> | usecs/op | 0.224157 | 0.221146 | ↓1.36% |
> | ops/sec | 4,461,157 | 4,501,409 | ↑0.9% |
>
> Before:
> <syscall_get_arguments.constprop.0>:
> aa0103e2 mov x2, x1
> 91002003 add x3, x0, #0x8
> f9408804 ldr x4, [x0, #272]
> f8008444 str x4, [x2], #8
> a9409404 ldp x4, x5, [x0, #8]
> a9009424 stp x4, x5, [x1, #8]
> a9418400 ldp x0, x1, [x0, #24]
> a9010440 stp x0, x1, [x2, #16]
> f9401060 ldr x0, [x3, #32]
> f9001040 str x0, [x2, #32]
> d65f03c0 ret
> d503201f nop
>
> After:
> a9408e82 ldp x2, x3, [x20, #8]
> 2a1603e0 mov w0, w22
> f9400e84 ldr x4, [x20, #24]
> f9408a81 ldr x1, [x20, #272]
> 9401c4ba bl ffff800080215ca8 <__audit_syscall_entry>
>
> [1]: https://lore.kernel.org/all/20251126071446.3234218-1-ruanjinjie@huawei.com/
> Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
> ---
> arch/arm64/include/asm/syscall.h | 8 +++++---
> 1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h
> index f3853047c28e..f3564ba97f7e 100644
> --- a/arch/arm64/include/asm/syscall.h
> +++ b/arch/arm64/include/asm/syscall.h
> @@ -82,9 +82,11 @@ static inline void syscall_get_arguments(struct task_struct *task,
> unsigned long *args)
> {
> args[0] = regs->orig_x0;
> - args++;
> -
> - memcpy(args, &regs->regs[1], 5 * sizeof(args[0]));
> + args[1] = regs->regs[1];
> + args[2] = regs->regs[2];
> + args[3] = regs->regs[3];
> + args[4] = regs->regs[4];
> + args[5] = regs->regs[5];
> }
>
> static inline void syscall_set_arguments(struct task_struct *task,
Please keep syscall_get_arguments() and syscall_set_arguments() in sync:
if you replace memcpy() with direct assignments in one of these functions,
please mirror the change in the other.
--
ldv
© 2016 - 2025 Red Hat, Inc.