[PATCH v9 15/16] entry: Inline syscall_exit_work()

Jinjie Ruan posted 16 patches 1 week, 5 days ago
[PATCH v9 15/16] entry: Inline syscall_exit_work()
Posted by Jinjie Ruan 1 week, 5 days ago
After switch arm64 to Generic Entry, a new hotspot syscall_exit_work()
appeared because syscall_exit_work() is no longer inlined. so inline
syscall_exit_work(), and it has 2.6% performance uplift on perf bench
basic syscall on kunpeng920 as below.

    | Metric     | W/O this patch | With this patch | Change    |
    | ---------- | -------------- | --------------- | --------- |
    | Total time | 2.171 [sec]    | 2.114 [sec]     |  ↓2.6%    |
    | usecs/op   | 0.217192       | 0.211453        |  ↓2.6%    |
    | ops/sec    | 4,604,225      | 4,729,178       |  ↑2.7%    |

Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
---
 include/linux/entry-common.h  | 63 ++++++++++++++++++++++++++++++++++-
 kernel/entry/syscall-common.c | 59 ++------------------------------
 2 files changed, 64 insertions(+), 58 deletions(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index cd6dacb2d8bf..2f84377fb016 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_ENTRYCOMMON_H
 #define __LINUX_ENTRYCOMMON_H
 
+#include <linux/audit.h>
 #include <linux/irq-entry-common.h>
 #include <linux/ptrace.h>
 #include <linux/seccomp.h>
@@ -128,6 +129,41 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
 	return ret;
 }
 
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
+ * instruction has been already reported in syscall_enter_from_user_mode().
+ */
+static __always_inline bool report_single_step(unsigned long work)
+{
+	if (work & SYSCALL_WORK_SYSCALL_EMU)
+		return false;
+
+	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
+/**
+ * arch_ptrace_report_syscall_exit - Architecture specific
+ *				     ptrace_report_syscall_exit.
+ *
+ * Invoked from syscall_exit_work() to wrap ptrace_report_syscall_exit().
+ *
+ * The main purpose is to support arch-specific ptrace_report_syscall_exit
+ * implementation.
+ */
+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
+							    int step);
+
+#ifndef arch_ptrace_report_syscall_exit
+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
+							    int step)
+{
+	ptrace_report_syscall_exit(regs, step);
+}
+#endif
+
+void __trace_sys_exit(struct pt_regs *regs, long ret);
+
 /**
  * syscall_exit_work - Handle work before returning to user mode
  * @regs:	Pointer to current pt_regs
@@ -135,7 +171,32 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
  *
  * Do one-time syscall specific work.
  */
-void syscall_exit_work(struct pt_regs *regs, unsigned long work);
+static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+	bool step;
+
+	/*
+	 * If the syscall was rolled back due to syscall user dispatching,
+	 * then the tracers below are not invoked for the same reason as
+	 * the entry side was not invoked in syscall_trace_enter(): The ABI
+	 * of these syscalls is unknown.
+	 */
+	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+		if (unlikely(current->syscall_dispatch.on_dispatch)) {
+			current->syscall_dispatch.on_dispatch = false;
+			return;
+		}
+	}
+
+	audit_syscall_exit(regs);
+
+	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+		__trace_sys_exit(regs, syscall_get_return_value(current, regs));
+
+	step = report_single_step(work);
+	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+		arch_ptrace_report_syscall_exit(regs, step);
+}
 
 /*
  * Syscall specific exit to user mode preparation. Runs with interrupts
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
index 27310e611567..1636f49c58d2 100644
--- a/kernel/entry/syscall-common.c
+++ b/kernel/entry/syscall-common.c
@@ -90,62 +90,7 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
 	instrumentation_end();
 }
 
-/*
- * If SYSCALL_EMU is set, then the only reason to report is when
- * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
- * instruction has been already reported in syscall_enter_from_user_mode().
- */
-static inline bool report_single_step(unsigned long work)
-{
-	if (work & SYSCALL_WORK_SYSCALL_EMU)
-		return false;
-
-	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
-}
-
-/**
- * arch_ptrace_report_syscall_exit - Architecture specific
- *				     ptrace_report_syscall_exit.
- *
- * Invoked from syscall_exit_work() to wrap ptrace_report_syscall_exit().
- *
- * The main purpose is to support arch-specific ptrace_report_syscall_exit
- * implementation.
- */
-static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
-							    int step);
-
-#ifndef arch_ptrace_report_syscall_exit
-static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
-							    int step)
+void __trace_sys_exit(struct pt_regs *regs, long ret)
 {
-	ptrace_report_syscall_exit(regs, step);
-}
-#endif
-
-void syscall_exit_work(struct pt_regs *regs, unsigned long work)
-{
-	bool step;
-
-	/*
-	 * If the syscall was rolled back due to syscall user dispatching,
-	 * then the tracers below are not invoked for the same reason as
-	 * the entry side was not invoked in syscall_trace_enter(): The ABI
-	 * of these syscalls is unknown.
-	 */
-	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
-		if (unlikely(current->syscall_dispatch.on_dispatch)) {
-			current->syscall_dispatch.on_dispatch = false;
-			return;
-		}
-	}
-
-	audit_syscall_exit(regs);
-
-	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
-		trace_sys_exit(regs, syscall_get_return_value(current, regs));
-
-	step = report_single_step(work);
-	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
-		arch_ptrace_report_syscall_exit(regs, step);
+	trace_sys_exit(regs, ret);
 }
-- 
2.34.1

Re: [PATCH v9 15/16] entry: Inline syscall_exit_work()
Posted by Kevin Brodsky 6 days, 23 hours ago
On 04/12/2025 09:21, Jinjie Ruan wrote:
> After switch arm64 to Generic Entry, a new hotspot syscall_exit_work()
> appeared because syscall_exit_work() is no longer inlined. so inline

Before this series the call to syscall_trace_exit() in el0_svc_common()
could not be inlined, so "no longer inlined" doesn't seem to be accurate.

> syscall_exit_work(), and it has 2.6% performance uplift on perf bench
> basic syscall on kunpeng920 as below.

That seems strange. syscall_exit_work() is only called if some flag in
SYSCALL_WORK_EXIT is set, which means that we're doing something special
like tracing. That shouldn't be the case when running a simple perf
bench syscall.

Also worth nothing that its counterpart (syscall_trace_enter())) is not
currently inlined, the asymmetry would have to be justified.

>     | Metric     | W/O this patch | With this patch | Change    |
>     | ---------- | -------------- | --------------- | --------- |
>     | Total time | 2.171 [sec]    | 2.114 [sec]     |  ↓2.6%    |
>     | usecs/op   | 0.217192       | 0.211453        |  ↓2.6%    |
>     | ops/sec    | 4,604,225      | 4,729,178       |  ↑2.7%    |
>
> Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
> ---
>  include/linux/entry-common.h  | 63 ++++++++++++++++++++++++++++++++++-
>  kernel/entry/syscall-common.c | 59 ++------------------------------

These changes are purely generic, surely all architectures using
GENERIC_ENTRY should get similar benefits (assuming LTO isn't used)?

>  2 files changed, 64 insertions(+), 58 deletions(-)
>
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index cd6dacb2d8bf..2f84377fb016 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -2,6 +2,7 @@
>  #ifndef __LINUX_ENTRYCOMMON_H
>  #define __LINUX_ENTRYCOMMON_H
>  
> +#include <linux/audit.h>
>  #include <linux/irq-entry-common.h>
>  #include <linux/ptrace.h>
>  #include <linux/seccomp.h>
> @@ -128,6 +129,41 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
>  	return ret;
>  }
>  
> +/*
> + * If SYSCALL_EMU is set, then the only reason to report is when
> + * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
> + * instruction has been already reported in syscall_enter_from_user_mode().
> + */
> +static __always_inline bool report_single_step(unsigned long work)
> +{
> +	if (work & SYSCALL_WORK_SYSCALL_EMU)
> +		return false;
> +
> +	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
> +}
> +
> +/**
> + * arch_ptrace_report_syscall_exit - Architecture specific
> + *				     ptrace_report_syscall_exit.
> + *
> + * Invoked from syscall_exit_work() to wrap ptrace_report_syscall_exit().
> + *
> + * The main purpose is to support arch-specific ptrace_report_syscall_exit
> + * implementation.
> + */
> +static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
> +							    int step);
> +
> +#ifndef arch_ptrace_report_syscall_exit
> +static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
> +							    int step)
> +{
> +	ptrace_report_syscall_exit(regs, step);
> +}
> +#endif

If we want syscall_exit_work() to be inline, then why would we define
this hook in syscall-common.c in patch 12? Might as well define both
hooks in entry-common.h right away and avoid some noise here.

- Kevin

> [...]
Re: [PATCH v9 15/16] entry: Inline syscall_exit_work()
Posted by Jinjie Ruan 4 days, 9 hours ago

On 2025/12/9 21:48, Kevin Brodsky wrote:
> On 04/12/2025 09:21, Jinjie Ruan wrote:
>> After switch arm64 to Generic Entry, a new hotspot syscall_exit_work()
>> appeared because syscall_exit_work() is no longer inlined. so inline
> 
> Before this series the call to syscall_trace_exit() in el0_svc_common()
> could not be inlined, so "no longer inlined" doesn't seem to be accurate.

I think the original "syscall_trace_exit()" is on an equal footing with
new introduced syscall_exit_to_user_mode_prepare() which is now inlined.

> 
>> syscall_exit_work(), and it has 2.6% performance uplift on perf bench
>> basic syscall on kunpeng920 as below.
> 
> That seems strange. syscall_exit_work() is only called if some flag in
> SYSCALL_WORK_EXIT is set, which means that we're doing something special
> like tracing. That shouldn't be the case when running a simple perf
> bench syscall.
> 
> Also worth nothing that its counterpart (syscall_trace_enter())) is not
> currently inlined, the asymmetry would have to be justified.

Will check the "syscall_trace_enter" inline performance impact.

> 
>>     | Metric     | W/O this patch | With this patch | Change    |
>>     | ---------- | -------------- | --------------- | --------- |
>>     | Total time | 2.171 [sec]    | 2.114 [sec]     |  ↓2.6%    |
>>     | usecs/op   | 0.217192       | 0.211453        |  ↓2.6%    |
>>     | ops/sec    | 4,604,225      | 4,729,178       |  ↑2.7%    |
>>
>> Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
>> ---
>>  include/linux/entry-common.h  | 63 ++++++++++++++++++++++++++++++++++-
>>  kernel/entry/syscall-common.c | 59 ++------------------------------
> 
> These changes are purely generic, surely all architectures using
> GENERIC_ENTRY should get similar benefits (assuming LTO isn't used)?

It would be great if someone could help test the performance differences.

> 
>>  2 files changed, 64 insertions(+), 58 deletions(-)
>>
>> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
>> index cd6dacb2d8bf..2f84377fb016 100644
>> --- a/include/linux/entry-common.h
>> +++ b/include/linux/entry-common.h
>> @@ -2,6 +2,7 @@
>>  #ifndef __LINUX_ENTRYCOMMON_H
>>  #define __LINUX_ENTRYCOMMON_H
>>  
>> +#include <linux/audit.h>
>>  #include <linux/irq-entry-common.h>
>>  #include <linux/ptrace.h>
>>  #include <linux/seccomp.h>
>> @@ -128,6 +129,41 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
>>  	return ret;
>>  }
>>  
>> +/*
>> + * If SYSCALL_EMU is set, then the only reason to report is when
>> + * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
>> + * instruction has been already reported in syscall_enter_from_user_mode().
>> + */
>> +static __always_inline bool report_single_step(unsigned long work)
>> +{
>> +	if (work & SYSCALL_WORK_SYSCALL_EMU)
>> +		return false;
>> +
>> +	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
>> +}
>> +
>> +/**
>> + * arch_ptrace_report_syscall_exit - Architecture specific
>> + *				     ptrace_report_syscall_exit.
>> + *
>> + * Invoked from syscall_exit_work() to wrap ptrace_report_syscall_exit().
>> + *
>> + * The main purpose is to support arch-specific ptrace_report_syscall_exit
>> + * implementation.
>> + */
>> +static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
>> +							    int step);
>> +
>> +#ifndef arch_ptrace_report_syscall_exit
>> +static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
>> +							    int step)
>> +{
>> +	ptrace_report_syscall_exit(regs, step);
>> +}
>> +#endif
> 
> If we want syscall_exit_work() to be inline, then why would we define
> this hook in syscall-common.c in patch 12? Might as well define both
> hooks in entry-common.h right away and avoid some noise here.

Make sense.

> 
> - Kevin
> 
>> [...]
> 
Re: [PATCH v9 15/16] entry: Inline syscall_exit_work()
Posted by Thomas Gleixner 5 days, 6 hours ago
On Tue, Dec 09 2025 at 14:48, Kevin Brodsky wrote:
> On 04/12/2025 09:21, Jinjie Ruan wrote:
>> After switch arm64 to Generic Entry, a new hotspot syscall_exit_work()
>> appeared because syscall_exit_work() is no longer inlined. so inline
>
> Before this series the call to syscall_trace_exit() in el0_svc_common()
> could not be inlined, so "no longer inlined" doesn't seem to be accurate.
>
>> syscall_exit_work(), and it has 2.6% performance uplift on perf bench
>> basic syscall on kunpeng920 as below.
>
> That seems strange. syscall_exit_work() is only called if some flag in
> SYSCALL_WORK_EXIT is set, which means that we're doing something special
> like tracing. That shouldn't be the case when running a simple perf
> bench syscall.
>
> Also worth nothing that its counterpart (syscall_trace_enter())) is not
> currently inlined, the asymmetry would have to be justified.
>
>>     | Metric     | W/O this patch | With this patch | Change    |
>>     | ---------- | -------------- | --------------- | --------- |
>>     | Total time | 2.171 [sec]    | 2.114 [sec]     |  ↓2.6%    |
>>     | usecs/op   | 0.217192       | 0.211453        |  ↓2.6%    |
>>     | ops/sec    | 4,604,225      | 4,729,178       |  ↑2.7%    |
>>
>> Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
>> ---
>>  include/linux/entry-common.h  | 63 ++++++++++++++++++++++++++++++++++-
>>  kernel/entry/syscall-common.c | 59 ++------------------------------
>
> These changes are purely generic, surely all architectures using
> GENERIC_ENTRY should get similar benefits (assuming LTO isn't used)?

Correct, but as you said this does not make sense as the syscall exit
work should be rare.

Jinjie, can you please figure out which TIF bit is causing this to be
invoked?

I have a suspicion that it is TIF_NOTIFY_RESUME. If that's the case
you're seing the RSEQ overhead, which should be completely gone with the
rewrite that got just merged into Linus tree.

Thanks,

        tglx
Re: [PATCH v9 15/16] entry: Inline syscall_exit_work()
Posted by Jinjie Ruan 4 days, 12 hours ago

On 2025/12/11 14:55, Thomas Gleixner wrote:
> On Tue, Dec 09 2025 at 14:48, Kevin Brodsky wrote:
>> On 04/12/2025 09:21, Jinjie Ruan wrote:
>>> After switch arm64 to Generic Entry, a new hotspot syscall_exit_work()
>>> appeared because syscall_exit_work() is no longer inlined. so inline
>>
>> Before this series the call to syscall_trace_exit() in el0_svc_common()
>> could not be inlined, so "no longer inlined" doesn't seem to be accurate.
>>
>>> syscall_exit_work(), and it has 2.6% performance uplift on perf bench
>>> basic syscall on kunpeng920 as below.
>>
>> That seems strange. syscall_exit_work() is only called if some flag in
>> SYSCALL_WORK_EXIT is set, which means that we're doing something special
>> like tracing. That shouldn't be the case when running a simple perf
>> bench syscall.
>>
>> Also worth nothing that its counterpart (syscall_trace_enter())) is not
>> currently inlined, the asymmetry would have to be justified.
>>
>>>     | Metric     | W/O this patch | With this patch | Change    |
>>>     | ---------- | -------------- | --------------- | --------- |
>>>     | Total time | 2.171 [sec]    | 2.114 [sec]     |  ↓2.6%    |
>>>     | usecs/op   | 0.217192       | 0.211453        |  ↓2.6%    |
>>>     | ops/sec    | 4,604,225      | 4,729,178       |  ↑2.7%    |
>>>
>>> Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
>>> ---
>>>  include/linux/entry-common.h  | 63 ++++++++++++++++++++++++++++++++++-
>>>  kernel/entry/syscall-common.c | 59 ++------------------------------
>>
>> These changes are purely generic, surely all architectures using
>> GENERIC_ENTRY should get similar benefits (assuming LTO isn't used)?
> 
> Correct, but as you said this does not make sense as the syscall exit
> work should be rare.
> 
> Jinjie, can you please figure out which TIF bit is causing this to be
> invoked?
> 
> I have a suspicion that it is TIF_NOTIFY_RESUME. If that's the case
> you're seing the RSEQ overhead, which should be completely gone with the
> rewrite that got just merged into Linus tree.

It maybe audit flag, the test is with audit on.

> 
> Thanks,
> 
>         tglx
> 
> 
Re: [PATCH v9 15/16] entry: Inline syscall_exit_work()
Posted by Kevin Brodsky 5 days, 3 hours ago
On 11/12/2025 07:55, Thomas Gleixner wrote:
> I have a suspicion that it is TIF_NOTIFY_RESUME. If that's the case
> you're seing the RSEQ overhead, which should be completely gone with the
> rewrite that got just merged into Linus tree.

I don't think this is related. This patch inlines syscall_exit_work(),
which is called if some flag in SYSCALL_WORK_EXIT is set. This includes
syscall-specific stuff like tracing and singlestep. TIF_NOTIFY_RESUME is
part of EXIT_TO_USER_MODE_WORK, handled in exit_to_user_mode_prepare(),
and isn't specific to syscall handling.

- Kevin
Re: [PATCH v9 15/16] entry: Inline syscall_exit_work()
Posted by Thomas Gleixner 4 days, 12 hours ago
On Thu, Dec 11 2025 at 10:52, Kevin Brodsky wrote:
> On 11/12/2025 07:55, Thomas Gleixner wrote:
>> I have a suspicion that it is TIF_NOTIFY_RESUME. If that's the case
>> you're seing the RSEQ overhead, which should be completely gone with the
>> rewrite that got just merged into Linus tree.
>
> I don't think this is related. This patch inlines syscall_exit_work(),
> which is called if some flag in SYSCALL_WORK_EXIT is set. This includes
> syscall-specific stuff like tracing and singlestep. TIF_NOTIFY_RESUME is
> part of EXIT_TO_USER_MODE_WORK, handled in exit_to_user_mode_prepare(),
> and isn't specific to syscall handling.

Indeed. Confused myself.