[patch V3 07/12] rseq: Implement syscall entry work for time slice extensions
Posted by Thomas Gleixner 3 months, 1 week ago
There is a newer version of this series
The kernel sets SYSCALL_WORK_RSEQ_SLICE when it grants a time slice
extension. This allows handling of the rseq_slice_yield() syscall, which
user space uses to relinquish the CPU after finishing the critical
section for which it requested an extension.

If the kernel state is still GRANTED, the kernel resets both the kernel
and the user space state with a set of sanity checks. If the kernel state
is already cleared, then the grant raced against the timer or some other
interrupt, and the work handler just clears the work bit.

Doing this in syscall entry work allows the kernel to catch misbehaving
user space, which issues a syscall from within the critical section. A
wrong syscall or inconsistent user space state results in SIGSEGV.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
---
V3: Use get/put_user()
---
 include/linux/entry-common.h  |    2 -
 include/linux/rseq.h          |    2 +
 include/linux/thread_info.h   |   16 ++++----
 kernel/entry/syscall-common.c |   11 ++++-
 kernel/rseq.c                 |   79 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 100 insertions(+), 10 deletions(-)

--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -36,8 +36,8 @@
 				 SYSCALL_WORK_SYSCALL_EMU |		\
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
 				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
+				 SYSCALL_WORK_SYSCALL_RSEQ_SLICE |	\
 				 ARCH_SYSCALL_WORK_ENTER)
-
 #define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
 				 SYSCALL_WORK_SYSCALL_TRACE |		\
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -165,8 +165,10 @@ static inline void rseq_syscall(struct p
 #endif /* !CONFIG_DEBUG_RSEQ */
 
 #ifdef CONFIG_RSEQ_SLICE_EXTENSION
+void rseq_syscall_enter_work(long syscall);
 int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
 #else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline void rseq_syscall_enter_work(long syscall) { }
 static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
 {
 	return -ENOTSUPP;
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -46,15 +46,17 @@ enum syscall_work_bit {
 	SYSCALL_WORK_BIT_SYSCALL_AUDIT,
 	SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
 	SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
+	SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE,
 };
 
-#define SYSCALL_WORK_SECCOMP		BIT(SYSCALL_WORK_BIT_SECCOMP)
-#define SYSCALL_WORK_SYSCALL_TRACEPOINT	BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
-#define SYSCALL_WORK_SYSCALL_TRACE	BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
-#define SYSCALL_WORK_SYSCALL_EMU	BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
-#define SYSCALL_WORK_SYSCALL_AUDIT	BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
-#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
-#define SYSCALL_WORK_SYSCALL_EXIT_TRAP	BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SECCOMP			BIT(SYSCALL_WORK_BIT_SECCOMP)
+#define SYSCALL_WORK_SYSCALL_TRACEPOINT		BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
+#define SYSCALL_WORK_SYSCALL_TRACE		BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
+#define SYSCALL_WORK_SYSCALL_EMU		BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
+#define SYSCALL_WORK_SYSCALL_AUDIT		BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
+#define SYSCALL_WORK_SYSCALL_USER_DISPATCH	BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
+#define SYSCALL_WORK_SYSCALL_EXIT_TRAP		BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SYSCALL_RSEQ_SLICE		BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE)
 #endif
 
 #include <asm/thread_info.h>
--- a/kernel/entry/syscall-common.c
+++ b/kernel/entry/syscall-common.c
@@ -17,8 +17,7 @@ static inline void syscall_enter_audit(s
 	}
 }
 
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
-				unsigned long work)
+long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work)
 {
 	long ret = 0;
 
@@ -32,6 +31,14 @@ long syscall_trace_enter(struct pt_regs
 			return -1L;
 	}
 
+	/*
+	 * User space got a time slice extension granted and relinquishes
+	 * the CPU. The work stops the slice timer to avoid an extra round
+	 * through hrtimer_interrupt().
+	 */
+	if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
+		rseq_syscall_enter_work(syscall);
+
 	/* Handle ptrace */
 	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
 		ret = ptrace_report_syscall_entry(regs);
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -501,6 +501,85 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
 #ifdef CONFIG_RSEQ_SLICE_EXTENSION
 DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
 
+static inline void rseq_slice_set_need_resched(struct task_struct *curr)
+{
+	/*
+	 * The interrupt guard is required to prevent inconsistent state in
+	 * this case:
+	 *
+	 * set_tsk_need_resched()
+	 * --> Interrupt
+	 *       wakeup()
+	 *        set_tsk_need_resched()
+	 *	  set_preempt_need_resched()
+	 *     schedule_on_return()
+	 *        clear_tsk_need_resched()
+	 *	  clear_preempt_need_resched()
+	 * set_preempt_need_resched()		<- Inconsistent state
+	 *
+	 * This is safe vs. a remote set of TIF_NEED_RESCHED because that
+	 * only sets the already set bit and does not create inconsistent
+	 * state.
+	 */
+	scoped_guard(irq)
+		set_need_resched_current();
+}
+
+static void rseq_slice_validate_ctrl(u32 expected)
+{
+	u32 __user *sctrl = &current->rseq.usrptr->slice_ctrl.all;
+	u32 uval;
+
+	if (get_user(uval, sctrl) || uval != expected)
+		force_sig(SIGSEGV);
+}
+
+/*
+ * Invoked from syscall entry if a time slice extension was granted and the
+ * kernel did not clear it before user space left the critical section.
+ */
+void rseq_syscall_enter_work(long syscall)
+{
+	struct task_struct *curr = current;
+	struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted };
+
+	clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+
+	if (static_branch_unlikely(&rseq_debug_enabled))
+		rseq_slice_validate_ctrl(ctrl.all);
+
+	/*
+	 * The kernel might have raced, revoked the grant and updated
+	 * userspace, but kept the SLICE work set.
+	 */
+	if (!ctrl.granted)
+		return;
+
+	rseq_stat_inc(rseq_stats.s_yielded);
+
+	/*
+	 * Required to make set_tsk_need_resched() correct on PREEMPT[RT]
+	 * kernels.
+	 */
+	scoped_guard(preempt) {
+		/*
+		 * Now that preemption is disabled, quickly check whether
+		 * the task was already rescheduled before arriving here.
+		 */
+		if (!curr->rseq.event.sched_switch)
+			rseq_slice_set_need_resched(curr);
+	}
+
+	curr->rseq.slice.state.granted = false;
+	/*
+	 * Clear the grant in user space and check whether this was the
+	 * correct syscall to yield. If the user access fails or the task
+	 * used an arbitrary syscall, terminate it.
+	 */
+	if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all) || syscall != __NR_rseq_slice_yield)
+		force_sig(SIGSEGV);
+}
+
 int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
 {
 	switch (arg2) {
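
For illustration, here is a minimal user-space sketch of the flow this
patch polices. This is a sketch under assumptions, not the series' ABI:
the RSEQ_SLICE_REQUEST/RSEQ_SLICE_GRANTED bit names and the slice_ctrl
pointer setup are invented for the example; only slice_ctrl.all and the
rseq_slice_yield() syscall (whose __NR_rseq_slice_yield number comes
from the series' uapi headers) appear in the patches themselves.

#include <sys/syscall.h>
#include <unistd.h>

#define RSEQ_SLICE_REQUEST	0x1U	/* hypothetical request bit */
#define RSEQ_SLICE_GRANTED	0x2U	/* hypothetical granted bit */

/* Points at the slice_ctrl.all word of the registered rseq area. */
extern volatile unsigned int *slice_ctrl;

static void run_critical_section(void (*work)(void))
{
	/* Ask the kernel to extend the slice across the critical section. */
	*slice_ctrl = RSEQ_SLICE_REQUEST;

	work();

	/*
	 * If the kernel granted an extension, rseq_slice_yield() is the
	 * only syscall that may be issued here; any other syscall is what
	 * the entry work above catches and punishes with SIGSEGV.
	 */
	if (*slice_ctrl & RSEQ_SLICE_GRANTED)
		syscall(__NR_rseq_slice_yield);
	else
		*slice_ctrl = 0;	/* drop the unused request */
}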
Re: [patch V3 07/12] rseq: Implement syscall entry work for time slice extensions
Posted by Prakash Sangappa 2 months, 3 weeks ago

> On Oct 29, 2025, at 6:22 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> The kernel sets SYSCALL_WORK_RSEQ_SLICE when it grants a time slice
> extension. This allows handling of the rseq_slice_yield() syscall, which
> user space uses to relinquish the CPU after finishing the critical
> section for which it requested an extension.
> 
> If the kernel state is still GRANTED, the kernel resets both the kernel
> and the user space state with a set of sanity checks. If the kernel state
> is already cleared, then the grant raced against the timer or some other
> interrupt, and the work handler just clears the work bit.
> 
> Doing this in syscall entry work allows the kernel to catch misbehaving
> user space, which issues a syscall from within the critical section. A
> wrong syscall or inconsistent user space state results in SIGSEGV.
> 
> 

[…]

> +/*
> + * Invoked from syscall entry if a time slice extension was granted and the
> + * kernel did not clear it before user space left the critical section.
> + */
> +void rseq_syscall_enter_work(long syscall)
> +{

[…]

> 
> +	curr->rseq.slice.state.granted = false;
> +	/*
> +	 * Clear the grant in user space and check whether this was the
> +	 * correct syscall to yield. If the user access fails or the task
> +	 * used an arbitrary syscall, terminate it.
> +	 */
> +	if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all) || syscall != __NR_rseq_slice_yield)
> +		force_sig(SIGSEGV);
> +}

I have been trying to get our Database team to implement changes to use the slice extension API.
They encounter an issue when a system call is made within the slice extension window: the
process dies with SIGSEGV.

Apparently it will be hard to enforce not issuing a system call in the slice extension window due to layering.
For the DB use case, it is fine to terminate the slice extension if a system call is made, but the process
getting killed will not work.

Thanks,
-Prakash
Re: [patch V3 07/12] rseq: Implement syscall entry work for time slice extensions
Posted by Thomas Gleixner 2 months, 2 weeks ago
On Wed, Nov 19 2025 at 00:20, Prakash Sangappa wrote:
>> On Oct 29, 2025, at 6:22 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>> +	if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all) || syscall != __NR_rseq_slice_yield)
>> +		force_sig(SIGSEGV);
>> +}
>
> I have been trying to get our Database team to implement changes to
> use the slice extension API.  They encounter the issue with a system
> call being made within the slice extension window and the process dies
> with SEGV.

Good. Works as designed.

> Apparently it will be hard to enforce not calling a system call in the
> slice extension window due to layering.

Why do I have a smell of rotten onions in my nose right now?

> For the DB use case, It is fine to terminate the slice extension if a
> system call is made, but the process getting killed will not work.

That's not a question of being fine or not.

The point is that on PREEMPT_NONE/VOLUNTARY an arbitrary syscall can
consume tons of CPU cycles until it either schedules out voluntarily or
reaches __exit_to_user_mode_loop(), which defeats the whole mechanism.
The timer does not help in that case because once the task is in the
kernel it won't be preempted on return from interrupt.

sys_rseq_slice_yield() is time bound, which is why it was implemented
that way.

I was absolutely right when I asked to tie this mechanism to
PREEMPT_LAZY|FULL in the first place. That would nicely avoid the whole
problem.

Something like the uncompiled and untested below should work. Though I
hate it with a passion.

Thanks,

        tglx
---
Subject: rseq/slice: Handle rotten onions gracefully
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 16:07:15 +0100

Add rant here.

Not-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rseq.c |   18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -643,13 +643,21 @@ void rseq_syscall_enter_work(long syscal
 	}
 
 	curr->rseq.slice.state.granted = false;
+	/* Clear the grant in user space. */
+	if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all))
+		force_sig(SIGSEGV);
+
 	/*
-	 * Clear the grant in user space and check whether this was the
-	 * correct syscall to yield. If the user access fails or the task
-	 * used an arbitrary syscall, terminate it.
+	 * Grudgingly support onion layer applications which cannot
+	 * guarantee that rseq_slice_yield() is used to yield the CPU for
+	 * terminating a grant. This is a NOP on PREEMPT_FULL/LAZY because
+	 * enabling preemption above already scheduled, but required for
+	 * PREEMPT_NONE/VOLUNTARY to prevent that the slice is further
+	 * expanded up to the point where the syscall code schedules
+	 * voluntarily or reaches exit_to_user_mode_loop().
 	 */
-	if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all) || syscall != __NR_rseq_slice_yield)
-		force_sig(SIGSEGV);
+	if (syscall != __NR_rseq_slice_yield)
+		cond_resched();
 }
 
 int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
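
Continuing the user-space sketch from above (same hypothetical bit
names; log_progress() is a made-up stand-in for an onion layer that may
issue a syscall internally), the relaxed rule changes the failure mode:
a stray syscall now revokes the grant and clears slice_ctrl instead of
killing the task, so the yield can simply be skipped afterwards.

extern void log_progress(void);	/* may issue write() internally */

static void layered_critical_section(void)
{
	*slice_ctrl = RSEQ_SLICE_REQUEST;

	log_progress();

	/*
	 * If the library entered the kernel while a grant was active, the
	 * entry work revoked the grant and cleared slice_ctrl, so the
	 * GRANTED bit is gone and no yield syscall is needed.
	 */
	if (*slice_ctrl & RSEQ_SLICE_GRANTED)
		syscall(__NR_rseq_slice_yield);
	else
		*slice_ctrl = 0;
}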
Re: [patch V3 07/12] rseq: Implement syscall entry work for time slice extensions
Posted by Prakash Sangappa 2 months, 2 weeks ago

> On Nov 19, 2025, at 7:25 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> On Wed, Nov 19 2025 at 00:20, Prakash Sangappa wrote:
>>> On Oct 29, 2025, at 6:22 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>>> +	if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all) || syscall != __NR_rseq_slice_yield)
>>> +		force_sig(SIGSEGV);
>>> +}
>> 
>> I have been trying to get our Database team to implement changes to
>> use the slice extension API.  They encounter the issue with a system
>> call being made within the slice extension window and the process dies
>> with SEGV.
> 
> Good. Works as designed.
> 
>> Apparently it will be hard to enforce not calling a system call in the
>> slice extension window due to layering.
> 
> Why do I have a smell of rotten onions in my nose right now?
> 
>> For the DB use case, It is fine to terminate the slice extension if a
>> system call is made, but the process getting killed will not work.
> 
> That's not a question of being fine or not.
> 
> The point is that on PREEMPT_NONE/VOLUNTARY an arbitrary syscall can
> consume tons of CPU cycles until it either schedules out voluntarily or
> reaches __exit_to_user_mode_loop(), which defeats the whole mechanism.
> The timer does not help in that case because once the task is in the
> kernel it won't be preempted on return from interrupt.
> 
> sys_rseq_slice_yield() is time bound, which is why it was implemented
> that way.
> 
> I was absolutely right when I asked to tie this mechanism to
> PREEMPT_LAZY|FULL in the first place. That would nicely avoid the whole
> problem.
> 
> Something like the uncompiled and untested below should work. Though I
> hate it with a passion.

That works. It addresses the DB issue.


> +	 * Grudgingly support onion layer applications which cannot
> +	 * guarantee that rseq_slice_yield() is used to yield the CPU for
> +	 * terminating a grant. This is a NOP on PREEMPT_FULL/LAZY because
> +	 * enabling preemption above already scheduled, but required for
> +	 * PREEMPT_NONE/VOLUNTARY to prevent that the slice is further
> +	 * expanded up to the point where the syscall code schedules
> +	 * voluntarily or reaches exit_to_user_mode_loop().
> 	 */
> -	if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all) || syscall != __NR_rseq_slice_yield)
> -		force_sig(SIGSEGV);
> +	if (syscall != __NR_rseq_slice_yield)
> +		cond_resched();
>  }

With this change, here are the 'swingbench' performance results I received from our Database team.
https://www.dominicgiles.com/swingbench/

Kernel based on rseq/slice v3 + above change.
System: 2 socket AMD.
Cached DB config - i.e. DB files cached on tmpfs.

Response from the Database performance engineer:
Overall the results are very positive and consistent with the earlier findings; we see a clear benefit from the optimization running the same tests as earlier.

• The sgrant figure in /sys/kernel/debug/rseq/stats increases with the DB side optimization enabled, while it stays flat when disabled.  I believe this indicates that both the kernel-side code & the DB side triggers are working as expected.

• Due to the contentious nature of the workload these tests produce highly erratic results, but the optimization is showing improved performance across 3x tests with/without use of time slice extension.

• Swingbench throughput with use of time slice optimization
	• Run 1: 50,008.10
	• Run 2: 59,160.60
	• Run 3: 67,342.70
• Swingbench throughput without use of time slice optimization
	• Run 1: 36,422.80
	• Run 2: 33,186.00
	• Run 3: 44,309.80
• The application performs 55% better on average with the optimization.

-Prakash




Re: [patch V3 07/12] rseq: Implement syscall entry work for time slice extensions
Posted by Thomas Gleixner 2 months, 2 weeks ago
On Thu, Nov 20 2025 at 07:37, Prakash Sangappa wrote:
>> On Nov 19, 2025, at 7:25 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>> Something like the uncompiled and untested below should work. Though I
>> hate it with a passion.
>
> That works. It addresses DB issue.
>
> With this change, here are the ’swingbench’ performance results I received from our Database team.
> https://www.dominicgiles.com/swingbench/
>
> Kernel based on rseq/slice v3 + above change.
> System: 2 socket AMD.
> Cached DB config - i.e DB files cached on tmpfs.
>
> Response from Database performance engineer:-
>
> Overall the results are very positive and consistent with the earlier
> findings, we see a clear benefit from the optimization running the
> same tests as earlier.
>
> • The sgrant figure in /sys/kernel/debug/rseq/stats increases with the
>   DB side optimization enabled, while it stays flat when disabled.  I
>   believe this indicates that both the kernel-side code & the DB side
>   triggers are working as expected.

Correct.

> • Due to the contentious nature of the workload these tests produce
>   highly erratic results, but the optimization is showing improved
>   performance across 3x tests with/without use of time slice extension.
>
> • Swingbench throughput with use of time slice optimization
> 	• Run 1: 50,008.10
> 	• Run 2: 59,160.60
> 	• Run 3: 67,342.70
> • Swingbench throughput without use of time slice optimization
> 	• Run 1: 36,422.80
> 	• Run 2: 33,186.00
> 	• Run 3: 44,309.80
> • The application performs 55% better on average with the optimization.

55% is insane.

Could you please ask your performance guys to provide numbers for the
below configurations to see how the different parts of this work are
affecting the overall result:

 1) Linux 6.17 (no rseq rework, no slice)

 2) Linux 6.17 + your initial attempt to enable slice extension

We already have the numbers for the full new stack above (with and
without slice), so that should give us the full picture.

Thanks,

        tglx
Re: [patch V3 07/12] rseq: Implement syscall entry work for time slice extensions
Posted by david laight 2 months, 2 weeks ago
On Thu, 20 Nov 2025 12:31:54 +0100
Thomas Gleixner <tglx@linutronix.de> wrote:

...
> > • Due to the contentious nature of the workload these tests produce
> >   highly erratic results, but the optimization is showing improved
> >   performance across 3x tests with/without use of time slice extension.
> >
> > • Swingbench throughput with use of time slice optimization
> > 	• Run 1: 50,008.10
> > 	• Run 2: 59,160.60
> > 	• Run 3: 67,342.70
> > • Swingbench throughput without use of time slice optimization
> > 	• Run 1: 36,422.80
> > 	• Run 2: 33,186.00
> > 	• Run 3: 44,309.80
> > • The application performs 55% better on average with the optimization.  
> 
> 55% is insane.
> 
> Could you please ask your performance guys to provide numbers for the
> below configurations to see how the different parts of this work are
> affecting the overall result:
> 
>  1) Linux 6.17 (no rseq rework, no slice)
> 
>  2) Linux 6.17 + your initial attempt to enable slice extension
> 
> We already have the numbers for the full new stack above (with and
> without slice), so that should give us the full picture.

It is also worth checking that you don't have a single (or limited)
thread test where the busy thread is being bounced between cpus.

While busy the cpu frequency is increased; when moved to an idle
cpu it will initially run at the low frequency and then speed up.

This effect doubled the execution time of a (mostly) single threaded
fpga compile from 10 minutes to 20 minutes - all caused by one of
the mitigations slowing down syscall entry/exit enough for a load
of basically idle processes that woke every 10ms to all be active
at once.

You've also got the underlying problem that you can't disable
interrupts in userspace.
If an ISR happens in your 'critical region' you just lose 'big time'.
Any threads that contend pretty much have to wait for the ISR
(and any non-threaded softints) to complete.
With heavy network traffic that can easily exceed 1ms.
Nothing you can to to the scheduler will change it.

	David
Re: [patch V3 07/12] rseq: Implement syscall entry work for time slice extensions
Posted by Prakash Sangappa 2 months, 2 weeks ago

> On Nov 20, 2025, at 3:31 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> On Thu, Nov 20 2025 at 07:37, Prakash Sangappa wrote:
>>> On Nov 19, 2025, at 7:25 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>>> Something like the uncompiled and untested below should work. Though I
>>> hate it with a passion.
>> 
>> That works. It addresses DB issue.
>> 
>> With this change, here are the ’swingbench’ performance results I received from our Database team.
>> https://www.dominicgiles.com/swingbench/
>> 
>> Kernel based on rseq/slice v3 + above change.
>> System: 2 socket AMD.
>> Cached DB config - i.e DB files cached on tmpfs.
>> 
>> Response from Database performance engineer:-
>> 
>> Overall the results are very positive and consistent with the earlier
>> findings, we see a clear benefit from the optimization running the
>> same tests as earlier.
>> 
>> • The sgrant figure in /sys/kernel/debug/rseq/stats increases with the
>>  DB side optimization enabled, while it stays flat when disabled.  I
>>  believe this indicates that both the kernel-side code & the DB side
>>  triggers are working as expected.
> 
> Correct.
> 
>> • Due to the contentious nature of the workload these tests produce
>>  highly erratic results, but the optimization is showing improved
>>  performance across 3x tests with/without use of time slice extension.
>> 
>> • Swingbench throughput with use of time slice optimization
>> • Run 1: 50,008.10
>> • Run 2: 59,160.60
>> • Run 3: 67,342.70
>> • Swingbench throughput without use of time slice optimization
>> • Run 1: 36,422.80
>> • Run 2: 33,186.00
>> • Run 3: 44,309.80
>> • The application performs 55% better on average with the optimization.
> 
> 55% is insane.
> 
> Could you please ask your performance guys to provide numbers for the
> below configurations to see how the different parts of this work are
> affecting the overall result:
> 
> 1) Linux 6.17 (no rseq rework, no slice)
> 
> 2) Linux 6.17 + your initial attempt to enable slice extension
> 
> We already have the numbers for the full new stack above (with and
> without slice), so that should give us the full picture.
> 

Ok, will ask him to run these. 
-Prakash.

> Thanks,
> 
>        tglx

Re: [patch V3 07/12] rseq: Implement syscall entry work for time slice extensions
Posted by Prakash Sangappa 2 months, 1 week ago

> On Nov 20, 2025, at 4:12 PM, Prakash Sangappa <prakash.sangappa@oracle.com> wrote:
> 
> 
> 
>> On Nov 20, 2025, at 3:31 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>> 
>> On Thu, Nov 20 2025 at 07:37, Prakash Sangappa wrote:
>>>> On Nov 19, 2025, at 7:25 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>>>> Something like the uncompiled and untested below should work. Though I
>>>> hate it with a passion.
>>> 
>>> That works. It addresses DB issue.
>>> 
>>> With this change, here are the ’swingbench’ performance results I received from our Database team.
>>> https://www.dominicgiles.com/swingbench/
>>> 
>>> Kernel based on rseq/slice v3 + above change.
>>> System: 2 socket AMD.
>>> Cached DB config - i.e DB files cached on tmpfs.
>>> 
>>> Response from Database performance engineer:-
>>> 
>>> Overall the results are very positive and consistent with the earlier
>>> findings, we see a clear benefit from the optimization running the
>>> same tests as earlier.
>>> 
>>> • The sgrant figure in /sys/kernel/debug/rseq/stats increases with the
>>> DB side optimization enabled, while it stays flat when disabled.  I
>>> believe this indicates that both the kernel-side code & the DB side
>>> triggers are working as expected.
>> 
>> Correct.
>> 
>>> • Due to the contentious nature of the workload these tests produce
>>> highly erratic results, but the optimization is showing improved
>>> performance across 3x tests with/without use of time slice extension.
>>> 
>>> • Swingbench throughput with use of time slice optimization
>>> • Run 1: 50,008.10
>>> • Run 2: 59,160.60
>>> • Run 3: 67,342.70
>>> • Swingbench throughput without use of time slice optimization
>>> • Run 1: 36,422.80
>>> • Run 2: 33,186.00
>>> • Run 3: 44,309.80
>>> • The application performs 55% better on average with the optimization.
>> 
>> 55% is insane.
>> 
>> Could you please ask your performance guys to provide numbers for the
>> below configurations to see how the different parts of this work are
>> affecting the overall result:
>> 
>> 1) Linux 6.17 (no rseq rework, no slice)
>> 
>> 2) Linux 6.17 + your initial attempt to enable slice extension
>> 
>> We already have the numbers for the full new stack above (with and
>> without slice), so that should give us the full picture.
>> 
> 

My previous (initial) implementation on the v6.17 kernel was showing higher numbers.
So, to keep things comparable to the rseq/slice kernel, I got the following numbers from the DB engineer
with the previous implementation built on a v6.18-rc4 kernel.

Swingbench throughput with use of slice extension (previous implementation)
	* Run 1: 50824.10
	* Run 2: 54058.30
	* Run 3: 30212.50
Swingbench throughput without use of the optimization
	* Run 1: 33036.50
	* Run 2: 35939.60
	* Run 3: 40461.70
Performs 23% better with the time slice optimization.

The workload shows a lot of variability. However, the overall trend seems consistent (i.e. we see
improvement with slice extension).
I think the above should give an idea of the potential gains the underlying rseq framework optimization adds.

Thanks,
-Prakash

> Ok, will ask him to run these. 
> -Prakash.
> 
>> Thanks,
>> 
>>       tglx
> 

Re: [patch V3 07/12] rseq: Implement syscall entry work for time slice extensions
Posted by Mathieu Desnoyers 3 months, 1 week ago
On 2025-10-29 09:22, Thomas Gleixner wrote:
> The kernel sets SYSCALL_WORK_RSEQ_SLICE when it grants a time slice
> extension. This allows handling of the rseq_slice_yield() syscall, which
> user space uses to relinquish the CPU after finishing the critical
> section for which it requested an extension.

Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com