Add support for a thread to request extending its execution time slice on
the CPU. The extra CPU time granted helps the thread finish executing its
critical section and drop any locks it holds without being preempted. The
thread requests the extension by setting a bit in the restartable
sequences (rseq) structure registered with the kernel.

When the kernel sees the bit set, it grants a 50us extension on the CPU.
With the help of a timer, the kernel force-preempts the thread if it is
still running on the CPU when the 50us timer expires. The thread should
yield the CPU by making a system call after completing the critical
section.
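As an illustration, usage from a userspace thread could look roughly like
the following sketch (the helper names and the per-thread 'rseq_area'
pointer are assumptions made for the example; only the
RSEQ_CS_FLAG_DELAY_RESCHED flag is defined by this patch):

	#include <sched.h>
	#include <linux/rseq.h>

	/* Assumed: this thread's rseq area, registered via sys_rseq(). */
	extern volatile struct rseq *rseq_area;

	static void cs_enter(void)
	{
		/* Ask the kernel to defer preemption while locks are held. */
		rseq_area->flags |= RSEQ_CS_FLAG_DELAY_RESCHED;
	}

	static void cs_exit(void)
	{
		if (rseq_area->flags & RSEQ_CS_FLAG_DELAY_RESCHED) {
			/* No extension was granted; drop the request. */
			rseq_area->flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
		} else {
			/*
			 * The kernel cleared the bit: extra time was granted,
			 * so yield once the locks are dropped.
			 */
			sched_yield();
		}
	}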
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
---
v2:
- Add check in syscall_exit_to_user_mode_prepare() and reschedule if
thread has 'rseq_sched_delay' set.
v3:
- Rename rseq_sched_delay -> sched_time_delay and move near other bits
in struct task_struct.
- Use IS_ENABLED() check to access 'sched_time_delay' instead of #ifdef
- Modify comment describing the RSEQ_CS_FLAG_DELAY_RESCHED flag.
- Remove rseq_delay_resched_tick() call from hrtick_clear().
---
include/linux/entry-common.h | 11 +++++--
include/linux/sched.h | 16 +++++++++++
include/uapi/linux/rseq.h | 7 +++++
kernel/entry/common.c | 19 ++++++++----
kernel/rseq.c | 56 ++++++++++++++++++++++++++++++++++++
kernel/sched/core.c | 14 +++++++++
kernel/sched/syscalls.c | 5 ++++
7 files changed, 120 insertions(+), 8 deletions(-)
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index fc61d0205c97..cec343f95210 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -303,7 +303,8 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
* exit_to_user_mode_loop - do any pending work before leaving to user space
*/
unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work);
+ unsigned long ti_work,
+ bool irq);
/**
* exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
@@ -315,7 +316,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* EXIT_TO_USER_MODE_WORK are set
* 4) check that interrupts are still disabled
*/
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs,
+ bool irq)
{
unsigned long ti_work;
@@ -326,7 +328,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
ti_work = read_thread_flags();
if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
- ti_work = exit_to_user_mode_loop(regs, ti_work);
+ ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
+
+ if (irq)
+ rseq_delay_resched_fini();
arch_exit_to_user_mode_prepare(regs, ti_work);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c08fd199be4e..14bf0508bfca 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -339,6 +339,7 @@ extern int __must_check io_schedule_prepare(void);
extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);
+extern void hrtick_local_start(u64 delay);
/* wrapper function to trace from this header file */
DECLARE_TRACEPOINT(sched_set_state_tp);
@@ -1044,6 +1045,7 @@ struct task_struct {
/* delay due to memory thrashing */
unsigned in_thrashing:1;
#endif
+ unsigned sched_time_delay:1;
#ifdef CONFIG_PREEMPT_RT
struct netdev_xmit net_xmit;
#endif
@@ -2249,6 +2251,20 @@ static inline bool owner_on_cpu(struct task_struct *owner)
unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */
+#ifdef CONFIG_RSEQ
+
+extern bool rseq_delay_resched(void);
+extern void rseq_delay_resched_fini(void);
+extern void rseq_delay_resched_tick(void);
+
+#else
+
+static inline bool rseq_delay_resched(void) { return false; }
+static inline void rseq_delay_resched_fini(void) { }
+static inline void rseq_delay_resched_tick(void) { }
+
+#endif
+
#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index c233aae5eac9..900cb75f6a88 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -26,6 +26,7 @@ enum rseq_cs_flags_bit {
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
+ RSEQ_CS_FLAG_DELAY_RESCHED_BIT = 3,
};
enum rseq_cs_flags {
@@ -35,6 +36,8 @@ enum rseq_cs_flags {
(1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+ RSEQ_CS_FLAG_DELAY_RESCHED =
+ (1U << RSEQ_CS_FLAG_DELAY_RESCHED_BIT),
};
/*
@@ -128,6 +131,10 @@ struct rseq {
* - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
* Inhibit instruction sequence block restart on migration for
* this thread.
+ * - RSEQ_CS_FLAG_DELAY_RESCHED
+ * Request by a user task to try delaying preemption. With the
+ * help of a timer, up to 50us of extra CPU time is granted to this
+ * thread before it is rescheduled.
*/
__u32 flags;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 20154572ede9..b26adccb32df 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -88,7 +88,8 @@ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
* @ti_work: TIF work flags as read by the caller
*/
__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+ unsigned long ti_work,
+ bool irq)
{
/*
* Before returning to user space ensure that all pending work
@@ -98,8 +99,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_enable_exit_to_user(ti_work);
- if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
- schedule();
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
+ if (irq && rseq_delay_resched())
+ clear_tsk_need_resched(current);
+ else
+ schedule();
+ }
if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);
@@ -184,6 +189,10 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
+ /* reschedule if sched delay was granted */
+ if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay)
+ set_tsk_need_resched(current);
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
local_irq_enable();
@@ -204,7 +213,7 @@ static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *reg
{
syscall_exit_to_user_mode_prepare(regs);
local_irq_disable_exit_to_user();
- exit_to_user_mode_prepare(regs);
+ exit_to_user_mode_prepare(regs, false);
}
void syscall_exit_to_user_mode_work(struct pt_regs *regs)
@@ -228,7 +237,7 @@ noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
- exit_to_user_mode_prepare(regs);
+ exit_to_user_mode_prepare(regs, true);
instrumentation_end();
exit_to_user_mode();
}
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b7a1ec327e81..0ecd16e01712 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -448,6 +448,62 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
force_sigsegv(sig);
}
+bool rseq_delay_resched(void)
+{
+ struct task_struct *t = current;
+ u32 flags;
+
+ if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
+ return false;
+
+ if (!t->rseq)
+ return false;
+
+ if (t->sched_time_delay)
+ return false;
+
+ if (copy_from_user_nofault(&flags, &t->rseq->flags, sizeof(flags)))
+ return false;
+
+ if (!(flags & RSEQ_CS_FLAG_DELAY_RESCHED))
+ return false;
+
+ flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
+ if (copy_to_user_nofault(&t->rseq->flags, &flags, sizeof(flags)))
+ return false;
+
+ t->sched_time_delay = 1;
+
+ return true;
+}
+
+void rseq_delay_resched_fini(void)
+{
+#ifdef CONFIG_SCHED_HRTICK
+ extern void hrtick_local_start(u64 delay);
+ struct task_struct *t = current;
+ /*
+ * IRQs off, guaranteed to return to userspace, start timer on this CPU
+ * to limit the resched-overdraft.
+ *
+ * If your critical section is longer than 50 us you get to keep the
+ * pieces.
+ */
+ if (t->sched_time_delay)
+ hrtick_local_start(50 * NSEC_PER_USEC);
+#endif
+}
+
+void rseq_delay_resched_tick(void)
+{
+#ifdef CONFIG_SCHED_HRTICK
+ struct task_struct *t = current;
+
+ if (t->sched_time_delay)
+ set_tsk_need_resched(t);
+#endif
+}
+
#ifdef CONFIG_DEBUG_RSEQ
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4de24eefe661..8c8960245ec0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -844,6 +844,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+ rseq_delay_resched_tick();
+
rq_lock(rq, &rf);
update_rq_clock(rq);
rq->donor->sched_class->task_tick(rq, rq->curr, 1);
@@ -917,6 +919,16 @@ void hrtick_start(struct rq *rq, u64 delay)
#endif /* CONFIG_SMP */
+void hrtick_local_start(u64 delay)
+{
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ hrtick_start(rq, delay);
+ rq_unlock(rq, &rf);
+}
+
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
@@ -6722,6 +6734,8 @@ static void __sched notrace __schedule(int sched_mode)
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+ if (IS_ENABLED(CONFIG_RSEQ))
+ prev->sched_time_delay = 0;
rq->last_seen_need_resched_ns = 0;
is_switch = prev != next;
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index cd38f4e9899d..1b2b64fe0fb1 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1378,6 +1378,11 @@ static void do_sched_yield(void)
*/
SYSCALL_DEFINE0(sched_yield)
{
+ if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay) {
+ schedule();
+ return 0;
+ }
+
do_sched_yield();
return 0;
}
--
2.43.5
On 2025-05-02 01:59:52 [+0000], Prakash Sangappa wrote:
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index 20154572ede9..b26adccb32df 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -98,8 +99,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>
> local_irq_enable_exit_to_user(ti_work);
>
> - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
> - schedule();
> + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
I asked to limit this to LAZY only.
> + if (irq && rseq_delay_resched())
> + clear_tsk_need_resched(current);
> + else
> + schedule();
> + }
>
> if (ti_work & _TIF_UPROBE)
> uprobe_notify_resume(regs);
Sebastian
On Fri, May 02, 2025 at 02:34:33PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-05-02 01:59:52 [+0000], Prakash Sangappa wrote:
> > diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> > index 20154572ede9..b26adccb32df 100644
> > --- a/kernel/entry/common.c
> > +++ b/kernel/entry/common.c
> > @@ -98,8 +99,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> >
> > local_irq_enable_exit_to_user(ti_work);
> >
> > - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
> > - schedule();
> > + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
>
> I asked to limit this to LAZY only.
No -- this should work irrespective of the preemption model chosen.
It should also have a default time that is shorter than the scheduling
delay normally observed.
It should probably also have a PR_WARN on raising the sysctl value.
I know you worry about RT, but show me a trace where it is a problem.
On 2025-05-02 16:35:44 [+0200], Peter Zijlstra wrote:
> On Fri, May 02, 2025 at 02:34:33PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2025-05-02 01:59:52 [+0000], Prakash Sangappa wrote:
> > > diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> > > index 20154572ede9..b26adccb32df 100644
> > > --- a/kernel/entry/common.c
> > > +++ b/kernel/entry/common.c
> > > @@ -98,8 +99,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > >
> > > local_irq_enable_exit_to_user(ti_work);
> > >
> > > - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
> > > - schedule();
> > > + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
> >
> > I asked to limit this to LAZY only.
>
> No -- this should work irrespective of the preemption model chosen.
Focusing on LAZY would easily exclude tasks with real-time priorities.
> It should also have a default time that is shorter than the scheduling
> delay normally observed.
>
> It should probably also have a PR_WARN on raising the sysctl value.
>
> I know you worry about RT, but show me a trace where it is a problem.
The default extends the wakeup by 50us. With a 250us period this extends
the wakeup in the worst case by 20% of the period. With 1ms it is just
5% of the whole period. You basically expect that everything is well
timed and this delay does not hurt anyone.
This delays interrupt-driven wakeups (timer, hardware) and not every
wakeup as I assumed initially, so it is not as bad, but it is still worse
than it has to be.
On top of that sched(7) says:
| SCHED_FIFO: First in-first out scheduling
| SCHED_FIFO can be used only with static priorities higher than 0, which
| means that when a SCHED_FIFO thread becomes runnable, it will always
| immediately preempt any currently running SCHED_OTHER, SCHED_BATCH, or
| SCHED_IDLE thread. SCHED_FIFO is a simple scheduling algorithm without
| time slicing. For threads scheduled under the SCHED_FIFO policy, the
…
| SCHED_DEADLINE: Sporadic task model deadline scheduling
…
| if any SCHED_DEADLINE thread is runnable, it will preempt any thread
| scheduled under
…
With this change, this "immediately preempt any currently running" is no
longer true.
While we could continue to argue about how long the delay really is, I don't
understand why we can't exclude the real-time scheduling policies from this.
Sebastian
On Fri, May 02, 2025 at 01:59:52AM +0000, Prakash Sangappa wrote:
> Add support for a thread to request extending its execution time slice on
> the cpu. The extra cpu time granted would help in allowing the thread to
> complete executing the critical section and drop any locks without getting
> preempted. The thread would request this cpu time extension, by setting a
> bit in the restartable sequences(rseq) structure registered with the kernel.
>
> Kernel will grant a 50us extension on the cpu, when it sees the bit set.
> With the help of a timer, kernel force preempts the thread if it is still
> running on the cpu when the 50us timer expires. The thread should yield
> the cpu by making a system call after completing the critical section.
>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
> ---
> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
> index c233aae5eac9..900cb75f6a88 100644
> --- a/include/uapi/linux/rseq.h
> +++ b/include/uapi/linux/rseq.h
> @@ -26,6 +26,7 @@ enum rseq_cs_flags_bit {
> RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
> RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
> + RSEQ_CS_FLAG_DELAY_RESCHED_BIT = 3,
> };
>
> enum rseq_cs_flags {
> @@ -35,6 +36,8 @@ enum rseq_cs_flags {
> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
> + RSEQ_CS_FLAG_DELAY_RESCHED =
> + (1U << RSEQ_CS_FLAG_DELAY_RESCHED_BIT),
> };
>
> /*
> @@ -128,6 +131,10 @@ struct rseq {
> * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
> * Inhibit instruction sequence block restart on migration for
> * this thread.
> + * - RSEQ_CS_FLAG_DELAY_RESCHED
> + * Request by a user task to try delaying preemption. With the
> + * help of a timer, up to 50us of extra CPU time is granted to this
> + * thread before it is rescheduled.
> */
> __u32 flags;
So while it works for testing, this really is a rather crap interface
for real, because userspace cannot tell up front if it's going to work or
not.
> +void rseq_delay_resched_fini(void)
> +{
> +#ifdef CONFIG_SCHED_HRTICK
> + extern void hrtick_local_start(u64 delay);
> + struct task_struct *t = current;
> + /*
> + * IRQs off, guaranteed to return to userspace, start timer on this CPU
> + * to limit the resched-overdraft.
> + *
> + * If your critical section is longer than 50 us you get to keep the
> + * pieces.
> + */
> + if (t->sched_time_delay)
> + hrtick_local_start(50 * NSEC_PER_USEC);
> +#endif
> +}
Should not the interface at least reflect this SCHED_HRTICK status? One,
slightly hacky, way of doing this might be to invert the bit: have the
system write a 1 when the feature is present, and have userspace flip it
to 0 to activate.
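From the user side that would look something like this (sketch only;
'rseq_area' stands for the thread's registered rseq struct):

	/*
	 * Kernel sets the bit at rseq registration iff the feature is
	 * available; on an old kernel the bit simply stays 0 and there
	 * is nothing to flip.
	 */
	if (rseq_area->flags & RSEQ_CS_FLAG_DELAY_RESCHED) {
		/* Feature present: clear the bit to request an extension. */
		rseq_area->flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
	}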
A better way might be to add a second bit.
Also, didn't we all agree 50us was overly optimistic and this number
should be lower?
> diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
> index cd38f4e9899d..1b2b64fe0fb1 100644
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -1378,6 +1378,11 @@ static void do_sched_yield(void)
> */
> SYSCALL_DEFINE0(sched_yield)
> {
> + if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay) {
> + schedule();
> + return 0;
> + }
> +
> do_sched_yield();
> return 0;
> }
Multiple people, very much including Linus, have already said this
'cute' hack isn't going to fly. Why is it still here?
> On May 2, 2025, at 2:05 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, May 02, 2025 at 01:59:52AM +0000, Prakash Sangappa wrote:
>> Add support for a thread to request extending its execution time slice on
>> the cpu. The extra cpu time granted would help in allowing the thread to
>> complete executing the critical section and drop any locks without getting
>> preempted. The thread would request this cpu time extension, by setting a
>> bit in the restartable sequences(rseq) structure registered with the kernel.
>>
>> Kernel will grant a 50us extension on the cpu, when it sees the bit set.
>> With the help of a timer, kernel force preempts the thread if it is still
>> running on the cpu when the 50us timer expires. The thread should yield
>> the cpu by making a system call after completing the critical section.
>>
>> Suggested-by: Peter Zijlstra <peterz@infradead.org>
>> Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
>> ---
>
>> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
>> index c233aae5eac9..900cb75f6a88 100644
>> --- a/include/uapi/linux/rseq.h
>> +++ b/include/uapi/linux/rseq.h
>> @@ -26,6 +26,7 @@ enum rseq_cs_flags_bit {
>> RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
>> RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
>> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
>> + RSEQ_CS_FLAG_DELAY_RESCHED_BIT = 3,
>> };
>>
>> enum rseq_cs_flags {
>> @@ -35,6 +36,8 @@ enum rseq_cs_flags {
>> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
>> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
>> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
>> + RSEQ_CS_FLAG_DELAY_RESCHED =
>> + (1U << RSEQ_CS_FLAG_DELAY_RESCHED_BIT),
>> };
>>
>> /*
>> @@ -128,6 +131,10 @@ struct rseq {
>> * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
>> * Inhibit instruction sequence block restart on migration for
>> * this thread.
>> + * - RSEQ_CS_FLAG_DELAY_RESCHED
>> + * Request by a user task to try delaying preemption. With the
>> + * help of a timer, up to 50us of extra CPU time is granted to this
>> + * thread before it is rescheduled.
>> */
>> __u32 flags;
>
> So while it works for testing, this really is a rather crap interface
> for real because userspace cannot up front tell if its going to work or
> not.
>
>> +void rseq_delay_resched_fini(void)
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
>> + extern void hrtick_local_start(u64 delay);
>> + struct task_struct *t = current;
>> + /*
>> + * IRQs off, guaranteed to return to userspace, start timer on this CPU
>> + * to limit the resched-overdraft.
>> + *
>> + * If your critical section is longer than 50 us you get to keep the
>> + * pieces.
>> + */
>> + if (t->sched_time_delay)
>> + hrtick_local_start(50 * NSEC_PER_USEC);
>> +#endif
>> +}
>
> Should not the interface at least reflect this SCHED_HRTICK status? One,
> slightly hacky way of doing this might be to invert the bit. Have the
> system write a 1 when the feature is present, and have userspace flip it
> to 0 to activate.
>
> A better way might be to add a second bit.
Sure, the second bit would be set when the rseq structure is registered.
Or, could there be an API through the sys_rseq system call to query whether this
feature is available? That is something Mathieu seems to suggest in his response.
>
> Also, didn't we all agree 50us was overly optimistic and this number
> should be lower?
>
>> diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
>> index cd38f4e9899d..1b2b64fe0fb1 100644
>> --- a/kernel/sched/syscalls.c
>> +++ b/kernel/sched/syscalls.c
>> @@ -1378,6 +1378,11 @@ static void do_sched_yield(void)
>> */
>> SYSCALL_DEFINE0(sched_yield)
>> {
>> + if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay) {
>> + schedule();
>> + return 0;
>> + }
>> +
>> do_sched_yield();
>> return 0;
>> }
>
> Multiple people, very much including Linus, have already said this
> 'cute' hack isn't going to fly. Why is it still here?
I can remove it.
Which system call should we suggest the application use to yield the
CPU at the end of the critical section?
Thanks,
-Prakash
On 2025-05-02 05:05, Peter Zijlstra wrote:
> On Fri, May 02, 2025 at 01:59:52AM +0000, Prakash Sangappa wrote:
>> Add support for a thread to request extending its execution time slice on
>> the cpu. The extra cpu time granted would help in allowing the thread to
>> complete executing the critical section and drop any locks without getting
>> preempted. The thread would request this cpu time extension, by setting a
>> bit in the restartable sequences(rseq) structure registered with the kernel.
>>
>> Kernel will grant a 50us extension on the cpu, when it sees the bit set.
>> With the help of a timer, kernel force preempts the thread if it is still
>> running on the cpu when the 50us timer expires. The thread should yield
>> the cpu by making a system call after completing the critical section.
>>
>> Suggested-by: Peter Zijlstra <peterz@infradead.org>
>> Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
>> ---
>
>> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
>> index c233aae5eac9..900cb75f6a88 100644
>> --- a/include/uapi/linux/rseq.h
>> +++ b/include/uapi/linux/rseq.h
>> @@ -26,6 +26,7 @@ enum rseq_cs_flags_bit {
>> RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
>> RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
>> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
>> + RSEQ_CS_FLAG_DELAY_RESCHED_BIT = 3,
>> };
>>
>> enum rseq_cs_flags {
>> @@ -35,6 +36,8 @@ enum rseq_cs_flags {
>> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
>> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
>> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
>> + RSEQ_CS_FLAG_DELAY_RESCHED =
>> + (1U << RSEQ_CS_FLAG_DELAY_RESCHED_BIT),
>> };
>>
>> /*
>> @@ -128,6 +131,10 @@ struct rseq {
>> * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
>> * Inhibit instruction sequence block restart on migration for
>> * this thread.
>> + * - RSEQ_CS_FLAG_DELAY_RESCHED
>> + * Request by a user task to try delaying preemption. With the
>> + * help of a timer, up to 50us of extra CPU time is granted to this
>> + * thread before it is rescheduled.
>> */
>> __u32 flags;
>
> So while it works for testing, this really is a rather crap interface
> for real because userspace cannot up front tell if its going to work or
> not.
I'm also concerned about this. Using so far unused bits within those
flags is all nice in terms of compactness, but it does not allow
userspace to query availability of the feature. It will set the flag
and trigger warnings on older kernels.
There are a few approaches we can take to make this discoverable:
A) Expose the supported flags through getauxval(3), e.g. a new
AT_RSEQ_CS_FLAGS which would contain a bitmask of the supported
rseq_cs flags, or
B) Add a new "RSEQ_QUERY_CS_FLAGS" flag to sys_rseq @flags parameter. It
would return the supported rseq_cs flags.
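For example, option B could look something like this from userspace (a
sketch only; RSEQ_QUERY_CS_FLAGS and its calling convention are
hypothetical at this point):

	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/rseq.h>

	/* Hypothetical: with this flag set, sys_rseq() returns the bitmask
	 * of supported rseq_cs flags instead of registering an rseq area. */
	#define RSEQ_QUERY_CS_FLAGS	(1U << 1)	/* value TBD */

	static int delay_resched_supported(void)
	{
		long flags = syscall(__NR_rseq, NULL, 0, RSEQ_QUERY_CS_FLAGS, 0);

		if (flags < 0)
			return 0;	/* older kernel: query not supported */
		return !!(flags & RSEQ_CS_FLAG_DELAY_RESCHED);
	}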
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
> On May 2, 2025, at 8:39 AM, Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
>
> On 2025-05-02 05:05, Peter Zijlstra wrote:
>> On Fri, May 02, 2025 at 01:59:52AM +0000, Prakash Sangappa wrote:
>>> Add support for a thread to request extending its execution time slice on
>>> the cpu. The extra cpu time granted would help in allowing the thread to
>>> complete executing the critical section and drop any locks without getting
>>> preempted. The thread would request this cpu time extension, by setting a
>>> bit in the restartable sequences(rseq) structure registered with the kernel.
>>>
>>> Kernel will grant a 50us extension on the cpu, when it sees the bit set.
>>> With the help of a timer, kernel force preempts the thread if it is still
>>> running on the cpu when the 50us timer expires. The thread should yield
>>> the cpu by making a system call after completing the critical section.
>>>
>>> Suggested-by: Peter Zijlstra <peterz@infradead.org>
>>> Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
>>> ---
>>> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
>>> index c233aae5eac9..900cb75f6a88 100644
>>> --- a/include/uapi/linux/rseq.h
>>> +++ b/include/uapi/linux/rseq.h
>>> @@ -26,6 +26,7 @@ enum rseq_cs_flags_bit {
>>> RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
>>> RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
>>> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
>>> + RSEQ_CS_FLAG_DELAY_RESCHED_BIT = 3,
>>> };
>>> enum rseq_cs_flags {
>>> @@ -35,6 +36,8 @@ enum rseq_cs_flags {
>>> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
>>> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
>>> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
>>> + RSEQ_CS_FLAG_DELAY_RESCHED =
>>> + (1U << RSEQ_CS_FLAG_DELAY_RESCHED_BIT),
>>> };
>>> /*
>>> @@ -128,6 +131,10 @@ struct rseq {
>>> * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
>>> * Inhibit instruction sequence block restart on migration for
>>> * this thread.
>>> + * - RSEQ_CS_FLAG_DELAY_RESCHED
>>> + * Request by a user task to try delaying preemption. With the
>>> + * help of a timer, up to 50us of extra CPU time is granted to this
>>> + * thread before it is rescheduled.
>>> */
>>> __u32 flags;
>> So while it works for testing, this really is a rather crap interface
>> for real because userspace cannot up front tell if its going to work or
>> not.
>
> I'm also concerned about this. Using so far unused bits within those
> flags is all nice in terms of compactness, but it does not allow
> userspace to query availability of the feature. It will set the flag
> and trigger warnings on older kernels.
>
> There are a few approaches we can take to make this discoverable:
>
> A) Expose the supported flags through getauxval(3), e.g. a new
> AT_RSEQ_CS_FLAGS which would contain a bitmask of the supported
> rseq_cs flags, or
>
> B) Add a new "RSEQ_QUERY_CS_FLAGS" flag to sys_rseq @flags parameter. It
> would return the supported rseq_cs flags.
Option B seems useful. I had thought about something like this.
If this is preferred, I can add the RSEQ_QUERY_CS_FLAGS.
Thanks,
-Prakash.
>
> Thanks,
>
> Mathieu
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com
On Fri, 2 May 2025 11:05:29 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> > diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
> > index cd38f4e9899d..1b2b64fe0fb1 100644
> > --- a/kernel/sched/syscalls.c
> > +++ b/kernel/sched/syscalls.c
> > @@ -1378,6 +1378,11 @@ static void do_sched_yield(void)
> > */
> > SYSCALL_DEFINE0(sched_yield)
> > {
> > + if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay) {
> > + schedule();
> > + return 0;
> > + }
> > +
> > do_sched_yield();
> > return 0;
> > }
>
> Multiple people, very much including Linus, have already said this
> 'cute' hack isn't going to fly. Why is it still here?
Who was against this?
Linus objected to "optimizing yield" because it has *semantics* that
people depend on, given its historical meaning: things like "move
the process to the end of the scheduling queue of that priority".
https://lore.kernel.org/linux-trace-kernel/CAHk-=wgTWVF6+dKPff-mhVwngFwBu_G9+fwOTF2Ds+YPj3xkeQ@mail.gmail.com/
I countered that this "optimization" would only affect those that use
the extended time slice and would not cause any issues with current
applications that depend on its current semantics.
Linus never replied to that.
Or did Linus reply to this someplace else too that I missed?
If we don't do this, what would be the system call to use to tell the
kernel that the task no longer needs extra time?
-- Steve
> On May 2, 2025, at 6:06 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Fri, 2 May 2025 11:05:29 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
>
>>> diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
>>> index cd38f4e9899d..1b2b64fe0fb1 100644
>>> --- a/kernel/sched/syscalls.c
>>> +++ b/kernel/sched/syscalls.c
>>> @@ -1378,6 +1378,11 @@ static void do_sched_yield(void)
>>> */
>>> SYSCALL_DEFINE0(sched_yield)
>>> {
>>> + if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay) {
>>> + schedule();
>>> + return 0;
>>> + }
>>> +
>>> do_sched_yield();
>>> return 0;
>>> }
>>
>> Multiple people, very much including Linus, have already said this
>> 'cute' hack isn't going to fly. Why is it still here?
>
> Who was against this?
>
> Linus objected to "optimizing yield" because it has *semantics* that
> people depend on because it has historical meaning. Things like "move
> the process to the end of the scheduling queue of that priority".
>
> https://lore.kernel.org/linux-trace-kernel/CAHk-=wgTWVF6+dKPff-mhVwngFwBu_G9+fwOTF2Ds+YPj3xkeQ@mail.gmail.com/
>
> I countered that this "optimization" would only affect those that use
> the extended time slice and would not cause any issues with current
> applications that depend on its current semantics.
Still wouldn’t that affect the performance of an application using the extended time slice optimization?
A sched_yield() could end up in do_sched_yield() if ‘sched_time_delay' is no longer set because the thread had already been rescheduled.
>
> Linus never replied to that.
>
> Or did Linus reply to this someplace else too that I missed?
>
> If we don't do this, what would be the system call to use to tell the
> kernel that the task no longer needs extra time?
Yes, we need a system call to recommend.
>
> -- Steve
On Mon, 5 May 2025 01:51:39 +0000
Prakash Sangappa <prakash.sangappa@oracle.com> wrote:

> > I countered that this "optimization" would only affect those that use
> > the extended time slice and would not cause any issues with current
> > applications that depend on its current semantics.
>
> Still wouldn’t that affect the performance of an application using the extended time slice optimization?
> A sched_yield() could end up in do_sched_yield() if ‘sched_time_delay' is no longer set because the thread had already been rescheduled.

So I haven't taken a deeper look at your patches, but in the patches I had,
one bit was reserved to let the task know that it received an extended time
slice and should reschedule as soon as possible. The task should only call
the system call after releasing the critical section IFF the kernel
extended its time slice.

As it shouldn't have to do a system call if the kernel didn't extend it.
Otherwise what's the purpose of this feature if it is always doing system
calls after leaving the critical section.

-- Steve
> On May 5, 2025, at 7:48 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 5 May 2025 01:51:39 +0000
> Prakash Sangappa <prakash.sangappa@oracle.com> wrote:
>
>>> I countered that this "optimization" would only affect those that use
>>> the extended time slice and would not cause any issues with current
>>> applications that depend on its current semantics.
>>
>> Still wouldn’t that affect the performance of an application using the extended time slice optimization?
>> A sched_yield() could end up in do_sched_yield() if ‘sched_time_delay' is no longer set because the thread had already been rescheduled.
>
> So I haven't taken a deeper look at your patches, but in the patches I had,
> one bit was reserved to let the task know that it received an extended time
> slice and should reschedule as soon as possible. The task should only call
> the system call after releasing the critical section IFF the kernel
> extended its time slice.

Yes, the application would only call the system call to yield the CPU, if the extension is granted.

The ‘RSEQ_CS_FLAG_DELAY_RESCHED’ bit in the ‘rseq’ structure is cleared when the
time extension is granted. This would indicate to the application that a time extension was
granted. The application is expected to call the system call if this bit got cleared.

However, it is possible that the thread gets rescheduled within the extended time window due
to another wakeup or runtime expiry, which the user thread would not know unless we add
another bit to indicate it got rescheduled in the extended time. Also, if the task’s critical section
ran longer and the hrtick rescheduled the thread, the application would not know.

Or do we need to guarantee the thread will not get rescheduled in the extended time?

I believe it would be the same with your patch also.

-Prakash

> As it shouldn't have to do a system call if the kernel didn't extend it.
> Otherwise what's the purpose of this feature if it is always doing system
> calls after leaving the critical section.
>
> -- Steve
On Mon, 5 May 2025 15:58:12 +0000
Prakash Sangappa <prakash.sangappa@oracle.com> wrote:
> Yes, the application would only call the system call to yield the CPU, if the extension is granted.
>
> The ‘RSEQ_CS_FLAG_DELAY_RESCHED’ bit in the ‘rseq’ structure is cleared when the
> time extension is granted. This would indicate to the application that a time extension was
> granted. The application is expected to call the system call if this bit got cleared.
>
> However, it is possible that the thread gets rescheduled within the extended time window due
> to another wakeup or runtime expiry, which the user thread would not know unless we add
> another bit to indicate it got rescheduled in the extended time. Also, if the task’s critical section
> ran longer and the hrtick rescheduled the thread, the application would not know.
>
> Or do we need to guarantee the thread will not get rescheduled in the extended time?
>
> I believe it would be the same with your patch also.
So, my patch had a bit that got set when an extension happened.
Bit zero was set by the kernel.
Bits 1-31 were a counter (so that the code could nest).
On exiting the critical section a task would call something like:
static inline bool dec_extend(volatile unsigned *ptr)
{
	/* Drop one nesting level: subtract 2, leaving bit 0 (the
	 * kernel's "extension granted" flag) untouched. */
	if (*ptr & ~1)
		asm volatile("subl %b1,%0"
			     : "+m" (*(volatile char *)ptr)
			     : "iq" (0x2)
			     : "memory");

	/* Value 1 means: fully unnested, and the kernel extended us. */
	return *ptr == 1;
}

[..]
	if (dec_extend(&rseq_map->cr_counter)) {
		rseq_map->cr_counter = 0;
		yield();
	}
That is, the user space task would increment the counter by two when
entering a critical section and decrement it by two when exiting. If the
counter ends up as 1, then it not only is out of all nested critical
sections, it also knows it was extended and will schedule out.
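The enter side would then be something like the following (a sketch;
inc_extend() is an assumed name, mirroring dec_extend() above):

	static inline void inc_extend(volatile unsigned *ptr)
	{
		/* Bump the nesting count by 2; bit 0 stays reserved for the
		 * kernel's "extension granted" flag. This is expected to
		 * compile to a single add instruction, matching the single
		 * subl that dec_extend() emits explicitly. */
		*ptr += 2;
	}

	[..]
	inc_extend(&rseq_map->cr_counter);
	/* ... critical section ... */
	if (dec_extend(&rseq_map->cr_counter)) {
		rseq_map->cr_counter = 0;
		yield();
	}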
My kernel patches would set the bit if it extended the time slice, but if
it then scheduled the task, it would clear it again. That way when the task
is scheduled back on the CPU it will not call yield() again.
-- Steve
> On May 5, 2025, at 9:34 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 5 May 2025 15:58:12 +0000
> Prakash Sangappa <prakash.sangappa@oracle.com> wrote:
>
>> Yes, the application would only call the system call to yield the CPU, if the extension is granted.
>>
>> The ‘RSEQ_CS_FLAG_DELAY_RESCHED’ bit in the ‘rseq’ structure is cleared when the
>> time extension is granted. This would indicate to the application that a time extension was
>> granted. The application is expected to call the system call if this bit got cleared.
>>
>> However, it is possible that the thread gets rescheduled within the extended time window due
>> to another wakeup or runtime expiry, which the user thread would not know unless we add
>> another bit to indicate it got rescheduled in the extended time. Also, if the task’s critical section
>> ran longer and the hrtick rescheduled the thread, the application would not know.
>>
>> Or do we need to guarantee the thread will not get rescheduled in the extended time?
>>
>> I believe it would be the same with your patch also.
>
> So, my patch had a bit that got set when an extension happened.
>
> Bit zero was set by the kernel.
>
> Bits 1-31 was a counter (so that the code could nest).
>
> On exiting the critical section a task would call something like:
>
> static inline bool dec_extend(volatile unsigned *ptr)
> {
> 	if (*ptr & ~1)
> 		asm volatile("subl %b1,%0"
> 			     : "+m" (*(volatile char *)ptr)
> 			     : "iq" (0x2)
> 			     : "memory");
>
> 	return *ptr == 1;
> }
>
> [..]
> 	if (dec_extend(&rseq_map->cr_counter)) {
> 		rseq_map->cr_counter = 0;
> 		yield();
> 	}
>
> That is, the user space task would increment the counter by two when
> entering a critical section and decrement it by two when exiting. If the
> counter ends up as 1, then it not only is out of all nested critical
> sections, it also knows it was extended and will schedule out.
>
> My kernel patches would set the bit if it extended the time slice, but if
> it then scheduled the task, it would clear it again. That way when the task
> is scheduled back on the CPU it will not call yield() again.
>
I see, you set the flag in the ‘rseq’ struct when an extension was granted and
clear it in rseq_delay_resched_tick(). In your case you had
rseq_delay_resched_tick() being called from hrtick_clear(), and hrtick_clear()
gets called from __schedule() if the sched feature is set.

In my patchset, it just clears the RSEQ_CS_FLAG_DELAY_RESCHED bit to indicate
when an extension was granted.

To allow the application to call sched_yield() to yield the CPU only if the
thread was not already rescheduled in the extended time, we would need support
like you have in your patch set.

Does it have to be sched_yield()? Or can the application call some other
(fast) system call to yield the CPU, such as getppid(2), if extended time
was granted?

It appears the sched features HRTICK/HRTICK_DL are disabled by default,
so hrtick_clear() will not get called from __schedule(). I have noticed
that enabling these sched features, to let hrtick_clear() get called
from __schedule(), adds overhead.

So I am not sure we should require sched_feat(HRTICK/HRTICK_DL) to be
enabled to use the scheduler time extension.

Maybe rseq_delay_resched_tick(), with this support, would have to be
called in the __schedule() path but not through hrtick_clear().
-Prakash
> -- Steve
On Tue, 6 May 2025 01:23:55 +0000
Prakash Sangappa <prakash.sangappa@oracle.com> wrote:

> Does it have to be sched_yield()? Or can the application call some other
> (fast) system call to yield the CPU, such as getppid(2), if extended time
> was granted?

It can be anything we decide, I guess. yield() just seemed to be the most
appropriate, because in essence, that's exactly what it's doing.

> It appears the sched features HRTICK/HRTICK_DL are disabled by default,
> so hrtick_clear() will not get called from __schedule(). I have noticed
> that enabling these sched features, to let hrtick_clear() get called
> from __schedule(), adds overhead.
>
> So I am not sure we should require sched_feat(HRTICK/HRTICK_DL) to be
> enabled to use the scheduler time extension.
>
> Maybe rseq_delay_resched_tick(), with this support, would have to be
> called in the __schedule() path but not through hrtick_clear().

I think that's more of a question for Peter.

-- Steve
On Mon, 5 May 2025 12:34:23 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:
> 	if (dec_extend(&rseq_map->cr_counter)) {
> 		rseq_map->cr_counter = 0;
> 		yield();
> 	}
Note there is a possibility that the kernel could schedule the task between the
dec_extend() above and the yield() call, in which case it would call yield() twice,
but the chances of that happening are extremely slim, and if it does happen, the
extra do_sched_yield() will most likely not show up in any benchmarks.
-- Steve