[PATCH V6 1/7] Sched: Scheduler time slice extension

Posted by Prakash Sangappa 3 months, 1 week ago
Add support for a thread to request an extension of its execution time
slice on the cpu. The extra cpu time granted allows the thread to
complete its critical section and drop any locks before it gets
preempted. The thread requests this cpu time extension by setting a bit
in the restartable sequences (rseq) structure registered with the kernel.

The kernel grants a 30us extension on the cpu when it sees the bit set.
With the help of a timer, the kernel force-preempts the thread if it is
still running on the cpu when the 30us timer expires. The thread should
yield the cpu by making a system call after completing the critical
section.
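
To illustrate the intended usage (not part of this patch): a minimal
userspace sketch, assuming the thread has registered its rseq structure
with the kernel and 'rs' points to it; the helper names are made up.
The kernel clears the flag when it grants an extension, so a flag that
is already clear on exit means extra cpu time was consumed and the
thread should yield:

#include <sched.h>
#include <linux/rseq.h>

/* Hypothetical helpers around a userspace critical section. */
static inline void resched_delay_enter(struct rseq *rs)
{
	/* Request that preemption be delayed while locks are held. */
	__atomic_fetch_or(&rs->flags, RSEQ_CS_FLAG_DELAY_RESCHED,
			  __ATOMIC_RELAXED);
}

static inline void resched_delay_exit(struct rseq *rs)
{
	__u32 old = __atomic_fetch_and(&rs->flags,
				       ~RSEQ_CS_FLAG_DELAY_RESCHED,
				       __ATOMIC_RELAXED);

	/* The kernel cleared the flag: extension granted, yield now. */
	if (!(old & RSEQ_CS_FLAG_DELAY_RESCHED))
		sched_yield();
}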

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
---
 include/linux/entry-common.h | 17 ++++++++---
 include/linux/sched.h        | 16 +++++++++++
 include/uapi/linux/rseq.h    |  7 +++++
 kernel/entry/common.c        | 13 ++++++---
 kernel/rseq.c                | 56 ++++++++++++++++++++++++++++++++++++
 kernel/sched/core.c          | 14 +++++++++
 kernel/sched/syscalls.c      |  5 ++++
 7 files changed, 120 insertions(+), 8 deletions(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index f94f3fdf15fc..d4fa952e394e 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -304,7 +304,8 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
  * exit_to_user_mode_loop - do any pending work before leaving to user space
  */
 unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-				     unsigned long ti_work);
+				     unsigned long ti_work,
+				     bool irq);
 
 /**
  * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
@@ -316,7 +317,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
  *    EXIT_TO_USER_MODE_WORK are set
  * 4) check that interrupts are still disabled
  */
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs,
+						bool irq)
 {
 	unsigned long ti_work;
 
@@ -327,7 +329,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
 
 	ti_work = read_thread_flags();
 	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
-		ti_work = exit_to_user_mode_loop(regs, ti_work);
+		ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
+
+	if (irq)
+		rseq_delay_resched_fini();
 
 	arch_exit_to_user_mode_prepare(regs, ti_work);
 
@@ -396,6 +401,10 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
 
 	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
 
+	/* reschedule if sched delay was granted */
+	if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay)
+		set_tsk_need_resched(current);
+
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
 		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
 			local_irq_enable();
@@ -411,7 +420,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
 	if (unlikely(work & SYSCALL_WORK_EXIT))
 		syscall_exit_work(regs, work);
 	local_irq_disable_exit_to_user();
-	exit_to_user_mode_prepare(regs);
+	exit_to_user_mode_prepare(regs, false);
 }
 
 /**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5bcf44ae6c79..9b4670d85131 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -338,6 +338,7 @@ extern int __must_check io_schedule_prepare(void);
 extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
+extern void hrtick_local_start(u64 delay);
 
 /* wrapper function to trace from this header file */
 DECLARE_TRACEPOINT(sched_set_state_tp);
@@ -1263,6 +1264,7 @@ struct task_struct {
 	int				softirq_context;
 	int				irq_config;
 #endif
+	unsigned			sched_time_delay:1;
 #ifdef CONFIG_PREEMPT_RT
 	int				softirq_disable_cnt;
 #endif
@@ -2245,6 +2247,20 @@ static inline bool owner_on_cpu(struct task_struct *owner)
 unsigned long sched_cpu_util(int cpu);
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_RSEQ
+
+extern bool rseq_delay_resched(void);
+extern void rseq_delay_resched_fini(void);
+extern void rseq_delay_resched_tick(void);
+
+#else
+
+static inline bool rseq_delay_resched(void) { return false; }
+static inline void rseq_delay_resched_fini(void) { }
+static inline void rseq_delay_resched_tick(void) { }
+
+#endif
+
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
 extern void sched_core_fork(struct task_struct *p);
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index c233aae5eac9..25fc636b17d5 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -26,6 +26,7 @@ enum rseq_cs_flags_bit {
 	RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT	= 0,
 	RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT	= 1,
 	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT	= 2,
+	RSEQ_CS_FLAG_DELAY_RESCHED_BIT		= 3,
 };
 
 enum rseq_cs_flags {
@@ -35,6 +36,8 @@ enum rseq_cs_flags {
 		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
 	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE	=
 		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+	RSEQ_CS_FLAG_DELAY_RESCHED		=
+		(1U << RSEQ_CS_FLAG_DELAY_RESCHED_BIT),
 };
 
 /*
@@ -128,6 +131,10 @@ struct rseq {
 	 * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
 	 *     Inhibit instruction sequence block restart on migration for
 	 *     this thread.
+	 * - RSEQ_CS_FLAG_DELAY_RESCHED
+	 *     Request by a user thread to delay preemption. With the
+	 *     help of a timer, the kernel grants up to 30us of extra cpu
+	 *     time to this thread before it is rescheduled.
 	 */
 	__u32 flags;
 
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a8dd1f27417c..8769c3592e26 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -88,7 +88,8 @@ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
  * @ti_work:	TIF work flags as read by the caller
  */
 __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-						     unsigned long ti_work)
+						     unsigned long ti_work,
+						     bool irq)
 {
 	/*
 	 * Before returning to user space ensure that all pending work
@@ -98,8 +99,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 
 		local_irq_enable_exit_to_user(ti_work);
 
-		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
-			schedule();
+		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
+		       if (irq && rseq_delay_resched())
+			       clear_tsk_need_resched(current);
+		       else
+			       schedule();
+		}
 
 		if (ti_work & _TIF_UPROBE)
 			uprobe_notify_resume(regs);
@@ -181,7 +186,7 @@ noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
 noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
 {
 	instrumentation_begin();
-	exit_to_user_mode_prepare(regs);
+	exit_to_user_mode_prepare(regs, true);
 	instrumentation_end();
 	exit_to_user_mode();
 }
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b7a1ec327e81..dba44ca9f624 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -448,6 +448,62 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	force_sigsegv(sig);
 }
 
+bool rseq_delay_resched(void)
+{
+	struct task_struct *t = current;
+	u32 flags;
+
+	if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
+		return false;
+
+	if (!t->rseq)
+		return false;
+
+	if (t->sched_time_delay)
+		return false;
+
+	if (copy_from_user_nofault(&flags, &t->rseq->flags, sizeof(flags)))
+		return false;
+
+	if (!(flags & RSEQ_CS_FLAG_DELAY_RESCHED))
+		return false;
+
+	flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
+	if (copy_to_user_nofault(&t->rseq->flags, &flags, sizeof(flags)))
+		return false;
+
+	t->sched_time_delay = 1;
+
+	return true;
+}
+
+void rseq_delay_resched_fini(void)
+{
+#ifdef CONFIG_SCHED_HRTICK
+	extern void hrtick_local_start(u64 delay);
+	struct task_struct *t = current;
+	/*
+	 * IRQs off, guaranteed to return to userspace, start timer on this CPU
+	 * to limit the resched-overdraft.
+	 *
+	 * If your critical section is longer than 30 us you get to keep the
+	 * pieces.
+	 */
+	if (t->sched_time_delay)
+		hrtick_local_start(30 * NSEC_PER_USEC);
+#endif
+}
+
+void rseq_delay_resched_tick(void)
+{
+#ifdef CONFIG_SCHED_HRTICK
+	struct task_struct *t = current;
+
+	if (t->sched_time_delay)
+		set_tsk_need_resched(t);
+#endif
+}
+
 #ifdef CONFIG_DEBUG_RSEQ
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4ad7cf3cfdca..c1b64879115f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -845,6 +845,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 
 	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 
+	rseq_delay_resched_tick();
+
 	rq_lock(rq, &rf);
 	update_rq_clock(rq);
 	rq->donor->sched_class->task_tick(rq, rq->curr, 1);
@@ -918,6 +920,16 @@ void hrtick_start(struct rq *rq, u64 delay)
 
 #endif /* CONFIG_SMP */
 
+void hrtick_local_start(u64 delay)
+{
+	struct rq *rq = this_rq();
+	struct rq_flags rf;
+
+	rq_lock(rq, &rf);
+	hrtick_start(rq, delay);
+	rq_unlock(rq, &rf);
+}
+
 static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
@@ -6740,6 +6752,8 @@ static void __sched notrace __schedule(int sched_mode)
 picked:
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
+	if (IS_ENABLED(CONFIG_RSEQ))
+		prev->sched_time_delay = 0;
 	rq->last_seen_need_resched_ns = 0;
 
 	is_switch = prev != next;
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index ee5641757838..d9a4e3a2e064 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1379,6 +1379,11 @@ static void do_sched_yield(void)
  */
 SYSCALL_DEFINE0(sched_yield)
 {
+	if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay) {
+		schedule();
+		return 0;
+	}
+
 	do_sched_yield();
 	return 0;
 }
-- 
2.43.5
Re: [PATCH V6 1/7] Sched: Scheduler time slice extension
Posted by Thomas Gleixner 3 months, 1 week ago
On Tue, Jul 01 2025 at 00:37, Prakash Sangappa wrote:

The subsystem prefix for the scheduler is 'sched:'. It's not that hard
to figure out.

>  unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> -				     unsigned long ti_work);
> +				     unsigned long ti_work,
> +				     bool irq);

No need for a new line
  
>  /**
>   * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
> @@ -316,7 +317,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>   *    EXIT_TO_USER_MODE_WORK are set
>   * 4) check that interrupts are still disabled
>   */
> -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
> +static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs,
> +						bool irq)

Ditto. 100 characters line width, please use it. And if you need a line
break, please align the second lines arguments properly. This is
documented:

https://www.kernel.org/doc/html/latest/process/maintainer-tip.html

>  {
>  	unsigned long ti_work;
>  
> @@ -327,7 +329,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
>  
>  	ti_work = read_thread_flags();
>  	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> -		ti_work = exit_to_user_mode_loop(regs, ti_work);
> +		ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
> +
> +	if (irq)
> +		rseq_delay_resched_fini();

This is an unconditional function call for every interrupt return and
it's even done when the whole thing is known to be non-functional at
compile time:

> +void rseq_delay_resched_fini(void)
> +{
> +#ifdef CONFIG_SCHED_HRTICK
  ....
> +#endif
> +}

Seriously?

>  	arch_exit_to_user_mode_prepare(regs, ti_work);
>  
> @@ -396,6 +401,10 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>  
>  	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
>  
> +	/* reschedule if sched delay was granted */

Sentences start with an upper case letter and please use full words and
not arbitrary abbreviations. This is neither twatter nor SMS.

> +	if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay)
> +		set_tsk_need_resched(current);
> +
>  	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
>  		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
>  			local_irq_enable();
> @@ -411,7 +420,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>  	if (unlikely(work & SYSCALL_WORK_EXIT))
>  		syscall_exit_work(regs, work);
>  	local_irq_disable_exit_to_user();
> -	exit_to_user_mode_prepare(regs);
> +	exit_to_user_mode_prepare(regs, false);
>  }
>  
>  /**
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 5bcf44ae6c79..9b4670d85131 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -338,6 +338,7 @@ extern int __must_check io_schedule_prepare(void);
>  extern void io_schedule_finish(int token);
>  extern long io_schedule_timeout(long timeout);
>  extern void io_schedule(void);
> +extern void hrtick_local_start(u64 delay);
>  
>  /* wrapper function to trace from this header file */
>  DECLARE_TRACEPOINT(sched_set_state_tp);
> @@ -1263,6 +1264,7 @@ struct task_struct {
>  	int				softirq_context;
>  	int				irq_config;
>  #endif
> +	unsigned			sched_time_delay:1;

Find an arbitrary place by rolling a dice and stick it in, right?

There is already a section with bit fields in this struct. So it's more
than bloody obvious to stick it there instead of creating a hole in the
middle of task struct.

>  #ifdef CONFIG_PREEMPT_RT
>  	int				softirq_disable_cnt;
>  #endif
> @@ -2245,6 +2247,20 @@ static inline bool owner_on_cpu(struct task_struct *owner)
>  unsigned long sched_cpu_util(int cpu);
>  #endif /* CONFIG_SMP */
>  
> +#ifdef CONFIG_RSEQ
> +

Remove these newlines please. They have zero value.

> +extern bool rseq_delay_resched(void);
> +extern void rseq_delay_resched_fini(void);
> +extern void rseq_delay_resched_tick(void);
> +
> +#else

> @@ -98,8 +99,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>  
>  		local_irq_enable_exit_to_user(ti_work);
>  
> -		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
> -			schedule();
> +		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
> +		       if (irq && rseq_delay_resched())

unlikely() and again this results in an unconditional function call for
every interrupt when CONFIG_RSEQ is enabled. A pointless exercise for
the majority of use cases.

What's worse is that it breaks the LAZY semantics. I explained this to
you before and this thing needs to be tied on the LAZY bit otherwise a
SCHED_OTHER task can prevent a real-time task from running, which is
fundamentally wrong.

So this wants to be:

	if (likely(!irq || !rseq_delay_resched(ti_work)))
		schedule();

and

static inline bool rseq_delay_resched(unsigned long ti_work)
{
	// Set when all Kconfig conditions are met
	if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
		return false;

	// Only NEED_RESCHED_LAZY can be delayed
	if (ti_work & _TIF_NEED_RESCHED)
		return false;

	// NONE indicates that current::rseq == NULL
	// PROBE indicates that current::rseq::flags needs to be evaluated.
	// REQUESTED indicates that there was a successful request already.
	if (likely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_PROBE))
		return false;

	return __rseq_delay_resched();
}

or something like that.
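
Where the tri-state would be something along these lines (illustrative
only, following the comments above):

enum rseq_resched_delay {
	RSEQ_RESCHED_DELAY_NONE,	/* current::rseq == NULL */
	RSEQ_RESCHED_DELAY_PROBE,	/* current::rseq::flags needs evaluation */
	RSEQ_RESCHED_DELAY_REQUESTED,	/* successful request already made */
};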

> +bool rseq_delay_resched(void)
> +{
> +	struct task_struct *t = current;
> +	u32 flags;
> +
> +	if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
> +		return false;
> +
> +	if (!t->rseq)
> +		return false;
> +
> +	if (t->sched_time_delay)
> +		return false;

Then all of the above conditions go away.

> +	if (copy_from_user_nofault(&flags, &t->rseq->flags, sizeof(flags)))
> +		return false;
> +
> +	if (!(flags & RSEQ_CS_FLAG_DELAY_RESCHED))
> +		return false;
> +
> +	flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
> +	if (copy_to_user_nofault(&t->rseq->flags, &flags, sizeof(flags)))
> +		return false;
> +
> +	t->sched_time_delay = 1;

and this becomes:

       	t->rseq_delay_resched = RSEQ_RESCHED_DELAY_REQUESTED;

> +	return true;
> +}
> +
> +void rseq_delay_resched_fini(void)

What does _fini() mean here? Absolutely nothing. This wants to be a
self-explaining function name; see below.

> +{
> +#ifdef CONFIG_SCHED_HRTICK

You really are fond of pointless function calls. Obviously performance
is not really a concern in your work.

> +	extern void hrtick_local_start(u64 delay);

header files with prototypes exist for a reason....

> +	struct task_struct *t = current;
> +	/*
> +	 * IRQs off, guaranteed to return to userspace, start timer on this CPU
> +	 * to limit the resched-overdraft.
> +	 *
> +	 * If your critical section is longer than 30 us you get to keep the
> +	 * pieces.
> +	 */
> +	if (t->sched_time_delay)
> +		hrtick_local_start(30 * NSEC_PER_USEC);
> +#endif

This whole thing can be condensed into:

static inline void rseq_delay_resched_arm_timer(void)
{
	if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
		return;
	if (unlikely(current->rseq_delay_resched == RSEQ_RESCHED_DELAY_REQUESTED))
		hrtick_local_start(...);
}

> +}
> +
> +void rseq_delay_resched_tick(void)
> +{
> +#ifdef CONFIG_SCHED_HRTICK
> +	struct task_struct *t = current;
> +
> +	if (t->sched_time_delay)
> +		set_tsk_need_resched(t);
> +#endif

Oh well.....

> +}
> +
>  #ifdef CONFIG_DEBUG_RSEQ
>  
>  /*
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 4ad7cf3cfdca..c1b64879115f 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -845,6 +845,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
>  
>  	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
>  
> +	rseq_delay_resched_tick();
> +
>  	rq_lock(rq, &rf);
>  	update_rq_clock(rq);
>  	rq->donor->sched_class->task_tick(rq, rq->curr, 1);
> @@ -918,6 +920,16 @@ void hrtick_start(struct rq *rq, u64 delay)
>  
>  #endif /* CONFIG_SMP */
>  
> +void hrtick_local_start(u64 delay)

How is this supposed to compile cleanly without a prototype?

Thanks,

        tglx
Re: [PATCH V6 1/7] Sched: Scheduler time slice extension
Posted by Prakash Sangappa 3 months, 1 week ago

> On Jul 1, 2025, at 1:42 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> On Tue, Jul 01 2025 at 00:37, Prakash Sangappa wrote:
> 
> The subsystem prefix for the scheduler is 'sched:' It's not that hard to
> figure out.

Will fix that. 

> 
>> unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>> -     unsigned long ti_work);
>> +     unsigned long ti_work,
>> +     bool irq);
> 
> No need for a new line

Ok.

> 
>> /**
>>  * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
>> @@ -316,7 +317,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>  *    EXIT_TO_USER_MODE_WORK are set
>>  * 4) check that interrupts are still disabled
>>  */
>> -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
>> +static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs,
>> + bool irq)
> 
> Ditto. 100 characters line width, please use it. And if you need a line
> break, please align the second lines arguments properly. This is
> documented:
> 
https://www.kernel.org/doc/html/latest/process/maintainer-tip.html

Got it.

> 
>> {
>> unsigned long ti_work;
>> 
>> @@ -327,7 +329,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
>> 
>> ti_work = read_thread_flags();
>> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
>> - ti_work = exit_to_user_mode_loop(regs, ti_work);
>> + ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
>> +
>> + if (irq)
>> + rseq_delay_resched_fini();
> 
> This is an unconditional function call for every interrupt return and
> it's even done when the whole thing is known to be non-functional at
> compile time:

Will include an IS_ENABLED() config check.

> 
>> +void rseq_delay_resched_fini(void)
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
>  ....
>> +#endif
>> +}
> 
> Seriously?

Will make this depend on a new config option, CONFIG_SCHED_PREEMPT_DELAY,
instead of SCHED_HRTICK, so I can remove these #ifdef/#endif.

> 
>> arch_exit_to_user_mode_prepare(regs, ti_work);
>> 
>> @@ -396,6 +401,10 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>> 
>> CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
>> 
>> + /* reschedule if sched delay was granted */
> 
> Sentences start with an upper case letter and please use full words and
> not arbitrary abbreviations. This is neither twatter nor SMS.

Will fix. 

> 
>> + if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay)
>> + set_tsk_need_resched(current);
>> +
>> if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
>> if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
>> local_irq_enable();
>> @@ -411,7 +420,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>> if (unlikely(work & SYSCALL_WORK_EXIT))
>> syscall_exit_work(regs, work);
>> local_irq_disable_exit_to_user();
>> - exit_to_user_mode_prepare(regs);
>> + exit_to_user_mode_prepare(regs, false);
>> }
>> 
>> /**
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 5bcf44ae6c79..9b4670d85131 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -338,6 +338,7 @@ extern int __must_check io_schedule_prepare(void);
>> extern void io_schedule_finish(int token);
>> extern long io_schedule_timeout(long timeout);
>> extern void io_schedule(void);
>> +extern void hrtick_local_start(u64 delay);
>> 
>> /* wrapper function to trace from this header file */
>> DECLARE_TRACEPOINT(sched_set_state_tp);
>> @@ -1263,6 +1264,7 @@ struct task_struct {
>> int softirq_context;
>> int irq_config;
>> #endif
>> + unsigned sched_time_delay:1;
> 
> Find an arbitrary place by rolling a dice and stick it in, right?

Sorry, merging issue. I had it next to the following 
>         unsigned                        in_thrashing:1;

Will fix it. 



> 
> There is already a section with bit fields in this struct. So it's more
> than bloody obvious to stick it there instead of creating a hole in the
> middle of task struct.
> 
>> #ifdef CONFIG_PREEMPT_RT
>> int softirq_disable_cnt;
>> #endif
>> @@ -2245,6 +2247,20 @@ static inline bool owner_on_cpu(struct task_struct *owner)
>> unsigned long sched_cpu_util(int cpu);
>> #endif /* CONFIG_SMP */
>> 
>> +#ifdef CONFIG_RSEQ
>> +
> 
> Remove these newlines please. They have zero value.

Ok

> 
>> +extern bool rseq_delay_resched(void);
>> +extern void rseq_delay_resched_fini(void);
>> +extern void rseq_delay_resched_tick(void);
>> +
>> +#else
> 
>> @@ -98,8 +99,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>> 
>> local_irq_enable_exit_to_user(ti_work);
>> 
>> - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
>> - schedule();
>> + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
>> +       if (irq && rseq_delay_resched())
> 
> unlikely() and again this results in an unconditional function call for
> every interrupt when CONFIG_RSEQ is enabled. A pointless exercise for
> the majority of use cases.
> 
> What's worse is that it breaks the LAZY semantics. I explained this to
> you before and this thing needs to be tied on the LAZY bit otherwise a
> SCHED_OTHER task can prevent a real-time task from running, which is
> fundamentally wrong.
> 
> So this wants to be:
> 
> if (likely(!irq || !rseq_delay_resched(ti_work))
>         schedule();
> 
> and
> 
> static inline bool rseq_delay_resched(unsigned long ti_work)
> {
>        // Set when all Kconfig conditions are met
>        if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
>         return false;
> 
>        // Only NEED_RESCHED_LAZY can be delayed
> if (ti_work & _TIF_NEED_RESCHED)
>         return false;
> 
>        // NONE indicates that current::rseq == NULL
>        // PROBE indicates that current::rseq::flags needs to be
>        // evaluated.
>        // REQUESTED indicates that there was a successful request
>        // already.
>        if (likely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_PROBE))
>         return false;
> 
>        return __rseq_delay_resched();
> }
> 
> or something like that.

Will refactor the code.

> 
>> +bool rseq_delay_resched(void)
>> +{
>> + struct task_struct *t = current;
>> + u32 flags;
>> +
>> + if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
>> + return false;
>> +
>> + if (!t->rseq)
>> + return false;
>> +
>> + if (t->sched_time_delay)
>> + return false;
> 
> Then all of the above conditions go away.
> 
>> + if (copy_from_user_nofault(&flags, &t->rseq->flags, sizeof(flags)))
>> + return false;
>> +
>> + if (!(flags & RSEQ_CS_FLAG_DELAY_RESCHED))
>> + return false;
>> +
>> + flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
>> + if (copy_to_user_nofault(&t->rseq->flags, &flags, sizeof(flags)))
>> + return false;
>> +
>> + t->sched_time_delay = 1;
> 
> and this becomes:
> 
>        t->rseq_delay_resched = RSEQ_RESCHED_DELAY_REQUESTED;
> 
>> + return true;
>> +}
>> +
>> +void rseq_delay_resched_fini(void)
> 
> What's does _fini() mean here? Absolutely nothing. This wants to be a
> self explaining function name and see below
> 
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
> 
> You really are fond of pointless function calls. Obviously performance
> is not really a concern in your work.
> 
>> + extern void hrtick_local_start(u64 delay);
> 
> header files with prototypes exist for a reason....
> 
>> + struct task_struct *t = current;
>> + /*
>> + * IRQs off, guaranteed to return to userspace, start timer on this CPU
>> + * to limit the resched-overdraft.
>> + *
>> + * If your critical section is longer than 30 us you get to keep the
>> + * pieces.
>> + */
>> + if (t->sched_time_delay)
>> + hrtick_local_start(30 * NSEC_PER_USEC);
>> +#endif
> 
> This whole thing can be condensed into:
> 
> static inline void rseq_delay_resched_arm_timer(void)
> {
> if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
>         return;
>        if (unlikely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_REQUESTED))
>         hrtick_local_start(...);
> }

Got it, will make the changes.

> 
>> +}
>> +
>> +void rseq_delay_resched_tick(void)
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
>> + struct task_struct *t = current;
>> +
>> + if (t->sched_time_delay)
>> + set_tsk_need_resched(t);
>> +#endif
> 
> Oh well.....
> 
>> +}
>> +
>> #ifdef CONFIG_DEBUG_RSEQ
>> 
>> /*
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 4ad7cf3cfdca..c1b64879115f 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -845,6 +845,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
>> 
>> WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
>> 
>> + rseq_delay_resched_tick();
>> +
>> rq_lock(rq, &rf);
>> update_rq_clock(rq);
>> rq->donor->sched_class->task_tick(rq, rq->curr, 1);
>> @@ -918,6 +920,16 @@ void hrtick_start(struct rq *rq, u64 delay)
>> 
>> #endif /* CONFIG_SMP */
>> 
>> +void hrtick_local_start(u64 delay)
> 
> How is this supposed to compile cleanly without a prototype?

Will fix.

Thanks for your comments.
-Prakash
> 
> Thanks,
> 
>        tglx

Re: [PATCH V6 1/7] Sched: Scheduler time slice extension
Posted by Peter Zijlstra 3 months, 1 week ago
On Tue, Jul 01, 2025 at 10:42:36AM +0200, Thomas Gleixner wrote:

> What's worse is that it breaks the LAZY semantics. I explained this to
> you before and this thing needs to be tied on the LAZY bit otherwise a
> SCHED_OTHER task can prevent a real-time task from running, which is
> fundamentally wrong.

So here we disagree, I don't want this tied to LAZY.

SCHED_OTHER can already inhibit an RT task from getting run by doing a
syscall; this syscall will have non-preemptible sections and the RT task
will get delayed.

I very much want this thing to be limited to a time frame where a
userspace critical section (this thing) is smaller than such a kernel
critical section.

That is, there should be no observable difference between the effects of
this new thing and a syscall doing preempt_disable().


That said; the reason I don't want this tied to LAZY is that RT itself
is not subject to LAZY and this then means that RT threads cannot make
use of this new facility, whereas I think it makes perfect sense for
them to use this.
Re: [PATCH V6 1/7] Sched: Scheduler time slice extension
Posted by Thomas Gleixner 3 months, 1 week ago
On Tue, Jul 01 2025 at 12:56, Peter Zijlstra wrote:
> On Tue, Jul 01, 2025 at 10:42:36AM +0200, Thomas Gleixner wrote:
>
>> What's worse is that it breaks the LAZY semantics. I explained this to
>> you before and this thing needs to be tied on the LAZY bit otherwise a
>> SCHED_OTHER task can prevent a real-time task from running, which is
>> fundamentally wrong.
>
> So here we disagree, I don't want this tied to LAZY.
>
> SCHED_OTHER can already inhibit a RT task from getting ran by doing a
> syscall, this syscall will have non-preemptible sections and the RT task
> will get delayed.
>
> I very much want this thing to be limited to a time frame where a
> userspace critical section (this thing) is smaller than such a kernel
> critical section.
>
> That is, there should be no observable difference between the effects of
> this new thing and a syscall doing preempt_disable().
>
> That said; the reason I don't want this tied to LAZY is that RT itself
> is not subject to LAZY and this then means that RT threads cannot make
> use of this new facility, whereas I think it makes perfect sense for
> them to use this.

Fair enough, but can we pretty please have this explained and documented
and not just buried in some gory implementation details, which nobody
will understand 3 months down the road.

Also if we go there and allow non-RT tasks to delay scheduling, then we
need a control mechanism to enable/disable it on a per-task or
per-process basis. That way an RT system designer can prevent random
user space tasks, which think they are the most important piece, from
interfering with truly relevant RT tasks w/o having to chase down source
code and hack it into submission.

Thanks,

        tglx
Re: [PATCH V6 1/7] Sched: Scheduler time slice extension
Posted by Prakash Sangappa 3 months, 1 week ago

> On Jul 1, 2025, at 5:36 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> On Tue, Jul 01 2025 at 12:56, Peter Zijlstra wrote:
>> On Tue, Jul 01, 2025 at 10:42:36AM +0200, Thomas Gleixner wrote:
>> 
>>> What's worse is that it breaks the LAZY semantics. I explained this to
>>> you before and this thing needs to be tied on the LAZY bit otherwise a
>>> SCHED_OTHER task can prevent a real-time task from running, which is
>>> fundamentally wrong.
>> 
>> So here we disagree, I don't want this tied to LAZY.
>> 
>> SCHED_OTHER can already inhibit a RT task from getting ran by doing a
>> syscall, this syscall will have non-preemptible sections and the RT task
>> will get delayed.
>> 
>> I very much want this thing to be limited to a time frame where a
>> userspace critical section (this thing) is smaller than such a kernel
>> critical section.
>> 
>> That is, there should be no observable difference between the effects of
>> this new thing and a syscall doing preempt_disable().
>> 
>> That said; the reason I don't want this tied to LAZY is that RT itself
>> is not subject to LAZY and this then means that RT threads cannot make
>> use of this new facility, whereas I think it makes perfect sense for
>> them to use this.
> 
> Fair enough, but can we pretty please have this explained and documented
> and not just burried in some gory implementation details, which nobody
> will understand in 3 months down the road.
> 
> Also if we go there and allow non-RT tasks to delay scheduling, then we
> need a control mechanism to enable/disable this mechanism on a per task
> or process basis. That way a RT system designer can prevent random
> user space tasks, which think they are the most important piece, from
> interfering with truly relevant RT tasks w/o going to chase down source
> code and hack it into submission.

Could the per-task control mechanism be through /proc?
I wonder how easy it would be to administer such a control.

Alternatively, can we have a config option to apply this to LAZY only?
That would not provide the finer control you suggested.

Thanks,
-Prakash.

> 
> Thanks,
> 
>        tglx

Re: [PATCH V6 1/7] Sched: Scheduler time slice extension
Posted by Thomas Gleixner 3 months ago
On Thu, Jul 03 2025 at 05:38, Prakash Sangappa wrote:
>> On Jul 1, 2025, at 5:36 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>>
>> Also if we go there and allow non-RT tasks to delay scheduling, then we
>> need a control mechanism to enable/disable this mechanism on a per task
>> or process basis. That way a RT system designer can prevent random
>> user space tasks, which think they are the most important piece, from
>> interfering with truly relevant RT tasks w/o going to chase down source
>> code and hack it into submission.
>
> Could the per task  control mechanism be thru /proc?

Is that a serious question?

> Wonder how easy it will be to administer such control.

Obviously it's horrible.

That's what prctl() is for. Plus a proper inheritance mechanism on
fork/exec, along with a system-wide default which can be controlled via
the kernel command line.
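
Purely to sketch the shape such an interface could take -- the command
name and values below are made up, no such prctl() command exists in
this series:

#include <sys/prctl.h>

/* Hypothetical per-task opt-in, inherited across fork()/exec(). */
#define PR_RSEQ_RESCHED_DELAY		78	/* made-up command number */
#define PR_RSEQ_RESCHED_DELAY_ENABLE	1

prctl(PR_RSEQ_RESCHED_DELAY, PR_RSEQ_RESCHED_DELAY_ENABLE, 0, 0, 0);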

> Alternatively, can we have a config option to apply to LAZY only?
> This will not provide the finer  control as you suggested. 

A config option does not solve anything; it's just a lazy hack to avoid
the hard work of a proper, future-proof ABI design.

Thanks,

        tglx
Re: [PATCH V6 1/7] Sched: Scheduler time slice extension
Posted by Steven Rostedt 3 months, 1 week ago
On Tue, 01 Jul 2025 14:36:32 +0200
Thomas Gleixner <tglx@linutronix.de> wrote:

> > That said; the reason I don't want this tied to LAZY is that RT itself
> > is not subject to LAZY and this then means that RT threads cannot make
> > use of this new facility, whereas I think it makes perfect sense for
> > them to use this.  
> 
> Fair enough, but can we pretty please have this explained and documented
> and not just burried in some gory implementation details, which nobody
> will understand in 3 months down the road.
> 
> Also if we go there and allow non-RT tasks to delay scheduling, then we
> need a control mechanism to enable/disable this mechanism on a per task
> or process basis. That way a RT system designer can prevent random
> user space tasks, which think they are the most important piece, from
> interfering with truly relevant RT tasks w/o going to chase down source
> code and hack it into submission.

BTW, I already showed[1] that any amount of delay this adds will build up
on top of the current worst-case latency. So just saying "we only delay
5us, which is in the noise" is incorrect when you have a system that has
a worst-case latency of 30us. Because that 5us now makes it 35us.

Which is why I said this must be able to be disabled. I wouldn't want
this on any RT system unless, as Thomas states, it can be limited to
specific tasks and is off by default for anything the admin doesn't
explicitly enable it for.

[1] https://lore.kernel.org/all/20250609165532.3265e142@gandalf.local.home/

-- Steve
Re: [PATCH V6 1/7] Sched: Scheduler time slice extension
Posted by K Prateek Nayak 3 months, 1 week ago
Hello Peter,

On 7/1/2025 4:26 PM, Peter Zijlstra wrote:
> On Tue, Jul 01, 2025 at 10:42:36AM +0200, Thomas Gleixner wrote:
> 
>> What's worse is that it breaks the LAZY semantics. I explained this to
>> you before and this thing needs to be tied on the LAZY bit otherwise a
>> SCHED_OTHER task can prevent a real-time task from running, which is
>> fundamentally wrong.
> 
> So here we disagree, I don't want this tied to LAZY.
> 
> SCHED_OTHER can already inhibit a RT task from getting ran by doing a
> syscall, this syscall will have non-preemptible sections and the RT task
> will get delayed.
> 
> I very much want this thing to be limited to a time frame where a
> userspace critical section (this thing) is smaller than such a kernel
> critical section.
> 
> That is, there should be no observable difference between the effects of
> this new thing and a syscall doing preempt_disable().
> 
> 
> That said; the reason I don't want this tied to LAZY is that RT itself
> is not subject to LAZY and this then means that RT threads cannot make
> use of this new facility, whereas I think it makes perfect sense for
> them to use this.

Thinking out loud: I know we are trying to keep the overhead to a
minimum but is it acceptable to go through with schedule() and decide
on extending the time slice in pick_next_task_fair() / pick_task_rt()?

Then, a higher priority task can always preempt us when preemption is
enabled and, between tasks of the same class, it is just a redundant
schedule() loop.

It'll require some additional care to start accounting for the delay
from the time NEED_RESCHED was set rather than when schedule() is
actually called, but would the overhead be that bad?

Or would we like to prevent preemption from RT tasks too on
!PREEMPT_RT, since whatever the task asking for the extended slice is
doing is considered important enough?

-- 
Thanks and Regards,
Prateek
Re: [PATCH V6 1/7] Sched: Scheduler time slice extension
Posted by Peter Zijlstra 3 months, 1 week ago
On Tue, Jul 01, 2025 at 04:58:26PM +0530, K Prateek Nayak wrote:
> Hello Peter,
> 
> On 7/1/2025 4:26 PM, Peter Zijlstra wrote:
> > On Tue, Jul 01, 2025 at 10:42:36AM +0200, Thomas Gleixner wrote:
> > 
> > > What's worse is that it breaks the LAZY semantics. I explained this to
> > > you before and this thing needs to be tied on the LAZY bit otherwise a
> > > SCHED_OTHER task can prevent a real-time task from running, which is
> > > fundamentally wrong.
> > 
> > So here we disagree, I don't want this tied to LAZY.
> > 
> > SCHED_OTHER can already inhibit a RT task from getting ran by doing a
> > syscall, this syscall will have non-preemptible sections and the RT task
> > will get delayed.
> > 
> > I very much want this thing to be limited to a time frame where a
> > userspace critical section (this thing) is smaller than such a kernel
> > critical section.
> > 
> > That is, there should be no observable difference between the effects of
> > this new thing and a syscall doing preempt_disable().
> > 
> > 
> > That said; the reason I don't want this tied to LAZY is that RT itself
> > is not subject to LAZY and this then means that RT threads cannot make
> > use of this new facility, whereas I think it makes perfect sense for
> > them to use this.
> 
> Thinking out loud: I know we are trying to keep the overhead to a
> minimum but is it acceptable to go through with schedule() and decide
> on extending the time slice in pick_next_task_fair() / pick_task_rt()?
> 
> Then, a higher priority task can always preempt us when preemption is
> enabled and between the tasks of same class, it is just a redundant
> schedule() loop.
> 
> It'll require some additional care to start accounting for delay from
> the time when NEED_RESCHED was set and not when schedule() is actually
> called but would the overhead be that bad?

Probably not -- if care was taken to make sure all callers have an
up-to-date rq->clock (many will have today, some might need updating).
Then it's just a matter of saving a copy.

Basically stick assert_clock_updated() in __resched_curr() and make all
the splats go away.
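
I.e. something like this (placement illustrative, assuming the current
__resched_curr() signature in kernel/sched/core.c):

static void __resched_curr(struct rq *rq, int tif)
{
	/* Catch callers that resched with a stale rq->clock. */
	assert_clock_updated(rq);
	...
}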

> Or would we like to prevent preemption from RT tasks too on
> !PREMMPT_RT since whatever the task asking for the extended slice is
> doing is considered important enough?

I'm not sure I see the need for this complication -- under the premise
that the duration is strictly limited to less than what syscalls can
already inflict upon us, there should be no observable difference in
worst case timing.

But yes, if this makes some people feel better, then I suppose I can
look at this.