[PATCH V7 01/11] sched: Scheduler time slice extension

Posted by Prakash Sangappa 2 months, 1 week ago
Add support for a thread to request extending its execution time slice on
the cpu. The extra cpu time granted allows the thread to complete the
critical section and drop any locks without getting preempted. The thread
requests this extension by setting a bit in the restartable sequences
(rseq) structure registered with the kernel.

The kernel grants a 30us extension on the cpu when it sees the bit set.
With the help of a timer, the kernel force-preempts the thread if it is
still running when the 30us expire. The thread should yield the cpu by
making a system call after completing the critical section.
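For illustration, user space usage could look like this (a sketch; the
__rseq_abi TLS variable standing for the registered rseq area is an
assumption of the example, not something this patch provides):

	#include <sched.h>
	#include <linux/rseq.h>

	extern __thread struct rseq __rseq_abi;	/* hypothetical */

	void critical_section(void)
	{
		/* Request a time slice extension before entering */
		__atomic_or_fetch(&__rseq_abi.flags,
				  RSEQ_CS_FLAG_DELAY_RESCHED, __ATOMIC_RELAXED);

		/* ... take lock, do the work, drop lock ... */

		/* The kernel clears the bit when it grants the extension */
		if (!(__atomic_fetch_and(&__rseq_abi.flags,
					 ~RSEQ_CS_FLAG_DELAY_RESCHED,
					 __ATOMIC_RELAXED) & RSEQ_CS_FLAG_DELAY_RESCHED))
			sched_yield();	/* granted: relinquish the cpu */
	}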

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
---
 include/linux/entry-common.h | 14 +++++++++----
 include/linux/sched.h        | 28 ++++++++++++++++++++++++++
 include/uapi/linux/rseq.h    |  7 +++++++
 init/Kconfig                 |  7 +++++++
 kernel/entry/common.c        | 28 ++++++++++++++++++++++----
 kernel/rseq.c                | 38 ++++++++++++++++++++++++++++++++++++
 kernel/sched/core.c          | 15 ++++++++++++++
 kernel/sched/syscalls.c      |  4 ++++
 8 files changed, 133 insertions(+), 8 deletions(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index f94f3fdf15fc..7b258d2510f8 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -304,7 +304,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
  * exit_to_user_mode_loop - do any pending work before leaving to user space
  */
 unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-				     unsigned long ti_work);
+				     unsigned long ti_work, bool irq);
 
 /**
  * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
@@ -316,7 +316,7 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
  *    EXIT_TO_USER_MODE_WORK are set
  * 4) check that interrupts are still disabled
  */
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs, bool irq)
 {
 	unsigned long ti_work;
 
@@ -327,7 +327,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
 
 	ti_work = read_thread_flags();
 	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
-		ti_work = exit_to_user_mode_loop(regs, ti_work);
+		ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
+
+	if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY) && irq)
+		rseq_delay_resched_arm_timer();
 
 	arch_exit_to_user_mode_prepare(regs, ti_work);
 
@@ -396,6 +399,9 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
 
 	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
 
+	/* Reschedule if scheduler time delay was granted */
+	rseq_delay_set_need_resched();
+
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
 		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
 			local_irq_enable();
@@ -411,7 +417,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
 	if (unlikely(work & SYSCALL_WORK_EXIT))
 		syscall_exit_work(regs, work);
 	local_irq_disable_exit_to_user();
-	exit_to_user_mode_prepare(regs);
+	exit_to_user_mode_prepare(regs, false);
 }
 
 /**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5bcf44ae6c79..5d2819afd481 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -338,6 +338,7 @@ extern int __must_check io_schedule_prepare(void);
 extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
+extern void hrtick_local_start(u64 delay);
 
 /* wrapper function to trace from this header file */
 DECLARE_TRACEPOINT(sched_set_state_tp);
@@ -1048,6 +1049,7 @@ struct task_struct {
 	unsigned                        in_thrashing:1;
 #endif
 	unsigned			in_nf_duplicate:1;
+	unsigned			rseq_delay_resched:2;
 #ifdef CONFIG_PREEMPT_RT
 	struct netdev_xmit		net_xmit;
 #endif
@@ -1711,6 +1713,13 @@ static inline char task_state_to_char(struct task_struct *tsk)
 
 extern struct pid *cad_pid;
 
+/*
+ * Used in tsk->rseq_delay_resched.
+ */
+#define	RSEQ_RESCHED_DELAY_NONE		0	/* tsk->rseq not registered */
+#define	RSEQ_RESCHED_DELAY_PROBE	1	/* Evaluate tsk->rseq->flags */
+#define	RSEQ_RESCHED_DELAY_REQUESTED	2	/* Request to delay reschedule successful */
+
 /*
  * Per process flags
  */
@@ -2245,6 +2254,25 @@ static inline bool owner_on_cpu(struct task_struct *owner)
 unsigned long sched_cpu_util(int cpu);
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_RSEQ_RESCHED_DELAY
+extern bool __rseq_delay_resched(void);
+extern void rseq_delay_resched_arm_timer(void);
+extern void rseq_delay_resched_tick(void);
+static inline bool rseq_delay_set_need_resched(void)
+{
+	if (current->rseq_delay_resched == RSEQ_RESCHED_DELAY_REQUESTED) {
+		set_tsk_need_resched(current);
+		return true;
+	}
+	return false;
+}
+#else
+static inline bool __rseq_delay_resched(void) { return false; }
+static inline void rseq_delay_resched_arm_timer(void) { }
+static inline void rseq_delay_resched_tick(void) { }
+static inline bool rseq_delay_set_need_resched(void) { return false; }
+#endif
+
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
 extern void sched_core_fork(struct task_struct *p);
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index c233aae5eac9..25fc636b17d5 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -26,6 +26,7 @@ enum rseq_cs_flags_bit {
 	RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT	= 0,
 	RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT	= 1,
 	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT	= 2,
+	RSEQ_CS_FLAG_DELAY_RESCHED_BIT		= 3,
 };
 
 enum rseq_cs_flags {
@@ -35,6 +36,8 @@ enum rseq_cs_flags {
 		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
 	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE	=
 		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+	RSEQ_CS_FLAG_DELAY_RESCHED		=
+		(1U << RSEQ_CS_FLAG_DELAY_RESCHED_BIT),
 };
 
 /*
@@ -128,6 +131,10 @@ struct rseq {
 	 * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
 	 *     Inhibit instruction sequence block restart on migration for
 	 *     this thread.
+	 * - RSEQ_CS_FLAG_DELAY_RESCHED
+	 *     Request by the user thread to delay preemption. With the
+	 *     help of a timer, the kernel grants up to 30us of extra cpu
+	 *     time to this thread before it is rescheduled.
 	 */
 	__u32 flags;
 
diff --git a/init/Kconfig b/init/Kconfig
index ce76e913aa2b..3005abab77cf 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1130,6 +1130,13 @@ config SCHED_MM_CID
 	def_bool y
 	depends on SMP && RSEQ
 
+config RSEQ_RESCHED_DELAY
+	def_bool y
+	depends on SMP && RSEQ && SCHED_HRTICK
+	help
+	  This feature enables a thread to request extending its time slice on
+	  the cpu by delaying preemption.
+
 config UCLAMP_TASK_GROUP
 	bool "Utilization clamping per group of tasks"
 	depends on CGROUP_SCHED
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a8dd1f27417c..3d2d670980ec 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -82,13 +82,31 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
 /* Workaround to allow gradual conversion of architecture code */
 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
 
+static inline bool rseq_delay_resched(unsigned long ti_work)
+{
+	if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
+		return false;
+
+	if (unlikely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_PROBE))
+		return false;
+
+	if (!(ti_work & (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_LAZY)))
+		return false;
+
+	if (__rseq_delay_resched()) {
+		clear_tsk_need_resched(current);
+		return true;
+	}
+	return false;
+}
+
 /**
  * exit_to_user_mode_loop - do any pending work before leaving to user space
  * @regs:	Pointer to pt_regs on entry stack
  * @ti_work:	TIF work flags as read by the caller
  */
 __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-						     unsigned long ti_work)
+						     unsigned long ti_work, bool irq)
 {
 	/*
 	 * Before returning to user space ensure that all pending work
@@ -98,8 +116,10 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 
 		local_irq_enable_exit_to_user(ti_work);
 
-		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
-			schedule();
+		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
+		       if (likely(!irq || !rseq_delay_resched(ti_work)))
+			       schedule();
+		}
 
 		if (ti_work & _TIF_UPROBE)
 			uprobe_notify_resume(regs);
@@ -181,7 +201,7 @@ noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
 noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
 {
 	instrumentation_begin();
-	exit_to_user_mode_prepare(regs);
+	exit_to_user_mode_prepare(regs, true);
 	instrumentation_end();
 	exit_to_user_mode();
 }
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b7a1ec327e81..8b6af4e12142 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -448,6 +448,40 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	force_sigsegv(sig);
 }
 
+#ifdef	CONFIG_RSEQ_RESCHED_DELAY
+bool __rseq_delay_resched(void)
+{
+	struct task_struct *t = current;
+	u32 flags;
+
+	if (copy_from_user_nofault(&flags, &t->rseq->flags, sizeof(flags)))
+		return false;
+
+	if (!(flags & RSEQ_CS_FLAG_DELAY_RESCHED))
+		return false;
+
+	flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
+	if (copy_to_user_nofault(&t->rseq->flags, &flags, sizeof(flags)))
+		return false;
+
+	t->rseq_delay_resched = RSEQ_RESCHED_DELAY_REQUESTED;
+
+	return true;
+}
+
+void rseq_delay_resched_arm_timer(void)
+{
+	if (unlikely(current->rseq_delay_resched == RSEQ_RESCHED_DELAY_REQUESTED))
+		hrtick_local_start(30 * NSEC_PER_USEC);
+}
+
+void rseq_delay_resched_tick(void)
+{
+	if (current->rseq_delay_resched == RSEQ_RESCHED_DELAY_REQUESTED)
+		set_tsk_need_resched(current);
+}
+#endif /* CONFIG_RSEQ_RESCHED_DELAY */
+
 #ifdef CONFIG_DEBUG_RSEQ
 
 /*
@@ -493,6 +527,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
 		current->rseq = NULL;
 		current->rseq_sig = 0;
 		current->rseq_len = 0;
+		if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
+			current->rseq_delay_resched = RSEQ_RESCHED_DELAY_NONE;
 		return 0;
 	}
 
@@ -561,6 +597,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
 	current->rseq = rseq;
 	current->rseq_len = rseq_len;
 	current->rseq_sig = sig;
+	if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
+		current->rseq_delay_resched = RSEQ_RESCHED_DELAY_PROBE;
 
 	/*
 	 * If rseq was previously inactive, and has just been
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4ad7cf3cfdca..e75ecbb2c1f7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -845,6 +845,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 
 	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 
+	rseq_delay_resched_tick();
+
 	rq_lock(rq, &rf);
 	update_rq_clock(rq);
 	rq->donor->sched_class->task_tick(rq, rq->curr, 1);
@@ -918,6 +920,16 @@ void hrtick_start(struct rq *rq, u64 delay)
 
 #endif /* CONFIG_SMP */
 
+void hrtick_local_start(u64 delay)
+{
+	struct rq *rq = this_rq();
+	struct rq_flags rf;
+
+	rq_lock(rq, &rf);
+	hrtick_start(rq, delay);
+	rq_unlock(rq, &rf);
+}
+
 static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
@@ -6740,6 +6752,9 @@ static void __sched notrace __schedule(int sched_mode)
 picked:
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
+	if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY) &&
+	    prev->rseq_delay_resched == RSEQ_RESCHED_DELAY_REQUESTED)
+		prev->rseq_delay_resched = RSEQ_RESCHED_DELAY_PROBE;
 	rq->last_seen_need_resched_ns = 0;
 
 	is_switch = prev != next;
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index ee5641757838..e684a77ed1fb 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1379,6 +1379,10 @@ static void do_sched_yield(void)
  */
 SYSCALL_DEFINE0(sched_yield)
 {
+	/* Reschedule if scheduler time delay was granted */
+	if (rseq_delay_set_need_resched())
+		return 0;
+
 	do_sched_yield();
 	return 0;
 }
-- 
2.43.5
Re: [PATCH V7 01/11] sched: Scheduler time slice extension
Posted by Thomas Gleixner 2 months ago
On Thu, Jul 24 2025 at 16:16, Prakash Sangappa wrote:
> @@ -304,7 +304,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
>   * exit_to_user_mode_loop - do any pending work before leaving to user space
>   */
>  unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> -				     unsigned long ti_work);
> +				     unsigned long ti_work, bool irq);

I know the kernel-doc already lacks the description for the existing
arguments, but adding more undocumented ones is not the right thing
either.

Also please name this argument 'from_irq' to make it clear what this is
about.

>  /**
>   * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
> @@ -316,7 +316,7 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>   *    EXIT_TO_USER_MODE_WORK are set
>   * 4) check that interrupts are still disabled
>   */
> -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
> +static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs, bool irq)

New argument not documented in kernel-doc.

>  {
>  	unsigned long ti_work;
>  
> @@ -327,7 +327,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
>  
>  	ti_work = read_thread_flags();
>  	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> -		ti_work = exit_to_user_mode_loop(regs, ti_work);
> +		ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
> +
> +	if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY) && irq)
> +		rseq_delay_resched_arm_timer();

This is still an unconditional function call which is a NOOP for
everyone who does not use this. It's not that hard to inline the
check. How often do I have to explain that?
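E.g., a sketch which keeps the task state check inline, so tasks which do
not use this pay only for a load and a compare:

        static __always_inline void rseq_delay_resched_arm_timer(void)
        {
                if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY) &&
                    unlikely(current->rseq_delay_resched == RSEQ_RESCHED_DELAY_REQUESTED))
                        hrtick_local_start(30 * NSEC_PER_USEC);
        }

so the call site reduces to "if (irq) rseq_delay_resched_arm_timer();".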

>  	arch_exit_to_user_mode_prepare(regs, ti_work);
>  
> @@ -396,6 +399,9 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>  
>  	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
>  
> +	/* Reschedule if scheduler time delay was granted */

This is not rescheduling. It sets NEED_RESCHED, which is a completely
different thing.

> +	rseq_delay_set_need_resched();

I fundamentally hate this hack as it goes out to user space with
NEED_RESCHED set and absolutely zero debug mechanism which validates
it. Currently going out with NEED_RESCHED set is a plain bug, rightfully
so.

But now this muck comes along and sets the flag, which is semantically
just wrong and ill defined.

The point is that NEED_RESCHED has been cleared by requesting and
granting the extension, which means the task can go out to userspace,
until it either relinquishes the CPU or hrtick() whacks it over the
head.

And your muck requires this insane hack with sched_yield():

>  SYSCALL_DEFINE0(sched_yield)
>  {
> +	/* Reschedule if scheduler time delay was granted */
> +	if (rseq_delay_set_need_resched())
> +		return 0;
> +
>  	do_sched_yield();
>  	return 0;
>  }

That's just completely wrong. Relinquishing the CPU should be possible
via any arbitrary syscall and should not require making sched_yield()
more ill-defined than it already is.

The obvious way to solve both issues is to clear NEED_RESCHED when
the delay is granted and then do in syscall_enter_from_user_mode_work()

        rseq_delay_sys_enter()
        {
                if (unlikely(current->rseq_delay_resched == GRANTED)) {
                        set_tsk_need_resched(current);
                        schedule();
                }
        }

No?
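Fleshed out with the config guard, a sketch of that entry hook could look
like this (names as in the patch, GRANTED as used above):

        static __always_inline void rseq_delay_sys_enter(void)
        {
                if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY) &&
                    unlikely(current->rseq_delay_resched == GRANTED)) {
                        set_tsk_need_resched(current);
                        /* __schedule() resets the grant state */
                        schedule();
                }
        }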

It's debatable whether the schedule() there is necessary. Removing it
would allow the task to either complete the syscall and reschedule on
exit to user space or go to sleep in the syscall. But that's a trivial
detail.

The important point is that the NEED_RESCHED semantics stay sane and the
problem is solved right on the next syscall entry.

This delay is not for extending CPU time across syscalls, it's solely
to allow user space to complete a _user space_ critical
section. Everything else is just wrong and we don't implement it as an
invitation for abuse.

For the record: I used GRANTED on purpose, because REQUESTED is
bogus. At the point where __rseq_delay_resched() is invoked _AND_
observes the user space request, it grants the delay, no?

This random nomenclature is just making this stuff annoyingly hard to
follow.

> +static inline bool rseq_delay_resched(unsigned long ti_work)
> +{
> +	if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
> +		return false;
> +
> +	if (unlikely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_PROBE))
> +		return false;

Why unlikely? The majority of applications do not use this.

> +
> +	if (!(ti_work & (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_LAZY)))
> +		return false;

The caller already established that one of these flags is set, no?

> +	if (__rseq_delay_resched()) {
> +		clear_tsk_need_resched(current);

Why has this to be inline and is not done in __rseq_delay_resched()?

> +		return true;
> +	}
> +	return false;

>  /**
>   * exit_to_user_mode_loop - do any pending work before leaving to user space
>   * @regs:	Pointer to pt_regs on entry stack
>   * @ti_work:	TIF work flags as read by the caller
>   */
>  __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> -						     unsigned long ti_work)
> +						     unsigned long ti_work, bool irq)
>  {

Same comments as above.

> +#ifdef	CONFIG_RSEQ_RESCHED_DELAY
> +bool __rseq_delay_resched(void)
> +{
> +	struct task_struct *t = current;
> +	u32 flags;
> +
> +	if (copy_from_user_nofault(&flags, &t->rseq->flags, sizeof(flags)))
> +		return false;
> +
> +	if (!(flags & RSEQ_CS_FLAG_DELAY_RESCHED))
> +		return false;
> +
> +	flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
> +	if (copy_to_user_nofault(&t->rseq->flags, &flags, sizeof(flags)))
> +		return false;
> +
> +	t->rseq_delay_resched = RSEQ_RESCHED_DELAY_REQUESTED;
> +
> +	return true;
> +}
> +
> +void rseq_delay_resched_arm_timer(void)
> +{
> +	if (unlikely(current->rseq_delay_resched == RSEQ_RESCHED_DELAY_REQUESTED))
> +		hrtick_local_start(30 * NSEC_PER_USEC);
> +}
> +
> +void rseq_delay_resched_tick(void)
> +{
> +	if (current->rseq_delay_resched == RSEQ_RESCHED_DELAY_REQUESTED)
> +		set_tsk_need_resched(current);

Small enough to inline into hrtick() with a IS_ENABLED() guard, no?

> +}
> +#endif /* CONFIG_RSEQ_RESCHED_DELAY */
> +
>  #ifdef CONFIG_DEBUG_RSEQ
>  
>  /*
> @@ -493,6 +527,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
>  		current->rseq = NULL;
>  		current->rseq_sig = 0;
>  		current->rseq_len = 0;
> +		if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
> +			current->rseq_delay_resched = RSEQ_RESCHED_DELAY_NONE;

What's that conditional for?

t->rseq_delay_resched is unconditionally available. Your choice of
optimizing the irrelevant places is amazing.

>  		return 0;
>  	}
>  
> @@ -561,6 +597,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
>  	current->rseq = rseq;
>  	current->rseq_len = rseq_len;
>  	current->rseq_sig = sig;
> +	if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
> +		current->rseq_delay_resched = RSEQ_RESCHED_DELAY_PROBE;

Why is this done unconditionally for rseq?

So that any rseq user needs to do a function call and a copy_from_user()
just for nothing?

A task, which needs this muck, can very well opt-in for this and leave
everybody else unaffected, no?

prctl() exists for a reason and that allows even filtering out the
request to enable it if the sysadmin sets up filters accordingly.

As code which wants to utilize this has to be modified anyway, adding
the prctl() is not an unreasonable requirement.

>  	clear_preempt_need_resched();
> +	if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY) &&
> +	    prev->rseq_delay_resched == RSEQ_RESCHED_DELAY_REQUESTED)
> +		prev->rseq_delay_resched = RSEQ_RESCHED_DELAY_PROBE;

Yet another code conditional for no reason. These are two bits and you can
use them smart:

#define ENABLED		1
#define GRANTED		3

So you can just go and do

   prev->rseq_delay_resched &= RSEQ_RESCHED_DELAY_ENABLED;

which clears the GRANTED bit without a conditional and that's correct
whether the ENABLED bit was set or not.

In the syscall exit path you then do:

static inline bool rseq_delay_resched(void)
{
	if (current->rseq_delay_resched != ENABLED)
		return false;
	return __rseq_delay_resched();
}

and __rseq_delay_resched() does:

    rseq_delay_resched = GRANTED;

No?
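Putting it together, the lifecycle of the field would roughly be (a
sketch, with the names suggested above):

        #define RSEQ_RESCHED_DELAY_ENABLED	1	/* opted in */
        #define RSEQ_RESCHED_DELAY_GRANTED	3	/* ENABLED | grant bit */

        /* __rseq_delay_resched(), on observing the user space request: */
        current->rseq_delay_resched = RSEQ_RESCHED_DELAY_GRANTED;

        /* __schedule(): GRANTED contains the ENABLED bit, so this drops
         * a grant without a branch and keeps the opt-in intact: */
        prev->rseq_delay_resched &= RSEQ_RESCHED_DELAY_ENABLED;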

Thanks,

        tglx
Re: [PATCH V7 01/11] sched: Scheduler time slice extension
Posted by Prakash Sangappa 1 month, 4 weeks ago

> On Aug 6, 2025, at 1:34 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> On Thu, Jul 24 2025 at 16:16, Prakash Sangappa wrote:
>> @@ -304,7 +304,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
>>  * exit_to_user_mode_loop - do any pending work before leaving to user space
>>  */
>> unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>> -     unsigned long ti_work);
>> +     unsigned long ti_work, bool irq);
> 
> I know the kernel-doc already lacks the description for the existing
> arguments, but adding more undocumented ones is not the right thing
> either.
> 
> Also please name this argument 'from_irq' to make it clear what this is
> about.

Ok, will change it.

> 
>> /**
>>  * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
>> @@ -316,7 +316,7 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>  *    EXIT_TO_USER_MODE_WORK are set
>>  * 4) check that interrupts are still disabled
>>  */
>> -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
>> +static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs, bool irq)
> 
> New argument not documented in kernel-doc.

Will add necessary documentation.

> 
>> {
>> unsigned long ti_work;
>> 
>> @@ -327,7 +327,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
>> 
>> ti_work = read_thread_flags();
>> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
>> - ti_work = exit_to_user_mode_loop(regs, ti_work);
>> + ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
>> +
>> + if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY) && irq)
>> + rseq_delay_resched_arm_timer();
> 
> This is still an unconditional function call which is a NOOP for
> everyone who does not use this. It's not that hard to inline the
> check. How often do I have to explain that?

Will fix.

> 
>> arch_exit_to_user_mode_prepare(regs, ti_work);
>> 
>> @@ -396,6 +399,9 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>> 
>> CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
>> 
>> + /* Reschedule if scheduler time delay was granted */
> 
> This is not rescheduling. It sets NEED_RESCHED, which is a completely
> different thing.
> 
>> + rseq_delay_set_need_resched();
> 
> I fundamentally hate this hack as it goes out to user space with
> NEED_RESCHED set and absolutely zero debug mechanism which validates
> it. Currently going out with NEED_RESCHED set is a plain bug, rigthfully
> so.
> 
> But now this muck comes along and sets the flag, which is semantically
> just wrong and ill defined.
> 
> The point is that NEED_RESCHED has been cleared by requesting and
> granting the extension, which means the task can go out to userspace,
> until it either relinquishes the CPU or hrtick() whacks it over the
> head.
> 
> And your muck requires this insane hack with sched_yield():
> 
>> SYSCALL_DEFINE0(sched_yield)
>> {
>> + /* Reschedule if scheduler time delay was granted */
>> + if (rseq_delay_set_need_resched())
>> + return 0;
>> +
>> do_sched_yield();
>> return 0;
>> }
> 
> That's just completely wrong. Relinquishing the CPU should be possible
> by any arbitrary syscall and not require to make sched_yield() more
> ill-defined as it is already.
> 
> The obvious way to solve both issues is to clear NEED_RESCHED when
> the delay is granted and then do in syscall_enter_from_user_mode_work()
> 
>        rseq_delay_sys_enter()
>        {
>                if (unlikely(current->rseq_delay_resched == GRANTED)) {
>                        set_tsk_need_resched(current);
>                        schedule();
>                }
>        }
> 
> No?
> 
> It's debatable whether the schedule() there is necessary. Removing it
> would allow the task to either complete the syscall and reschedule on
> exit to user space or go to sleep in the syscall. But that's a trivial
> detail.
> 
> The important point is that the NEED_RESCHED semantics stay sane and the
> problem is solved right on the next syscall entry.
> 
> This delay is not for extending CPU time accross syscalls, it's solely
> to allow user space to complete a _user space_ critical
> section. Everything else is just wrong and we don't implement it as an
> invitation for abuse.
> 
> For the record: I used GRANTED on purpose, because REQUESTED is
> bogus. At the point where __rseq_delay_resched() is invoked _AND_
> observes the user space request, it grants the delay, no?
> 
> This random nomenclature is just making this stuff annoyingly hard to
> follow.
> 

Ok, I can move the check to relinquish the cpu into
syscall_enter_from_user_mode_work() instead of
syscall_exit_to_user_mode_work().


>> +static inline bool rseq_delay_resched(unsigned long ti_work)
>> +{
>> + if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
>> + return false;
>> +
>> + if (unlikely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_PROBE))
>> + return false;
> 
> Why unlikely? The majority of applications do not use this.

Will change to likely().

> 
>> +
>> + if (!(ti_work & (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_LAZY)))
>> + return false;
> 
> The caller already established that one of these flags is set, no?

That is right, will delete this check here. 

> 
>> + if (__rseq_delay_resched()) {
>> + clear_tsk_need_resched(current);
> 
> Why has this to be inline and is not done in __rseq_delay_resched()?

Sure, it could be in __rseq_delay_resched(). 

> 
>> + return true;
>> + }
>> + return false;
> 
>> /**
>>  * exit_to_user_mode_loop - do any pending work before leaving to user space
>>  * @regs: Pointer to pt_regs on entry stack
>>  * @ti_work: TIF work flags as read by the caller
>>  */
>> __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>> -     unsigned long ti_work)
>> +     unsigned long ti_work, bool irq)
>> {
> 
> Same comments as above.
> 
>> +
>> +void rseq_delay_resched_tick(void)
>> +{
>> + if (current->rseq_delay_resched == RSEQ_RESCHED_DELAY_REQUESTED)
>> + set_tsk_need_resched(current);
> 
> Small enough to inline into hrtick() with a IS_ENABLED() guard, no?

 I can move it to hrtick() and delete the rseq_delay_resched_tick() routine.

> 
>> +}
>> +#endif /* CONFIG_RSEQ_RESCHED_DELAY */
>> +
>> #ifdef CONFIG_DEBUG_RSEQ
>> 
>> /*
>> @@ -493,6 +527,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
>> current->rseq = NULL;
>> current->rseq_sig = 0;
>> current->rseq_len = 0;
>> + if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
>> + current->rseq_delay_resched = RSEQ_RESCHED_DELAY_NONE;
> 
> What's that conditional for?
> 
> t->rseq_delay_resched is unconditionally available. Your choice of
> optimizing the irrelevant places is amazing.

Will fix.

> 
>> return 0;
>> }
>> 
>> @@ -561,6 +597,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
>> current->rseq = rseq;
>> current->rseq_len = rseq_len;
>> current->rseq_sig = sig;
>> + if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
>> + current->rseq_delay_resched = RSEQ_RESCHED_DELAY_PROBE;
> 
> Why is this done unconditionally for rseq?
> 
> So that any rseq user needs to do a function call and a copy_from_user()
> just for nothing?
> 
> A task, which needs this muck, can very well opt-in for this and leave
> everybody else unaffected, no?

Sure, that seems reasonable.

> 
> prctl() exists for a reason and that allows even filtering out the
> request to enable it if the sysadmin sets up filters accordingly.
> 
> As code which wants to utilize this has to be modified anyway, adding
> the prctl() is not a unreasonable requirement.
> 
>> clear_preempt_need_resched();
>> + if (IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY) &&
>> +    prev->rseq_delay_resched == RSEQ_RESCHED_DELAY_REQUESTED)
>> + prev->rseq_delay_resched = RSEQ_RESCHED_DELAY_PROBE;
> 
> Yet another code conditional for no reason. These are two bits and you can
> use them smart:
> 
> #define ENABLED 1
> #define GRANTED 3
> 
> So you can just go and do
> 
>   prev->rseq_delay_resched &= RSEQ_RESCHED_DELAY_ENABLED;
> 
> which clears the GRANTED bit without a conditional and that's correct
> whether the ENABLED bit was set or not.
> 
> In the syscall exit path you then do:
> 
> static inline bool rseq_delay_resched(void)
> {
>   if (prev->rseq_delay_resched != ENABLED)
>    return false;
>   return __rseq_delay_resched();
> }
> 
> and __rseq_delay_resched() does:
> 
>    rseq_delay_resched = GRANTED;
> 
> No?

That is nice. I can add a prctl() call to enable & disable this
functionality, which would help avoid unnecessary copy_from_user() calls.

In addition to registering the ‘rseq’ struct, an application that needs the
functionality will have to make the prctl() call to enable it, which I think
should be reasonable.
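For example (PR_RSEQ_DELAY_RESCHED is a placeholder name for illustration,
not an existing prctl() command):

	#include <sys/prctl.h>

	/* hypothetical opt-in, after registering the rseq area with rseq(2) */
	if (prctl(PR_RSEQ_DELAY_RESCHED, 1, 0, 0, 0) < 0)
		perror("prctl");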

Thanks,
-Prakash.


> 
> Thanks,
> 
>        tglx

Re: [PATCH V7 01/11] sched: Scheduler time slice extension
Posted by Sebastian Andrzej Siewior 1 month, 4 weeks ago
On 2025-08-06 22:34:00 [+0200], Thomas Gleixner wrote:
> On Thu, Jul 24 2025 at 16:16, Prakash Sangappa wrote:
> 
> The obvious way to solve both issues is to clear NEED_RESCHED when
> the delay is granted and then do in syscall_enter_from_user_mode_work()
> 
>         rseq_delay_sys_enter()
>         {
>                 if (unlikely(current->rseq_delay_resched == GRANTED)) {
>                         set_tsk_need_resched(current);
>                         schedule();
>                 }
>         }
> 
> No?
> 
> It's debatable whether the schedule() there is necessary. Removing it
> would allow the task to either complete the syscall and reschedule on
> exit to user space or go to sleep in the syscall. But that's a trivial
> detail.

Either schedule() or setting NEED_RESCHED is enough.

> The important point is that the NEED_RESCHED semantics stay sane and the
> problem is solved right on the next syscall entry.
> 
…
> > +static inline bool rseq_delay_resched(unsigned long ti_work)
> > +{
> > +	if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
> > +		return false;
> > +
> > +	if (unlikely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_PROBE))

The functions and the task_struct member field share the same name.

> > +		return false;
> 
> Why unlikely? The majority of applications do not use this.
> 
> > +
> > +	if (!(ti_work & (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_LAZY)))
> > +		return false;
> 
> The caller already established that one of these flags is set, no?

correct, and since one of them is set, this never evaluates to false.

> > +	if (__rseq_delay_resched()) {
> > +		clear_tsk_need_resched(current);
> 
> Why has this to be inline and is not done in __rseq_delay_resched()?

A SCHED_OTHER wake up sets _TIF_NEED_RESCHED_LAZY, so
clear_tsk_need_resched() will revoke it when granting an extension.

The RT/DL wake up will set _TIF_NEED_RESCHED and
clear_tsk_need_resched() will also clear it. However this one
additionally sets set_preempt_need_resched() so the next preempt
disable/ enable combo will lead to a scheduling event. A remote wakeup
will trigger an IPI (scheduler_ipi()) which also does
set_preempt_need_resched().

If I understand this correctly, then an RT/DL wake up while the task is in
kernel mode should lead to a scheduling event, assuming we pass a
spinlock_t (ignoring the irq argument).
Should the task be in user-mode then we return to user mode with the TIF
flag cleared and the NEED-RESCHED flag folded into the preemption
counter.

I am once again asking to limit this to _TIF_NEED_RESCHED_LAZY.
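I.e., grant the extension only when the lazy flag alone is pending,
roughly:

	/* in rseq_delay_resched(): only the lazy variant is eligible */
	if (!(ti_work & _TIF_NEED_RESCHED_LAZY) || (ti_work & _TIF_NEED_RESCHED))
		return false;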

> > +		return true;
> > +	}
> > +	return false;
> 

…

> Thanks,
> 
>         tglx

Sebastian
Re: [PATCH V7 01/11] sched: Scheduler time slice extension
Posted by Prakash Sangappa 1 month, 4 weeks ago

> On Aug 7, 2025, at 8:49 AM, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> On 2025-08-06 22:34:00 [+0200], Thomas Gleixner wrote:
>> On Thu, Jul 24 2025 at 16:16, Prakash Sangappa wrote:
>> 
>> The obvious way to solve both issues is to clear NEED_RESCHED when
>> the delay is granted and then do in syscall_enter_from_user_mode_work()
>> 
>>        rseq_delay_sys_enter()
>>        {
>>                if (unlikely(current->rseq_delay_resched == GRANTED)) {
>>                        set_tsk_need_resched(current);
>>                        schedule();
>>                }
>>        }
>> 
>> No?
>> 
>> It's debatable whether the schedule() there is necessary. Removing it
>> would allow the task to either complete the syscall and reschedule on
>> exit to user space or go to sleep in the syscall. But that's a trivial
>> detail.
> 
> Either schedule() or setting NEED_RESCHED is enough.
> 
>> The important point is that the NEED_RESCHED semantics stay sane and the
>> problem is solved right on the next syscall entry.
>> 
> …
>>> +static inline bool rseq_delay_resched(unsigned long ti_work)
>>> +{
>>> + if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
>>> + return false;
>>> +
>>> + if (unlikely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_PROBE))
> 
> The functions and the task_struct member field share the same.

I can look at modifying names of the functions. 

> 
>>> + return false;
>> 
>> Why unlikely? The majority of applications do not use this.
>> 
>>> +
>>> + if (!(ti_work & (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_LAZY)))
>>> + return false;
>> 
>> The caller already established that one of these flags is set, no?
> 
> correct, and if they are set, this never gets to false.

Will fix it.

> 
>>> + if (__rseq_delay_resched()) {
>>> + clear_tsk_need_resched(current);
>> 
>> Why has this to be inline and is not done in __rseq_delay_resched()?
> 
> A SCHED_OTHER wake up sets _TIF_NEED_RESCHED_LAZY so
> clear_tsk_need_resched() will revoke this granting an extension.
> 
> The RT/DL wake up will set _TIF_NEED_RESCHED and
> clear_tsk_need_resched() will also clear it. However this one
> additionally sets set_preempt_need_resched() so the next preempt
> disable/ enable combo will lead to a scheduling event. A remote wakeup
> will trigger an IPI (scheduler_ipi()) which also does
> set_preempt_need_resched().
> 
> If I understand this correct then a RT/DL wake up while the task is in
> kernel-mode should lead to a scheduling event assuming we pass a
> spinlock_t (ignoring the irq argument).
> Should the task be in user-mode then we return to user mode with the TIF
> flag cleared and the NEED-RESCHED flag folded into the preemption
> counter.
> 
> I am once again asking to limit this to _TIF_NEED_RESCHED_LAZY.

Would the proposal (patches 7-11) to have an API/mechanism, as Thomas
suggested, for RT threads to indicate that they should not be delayed
address the concern? There is also the proposal to have a kernel parameter
to disable delaying RT threads in general when granting extra time to the
running task.

Thanks,
-Prakash

> 
>>> + return true;
>>> + }
>>> + return false;
>> 
> 
> …
> 
>> Thanks,
>> 
>>        tglx
> 
> Sebastian

Re: [PATCH V7 01/11] sched: Scheduler time slice extension
Posted by Sebastian Andrzej Siewior 1 month, 4 weeks ago
On 2025-08-07 16:56:33 [+0000], Prakash Sangappa wrote:
> >>> + if (__rseq_delay_resched()) {
> >>> + clear_tsk_need_resched(current);
> >> 
> >> Why has this to be inline and is not done in __rseq_delay_resched()?
> > 
> > A SCHED_OTHER wake up sets _TIF_NEED_RESCHED_LAZY so
> > clear_tsk_need_resched() will revoke this granting an extension.
> > 
> > The RT/DL wake up will set _TIF_NEED_RESCHED and
> > clear_tsk_need_resched() will also clear it. However this one
> > additionally sets set_preempt_need_resched() so the next preempt
> > disable/ enable combo will lead to a scheduling event. A remote wakeup
> > will trigger an IPI (scheduler_ipi()) which also does
> > set_preempt_need_resched().
> > 
> > If I understand this correct then a RT/DL wake up while the task is in
> > kernel-mode should lead to a scheduling event assuming we pass a
> > spinlock_t (ignoring the irq argument).
> > Should the task be in user-mode then we return to user mode with the TIF
> > flag cleared and the NEED-RESCHED flag folded into the preemption
> > counter.
> > 
> > I am once again asking to limit this to _TIF_NEED_RESCHED_LAZY.
> 
> Would the proposal(patches 7-11) to have an API/Mechanism, as Thomas suggested,
> for RT threads to indicate not to be delayed address the concern?.  
> Also there is the proposal to have a kernel parameter to disable delaying 
> RT threads in general, when granting extra time to the running task.

While I appreciate the effort I don't see the need for this
functionality atm. I would say just get the basic infrastructure
focusing on LAZY preempt and ignore the wakes for tasks with elevated
priority. If this works reliably and people indeed ask for delayed
wakes for RT threads then this can be added assuming you have enough
flexibility in the API to allow it. Then you would also have a use-case
on how to implement it.

Looking at 07/11, you set task_sched::sched_nodelay if this is
requested. In 09/11 you set TIF_NEED_RESCHED_NODELAY if that flag is
set. In 08/11 you use that flag additionally for wake ups and propagate
it for the architecture. Phew.
If a task needs to set this flag first in order to be excluded from the
delayed wake ups then I don't see how this can work for kernel threads
such as the threaded interrupts or a user thread which is PI-boosted and
inherits the RT priority.

On the other hand, let's assume you check and clear only
TIF_NEED_RESCHED_LAZY. Let's say people ask to extend the delayed wakes
to certain userland RT threads. Then you could add a prctl() to turn
TIF_NEED_RESCHED into TIF_NEED_RESCHED_LAZY for the "marked" threads,
saying: I don't mind if this particular thread gets delayed.
If this is needed for all threads in the system you could do a
system-wide sysctl and so on.
You would get all this without another TIF bit and tracing would keep
showing reliably a N or L flag.

> Thanks,
> -Prakash
> 
Sebastian
Re: [PATCH V7 01/11] sched: Scheduler time slice extension
Posted by Prakash Sangappa 1 month, 4 weeks ago

> On Aug 8, 2025, at 2:59 AM, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> On 2025-08-07 16:56:33 [+0000], Prakash Sangappa wrote:
>>>>> + if (__rseq_delay_resched()) {
>>>>> + clear_tsk_need_resched(current);
>>>> 
>>>> Why has this to be inline and is not done in __rseq_delay_resched()?
>>> 
>>> A SCHED_OTHER wake up sets _TIF_NEED_RESCHED_LAZY so
>>> clear_tsk_need_resched() will revoke this granting an extension.
>>> 
>>> The RT/DL wake up will set _TIF_NEED_RESCHED and
>>> clear_tsk_need_resched() will also clear it. However this one
>>> additionally sets set_preempt_need_resched() so the next preempt
>>> disable/ enable combo will lead to a scheduling event. A remote wakeup
>>> will trigger an IPI (scheduler_ipi()) which also does
>>> set_preempt_need_resched().
>>> 
>>> If I understand this correct then a RT/DL wake up while the task is in
>>> kernel-mode should lead to a scheduling event assuming we pass a
>>> spinlock_t (ignoring the irq argument).
>>> Should the task be in user-mode then we return to user mode with the TIF
>>> flag cleared and the NEED-RESCHED flag folded into the preemption
>>> counter.
>>> 
>>> I am once again asking to limit this to _TIF_NEED_RESCHED_LAZY.
>> 
>> Would the proposal(patches 7-11) to have an API/Mechanism, as Thomas suggested,
>> for RT threads to indicate not to be delayed address the concern?.  
>> Also there is the proposal to have a kernel parameter to disable delaying 
>> RT threads in general, when granting extra time to the running task.
> 
> While I appreciate the effort I don't see the need for this
> functionality atm. I would say just get the basic infrastructure
> focusing on LAZY preempt and ignore the wakes for tasks with elevated
> priority. If this works reliably and people indeed ask for delayed
> wakes for RT threads then this can be added assuming you have enough
> flexibility in the API to allow it. Then you would also have a use-case
> on how to implement it.
> 
> Looking at 07/11, you set a task_sched::sched_nodelay if this is
> requested. In 09/11 you set TIF_NEED_RESCHED_NODELAY if that flag is
> set. In 08/11 you use that flag additionally for wake ups and propagate
> it for the architecture. Puh.
> If a task needs to set this flag first in order to be excluded from the
> delayed wake ups then I don't see how this can work for kernel threads
> such as the threaded interrupts or a user thread which is PI-boosted and
> inherits the RT priority.
> 
> On the other hand lets assume you check and clear only
> TIF_NEED_RESCHED_LAZY. Lets say people ask to extend the delayed wakes
> to certain userland RT threads. Then you could add a prctl() to turn
> TIF_NEED_RESCHED into TIF_NEED_RESCHED_LAZY for the "marked" threads.
> Saying I don't mind if this particular thread gets delayed.
> If this is needed for all threads in system you could do a system wide
> sysctl and so on.
> You would get all this without another TIF bit and tracing would keep
> showing reliably a N or L flag.

Ok, will drop these patches next round.

Should we just consider adding a sysctl to choose if we want to delay if
TIF_NEED_RESCHED is set?

-Prakash





> 
>> Thanks,
>> -Prakash
>> 
> Sebastian

Re: [PATCH V7 01/11] sched: Scheduler time slice extension
Posted by Sebastian Andrzej Siewior 1 month, 3 weeks ago
On 2025-08-08 17:00:27 [+0000], Prakash Sangappa wrote:
> Should we just consider adding a sysctl to to choose if we want to delay if 
> TIF_NEED_RESCHED Is set?

Please don't. Just focus on the LAZY wake up.

> -Prakash

Sebastian
Re: [PATCH V7 01/11] sched: Scheduler time slice extension
Posted by Thomas Gleixner 1 month, 4 weeks ago
On Wed, Aug 06 2025 at 22:34, Thomas Gleixner wrote:
> On Thu, Jul 24 2025 at 16:16, Prakash Sangappa wrote:
>> @@ -396,6 +399,9 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>>  
>>  	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
>>  
>> +	/* Reschedule if scheduler time delay was granted */
>
> This is not rescheduling. It sets NEED_RESCHED, which is a completely
> different thing.
>
>> +	rseq_delay_set_need_resched();
>
> I fundamentally hate this hack as it goes out to user space with
> NEED_RESCHED set and absolutely zero debug mechanism which validates
> it. Currently going out with NEED_RESCHED set is a plain bug, rigthfully
> so.
>
> But now this muck comes along and sets the flag, which is semantically
> just wrong and ill defined.
>
> The point is that NEED_RESCHED has been cleared by requesting and
> granting the extension, which means the task can go out to userspace,
> until it either relinquishes the CPU or hrtick() whacks it over the
> head.

Sorry. I misread this. It's placed before it enters the exit work loop
and not afterwards. I got lost in this maze. :(

> The obvious way to solve both issues is to clear NEED_RESCHED when
> the delay is granted and then do in syscall_enter_from_user_mode_work()
>
>         rseq_delay_sys_enter()
>         {
>                 if (unlikely(current->rseq_delay_resched == GRANTED)) {
>                         set_tsk_need_resched(current);
>                         schedule();
>                 }
>         }
>
> No?
>
> It's debatable whether the schedule() there is necessary. Removing it
> would allow the task to either complete the syscall and reschedule on
> exit to user space or go to sleep in the syscall. But that's a trivial
> detail.

But, the most important thing is that doing it at entry allows to debug
this stuff for correctness.

I can kinda see that a sched_yield() shortcut might be the right thing
to do for relinquishing the CPU, but if that's the user space contract,
then any other syscall needs to be caught and not silently papered over
at return from syscall.

Let me think about this some more.
Re: [PATCH V7 01/11] sched: Scheduler time slice extension
Posted by Prakash Sangappa 1 month, 4 weeks ago

> On Aug 7, 2025, at 7:07 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> On Wed, Aug 06 2025 at 22:34, Thomas Gleixner wrote:
>> On Thu, Jul 24 2025 at 16:16, Prakash Sangappa wrote:
>>> @@ -396,6 +399,9 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>>> 
>>> CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
>>> 
>>> + /* Reschedule if scheduler time delay was granted */
>> 
>> This is not rescheduling. It sets NEED_RESCHED, which is a completely
>> different thing.
>> 
>>> + rseq_delay_set_need_resched();
>> 
>> I fundamentally hate this hack as it goes out to user space with
>> NEED_RESCHED set and absolutely zero debug mechanism which validates
>> it. Currently going out with NEED_RESCHED set is a plain bug, rigthfully
>> so.
>> 
>> But now this muck comes along and sets the flag, which is semantically
>> just wrong and ill defined.
>> 
>> The point is that NEED_RESCHED has been cleared by requesting and
>> granting the extension, which means the task can go out to userspace,
>> until it either relinquishes the CPU or hrtick() whacks it over the
>> head.
> 
> Sorry. I misread this. It's placed before it enters the exit work loop
> and not afterwards. I got lost in this maze. :(

Yes.

> 
>> The obvious way to solve both issues is to clear NEED_RESCHED when
>> the delay is granted and then do in syscall_enter_from_user_mode_work()
>> 
>>        rseq_delay_sys_enter()
>>        {
>>                if (unlikely(current->rseq_delay_resched == GRANTED)) {
>>                        set_tsk_need_resched(current);
>>                        schedule();
>>                }
>>        }
>> 
>> No?
>> 
>> It's debatable whether the schedule() there is necessary. Removing it
>> would allow the task to either complete the syscall and reschedule on
>> exit to user space or go to sleep in the syscall. But that's a trivial
>> detail.
> 
> But, the most important thing is that doing it at entry allows to debug
> this stuff for correctness.
> 
> I can kinda see that a sched_yield() shortcut might be the right thing
> to do for relinguishing the CPU, but if that's the user space contract,
> then any other syscall needs to be caught and not silently papered over
> at return from syscall.

Sure. The check to see if the delay was GRANTED in
syscall_exit_to_user_mode_work() would catch any other system calls.

> 
> Let me think about this some more.

Sure,
We will need a recommended system call, which the application can call
to relinquish the cpu after extra cpu time was granted. sched_yield(2) seems
appropriate. The shortcut in sched_yield() was to avoid going through
do_sched_yield() when called during the extended time. If we move the
GRANTED check to syscall_enter_from_user_mode_work(), then the shortcut in
sched_yield() cannot be implemented.

Thanks,
-Prakash

