Add support for a thread to request an extension of its execution time
slice on the CPU. The extra CPU time granted helps the thread finish
executing its critical section and drop any locks without getting
preempted. The thread requests the extension by setting a bit in the
restartable sequences (rseq) structure registered with the kernel.
When the kernel sees the bit set, it grants a 50us extension on the CPU.
With the help of a timer, the kernel force-preempts the thread if it is
still running on the CPU when the 50us timer expires. The thread should
yield the CPU by making a system call after completing the critical
section.
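For reference, a minimal sketch of the intended userspace usage
(hypothetical helper names; assumes the thread has already registered
its struct rseq via sys_rseq(), and relies on the kernel clearing the
flag when the extension is granted, as this patch does):

	#include <sched.h>	/* sched_yield() */
	#include <linux/rseq.h>	/* struct rseq, RSEQ_CS_FLAG_* (with this patch) */

	/* Hypothetical userspace helpers wrapping a critical section. */
	static inline void cs_enter(struct rseq *rs)
	{
		/* Ask for extra time if preemption hits inside the CS. */
		rs->flags |= RSEQ_CS_FLAG_DELAY_RESCHED;
	}

	static inline void cs_exit(struct rseq *rs)
	{
		if (rs->flags & RSEQ_CS_FLAG_DELAY_RESCHED) {
			/* Extension was not granted: just clear our request. */
			rs->flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
		} else {
			/* Extension was granted: give the CPU back now. */
			sched_yield();
		}
	}

Plain (non-atomic) updates suffice here since rs->flags is written only
by the owning thread and, on its behalf, by the kernel while the thread
is not executing in userspace.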
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
---
v2:
- Add check in syscall_exit_to_user_mode_prepare() and reschedule if
thread has 'rseq_sched_delay' set.
---
include/linux/entry-common.h | 11 +++++--
include/linux/sched.h | 18 ++++++++++++
include/uapi/linux/rseq.h | 5 ++++
kernel/entry/common.c | 21 ++++++++++----
kernel/rseq.c | 56 ++++++++++++++++++++++++++++++++++++
kernel/sched/core.c | 16 +++++++++++
kernel/sched/syscalls.c | 7 +++++
7 files changed, 126 insertions(+), 8 deletions(-)
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index fc61d0205c97..cec343f95210 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -303,7 +303,8 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
* exit_to_user_mode_loop - do any pending work before leaving to user space
*/
unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work);
+ unsigned long ti_work,
+ bool irq);
/**
* exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
@@ -315,7 +316,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* EXIT_TO_USER_MODE_WORK are set
* 4) check that interrupts are still disabled
*/
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs,
+ bool irq)
{
unsigned long ti_work;
@@ -326,7 +328,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
ti_work = read_thread_flags();
if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
- ti_work = exit_to_user_mode_loop(regs, ti_work);
+ ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
+
+ if (irq)
+ rseq_delay_resched_fini();
arch_exit_to_user_mode_prepare(regs, ti_work);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 599f077b8019..75abe260de72 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -326,6 +326,7 @@ extern int __must_check io_schedule_prepare(void);
extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);
+extern void hrtick_local_start(u64 delay);
/**
* struct prev_cputime - snapshot of system and user cputime
@@ -930,6 +931,9 @@ struct task_struct {
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
#endif
+#ifdef CONFIG_RSEQ
+ unsigned rseq_sched_delay:1;
+#endif
struct mm_struct *mm;
struct mm_struct *active_mm;
@@ -2221,6 +2225,20 @@ static inline bool owner_on_cpu(struct task_struct *owner)
unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */
+#ifdef CONFIG_RSEQ
+
+extern bool rseq_delay_resched(void);
+extern void rseq_delay_resched_fini(void);
+extern void rseq_delay_resched_tick(void);
+
+#else
+
+static inline bool rseq_delay_resched(void) { return false; }
+static inline void rseq_delay_resched_fini(void) { }
+static inline void rseq_delay_resched_tick(void) { }
+
+#endif
+
#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index c233aae5eac9..ec3b45f32bc8 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -26,6 +26,7 @@ enum rseq_cs_flags_bit {
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
+ RSEQ_CS_FLAG_DELAY_RESCHED_BIT = 3,
};
enum rseq_cs_flags {
@@ -35,6 +36,8 @@ enum rseq_cs_flags {
(1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+ RSEQ_CS_FLAG_DELAY_RESCHED =
+ (1U << RSEQ_CS_FLAG_DELAY_RESCHED_BIT),
};
/*
@@ -128,6 +131,8 @@ struct rseq {
* - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
* Inhibit instruction sequence block restart on migration for
* this thread.
+ * - RSEQ_CS_DELAY_RESCHED
+ * Try delay resched...
*/
__u32 flags;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 6b7ff1bc1b9b..944027d14198 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -89,7 +89,8 @@ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
* @ti_work: TIF work flags as read by the caller
*/
__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+ unsigned long ti_work,
+ bool irq)
{
/*
* Before returning to user space ensure that all pending work
@@ -99,8 +100,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_enable_exit_to_user(ti_work);
- if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
- schedule();
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
+ if (irq && rseq_delay_resched())
+ clear_tsk_need_resched(current);
+ else
+ schedule();
+ }
if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);
@@ -188,6 +193,12 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
+#ifdef CONFIG_RSEQ
+ /* reschedule if sched delay was granted */
+ if (current->rseq_sched_delay)
+ set_tsk_need_resched(current);
+#endif
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
local_irq_enable();
@@ -208,7 +219,7 @@ static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *reg
{
syscall_exit_to_user_mode_prepare(regs);
local_irq_disable_exit_to_user();
- exit_to_user_mode_prepare(regs);
+ exit_to_user_mode_prepare(regs, false);
}
void syscall_exit_to_user_mode_work(struct pt_regs *regs)
@@ -232,7 +243,7 @@ noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
- exit_to_user_mode_prepare(regs);
+ exit_to_user_mode_prepare(regs, true);
instrumentation_end();
exit_to_user_mode();
}
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 442aba29bc4c..9f83d47253ce 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -426,6 +426,62 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
force_sigsegv(sig);
}
+bool rseq_delay_resched(void)
+{
+ struct task_struct *t = current;
+ u32 flags;
+
+ if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
+ return false;
+
+ if (!t->rseq)
+ return false;
+
+ if (t->rseq_sched_delay)
+ return false;
+
+ if (copy_from_user_nofault(&flags, &t->rseq->flags, sizeof(flags)))
+ return false;
+
+ if (!(flags & RSEQ_CS_FLAG_DELAY_RESCHED))
+ return false;
+
+ flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
+ if (copy_to_user_nofault(&t->rseq->flags, &flags, sizeof(flags)))
+ return false;
+
+ t->rseq_sched_delay = 1;
+
+ return true;
+}
+
+void rseq_delay_resched_fini(void)
+{
+#ifdef CONFIG_SCHED_HRTICK
+ extern void hrtick_local_start(u64 delay);
+ struct task_struct *t = current;
+ /*
+ * IRQs off, guaranteed to return to userspace, start timer on this CPU
+ * to limit the resched-overdraft.
+ *
+ * If your critical section is longer than 50 us you get to keep the
+ * pieces.
+ */
+ if (t->rseq_sched_delay)
+ hrtick_local_start(50 * NSEC_PER_USEC);
+#endif
+}
+
+void rseq_delay_resched_tick(void)
+{
+#ifdef CONFIG_SCHED_HRTICK
+ struct task_struct *t = current;
+
+ if (t->rseq_sched_delay)
+ set_tsk_need_resched(t);
+#endif
+}
+
#ifdef CONFIG_DEBUG_RSEQ
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 165c90ba64ea..cee50e139723 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -823,6 +823,7 @@ void update_rq_clock(struct rq *rq)
static void hrtick_clear(struct rq *rq)
{
+ rseq_delay_resched_tick();
if (hrtimer_active(&rq->hrtick_timer))
hrtimer_cancel(&rq->hrtick_timer);
}
@@ -838,6 +839,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+ rseq_delay_resched_tick();
+
rq_lock(rq, &rf);
update_rq_clock(rq);
rq->donor->sched_class->task_tick(rq, rq->curr, 1);
@@ -911,6 +914,16 @@ void hrtick_start(struct rq *rq, u64 delay)
#endif /* CONFIG_SMP */
+void hrtick_local_start(u64 delay)
+{
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ hrtick_start(rq, delay);
+ rq_unlock(rq, &rf);
+}
+
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
@@ -6718,6 +6731,9 @@ static void __sched notrace __schedule(int sched_mode)
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+#ifdef CONFIG_RSEQ
+ prev->rseq_sched_delay = 0;
+#endif
#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
#endif
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 3919f03fde57..52cc0714dd43 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1378,6 +1378,13 @@ static void do_sched_yield(void)
*/
SYSCALL_DEFINE0(sched_yield)
{
+#ifdef CONFIG_RSEQ
+ if (current->rseq_sched_delay) {
+ schedule();
+ return 0;
+ }
+#endif
+
do_sched_yield();
return 0;
}
--
2.43.5
On 2025-04-18 19:34:08 [+0000], Prakash Sangappa wrote:
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
…
> @@ -930,6 +931,9 @@ struct task_struct {
> struct plist_node pushable_tasks;
> struct rb_node pushable_dl_tasks;
> #endif
> +#ifdef CONFIG_RSEQ
> + unsigned rseq_sched_delay:1;
> +#endif
There should already be a bitfield somewhere which you could use without
the ifdef. Then you could use IS_ENABLED() if you want to save some code
when RSEQ is not enabled.
>
> struct mm_struct *mm;
> struct mm_struct *active_mm;
> --- a/include/uapi/linux/rseq.h
> +++ b/include/uapi/linux/rseq.h
…
> @@ -128,6 +131,8 @@ struct rseq {
> * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
> * Inhibit instruction sequence block restart on migration for
> * this thread.
> + * - RSEQ_CS_DELAY_RESCHED
> + * Try delay resched...
Delay resched up to $time for $kind-of-stats under $conditions.
> */
> __u32 flags;
>
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index 6b7ff1bc1b9b..944027d14198 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
…
> @@ -99,8 +100,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>
> local_irq_enable_exit_to_user(ti_work);
>
> - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
> - schedule();
> + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
couldn't we restrict this to _TIF_NEED_RESCHED_LAZY? That way we would
still schedule immediately for any SCHED_FIFO/RR/DL tasks and do this
delay only for everything else such as SCHED_OTHER/…
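A rough sketch of that idea (an assumption, not what the patch currently
does) would take the delay path only when the lazy bit alone is set:

	if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
		/*
		 * Delay only lazy rescheds; a hard NEED_RESCHED
		 * (e.g. an RT/DL wakeup) still schedules immediately.
		 */
		if (irq && !(ti_work & _TIF_NEED_RESCHED) &&
		    rseq_delay_resched())
			clear_tsk_need_resched(current);
		else
			schedule();
	}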
> + if (irq && rseq_delay_resched())
> + clear_tsk_need_resched(current);
> + else
> + schedule();
> + }
>
> if (ti_work & _TIF_UPROBE)
> uprobe_notify_resume(regs);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 165c90ba64ea..cee50e139723 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -823,6 +823,7 @@ void update_rq_clock(struct rq *rq)
>
> static void hrtick_clear(struct rq *rq)
> {
> + rseq_delay_resched_tick();
This is called from __schedule(). If you set the need-resched flag here,
it gets removed shortly after. Do I miss something?
> if (hrtimer_active(&rq->hrtick_timer))
> hrtimer_cancel(&rq->hrtick_timer);
> }
Sebastian
> On Apr 24, 2025, at 7:13 AM, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
>
> On 2025-04-18 19:34:08 [+0000], Prakash Sangappa wrote:
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
> …
>> @@ -930,6 +931,9 @@ struct task_struct {
>> struct plist_node pushable_tasks;
>> struct rb_node pushable_dl_tasks;
>> #endif
>> +#ifdef CONFIG_RSEQ
>> + unsigned rseq_sched_delay:1;
>> +#endif
>
> There should already be a bitfield somewhere which you could use without
> the ifdef. Then you could use IS_ENABLED() if you want to save some code
> when RSEQ is not enabled.
I suppose we could.
Patch 1 is pretty much what PeterZ posted, hope he will comment on it.
Could it be moved below here, call it sched_time_delay, or some variant of this name?
struct task_struct {
..
#ifdef CONFIG_TASK_DELAY_ACCT
/* delay due to memory thrashing */
unsigned in_thrashing:1;
#endif
unsigned sched_time_delay:1;
..
}
This field will be for scheduler time extension use only. Mainly updated in the context of the current thread.
Do we even need to use IS_ENABLED(CONFIG_RSEQ) to access it?
>
>>
>> struct mm_struct *mm;
>> struct mm_struct *active_mm;
>> --- a/include/uapi/linux/rseq.h
>> +++ b/include/uapi/linux/rseq.h
> …
>> @@ -128,6 +131,8 @@ struct rseq {
>> * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
>> * Inhibit instruction sequence block restart on migration for
>> * this thread.
>> + * - RSEQ_CS_DELAY_RESCHED
>> + * Try delay resched...
>
> Delay resched up to $time for $kind-of-stats under $conditions.
Will add some comment like
“Delay resched for up to 50us. Checked when the thread is about to be preempted"
With the tunable added in the subsequent patch, will change ‘50us' to the tunable name.
>
>> */
>> __u32 flags;
>>
>> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
>> index 6b7ff1bc1b9b..944027d14198 100644
>> --- a/kernel/entry/common.c
>> +++ b/kernel/entry/common.c
> …
>> @@ -99,8 +100,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>
>> local_irq_enable_exit_to_user(ti_work);
>>
>> - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
>> - schedule();
>> + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
>
> couldn't we restrict this to _TIF_NEED_RESCHED_LAZY? That way we would
> still schedule immediately for any SCHED_FIFO/RR/DL tasks and do this
> delay only for everything else such as SCHED_OTHER/…
Wasn’t this the entire discussion about whether to limit it to SCHED_OTHER or not?
Will defer it to Peter.
>
>> + if (irq && rseq_delay_resched())
>> + clear_tsk_need_resched(current);
>> + else
>> + schedule();
>> + }
>>
>> if (ti_work & _TIF_UPROBE)
>> uprobe_notify_resume(regs);
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 165c90ba64ea..cee50e139723 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -823,6 +823,7 @@ void update_rq_clock(struct rq *rq)
>>
>> static void hrtick_clear(struct rq *rq)
>> {
>> + rseq_delay_resched_tick();
>
> This is called from __schedule(). If you set the need-resched flag here,
> it gets removed shortly after. Do I miss something?
hrtick_clear() is also called when the cpu is being removed in sched_cpu_dying().
We need to set resched there?
Thanks for your comments.
-Prakash.
>
>> if (hrtimer_active(&rq->hrtick_timer))
>> hrtimer_cancel(&rq->hrtick_timer);
>> }
>
> Sebastian
On 2025-04-25 01:19:07 [+0000], Prakash Sangappa wrote:
> > On Apr 24, 2025, at 7:13 AM, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> >
> > On 2025-04-18 19:34:08 [+0000], Prakash Sangappa wrote:
> >> --- a/include/linux/sched.h
> >> +++ b/include/linux/sched.h
> > …
> >> @@ -930,6 +931,9 @@ struct task_struct {
> >> struct plist_node pushable_tasks;
> >> struct rb_node pushable_dl_tasks;
> >> #endif
> >> +#ifdef CONFIG_RSEQ
> >> + unsigned rseq_sched_delay:1;
> >> +#endif
> >
> > There should already be a bitfield somewhere which you could use without
> > the ifdef. Then you could use IS_ENABLED() if you want to save some code
> > when RSEQ is not enabled.
>
> I suppose we could.
> Patch 1 is pretty much what PeterZ posted, hope he will comment on it.
If it is "pretty much what PeterZ posted" why did he not receive any
credit for it?
> Could it be moved below here, call it sched_time_delay, or some variant of this name?
I don't mind the name. The point is to add to an existing group instead
of starting a new u32 bit field.
> struct task_struct {
> ..
> #ifdef CONFIG_TASK_DELAY_ACCT
> /* delay due to memory thrashing */
> unsigned in_thrashing:1;
> #endif
> unsigned sched_time_delay:1;
> ..
> }
>
> This field will be for scheduler time extension use only. Mainly updated in the context of the current thread.
> Do we even need to use IS_ENABLED(CONFIG_RSEQ) to access it?
Well, if you want to avoid the code in the !CONFIG_RSEQ case, then yes.
> >> struct mm_struct *mm;
> >> struct mm_struct *active_mm;
> >> --- a/include/uapi/linux/rseq.h
> >> +++ b/include/uapi/linux/rseq.h
> > …
> >> @@ -128,6 +131,8 @@ struct rseq {
> >> * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
> >> * Inhibit instruction sequence block restart on migration for
> >> * this thread.
> >> + * - RSEQ_CS_DELAY_RESCHED
> >> + * Try delay resched...
> >
> > Delay resched up to $time for $kind-of-stats under $conditions.
>
> Will add some comment like
> “Delay resched for up to 50us. Checked when the thread is about to be preempted"
>
> With the tunable added in the subsequent patch, will change ‘50us' to the tunable name.
A proper description of the flag would be nice. The current state is that
I can derive more from the constant than from the comment.
> >> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> >> index 6b7ff1bc1b9b..944027d14198 100644
> >> --- a/kernel/entry/common.c
> >> +++ b/kernel/entry/common.c
> > …
> >> @@ -99,8 +100,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> >>
> >> local_irq_enable_exit_to_user(ti_work);
> >>
> >> - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
> >> - schedule();
> >> + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
> >
> > couldn't we restrict this to _TIF_NEED_RESCHED_LAZY? That way we would
> > still schedule immediately for any SCHED_FIFO/RR/DL tasks and do this
> > delay only for everything else such as SCHED_OTHER/…
>
>
> Wasn’t this the entire discussion about whether to limit it to SCHED_OTHER or not?
> Will defer it to Peter.
Oh. But this still deserves a tracepoint for this manoeuvre. A trace
would show you a wakeup, the need-resched bit will be shown, and then it
will vanish later and people might wonder where it went.
> >
> >> + if (irq && rseq_delay_resched())
> >> + clear_tsk_need_resched(current);
> >> + else
> >> + schedule();
> >> + }
> >>
> >> if (ti_work & _TIF_UPROBE)
> >> uprobe_notify_resume(regs);
> >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> >> index 165c90ba64ea..cee50e139723 100644
> >> --- a/kernel/sched/core.c
> >> +++ b/kernel/sched/core.c
> >> @@ -823,6 +823,7 @@ void update_rq_clock(struct rq *rq)
> >>
> >> static void hrtick_clear(struct rq *rq)
> >> {
> >> + rseq_delay_resched_tick();
> >
> > This is called from __schedule(). If you set the need-resched flag here,
> > it gets removed shortly after. Do I miss something?
>
> hrtick_clear() is also called when the cpu is being removed in sched_cpu_dying().
> We need to set resched there?
Do we? My understanding is that the NEED_RESCHED flag gets removed once
and then RSEQ_CS_DELAY_RESCHED gets set. RSEQ_CS_DELAY_RESCHED in turn
gets cleared in the scheduler once task leaves the CPU. Once the task
left the CPU then there is no need to set the bit. The sched_cpu_dying()
is the HP thread so if that one is on the CPU then the user task is
gone.
How does this delay thingy work with HZ=100 vs HZ=1000? Like what is the
most you could get in extra time? I could imagine that if a second task
gets on the runqueue and you skip the wake up but the runtime is used up
then the HZ tick should set NEED_RESCHED again and the following HZ tick
should force the schedule point.
> Thanks for your comments.
> -Prakash.
Sebastian
> On Apr 24, 2025, at 11:53 PM, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
>
> On 2025-04-25 01:19:07 [+0000], Prakash Sangappa wrote:
>>> On Apr 24, 2025, at 7:13 AM, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
>>>
>>> On 2025-04-18 19:34:08 [+0000], Prakash Sangappa wrote:
>>>> --- a/include/linux/sched.h
>>>> +++ b/include/linux/sched.h
>>> …
>>>> @@ -930,6 +931,9 @@ struct task_struct {
>>>> struct plist_node pushable_tasks;
>>>> struct rb_node pushable_dl_tasks;
>>>> #endif
>>>> +#ifdef CONFIG_RSEQ
>>>> + unsigned rseq_sched_delay:1;
>>>> +#endif
>>>
>>> There should already be a bitfield somewhere which you could use without
>>> the ifdef. Then you could use IS_ENABLED() if you want to save some code
>>> when RSEQ is not enabled.
>>
>> I suppose we could.
>> Patch 1 is pretty much what PeterZ posted, hope he will comment on it.
>
> If it is "pretty much what PeterZ posted" why did he not receive any
> credit for it?
Sure, he gets credit.
I have included ’Suggested-by’ in the patch. Will change it to Signed-off-by, if that is what it should be.
>
>> Could it be moved below here, call it sched_time_delay, or some variant of this name?
>
> I don't mind the name. The point is to add to an existing group instead
> of starting a new u32 bit field.
Ok, I think this place after ‘in_thrashing' would serve that purpose.
>
>> struct task_struct {
>> ..
>> #ifdef CONFIG_TASK_DELAY_ACCT
>> /* delay due to memory thrashing */
>> unsigned in_thrashing:1;
>> #endif
>> unsigned sched_time_delay:1;
>> ..
>> }
>>
>> This field will be for scheduler time extension use only. Mainly updated in the context of the current thread.
>> Do we even need to use IS_ENABLED(CONFIG_RSEQ) to access it?
>
> Well, if you want to avoid the code in the !CONFIG_RSEQ case, then yes.
Sure, I can include an IS_ENABLED() check when accessing it.
>
>>>> struct mm_struct *mm;
>>>> struct mm_struct *active_mm;
>>>> --- a/include/uapi/linux/rseq.h
>>>> +++ b/include/uapi/linux/rseq.h
>>> …
>>>> @@ -128,6 +131,8 @@ struct rseq {
>>>> * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
>>>> * Inhibit instruction sequence block restart on migration for
>>>> * this thread.
>>>> + * - RSEQ_CS_DELAY_RESCHED
>>>> + * Try delay resched...
>>>
>>> Delay resched up to $time for $kind-of-stats under $conditions.
>>
>> Will add some comment like
>> “Delay resched for up to 50us. Checked when the thread is about to be preempted"
>>
>> With the tunable added in the subsequent patch, will change ‘50us' to the tunable name.
>
> A proper description of the flag would be nice. The current state is that
> I can derive more from the constant than from the comment.
Will modify.
>
>>>> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
>>>> index 6b7ff1bc1b9b..944027d14198 100644
>>>> --- a/kernel/entry/common.c
>>>> +++ b/kernel/entry/common.c
>>> …
>>>> @@ -99,8 +100,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>>>
>>>> local_irq_enable_exit_to_user(ti_work);
>>>>
>>>> - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
>>>> - schedule();
>>>> + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
>>>
>>> couldn't we restrict this to _TIF_NEED_RESCHED_LAZY? That way we would
>>> still schedule immediately for any SCHED_FIFO/RR/DL tasks and do this
>>> delay only for everything else such as SCHED_OTHER/…
>>
>>
>> Wasn’t this the entire discussion about whether to limit it to SCHED_OTHER or not?
>> Will defer it to Peter.
>
> Oh. But this still deserves a tracepoint for this manoeuvre. A trace
> would show you a wakeup, the need-resched bit will be shown, and then it
> will vanish later and people might wonder where it went.
I can look at adding a tracepoint here to indicate a delay was granted and the need-resched bit got cleared.
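Something along these lines, perhaps, as a hypothetical sketch (the
event name and fields here are assumptions, not from any posted patch),
called next to clear_tsk_need_resched() in exit_to_user_mode_loop():

	/* e.g. in include/trace/events/sched.h */
	TRACE_EVENT(sched_rseq_delay_granted,

		TP_PROTO(struct task_struct *t),

		TP_ARGS(t),

		TP_STRUCT__entry(
			__array(char,	comm,	TASK_COMM_LEN)
			__field(pid_t,	pid)
		),

		TP_fast_assign(
			memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
			__entry->pid = t->pid;
		),

		TP_printk("comm=%s pid=%d resched delayed, need-resched cleared",
			  __entry->comm, __entry->pid)
	);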
>
>>>
>>>> + if (irq && rseq_delay_resched())
>>>> + clear_tsk_need_resched(current);
>>>> + else
>>>> + schedule();
>>>> + }
>>>>
>>>> if (ti_work & _TIF_UPROBE)
>>>> uprobe_notify_resume(regs);
>>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>>>> index 165c90ba64ea..cee50e139723 100644
>>>> --- a/kernel/sched/core.c
>>>> +++ b/kernel/sched/core.c
>>>> @@ -823,6 +823,7 @@ void update_rq_clock(struct rq *rq)
>>>>
>>>> static void hrtick_clear(struct rq *rq)
>>>> {
>>>> + rseq_delay_resched_tick();
>>>
>>> This is called from __schedule(). If you set the need-resched flag here,
>>> it gets removed shortly after. Do I miss something?
>>
>> hrtick_clear() is also called when the cpu is being removed in sched_cpu_dying().
>> We need to set resched there?
>
> Do we? My understanding is that the NEED_RESCHED flag gets removed once
> and then RSEQ_CS_DELAY_RESCHED gets set. RSEQ_CS_DELAY_RESCHED in turn
> gets cleared in the scheduler once task leaves the CPU. Once the task
> left the CPU then there is no need to set the bit. The sched_cpu_dying()
> is the HP thread so if that one is on the CPU then the user task is
> gone.
Ok, will remove this call from hrtick_clear()
To clarify, the RSEQ_CS_DELAY_RESCHED bit is set by the user task in the rseq structure to request additional time when entering a critical section in user space.
When the kernel sees it set in exit_to_user_mode_loop(), as the task is about to be rescheduled, it clears the RSEQ_CS_DELAY_RESCHED bit in the ‘rseq’ structure, sets t->rseq_sched_delay = 1 in the task_struct, and starts the 50us hrtick timer.
When the timer fires, if the task is still running (i.e. has t->rseq_sched_delay set), the task will be rescheduled. In __schedule(), when the task leaves the cpu, it clears the t->rseq_sched_delay flag.
>
> How does this delay thingy work with HZ=100 vs HZ=1000? Like what is the
> most you could get in extra time? I could imagine that if a second task
> gets on the runqueue and you skip the wake up but the runtime is used up
> then the HZ tick should set NEED_RESCHED again and the following HZ tick
> should force the schedule point.
The most a task can get is 50us (or the tunable value from patch 2) of extra time. Yes, if the runtime expires within the 50us extension that was granted as part of a wakeup, the task will get rescheduled since NEED_RESCHED is set by the HZ tick, which could result in less than 50us of extra time. Why do you say the following HZ tick? In that case the 50us timer would reschedule.
If the 50us extension was granted at the end of the runtime due to the HZ tick, the task will get rescheduled when the 50us timer fires.
Also, if a wakeup occurs while the running task has the 50us extension (rseq_sched_delay is set), it will again get rescheduled by resched_curr() if TIF_NEED_RESCHED is set.
If the task on the cpu has t->rseq_sched_delay set, should we consider avoiding the force reschedule in the above scenarios (both runtime expiry and wakeup), as the task would be rescheduled by the 50us timer anyway?
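For illustration only, the idea in resched_curr() might look roughly
like this (purely a sketch of the question, assuming the field is
accessible without the ifdef as discussed above; it would presumably
still need to exempt RT/DL wakeups):

	/*
	 * Sketch: the armed 50us hrtick timer will force the
	 * reschedule shortly anyway, so skip the resched here.
	 */
	if (IS_ENABLED(CONFIG_RSEQ) && rq->curr->rseq_sched_delay)
		return;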
Prakash
>
>> Thanks for your comments.
>> -Prakash.
>
> Sebastian
Hello Prakash,
On 4/25/2025 6:49 AM, Prakash Sangappa wrote:
>>> static void hrtick_clear(struct rq *rq)
>>> {
>>> + rseq_delay_resched_tick();
>> This is called from __schedule(). If you set the need-resched flag here,
>> it gets removed shortly after. Do I miss something?
> hrtick_clear() is also called when the cpu is being removed in sched_cpu_dying().
> We need to set resched there?
sched_cpu_dying() is called from the cpuhp thread which will go away
once the hotplug is done and shouldn't need this. Furthermore, cpuhp
thread will not use the "rseq_sched_delay" API so removing this should
be fine.
--
Thanks and Regards,
Prateek
> On Apr 24, 2025, at 8:48 PM, K Prateek Nayak <kprateek.nayak@amd.com> wrote:
>
> Hello Prakash,
>
> On 4/25/2025 6:49 AM, Prakash Sangappa wrote:
>>>> static void hrtick_clear(struct rq *rq)
>>>> {
>>>> + rseq_delay_resched_tick();
>>> This is called from __schedule(). If you set the need-resched flag here,
>>> it gets removed shortly after. Do I miss something?
>> hrtick_clear() is also called when the cpu is being removed in sched_cpu_dying().
>> We need to set resched there?
>
> sched_cpu_dying() is called from the cpuhp thread which will go away
> once the hotplug is done and shouldn't need this. Furthermore, cpuhp
> thread will not use the "rseq_sched_delay" API so removing this should
> be fine.
Ok, I can make that change.
Thanks,
Prakash
>
> --
> Thanks and Regards,
> Prateek
>