Introduce support for a thread to request extending its execution time
on the CPU while it holds locks in user space. Add a 'sched_delay'
member to the per-thread shared mapped structure. A thread requests a
CPU execution time extension by updating the 'sched_delay' member.
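
For illustration, a minimal user-space sketch of how 'sched_delay'
might be used (the mapping of struct task_sharedinfo comes from the
shared-structure interface added earlier in this series; the setup is
elided here and the protocol is only sketched, not prescribed):

    #include <sched.h>
    #include <pthread.h>
    #include <linux/task_shared.h>

    /* Assumed to point at this thread's mapped shared structure. */
    static __thread struct task_sharedinfo *ts;

    static void locked_update(pthread_spinlock_t *lock)
    {
            ts->sched_delay = 1;            /* request extra time on the CPU */
            pthread_spin_lock(lock);
            /* ... short critical section ... */
            pthread_spin_unlock(lock);

            if (ts->sched_delay)
                    ts->sched_delay = 0;    /* no preemption was pending */
            else
                    sched_yield();          /* extension was granted; yield now */
    }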
Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
---
include/linux/entry-common.h | 10 +++++--
include/linux/sched.h | 17 +++++++++++
include/uapi/linux/task_shared.h | 2 +-
kernel/entry/common.c | 15 ++++++----
kernel/sched/core.c | 16 ++++++++++
kernel/sched/syscalls.c | 7 +++++
mm/task_shared.c | 50 ++++++++++++++++++++++++++++++++
7 files changed, 108 insertions(+), 9 deletions(-)
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 1e50cdb83ae5..904f5cdfe0b7 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -302,7 +302,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
* exit_to_user_mode_loop - do any pending work before leaving to user space
*/
unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work);
+ unsigned long ti_work, bool irq);
/**
* exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
@@ -314,7 +314,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* EXIT_TO_USER_MODE_WORK are set
* 4) check that interrupts are still disabled
*/
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs,
+ bool irq)
{
unsigned long ti_work;
@@ -325,7 +326,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
ti_work = read_thread_flags();
if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
- ti_work = exit_to_user_mode_loop(regs, ti_work);
+ ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
+
+ if (irq)
+ taskshrd_delay_resched_fini();
arch_exit_to_user_mode_prepare(regs, ti_work);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1ca7d4efa932..b53e7a878a01 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -326,6 +326,7 @@ extern int __must_check io_schedule_prepare(void);
extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);
+extern void hrtick_local_start(u64 delay);
/**
* struct prev_cputime - snapshot of system and user cputime
@@ -957,6 +958,9 @@ struct task_struct {
* ->sched_remote_wakeup gets used, so it can be in this word.
*/
unsigned sched_remote_wakeup:1;
+#ifdef CONFIG_TASKSHARED
+ unsigned taskshrd_sched_delay:1;
+#endif
#ifdef CONFIG_RT_MUTEXES
unsigned sched_rt_mutex:1;
#endif
@@ -2186,6 +2190,19 @@ static inline bool owner_on_cpu(struct task_struct *owner)
unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */
+#ifdef CONFIG_TASKSHARED
+
+extern bool taskshrd_delay_resched(void);
+extern void taskshrd_delay_resched_fini(void);
+extern void taskshrd_delay_resched_tick(void);
+#else
+
+static inline bool taskshrd_delay_resched(void) { return false; }
+static inline void taskshrd_delay_resched_fini(void) { }
+static inline void taskshrd_delay_resched_tick(void) { }
+
+#endif
+
#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
diff --git a/include/uapi/linux/task_shared.h b/include/uapi/linux/task_shared.h
index a07902c57380..6e4c664eea60 100644
--- a/include/uapi/linux/task_shared.h
+++ b/include/uapi/linux/task_shared.h
@@ -13,6 +13,6 @@
#define TASK_SHAREDINFO 1
struct task_sharedinfo {
- int version;
+ volatile unsigned short sched_delay;
};
#endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 11ec8320b59d..0e0360e8c127 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -89,7 +89,8 @@ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
* @ti_work: TIF work flags as read by the caller
*/
__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+ unsigned long ti_work,
+ bool irq)
{
/*
* Before returning to user space ensure that all pending work
@@ -99,8 +100,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_enable_exit_to_user(ti_work);
- if (ti_work & _TIF_NEED_RESCHED)
- schedule();
+ if (ti_work & _TIF_NEED_RESCHED) {
+ if (irq && taskshrd_delay_resched())
+ clear_tsk_need_resched(current);
+ else
+ schedule();
+ }
if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);
@@ -208,7 +213,7 @@ static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *reg
{
syscall_exit_to_user_mode_prepare(regs);
local_irq_disable_exit_to_user();
- exit_to_user_mode_prepare(regs);
+ exit_to_user_mode_prepare(regs, false);
}
void syscall_exit_to_user_mode_work(struct pt_regs *regs)
@@ -232,7 +237,7 @@ noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
- exit_to_user_mode_prepare(regs);
+ exit_to_user_mode_prepare(regs, true);
instrumentation_end();
exit_to_user_mode();
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 71b6396db118..713c43491403 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -815,6 +815,7 @@ void update_rq_clock(struct rq *rq)
static void hrtick_clear(struct rq *rq)
{
+ taskshrd_delay_resched_tick();
if (hrtimer_active(&rq->hrtick_timer))
hrtimer_cancel(&rq->hrtick_timer);
}
@@ -830,6 +831,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+ taskshrd_delay_resched_tick();
+
rq_lock(rq, &rf);
update_rq_clock(rq);
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
@@ -903,6 +906,16 @@ void hrtick_start(struct rq *rq, u64 delay)
#endif /* CONFIG_SMP */
+void hrtick_local_start(u64 delay)
+{
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ hrtick_start(rq, delay);
+ rq_unlock(rq, &rf);
+}
+
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
@@ -6645,6 +6658,9 @@ static void __sched notrace __schedule(int sched_mode)
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+#ifdef CONFIG_TASKSHARED
+ prev->taskshrd_sched_delay = 0;
+#endif
#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
#endif
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index d23c34b8b3eb..0904667924d8 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1419,6 +1419,13 @@ static void do_sched_yield(void)
*/
SYSCALL_DEFINE0(sched_yield)
{
+
+#ifdef CONFIG_TASKSHARED
+ if (current->taskshrd_sched_delay) {
+ schedule();
+ return 0;
+ }
+#endif
do_sched_yield();
return 0;
}
diff --git a/mm/task_shared.c b/mm/task_shared.c
index cea45d913b91..575b335d6879 100644
--- a/mm/task_shared.c
+++ b/mm/task_shared.c
@@ -268,6 +268,56 @@ static int task_ushared_alloc(void)
return ret;
}
+bool taskshrd_delay_resched(void)
+{
+ struct task_struct *t = current;
+ struct task_ushrd_struct *shrdp = t->task_ushrd;
+
+ if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
+ return false;
+
+ if(shrdp == NULL || shrdp->kaddr == NULL)
+ return false;
+
+ if (t->taskshrd_sched_delay)
+ return false;
+
+ if (!(shrdp->kaddr->ts.sched_delay))
+ return false;
+
+ shrdp->kaddr->ts.sched_delay = 0;
+ t->taskshrd_sched_delay = 1;
+
+ return true;
+}
+
+void taskshrd_delay_resched_fini(void)
+{
+#ifdef CONFIG_SCHED_HRTICK
+ struct task_struct *t = current;
+ /*
+ * IRQs off, guaranteed to return to userspace, start timer on this CPU
+ * to limit the resched-overdraft.
+ *
+ * If your critical section is longer than 50 us you get to keep the
+ * pieces.
+ */
+ if (t->taskshrd_sched_delay)
+ hrtick_local_start(50 * NSEC_PER_USEC);
+#endif
+}
+
+void taskshrd_delay_resched_tick(void)
+{
+#ifdef CONFIG_SCHED_HRTICK
+ struct task_struct *t = current;
+
+ if (t->taskshrd_sched_delay) {
+ set_tsk_need_resched(t);
+ }
+#endif
+}
+
/*
* Get Task Shared structure, allocate if needed and return mapped user address.
--
2.43.5
Hello Prakash,

Full disclaimer: I haven't looked closely at the complete series but ...

On 11/13/2024 5:31 AM, Prakash Sangappa wrote:
> [..snip..]
> @@ -99,8 +100,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>
> local_irq_enable_exit_to_user(ti_work);
>
> - if (ti_work & _TIF_NEED_RESCHED)
> - schedule();
> + if (ti_work & _TIF_NEED_RESCHED) {
> + if (irq && taskshrd_delay_resched())
> + clear_tsk_need_resched(current);

Suppose the current task had requested for a delayed resched but an RT
task's wakeup sets the TIF_NEED_RESCHED flag via an IPI, doesn't this
clear the flag indiscriminately and allow the task to run for an
extended amount of time? Am I missing something?

> + else
> + schedule();
> + }
>
> [..snip..]
>
> +void hrtick_local_start(u64 delay)
> +{
> + struct rq *rq = this_rq();
> + struct rq_flags rf;
> +
> + rq_lock(rq, &rf);

You can use guard(rq_lock)(rq) and avoid declaring rf.

> + hrtick_start(rq, delay);
> + rq_unlock(rq, &rf);
> +}
>
> [..snip..]
>
> +bool taskshrd_delay_resched(void)
> +{
> + struct task_struct *t = current;
> + struct task_ushrd_struct *shrdp = t->task_ushrd;
> +
> + if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
> + return false;
> +
> + if(shrdp == NULL || shrdp->kaddr == NULL)
> + return false;
> +
> + if (t->taskshrd_sched_delay)
> + return false;
> +
> + if (!(shrdp->kaddr->ts.sched_delay))
> + return false;
> +
> + shrdp->kaddr->ts.sched_delay = 0;
> + t->taskshrd_sched_delay = 1;
> +
> + return true;

Perhaps this needs to also check
"rq->nr_running == rq->cfs.h_nr_running" since I believe it only makes
sense for fair tasks to request that extra slice?

> [..snip..]

--
Thanks and Regards,
Prateek
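
For reference, a sketch of hrtick_local_start() using the scoped guard
Prateek suggests (assuming the rq_lock lock guard that kernel/sched
defines; behavior is unchanged, rf simply goes away):

    void hrtick_local_start(u64 delay)
    {
            struct rq *rq = this_rq();

            /* takes rq->lock here and releases it at end of scope */
            guard(rq_lock)(rq);
            hrtick_start(rq, delay);
    }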
> On Nov 12, 2024, at 7:57 PM, K Prateek Nayak <kprateek.nayak@amd.com> wrote:
>
> Hello Prakash,
>
> Full disclaimer: I haven't looked closely at the complete series but ...
>
> On 11/13/2024 5:31 AM, Prakash Sangappa wrote:
>> [..snip..]
>> @@ -99,8 +100,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>> local_irq_enable_exit_to_user(ti_work);
>> - if (ti_work & _TIF_NEED_RESCHED)
>> - schedule();
>> + if (ti_work & _TIF_NEED_RESCHED) {
>> + if (irq && taskshrd_delay_resched())
>> + clear_tsk_need_resched(current);
>
> Suppose the current task had requested for a delayed resched but an RT
> task's wakeup sets the TIF_NEED_RESCHED flag via an IPI, doesn't this
> clear the flag indiscriminately and allow the task to run for an
> extended amount of time? Am I missing something?

If the scheduler extension delay has already been granted when the IPI
from the wakeup occurs, then it would not clear the TIF_NEED_RESCHED
flag, so the task would be preempted.

[...]

>> +void hrtick_local_start(u64 delay)
>> +{
>> + struct rq *rq = this_rq();
>> + struct rq_flags rf;
>> +
>> + rq_lock(rq, &rf);
>
> You can use guard(rq_lock)(rq) and avoid declaring rf.

Will take a look and address it.

[...]

>> + shrdp->kaddr->ts.sched_delay = 0;
>> + t->taskshrd_sched_delay = 1;
>> +
>> + return true;
>
> Perhaps this needs to also check
> "rq->nr_running == rq->cfs.h_nr_running" since I believe it only makes
> sense for fair tasks to request that extra slice?

From the discussion in
https://lore.kernel.org/lkml/20231025054219.1acaa3dd@gandalf.local.home/
it was ok to have this behavior for all tasks. It could be changed to
work only for fair tasks, if there is agreement.

Thanks,
-Prakash

> --
> Thanks and Regards,
> Prateek
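
If the fair-tasks-only restriction were adopted, one rough shape of the
check (illustrative only: rq internals are private to kernel/sched, so
the test would likely live behind a small helper there, and the helper
name below is made up):

    /* kernel/sched/core.c: hypothetical helper */
    bool sched_cpu_only_fair_tasks(void)
    {
            struct rq *rq = this_rq();

            /* advisory check: everything queued on this CPU is a CFS task */
            return rq->nr_running == rq->cfs.h_nr_running;
    }

    /* mm/task_shared.c, early in taskshrd_delay_resched() */
            if (!sched_cpu_only_fair_tasks())
                    return false;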