[RFC PATCH 2/4] Scheduler time extension

Posted by Prakash Sangappa 1 week, 3 days ago
Introduce support for a thread to request extending its execution time
on the CPU while holding locks in user space. Add a 'sched_delay' member
to the per-thread shared mapped structure. A thread requests the CPU
execution time extension by setting the 'sched_delay' member.
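
For illustration, a minimal user-space sketch of the intended usage
(not part of this patch). It assumes the shared structure mapping
introduced earlier in this series; map_task_shared() is a hypothetical
wrapper for whatever interface returns the per-thread mapped address.
The grant detection relies on the kernel clearing 'sched_delay' when it
consumes the request, as implemented below:

    #include <sched.h>
    #include <pthread.h>
    #include <linux/task_shared.h>

    /* Hypothetical helper: returns this thread's mapped structure. */
    extern struct task_sharedinfo *map_task_shared(void);

    static __thread struct task_sharedinfo *ts;

    void lock_critical(pthread_spinlock_t *l)
    {
            if (!ts)
                    ts = map_task_shared();
            ts->sched_delay = 1;            /* ask for extra time if preempted */
            pthread_spin_lock(l);
    }

    void unlock_critical(pthread_spinlock_t *l)
    {
            pthread_spin_unlock(l);
            if (ts->sched_delay == 0)       /* kernel consumed the request... */
                    sched_yield();          /* ...so give the CPU back promptly */
            else
                    ts->sched_delay = 0;    /* no preemption attempt; just clear */
    }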

Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
---
 include/linux/entry-common.h     | 10 +++++--
 include/linux/sched.h            | 17 +++++++++++
 include/uapi/linux/task_shared.h |  2 +-
 kernel/entry/common.c            | 15 ++++++----
 kernel/sched/core.c              | 16 ++++++++++
 kernel/sched/syscalls.c          |  7 +++++
 mm/task_shared.c                 | 50 ++++++++++++++++++++++++++++++++
 7 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 1e50cdb83ae5..904f5cdfe0b7 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -302,7 +302,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
  * exit_to_user_mode_loop - do any pending work before leaving to user space
  */
 unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-				     unsigned long ti_work);
+				     unsigned long ti_work, bool irq);
 
 /**
  * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
@@ -314,7 +314,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
  *    EXIT_TO_USER_MODE_WORK are set
  * 4) check that interrupts are still disabled
  */
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs,
+						bool irq)
 {
 	unsigned long ti_work;
 
@@ -325,7 +326,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
 
 	ti_work = read_thread_flags();
 	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
-		ti_work = exit_to_user_mode_loop(regs, ti_work);
+		ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
+
+	if (irq)
+		taskshrd_delay_resched_fini();
 
 	arch_exit_to_user_mode_prepare(regs, ti_work);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1ca7d4efa932..b53e7a878a01 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -326,6 +326,7 @@ extern int __must_check io_schedule_prepare(void);
 extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
+extern void hrtick_local_start(u64 delay);
 
 /**
  * struct prev_cputime - snapshot of system and user cputime
@@ -957,6 +958,9 @@ struct task_struct {
 	 * ->sched_remote_wakeup gets used, so it can be in this word.
 	 */
 	unsigned			sched_remote_wakeup:1;
+#ifdef CONFIG_TASKSHARED
+	unsigned			taskshrd_sched_delay:1;
+#endif
 #ifdef CONFIG_RT_MUTEXES
 	unsigned			sched_rt_mutex:1;
 #endif
@@ -2186,6 +2190,19 @@ static inline bool owner_on_cpu(struct task_struct *owner)
 unsigned long sched_cpu_util(int cpu);
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_TASKSHARED
+
+extern bool taskshrd_delay_resched(void);
+extern void taskshrd_delay_resched_fini(void);
+extern void taskshrd_delay_resched_tick(void);
+#else
+
+static inline bool taskshrd_delay_resched(void) { return false; }
+static inline void taskshrd_delay_resched_fini(void) { }
+static inline void taskshrd_delay_resched_tick(void) { }
+
+#endif
+
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
 extern void sched_core_fork(struct task_struct *p);
diff --git a/include/uapi/linux/task_shared.h b/include/uapi/linux/task_shared.h
index a07902c57380..6e4c664eea60 100644
--- a/include/uapi/linux/task_shared.h
+++ b/include/uapi/linux/task_shared.h
@@ -13,6 +13,6 @@
 #define TASK_SHAREDINFO 1
 
 struct task_sharedinfo {
-		int version;
+		volatile unsigned short sched_delay;
 };
 #endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 11ec8320b59d..0e0360e8c127 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -89,7 +89,8 @@ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
  * @ti_work:	TIF work flags as read by the caller
  */
 __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-						     unsigned long ti_work)
+						     unsigned long ti_work,
+						     bool irq)
 {
 	/*
 	 * Before returning to user space ensure that all pending work
@@ -99,8 +100,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 
 		local_irq_enable_exit_to_user(ti_work);
 
-		if (ti_work & _TIF_NEED_RESCHED)
-			schedule();
+		if (ti_work & _TIF_NEED_RESCHED) {
+			if (irq && taskshrd_delay_resched())
+				clear_tsk_need_resched(current);
+			else
+				schedule();
+		}
 
 		if (ti_work & _TIF_UPROBE)
 			uprobe_notify_resume(regs);
@@ -208,7 +213,7 @@ static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *reg
 {
 	syscall_exit_to_user_mode_prepare(regs);
 	local_irq_disable_exit_to_user();
-	exit_to_user_mode_prepare(regs);
+	exit_to_user_mode_prepare(regs, false);
 }
 
 void syscall_exit_to_user_mode_work(struct pt_regs *regs)
@@ -232,7 +237,7 @@ noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
 noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
 {
 	instrumentation_begin();
-	exit_to_user_mode_prepare(regs);
+	exit_to_user_mode_prepare(regs, true);
 	instrumentation_end();
 	exit_to_user_mode();
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 71b6396db118..713c43491403 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -815,6 +815,7 @@ void update_rq_clock(struct rq *rq)
 
 static void hrtick_clear(struct rq *rq)
 {
+	taskshrd_delay_resched_tick();
 	if (hrtimer_active(&rq->hrtick_timer))
 		hrtimer_cancel(&rq->hrtick_timer);
 }
@@ -830,6 +831,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 
 	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 
+	taskshrd_delay_resched_tick();
+
 	rq_lock(rq, &rf);
 	update_rq_clock(rq);
 	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
@@ -903,6 +906,16 @@ void hrtick_start(struct rq *rq, u64 delay)
 
 #endif /* CONFIG_SMP */
 
+void hrtick_local_start(u64 delay)
+{
+	struct rq *rq = this_rq();
+	struct rq_flags rf;
+
+	rq_lock(rq, &rf);
+	hrtick_start(rq, delay);
+	rq_unlock(rq, &rf);
+}
+
 static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
@@ -6645,6 +6658,9 @@ static void __sched notrace __schedule(int sched_mode)
 picked:
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
+#ifdef CONFIG_TASKSHARED
+	prev->taskshrd_sched_delay = 0;
+#endif
 #ifdef CONFIG_SCHED_DEBUG
 	rq->last_seen_need_resched_ns = 0;
 #endif
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index d23c34b8b3eb..0904667924d8 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1419,6 +1419,13 @@ static void do_sched_yield(void)
  */
 SYSCALL_DEFINE0(sched_yield)
 {
+
+#ifdef CONFIG_TASKSHARED
+	if (current->taskshrd_sched_delay) {
+		schedule();
+		return 0;
+	}
+#endif
 	do_sched_yield();
 	return 0;
 }
diff --git a/mm/task_shared.c b/mm/task_shared.c
index cea45d913b91..575b335d6879 100644
--- a/mm/task_shared.c
+++ b/mm/task_shared.c
@@ -268,6 +268,56 @@ static int task_ushared_alloc(void)
 	return ret;
 }
 
+bool taskshrd_delay_resched(void)
+{
+	struct task_struct *t = current;
+	struct task_ushrd_struct *shrdp = t->task_ushrd;
+
+	if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
+		return false;
+
+	if (shrdp == NULL || shrdp->kaddr == NULL)
+		return false;
+
+	if (t->taskshrd_sched_delay)
+		return false;
+
+	if (!(shrdp->kaddr->ts.sched_delay))
+		return false;
+
+	shrdp->kaddr->ts.sched_delay = 0;
+	t->taskshrd_sched_delay = 1;
+
+	return true;
+}
+
+void taskshrd_delay_resched_fini(void)
+{
+#ifdef CONFIG_SCHED_HRTICK
+	struct task_struct *t = current;
+	/*
+	 * IRQs off, guaranteed to return to userspace, start timer on this CPU
+	 * to limit the resched-overdraft.
+	 *
+	 * If your critical section is longer than 50 us you get to keep the
+	 * pieces.
+	 */
+	if (t->taskshrd_sched_delay)
+		hrtick_local_start(50 * NSEC_PER_USEC);
+#endif
+}
+
+void taskshrd_delay_resched_tick(void)
+{
+#ifdef CONFIG_SCHED_HRTICK
+	struct task_struct *t = current;
+
+	if (t->taskshrd_sched_delay) {
+		set_tsk_need_resched(t);
+	}
+#endif
+}
+
 
 /*
  * Get Task Shared structure, allocate if needed and return mapped user address.
-- 
2.43.5
Re: [RFC PATCH 2/4] Scheduler time extension
Posted by K Prateek Nayak 1 week, 3 days ago
Hello Prakash,

Full disclaimer: I haven't looked closely at the complete series but ...

On 11/13/2024 5:31 AM, Prakash Sangappa wrote:
> [..snip..]
> @@ -99,8 +100,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>   
>   		local_irq_enable_exit_to_user(ti_work);
>   
> -		if (ti_work & _TIF_NEED_RESCHED)
> -			schedule();
> +		if (ti_work & _TIF_NEED_RESCHED) {
> +			if (irq && taskshrd_delay_resched())
> +				clear_tsk_need_resched(current);

Suppose the current task had requested a delayed resched but an RT
task's wakeup sets the TIF_NEED_RESCHED flag via an IPI, doesn't this
clear the flag indiscriminately and allow the task to run for an
extended amount of time? Am I missing something?

> +			else
> +				schedule();
> +		}
>   
>   		if (ti_work & _TIF_UPROBE)
>   			uprobe_notify_resume(regs);
> @@ -208,7 +213,7 @@ static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *reg
>   {
>   	syscall_exit_to_user_mode_prepare(regs);
>   	local_irq_disable_exit_to_user();
> -	exit_to_user_mode_prepare(regs);
> +	exit_to_user_mode_prepare(regs, false);
>   }
>   
>   void syscall_exit_to_user_mode_work(struct pt_regs *regs)
> @@ -232,7 +237,7 @@ noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
>   noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
>   {
>   	instrumentation_begin();
> -	exit_to_user_mode_prepare(regs);
> +	exit_to_user_mode_prepare(regs, true);
>   	instrumentation_end();
>   	exit_to_user_mode();
>   }
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 71b6396db118..713c43491403 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -815,6 +815,7 @@ void update_rq_clock(struct rq *rq)
>   
>   static void hrtick_clear(struct rq *rq)
>   {
> +	taskshrd_delay_resched_tick();
>   	if (hrtimer_active(&rq->hrtick_timer))
>   		hrtimer_cancel(&rq->hrtick_timer);
>   }
> @@ -830,6 +831,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
>   
>   	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
>   
> +	taskshrd_delay_resched_tick();
> +
>   	rq_lock(rq, &rf);
>   	update_rq_clock(rq);
>   	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
> @@ -903,6 +906,16 @@ void hrtick_start(struct rq *rq, u64 delay)
>   
>   #endif /* CONFIG_SMP */
>   
> +void hrtick_local_start(u64 delay)
> +{
> +	struct rq *rq = this_rq();
> +	struct rq_flags rf;
> +
> +	rq_lock(rq, &rf);

You can use guard(rq_lock)(rq) and avoid declaring rf.
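i.e., an untested sketch of what that could look like:

	void hrtick_local_start(u64 delay)
	{
		struct rq *rq = this_rq();

		guard(rq_lock)(rq);
		hrtick_start(rq, delay);
	}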

> +	hrtick_start(rq, delay);
> +	rq_unlock(rq, &rf);
> +}
> +
>   static void hrtick_rq_init(struct rq *rq)
>   {
>   #ifdef CONFIG_SMP
> @@ -6645,6 +6658,9 @@ static void __sched notrace __schedule(int sched_mode)
>   picked:
>   	clear_tsk_need_resched(prev);
>   	clear_preempt_need_resched();
> +#ifdef CONFIG_TASKSHARED
> +	prev->taskshrd_sched_delay = 0;
> +#endif
>   #ifdef CONFIG_SCHED_DEBUG
>   	rq->last_seen_need_resched_ns = 0;
>   #endif
> diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
> index d23c34b8b3eb..0904667924d8 100644
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -1419,6 +1419,13 @@ static void do_sched_yield(void)
>    */
>   SYSCALL_DEFINE0(sched_yield)
>   {
> +
> +#ifdef CONFIG_TASKSHARED
> +	if (current->taskshrd_sched_delay) {
> +		schedule();
> +		return 0;
> +	}
> +#endif
>   	do_sched_yield();
>   	return 0;
>   }
> diff --git a/mm/task_shared.c b/mm/task_shared.c
> index cea45d913b91..575b335d6879 100644
> --- a/mm/task_shared.c
> +++ b/mm/task_shared.c
> @@ -268,6 +268,56 @@ static int task_ushared_alloc(void)
>   	return ret;
>   }
>   
> +bool taskshrd_delay_resched(void)
> +{
> +	struct task_struct *t = current;
> +	struct task_ushrd_struct *shrdp = t->task_ushrd;
> +
> +	if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
> +		return false;
> +
> +	if (shrdp == NULL || shrdp->kaddr == NULL)
> +		return false;
> +
> +	if (t->taskshrd_sched_delay)
> +		return false;
> +
> +	if (!(shrdp->kaddr->ts.sched_delay))
> +		return false;
> +
> +	shrdp->kaddr->ts.sched_delay = 0;
> +	t->taskshrd_sched_delay = 1;
> +
> +	return true;

Perhaps this needs to also check
"rq->nr_running == rq->cfs.h_nr_running" since I believe it only makes
sense for fair tasks to request that extra slice?
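Something along these lines perhaps (untested; the rq fields would need
to be read somewhere they are stable, e.g. with IRQs or preemption
disabled):

	struct rq *rq = this_rq();

	/* Only grant the extension when everything runnable here is fair */
	if (rq->nr_running != rq->cfs.h_nr_running)
		return false;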

-- 
Thanks and Regards,
Prateek

> +}
> +
> +void taskshrd_delay_resched_fini(void)
> +{
> +#ifdef CONFIG_SCHED_HRTICK
> +	struct task_struct *t = current;
> +	/*
> +	 * IRQs off, guaranteed to return to userspace, start timer on this CPU
> +	 * to limit the resched-overdraft.
> +	 *
> +	 * If your critical section is longer than 50 us you get to keep the
> +	 * pieces.
> +	 */
> +	if (t->taskshrd_sched_delay)
> +		hrtick_local_start(50 * NSEC_PER_USEC);
> +#endif
> +}
> +
> +void taskshrd_delay_resched_tick(void)
> +{
> +#ifdef CONFIG_SCHED_HRTICK
> +	struct task_struct *t = current;
> +
> +	if (t->taskshrd_sched_delay) {
> +		set_tsk_need_resched(t);
> +	}
> +#endif
> +}
> +
>   
>   /*
>    * Get Task Shared structure, allocate if needed and return mapped user address.
Re: [RFC PATCH 2/4] Scheduler time extension
Posted by Prakash Sangappa 1 week, 2 days ago

> On Nov 12, 2024, at 7:57 PM, K Prateek Nayak <kprateek.nayak@amd.com> wrote:
> 
> Hello Prakash,
> 
> Full disclaimer: I haven't looked closely at the complete series but ...
> 
> On 11/13/2024 5:31 AM, Prakash Sangappa wrote:
>> [..snip..]
>> @@ -99,8 +100,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>
>>  		local_irq_enable_exit_to_user(ti_work);
>>
>> -		if (ti_work & _TIF_NEED_RESCHED)
>> -			schedule();
>> +		if (ti_work & _TIF_NEED_RESCHED) {
>> +			if (irq && taskshrd_delay_resched())
>> +				clear_tsk_need_resched(current);
> 
> Suppose the current task had requested a delayed resched but an RT
> task's wakeup sets the TIF_NEED_RESCHED flag via an IPI, doesn't this
> clear the flag indiscriminately and allow the task to run for an
> extended amount of time? Am I missing something?

If the scheduler time extension has already been granted when the wakeup IPI occurs, taskshrd_delay_resched() will return false, so the TIF_NEED_RESCHED flag will not be cleared and the task will be preempted.

[...]

> 
>>  }
>> @@ -830,6 +831,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
>>
>>  	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
>>
>> +	taskshrd_delay_resched_tick();
>> +
>>  	rq_lock(rq, &rf);
>>  	update_rq_clock(rq);
>>  	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
>> @@ -903,6 +906,16 @@ void hrtick_start(struct rq *rq, u64 delay)
>>
>>  #endif /* CONFIG_SMP */
>>
>> +void hrtick_local_start(u64 delay)
>> +{
>> +	struct rq *rq = this_rq();
>> +	struct rq_flags rf;
>> +
>> +	rq_lock(rq, &rf);
> 
> You can use guard(rq_lock)(rq) and avoid declaring rf.

Will take a look and address it.

[...]

> 
>> +bool taskshrd_delay_resched(void)
>> +{
>> +	struct task_struct *t = current;
>> +	struct task_ushrd_struct *shrdp = t->task_ushrd;
>> +
>> +	if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
>> +		return false;
>> +
>> +	if (shrdp == NULL || shrdp->kaddr == NULL)
>> +		return false;
>> +
>> +	if (t->taskshrd_sched_delay)
>> +		return false;
>> +
>> +	if (!(shrdp->kaddr->ts.sched_delay))
>> +		return false;
>> +
>> +	shrdp->kaddr->ts.sched_delay = 0;
>> +	t->taskshrd_sched_delay = 1;
>> +
>> +	return true;
> 
> Perhaps this needs to also check
> "rq->nr_running == rq->cfs.h_nr_running" since I believe it only makes
> sense for fair tasks to request that extra slice?

From the discussion in
https://lore.kernel.org/lkml/20231025054219.1acaa3dd@gandalf.local.home/
it was considered OK to have this behavior for all tasks. It could be
changed to work only for fair tasks, if there is agreement.


Thanks,
-Prakash

> 
> -- 
> Thanks and Regards,
> Prateek
> 
>> +}
>> +
>> +void taskshrd_delay_resched_fini(void)
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
>> +	struct task_struct *t = current;
>> +	/*
>> +	 * IRQs off, guaranteed to return to userspace, start timer on this CPU
>> +	 * to limit the resched-overdraft.
>> +	 *
>> +	 * If your critical section is longer than 50 us you get to keep the
>> +	 * pieces.
>> +	 */
>> +	if (t->taskshrd_sched_delay)
>> +		hrtick_local_start(50 * NSEC_PER_USEC);
>> +#endif
>> +}
>> +
>> +void taskshrd_delay_resched_tick(void)
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
>> +	struct task_struct *t = current;
>> +
>> +	if (t->taskshrd_sched_delay) {
>> +		set_tsk_need_resched(t);
>> +	}
>> +#endif
>> +}
>> +
>>
>>  /*
>>   * Get Task Shared structure, allocate if needed and return mapped user address.
>