[PATCH 2/5] sched: Add Lazy preemption model

Posted by Peter Zijlstra 1 month, 3 weeks ago
Change fair to use resched_curr_lazy(), which, when the lazy
preemption model is selected, will set TIF_NEED_RESCHED_LAZY.

This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
As such, the average delay between setting LAZY and actually
rescheduling will be TICK_NSEC/2.

In short, Lazy preemption will delay preemption for fair class but
will function as Full preemption for all the other classes, most
notably the realtime (RR/FIFO/DEADLINE) classes.

The goal is to bridge the performance gap with Voluntary, such that we
might eventually remove that option entirely.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/preempt.h |    8 ++++-
 kernel/Kconfig.preempt  |   15 +++++++++
 kernel/sched/core.c     |   76 ++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/debug.c    |    5 +--
 kernel/sched/fair.c     |    6 +--
 kernel/sched/sched.h    |    1 
 6 files changed, 103 insertions(+), 8 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_dis
 extern bool preempt_model_none(void);
 extern bool preempt_model_voluntary(void);
 extern bool preempt_model_full(void);
+extern bool preempt_model_lazy(void);
 
 #else
 
@@ -502,6 +503,11 @@ static inline bool preempt_model_full(vo
 	return IS_ENABLED(CONFIG_PREEMPT);
 }
 
+static inline bool preempt_model_lazy(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+
 #endif
 
 static inline bool preempt_model_rt(void)
@@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void
  */
 static inline bool preempt_model_preemptible(void)
 {
-	return preempt_model_full() || preempt_model_rt();
+	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
 }
 
 #endif /* __LINUX_PREEMPT_H */
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -11,6 +11,9 @@ config PREEMPT_BUILD
 	select PREEMPTION
 	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
 
+config ARCH_HAS_PREEMPT_LAZY
+	bool
+
 choice
 	prompt "Preemption Model"
 	default PREEMPT_NONE
@@ -67,6 +70,18 @@ config PREEMPT
 	  embedded system with latency requirements in the milliseconds
 	  range.
 
+config PREEMPT_LAZY
+	bool "Scheduler controlled preemption model"
+	depends on !ARCH_NO_PREEMPT
+	depends on ARCH_HAS_PREEMPT_LAZY
+	select PREEMPT_BUILD
+	help
+	  This option provides a scheduler driven preemption model that
+	  is fundamentally similar to full preemption, but is less
+	  eager to preempt SCHED_NORMAL tasks in an attempt to
+	  reduce lock holder preemption and recover some of the performance
+	  gains seen from using Voluntary preemption.
+
 config PREEMPT_RT
 	bool "Fully Preemptible Kernel (Real-Time)"
 	depends on EXPERT && ARCH_SUPPORTS_RT
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1078,6 +1078,9 @@ static void __resched_curr(struct rq *rq
 
 	lockdep_assert_rq_held(rq);
 
+	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
+		tif = TIF_NEED_RESCHED;
+
 	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
 		return;
 
@@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
 	__resched_curr(rq, TIF_NEED_RESCHED);
 }
 
+#ifdef CONFIG_PREEMPT_DYNAMIC
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
+}
+#else
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+	return IS_ENABLED(PREEMPT_LAZY);
+}
+#endif
+
+static __always_inline int tif_need_resched_lazy(void)
+{
+	if (dynamic_preempt_lazy())
+		return TIF_NEED_RESCHED_LAZY;
+
+	return TIF_NEED_RESCHED;
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+	__resched_curr(rq, tif_need_resched_lazy());
+}
+
 void resched_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -5598,6 +5627,10 @@ void sched_tick(void)
 	update_rq_clock(rq);
 	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
 	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
+
+	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+		resched_curr(rq);
+
 	curr->sched_class->task_tick(rq, curr, 0);
 	if (sched_feat(LATENCY_WARN))
 		resched_latency = cpu_resched_latency(rq);
@@ -7334,6 +7367,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
  *   preempt_schedule           <- NOP
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
+ *   dynamic_preempt_lazy       <- false
  *
  * VOLUNTARY:
  *   cond_resched               <- __cond_resched
@@ -7341,6 +7375,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
  *   preempt_schedule           <- NOP
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
+ *   dynamic_preempt_lazy       <- false
  *
  * FULL:
  *   cond_resched               <- RET0
@@ -7348,6 +7383,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
  *   preempt_schedule           <- preempt_schedule
  *   preempt_schedule_notrace   <- preempt_schedule_notrace
  *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ *   dynamic_preempt_lazy       <- false
+ *
+ * LAZY:
+ *   cond_resched               <- RET0
+ *   might_resched              <- RET0
+ *   preempt_schedule           <- preempt_schedule
+ *   preempt_schedule_notrace   <- preempt_schedule_notrace
+ *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ *   dynamic_preempt_lazy       <- true
  */
 
 enum {
@@ -7355,6 +7399,7 @@ enum {
 	preempt_dynamic_none,
 	preempt_dynamic_voluntary,
 	preempt_dynamic_full,
+	preempt_dynamic_lazy,
 };
 
 int preempt_dynamic_mode = preempt_dynamic_undefined;
@@ -7370,15 +7415,23 @@ int sched_dynamic_mode(const char *str)
 	if (!strcmp(str, "full"))
 		return preempt_dynamic_full;
 
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+	if (!strcmp(str, "lazy"))
+		return preempt_dynamic_lazy;
+#endif
+
 	return -EINVAL;
 }
 
+#define preempt_dynamic_key_enable(f)	static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_key_disable(f)	static_key_disable(&sk_dynamic_##f.key)
+
 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 #define preempt_dynamic_enable(f)	static_call_update(f, f##_dynamic_enabled)
 #define preempt_dynamic_disable(f)	static_call_update(f, f##_dynamic_disabled)
 #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f)	static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_disable(f)	static_key_disable(&sk_dynamic_##f.key)
+#define preempt_dynamic_enable(f)	preempt_dynamic_key_enable(f)
+#define preempt_dynamic_disable(f)	preempt_dynamic_key_disable(f)
 #else
 #error "Unsupported PREEMPT_DYNAMIC mechanism"
 #endif
@@ -7398,6 +7451,7 @@ static void __sched_dynamic_update(int m
 	preempt_dynamic_enable(preempt_schedule);
 	preempt_dynamic_enable(preempt_schedule_notrace);
 	preempt_dynamic_enable(irqentry_exit_cond_resched);
+	preempt_dynamic_key_disable(preempt_lazy);
 
 	switch (mode) {
 	case preempt_dynamic_none:
@@ -7407,6 +7461,7 @@ static void __sched_dynamic_update(int m
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: none\n");
 		break;
@@ -7418,6 +7473,7 @@ static void __sched_dynamic_update(int m
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: voluntary\n");
 		break;
@@ -7429,9 +7485,22 @@ static void __sched_dynamic_update(int m
 		preempt_dynamic_enable(preempt_schedule);
 		preempt_dynamic_enable(preempt_schedule_notrace);
 		preempt_dynamic_enable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: full\n");
 		break;
+
+	case preempt_dynamic_lazy:
+		if (!klp_override)
+			preempt_dynamic_disable(cond_resched);
+		preempt_dynamic_disable(might_resched);
+		preempt_dynamic_enable(preempt_schedule);
+		preempt_dynamic_enable(preempt_schedule_notrace);
+		preempt_dynamic_enable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_enable(preempt_lazy);
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: lazy\n");
+		break;
 	}
 
 	preempt_dynamic_mode = mode;
@@ -7494,6 +7563,8 @@ static void __init preempt_dynamic_init(
 			sched_dynamic_update(preempt_dynamic_none);
 		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
 			sched_dynamic_update(preempt_dynamic_voluntary);
+		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+			sched_dynamic_update(preempt_dynamic_lazy);
 		} else {
 			/* Default static call setting, nothing to do */
 			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
@@ -7514,6 +7585,7 @@ static void __init preempt_dynamic_init(
 PREEMPT_MODEL_ACCESSOR(none);
 PREEMPT_MODEL_ACCESSOR(voluntary);
 PREEMPT_MODEL_ACCESSOR(full);
+PREEMPT_MODEL_ACCESSOR(lazy);
 
 #else /* !CONFIG_PREEMPT_DYNAMIC: */
 
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struc
 static int sched_dynamic_show(struct seq_file *m, void *v)
 {
 	static const char * preempt_modes[] = {
-		"none", "voluntary", "full"
+		"none", "voluntary", "full", "lazy",
 	};
+	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+	for (i = 0; i < j; i++) {
 		if (preempt_dynamic_mode == i)
 			seq_puts(m, "(");
 		seq_puts(m, preempt_modes[i]);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *c
 		return;
 
 	if (resched || did_preempt_short(cfs_rq, curr)) {
-		resched_curr(rq);
+		resched_curr_lazy(rq);
 		clear_buddies(cfs_rq, curr);
 	}
 }
@@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
 	 * validating it and just reschedule.
 	 */
 	if (queued) {
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 		return;
 	}
 	/*
@@ -8832,7 +8832,7 @@ static void check_preempt_wakeup_fair(st
 	return;
 
 preempt:
-	resched_curr(rq);
+	resched_curr_lazy(rq);
 }
 
 static struct task_struct *pick_task_fair(struct rq *rq)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2692,6 +2692,7 @@ extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
 
 extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
 extern void resched_cpu(int cpu);
 
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
Re: [PATCH 2/5] sched: Add Lazy preemption model
Posted by Shrikanth Hegde 1 month ago

On 10/7/24 13:16, Peter Zijlstra wrote:
> Change fair to use resched_curr_lazy(), which, when the lazy
> preemption model is selected, will set TIF_NEED_RESCHED_LAZY.
> 
> This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
> As such, the average delay between setting LAZY and actually
> rescheduling will be TICK_NSEC/2.
> 
> In short, Lazy preemption will delay preemption for fair class but
> will function as Full preemption for all the other classes, most
> notably the realtime (RR/FIFO/DEADLINE) classes.
> 
> The goal is to bridge the performance gap with Voluntary, such that we
> might eventually remove that option entirely.
> 
> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   include/linux/preempt.h |    8 ++++-
>   kernel/Kconfig.preempt  |   15 +++++++++
>   kernel/sched/core.c     |   76 ++++++++++++++++++++++++++++++++++++++++++++++--
>   kernel/sched/debug.c    |    5 +--
>   kernel/sched/fair.c     |    6 +--
>   kernel/sched/sched.h    |    1
>   6 files changed, 103 insertions(+), 8 deletions(-)
> 
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_dis
>   extern bool preempt_model_none(void);
>   extern bool preempt_model_voluntary(void);
>   extern bool preempt_model_full(void);
> +extern bool preempt_model_lazy(void);
>   
>   #else
>   
> @@ -502,6 +503,11 @@ static inline bool preempt_model_full(vo
>   	return IS_ENABLED(CONFIG_PREEMPT);
>   }
>   
> +static inline bool preempt_model_lazy(void)
> +{
> +	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
> +}
> +
>   #endif
>   
>   static inline bool preempt_model_rt(void)
> @@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void
>    */
>   static inline bool preempt_model_preemptible(void)
>   {
> -	return preempt_model_full() || preempt_model_rt();
> +	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
>   }
>   
>   #endif /* __LINUX_PREEMPT_H */
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -11,6 +11,9 @@ config PREEMPT_BUILD
>   	select PREEMPTION
>   	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
>   
> +config ARCH_HAS_PREEMPT_LAZY
> +	bool
> +
>   choice
>   	prompt "Preemption Model"
>   	default PREEMPT_NONE
> @@ -67,6 +70,18 @@ config PREEMPT
>   	  embedded system with latency requirements in the milliseconds
>   	  range.
>   
> +config PREEMPT_LAZY
> +	bool "Scheduler controlled preemption model"
> +	depends on !ARCH_NO_PREEMPT
> +	depends on ARCH_HAS_PREEMPT_LAZY
> +	select PREEMPT_BUILD
> +	help
> +	  This option provides a scheduler driven preemption model that
> +	  is fundamentally similar to full preemption, but is less
> +	  eager to preempt SCHED_NORMAL tasks in an attempt to
> +	  reduce lock holder preemption and recover some of the performance
> +	  gains seen from using Voluntary preemption.
> +
>   config PREEMPT_RT
>   	bool "Fully Preemptible Kernel (Real-Time)"
>   	depends on EXPERT && ARCH_SUPPORTS_RT
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1078,6 +1078,9 @@ static void __resched_curr(struct rq *rq
>   
>   	lockdep_assert_rq_held(rq);
>   
> +	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
> +		tif = TIF_NEED_RESCHED;
> +
>   	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
>   		return;
>   
> @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
>   	__resched_curr(rq, TIF_NEED_RESCHED);
>   }
>   
> +#ifdef CONFIG_PREEMPT_DYNAMIC
> +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
> +}
> +#else
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return IS_ENABLED(PREEMPT_LAZY);

I had to make it CONFIG_PREEMPT_LAZY for lazy preemption to work
on systems where CONFIG_PREEMPT_DYNAMIC=n.

> +}
> +#endif
> +
> +static __always_inline int tif_need_resched_lazy(void)
> +{
> +	if (dynamic_preempt_lazy())
> +		return TIF_NEED_RESCHED_LAZY;
> +
> +	return TIF_NEED_RESCHED;
> +}
> +
> +void resched_curr_lazy(struct rq *rq)
> +{
> +	__resched_curr(rq, tif_need_resched_lazy());
> +}
> +
>   void resched_cpu(int cpu)
>   {
>   	struct rq *rq = cpu_rq(cpu);
> @@ -5598,6 +5627,10 @@ void sched_tick(void)
>   	update_rq_clock(rq);
>   	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
>   	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> +
> +	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
> +		resched_curr(rq);
> +
>   	curr->sched_class->task_tick(rq, curr, 0);
>   	if (sched_feat(LATENCY_WARN))
>   		resched_latency = cpu_resched_latency(rq);
> @@ -7334,6 +7367,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>    *   preempt_schedule           <- NOP
>    *   preempt_schedule_notrace   <- NOP
>    *   irqentry_exit_cond_resched <- NOP
> + *   dynamic_preempt_lazy       <- false
>    *
>    * VOLUNTARY:
>    *   cond_resched               <- __cond_resched
> @@ -7341,6 +7375,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>    *   preempt_schedule           <- NOP
>    *   preempt_schedule_notrace   <- NOP
>    *   irqentry_exit_cond_resched <- NOP
> + *   dynamic_preempt_lazy       <- false
>    *
>    * FULL:
>    *   cond_resched               <- RET0
> @@ -7348,6 +7383,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>    *   preempt_schedule           <- preempt_schedule
>    *   preempt_schedule_notrace   <- preempt_schedule_notrace
>    *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
> + *   dynamic_preempt_lazy       <- false
> + *
> + * LAZY:
> + *   cond_resched               <- RET0
> + *   might_resched              <- RET0
> + *   preempt_schedule           <- preempt_schedule
> + *   preempt_schedule_notrace   <- preempt_schedule_notrace
> + *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
> + *   dynamic_preempt_lazy       <- true
>    */
>   
>   enum {
> @@ -7355,6 +7399,7 @@ enum {
>   	preempt_dynamic_none,
>   	preempt_dynamic_voluntary,
>   	preempt_dynamic_full,
> +	preempt_dynamic_lazy,
>   };
>   
>   int preempt_dynamic_mode = preempt_dynamic_undefined;
> @@ -7370,15 +7415,23 @@ int sched_dynamic_mode(const char *str)
>   	if (!strcmp(str, "full"))
>   		return preempt_dynamic_full;
>   
> +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
> +	if (!strcmp(str, "lazy"))
> +		return preempt_dynamic_lazy;
> +#endif
> +
>   	return -EINVAL;
>   }
>   
> +#define preempt_dynamic_key_enable(f)	static_key_enable(&sk_dynamic_##f.key)
> +#define preempt_dynamic_key_disable(f)	static_key_disable(&sk_dynamic_##f.key)
> +
>   #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
>   #define preempt_dynamic_enable(f)	static_call_update(f, f##_dynamic_enabled)
>   #define preempt_dynamic_disable(f)	static_call_update(f, f##_dynamic_disabled)
>   #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
> -#define preempt_dynamic_enable(f)	static_key_enable(&sk_dynamic_##f.key)
> -#define preempt_dynamic_disable(f)	static_key_disable(&sk_dynamic_##f.key)
> +#define preempt_dynamic_enable(f)	preempt_dynamic_key_enable(f)
> +#define preempt_dynamic_disable(f)	preempt_dynamic_key_disable(f)
>   #else
>   #error "Unsupported PREEMPT_DYNAMIC mechanism"
>   #endif
> @@ -7398,6 +7451,7 @@ static void __sched_dynamic_update(int m
>   	preempt_dynamic_enable(preempt_schedule);
>   	preempt_dynamic_enable(preempt_schedule_notrace);
>   	preempt_dynamic_enable(irqentry_exit_cond_resched);
> +	preempt_dynamic_key_disable(preempt_lazy);
>   
>   	switch (mode) {
>   	case preempt_dynamic_none:
> @@ -7407,6 +7461,7 @@ static void __sched_dynamic_update(int m
>   		preempt_dynamic_disable(preempt_schedule);
>   		preempt_dynamic_disable(preempt_schedule_notrace);
>   		preempt_dynamic_disable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_disable(preempt_lazy);
>   		if (mode != preempt_dynamic_mode)
>   			pr_info("Dynamic Preempt: none\n");
>   		break;
> @@ -7418,6 +7473,7 @@ static void __sched_dynamic_update(int m
>   		preempt_dynamic_disable(preempt_schedule);
>   		preempt_dynamic_disable(preempt_schedule_notrace);
>   		preempt_dynamic_disable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_disable(preempt_lazy);
>   		if (mode != preempt_dynamic_mode)
>   			pr_info("Dynamic Preempt: voluntary\n");
>   		break;
> @@ -7429,9 +7485,22 @@ static void __sched_dynamic_update(int m
>   		preempt_dynamic_enable(preempt_schedule);
>   		preempt_dynamic_enable(preempt_schedule_notrace);
>   		preempt_dynamic_enable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_disable(preempt_lazy);
>   		if (mode != preempt_dynamic_mode)
>   			pr_info("Dynamic Preempt: full\n");
>   		break;
> +
> +	case preempt_dynamic_lazy:
> +		if (!klp_override)
> +			preempt_dynamic_disable(cond_resched);
> +		preempt_dynamic_disable(might_resched);
> +		preempt_dynamic_enable(preempt_schedule);
> +		preempt_dynamic_enable(preempt_schedule_notrace);
> +		preempt_dynamic_enable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_enable(preempt_lazy);
> +		if (mode != preempt_dynamic_mode)
> +			pr_info("Dynamic Preempt: lazy\n");
> +		break;
>   	}
>   
>   	preempt_dynamic_mode = mode;
> @@ -7494,6 +7563,8 @@ static void __init preempt_dynamic_init(
>   			sched_dynamic_update(preempt_dynamic_none);
>   		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
>   			sched_dynamic_update(preempt_dynamic_voluntary);
> +		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
> +			sched_dynamic_update(preempt_dynamic_lazy);
>   		} else {
>   			/* Default static call setting, nothing to do */
>   			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
> @@ -7514,6 +7585,7 @@ static void __init preempt_dynamic_init(
>   PREEMPT_MODEL_ACCESSOR(none);
>   PREEMPT_MODEL_ACCESSOR(voluntary);
>   PREEMPT_MODEL_ACCESSOR(full);
> +PREEMPT_MODEL_ACCESSOR(lazy);
>   
>   #else /* !CONFIG_PREEMPT_DYNAMIC: */
>   
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struc
>   static int sched_dynamic_show(struct seq_file *m, void *v)
>   {
>   	static const char * preempt_modes[] = {
> -		"none", "voluntary", "full"
> +		"none", "voluntary", "full", "lazy",
>   	};
> +	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
>   	int i;
>   
> -	for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
> +	for (i = 0; i < j; i++) {
>   		if (preempt_dynamic_mode == i)
>   			seq_puts(m, "(");
>   		seq_puts(m, preempt_modes[i]);
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *c
>   		return;
>   
>   	if (resched || did_preempt_short(cfs_rq, curr)) {



If there is a long running task, LAZY is set only once the task is no longer
eligible, and a subsequent tick upgrades it to NR. If one sets
sysctl_sched_base_slice to a large value (max 4 seconds), LAZY is only set after
that long (up to 4 seconds) if there is no wakeup on that CPU.

If I set sysctl_sched_base_slice=300ms and spawn 2 stress-ng workers on one CPU,
the LAZY bit is usually set ~300ms after the sched_switch if there are no
wakeups, and NR is set on the subsequent tick. Initially I was thinking that for
a long running process LAZY would be set after one tick and NR on the next tick.
I was wrong: it can take a long time for LAZY to be set, and NR is then set on
the subsequent tick.

Is that the expected behavior, since whoever sets sysctl_sched_base_slice knows
what to expect?

> -		resched_curr(rq);
> +		resched_curr_lazy(rq);
>   		clear_buddies(cfs_rq, curr);
>   	}
>   }
> @@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
>   	 * validating it and just reschedule.
>   	 */
>   	if (queued) {

What's this queued used for? hrtick seems to set it. I haven't understood how it works.

> -		resched_curr(rq_of(cfs_rq));
> +		resched_curr_lazy(rq_of(cfs_rq));
>   		return;
>   	}
>   	/*
> @@ -8832,7 +8832,7 @@ static void check_preempt_wakeup_fair(st
>   	return;
>   
>   preempt:
> -	resched_curr(rq);

Is it better to call resched_curr here? When the code arrives here, it wants to
run pse as soon as possible right?

> +	resched_curr_lazy(rq);
>   }
>   
>   static struct task_struct *pick_task_fair(struct rq *rq)
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2692,6 +2692,7 @@ extern void init_sched_rt_class(void);
>   extern void init_sched_fair_class(void);
>   
>   extern void resched_curr(struct rq *rq);
> +extern void resched_curr_lazy(struct rq *rq);
>   extern void resched_cpu(int cpu);
>   
>   extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
> 
> 
>
Re: [PATCH 2/5] sched: Add Lazy preemption model
Posted by Sebastian Andrzej Siewior 1 month ago
On 2024-10-22 22:14:41 [+0530], Shrikanth Hegde wrote:
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *c
> >   		return;
> >   	if (resched || did_preempt_short(cfs_rq, curr)) {
> 
> 
> 
> If there is a long running task, only after it is not eligible, LAZY would be set and
> subsequent tick would upgrade it to NR. If one sets sysctl_sched_base_slice to a large
> value (max 4seconds), LAZY would set thereafter(max 4 seconds) if there in no wakeup in
> that CPU.
> 
> If i set sysctl_sched_base_slice=300ms, spawn 2 stress-ng on one CPU, then LAZY bit is
> set usually after 300ms of sched_switch if there are no wakeups. Subsequent tick NR is set.
> Initially I was thinking, if there is a long running process, then LAZY would be set after
> one tick and on subsequent tick NR would set. I was wrong. It might take a long time for LAZY
> to be set, and On subsequent tick NR would be set.
> 
> That would be expected behavior since one setting sysctl_sched_base_slice know what to expect?

I guess so. Once the slice is up, the LAZY bit is set where the NEED_RESCHED
bit used to be set. That means a return to userland (from a syscall) or the
following tick will lead to a scheduling event.
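
For reference, the entry-code side of that (added by patch 1/5 of this series)
treats the LAZY bit like NEED_RESCHED on the way back to user space. A rough,
paraphrased sketch, not the exact hunk from that patch:

  /*
   * kernel/entry/common.c, paraphrased: both the full and the lazy bit
   * force a schedule() before returning to user space.
   */
  static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
					      unsigned long ti_work)
  {
	while (ti_work & EXIT_TO_USER_MODE_WORK) {
		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
			schedule();

		/* ... other TIF work elided ... */

		local_irq_disable_exit_to_user();
		ti_work = read_thread_flags();
	}
	return ti_work;
  }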

> > -		resched_curr(rq);
> > +		resched_curr_lazy(rq);
> >   		clear_buddies(cfs_rq, curr);
> >   	}
> >   }
> > @@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
> >   	 * validating it and just reschedule.
> >   	 */
> >   	if (queued) {
> 
> What's this queued used for? hrtick seems to set it. I haven't understood how it works.

from 20241009074631.GH17263@noisy.programming.kicks-ass.net:
| hrtick is disabled by default (because expensive) and so it doesn't
| matter much, but it's purpose is to increase accuracy and hence I left
| it untouched for now.

This sets up a hrtimer for the (remaining) time slice and invokes the
task_tick from there (instead of from the regular tick).
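
In code, the hrtimer callback re-runs task_tick() with queued == 1, which is
what the "if (queued)" branch quoted above reacts to. Roughly, paraphrased
from kernel/sched/core.c (pre-existing code, not part of this patch):

  static enum hrtimer_restart hrtick(struct hrtimer *timer)
  {
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
	struct rq_flags rf;

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	/* queued == 1: this tick came from the hrtimer, not the periodic tick */
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	rq_unlock(rq, &rf);

	return HRTIMER_NORESTART;
  }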

> > -		resched_curr(rq_of(cfs_rq));
> > +		resched_curr_lazy(rq_of(cfs_rq));
> >   		return;
> >   	}
> >   	/*
> > @@ -8832,7 +8832,7 @@ static void check_preempt_wakeup_fair(st
> >   	return;
> >   preempt:
> > -	resched_curr(rq);
> 
> Is it better to call resched_curr here? When the code arrives here, it wants to
> run pse as soon as possible right?

But wouldn't every try_to_wake_up()/ wake_up() then result in immediate
preemption? Letting the current task run and waiting for it to give up the CPU
on its own, with the preemption happening on return to userland, usually
results in better performance. At least this is what I observed while playing
with this.

> > +	resched_curr_lazy(rq);
> >   }
> >   static struct task_struct *pick_task_fair(struct rq *rq)

Sebastian
Re: [PATCH 2/5] sched: Add Lazy preemption model
Posted by Shrikanth Hegde 4 weeks, 1 day ago
Hi Sebastian.

On 10/25/24 18:49, Sebastian Andrzej Siewior wrote:
> On 2024-10-22 22:14:41 [+0530], Shrikanth Hegde wrote:
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *c
>>>    		return;
>>>    	if (resched || did_preempt_short(cfs_rq, curr)) {
>>
>>
>>
>> If there is a long running task, only after it is not eligible, LAZY would be set and
>> subsequent tick would upgrade it to NR. If one sets sysctl_sched_base_slice to a large
>> value (max 4seconds), LAZY would set thereafter(max 4 seconds) if there in no wakeup in
>> that CPU.
>>
>> If i set sysctl_sched_base_slice=300ms, spawn 2 stress-ng on one CPU, then LAZY bit is
>> set usually after 300ms of sched_switch if there are no wakeups. Subsequent tick NR is set.
>> Initially I was thinking, if there is a long running process, then LAZY would be set after
>> one tick and on subsequent tick NR would set. I was wrong. It might take a long time for LAZY
>> to be set, and On subsequent tick NR would be set.
>>
>> That would be expected behavior since one setting sysctl_sched_base_slice know what to expect?
> 
> I guess so. Once the slice is up then the NEED_RESCHED bit is replaced
> with the LAZY bit. That means a return-to-userland (from a syscall) or
> the following tick will lead to a scheduling event.

ok.

> 
>>> -		resched_curr(rq);
>>> +		resched_curr_lazy(rq);
>>>    		clear_buddies(cfs_rq, curr);
>>>    	}
>>>    }
>>> @@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
>>>    	 * validating it and just reschedule.
>>>    	 */
>>>    	if (queued) {
>>
>> What's this queued used for? hrtick seems to set it. I haven't understood how it works.
> 
> from 20241009074631.GH17263@noisy.programming.kicks-ass.net:
> | hrtick is disabled by default (because expensive) and so it doesn't
> | matter much, but it's purpose is to increase accuracy and hence I left
> | it untouched for now.
> 
> This setups a hrtimer for the (remaining) time slice and invokes the
> task_tick from there (instead of the regular tick).

thanks. will take a look and try to understand.

> 
>>> -		resched_curr(rq_of(cfs_rq));
>>> +		resched_curr_lazy(rq_of(cfs_rq));
>>>    		return;
>>>    	}
>>>    	/*
>>> @@ -8832,7 +8832,7 @@ static void check_preempt_wakeup_fair(st
>>>    	return;
>>>    preempt:
>>> -	resched_curr(rq);
>>
>> Is it better to call resched_curr here? When the code arrives here, it wants to
>> run pse as soon as possible right?
> 
> But wouldn't then every try_to_wakeup()/ wake_up() result in immediate
> preemption? Letting it run and waiting to give up on its own, having the
> preemption on return to userland results usually in better performance.
> At least this is what I observed while playing with this.
> 

Yes, I agree that preemption at every ttwu is bad. But that may not happen
with the latest code, i.e. if RUN_TO_PARITY is enabled or pick_eevdf doesn't
pick the waiting task as the best candidate.

My concern was also this code in check_preempt_wakeup_fair():

	/*
	 * Preempt an idle entity in favor of a non-idle entity (and don't
	 * preempt in the inverse case).
	 */
	if (cse_is_idle && !pse_is_idle)
		goto preempt;
	if (cse_is_idle != pse_is_idle)
		return;

If the current task is idle and the waking task is not, we should set NR
instead of LAZY, is what I was thinking. I'm not sure whether such a pattern
happens in the exit-to-kernel path, since exit to user is already taken care
of by setting the LAZY bit.
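
An untested illustration of that idea (not a proposed patch), making the idle
case bypass the now-lazy preempt label:

	if (cse_is_idle && !pse_is_idle) {
		/* Force the full NEED_RESCHED bit instead of LAZY. */
		resched_curr(rq);
		return;
	}
	if (cse_is_idle != pse_is_idle)
		return;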


>>> +	resched_curr_lazy(rq);
>>>    }
>>>    static struct task_struct *pick_task_fair(struct rq *rq)
> 
> Sebastian
Re: [PATCH 2/5] sched: Add Lazy preemption model
Posted by Shrikanth Hegde 1 month, 1 week ago

On 10/7/24 13:16, Peter Zijlstra wrote:
> Change fair to use resched_curr_lazy(), which, when the lazy
> preemption model is selected, will set TIF_NEED_RESCHED_LAZY.
> 
> This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
> As such, the average delay between setting LAZY and actually
> rescheduling will be TICK_NSEC/2.

I didn't understand the math here. How?

> 
> In short, Lazy preemption will delay preemption for fair class but
> will function as Full preemption for all the other classes, most
> notably the realtime (RR/FIFO/DEADLINE) classes.
> 
> The goal is to bridge the performance gap with Voluntary, such that we
> might eventually remove that option entirely.
> 
> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   include/linux/preempt.h |    8 ++++-
>   kernel/Kconfig.preempt  |   15 +++++++++
>   kernel/sched/core.c     |   76 ++++++++++++++++++++++++++++++++++++++++++++++--
>   kernel/sched/debug.c    |    5 +--
>   kernel/sched/fair.c     |    6 +--
>   kernel/sched/sched.h    |    1
>   6 files changed, 103 insertions(+), 8 deletions(-)
> 
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_dis
>   extern bool preempt_model_none(void);
>   extern bool preempt_model_voluntary(void);
>   extern bool preempt_model_full(void);
> +extern bool preempt_model_lazy(void);
>   
>   #else
>   
> @@ -502,6 +503,11 @@ static inline bool preempt_model_full(vo
>   	return IS_ENABLED(CONFIG_PREEMPT);
>   }
>   
> +static inline bool preempt_model_lazy(void)
> +{
> +	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
> +}
> +
>   #endif
>   
>   static inline bool preempt_model_rt(void)
> @@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void
>    */
>   static inline bool preempt_model_preemptible(void)
>   {
> -	return preempt_model_full() || preempt_model_rt();
> +	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
>   }
>   
>   #endif /* __LINUX_PREEMPT_H */
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -11,6 +11,9 @@ config PREEMPT_BUILD
>   	select PREEMPTION
>   	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
>   
> +config ARCH_HAS_PREEMPT_LAZY
> +	bool
> +
>   choice
>   	prompt "Preemption Model"
>   	default PREEMPT_NONE
> @@ -67,6 +70,18 @@ config PREEMPT
>   	  embedded system with latency requirements in the milliseconds
>   	  range.
>   
> +config PREEMPT_LAZY
> +	bool "Scheduler controlled preemption model"
> +	depends on !ARCH_NO_PREEMPT
> +	depends on ARCH_HAS_PREEMPT_LAZY
> +	select PREEMPT_BUILD
> +	help
> +	  This option provides a scheduler driven preemption model that
> +	  is fundamentally similar to full preemption, but is less
> +	  eager to preempt SCHED_NORMAL tasks in an attempt to
> +	  reduce lock holder preemption and recover some of the performance
> +	  gains seen from using Voluntary preemption.
> +
>   config PREEMPT_RT
>   	bool "Fully Preemptible Kernel (Real-Time)"
>   	depends on EXPERT && ARCH_SUPPORTS_RT
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1078,6 +1078,9 @@ static void __resched_curr(struct rq *rq
>   
>   	lockdep_assert_rq_held(rq);
>   
> +	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
> +		tif = TIF_NEED_RESCHED;
> +
>   	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
>   		return;
>   
> @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
>   	__resched_curr(rq, TIF_NEED_RESCHED);
>   }
>   
> +#ifdef CONFIG_PREEMPT_DYNAMIC
> +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
> +}
> +#else
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return IS_ENABLED(PREEMPT_LAZY);


This should be CONFIG_PREEMPT_LAZY no?

> +}
> +#endif
> +
> +static __always_inline int tif_need_resched_lazy(void)
> +{
> +	if (dynamic_preempt_lazy())
> +		return TIF_NEED_RESCHED_LAZY;
> +
> +	return TIF_NEED_RESCHED;
> +}
> +
> +void resched_curr_lazy(struct rq *rq)
> +{
> +	__resched_curr(rq, tif_need_resched_lazy());
> +}
> +
...
Re: [PATCH 2/5] sched: Add Lazy preemption model
Posted by Sebastian Andrzej Siewior 1 month ago
On 2024-10-15 20:07:26 [+0530], Shrikanth Hegde wrote:
> 
> 
> On 10/7/24 13:16, Peter Zijlstra wrote:
> > Change fair to use resched_curr_lazy(), which, when the lazy
> > preemption model is selected, will set TIF_NEED_RESCHED_LAZY.
> > 
> > This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
> > As such, the average delay between setting LAZY and actually
> > rescheduling will be TICK_NSEC/2.
> 
> I didn't understand the math here. How?

If you set the LAZY bit you wait until sched_tick() fires, and that happens
every TICK_NSEC. In the extreme cases the timer fires either immediately
(right after setting the bit) or a full TICK_NSEC later (because it just
fired, so it takes another TICK_NSEC). The delay is therefore roughly
uniformly distributed over [0, TICK_NSEC], so the average is in the middle,
i.e. TICK_NSEC/2.

> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
> >   	__resched_curr(rq, TIF_NEED_RESCHED);
> >   }
> > +#ifdef CONFIG_PREEMPT_DYNAMIC
> > +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
> > +static __always_inline bool dynamic_preempt_lazy(void)
> > +{
> > +	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
> > +}
> > +#else
> > +static __always_inline bool dynamic_preempt_lazy(void)
> > +{
> > +	return IS_ENABLED(PREEMPT_LAZY);
> 
> 
> This should be CONFIG_PREEMPT_LAZY no?

Correct.

Sebastian
Re: [PATCH 2/5] sched: Add Lazy preemption model
Posted by Sebastian Andrzej Siewior 1 month, 2 weeks ago
On 2024-10-07 09:46:11 [+0200], Peter Zijlstra wrote:
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
…
> +static __always_inline int tif_need_resched_lazy(void)

The naming is a bit confusing here because tif_need_resched() checks whether
TIF_NEED_RESCHED is set, while this returns the proper TIF bit instead.

> +{
> +	if (dynamic_preempt_lazy())
> +		return TIF_NEED_RESCHED_LAZY;
> +
> +	return TIF_NEED_RESCHED;
> +}

Sebastian
Re: [PATCH 2/5] sched: Add Lazy preemption model
Posted by Peter Zijlstra 1 month, 2 weeks ago
On Wed, Oct 09, 2024 at 10:50:21AM +0200, Sebastian Andrzej Siewior wrote:
> On 2024-10-07 09:46:11 [+0200], Peter Zijlstra wrote:
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
> …
> > +static __always_inline int tif_need_resched_lazy(void)
> 
> The naming is a bit confusing here because tif_need_resched() checks if
> the TIF_NEED_RESCHED is set while this returns the proper TIF bit
> instead.

Right you are; naming things be hard. How about: get_lazy_tif_bit() ?
There's only the single user anyway.

> > +{
> > +	if (dynamic_preempt_lazy())
> > +		return TIF_NEED_RESCHED_LAZY;
> > +
> > +	return TIF_NEED_RESCHED;
> > +}
> 
> Sebastian
Re: [PATCH 2/5] sched: Add Lazy preemption model
Posted by Sebastian Andrzej Siewior 1 month, 2 weeks ago
On 2024-10-09 11:14:01 [+0200], Peter Zijlstra wrote:
> On Wed, Oct 09, 2024 at 10:50:21AM +0200, Sebastian Andrzej Siewior wrote:
> > On 2024-10-07 09:46:11 [+0200], Peter Zijlstra wrote:
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
> > …
> > > +static __always_inline int tif_need_resched_lazy(void)
> > 
> > The naming is a bit confusing here because tif_need_resched() checks if
> > the TIF_NEED_RESCHED is set while this returns the proper TIF bit
> > instead.
> 
> Right you are; naming things be hard. How about: get_lazy_tif_bit() ?
> There's only the single user anyway.

perfect.

> > > +{
> > > +	if (dynamic_preempt_lazy())
> > > +		return TIF_NEED_RESCHED_LAZY;
> > > +
> > > +	return TIF_NEED_RESCHED;
> > > +}

Sebastian
Re: [PATCH 2/5] sched: Add Lazy preemption model
Posted by Ankur Arora 1 month, 2 weeks ago
Peter Zijlstra <peterz@infradead.org> writes:

> Change fair to use resched_curr_lazy(), which, when the lazy
> preemption model is selected, will set TIF_NEED_RESCHED_LAZY.
>
> This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
> As such, the average delay between setting LAZY and actually
> rescheduling will be TICK_NSEC/2.
>
> In short, Lazy preemption will delay preemption for fair class but
> will function as Full preemption for all the other classes, most
> notably the realtime (RR/FIFO/DEADLINE) classes.
>
> The goal is to bridge the performance gap with Voluntary, such that we
> might eventually remove that option entirely.
>
> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/preempt.h |    8 ++++-
>  kernel/Kconfig.preempt  |   15 +++++++++
>  kernel/sched/core.c     |   76 ++++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/sched/debug.c    |    5 +--
>  kernel/sched/fair.c     |    6 +--
>  kernel/sched/sched.h    |    1
>  6 files changed, 103 insertions(+), 8 deletions(-)
>
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_dis
>  extern bool preempt_model_none(void);
>  extern bool preempt_model_voluntary(void);
>  extern bool preempt_model_full(void);
> +extern bool preempt_model_lazy(void);
>
>  #else
>
> @@ -502,6 +503,11 @@ static inline bool preempt_model_full(vo
>  	return IS_ENABLED(CONFIG_PREEMPT);
>  }
>
> +static inline bool preempt_model_lazy(void)
> +{
> +	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
> +}
> +
>  #endif
>
>  static inline bool preempt_model_rt(void)
> @@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void
>   */
>  static inline bool preempt_model_preemptible(void)
>  {
> -	return preempt_model_full() || preempt_model_rt();
> +	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
>  }

In addition to preempt_model_preemptible() we probably also need

  static inline bool preempt_model_minimize_latency(void)
  {
  	return preempt_model_full() || preempt_model_rt();
  }

for spin_needbreak()/rwlock_needbreak().

That would make the behaviour of spin_needbreak() under the lazy model
similar to none/voluntary.
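
For context, spin_needbreak() keys off preempt_model_preemptible(), so with
the hunk above the lazy model would start breaking out of contended lock loops
the way full does. Roughly, paraphrased from include/linux/spinlock.h:

  static inline int spin_needbreak(spinlock_t *lock)
  {
	/* With lazy now counted as preemptible, this reports contention too. */
	if (!preempt_model_preemptible())
		return 0;

	return spin_is_contended(lock);
  }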

>  #endif /* __LINUX_PREEMPT_H */
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -11,6 +11,9 @@ config PREEMPT_BUILD
>  	select PREEMPTION
>  	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
>
> +config ARCH_HAS_PREEMPT_LAZY
> +	bool
> +
>  choice
>  	prompt "Preemption Model"
>  	default PREEMPT_NONE
> @@ -67,6 +70,18 @@ config PREEMPT
>  	  embedded system with latency requirements in the milliseconds
>  	  range.
>
> +config PREEMPT_LAZY
> +	bool "Scheduler controlled preemption model"
> +	depends on !ARCH_NO_PREEMPT
> +	depends on ARCH_HAS_PREEMPT_LAZY
> +	select PREEMPT_BUILD
> +	help
> +	  This option provides a scheduler driven preemption model that
> +	  is fundamentally similar to full preemption, but is less
> +	  eager to preempt SCHED_NORMAL tasks in an attempt to
> +	  reduce lock holder preemption and recover some of the performance
> +	  gains seen from using Voluntary preemption.
> +
>  config PREEMPT_RT
>  	bool "Fully Preemptible Kernel (Real-Time)"
>  	depends on EXPERT && ARCH_SUPPORTS_RT
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1078,6 +1078,9 @@ static void __resched_curr(struct rq *rq
>
>  	lockdep_assert_rq_held(rq);
>
> +	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
> +		tif = TIF_NEED_RESCHED;
> +

Tasks with idle policy get handled at the usual user space boundary.
Maybe a comment reflecting that?

>  	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
>  		return;
>
> @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
>  	__resched_curr(rq, TIF_NEED_RESCHED);
>  }
>
> +#ifdef CONFIG_PREEMPT_DYNAMIC
> +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
> +}
> +#else
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return IS_ENABLED(PREEMPT_LAZY);
> +}
> +#endif
> +
> +static __always_inline int tif_need_resched_lazy(void)
> +{
> +	if (dynamic_preempt_lazy())
> +		return TIF_NEED_RESCHED_LAZY;
> +
> +	return TIF_NEED_RESCHED;
> +}

Nice. This simplifies things.

> +void resched_curr_lazy(struct rq *rq)
> +{
> +	__resched_curr(rq, tif_need_resched_lazy());
> +}
> +
>  void resched_cpu(int cpu)
>  {
>  	struct rq *rq = cpu_rq(cpu);
> @@ -5598,6 +5627,10 @@ void sched_tick(void)
>  	update_rq_clock(rq);
>  	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
>  	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> +
> +	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
> +		resched_curr(rq);
> +

So this works for SCHED_NORMAL. But does this do the right thing for deadline
and the other scheduling classes?


--
ankur
Re: [PATCH 2/5] sched: Add Lazy preemption model
Posted by Peter Zijlstra 1 month, 2 weeks ago
On Mon, Oct 07, 2024 at 10:43:58PM -0700, Ankur Arora wrote:

> > @@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void
> >   */
> >  static inline bool preempt_model_preemptible(void)
> >  {
> > -	return preempt_model_full() || preempt_model_rt();
> > +	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
> >  }
> 
> In addition to preempt_model_preemptible() we probably also need
> 
>   static inline bool preempt_model_minimize_latency(void)
>   {
>   	return preempt_model_full() || preempt_model_rt();
>   }
> 
> for spin_needbreak()/rwlock_needbreak().
> 
> That would make the behaviour of spin_needbreak() under the lazy model
> similar to none/voluntary.

That whole thing needs rethinking, for one the preempt_model_rt() one
doesn't really make sense anymore at the end of this.

> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1078,6 +1078,9 @@ static void __resched_curr(struct rq *rq
> >
> >  	lockdep_assert_rq_held(rq);
> >
> > +	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
> > +		tif = TIF_NEED_RESCHED;
> > +
> 
> Tasks with idle policy get handled at the usual user space boundary.
> Maybe a comment reflecting that?

is_idle_task() != SCHED_IDLE. This is about the idle task, which you
want to force preempt always. But I can stick a comment on.

> > @@ -5598,6 +5627,10 @@ void sched_tick(void)
> >  	update_rq_clock(rq);
> >  	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
> >  	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> > +
> > +	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
> > +		resched_curr(rq);
> > +
> 
> So this works for SCHED_NORMAL. But, does this do the right thing for
> deadline etc other scheduling classes?

Yeah, only fair.c uses resched_curr_lazy(); the others still use
resched_curr() and will work as if Full.

So that is: SCHED_IDLE, SCHED_BATCH and SCHED_NORMAL/OTHER get the lazy
thing, FIFO, RR and DEADLINE get the traditional Full behaviour.
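
For example, the RT wakeup-preemption path is untouched and still sets the
full bit; abridged and paraphrased from kernel/sched/rt.c (SMP equal-prio
handling elided):

  static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
  {
	/* A higher-priority RT wakeup still gets the full NEED_RESCHED bit. */
	if (p->prio < rq->curr->prio) {
		resched_curr(rq);
		return;
	}
  }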
[tip: sched/core] sched: Add Lazy preemption model
Posted by tip-bot2 for Peter Zijlstra 3 weeks ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     7c70cb94d29cd325fabe4a818c18613e3b9919a1
Gitweb:        https://git.kernel.org/tip/7c70cb94d29cd325fabe4a818c18613e3b9919a1
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 04 Oct 2024 14:46:58 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 05 Nov 2024 12:55:38 +01:00

sched: Add Lazy preemption model

Change fair to use resched_curr_lazy(), which, when the lazy
preemption model is selected, will set TIF_NEED_RESCHED_LAZY.

This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
As such, the average delay between setting LAZY and actually
rescheduling will be TICK_NSEC/2.

In short, Lazy preemption will delay preemption for fair class but
will function as Full preemption for all the other classes, most
notably the realtime (RR/FIFO/DEADLINE) classes.

The goal is to bridge the performance gap with Voluntary, such that we
might eventually remove that option entirely.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20241007075055.331243614@infradead.org
---
 include/linux/preempt.h |  8 +++-
 kernel/Kconfig.preempt  | 15 ++++++++-
 kernel/sched/core.c     | 80 +++++++++++++++++++++++++++++++++++++++-
 kernel/sched/debug.c    |  5 +--
 kernel/sched/fair.c     |  6 +--
 kernel/sched/sched.h    |  1 +-
 6 files changed, 107 insertions(+), 8 deletions(-)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index ce76f1a..ca86235 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
 extern bool preempt_model_none(void);
 extern bool preempt_model_voluntary(void);
 extern bool preempt_model_full(void);
+extern bool preempt_model_lazy(void);
 
 #else
 
@@ -502,6 +503,11 @@ static inline bool preempt_model_full(void)
 	return IS_ENABLED(CONFIG_PREEMPT);
 }
 
+static inline bool preempt_model_lazy(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+
 #endif
 
 static inline bool preempt_model_rt(void)
@@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void)
  */
 static inline bool preempt_model_preemptible(void)
 {
-	return preempt_model_full() || preempt_model_rt();
+	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
 }
 
 #endif /* __LINUX_PREEMPT_H */
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index fe782cd..09f06d8 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -11,6 +11,9 @@ config PREEMPT_BUILD
 	select PREEMPTION
 	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
 
+config ARCH_HAS_PREEMPT_LAZY
+	bool
+
 choice
 	prompt "Preemption Model"
 	default PREEMPT_NONE
@@ -67,6 +70,18 @@ config PREEMPT
 	  embedded system with latency requirements in the milliseconds
 	  range.
 
+config PREEMPT_LAZY
+	bool "Scheduler controlled preemption model"
+	depends on !ARCH_NO_PREEMPT
+	depends on ARCH_HAS_PREEMPT_LAZY
+	select PREEMPT_BUILD
+	help
+	  This option provides a scheduler driven preemption model that
+	  is fundamentally similar to full preemption, but is less
+	  eager to preempt SCHED_NORMAL tasks in an attempt to
+	  reduce lock holder preemption and recover some of the performance
+	  gains seen from using Voluntary preemption.
+
 config PREEMPT_RT
 	bool "Fully Preemptible Kernel (Real-Time)"
 	depends on EXPERT && ARCH_SUPPORTS_RT
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0cd05e3..df6a34d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1083,6 +1083,13 @@ static void __resched_curr(struct rq *rq, int tif)
 
 	lockdep_assert_rq_held(rq);
 
+	/*
+	 * Always immediately preempt the idle task; no point in delaying doing
+	 * actual work.
+	 */
+	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
+		tif = TIF_NEED_RESCHED;
+
 	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
 		return;
 
@@ -1108,6 +1115,32 @@ void resched_curr(struct rq *rq)
 	__resched_curr(rq, TIF_NEED_RESCHED);
 }
 
+#ifdef CONFIG_PREEMPT_DYNAMIC
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
+}
+#else
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+#endif
+
+static __always_inline int get_lazy_tif_bit(void)
+{
+	if (dynamic_preempt_lazy())
+		return TIF_NEED_RESCHED_LAZY;
+
+	return TIF_NEED_RESCHED;
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+	__resched_curr(rq, get_lazy_tif_bit());
+}
+
 void resched_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -5612,6 +5645,10 @@ void sched_tick(void)
 	update_rq_clock(rq);
 	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
 	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
+
+	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+		resched_curr(rq);
+
 	donor->sched_class->task_tick(rq, donor, 0);
 	if (sched_feat(LATENCY_WARN))
 		resched_latency = cpu_resched_latency(rq);
@@ -7374,6 +7411,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
  *   preempt_schedule           <- NOP
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
+ *   dynamic_preempt_lazy       <- false
  *
  * VOLUNTARY:
  *   cond_resched               <- __cond_resched
@@ -7381,6 +7419,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
  *   preempt_schedule           <- NOP
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
+ *   dynamic_preempt_lazy       <- false
  *
  * FULL:
  *   cond_resched               <- RET0
@@ -7388,6 +7427,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
  *   preempt_schedule           <- preempt_schedule
  *   preempt_schedule_notrace   <- preempt_schedule_notrace
  *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ *   dynamic_preempt_lazy       <- false
+ *
+ * LAZY:
+ *   cond_resched               <- RET0
+ *   might_resched              <- RET0
+ *   preempt_schedule           <- preempt_schedule
+ *   preempt_schedule_notrace   <- preempt_schedule_notrace
+ *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ *   dynamic_preempt_lazy       <- true
  */
 
 enum {
@@ -7395,6 +7443,7 @@ enum {
 	preempt_dynamic_none,
 	preempt_dynamic_voluntary,
 	preempt_dynamic_full,
+	preempt_dynamic_lazy,
 };
 
 int preempt_dynamic_mode = preempt_dynamic_undefined;
@@ -7410,15 +7459,23 @@ int sched_dynamic_mode(const char *str)
 	if (!strcmp(str, "full"))
 		return preempt_dynamic_full;
 
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+	if (!strcmp(str, "lazy"))
+		return preempt_dynamic_lazy;
+#endif
+
 	return -EINVAL;
 }
 
+#define preempt_dynamic_key_enable(f)	static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_key_disable(f)	static_key_disable(&sk_dynamic_##f.key)
+
 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 #define preempt_dynamic_enable(f)	static_call_update(f, f##_dynamic_enabled)
 #define preempt_dynamic_disable(f)	static_call_update(f, f##_dynamic_disabled)
 #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f)	static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_disable(f)	static_key_disable(&sk_dynamic_##f.key)
+#define preempt_dynamic_enable(f)	preempt_dynamic_key_enable(f)
+#define preempt_dynamic_disable(f)	preempt_dynamic_key_disable(f)
 #else
 #error "Unsupported PREEMPT_DYNAMIC mechanism"
 #endif
@@ -7438,6 +7495,7 @@ static void __sched_dynamic_update(int mode)
 	preempt_dynamic_enable(preempt_schedule);
 	preempt_dynamic_enable(preempt_schedule_notrace);
 	preempt_dynamic_enable(irqentry_exit_cond_resched);
+	preempt_dynamic_key_disable(preempt_lazy);
 
 	switch (mode) {
 	case preempt_dynamic_none:
@@ -7447,6 +7505,7 @@ static void __sched_dynamic_update(int mode)
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: none\n");
 		break;
@@ -7458,6 +7517,7 @@ static void __sched_dynamic_update(int mode)
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: voluntary\n");
 		break;
@@ -7469,9 +7529,22 @@ static void __sched_dynamic_update(int mode)
 		preempt_dynamic_enable(preempt_schedule);
 		preempt_dynamic_enable(preempt_schedule_notrace);
 		preempt_dynamic_enable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: full\n");
 		break;
+
+	case preempt_dynamic_lazy:
+		if (!klp_override)
+			preempt_dynamic_disable(cond_resched);
+		preempt_dynamic_disable(might_resched);
+		preempt_dynamic_enable(preempt_schedule);
+		preempt_dynamic_enable(preempt_schedule_notrace);
+		preempt_dynamic_enable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_enable(preempt_lazy);
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: lazy\n");
+		break;
 	}
 
 	preempt_dynamic_mode = mode;
@@ -7534,6 +7607,8 @@ static void __init preempt_dynamic_init(void)
 			sched_dynamic_update(preempt_dynamic_none);
 		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
 			sched_dynamic_update(preempt_dynamic_voluntary);
+		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+			sched_dynamic_update(preempt_dynamic_lazy);
 		} else {
 			/* Default static call setting, nothing to do */
 			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
@@ -7554,6 +7629,7 @@ static void __init preempt_dynamic_init(void)
 PREEMPT_MODEL_ACCESSOR(none);
 PREEMPT_MODEL_ACCESSOR(voluntary);
 PREEMPT_MODEL_ACCESSOR(full);
+PREEMPT_MODEL_ACCESSOR(lazy);
 
 #else /* !CONFIG_PREEMPT_DYNAMIC: */
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f4035c7..44a49f9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
 static int sched_dynamic_show(struct seq_file *m, void *v)
 {
 	static const char * preempt_modes[] = {
-		"none", "voluntary", "full"
+		"none", "voluntary", "full", "lazy",
 	};
+	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+	for (i = 0; i < j; i++) {
 		if (preempt_dynamic_mode == i)
 			seq_puts(m, "(");
 		seq_puts(m, preempt_modes[i]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6512258..3356315 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		return;
 
 	if (resched || did_preempt_short(cfs_rq, curr)) {
-		resched_curr(rq);
+		resched_curr_lazy(rq);
 		clear_buddies(cfs_rq, curr);
 	}
 }
@@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * validating it and just reschedule.
 	 */
 	if (queued) {
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 		return;
 	}
 #endif
@@ -8829,7 +8829,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int 
 	return;
 
 preempt:
-	resched_curr(rq);
+	resched_curr_lazy(rq);
 }
 
 static struct task_struct *pick_task_fair(struct rq *rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e51bf5a..090dd4b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2689,6 +2689,7 @@ extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
 
 extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
 extern void resched_cpu(int cpu);
 
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);