Reuse sched_dynamic_update() and related logic to enable choosing
the preemption model at boot or runtime for PREEMPT_AUTO.
The interface is identical to PREEMPT_DYNAMIC.
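As an example of that interface (output here is illustrative; the model
shown in parentheses is the active one):

    # boot time:
    preempt=none|voluntary|full

    # runtime:
    # cat /sys/kernel/debug/sched/preempt
    none voluntary (full)
    # echo voluntary > /sys/kernel/debug/sched/preempt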
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
Changelog:
change title
---
include/linux/preempt.h | 2 +-
kernel/sched/core.c | 31 +++++++++++++++++++++++++++----
kernel/sched/debug.c | 6 +++---
kernel/sched/sched.h | 2 +-
4 files changed, 32 insertions(+), 9 deletions(-)
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index d453f5e34390..d4f568606eda 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -481,7 +481,7 @@ DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace())
DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
-#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_PREEMPT_DYNAMIC) || defined(CONFIG_PREEMPT_AUTO)
extern bool preempt_model_none(void);
extern bool preempt_model_voluntary(void);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 349f6257fdcd..d7804e29182d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8713,9 +8713,13 @@ int __cond_resched_rwlock_write(rwlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_rwlock_write);
-#if defined(CONFIG_PREEMPT_DYNAMIC)
+#if defined(CONFIG_PREEMPT_DYNAMIC) || defined(CONFIG_PREEMPT_AUTO)
+#ifdef CONFIG_PREEMPT_DYNAMIC
#define PREEMPT_MODE "Dynamic Preempt"
+#else
+#define PREEMPT_MODE "Preempt Auto"
+#endif
enum {
preempt_dynamic_undefined = -1,
@@ -8790,11 +8794,11 @@ PREEMPT_MODEL_ACCESSOR(none);
PREEMPT_MODEL_ACCESSOR(voluntary);
PREEMPT_MODEL_ACCESSOR(full);
-#else /* !CONFIG_PREEMPT_DYNAMIC */
+#else /* !CONFIG_PREEMPT_DYNAMIC && !CONFIG_PREEMPT_AUTO */
static inline void preempt_dynamic_init(void) { }
-#endif /* !CONFIG_PREEMPT_DYNAMIC */
+#endif /* !CONFIG_PREEMPT_DYNAMIC && !CONFIG_PREEMPT_AUTO */
#ifdef CONFIG_PREEMPT_DYNAMIC
@@ -8925,7 +8929,26 @@ void sched_dynamic_klp_disable(void)
#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
-#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
+#elif defined(CONFIG_PREEMPT_AUTO)
+
+static void __sched_dynamic_update(int mode)
+{
+ switch (mode) {
+ case preempt_dynamic_none:
+ preempt_dynamic_mode = preempt_dynamic_undefined;
+ break;
+
+ case preempt_dynamic_voluntary:
+ preempt_dynamic_mode = preempt_dynamic_undefined;
+ break;
+
+ case preempt_dynamic_full:
+ preempt_dynamic_mode = preempt_dynamic_undefined;
+ break;
+ }
+}
+
+#endif /* CONFIG_PREEMPT_AUTO */
/**
* yield - yield the current processor to other threads.
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8d5d98a5834d..e53f1b73bf4a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -216,7 +216,7 @@ static const struct file_operations sched_scaling_fops = {
#endif /* SMP */
-#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_PREEMPT_DYNAMIC) || defined(CONFIG_PREEMPT_AUTO)
static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -276,7 +276,7 @@ static const struct file_operations sched_dynamic_fops = {
.release = single_release,
};
-#endif /* CONFIG_PREEMPT_DYNAMIC */
+#endif /* CONFIG_PREEMPT_DYNAMIC || CONFIG_PREEMPT_AUTO */
__read_mostly bool sched_debug_verbose;
@@ -343,7 +343,7 @@ static __init int sched_init_debug(void)
debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
debugfs_create_file_unsafe("verbose", 0644, debugfs_sched, &sched_debug_verbose, &sched_verbose_fops);
-#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_PREEMPT_DYNAMIC) || defined(CONFIG_PREEMPT_AUTO)
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ae50f212775e..c9239c0b0095 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3231,7 +3231,7 @@ extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *w
extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags);
-#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_PREEMPT_DYNAMIC) || defined(CONFIG_PREEMPT_AUTO)
extern int preempt_dynamic_mode;
extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
--
2.31.1
On Mon, May 27, 2024 at 05:34:59PM -0700, Ankur Arora wrote:
> Reuse sched_dynamic_update() and related logic to enable choosing
> the preemption model at boot or runtime for PREEMPT_AUTO.
>
> The interface is identical to PREEMPT_DYNAMIC.

Colour me confused, why?!? What are you doing and why aren't you just
adding AUTO to the existing DYNAMIC thing?
Peter Zijlstra <peterz@infradead.org> writes:

> On Mon, May 27, 2024 at 05:34:59PM -0700, Ankur Arora wrote:
>> Reuse sched_dynamic_update() and related logic to enable choosing
>> the preemption model at boot or runtime for PREEMPT_AUTO.
>>
>> The interface is identical to PREEMPT_DYNAMIC.
>
> Colour me confused, why?!? What are you doing and why aren't you just
> adding AUTO to the existing DYNAMIC thing?

You mean have a single __sched_dynamic_update()? AUTO doesn't use any
of the static_call/static_key stuff so I'm not sure how that would work.

Or am I missing the point of what you are saying?

--
ankur
On Thu, May 30, 2024 at 02:29:45AM -0700, Ankur Arora wrote:
>
> Peter Zijlstra <peterz@infradead.org> writes:
>
> > On Mon, May 27, 2024 at 05:34:59PM -0700, Ankur Arora wrote:
> >> Reuse sched_dynamic_update() and related logic to enable choosing
> >> the preemption model at boot or runtime for PREEMPT_AUTO.
> >>
> >> The interface is identical to PREEMPT_DYNAMIC.
> >
> > Colour me confused, why?!? What are you doing and why aren't you just
> > adding AUTO to the existing DYNAMIC thing?
>
> You mean have a single __sched_dynamic_update()? AUTO doesn't use any
> of the static_call/static_key stuff so I'm not sure how that would work.
*sigh*... see the below, seems to work.
---
arch/x86/Kconfig | 1 +
arch/x86/include/asm/thread_info.h | 6 +-
include/linux/entry-common.h | 3 +-
include/linux/entry-kvm.h | 5 +-
include/linux/sched.h | 10 +++-
include/linux/thread_info.h | 21 +++++--
kernel/Kconfig.preempt | 11 ++++
kernel/entry/common.c | 2 +-
kernel/entry/kvm.c | 4 +-
kernel/sched/core.c | 110 ++++++++++++++++++++++++++++++++-----
kernel/sched/debug.c | 2 +-
kernel/sched/fair.c | 4 +-
kernel/sched/sched.h | 1 +
13 files changed, 148 insertions(+), 32 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e8837116704ce..61f86b69524d7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -91,6 +91,7 @@ config X86
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API if X86_64
+ select ARCH_HAS_PREEMPT_LAZY
select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_HW_PTE_YOUNG
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 12da7dfd5ef13..75bb390f7baf5 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -87,8 +87,9 @@ struct thread_info {
#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
#define TIF_SIGPENDING 2 /* signal pending */
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
-#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
-#define TIF_SSBD 5 /* Speculative store bypass disable */
+#define TIF_NEED_RESCHED_LAZY 4 /* rescheduling necessary */
+#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/
+#define TIF_SSBD 6 /* Speculative store bypass disable */
#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
@@ -110,6 +111,7 @@ struct thread_info {
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
#define _TIF_SSBD (1 << TIF_SSBD)
#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index b0fb775a600d9..e66c8a7c113f4 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -64,7 +64,8 @@
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
+ _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
+ _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
ARCH_EXIT_TO_USER_MODE_WORK)
/**
diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 6813171afccb2..16149f6625e48 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -17,8 +17,9 @@
#endif
#define XFER_TO_GUEST_MODE_WORK \
- (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \
- _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK)
+ (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \
+ _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \
+ ARCH_XFER_TO_GUEST_MODE_WORK)
struct kvm_vcpu;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7635045b2395c..5900d84e08b3c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1968,7 +1968,8 @@ static inline void set_tsk_need_resched(struct task_struct *tsk)
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
- clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+ atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY,
+ (atomic_long_t *)&task_thread_info(tsk)->flags);
}
static inline int test_tsk_need_resched(struct task_struct *tsk)
@@ -2074,6 +2075,7 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
extern bool preempt_model_none(void);
extern bool preempt_model_voluntary(void);
extern bool preempt_model_full(void);
+extern bool preempt_model_lazy(void);
#else
@@ -2089,6 +2091,10 @@ static inline bool preempt_model_full(void)
{
return IS_ENABLED(CONFIG_PREEMPT);
}
+static inline bool preempt_model_lazy(void)
+{
+ return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
#endif
@@ -2107,7 +2113,7 @@ static inline bool preempt_model_rt(void)
*/
static inline bool preempt_model_preemptible(void)
{
- return preempt_model_full() || preempt_model_rt();
+ return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
}
static __always_inline bool need_resched(void)
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9ea0b28068f49..cf2446c9c30d4 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -59,6 +59,14 @@ enum syscall_work_bit {
#include <asm/thread_info.h>
+#ifndef TIF_NEED_RESCHED_LAZY
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+#error Inconsistent PREEMPT_LAZY
+#endif
+#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
+#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
+#endif
+
#ifdef __KERNEL__
#ifndef arch_set_restart_data
@@ -179,22 +187,27 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti
#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
-static __always_inline bool tif_need_resched(void)
+static __always_inline bool tif_test_bit(int bit)
{
- return arch_test_bit(TIF_NEED_RESCHED,
+ return arch_test_bit(bit,
(unsigned long *)(&current_thread_info()->flags));
}
#else
-static __always_inline bool tif_need_resched(void)
+static __always_inline bool tif_test_bit(int bit)
{
- return test_bit(TIF_NEED_RESCHED,
+ return test_bit(bit,
(unsigned long *)(&current_thread_info()->flags));
}
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
+static __always_inline bool tif_need_resched(void)
+{
+ return tif_test_bit(TIF_NEED_RESCHED);
+}
+
#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
static inline int arch_within_stack_frames(const void * const stack,
const void * const stackend,
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c2f1fd95a8214..1a2e3849e3e5f 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -11,6 +11,9 @@ config PREEMPT_BUILD
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+config ARCH_HAS_PREEMPT_LAZY
+ bool
+
choice
prompt "Preemption Model"
default PREEMPT_NONE
@@ -67,6 +70,14 @@ config PREEMPT
embedded system with latency requirements in the milliseconds
range.
+config PREEMPT_LAZY
+ bool "Scheduler controlled preemption model"
+ depends on !ARCH_NO_PREEMPT
+ depends on ARCH_HAS_PREEMPT_LAZY
+ select PREEMPT_BUILD
+ help
+ Hamsters in your brain...
+
config PREEMPT_RT
bool "Fully Preemptible Kernel (Real-Time)"
depends on EXPERT && ARCH_SUPPORTS_RT
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 90843cc385880..bcb23c866425e 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_enable_exit_to_user(ti_work);
- if (ti_work & _TIF_NEED_RESCHED)
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
schedule();
if (ti_work & _TIF_UPROBE)
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 2e0f75bcb7fd1..8485f63863afc 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
return -EINTR;
}
- if (ti_work & _TIF_NEED_RESCHED)
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
schedule();
if (ti_work & _TIF_NOTIFY_RESUME)
@@ -24,7 +24,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
return ret;
ti_work = read_thread_flags();
- } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
+ } while (ti_work & XFER_TO_GUEST_MODE_WORK);
return 0;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 965e6464e68e9..c32de809283cf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -904,10 +904,9 @@ static inline void hrtick_rq_init(struct rq *rq)
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- struct thread_info *ti = task_thread_info(p);
- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+ return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
}
/*
@@ -932,9 +931,9 @@ static bool set_nr_if_polling(struct task_struct *p)
}
#else
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- set_tsk_need_resched(p);
+ atomic_long_or(1 << tif, (atomic_long_t *)&ti->flags);
return true;
}
@@ -1039,28 +1038,66 @@ void wake_up_q(struct wake_q_head *head)
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-void resched_curr(struct rq *rq)
+static void __resched_curr(struct rq *rq, int tif)
{
struct task_struct *curr = rq->curr;
+ struct thread_info *cti = task_thread_info(curr);
int cpu;
lockdep_assert_rq_held(rq);
- if (test_tsk_need_resched(curr))
+ if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
+ tif = TIF_NEED_RESCHED;
+
+ if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
return;
cpu = cpu_of(rq);
if (cpu == smp_processor_id()) {
- set_tsk_need_resched(curr);
- set_preempt_need_resched();
+ set_ti_thread_flag(cti, tif);
+ if (tif == TIF_NEED_RESCHED)
+ set_preempt_need_resched();
return;
}
- if (set_nr_and_not_polling(curr))
- smp_send_reschedule(cpu);
- else
+ if (set_nr_and_not_polling(cti, tif)) {
+ if (tif == TIF_NEED_RESCHED)
+ smp_send_reschedule(cpu);
+ } else {
trace_sched_wake_idle_without_ipi(cpu);
+ }
+}
+
+void resched_curr(struct rq *rq)
+{
+ __resched_curr(rq, TIF_NEED_RESCHED);
+}
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+ return static_branch_unlikely(&sk_dynamic_preempt_lazy);
+}
+#else
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+ return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+#endif
+
+static __always_inline int tif_need_resched_lazy(void)
+{
+ if (dynamic_preempt_lazy())
+ return TIF_NEED_RESCHED_LAZY;
+
+ return TIF_NEED_RESCHED;
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+ __resched_curr(rq, tif_need_resched_lazy());
}
void resched_cpu(int cpu)
@@ -1155,7 +1192,7 @@ static void wake_up_idle_cpu(int cpu)
* and testing of the above solutions didn't appear to report
* much benefits.
*/
- if (set_nr_and_not_polling(rq->idle))
+ if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
@@ -5537,6 +5574,10 @@ void sched_tick(void)
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
+
+ if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+ resched_curr(rq);
+
curr->sched_class->task_tick(rq, curr, 0);
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
@@ -7245,6 +7286,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
* preempt_schedule <- NOP
* preempt_schedule_notrace <- NOP
* irqentry_exit_cond_resched <- NOP
+ * dynamic_preempt_lazy <- false
*
* VOLUNTARY:
* cond_resched <- __cond_resched
@@ -7252,6 +7294,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
* preempt_schedule <- NOP
* preempt_schedule_notrace <- NOP
* irqentry_exit_cond_resched <- NOP
+ * dynamic_preempt_lazy <- false
*
* FULL:
* cond_resched <- RET0
@@ -7259,6 +7302,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
* preempt_schedule <- preempt_schedule
* preempt_schedule_notrace <- preempt_schedule_notrace
* irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ * dynamic_preempt_lazy <- false
+ *
+ * LAZY:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ * dynamic_preempt_lazy <- true
*/
enum {
@@ -7266,6 +7318,7 @@ enum {
preempt_dynamic_none,
preempt_dynamic_voluntary,
preempt_dynamic_full,
+ preempt_dynamic_lazy,
};
int preempt_dynamic_mode = preempt_dynamic_undefined;
@@ -7281,15 +7334,23 @@ int sched_dynamic_mode(const char *str)
if (!strcmp(str, "full"))
return preempt_dynamic_full;
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+ if (!strcmp(str, "lazy"))
+ return preempt_dynamic_lazy;
+#endif
+
return -EINVAL;
}
+#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
+
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
+#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
+#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
#else
#error "Unsupported PREEMPT_DYNAMIC mechanism"
#endif
@@ -7309,6 +7370,7 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
switch (mode) {
case preempt_dynamic_none:
@@ -7318,6 +7380,7 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: none\n");
break;
@@ -7329,6 +7392,7 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: voluntary\n");
break;
@@ -7340,9 +7404,22 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: full\n");
break;
+
+ case preempt_dynamic_lazy:
+ if (!klp_override)
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_enable(preempt_lazy);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: lazy\n");
+ break;
}
preempt_dynamic_mode = mode;
@@ -7405,6 +7482,8 @@ static void __init preempt_dynamic_init(void)
sched_dynamic_update(preempt_dynamic_none);
} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
sched_dynamic_update(preempt_dynamic_voluntary);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+ sched_dynamic_update(preempt_dynamic_lazy);
} else {
/* Default static call setting, nothing to do */
WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
@@ -7425,6 +7504,7 @@ static void __init preempt_dynamic_init(void)
PREEMPT_MODEL_ACCESSOR(none);
PREEMPT_MODEL_ACCESSOR(voluntary);
PREEMPT_MODEL_ACCESSOR(full);
+PREEMPT_MODEL_ACCESSOR(lazy);
#else /* !CONFIG_PREEMPT_DYNAMIC: */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1bc24410ae501..87309cf247c68 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,7 +245,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
static int sched_dynamic_show(struct seq_file *m, void *v)
{
static const char * preempt_modes[] = {
- "none", "voluntary", "full"
+ "none", "voluntary", "full", "lazy",
};
int i;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5b5d50dbc79dc..71b4112cadde0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1007,7 +1007,7 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
* The task has consumed its request, reschedule.
*/
if (cfs_rq->nr_running > 1) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
clear_buddies(cfs_rq, se);
}
}
@@ -8615,7 +8615,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
return;
preempt:
- resched_curr(rq);
+ resched_curr_lazy(rq);
}
static struct task_struct *pick_task_fair(struct rq *rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 041d8e00a1568..48a4617a5b28b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2494,6 +2494,7 @@ extern void init_sched_fair_class(void);
extern void reweight_task(struct task_struct *p, int prio);
extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
Peter Zijlstra <peterz@infradead.org> writes:
> On Thu, May 30, 2024 at 02:29:45AM -0700, Ankur Arora wrote:
>>
>> Peter Zijlstra <peterz@infradead.org> writes:
>>
>> > On Mon, May 27, 2024 at 05:34:59PM -0700, Ankur Arora wrote:
>> >> Reuse sched_dynamic_update() and related logic to enable choosing
>> >> the preemption model at boot or runtime for PREEMPT_AUTO.
>> >>
>> >> The interface is identical to PREEMPT_DYNAMIC.
>> >
>> > Colour me confused, why?!? What are you doing and why aren't you just
>> > adding AUTO to the existing DYNAMIC thing?
>>
>> You mean have a single __sched_dynamic_update()? AUTO doesn't use any
>> of the static_call/static_key stuff so I'm not sure how that would work.
>
> *sigh*... see the below, seems to work.
Sorry, didn't mean for you to have to do all that work to prove the
point.
I phrased it badly. I do understand how lazy can be folded in as
you do here:
> + case preempt_dynamic_lazy:
> + if (!klp_override)
> + preempt_dynamic_disable(cond_resched);
> + preempt_dynamic_disable(might_resched);
> + preempt_dynamic_enable(preempt_schedule);
> + preempt_dynamic_enable(preempt_schedule_notrace);
> + preempt_dynamic_enable(irqentry_exit_cond_resched);
> + preempt_dynamic_key_enable(preempt_lazy);
> + if (mode != preempt_dynamic_mode)
> + pr_info("Dynamic Preempt: lazy\n");
> + break;
> }
But, if the long term goal (at least as I understand it) is to get rid
of cond_resched() -- to allow optimizations that needing to call cond_resched()
makes impossible -- does it make sense to pull all of these together?
Say, eventually preempt_dynamic_lazy and preempt_dynamic_full are the
only two models left. Then we will have (modulo figuring out how to
switch over klp from cond_resched() to a different unwinding technique):
static void __sched_dynamic_update(int mode)
{
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
switch (mode) {
case preempt_dynamic_full:
preempt_dynamic_key_disable(preempt_lazy);
if (mode != preempt_dynamic_mode)
pr_info("%s: full\n", PREEMPT_MODE);
break;
case preempt_dynamic_lazy:
preempt_dynamic_key_enable(preempt_lazy);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: lazy\n");
break;
}
preempt_dynamic_mode = mode;
}
Which is pretty similar to what the PREEMPT_AUTO code was doing.
Thanks
Ankur
> ---
> arch/x86/Kconfig | 1 +
> arch/x86/include/asm/thread_info.h | 6 +-
> include/linux/entry-common.h | 3 +-
> include/linux/entry-kvm.h | 5 +-
> include/linux/sched.h | 10 +++-
> include/linux/thread_info.h | 21 +++++--
> kernel/Kconfig.preempt | 11 ++++
> kernel/entry/common.c | 2 +-
> kernel/entry/kvm.c | 4 +-
> kernel/sched/core.c | 110 ++++++++++++++++++++++++++++++++-----
> kernel/sched/debug.c | 2 +-
> kernel/sched/fair.c | 4 +-
> kernel/sched/sched.h | 1 +
> 13 files changed, 148 insertions(+), 32 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index e8837116704ce..61f86b69524d7 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -91,6 +91,7 @@ config X86
> select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
> select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
> select ARCH_HAS_PMEM_API if X86_64
> + select ARCH_HAS_PREEMPT_LAZY
> select ARCH_HAS_PTE_DEVMAP if X86_64
> select ARCH_HAS_PTE_SPECIAL
> select ARCH_HAS_HW_PTE_YOUNG
> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
> index 12da7dfd5ef13..75bb390f7baf5 100644
> --- a/arch/x86/include/asm/thread_info.h
> +++ b/arch/x86/include/asm/thread_info.h
> @@ -87,8 +87,9 @@ struct thread_info {
> #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
> #define TIF_SIGPENDING 2 /* signal pending */
> #define TIF_NEED_RESCHED 3 /* rescheduling necessary */
> -#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
> -#define TIF_SSBD 5 /* Speculative store bypass disable */
> +#define TIF_NEED_RESCHED_LAZY 4 /* rescheduling necessary */
> +#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/
> +#define TIF_SSBD 6 /* Speculative store bypass disable */
> #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
> #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */
> #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
> @@ -110,6 +111,7 @@ struct thread_info {
> #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
> #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
> #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
> +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
> #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
> #define _TIF_SSBD (1 << TIF_SSBD)
> #define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index b0fb775a600d9..e66c8a7c113f4 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -64,7 +64,8 @@
>
> #define EXIT_TO_USER_MODE_WORK \
> (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
> + _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
> + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
> ARCH_EXIT_TO_USER_MODE_WORK)
>
> /**
> diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
> index 6813171afccb2..16149f6625e48 100644
> --- a/include/linux/entry-kvm.h
> +++ b/include/linux/entry-kvm.h
> @@ -17,8 +17,9 @@
> #endif
>
> #define XFER_TO_GUEST_MODE_WORK \
> - (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \
> - _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK)
> + (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \
> + _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \
> + ARCH_XFER_TO_GUEST_MODE_WORK)
>
> struct kvm_vcpu;
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 7635045b2395c..5900d84e08b3c 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1968,7 +1968,8 @@ static inline void set_tsk_need_resched(struct task_struct *tsk)
>
> static inline void clear_tsk_need_resched(struct task_struct *tsk)
> {
> - clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
> + atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY,
> + (atomic_long_t *)&task_thread_info(tsk)->flags);
> }
>
> static inline int test_tsk_need_resched(struct task_struct *tsk)
> @@ -2074,6 +2075,7 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
> extern bool preempt_model_none(void);
> extern bool preempt_model_voluntary(void);
> extern bool preempt_model_full(void);
> +extern bool preempt_model_lazy(void);
>
> #else
>
> @@ -2089,6 +2091,10 @@ static inline bool preempt_model_full(void)
> {
> return IS_ENABLED(CONFIG_PREEMPT);
> }
> +static inline bool preempt_model_lazy(void)
> +{
> + return IS_ENABLED(CONFIG_PREEMPT_LAZY);
> +}
>
> #endif
>
> @@ -2107,7 +2113,7 @@ static inline bool preempt_model_rt(void)
> */
> static inline bool preempt_model_preemptible(void)
> {
> - return preempt_model_full() || preempt_model_rt();
> + return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
> }
>
> static __always_inline bool need_resched(void)
> diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
> index 9ea0b28068f49..cf2446c9c30d4 100644
> --- a/include/linux/thread_info.h
> +++ b/include/linux/thread_info.h
> @@ -59,6 +59,14 @@ enum syscall_work_bit {
>
> #include <asm/thread_info.h>
>
> +#ifndef TIF_NEED_RESCHED_LAZY
> +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
> +#error Inconsistent PREEMPT_LAZY
> +#endif
> +#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
> +#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
> +#endif
> +
> #ifdef __KERNEL__
>
> #ifndef arch_set_restart_data
> @@ -179,22 +187,27 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti
>
> #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
>
> -static __always_inline bool tif_need_resched(void)
> +static __always_inline bool tif_test_bit(int bit)
> {
> - return arch_test_bit(TIF_NEED_RESCHED,
> + return arch_test_bit(bit,
> (unsigned long *)(&current_thread_info()->flags));
> }
>
> #else
>
> -static __always_inline bool tif_need_resched(void)
> +static __always_inline bool tif_test_bit(int bit)
> {
> - return test_bit(TIF_NEED_RESCHED,
> + return test_bit(bit,
> (unsigned long *)(&current_thread_info()->flags));
> }
>
> #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
>
> +static __always_inline bool tif_need_resched(void)
> +{
> + return tif_test_bit(TIF_NEED_RESCHED);
> +}
> +
> #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
> static inline int arch_within_stack_frames(const void * const stack,
> const void * const stackend,
> diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
> index c2f1fd95a8214..1a2e3849e3e5f 100644
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -11,6 +11,9 @@ config PREEMPT_BUILD
> select PREEMPTION
> select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
>
> +config ARCH_HAS_PREEMPT_LAZY
> + bool
> +
> choice
> prompt "Preemption Model"
> default PREEMPT_NONE
> @@ -67,6 +70,14 @@ config PREEMPT
> embedded system with latency requirements in the milliseconds
> range.
>
> +config PREEMPT_LAZY
> + bool "Scheduler controlled preemption model"
> + depends on !ARCH_NO_PREEMPT
> + depends on ARCH_HAS_PREEMPT_LAZY
> + select PREEMPT_BUILD
> + help
> + Hamsters in your brain...
> +
> config PREEMPT_RT
> bool "Fully Preemptible Kernel (Real-Time)"
> depends on EXPERT && ARCH_SUPPORTS_RT
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index 90843cc385880..bcb23c866425e 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>
> local_irq_enable_exit_to_user(ti_work);
>
> - if (ti_work & _TIF_NEED_RESCHED)
> + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
> schedule();
>
> if (ti_work & _TIF_UPROBE)
> diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
> index 2e0f75bcb7fd1..8485f63863afc 100644
> --- a/kernel/entry/kvm.c
> +++ b/kernel/entry/kvm.c
> @@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
> return -EINTR;
> }
>
> - if (ti_work & _TIF_NEED_RESCHED)
> + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
> schedule();
>
> if (ti_work & _TIF_NOTIFY_RESUME)
> @@ -24,7 +24,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
> return ret;
>
> ti_work = read_thread_flags();
> - } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
> + } while (ti_work & XFER_TO_GUEST_MODE_WORK);
> return 0;
> }
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 965e6464e68e9..c32de809283cf 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -904,10 +904,9 @@ static inline void hrtick_rq_init(struct rq *rq)
> * this avoids any races wrt polling state changes and thereby avoids
> * spurious IPIs.
> */
> -static inline bool set_nr_and_not_polling(struct task_struct *p)
> +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
> {
> - struct thread_info *ti = task_thread_info(p);
> - return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
> + return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
> }
>
> /*
> @@ -932,9 +931,9 @@ static bool set_nr_if_polling(struct task_struct *p)
> }
>
> #else
> -static inline bool set_nr_and_not_polling(struct task_struct *p)
> +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
> {
> - set_tsk_need_resched(p);
> + atomic_long_or(1 << tif, (atomic_long_t *)&ti->flags);
> return true;
> }
>
> @@ -1039,28 +1038,66 @@ void wake_up_q(struct wake_q_head *head)
> * might also involve a cross-CPU call to trigger the scheduler on
> * the target CPU.
> */
> -void resched_curr(struct rq *rq)
> +static void __resched_curr(struct rq *rq, int tif)
> {
> struct task_struct *curr = rq->curr;
> + struct thread_info *cti = task_thread_info(curr);
> int cpu;
>
> lockdep_assert_rq_held(rq);
>
> - if (test_tsk_need_resched(curr))
> + if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
> + tif = TIF_NEED_RESCHED;
> +
> + if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
> return;
>
> cpu = cpu_of(rq);
>
> if (cpu == smp_processor_id()) {
> - set_tsk_need_resched(curr);
> - set_preempt_need_resched();
> + set_ti_thread_flag(cti, tif);
> + if (tif == TIF_NEED_RESCHED)
> + set_preempt_need_resched();
> return;
> }
>
> - if (set_nr_and_not_polling(curr))
> - smp_send_reschedule(cpu);
> - else
> + if (set_nr_and_not_polling(cti, tif)) {
> + if (tif == TIF_NEED_RESCHED)
> + smp_send_reschedule(cpu);
> + } else {
> trace_sched_wake_idle_without_ipi(cpu);
> + }
> +}
> +
> +void resched_curr(struct rq *rq)
> +{
> + __resched_curr(rq, TIF_NEED_RESCHED);
> +}
> +
> +#ifdef CONFIG_PREEMPT_DYNAMIC
> +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> + return static_branch_unlikely(&sk_dynamic_preempt_lazy);
> +}
> +#else
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> + return IS_ENABLED(CONFIG_PREEMPT_LAZY);
> +}
> +#endif
> +
> +static __always_inline int tif_need_resched_lazy(void)
> +{
> + if (dynamic_preempt_lazy())
> + return TIF_NEED_RESCHED_LAZY;
> +
> + return TIF_NEED_RESCHED;
> +}
> +
> +void resched_curr_lazy(struct rq *rq)
> +{
> + __resched_curr(rq, tif_need_resched_lazy());
> }
>
> void resched_cpu(int cpu)
> @@ -1155,7 +1192,7 @@ static void wake_up_idle_cpu(int cpu)
> * and testing of the above solutions didn't appear to report
> * much benefits.
> */
> - if (set_nr_and_not_polling(rq->idle))
> + if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
> smp_send_reschedule(cpu);
> else
> trace_sched_wake_idle_without_ipi(cpu);
> @@ -5537,6 +5574,10 @@ void sched_tick(void)
> update_rq_clock(rq);
> hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
> update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> +
> + if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
> + resched_curr(rq);
> +
> curr->sched_class->task_tick(rq, curr, 0);
> if (sched_feat(LATENCY_WARN))
> resched_latency = cpu_resched_latency(rq);
> @@ -7245,6 +7286,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
> * preempt_schedule <- NOP
> * preempt_schedule_notrace <- NOP
> * irqentry_exit_cond_resched <- NOP
> + * dynamic_preempt_lazy <- false
> *
> * VOLUNTARY:
> * cond_resched <- __cond_resched
> @@ -7252,6 +7294,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
> * preempt_schedule <- NOP
> * preempt_schedule_notrace <- NOP
> * irqentry_exit_cond_resched <- NOP
> + * dynamic_preempt_lazy <- false
> *
> * FULL:
> * cond_resched <- RET0
> @@ -7259,6 +7302,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
> * preempt_schedule <- preempt_schedule
> * preempt_schedule_notrace <- preempt_schedule_notrace
> * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
> + * dynamic_preempt_lazy <- false
> + *
> + * LAZY:
> + * cond_resched <- RET0
> + * might_resched <- RET0
> + * preempt_schedule <- preempt_schedule
> + * preempt_schedule_notrace <- preempt_schedule_notrace
> + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
> + * dynamic_preempt_lazy <- true
> */
>
> enum {
> @@ -7266,6 +7318,7 @@ enum {
> preempt_dynamic_none,
> preempt_dynamic_voluntary,
> preempt_dynamic_full,
> + preempt_dynamic_lazy,
> };
>
> int preempt_dynamic_mode = preempt_dynamic_undefined;
> @@ -7281,15 +7334,23 @@ int sched_dynamic_mode(const char *str)
> if (!strcmp(str, "full"))
> return preempt_dynamic_full;
>
> +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
> + if (!strcmp(str, "lazy"))
> + return preempt_dynamic_lazy;
> +#endif
> +
> return -EINVAL;
> }
>
> +#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
> +#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
> +
> #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
> #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
> #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
> #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
> -#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
> -#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
> +#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
> +#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
> #else
> #error "Unsupported PREEMPT_DYNAMIC mechanism"
> #endif
> @@ -7309,6 +7370,7 @@ static void __sched_dynamic_update(int mode)
> preempt_dynamic_enable(preempt_schedule);
> preempt_dynamic_enable(preempt_schedule_notrace);
> preempt_dynamic_enable(irqentry_exit_cond_resched);
> + preempt_dynamic_key_disable(preempt_lazy);
>
> switch (mode) {
> case preempt_dynamic_none:
> @@ -7318,6 +7380,7 @@ static void __sched_dynamic_update(int mode)
> preempt_dynamic_disable(preempt_schedule);
> preempt_dynamic_disable(preempt_schedule_notrace);
> preempt_dynamic_disable(irqentry_exit_cond_resched);
> + preempt_dynamic_key_disable(preempt_lazy);
> if (mode != preempt_dynamic_mode)
> pr_info("Dynamic Preempt: none\n");
> break;
> @@ -7329,6 +7392,7 @@ static void __sched_dynamic_update(int mode)
> preempt_dynamic_disable(preempt_schedule);
> preempt_dynamic_disable(preempt_schedule_notrace);
> preempt_dynamic_disable(irqentry_exit_cond_resched);
> + preempt_dynamic_key_disable(preempt_lazy);
> if (mode != preempt_dynamic_mode)
> pr_info("Dynamic Preempt: voluntary\n");
> break;
> @@ -7340,9 +7404,22 @@ static void __sched_dynamic_update(int mode)
> preempt_dynamic_enable(preempt_schedule);
> preempt_dynamic_enable(preempt_schedule_notrace);
> preempt_dynamic_enable(irqentry_exit_cond_resched);
> + preempt_dynamic_key_disable(preempt_lazy);
> if (mode != preempt_dynamic_mode)
> pr_info("Dynamic Preempt: full\n");
> break;
> +
> + case preempt_dynamic_lazy:
> + if (!klp_override)
> + preempt_dynamic_disable(cond_resched);
> + preempt_dynamic_disable(might_resched);
> + preempt_dynamic_enable(preempt_schedule);
> + preempt_dynamic_enable(preempt_schedule_notrace);
> + preempt_dynamic_enable(irqentry_exit_cond_resched);
> + preempt_dynamic_key_enable(preempt_lazy);
> + if (mode != preempt_dynamic_mode)
> + pr_info("Dynamic Preempt: lazy\n");
> + break;
> }
>
> preempt_dynamic_mode = mode;
> @@ -7405,6 +7482,8 @@ static void __init preempt_dynamic_init(void)
> sched_dynamic_update(preempt_dynamic_none);
> } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
> sched_dynamic_update(preempt_dynamic_voluntary);
> + } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
> + sched_dynamic_update(preempt_dynamic_lazy);
> } else {
> /* Default static call setting, nothing to do */
> WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
> @@ -7425,6 +7504,7 @@ static void __init preempt_dynamic_init(void)
> PREEMPT_MODEL_ACCESSOR(none);
> PREEMPT_MODEL_ACCESSOR(voluntary);
> PREEMPT_MODEL_ACCESSOR(full);
> +PREEMPT_MODEL_ACCESSOR(lazy);
>
> #else /* !CONFIG_PREEMPT_DYNAMIC: */
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 1bc24410ae501..87309cf247c68 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -245,7 +245,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
> static int sched_dynamic_show(struct seq_file *m, void *v)
> {
> static const char * preempt_modes[] = {
> - "none", "voluntary", "full"
> + "none", "voluntary", "full", "lazy",
> };
> int i;
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5b5d50dbc79dc..71b4112cadde0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1007,7 +1007,7 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
> * The task has consumed its request, reschedule.
> */
> if (cfs_rq->nr_running > 1) {
> - resched_curr(rq_of(cfs_rq));
> + resched_curr_lazy(rq_of(cfs_rq));
> clear_buddies(cfs_rq, se);
> }
> }
> @@ -8615,7 +8615,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> return;
>
> preempt:
> - resched_curr(rq);
> + resched_curr_lazy(rq);
> }
>
> static struct task_struct *pick_task_fair(struct rq *rq)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 041d8e00a1568..48a4617a5b28b 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2494,6 +2494,7 @@ extern void init_sched_fair_class(void);
> extern void reweight_task(struct task_struct *p, int prio);
>
> extern void resched_curr(struct rq *rq);
> +extern void resched_curr_lazy(struct rq *rq);
> extern void resched_cpu(int cpu);
>
> extern struct rt_bandwidth def_rt_bandwidth;
--
ankur
On Thu, Jun 06, 2024 at 08:11:41AM -0700, Ankur Arora wrote:
>
> Peter Zijlstra <peterz@infradead.org> writes:
>
> > On Thu, May 30, 2024 at 02:29:45AM -0700, Ankur Arora wrote:
> >>
> >> Peter Zijlstra <peterz@infradead.org> writes:
> >>
> >> > On Mon, May 27, 2024 at 05:34:59PM -0700, Ankur Arora wrote:
> >> >> Reuse sched_dynamic_update() and related logic to enable choosing
> >> >> the preemption model at boot or runtime for PREEMPT_AUTO.
> >> >>
> >> >> The interface is identical to PREEMPT_DYNAMIC.
> >> >
> >> > Colour me confused, why?!? What are you doing and why aren't you just
> >> > adding AUTO to the existing DYNAMIC thing?
> >>
> >> You mean have a single __sched_dynamic_update()? AUTO doesn't use any
> >> of the static_call/static_key stuff so I'm not sure how that would work.
> >
> > *sigh*... see the below, seems to work.
>
> Sorry, didn't mean for you to have to do all that work to prove the
> point.
Well, for a large part it was needed for me to figure out what your
patches were actually doing anyway. Peel away all the layers and this is
what remains.
> I phrased it badly. I do understand how lazy can be folded in as
> you do here:
>
> > + case preempt_dynamic_lazy:
> > + if (!klp_override)
> > + preempt_dynamic_disable(cond_resched);
> > + preempt_dynamic_disable(might_resched);
> > + preempt_dynamic_enable(preempt_schedule);
> > + preempt_dynamic_enable(preempt_schedule_notrace);
> > + preempt_dynamic_enable(irqentry_exit_cond_resched);
> > + preempt_dynamic_key_enable(preempt_lazy);
> > + if (mode != preempt_dynamic_mode)
> > + pr_info("Dynamic Preempt: lazy\n");
> > + break;
> > }
>
> But, if the long term goal (at least as I understand it) is to get rid
> of cond_resched() -- to allow optimizations that needing to call cond_resched()
> makes impossible -- does it make sense to pull all of these together?
It certainly doesn't make sense to add yet another configurable thing. We
have one, so yes add it here.
> Say, eventually preempt_dynamic_lazy and preempt_dynamic_full are the
> only two models left. Then we will have (modulo figuring out how to
> switch over klp from cond_resched() to a different unwinding technique):
>
> static void __sched_dynamic_update(int mode)
> {
> preempt_dynamic_enable(preempt_schedule);
> preempt_dynamic_enable(preempt_schedule_notrace);
> preempt_dynamic_enable(irqentry_exit_cond_resched);
>
> switch (mode) {
> case preempt_dynamic_full:
> preempt_dynamic_key_disable(preempt_lazy);
> if (mode != preempt_dynamic_mode)
> pr_info("%s: full\n", PREEMPT_MODE);
> break;
>
> case preempt_dynamic_lazy:
> preempt_dynamic_key_enable(preempt_lazy);
> if (mode != preempt_dynamic_mode)
> pr_info("Dynamic Preempt: lazy\n");
> break;
> }
>
> preempt_dynamic_mode = mode;
> }
>
> Which is pretty similar to what the PREEMPT_AUTO code was doing.
Right, but without duplicating all that stuff in the interim.
Peter Zijlstra <peterz@infradead.org> writes:
> On Thu, Jun 06, 2024 at 08:11:41AM -0700, Ankur Arora wrote:
>>
>> Peter Zijlstra <peterz@infradead.org> writes:
>>
>> > On Thu, May 30, 2024 at 02:29:45AM -0700, Ankur Arora wrote:
>> >>
>> >> Peter Zijlstra <peterz@infradead.org> writes:
>> >>
>> >> > On Mon, May 27, 2024 at 05:34:59PM -0700, Ankur Arora wrote:
>> >> >> Reuse sched_dynamic_update() and related logic to enable choosing
>> >> >> the preemption model at boot or runtime for PREEMPT_AUTO.
>> >> >>
>> >> >> The interface is identical to PREEMPT_DYNAMIC.
>> >> >
>> >> > Colour me confused, why?!? What are you doing and why aren't you just
>> >> > adding AUTO to the existing DYNAMIC thing?
>> >>
>> >> You mean have a single __sched_dynamic_update()? AUTO doesn't use any
>> >> of the static_call/static_key stuff so I'm not sure how that would work.
>> >
>> > *sigh*... see the below, seems to work.
>>
>> Sorry, didn't mean for you to have to do all that work to prove the
>> point.
>
> Well, for a large part it was needed for me to figure out what your
> patches were actually doing anyway. Peel away all the layers and this is
> what remains.
>
>> I phrased it badly. I do understand how lazy can be folded in as
>> you do here:
>>
>> > + case preempt_dynamic_lazy:
>> > + if (!klp_override)
>> > + preempt_dynamic_disable(cond_resched);
>> > + preempt_dynamic_disable(might_resched);
>> > + preempt_dynamic_enable(preempt_schedule);
>> > + preempt_dynamic_enable(preempt_schedule_notrace);
>> > + preempt_dynamic_enable(irqentry_exit_cond_resched);
>> > + preempt_dynamic_key_enable(preempt_lazy);
>> > + if (mode != preempt_dynamic_mode)
>> > + pr_info("Dynamic Preempt: lazy\n");
>> > + break;
>> > }
>>
>> But, if the long term goal (at least as I understand it) is to get rid
>> of cond_resched() -- to allow optimizations that needing to call cond_resched()
>> makes impossible -- does it make sense to pull all of these together?
>
> It certainly doesn't make sense to add yet another configurable thing. We
> have one, so yes add it here.
>
>> Say, eventually preempt_dynamic_lazy and preempt_dynamic_full are the
>> only two models left. Then we will have (modulo figuring out how to
>> switch over klp from cond_resched() to a different unwinding technique):
>>
>> static void __sched_dynamic_update(int mode)
>> {
>> preempt_dynamic_enable(preempt_schedule);
>> preempt_dynamic_enable(preempt_schedule_notrace);
>> preempt_dynamic_enable(irqentry_exit_cond_resched);
>>
>> switch (mode) {
>> case preempt_dynamic_full:
>> preempt_dynamic_key_disable(preempt_lazy);
>> if (mode != preempt_dynamic_mode)
>> pr_info("%s: full\n", PREEMPT_MODE);
>> break;
>>
>> case preempt_dynamic_lazy:
>> preempt_dynamic_key_enable(preempt_lazy);
>> if (mode != preempt_dynamic_mode)
>> pr_info("Dynamic Preempt: lazy\n");
>> break;
>> }
>>
>> preempt_dynamic_mode = mode;
>> }
>>
>> Which is pretty similar to what the PREEMPT_AUTO code was doing.
>
> Right, but without duplicating all that stuff in the interim.
Yeah, that makes sense. Joel had suggested something on these lines
earlier [1], to which I was resistant.
However, the duplication (and the fact that the voluntary model
was quite thin) should have told me that (AUTO, preempt=voluntary)
should just be folded under PREEMPT_DYNAMIC.
I'll rework the series to do that.
That should also simplify RCU related choices which I think Paul will
like. Given that the lazy model is meant to eventually replace
none/voluntary, the PREEMPT_RCU configuration can just be:
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -18,7 +18,7 @@ config TREE_RCU
config PREEMPT_RCU
bool
- default y if PREEMPTION
+ default y if PREEMPTION && !PREEMPT_LAZY
Or, maybe we should instead have this:
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -18,7 +18,7 @@ config TREE_RCU
config PREEMPT_RCU
bool
- default y if PREEMPTION
+ default y if PREEMPT || PREEMPT_RT
select TREE_RCU
Though this would be a change in behaviour for current PREEMPT_DYNAMIC
users.
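That is, for a PREEMPT_DYNAMIC kernel built with a none/voluntary
default model (PREEMPT=n, but PREEMPTION=y via PREEMPT_BUILD), the two
rules would evaluate differently -- a rough sketch:

	default y if PREEMPTION			=> PREEMPT_RCU=y  (current)
	default y if PREEMPT || PREEMPT_RT	=> PREEMPT_RCU=n  (this variant)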
[1] https://lore.kernel.org/lkml/fd48ea5c-bc74-4914-a621-d12c9741c014@joelfernandes.org/
Thanks
--
ankur
On Sat, Jun 08, 2024 at 05:46:26PM -0700, Ankur Arora wrote:
> Peter Zijlstra <peterz@infradead.org> writes:
> > On Thu, Jun 06, 2024 at 08:11:41AM -0700, Ankur Arora wrote:
> >> Peter Zijlstra <peterz@infradead.org> writes:
> >> > On Thu, May 30, 2024 at 02:29:45AM -0700, Ankur Arora wrote:
> >> >> Peter Zijlstra <peterz@infradead.org> writes:
> >> >> > On Mon, May 27, 2024 at 05:34:59PM -0700, Ankur Arora wrote:
> >> >> >> Reuse sched_dynamic_update() and related logic to enable choosing
> >> >> >> the preemption model at boot or runtime for PREEMPT_AUTO.
> >> >> >>
> >> >> >> The interface is identical to PREEMPT_DYNAMIC.
> >> >> >
> >> >> > Colour me confused, why?!? What are you doing and why aren't you just
> >> >> > adding AUTO to the existing DYNAMIC thing?
> >> >>
> >> >> You mean have a single __sched_dynamic_update()? AUTO doesn't use any
> >> >> of the static_call/static_key stuff so I'm not sure how that would work.
> >> >
> >> > *sigh*... see the below, seems to work.
> >>
> >> Sorry, didn't mean for you to have to do all that work to prove the
> >> point.
> >
> > Well, for a large part it was needed for me to figure out what your
> > patches were actually doing anyway. Peel away all the layers and this is
> > what remains.
> >
> >> I phrased it badly. I do understand how lazy can be folded in as
> >> you do here:
> >>
> >> > + case preempt_dynamic_lazy:
> >> > + if (!klp_override)
> >> > + preempt_dynamic_disable(cond_resched);
> >> > + preempt_dynamic_disable(might_resched);
> >> > + preempt_dynamic_enable(preempt_schedule);
> >> > + preempt_dynamic_enable(preempt_schedule_notrace);
> >> > + preempt_dynamic_enable(irqentry_exit_cond_resched);
> >> > + preempt_dynamic_key_enable(preempt_lazy);
> >> > + if (mode != preempt_dynamic_mode)
> >> > + pr_info("Dynamic Preempt: lazy\n");
> >> > + break;
> >> > }
> >>
> >> But, if the long term goal (at least as I understand it) is to get rid
> >> of cond_resched() -- to allow optimizations that needing to call cond_resched()
> >> makes impossible -- does it make sense to pull all of these together?
> >
> > It certainly doesn't make sense to add yet another configurable thing. We
> > have one, so yes add it here.
> >
> >> Say, eventually preempt_dynamic_lazy and preempt_dynamic_full are the
> >> only two models left. Then we will have (modulo figuring out how to
> >> switch over klp from cond_resched() to a different unwinding technique):
> >>
> >> static void __sched_dynamic_update(int mode)
> >> {
> >> preempt_dynamic_enable(preempt_schedule);
> >> preempt_dynamic_enable(preempt_schedule_notrace);
> >> preempt_dynamic_enable(irqentry_exit_cond_resched);
> >>
> >> switch (mode) {
> >> case preempt_dynamic_full:
> >> preempt_dynamic_key_disable(preempt_lazy);
> >> if (mode != preempt_dynamic_mode)
> >> pr_info("%s: full\n", PREEMPT_MODE);
> >> break;
> >>
> >> case preempt_dynamic_lazy:
> >> preempt_dynamic_key_enable(preempt_lazy);
> >> if (mode != preempt_dynamic_mode)
> >> pr_info("Dynamic Preempt: lazy\n");
> >> break;
> >> }
> >>
> >> preempt_dynamic_mode = mode;
> >> }
> >>
> >> Which is pretty similar to what the PREEMPT_AUTO code was doing.
> >
> > Right, but without duplicating all that stuff in the interim.
>
> Yeah, that makes sense. Joel had suggested something on these lines
> earlier [1], to which I was resistant.
>
> However, the duplication (and the fact that the voluntary model
> was quite thin) should have told me that (AUTO, preempt=voluntary)
> should just be folded under PREEMPT_DYNAMIC.
>
> I'll rework the series to do that.
>
> That should also simplify RCU related choices which I think Paul will
> like. Given that the lazy model is meant to eventually replace
> none/voluntary, the PREEMPT_RCU configuration can just be:
>
> --- a/kernel/rcu/Kconfig
> +++ b/kernel/rcu/Kconfig
> @@ -18,7 +18,7 @@ config TREE_RCU
>
> config PREEMPT_RCU
> bool
> - default y if PREEMPTION
> + default y if PREEMPTION && !PREEMPT_LAZY
Given that PREEMPT_DYNAMIC selects PREEMPT_BUILD which in turn selects
PREEMPTION, this should work.
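For reference, PREEMPT_BUILD as it stands in kernel/Kconfig.preempt
(the same lines are visible in the diff context above):

	config PREEMPT_BUILD
		bool
		select PREEMPTION
		select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK

so anything that selects PREEMPT_BUILD, PREEMPT_DYNAMIC included, ends
up with PREEMPTION=y, and "PREEMPTION && !PREEMPT_LAZY" evaluates as
intended.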
> Or, maybe we should instead have this:
>
> --- a/kernel/rcu/Kconfig
> +++ b/kernel/rcu/Kconfig
> @@ -18,7 +18,7 @@ config TREE_RCU
>
> config PREEMPT_RCU
> bool
> - default y if PREEMPTION
> + default y if PREEMPT || PREEMPT_RT
> select TREE_RCU
>
> Though this would be a change in behaviour for current PREEMPT_DYNAMIC
> users.
Which I believe to be a no-go. I believe that PREEMPT_DYNAMIC users
really need their preemptible kernels to include preemptible RCU.
If PREEMPT_LAZY causes PREEMPT_DYNAMIC non-preemptible kernels to become
lazily preemptible, that is a topic to discuss with PREEMPT_DYNAMIC users.
On the other hand, if PREEMPT_LAZY does not cause PREEMPT_DYNAMIC
kernels to become lazily preemptible, then I would expect there to be
hard questions about removing cond_resched() and might_sleep(), or,
for that matter, changing their semantics. Which I again must leave
to PREEMPT_DYNAMIC users.
Thanx, Paul
> [1] https://lore.kernel.org/lkml/fd48ea5c-bc74-4914-a621-d12c9741c014@joelfernandes.org/
>
> Thanks
> --
> ankur