While tasks are being iterated over to be switched in and out of SCX,
__scx_task_iter_rq_unlock() is called to unlock the rq. As
sched_class->switched_from() and friends may schedule balance callbacks,
those callbacks should be executed before moving on to the next task.

This breakage is currently theoretical, as only RT and DL schedule balance
callbacks on class switches and SCX only switches from/to fair.
Make __balance_callbacks() global and call it from
__scx_task_iter_rq_unlock().
Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20241030154551.GN14555@noisy.programming.kicks-ass.net
---
Hello,
If this looks okay, I'll route it through sched_ext/for-6.12-fixes.
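For readers following along, balance callbacks are queued on the rq while
rq->lock is held and are flushed later, once it is safe to drop (or break)
the lock. A simplified sketch of the queueing side, close to but not
verbatim the upstream kernel/sched/sched.h:

struct balance_callback {
	struct balance_callback *next;
	void (*func)(struct rq *rq);
};

static inline void
queue_balance_callback(struct rq *rq, struct balance_callback *head,
		       void (*func)(struct rq *rq))
{
	lockdep_assert_rq_held(rq);

	/*
	 * Don't queue an already-queued head and don't stomp on the
	 * dedicated balance_push_callback.
	 */
	if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
		return;

	head->func = func;
	head->next = rq->balance_callback;
	rq->balance_callback = head;
}

RT and DL queue their push callbacks through this in the class-switch
paths, which is why switching a task between classes can leave callbacks
pending on the rq.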
Thanks.
kernel/sched/core.c | 8 +-------
kernel/sched/ext.c | 2 ++
kernel/sched/sched.h | 5 +++++
3 files changed, 8 insertions(+), 7 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5028,7 +5028,7 @@ struct balance_callback *splice_balance_
 	return __splice_balance_callbacks(rq, true);
 }
 
-static void __balance_callbacks(struct rq *rq)
+void __balance_callbacks(struct rq *rq)
 {
 	do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
 }
@@ -5044,12 +5044,6 @@ void balance_callbacks(struct rq *rq, st
 	}
 }
 
-#else
-
-static inline void __balance_callbacks(struct rq *rq)
-{
-}
-
 #endif
 
 static inline void
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1315,6 +1315,8 @@ static void scx_task_iter_start(struct s
 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
 {
 	if (iter->locked) {
+		/* ->switched_from() may have scheduled balance callbacks */
+		__balance_callbacks(iter->rq);
 		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
 		iter->locked = NULL;
 	}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3908,6 +3908,7 @@ extern void check_class_changed(struct r
 #ifdef CONFIG_SMP
 extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
 extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
+extern void __balance_callbacks(struct rq *rq);
 #else
 static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
@@ -3919,6 +3920,10 @@ static inline void balance_callbacks(str
 {
 }
 
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
 #endif
 
 #ifdef CONFIG_SCHED_CLASS_EXT
On Wed, Oct 30, 2024 at 11:41:39AM -1000, Tejun Heo wrote:

> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1315,6 +1315,8 @@ static void scx_task_iter_start(struct s
>  static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
>  {
>  	if (iter->locked) {
> +		/* ->switched_from() may have scheduled balance callbacks */
> +		__balance_callbacks(iter->rq);
>  		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
>  		iter->locked = NULL;
>  	}

I think you need to unpin/repin around it. The balance callbacks like to
drop rq->lock at times.
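Concretely, that would make the unlock path look something like the
following (sketch only, assuming the iterator's rq stays pinned via the
usual rq_flags API):

static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
	if (iter->locked) {
		/*
		 * Balance callbacks may drop and retake rq->lock, so unpin
		 * before running them and repin afterwards.
		 */
		rq_unpin_lock(iter->rq, &iter->rf);
		__balance_callbacks(iter->rq);
		rq_repin_lock(iter->rq, &iter->rf);

		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
		iter->locked = NULL;
	}
}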
On Fri, Nov 01, 2024 at 12:36:34AM +0100, Peter Zijlstra wrote:
> On Wed, Oct 30, 2024 at 11:41:39AM -1000, Tejun Heo wrote:
> 
> > --- a/kernel/sched/ext.c
> > +++ b/kernel/sched/ext.c
> > @@ -1315,6 +1315,8 @@ static void scx_task_iter_start(struct s
> >  static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
> >  {
> >  	if (iter->locked) {
> > +		/* ->switched_from() may have scheduled balance callbacks */
> > +		__balance_callbacks(iter->rq);
> >  		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
> >  		iter->locked = NULL;
> >  	}
> 
> I think you need to unpin/repin around it. The balance callbacks like to
> drop rq->lock at times.

Maybe something like so.. I'm not sure it's an improvement.

---
 kernel/sched/core.c  | 12 +++++-------
 kernel/sched/ext.c   |  1 +
 kernel/sched/sched.h |  9 +++++++++
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5de31c312189..c826745eedef 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5039,7 +5039,7 @@ struct balance_callback *splice_balance_callbacks(struct rq *rq)
 	return __splice_balance_callbacks(rq, true);
 }
 
-static void __balance_callbacks(struct rq *rq)
+void __balance_callbacks(struct rq *rq)
 {
 	do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
 }
@@ -6721,9 +6721,8 @@ static void __sched notrace __schedule(int sched_mode)
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
-		rq_unpin_lock(rq, &rf);
-		__balance_callbacks(rq);
-		raw_spin_rq_unlock_irq(rq);
+		rf.balance = true;
+		rq_unlock_irq(rq, &rf);
 	}
 }
 
@@ -7217,9 +7216,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 	/* Avoid rq from going away on us: */
 	preempt_disable();
 
-	rq_unpin_lock(rq, &rf);
-	__balance_callbacks(rq);
-	raw_spin_rq_unlock(rq);
+	rf.balance = true;
+	rq_unlock(rq, &rf);
 
 	preempt_enable();
 }

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7bc4fb7f3926..8e5b31983f37 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1314,6 +1314,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
 {
 	if (iter->locked) {
+		iter->rf.balance = true;
 		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
 		iter->locked = NULL;
 	}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0aa6da2633aa..5a39ad0fa574 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1704,6 +1704,7 @@ static inline void rq_clock_stop_loop_update(struct rq *rq)
 struct rq_flags {
 	unsigned long flags;
 	struct pin_cookie cookie;
+	bool balance;
 #ifdef CONFIG_SCHED_DEBUG
 	/*
 	 * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
@@ -1739,6 +1740,8 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
 #endif
 }
 
+extern void __balance_callbacks(struct rq *rq);
+
 static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -1747,6 +1750,12 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
 #endif
 
 	lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
+#ifdef CONFIG_SMP
+	if (rf->balance) {
+		rf->balance = false;
+		__balance_callbacks(rq);
+	}
+#endif
 }
 
 static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
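With the rf.balance flag, callers would no longer open-code the
unpin/flush/unlock dance; the pattern would become something like this
(illustrative only, not merged code):

	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(p, &rf);
	/* ... a class switch here may queue balance callbacks on rq ... */
	rf.balance = true;		/* ask rq_unpin_lock() to flush them */
	task_rq_unlock(rq, p, &rf);	/* unpin runs __balance_callbacks() first */

i.e. the unpin path becomes the single choke point where pending callbacks
are flushed.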
Hello, Peter. Sorry about the delay.

On Fri, Nov 01, 2024 at 03:12:18PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 01, 2024 at 12:36:34AM +0100, Peter Zijlstra wrote:
> > On Wed, Oct 30, 2024 at 11:41:39AM -1000, Tejun Heo wrote:
> > 
> > > --- a/kernel/sched/ext.c
> > > +++ b/kernel/sched/ext.c
> > > @@ -1315,6 +1315,8 @@ static void scx_task_iter_start(struct s
> > >  static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
> > >  {
> > >  	if (iter->locked) {
> > > +		/* ->switched_from() may have scheduled balance callbacks */
> > > +		__balance_callbacks(iter->rq);
> > >  		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
> > >  		iter->locked = NULL;
> > >  	}
> > 
> > I think you need to unpin/repin around it. The balance callbacks like to
> > drop rq->lock at times.
> 
> Maybe something like so.. I'm not sure it's an improvement.

I can actually reproduce the problem easily by making tasks switch between
DL and SCX, e.g. if I run `stress-ng --schedmix 32` while running any SCX
sched:

rq->balance_callback && rq->balance_callback != &balance_push_callback
WARNING: CPU: 5 PID: 2784 at kernel/sched/sched.h:1729 do_sched_yield+0x10a/0x130
...
Sched_ext: simple (enabling+all)
RIP: 0010:do_sched_yield+0x10a/0x130
Code: 84 66 e8 7e e8 07 a2 f0 00 48 83 c4 08 5b 41 5e 5d e9 0a 4f f1 00 cc c6 05 09 fb3
RSP: 0018:ffffc900030c3ef0 EFLAGS: 00010082
RAX: 0000000000000046 RBX: ffff8887fab70380 RCX: 0000000000000027
RDX: 0000000000000000 RSI: ffffffff811def79 RDI: ffff8887fab5b448
cb=0xffff8887fad5b040
RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
R10: ffffffff811dee91 R11: 0000000000000000 R12: 0000000000000000
R13: 0000000000000000 R14: ffff8887fab40000 R15: 0000000000000018
FS:  00007fc8772d8000(0000) GS:ffff8887fab40000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005607f90c3078 CR3: 000000012aa1c000 CR4: 0000000000350eb0
Call Trace:
 <TASK>
 __x64_sys_sched_yield+0xa/0x20
 do_syscall_64+0x7b/0x140
 entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7fc87a30ea8b
Code: 73 01 c3 48 8b 0d 85 72 0e 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 008
RSP: 002b:00007ffde081a928 EFLAGS: 00000202 ORIG_RAX: 0000000000000018
RAX: ffffffffffffffda RBX: 00007fc86ee1ab18 RCX: 00007fc87a30ea8b
RDX: 0000000000000001 RSI: 000000000000001f RDI: 000000000000001b
RBP: 00007ffde081ab90 R08: 000000000000001b R09: 00000000000003e8
R10: 00007ffde081a8f0 R11: 0000000000000202 R12: 0000000000005b81
R13: 00000000000061e8 R14: 0000000000000000 R15: 0000000000000003
 </TASK>

and your patch makes the issue go away. Please feel free to add:

 Tested-by: Tejun Heo <tj@kernel.org>

If you want me to turn it into a proper patch and apply it, please let me
know.

Thanks.

-- 
tejun
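For reference, the warning above fires from the pending-callback check in
rq_pin_lock(), which objects to pinning an rq that still has balance
callbacks queued. Simplified from the kernel/sched/sched.h of that time:

static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
	rf->cookie = lockdep_pin_lock(__rq_lockp(rq));

#ifdef CONFIG_SCHED_DEBUG
	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
	rf->clock_update_flags = 0;
#ifdef CONFIG_SMP
	/* pinning with callbacks still queued means someone forgot to flush */
	SCHED_WARN_ON(rq->balance_callback &&
		      rq->balance_callback != &balance_push_callback);
#endif
#endif
}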