[PATCH sched_ext/for-6.12-fixes] sched_ext: Call __balance_callbacks() from __scx_task_iter_rq_unlock()

Tejun Heo posted 1 patch 3 weeks, 4 days ago
kernel/sched/core.c  |    8 +-------
kernel/sched/ext.c   |    2 ++
kernel/sched/sched.h |    5 +++++
3 files changed, 8 insertions(+), 7 deletions(-)
[PATCH sched_ext/for-6.12-fixes] sched_ext: Call __balance_callbacks() from __scx_task_iter_rq_unlock()
Posted by Tejun Heo 3 weeks, 4 days ago
While tasks are being iterated to be switched in and out of SCX,
__scx_task_iter_rq_unlock() is called to unlock rq. As
sched_class->switched_from() and friends may schedule balance callbacks,
they should be executed before moving onto the next task.

This brekage is currently theoretical as only RT and DL schedule balance
callbacks on class switches and SCX only switches from/to fair.

Make __balance_callbacks() global and call it from
__scx_task_iter_rq_unlock().

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20241030154551.GN14555@noisy.programming.kicks-ass.net
---
Hello,

If this looks okay, I'll route it through sched_ext/for-6.12-fixes.

Thanks.

 kernel/sched/core.c  |    8 +-------
 kernel/sched/ext.c   |    2 ++
 kernel/sched/sched.h |    5 +++++
 3 files changed, 8 insertions(+), 7 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5028,7 +5028,7 @@ struct balance_callback *splice_balance_
 	return __splice_balance_callbacks(rq, true);
 }
 
-static void __balance_callbacks(struct rq *rq)
+void __balance_callbacks(struct rq *rq)
 {
 	do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
 }
@@ -5044,12 +5044,6 @@ void balance_callbacks(struct rq *rq, st
 	}
 }
 
-#else
-
-static inline void __balance_callbacks(struct rq *rq)
-{
-}
-
 #endif
 
 static inline void
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1315,6 +1315,8 @@ static void scx_task_iter_start(struct s
 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
 {
 	if (iter->locked) {
+		/* ->switched_from() may have scheduled balance callbacks */
+		__balance_callbacks(iter->rq);
 		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
 		iter->locked = NULL;
 	}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3908,6 +3908,7 @@ extern void check_class_changed(struct r
 #ifdef CONFIG_SMP
 extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
 extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
+extern void __balance_callbacks(struct rq *rq);
 #else
 
 static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
@@ -3919,6 +3920,10 @@ static inline void balance_callbacks(str
 {
 }
 
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
 #endif
 
 #ifdef CONFIG_SCHED_CLASS_EXT
Re: [PATCH sched_ext/for-6.12-fixes] sched_ext: Call __balance_callbacks() from __scx_task_iter_rq_unlock()
Posted by Peter Zijlstra 3 weeks, 2 days ago
On Wed, Oct 30, 2024 at 11:41:39AM -1000, Tejun Heo wrote:

> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1315,6 +1315,8 @@ static void scx_task_iter_start(struct s
>  static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
>  {
>  	if (iter->locked) {
> +		/* ->switched_from() may have scheduled balance callbacks */
> +		__balance_callbacks(iter->rq);
>  		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
>  		iter->locked = NULL;
>  	}

I think you need to unpin/repin around it. The balance callbacks like to
drop rq->lock at times.
Re: [PATCH sched_ext/for-6.12-fixes] sched_ext: Call __balance_callbacks() from __scx_task_iter_rq_unlock()
Posted by Peter Zijlstra 3 weeks, 2 days ago
On Fri, Nov 01, 2024 at 12:36:34AM +0100, Peter Zijlstra wrote:
> On Wed, Oct 30, 2024 at 11:41:39AM -1000, Tejun Heo wrote:
> 
> > --- a/kernel/sched/ext.c
> > +++ b/kernel/sched/ext.c
> > @@ -1315,6 +1315,8 @@ static void scx_task_iter_start(struct s
> >  static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
> >  {
> >  	if (iter->locked) {
> > +		/* ->switched_from() may have scheduled balance callbacks */
> > +		__balance_callbacks(iter->rq);
> >  		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
> >  		iter->locked = NULL;
> >  	}
> 
> I think you need to unpin/repin around it. The balance callbacks like to
> drop rq->lock at times.

Maybe something like so.. I'm not sure it's an improvement.

---
 kernel/sched/core.c  | 12 +++++-------
 kernel/sched/ext.c   |  1 +
 kernel/sched/sched.h |  9 +++++++++
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5de31c312189..c826745eedef 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5039,7 +5039,7 @@ struct balance_callback *splice_balance_callbacks(struct rq *rq)
 	return __splice_balance_callbacks(rq, true);
 }
 
-static void __balance_callbacks(struct rq *rq)
+void __balance_callbacks(struct rq *rq)
 {
 	do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
 }
@@ -6721,9 +6721,8 @@ static void __sched notrace __schedule(int sched_mode)
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
-		rq_unpin_lock(rq, &rf);
-		__balance_callbacks(rq);
-		raw_spin_rq_unlock_irq(rq);
+		rf.balance = true;
+		rq_unlock_irq(rq, &rf);
 	}
 }
 
@@ -7217,9 +7216,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 	/* Avoid rq from going away on us: */
 	preempt_disable();
 
-	rq_unpin_lock(rq, &rf);
-	__balance_callbacks(rq);
-	raw_spin_rq_unlock(rq);
+	rf.balance = true;
+	rq_unlock(rq, &rf);
 
 	preempt_enable();
 }
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7bc4fb7f3926..8e5b31983f37 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1314,6 +1314,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
 {
 	if (iter->locked) {
+		iter->rf.balance = true;
 		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
 		iter->locked = NULL;
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0aa6da2633aa..5a39ad0fa574 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1704,6 +1704,7 @@ static inline void rq_clock_stop_loop_update(struct rq *rq)
 struct rq_flags {
 	unsigned long flags;
 	struct pin_cookie cookie;
+	bool balance;
 #ifdef CONFIG_SCHED_DEBUG
 	/*
 	 * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
@@ -1739,6 +1740,8 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
 #endif
 }
 
+extern void __balance_callbacks(struct rq *rq);
+
 static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -1747,6 +1750,12 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
 #endif
 
 	lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
+#ifdef CONFIG_SMP
+	if (rf->balance) {
+		rf->balance = false;
+		__balance_callbacks(rq);
+	}
+#endif
 }
 
 static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
Re: [PATCH sched_ext/for-6.12-fixes] sched_ext: Call __balance_callbacks() from __scx_task_iter_rq_unlock()
Posted by Tejun Heo 2 weeks, 2 days ago
Hello, Peter.

Sorry about the delay.

On Fri, Nov 01, 2024 at 03:12:18PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 01, 2024 at 12:36:34AM +0100, Peter Zijlstra wrote:
> > On Wed, Oct 30, 2024 at 11:41:39AM -1000, Tejun Heo wrote:
> > 
> > > --- a/kernel/sched/ext.c
> > > +++ b/kernel/sched/ext.c
> > > @@ -1315,6 +1315,8 @@ static void scx_task_iter_start(struct s
> > >  static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
> > >  {
> > >  	if (iter->locked) {
> > > +		/* ->switched_from() may have scheduled balance callbacks */
> > > +		__balance_callbacks(iter->rq);
> > >  		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
> > >  		iter->locked = NULL;
> > >  	}
> > 
> > I think you need to unpin/repin around it. The balance callbacks like to
> > drop rq->lock at times.
> 
> Maybe something like so.. I'm not sure it's an improvement.

I actually easily reproduce the problem by making tasks switch between DL
and SCX. e.g. If I run `stress-ng --schedmix 32` while running any SCX
sched:

  rq->balance_callback && rq->balance_callback != &balance_push_callback
  WARNING: CPU: 5 PID: 2784 at kernel/sched/sched.h:1729 do_sched_yield+0x10a/0x130
  ...
  Sched_ext: simple (enabling+all)
  RIP: 0010:do_sched_yield+0x10a/0x130
  Code: 84 66 e8 7e e8 07 a2 f0 00 48 83 c4 08 5b 41 5e 5d e9 0a 4f f1 00 cc c6 05 09 fb3
  RSP: 0018:ffffc900030c3ef0 EFLAGS: 00010082
  RAX: 0000000000000046 RBX: ffff8887fab70380 RCX: 0000000000000027
  RDX: 0000000000000000 RSI: ffffffff811def79 RDI: ffff8887fab5b448
  cb=0xffff8887fad5b040
  RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
  R10: ffffffff811dee91 R11: 0000000000000000 R12: 0000000000000000
  R13: 0000000000000000 R14: ffff8887fab40000 R15: 0000000000000018
  FS:  00007fc8772d8000(0000) GS:ffff8887fab40000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00005607f90c3078 CR3: 000000012aa1c000 CR4: 0000000000350eb0
  Call Trace:
   <TASK>
   __x64_sys_sched_yield+0xa/0x20
   do_syscall_64+0x7b/0x140
   entry_SYSCALL_64_after_hwframe+0x76/0x7e
  RIP: 0033:0x7fc87a30ea8b
  Code: 73 01 c3 48 8b 0d 85 72 0e 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 008
  RSP: 002b:00007ffde081a928 EFLAGS: 00000202 ORIG_RAX: 0000000000000018
  RAX: ffffffffffffffda RBX: 00007fc86ee1ab18 RCX: 00007fc87a30ea8b
  RDX: 0000000000000001 RSI: 000000000000001f RDI: 000000000000001b
  RBP: 00007ffde081ab90 R08: 000000000000001b R09: 00000000000003e8
  R10: 00007ffde081a8f0 R11: 0000000000000202 R12: 0000000000005b81
  R13: 00000000000061e8 R14: 0000000000000000 R15: 0000000000000003
   </TASK>

and your patch makes the issue go away. Please feel free to add:

 Tested-by: Tejun Heo <tj@kernel.org>

If you want me to turn it into a proper patch and apply it, plesae let me
know.

Thanks.

-- 
tejun