[PATCH sched_ext/for-6.12] sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()

Tejun Heo posted 1 patch 1 year, 5 months ago
There is a newer version of this series
kernel/sched/core.c  |   13 ++++++++-----
kernel/sched/ext.c   |   34 +++++++++++++++++++++++++++++-----
kernel/sched/sched.h |    3 ++-
3 files changed, 39 insertions(+), 11 deletions(-)
[PATCH sched_ext/for-6.12] sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
Posted by Tejun Heo 1 year, 5 months ago
sched_ext dispatches tasks from the BPF scheduler from balance_scx() and
thus every pick_task_scx() call must be preceded by balance_scx(). While
this usually holds, there are rare cases where a higher sched class's
balance() returns true indicating that it has tasks to run on the CPU and
thus terminating balance() calls but fails to actually find the next task to
run when pick_task() is called. In such cases, pick_task_scx() can be called
without preceding balance_scx().

Detect this condition using the SCX_RQ_BAL_PENDING flag. If detected, keep
running the previous task if possible and avoid stalling from entering idle
without balancing.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/core.c  |   13 ++++++++-----
 kernel/sched/ext.c   |   34 +++++++++++++++++++++++++++++-----
 kernel/sched/sched.h |    3 ++-
 3 files changed, 39 insertions(+), 11 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5901,12 +5901,15 @@ static void prev_balance(struct rq *rq,
 
 #ifdef CONFIG_SCHED_CLASS_EXT
 	/*
-	 * SCX requires a balance() call before every pick_next_task() including
-	 * when waking up from SCHED_IDLE. If @start_class is below SCX, start
-	 * from SCX instead.
+	 * SCX requires a balance() call before every pick_task() including when
+	 * waking up from SCHED_IDLE. If @start_class is below SCX, start from
+	 * SCX instead. Also, set a flag to detect missing balance() call.
 	 */
-	if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
-		start_class = &ext_sched_class;
+	if (scx_enabled()) {
+		rq->scx.flags |= SCX_RQ_BAL_PENDING;
+		if (sched_class_above(&ext_sched_class, start_class))
+			start_class = &ext_sched_class;
+	}
 #endif
 
 	/*
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2591,7 +2591,7 @@ static int balance_one(struct rq *rq, st
 
 	lockdep_assert_rq_held(rq);
 	rq->scx.flags |= SCX_RQ_IN_BALANCE;
-	rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
+	rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
 
 	if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
 	    unlikely(rq->scx.cpu_released)) {
@@ -2904,25 +2904,49 @@ static struct task_struct *pick_task_scx
 {
 	struct task_struct *prev = rq->curr;
 	struct task_struct *p;
+	bool prev_on_scx = prev->sched_class == &ext_sched_class;
+	bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+	bool kick_idle = false;
+
+	if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
+		/*
+		 * pick_task_scx() can be called without preceding balance_scx()
+		 * call if a higher class's balance() returned %true but its
+		 * pick_task() returned %NULL. Keep running @prev if possible
+		 * and avoid stalling from entering idle without balancing.
+		 */
+		if (prev_on_scx) {
+			keep_prev = true;
+		} else {
+			keep_prev = false;
+			kick_idle = true;
+		}
+	} else if (unlikely(keep_prev && !prev_on_scx)) {
+		/* only allowed during transitions */
+		WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+		keep_prev = false;
+	}
 
 	/*
 	 * If balance_scx() is telling us to keep running @prev, replenish slice
 	 * if necessary and keep running @prev. Otherwise, pop the first one
 	 * from the local DSQ.
 	 */
-	if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
-	    !WARN_ON_ONCE(prev->sched_class != &ext_sched_class)) {
+	if (keep_prev) {
 		p = prev;
 		if (!p->scx.slice)
 			p->scx.slice = SCX_SLICE_DFL;
 	} else {
 		p = first_local_task(rq);
-		if (!p)
+		if (!p) {
+			if (kick_idle)
+				scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
 			return NULL;
+		}
 
 		if (unlikely(!p->scx.slice)) {
 			if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
-				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
+				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_task_scx()\n",
 						p->comm, p->pid);
 				scx_warned_zero_slice = true;
 			}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -749,7 +749,8 @@ enum scx_rq_flags {
 	 */
 	SCX_RQ_ONLINE		= 1 << 0,
 	SCX_RQ_CAN_STOP_TICK	= 1 << 1,
-	SCX_RQ_BAL_KEEP		= 1 << 2, /* balance decided to keep current */
+	SCX_RQ_BAL_PENDING	= 1 << 2, /* balance hasn't run yet */
+	SCX_RQ_BAL_KEEP		= 1 << 3, /* balance decided to keep current */
 
 	SCX_RQ_IN_WAKEUP	= 1 << 16,
 	SCX_RQ_IN_BALANCE	= 1 << 17,
Re: [PATCH sched_ext/for-6.12] sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
Posted by Peter Zijlstra 1 year, 5 months ago
On Wed, Sep 04, 2024 at 02:47:03PM -1000, Tejun Heo wrote:
> sched_ext dispatches tasks from the BPF scheduler from balance_scx() and
> thus every pick_task_scx() call must be preceded by balance_scx(). While
> this usually holds, there are rare cases where a higher sched class's
> balance() returns true indicating that it has tasks to run on the CPU and
> thus terminating balance() calls but fails to actually find the next task to
> run when pick_task() is called. 

Oh cute. Which class in particular did you see this do?

Looking at balance_fair() / sched_balance_newidle() I suppose we could
verify we actually have a runnable task once we've re-acquired the
rq-lock and have pulled_task > 0.


Tightening all that up would probably be better than trying to deal with
the fallout like this, hmm?
Re: [PATCH sched_ext/for-6.12] sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
Posted by Peter Zijlstra 1 year, 5 months ago
On Thu, Sep 05, 2024 at 11:28:58AM +0200, Peter Zijlstra wrote:
> On Wed, Sep 04, 2024 at 02:47:03PM -1000, Tejun Heo wrote:
> > sched_ext dispatches tasks from the BPF scheduler from balance_scx() and
> > thus every pick_task_scx() call must be preceded by balance_scx(). While
> > this usually holds, there are rare cases where a higher sched class's
> > balance() returns true indicating that it has tasks to run on the CPU and
> > thus terminating balance() calls but fails to actually find the next task to
> > run when pick_task() is called. 
> 
> Oh cute. Which class in particular did you see this do?
> 
> Looking at balance_fair() / sched_balance_newidle() I suppose we could
> verify we actually have a runnable task once we've re-acquired the
> rq-lock and have pulled_task > 0.
> 
> 
> Tightening all that up would probably be better than trying to deal with
> the fallout like this, hmm?

Something like so. Haven't yet looked at the rt/dl classes.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11e890486c1b..7db42735d504 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12716,6 +12716,12 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	if (this_rq->cfs.h_nr_running && !pulled_task)
 		pulled_task = 1;
 
+	/*
+	 * We pulled a task, but it got stolen before we re-acquired rq->lock.
+	 */
+	if (!this_rq->cfs.h_nr_running && pulled_task)
+		pulled_task = 0;
+
 	/* Is there a task of a high priority class? */
 	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
 		pulled_task = -1;
Re: [PATCH sched_ext/for-6.12] sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
Posted by Tejun Heo 1 year, 5 months ago
Hello,

On Thu, Sep 05, 2024 at 05:00:12PM +0200, Peter Zijlstra wrote:
...
> > Oh cute. Which class in particular did you see this do?

The easiest repro was fair.

> > Looking at balance_fair() / sched_balance_newidle() I suppose we could
> > verify we actually have a runnable task once we've re-acquired the
> > rq-lock and have pulled_task > 0.
> > 
> > 
> > Tightening all that up would probably be better than trying to deal with
> > the fallout like this, hmm?

Oh, yeah, that would be better and we probably want to add a sanity check so
that we know if balance() and pick_task() disagree.

> Something like so. Haven't yet looked at the rt/dl classes.
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 11e890486c1b..7db42735d504 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12716,6 +12716,12 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
>  	if (this_rq->cfs.h_nr_running && !pulled_task)
>  		pulled_task = 1;
>  
> +	/*
> +	 * We pulled a task, but it got stolen before we re-acquired rq->lock.
> +	 */
> +	if (!this_rq->cfs.h_nr_running && pulled_task)
> +		pulled_task = 0;
> +

Lemme test that.

Thanks.

-- 
tejun
Re: [PATCH sched_ext/for-6.12] sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
Posted by Tejun Heo 1 year, 5 months ago
On Thu, Sep 05, 2024 at 06:41:42AM -1000, Tejun Heo wrote:
> > @@ -12716,6 +12716,12 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
> >  	if (this_rq->cfs.h_nr_running && !pulled_task)
> >  		pulled_task = 1;
> >  
> > +	/*
> > +	 * We pulled a task, but it got stolen before we re-acquired rq->lock.
> > +	 */
> > +	if (!this_rq->cfs.h_nr_running && pulled_task)
> > +		pulled_task = 0;
> > +
> 
> Lemme test that.

Did a bit of testing and it seems like it's mostly coming from delayed
dequeue handling. pick_next_entity() does this:

	struct sched_entity *se = pick_eevdf(cfs_rq);
	if (se->sched_delayed) {
		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
		SCHED_WARN_ON(se->sched_delayed);
		SCHED_WARN_ON(se->on_rq);
		return NULL;
	}

rq->cfs.nr_running includes the number of delay-dequeued tasks which aren't
really runnable, so it seems like balance_fair() is saying yes and
pick_next_entity() is then hitting a delayed task. Maybe the solution is
tracking the number of delayed ones and subtracting that from nr_running?
I'm trying that but can't get the delayed count straight for some reason.

Thanks.

-- 
tejun
Re: [PATCH sched_ext/for-6.12] sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
Posted by Peter Zijlstra 1 year, 5 months ago
On Thu, Sep 05, 2024 at 03:17:13PM -1000, Tejun Heo wrote:
> On Thu, Sep 05, 2024 at 06:41:42AM -1000, Tejun Heo wrote:
> > > @@ -12716,6 +12716,12 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
> > >  	if (this_rq->cfs.h_nr_running && !pulled_task)
> > >  		pulled_task = 1;
> > >  
> > > +	/*
> > > +	 * We pulled a task, but it got stolen before we re-acquired rq->lock.
> > > +	 */
> > > +	if (!this_rq->cfs.h_nr_running && pulled_task)
> > > +		pulled_task = 0;
> > > +
> > 
> > Lemme test that.
> 
> Did a bit of testing and it seems like it's mostly coming from delayed
> dequeue handling. pick_next_entity() does this:
> 
> 	struct sched_entity *se = pick_eevdf(cfs_rq);
> 	if (se->sched_delayed) {
> 		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> 		SCHED_WARN_ON(se->sched_delayed);
> 		SCHED_WARN_ON(se->on_rq);
> 		return NULL;
> 	}
> 
> rq->cfs.nr_running includes the number of delay dequeued tasks which aren't
> really runnable, so it seems like balance_fair() saying yes and
> pick_next_entity() then hitting a delayed task.

Duh, yes.

> Maybe the solution is
> tracking the number of delayed ones and subtracting that from nr_running?

That came up yesterday for something else as well. Let me see if I can
make that happen.


Anyway, I suppose you should keep your patch for now until I've managed to
sort this out.
Re: [PATCH sched_ext/for-6.12] sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
Posted by Tejun Heo 1 year, 2 months ago
Hello, Peter.

On Fri, Sep 06, 2024 at 11:04:20AM +0200, Peter Zijlstra wrote:
> On Thu, Sep 05, 2024 at 03:17:13PM -1000, Tejun Heo wrote:
> > On Thu, Sep 05, 2024 at 06:41:42AM -1000, Tejun Heo wrote:
> > > > @@ -12716,6 +12716,12 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
> > > >  	if (this_rq->cfs.h_nr_running && !pulled_task)
> > > >  		pulled_task = 1;
> > > >  
> > > > +	/*
> > > > +	 * We pulled a task, but it got stolen before we re-acquired rq->lock.
> > > > +	 */
> > > > +	if (!this_rq->cfs.h_nr_running && pulled_task)
> > > > +		pulled_task = 0;
> > > > +
> > > 
> > > Lemme test that.
> > 
> > Did a bit of testing and it seems like it's mostly coming from delayed
> > dequeue handling. pick_next_entity() does this:
> > 
> > 	struct sched_entity *se = pick_eevdf(cfs_rq);
> > 	if (se->sched_delayed) {
> > 		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> > 		SCHED_WARN_ON(se->sched_delayed);
> > 		SCHED_WARN_ON(se->on_rq);
> > 		return NULL;
> > 	}
> > 
> > rq->cfs.nr_running includes the number of delay dequeued tasks which aren't
> > really runnable, so it seems like balance_fair() saying yes and
> > pick_next_entity() then hitting a delayed task.
> 
> Duh, yes.
> 
> > Maybe the solution is
> > tracking the number of delayed ones and subtracting that from nr_running?
> 
> That came up yesterday for something else as well. Let me see if I can
> make that happen.
> 
> 
> Anyway, I suppose you keep your patch for now until I've managed to sort
> this out.

This still triggers. I'm going to apply the workaround for now.

Thanks.

-- 
tejun
[PATCH sched_ext/for-6.12] sched_ext: Temporarily work around pick_task_scx() being called without balance_scx()
Posted by Tejun Heo 1 year, 5 months ago
pick_task_scx() must be preceded by balance_scx() but there currently is a
bug where fair could say yes on balance() but no on pick_task(), which then
ends up calling pick_task_scx() without a preceding balance_scx(). Work
around this by dropping WARN_ON_ONCE() and ignoring cases which don't make
sense.

This isn't great and can theoretically lead to stalls. However, for
switch_all cases, this happens only while a BPF scheduler is being loaded or
unloaded, and, for partial cases, fair will likely keep triggering this CPU.

This will be reverted once the fair behavior is fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
Applying a simpler workaround to sched_ext/for-6.12 for the time being.

Thanks.

 kernel/sched/ext.c |   17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2909,9 +2909,24 @@ static struct task_struct *pick_task_scx
 	 * If balance_scx() is telling us to keep running @prev, replenish slice
 	 * if necessary and keep running @prev. Otherwise, pop the first one
 	 * from the local DSQ.
+	 *
+	 * WORKAROUND:
+	 *
+	 * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
+	 * have gone through balance_scx(). Unfortunately, there currently is a
+	 * bug where fair could say yes on balance() but no on pick_task(),
+	 * which then ends up calling pick_task_scx() without preceding
+	 * balance_scx().
+	 *
+	 * For now, ignore cases where $prev is not on SCX. This isn't great and
+	 * can theoretically lead to stalls. However, for switch_all cases, this
+	 * happens only while a BPF scheduler is being loaded or unloaded, and,
+	 * for partial cases, fair will likely keep triggering this CPU.
+	 *
+	 * Once fair is fixed, restore WARN_ON_ONCE().
 	 */
 	if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
-	    !WARN_ON_ONCE(prev->sched_class != &ext_sched_class)) {
+	    prev->sched_class == &ext_sched_class) {
 		p = prev;
 		if (!p->scx.slice)
 			p->scx.slice = SCX_SLICE_DFL;
Re: [PATCH sched_ext/for-6.12] sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
Posted by Tejun Heo 1 year, 5 months ago
On Thu, Sep 05, 2024 at 03:17:13PM -1000, Tejun Heo wrote:
> Did a bit of testing and it seems like it's mostly coming from delayed
> dequeue handling. pick_next_entity() does this:
> 
> 	struct sched_entity *se = pick_eevdf(cfs_rq);
> 	if (se->sched_delayed) {
> 		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> 		SCHED_WARN_ON(se->sched_delayed);
> 		SCHED_WARN_ON(se->on_rq);
> 		return NULL;
> 	}
> 
> rq->cfs.nr_running includes the number of delay dequeued tasks which aren't
> really runnable, so it seems like balance_fair() saying yes and
> pick_next_entity() then hitting a delayed task. Maybe the solution is
> tracking the number of delayed ones and subtracting that from nr_running?
> I'm trying that but can't get the delayed count straight for some reason.

Backported http://lkml.kernel.org/r/Ztpjt5Pz9pJliblL@slm.duckdns.org to
v6.10 and it doesn't trigger (at least not easily) while the warning
triggers immediately on the current tip/sched/core. It does look like the
problem is delayed dequeue.

Thanks.

-- 
tejun
[PATCH sched_ext/for-6.12-fixes v2] sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
Posted by Tejun Heo 1 year, 2 months ago
sched_ext dispatches tasks from the BPF scheduler from balance_scx() and
thus every pick_task_scx() call must be preceded by balance_scx(). While
this usually holds, due to a bug, there are cases where the fair class's
balance() returns true indicating that it has tasks to run on the CPU and
thus terminating balance() calls but fails to actually find the next task to
run when pick_task() is called. In such cases, pick_task_scx() can be called
without preceding balance_scx().

Detect this condition using the SCX_RQ_BAL_PENDING flag. If detected, keep
running the previous task if possible and avoid stalling from entering idle
without balancing.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/Ztj_h5c2LYsdXYbA@slm.duckdns.org
---
Applied to sched_ext/for-6.12-fixes.

 kernel/sched/core.c  | 13 ++++++++-----
 kernel/sched/ext.c   | 44 +++++++++++++++++++++++++++++++-------------
 kernel/sched/sched.h |  5 +++--
 3 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index aeb595514461..a910a5b4c274 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5914,12 +5914,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
 
 #ifdef CONFIG_SCHED_CLASS_EXT
 	/*
-	 * SCX requires a balance() call before every pick_next_task() including
-	 * when waking up from SCHED_IDLE. If @start_class is below SCX, start
-	 * from SCX instead.
+	 * SCX requires a balance() call before every pick_task() including when
+	 * waking up from SCHED_IDLE. If @start_class is below SCX, start from
+	 * SCX instead. Also, set a flag to detect missing balance() call.
 	 */
-	if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
-		start_class = &ext_sched_class;
+	if (scx_enabled()) {
+		rq->scx.flags |= SCX_RQ_BAL_PENDING;
+		if (sched_class_above(&ext_sched_class, start_class))
+			start_class = &ext_sched_class;
+	}
 #endif
 
 	/*
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3bdb08fc2056..19f9cb3a4190 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2634,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 
 	lockdep_assert_rq_held(rq);
 	rq->scx.flags |= SCX_RQ_IN_BALANCE;
-	rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
+	rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
 
 	if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
 	    unlikely(rq->scx.cpu_released)) {
@@ -2948,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 {
 	struct task_struct *prev = rq->curr;
 	struct task_struct *p;
+	bool prev_on_scx = prev->sched_class == &ext_sched_class;
+	bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+	bool kick_idle = false;
 
 	/*
-	 * If balance_scx() is telling us to keep running @prev, replenish slice
-	 * if necessary and keep running @prev. Otherwise, pop the first one
-	 * from the local DSQ.
-	 *
 	 * WORKAROUND:
 	 *
 	 * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
@@ -2962,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 	 * which then ends up calling pick_task_scx() without preceding
 	 * balance_scx().
 	 *
-	 * For now, ignore cases where $prev is not on SCX. This isn't great and
-	 * can theoretically lead to stalls. However, for switch_all cases, this
-	 * happens only while a BPF scheduler is being loaded or unloaded, and,
-	 * for partial cases, fair will likely keep triggering this CPU.
+	 * Keep running @prev if possible and avoid stalling from entering idle
+	 * without balancing.
 	 *
-	 * Once fair is fixed, restore WARN_ON_ONCE().
+	 * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
+	 * if pick_task_scx() is called without preceding balance_scx().
 	 */
-	if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
-	    prev->sched_class == &ext_sched_class) {
+	if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
+		if (prev_on_scx) {
+			keep_prev = true;
+		} else {
+			keep_prev = false;
+			kick_idle = true;
+		}
+	} else if (unlikely(keep_prev && !prev_on_scx)) {
+		/* only allowed during transitions */
+		WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+		keep_prev = false;
+	}
+
+	/*
+	 * If balance_scx() is telling us to keep running @prev, replenish slice
+	 * if necessary and keep running @prev. Otherwise, pop the first one
+	 * from the local DSQ.
+	 */
+	if (keep_prev) {
 		p = prev;
 		if (!p->scx.slice)
 			p->scx.slice = SCX_SLICE_DFL;
 	} else {
 		p = first_local_task(rq);
-		if (!p)
+		if (!p) {
+			if (kick_idle)
+				scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
 			return NULL;
+		}
 
 		if (unlikely(!p->scx.slice)) {
 			if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6085ef50febf..4d79804631e4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -751,8 +751,9 @@ enum scx_rq_flags {
 	 */
 	SCX_RQ_ONLINE		= 1 << 0,
 	SCX_RQ_CAN_STOP_TICK	= 1 << 1,
-	SCX_RQ_BAL_KEEP		= 1 << 2, /* balance decided to keep current */
-	SCX_RQ_BYPASSING	= 1 << 3,
+	SCX_RQ_BAL_PENDING	= 1 << 2, /* balance hasn't run yet */
+	SCX_RQ_BAL_KEEP		= 1 << 3, /* balance decided to keep current */
+	SCX_RQ_BYPASSING	= 1 << 4,
 
 	SCX_RQ_IN_WAKEUP	= 1 << 16,
 	SCX_RQ_IN_BALANCE	= 1 << 17,
-- 
2.47.0