Pull out the !sd check to simplify code.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/fair.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12811,14 +12811,16 @@ static int sched_balance_newidle(struct
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ goto out;
+ }
if (!get_rd_overloaded(this_rq->rd) ||
- (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+ this_rq->avg_idle < sd->max_newidle_lb_cost) {
- if (sd)
- update_next_balance(sd, &next_balance);
+ update_next_balance(sd, &next_balance);
rcu_read_unlock();
-
goto out;
}
rcu_read_unlock();
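
For reference, with this hunk applied the affected block of sched_balance_newidle() reads roughly as follows (reconstructed from the diff above, surrounding context elided):

	rcu_read_lock();
	sd = rcu_dereference_check_sched_domain(this_rq->sd);
	if (!sd) {
		/* No sched domain attached: nothing to balance. */
		rcu_read_unlock();
		goto out;
	}

	if (!get_rd_overloaded(this_rq->rd) ||
	    this_rq->avg_idle < sd->max_newidle_lb_cost) {
		/* Root domain not overloaded, or expected idle time too short. */
		update_next_balance(sd, &next_balance);
		rcu_read_unlock();
		goto out;
	}
	rcu_read_unlock();
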
On 11/7/25 9:36 PM, Peter Zijlstra wrote:
> Pull out the !sd check to simplify code.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/sched/fair.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12811,14 +12811,16 @@ static int sched_balance_newidle(struct
>
> rcu_read_lock();
> sd = rcu_dereference_check_sched_domain(this_rq->sd);
> + if (!sd) {
> + rcu_read_unlock();
> + goto out;
> + }
>
when is sd null? on isol_cpus ?
if sd is null, i think we are skipping these compared to earlier.
t0 = sched_clock_cpu(this_cpu);
sched_balance_update_blocked_averages(this_cpu);
> if (!get_rd_overloaded(this_rq->rd) ||
> - (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
> + this_rq->avg_idle < sd->max_newidle_lb_cost) {
>
> - if (sd)
> - update_next_balance(sd, &next_balance);
> + update_next_balance(sd, &next_balance);
> rcu_read_unlock();
> -
> goto out;
> }
> rcu_read_unlock();
>
>
On Wed, Nov 12, 2025 at 08:07:24PM +0530, Shrikanth Hegde wrote:
>
>
> On 11/7/25 9:36 PM, Peter Zijlstra wrote:
> > Pull out the !sd check to simplify code.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> > kernel/sched/fair.c | 10 ++++++----
> > 1 file changed, 6 insertions(+), 4 deletions(-)
> >
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -12811,14 +12811,16 @@ static int sched_balance_newidle(struct
> > rcu_read_lock();
> > sd = rcu_dereference_check_sched_domain(this_rq->sd);
> > + if (!sd) {
> > + rcu_read_unlock();
> > + goto out;
> > + }
>
> when is sd null? on isol_cpus ?
Yeah, partitions (cpuset/isol_cpus) but also spuriously during
domain-rebuild IIRC.
> if sd is null, i think we are skipping these compared to earlier.
>
> t0 = sched_clock_cpu(this_cpu);
> sched_balance_update_blocked_averages(this_cpu);
let me pull that sched_balance_update_blocked_averages() thing up a few
lines.
Thanks!
On Wed, Nov 12, 2025 at 03:42:41PM +0100, Peter Zijlstra wrote:
> > if sd is null, i think we are skipping these compared to earlier.
> >
> > t0 = sched_clock_cpu(this_cpu);
> > sched_balance_update_blocked_averages(this_cpu);
>
> let me pull that sched_balance_update_blocked_averages() thing up a few
> lines.
Something like so..
---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9946,15 +9946,11 @@ static unsigned long task_h_load(struct
}
#endif /* !CONFIG_FAIR_GROUP_SCHED */
-static void sched_balance_update_blocked_averages(int cpu)
+static void __sched_balance_update_blocked_averages(struct rq *rq)
{
bool decayed = false, done = true;
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
- rq_lock_irqsave(rq, &rf);
update_blocked_load_tick(rq);
- update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
decayed |= __update_blocked_fair(rq, &done);
@@ -9962,7 +9958,15 @@ static void sched_balance_update_blocked
update_blocked_load_status(rq, !done);
if (decayed)
cpufreq_update_util(rq, 0);
- rq_unlock_irqrestore(rq, &rf);
+}
+
+static void sched_balance_update_blocked_averages(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ guard(rq_lock_irqsave)(rq);
+ update_rq_clock(rq);
+ __sched_balance_update_blocked_averages(rq);
}
/********** Helpers for sched_balance_find_src_group ************************/
@@ -12865,6 +12869,8 @@ static int sched_balance_newidle(struct
if (!cpu_active(this_cpu))
return 0;
+ __sched_balance_update_blocked_averages(this_rq);
+
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
@@ -12891,7 +12897,6 @@ static int sched_balance_newidle(struct
raw_spin_rq_unlock(this_rq);
t0 = sched_clock_cpu(this_cpu);
- sched_balance_update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
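
For readers unfamiliar with the scope-based locking helpers (guard() from <linux/cleanup.h>): the new sched_balance_update_blocked_averages() wrapper above is equivalent to the following open-coded form, which simply mirrors the lines the diff removes (illustrative sketch only):

static void sched_balance_update_blocked_averages(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	rq_lock_irqsave(rq, &rf);
	update_rq_clock(rq);
	__sched_balance_update_blocked_averages(rq);
	rq_unlock_irqrestore(rq, &rf);
}

The guard() form drops the rq lock and restores IRQs automatically when the function returns.
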
On 11/12/25 8:38 PM, Peter Zijlstra wrote:
> On Wed, Nov 12, 2025 at 03:42:41PM +0100, Peter Zijlstra wrote:
>
>>> if sd is null, i think we are skipping these compared to earlier.
>>>
>>> t0 = sched_clock_cpu(this_cpu);
>>> sched_balance_update_blocked_averages(this_cpu);
>>
>> let me pull that sched_balance_update_blocked_averages() thing up a few
>> lines.
>
> Something like so..
>
> ---
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9946,15 +9946,11 @@ static unsigned long task_h_load(struct
> }
> #endif /* !CONFIG_FAIR_GROUP_SCHED */
>
> -static void sched_balance_update_blocked_averages(int cpu)
> +static void __sched_balance_update_blocked_averages(struct rq *rq)
> {
> bool decayed = false, done = true;
> - struct rq *rq = cpu_rq(cpu);
> - struct rq_flags rf;
>
> - rq_lock_irqsave(rq, &rf);
> update_blocked_load_tick(rq);
> - update_rq_clock(rq);
>
> decayed |= __update_blocked_others(rq, &done);
> decayed |= __update_blocked_fair(rq, &done);
> @@ -9962,7 +9958,15 @@ static void sched_balance_update_blocked
> update_blocked_load_status(rq, !done);
> if (decayed)
> cpufreq_update_util(rq, 0);
> - rq_unlock_irqrestore(rq, &rf);
> +}
> +
> +static void sched_balance_update_blocked_averages(int cpu)
> +{
> + struct rq *rq = cpu_rq(cpu);
> +
> + guard(rq_lock_irqsave)(rq);
> + update_rq_clock(rq);
> + __sched_balance_update_blocked_averages(rq);
> }
>
> /********** Helpers for sched_balance_find_src_group ************************/
> @@ -12865,6 +12869,8 @@ static int sched_balance_newidle(struct
> if (!cpu_active(this_cpu))
> return 0;
>
> + __sched_balance_update_blocked_averages(this_rq);
> +
is this done only when sd == null ?
> /*
> * This is OK, because current is on_cpu, which avoids it being picked
> * for load-balance and preemption/IRQs are still disabled avoiding
> @@ -12891,7 +12897,6 @@ static int sched_balance_newidle(struct
> raw_spin_rq_unlock(this_rq);
>
> t0 = sched_clock_cpu(this_cpu);
> - sched_balance_update_blocked_averages(this_cpu);
>
> rcu_read_lock();
> for_each_domain(this_cpu, sd) {
Referring to commit,
9d783c8dd112a (sched/fair: Skip update_blocked_averages if we are defering load balance)
I think vincent added the max_newidle_lb_cost check since sched_balance_update_blocked_averages is costly.
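
For context, the gist of that commit in terms of the code quoted above (a sketch assembled from the hunks in this thread; RCU locking and surrounding context elided): the blocked-load update is deferred until the avg_idle check has decided that a newidle balance is actually worth attempting.

	if (!get_rd_overloaded(this_rq->rd) ||
	    (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
		if (sd)
			update_next_balance(sd, &next_balance);
		/* Balance deferred: skip the costly blocked-averages update too. */
		goto out;
	}

	t0 = sched_clock_cpu(this_cpu);
	sched_balance_update_blocked_averages(this_cpu);
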
On Wed, Nov 12, 2025 at 08:58:23PM +0530, Shrikanth Hegde wrote:
> > @@ -12865,6 +12869,8 @@ static int sched_balance_newidle(struct
> > if (!cpu_active(this_cpu))
> > return 0;
> > + __sched_balance_update_blocked_averages(this_rq);
> > +
>
> is this done only when sd == null ?
It's done always.
> > /*
> > * This is OK, because current is on_cpu, which avoids it being picked
> > * for load-balance and preemption/IRQs are still disabled avoiding
> > @@ -12891,7 +12897,6 @@ static int sched_balance_newidle(struct
> > raw_spin_rq_unlock(this_rq);
> > t0 = sched_clock_cpu(this_cpu);
> > - sched_balance_update_blocked_averages(this_cpu);
> > rcu_read_lock();
> > for_each_domain(this_cpu, sd) {
>
> Referring to commit,
> 9d783c8dd112a (sched/fair: Skip update_blocked_averages if we are defering load balance)
> I think vincent added the max_newidle_lb_cost check since sched_balance_update_blocked_averages is costly.
That seems to suggest we only should do
sched_balance_update_blocked_averages() when we're going to do
balancing and so skipping when !sd is fine.
On Fri, 14 Nov 2025 at 10:49, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Nov 12, 2025 at 08:58:23PM +0530, Shrikanth Hegde wrote:
>
> > > @@ -12865,6 +12869,8 @@ static int sched_balance_newidle(struct
> > > if (!cpu_active(this_cpu))
> > > return 0;
> > > + __sched_balance_update_blocked_averages(this_rq);
> > > +
> >
> > is this done only when sd == null ?
>
> It's done always.
>
> > > /*
> > > * This is OK, because current is on_cpu, which avoids it being picked
> > > * for load-balance and preemption/IRQs are still disabled avoiding
> > > @@ -12891,7 +12897,6 @@ static int sched_balance_newidle(struct
> > > raw_spin_rq_unlock(this_rq);
> > > t0 = sched_clock_cpu(this_cpu);
> > > - sched_balance_update_blocked_averages(this_cpu);
> > > rcu_read_lock();
> > > for_each_domain(this_cpu, sd) {
> >
> > Referring to commit,
> > 9d783c8dd112a (sched/fair: Skip update_blocked_averages if we are defering load balance)
> > I think vincent added the max_newidle_lb_cost check since sched_balance_update_blocked_averages is costly.
>
> That seems to suggest we only should do
> sched_balance_update_blocked_averages() when we're going to do
> balancing and so skipping when !sd is fine.
sched_balance_update_blocked_averages() can be costly, so we want to
include it in sd->max_newidle_lb_cost; that way we skip both the newidle
lb and the blocked-load update if avg_idle is shorter than doing the
update plus the newidle lb of the 1st level.

when sd is null we should still skip
sched_balance_update_blocked_averages() if its cost is higher than
avg_idle, but it seems that we don't.
>
>
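
The accounting being described, roughly (a simplified sketch, not the exact upstream loop; curr_cost and the per-domain timing are assumed to follow the current upstream function): t0 is taken before the blocked-averages update, so the first domain_cost measured in the balance loop, which feeds sd->max_newidle_lb_cost, also covers that update.

	t0 = sched_clock_cpu(this_cpu);
	sched_balance_update_blocked_averages(this_cpu);

	rcu_read_lock();
	for_each_domain(this_cpu, sd) {
		u64 domain_cost, t1;

		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
			break;

		/* ... newidle balance pass on this domain elided ... */

		t1 = sched_clock_cpu(this_cpu);
		domain_cost = t1 - t0;		/* first pass includes the blocked-averages update */
		curr_cost += domain_cost;	/* and feeds sd->max_newidle_lb_cost */
		t0 = t1;
	}
	rcu_read_unlock();
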
On Fri, Nov 14, 2025 at 11:22:48AM +0100, Vincent Guittot wrote:

> sched_balance_update_blocked_averages() can be costly, so we want to
> include it in sd->max_newidle_lb_cost; that way we skip both the newidle
> lb and the blocked-load update if avg_idle is shorter than doing the
> update plus the newidle lb of the 1st level.

Ah, fair enough.

> when sd is null we should still skip
> sched_balance_update_blocked_averages() if its cost is higher than
> avg_idle, but it seems that we don't.

When !sd there is no balancing to be had.

Anyway, I'll keep the patch as is -- introducing an early exit for !sd
and I'll post a few more cleanups after this.
On Fri, 14 Nov 2025 at 12:05, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Nov 14, 2025 at 11:22:48AM +0100, Vincent Guittot wrote:
>
> > sched_balance_update_blocked_averages() can be costly, so we want to
> > include it in sd->max_newidle_lb_cost; that way we skip both the newidle
> > lb and the blocked-load update if avg_idle is shorter than doing the
> > update plus the newidle lb of the 1st level.
>
> Ah, fair enough.
>
> > when sd is null we should still skip
> > sched_balance_update_blocked_averages() if its cost is higher than
> > avg_idle, but it seems that we don't.
>
> When !sd there is no balancing to be had.
>
> Anyway, I'll keep the patch as is -- introducing an early exit for !sd
> and I'll post a few more cleanups after this.

okay
On 07.11.25 17:06, Peter Zijlstra wrote:
> Pull out the !sd check to simplify code.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/sched/fair.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12811,14 +12811,16 @@ static int sched_balance_newidle(struct
>
> rcu_read_lock();
Maybe scoped_guard(rcu) here?
> sd = rcu_dereference_check_sched_domain(this_rq->sd);
> + if (!sd) {
> + rcu_read_unlock();
> + goto out;
> + }
>
> if (!get_rd_overloaded(this_rq->rd) ||
> - (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
> + this_rq->avg_idle < sd->max_newidle_lb_cost) {
>
> - if (sd)
> - update_next_balance(sd, &next_balance);
> + update_next_balance(sd, &next_balance);
> rcu_read_unlock();
> -
> goto out;
> }
> rcu_read_unlock();
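
For illustration, the scoped_guard() variant being suggested could look roughly like this (a sketch only, assuming the existing rcu guard class; leaving the scope, including via the goto, drops the RCU read lock automatically):

	scoped_guard(rcu) {
		sd = rcu_dereference_check_sched_domain(this_rq->sd);
		if (!sd)
			goto out;

		if (!get_rd_overloaded(this_rq->rd) ||
		    this_rq->avg_idle < sd->max_newidle_lb_cost) {
			update_next_balance(sd, &next_balance);
			goto out;
		}
	}
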
On Mon, Nov 10, 2025 at 02:55:34PM +0100, Dietmar Eggemann wrote:

> On 07.11.25 17:06, Peter Zijlstra wrote:
> > Pull out the !sd check to simplify code.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> > kernel/sched/fair.c | 10 ++++++----
> > 1 file changed, 6 insertions(+), 4 deletions(-)
> >
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -12811,14 +12811,16 @@ static int sched_balance_newidle(struct
> >
> > rcu_read_lock();
>
> Maybe scoped_guard(rcu) here?

Thought about that, but ideally we'd simply remove all that. This has
IRQs disabled and that is sufficient.
The following commit has been merged into the sched/core branch of tip:
Commit-ID: e78e70dbf603c1425f15f32b455ca148c932f6c1
Gitweb: https://git.kernel.org/tip/e78e70dbf603c1425f15f32b455ca148c932f6c1
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 07 Nov 2025 17:01:24 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 17 Nov 2025 17:13:15 +01:00
sched/fair: Small cleanup to sched_balance_newidle()
Pull out the !sd check to simplify code.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://patch.msgid.link/20251107161739.525916173@infradead.org
---
kernel/sched/fair.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b73af37..75f891d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12879,14 +12879,16 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ goto out;
+ }
if (!get_rd_overloaded(this_rq->rd) ||
- (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+ this_rq->avg_idle < sd->max_newidle_lb_cost) {
- if (sd)
- update_next_balance(sd, &next_balance);
+ update_next_balance(sd, &next_balance);
rcu_read_unlock();
-
goto out;
}
rcu_read_unlock();
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 0a37a229bd68647ed01aa687f2249056f080265e
Gitweb: https://git.kernel.org/tip/0a37a229bd68647ed01aa687f2249056f080265e
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 07 Nov 2025 17:01:24 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 14 Nov 2025 13:03:07 +01:00
sched/fair: Small cleanup to sched_balance_newidle()
Pull out the !sd check to simplify code.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://patch.msgid.link/20251107161739.525916173@infradead.org
---
kernel/sched/fair.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfb8935..2870523 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12878,14 +12878,16 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ goto out;
+ }
if (!get_rd_overloaded(this_rq->rd) ||
- (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+ this_rq->avg_idle < sd->max_newidle_lb_cost) {
- if (sd)
- update_next_balance(sd, &next_balance);
+ update_next_balance(sd, &next_balance);
rcu_read_unlock();
-
goto out;
}
rcu_read_unlock();