include/linux/sched/topology.h | 1 - kernel/sched/fair.c | 16 ++++------------ 2 files changed, 4 insertions(+), 13 deletions(-)
sd->nohz_idle is used to call once inc|dec of &sd->shared->nr_busy_cpus
when entering or leaving idle state but the call to
set_cpu_sd_state_idle|busy is already protected by rq->nohz_tick_stopped
being already set or clear.
Remove the useless sd->nohz_idle field which equals !rq->nohz_tick_stopped.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
include/linux/sched/topology.h | 1 -
kernel/sched/fair.c | 16 ++++------------
2 files changed, 4 insertions(+), 13 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index a1e1032426dc..59e498006072 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -82,7 +82,6 @@ struct sched_domain {
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int imb_numa_nr; /* Nr running tasks that allows a NUMA imbalance */
- int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
int level;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d57c02e82f3a..888875e91073 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12818,12 +12818,8 @@ static void set_cpu_sd_state_busy(int cpu)
rcu_read_lock();
sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
- if (!sd || !sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 0;
-
- atomic_inc(&sd->shared->nr_busy_cpus);
-unlock:
+ if (likely(sd))
+ atomic_inc(&sd->shared->nr_busy_cpus);
rcu_read_unlock();
}
@@ -12847,12 +12843,8 @@ static void set_cpu_sd_state_idle(int cpu)
rcu_read_lock();
sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
- if (!sd || sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 1;
-
- atomic_dec(&sd->shared->nr_busy_cpus);
-unlock:
+ if (likely(sd))
+ atomic_dec(&sd->shared->nr_busy_cpus);
rcu_read_unlock();
}
--
2.43.0
Hello Vincent,
On 2/27/2026 10:17 PM, Vincent Guittot wrote:
> sd->nohz_idle is used to call once inc|dec of &sd->shared->nr_busy_cpus
> when entering or leaving idle state but the call to
> set_cpu_sd_state_idle|busy is already protected by rq->nohz_tick_stopped
> being already set or clear.
>
> Remove the useless sd->nohz_idle field which equals !rq->nohz_tick_stopped.
I had looked at this recently and I believe the following is
possible with hotplug/cpuset:
CPU0 CPU1
==== ====
nohz_balance_enter_idle()
atomic_dec(&sd->shared->nr_busy_cpus);
... /* Idle */
/* Sched domains are rebuilt */
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
update_top_cache_domain(0 /* CPU0 */)
sd = highest_flag_domain(cpu, SD_SHARE_LLC);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); /* For CPU0 */
... /* Exits idle */
...
... /* First tick hits */
/* Exits idle; First tick hits */
if (rq->nohz_tick_stopped) /* True */
nohz_balance_exit_idle()
set_cpu_sd_state_busy()
atomic_inc(&sd->shared->nr_busy_cpus) /* !!! Crosses sd_weight !!! */
I feel this per-SD indicator is necessary to avoid this scenario.
Is it fixed in some other way that I haven't realised yet?
--
Thanks and Regards,
Prateek
On Fri, 27 Feb 2026 at 18:33, K Prateek Nayak <kprateek.nayak@amd.com> wrote: > > Hello Vincent, > > On 2/27/2026 10:17 PM, Vincent Guittot wrote: > > sd->nohz_idle is used to call once inc|dec of &sd->shared->nr_busy_cpus > > when entering or leaving idle state but the call to > > set_cpu_sd_state_idle|busy is already protected by rq->nohz_tick_stopped > > being already set or clear. > > > > Remove the useless sd->nohz_idle field which equals !rq->nohz_tick_stopped. > > I had looked at this recently and I believe the following is > possible with hotplug/cpuset: > > CPU0 CPU1 > ==== ==== > > nohz_balance_enter_idle() > atomic_dec(&sd->shared->nr_busy_cpus); > ... /* Idle */ > /* Sched domains are rebuilt */ > atomic_set(&sd->shared->nr_busy_cpus, sd_weight); > update_top_cache_domain(0 /* CPU0 */) > sd = highest_flag_domain(cpu, SD_SHARE_LLC); > rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); /* For CPU0 */ > ... /* Exits idle */ > ... > ... /* First tick hits */ > /* Exits idle; First tick hits */ > if (rq->nohz_tick_stopped) /* True */ > nohz_balance_exit_idle() > set_cpu_sd_state_busy() > atomic_inc(&sd->shared->nr_busy_cpus) /* !!! Crosses sd_weight !!! */ > > > I feel this per-SD indicator is necessary to avoid this scenario. > Is it fixed in some other way that I haven't realised yet? You're right ! And I don't have an easy way to fix this > > -- > Thanks and Regards, > Prateek >
© 2016 - 2026 Red Hat, Inc.