kernel/sched/fair.c | 22 +++++----
kernel/sched/sched.h | 2 +-
kernel/sched/topology.c | 95 ++++++++++++++++++++++++++++++++++------
3 files changed, 97 insertions(+), 22 deletions(-)
The following commit has been merged into the sched/core branch of tip:
Commit-ID: fdfe5a8cd8731dd81840f26abfb6527edd27b0cb
Gitweb: https://git.kernel.org/tip/fdfe5a8cd8731dd81840f26abfb6527edd27b0cb
Author: K Prateek Nayak <kprateek.nayak@amd.com>
AuthorDate: Sat, 16 May 2026 07:58:50 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 19 May 2026 12:17:38 +02:00
sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity
On asymmetric CPU capacity systems, the wakeup path uses
select_idle_capacity(), which scans the span of sd_asym_cpucapacity
rather than sd_llc.
However, the has_idle_cores hint lives on sd_llc->shared, so the
wakeup-time read of has_idle_cores operates on an LLC-scoped blob while
the actual scan/decision spans the asym domain. nr_busy_cpus also lives
in the same shared sched_domain data, but it is never used in the asym
CPU capacity scenario.
Therefore, move the sched_domain_shared object to sd_asym_cpucapacity
whenever the CPU has a SD_ASYM_CPUCAPACITY_FULL ancestor and that
ancestor is non-overlapping (i.e., not built from SD_NUMA). In that case
the scope of has_idle_cores matches the scope of the wakeup scan.
Fall back to attaching the shared object to sd_llc in three cases:
1) plain symmetric systems (no SD_ASYM_CPUCAPACITY_FULL anywhere);
2) CPUs in an exclusive cpuset that carves out a symmetric capacity
island: has_asym is system-wide but those CPUs have no
SD_ASYM_CPUCAPACITY_FULL ancestor in their hierarchy and follow
the symmetric LLC path in select_idle_sibling();
3) exotic topologies where SD_ASYM_CPUCAPACITY_FULL lands on an
SD_NUMA-built domain. init_sched_domain_shared() keys the shared
blob off cpumask_first(span), which on overlapping NUMA domains
would alias unrelated spans onto the same blob. Keep the shared
object on the LLC there; select_idle_capacity() gracefully skips
the has_idle_cores preference when sd->shared is NULL.
While at it, also rename the per-CPU sd_llc_shared to sd_balance_shared,
as it is no longer strictly tied to the LLC.
Co-developed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://patch.msgid.link/20260516055850.1345932-1-arighi@nvidia.com
---
kernel/sched/fair.c | 22 +++++----
kernel/sched/sched.h | 2 +-
kernel/sched/topology.c | 95 ++++++++++++++++++++++++++++++++++------
3 files changed, 97 insertions(+), 22 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03f63b0..2637a6f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7773,7 +7773,7 @@ static inline void set_idle_cores(int cpu, int val)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
if (sds)
WRITE_ONCE(sds->has_idle_cores, val);
}
@@ -7782,7 +7782,7 @@ static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);
@@ -7791,7 +7791,7 @@ static inline bool test_idle_cores(int cpu)
/*
* Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
+ * information in sd_balance_shared->has_idle_cores.
*
* Since SMT siblings share all cache levels, inspecting this limited remote
* state should be fairly cheap.
@@ -7821,7 +7821,8 @@ unlock:
/*
* Scan the entire LLC domain for idle cores; this dynamically switches off if
* there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ * sd_balance_shared->has_idle_cores and enabled through update_idle_core()
+ * above.
*/
static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
@@ -7885,7 +7886,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
- if (sched_feat(SIS_UTIL)) {
+ if (sched_feat(SIS_UTIL) && sd->shared) {
/*
* Increment because !--nr is the condition to stop scan.
*
@@ -12764,7 +12765,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto out;
}
- sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
@@ -12792,7 +12793,11 @@ static void set_cpu_sd_state_busy(int cpu)
struct sched_domain *sd;
sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
- if (!sd || !sd->nohz_idle)
+ /*
+ * sd->nohz_idle only pairs with nr_busy_cpus on sd->shared; if this
+ * domain has no shared object there is nothing to clear or account.
+ */
+ if (!sd || !sd->shared || !sd->nohz_idle)
return;
sd->nohz_idle = 0;
@@ -12817,7 +12822,8 @@ static void set_cpu_sd_state_idle(int cpu)
struct sched_domain *sd;
sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
- if (!sd || sd->nohz_idle)
+ /* See set_cpu_sd_state_busy(): nohz_idle is only used with sd->shared. */
+ if (!sd || !sd->shared || sd->nohz_idle)
return;
sd->nohz_idle = 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ffe77b2..bfb4b47 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2164,7 +2164,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(int, sd_share_id);
-DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a1f46e3..f96d501 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -665,7 +665,7 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(int, sd_share_id);
-DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
@@ -680,20 +680,38 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
int size = 1;
+ sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
+ /*
+ * The shared object is attached to sd_asym_cpucapacity only when the
+ * asym domain is non-overlapping (i.e., not built from SD_NUMA).
+ * On overlapping (NUMA) asym domains we fall back to letting the
+ * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
+ * here.
+ */
+ if (sd && sd->shared)
+ sds = sd->shared;
+
+ rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
+
sd = highest_flag_domain(cpu, SD_SHARE_LLC);
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
- /* If sd_llc exists, sd_llc_shared should exist too. */
- WARN_ON_ONCE(!sd->shared);
- sds = sd->shared;
+ /*
+ * If sd_asym_cpucapacity didn't claim the shared object,
+ * sd_llc must have one linked.
+ */
+ if (!sds) {
+ WARN_ON_ONCE(!sd->shared);
+ sds = sd->shared;
+ }
}
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
- rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+ rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds);
sd = lowest_flag_domain(cpu, SD_CLUSTER);
if (sd)
@@ -711,9 +729,6 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
-
- sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
- rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}
/*
@@ -2648,6 +2663,54 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
}
}
+static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
+{
+ int sd_id = cpumask_first(sched_domain_span(sd));
+
+ sd->shared = *per_cpu_ptr(d->sds, sd_id);
+ /*
+ * nr_busy_cpus is consumed only by the NOHZ kick path via
+ * sd_balance_shared; on the asym-capacity path it is initialized but
+ * never read.
+ */
+ atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+ atomic_inc(&sd->shared->ref);
+}
+
+/*
+ * For asymmetric CPU capacity, attach sched_domain_shared on the innermost
+ * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is
+ * not an overlapping NUMA-built domain (then LLC should claim shared).
+ *
+ * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island);
+ * in that case the LLC must claim shared instead.
+ *
+ * Note: SD_ASYM_CPUCAPACITY_FULL is only set when all CPU capacity values
+ * are present in the domain span, so the asym domain we attach to cannot
+ * degenerate into a single-capacity group. The relevant edge cases are instead
+ * covered by the caveats above.
+ *
+ * Return true if this CPU's asym path claimed sd->shared, false otherwise.
+ */
+static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
+{
+ struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu);
+ struct sched_domain *sd_asym;
+
+ if (!sd)
+ return false;
+
+ sd_asym = sd;
+ while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
+ sd_asym = sd_asym->parent;
+
+ if (!sd_asym || (sd_asym->flags & SD_NUMA))
+ return false;
+
+ init_sched_domain_shared(d, sd_asym);
+ return true;
+}
+
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
@@ -2706,20 +2769,26 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
for_each_cpu(i, cpu_map) {
+ bool asym_claimed = false;
+
sd = *per_cpu_ptr(d.sd, i);
if (!sd)
continue;
+ if (has_asym)
+ asym_claimed = claim_asym_sched_domain_shared(&d, i);
+
/* First, find the topmost SD_SHARE_LLC domain */
while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
sd = sd->parent;
if (sd->flags & SD_SHARE_LLC) {
- int sd_id = cpumask_first(sched_domain_span(sd));
-
- sd->shared = *per_cpu_ptr(d.sds, sd_id);
- atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
- atomic_inc(&sd->shared->ref);
+ /*
+ * Initialize the sd->shared for SD_SHARE_LLC unless
+ * the asym path above already claimed it.
+ */
+ if (!asym_claimed)
+ init_sched_domain_shared(&d, sd);
/*
* In presence of higher domains, adjust the
© 2016 - 2026 Red Hat, Inc.