sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity

[tip: sched/core] sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity
Posted by tip-bot2 for K Prateek Nayak 3 weeks, 4 days ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     fdfe5a8cd8731dd81840f26abfb6527edd27b0cb
Gitweb:        https://git.kernel.org/tip/fdfe5a8cd8731dd81840f26abfb6527edd27b0cb
Author:        K Prateek Nayak <kprateek.nayak@amd.com>
AuthorDate:    Sat, 16 May 2026 07:58:50 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 19 May 2026 12:17:38 +02:00

sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity

On asymmetric CPU capacity systems, the wakeup path uses
select_idle_capacity(), which scans the span of sd_asym_cpucapacity
rather than sd_llc.

The has_idle_cores hint however lives on sd_llc->shared, so the
wakeup-time read of has_idle_cores operates on an LLC-scoped blob while
the actual scan/decision spans the asym domain; nr_busy_cpus also lives
in the same shared sched_domain data, but it's never used in the asym
CPU capacity scenario.

Therefore, move the sched_domain_shared object to sd_asym_cpucapacity
whenever the CPU has a SD_ASYM_CPUCAPACITY_FULL ancestor and that
ancestor is non-overlapping (i.e., not built from SD_NUMA). In that case
the scope of has_idle_cores matches the scope of the wakeup scan.

Fall back to attaching the shared object to sd_llc in three cases:

  1) plain symmetric systems (no SD_ASYM_CPUCAPACITY_FULL anywhere);

  2) CPUs in an exclusive cpuset that carves out a symmetric capacity
     island: has_asym is system-wide but those CPUs have no
     SD_ASYM_CPUCAPACITY_FULL ancestor in their hierarchy and follow
     the symmetric LLC path in select_idle_sibling();

  3) exotic topologies where SD_ASYM_CPUCAPACITY_FULL lands on an
     SD_NUMA-built domain. init_sched_domain_shared() keys the shared
     blob off cpumask_first(span), which on overlapping NUMA domains
     would alias unrelated spans onto the same blob. Keep the shared
     object on the LLC there; select_idle_capacity() gracefully skips
     the has_idle_cores preference when sd->shared is NULL.

While at it, also rename the per-CPU sd_llc_shared to sd_balance_shared,
as it is no longer strictly tied to the LLC.

Co-developed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://patch.msgid.link/20260516055850.1345932-1-arighi@nvidia.com
---
 kernel/sched/fair.c     | 22 +++++----
 kernel/sched/sched.h    |  2 +-
 kernel/sched/topology.c | 95 ++++++++++++++++++++++++++++++++++------
 3 files changed, 97 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03f63b0..2637a6f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7773,7 +7773,7 @@ static inline void set_idle_cores(int cpu, int val)
 {
 	struct sched_domain_shared *sds;
 
-	sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+	sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
 	if (sds)
 		WRITE_ONCE(sds->has_idle_cores, val);
 }
@@ -7782,7 +7782,7 @@ static inline bool test_idle_cores(int cpu)
 {
 	struct sched_domain_shared *sds;
 
-	sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+	sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
 	if (sds)
 		return READ_ONCE(sds->has_idle_cores);
 
@@ -7791,7 +7791,7 @@ static inline bool test_idle_cores(int cpu)
 
 /*
  * Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
+ * information in sd_balance_shared->has_idle_cores.
  *
  * Since SMT siblings share all cache levels, inspecting this limited remote
  * state should be fairly cheap.
@@ -7821,7 +7821,8 @@ unlock:
 /*
  * Scan the entire LLC domain for idle cores; this dynamically switches off if
  * there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ * sd_balance_shared->has_idle_cores and enabled through update_idle_core()
+ * above.
  */
 static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
 {
@@ -7885,7 +7886,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool 
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
 
-	if (sched_feat(SIS_UTIL)) {
+	if (sched_feat(SIS_UTIL) && sd->shared) {
 		/*
 		 * Increment because !--nr is the condition to stop scan.
 		 *
@@ -12764,7 +12765,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		goto out;
 	}
 
-	sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
+	sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu));
 	if (sds) {
 		/*
 		 * If there is an imbalance between LLC domains (IOW we could
@@ -12792,7 +12793,11 @@ static void set_cpu_sd_state_busy(int cpu)
 	struct sched_domain *sd;
 	sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
 
-	if (!sd || !sd->nohz_idle)
+	/*
+	 * sd->nohz_idle only pairs with nr_busy_cpus on sd->shared; if this
+	 * domain has no shared object there is nothing to clear or account.
+	 */
+	if (!sd || !sd->shared || !sd->nohz_idle)
 		return;
 	sd->nohz_idle = 0;
 
@@ -12817,7 +12822,8 @@ static void set_cpu_sd_state_idle(int cpu)
 	struct sched_domain *sd;
 	sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
 
-	if (!sd || sd->nohz_idle)
+	/* See set_cpu_sd_state_busy(): nohz_idle is only used with sd->shared. */
+	if (!sd || !sd->shared || sd->nohz_idle)
 		return;
 	sd->nohz_idle = 1;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ffe77b2..bfb4b47 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2164,7 +2164,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(int, sd_share_id);
-DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a1f46e3..f96d501 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -665,7 +665,7 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(int, sd_share_id);
-DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
@@ -680,20 +680,38 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 	int size = 1;
 
+	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
+	/*
+	 * The shared object is attached to sd_asym_cpucapacity only when the
+	 * asym domain is non-overlapping (i.e., not built from SD_NUMA).
+	 * On overlapping (NUMA) asym domains we fall back to letting the
+	 * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
+	 * here.
+	 */
+	if (sd && sd->shared)
+		sds = sd->shared;
+
+	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
+
 	sd = highest_flag_domain(cpu, SD_SHARE_LLC);
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
 
-		/* If sd_llc exists, sd_llc_shared should exist too. */
-		WARN_ON_ONCE(!sd->shared);
-		sds = sd->shared;
+		/*
+		 * If sd_asym_cpucapacity didn't claim the shared object,
+		 * sd_llc must have one linked.
+		 */
+		if (!sds) {
+			WARN_ON_ONCE(!sd->shared);
+			sds = sd->shared;
+		}
 	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
-	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+	rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds);
 
 	sd = lowest_flag_domain(cpu, SD_CLUSTER);
 	if (sd)
@@ -711,9 +729,6 @@ static void update_top_cache_domain(int cpu)
 
 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
 	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
-
-	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
-	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
 }
 
 /*
@@ -2648,6 +2663,54 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
 	}
 }
 
+static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
+{
+	int sd_id = cpumask_first(sched_domain_span(sd));
+
+	sd->shared = *per_cpu_ptr(d->sds, sd_id);
+	/*
+	 * nr_busy_cpus is consumed only by the NOHZ kick path via
+	 * sd_balance_shared; on the asym-capacity path it is initialized but
+	 * never read.
+	 */
+	atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+	atomic_inc(&sd->shared->ref);
+}
+
+/*
+ * For asymmetric CPU capacity, attach sched_domain_shared on the innermost
+ * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is
+ * not an overlapping NUMA-built domain (then LLC should claim shared).
+ *
+ * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island),
+ * then LLC must claim shared instead.
+ *
+ * Note: SD_ASYM_CPUCAPACITY_FULL is only set when all CPU capacity values
+ * are present in the domain span, so the asym domain we attach to cannot
+ * degenerate into a single-capacity group. The relevant edge cases are instead
+ * covered by the caveats above.
+ *
+ * Return true if this CPU's asym path claimed sd->shared, false otherwise.
+ */
+static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
+{
+	struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu);
+	struct sched_domain *sd_asym;
+
+	if (!sd)
+		return false;
+
+	sd_asym = sd;
+	while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
+		sd_asym = sd_asym->parent;
+
+	if (!sd_asym || (sd_asym->flags & SD_NUMA))
+		return false;
+
+	init_sched_domain_shared(d, sd_asym);
+	return true;
+}
+
 /*
  * Build sched domains for a given set of CPUs and attach the sched domains
  * to the individual CPUs
@@ -2706,20 +2769,26 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	}
 
 	for_each_cpu(i, cpu_map) {
+		bool asym_claimed = false;
+
 		sd = *per_cpu_ptr(d.sd, i);
 		if (!sd)
 			continue;
 
+		if (has_asym)
+			asym_claimed = claim_asym_sched_domain_shared(&d, i);
+
 		/* First, find the topmost SD_SHARE_LLC domain */
 		while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
 			sd = sd->parent;
 
 		if (sd->flags & SD_SHARE_LLC) {
-			int sd_id = cpumask_first(sched_domain_span(sd));
-
-			sd->shared = *per_cpu_ptr(d.sds, sd_id);
-			atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
-			atomic_inc(&sd->shared->ref);
+			/*
+			 * Initialize the sd->shared for SD_SHARE_LLC unless
+			 * the asym path above already claimed it.
+			 */
+			if (!asym_claimed)
+				init_sched_domain_shared(&d, sd);
 
 			/*
 			 * In presence of higher domains, adjust the