[tip: sched/core] sched/topology: Allow multiple domains to claim sched_domain_shared

tip-bot2 for K Prateek Nayak posted 1 patch 4 days, 14 hours ago
include/linux/sched/topology.h | 16 +++++++-
kernel/sched/topology.c        | 63 ++++++++++++++++++++++++++++-----
2 files changed, 69 insertions(+), 10 deletions(-)
[tip: sched/core] sched/topology: Allow multiple domains to claim sched_domain_shared
Posted by tip-bot2 for K Prateek Nayak 4 days, 14 hours ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     9e005ed21152d4a4bb0ceea71045ff8a642a6feb
Gitweb:        https://git.kernel.org/tip/9e005ed21152d4a4bb0ceea71045ff8a642a6feb
Author:        K Prateek Nayak <kprateek.nayak@amd.com>
AuthorDate:    Tue, 19 May 2026 05:14:23 
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 19 May 2026 13:35:36 +02:00

sched/topology: Allow multiple domains to claim sched_domain_shared

Recent optimizations of sd->shared assignment moved to allocating a
single instance of per-CPU sched_domain_shared objects per s_data.

Recent optimizations to select_idle_capacity() moved the sd->shared
assignments to "sd_asym" domain when ASYM_CPUCAPACITY is detected but
cache-aware scheduling mandates the presence of "sd_llc_shared" to
compute and cache per-LLC statistics.

Use an "alloc_flags" union in sched_domain_shared to claim a
sched_domain_shared object per sched_domain. Allocation starts searching
for an available / matching sched_domain_shared instance from the first
CPU of sched_domain_span(sd) (sd can be sd_llc, or sd_asym). If the
shared object is claimed by another domain, the instance corresponding
to next CPU in the domain span is explored until a matching / available
instance is found.

In case of a single CPU in sched_domain_span(), the domain will be
degenerated and a temporary overlap of ->shared objects across different
domains is acceptable.

"alloc_flags" forms a union with "nr_idle_scan" and the stale flags are
left as is when the sd->shared is published. The expectation is for the
first load balancing instance to correct the value just like the current
behavior, except the initial value is no longer 0.

Originally-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/sched/topology.h | 16 +++++++-
 kernel/sched/topology.c        | 63 ++++++++++++++++++++++++++++-----
 2 files changed, 69 insertions(+), 10 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fe09d32..b5d9d7c 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -67,7 +67,21 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
-	int		nr_idle_scan;
+	union {
+		int	nr_idle_scan;
+		/*
+		 * Used during allocation to claim the sched_domain_shared
+		 * object at multiple levels.
+		 *
+		 * Note: between build and the first periodic LB tick, which
+		 * rewrites the union via update_idle_cpu_scan(), readers of
+		 * nr_idle_scan may observe the transient SD_* flag value as
+		 * the scan bound. The flag bits are small positive integers,
+		 * so the effect is just a slightly relaxed scan bound for one
+		 * window and self-heals on the first tick.
+		 */
+		int	alloc_flags;
+	};
 #ifdef CONFIG_SCHED_CACHE
 	unsigned long	util_avg;
 	unsigned long	capacity;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index dbfd965..df2ceb5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -623,6 +623,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
 	} while (sg != first);
 }
 
+static void free_sched_domain_shared(struct sched_domain_shared *sds)
+{
+	if (sds && atomic_dec_and_test(&sds->ref))
+		kfree(sds);
+}
+
 static void destroy_sched_domain(struct sched_domain *sd)
 {
 	/*
@@ -631,9 +637,7 @@ static void destroy_sched_domain(struct sched_domain *sd)
 	 * dropping group/capacity references, freeing where none remain.
 	 */
 	free_sched_groups(sd->groups, 1);
-
-	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
-		kfree(sd->shared);
+	free_sched_domain_shared(sd->shared);
 
 #ifdef CONFIG_SCHED_CACHE
 	/* only the bottom sd has llc_counts array */
@@ -755,7 +759,14 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 
 			/* Pick reference to parent->shared. */
 			if (parent->shared) {
-				WARN_ON_ONCE(tmp->shared);
+				/*
+				 * It is safe to free a sd->shared that
+				 * has not been published yet. If a
+				 * sd->shared was published, the refcount
+				 * will end up being non-zero and it will
+				 * not be freed here.
+				 */
+				free_sched_domain_shared(tmp->shared);
 				tmp->shared = parent->shared;
 				parent->shared = NULL;
 			}
@@ -2916,11 +2927,45 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
 	}
 }
 
-static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
+static void
+init_sched_domain_shared(struct s_data *d, struct sched_domain *sd, int flags)
 {
-	int sd_id = cpumask_first(sched_domain_span(sd));
+	struct sched_domain_shared *sds = NULL;
+	int cpu;
+
+	/*
+	 * Multiple domains (e.g. the SD_ASYM_CPUCAPACITY and the
+	 * SD_SHARE_LLC domain) can try to claim a shared object, and
+	 * both can alias to the same cpumask_first(sched_domain_span(sd))
+	 * CPU, which can cause "nr_idle_scan" to be populated
+	 * incorrectly during load balancing.
+	 *
+	 * Find the first CPU in sched_domain_span(sd) whose shared
+	 * object is unclaimed (!alloc_flags) or whose alloc_flags
+	 * match the requested SD_* flag.
+	 *
+	 * If the domain only has single CPU, allow temporary overlap
+	 * in allocation since the domains will be degenerated later.
+	 */
+	for_each_cpu(cpu, sched_domain_span(sd)) {
+		sds = *per_cpu_ptr(d->sds, cpu);
+
+		if (!sds->alloc_flags ||
+		    sd->span_weight == 1 ||
+		    sds->alloc_flags == flags) {
+			sds->alloc_flags = flags;
+			sd->shared = sds;
+			break;
+		}
+	}
+
+	/*
+	 * Use the sd_shared corresponding to the last
+	 * CPU in the span if none are available.
+	 */
+	if (WARN_ON_ONCE(!sd->shared))
+		sd->shared = sds;
 
-	sd->shared = *per_cpu_ptr(d->sds, sd_id);
 	/*
 	 * nr_busy_cpus is consumed only by the NOHZ kick path via
 	 * sd_balance_shared; on the asym-capacity path it is initialized but
@@ -2960,7 +3005,7 @@ static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
 	if (!sd_asym || (sd_asym->flags & SD_NUMA))
 		return false;
 
-	init_sched_domain_shared(d, sd_asym);
+	init_sched_domain_shared(d, sd_asym, SD_ASYM_CPUCAPACITY);
 	return true;
 }
 
@@ -3115,7 +3160,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 			sd = sd->parent;
 
 		if (sd->flags & SD_SHARE_LLC) {
-			init_sched_domain_shared(&d, sd);
+			init_sched_domain_shared(&d, sd, SD_SHARE_LLC);
 
 			/*
 			 * In presence of higher domains, adjust the