[tip: sched/core] sched/topology: Extract "imb_numa_nr" calculation into a separate helper

tip-bot2 for K Prateek Nayak posted 1 patch 2 weeks, 5 days ago
kernel/sched/topology.c | 133 +++++++++++++++++++++++----------------
1 file changed, 80 insertions(+), 53 deletions(-)
[tip: sched/core] sched/topology: Extract "imb_numa_nr" calculation into a separate helper
Posted by tip-bot2 for K Prateek Nayak 2 weeks, 5 days ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     5a7b576b3ec1acc2694c5b58f80cd1d44a11b2c1
Gitweb:        https://git.kernel.org/tip/5a7b576b3ec1acc2694c5b58f80cd1d44a11b2c1
Author:        K Prateek Nayak <kprateek.nayak@amd.com>
AuthorDate:    Thu, 12 Mar 2026 04:44:27 
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 18 Mar 2026 09:06:48 +01:00

sched/topology: Extract "imb_numa_nr" calculation into a separate helper

Subsequent changes to assign "sd->shared" from "s_data" would
necessitate finding the topmost SD_SHARE_LLC to assign shared object to.

This is very similar to the "imb_numa_nr" computation loop except that
"imb_numa_nr" cares about the first domain without the SD_SHARE_LLC flag
(immediate parent of sd_llc) whereas the "sd->shared" assignment would
require sd_llc itself.

Extract the "imb_numa_nr" calculation into a separate helper,
adjust_numa_imbalance(), and use the existing loop in
build_sched_domains() to find sd_llc.

While at it, guard the call behind CONFIG_NUMA, since "imb_numa_nr"
only makes sense on NUMA-enabled configs with SD_NUMA domains.

No functional changes intended.

Suggested-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-3-kprateek.nayak@amd.com
---
 kernel/sched/topology.c | 133 +++++++++++++++++++++++----------------
 1 file changed, 80 insertions(+), 53 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79bab80..6303790 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2550,6 +2550,74 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
 }
 
 /*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+static void adjust_numa_imbalance(struct sched_domain *sd_llc)
+{
+	struct sched_domain *parent;
+	unsigned int imb_span = 1;
+	unsigned int imb = 0;
+	unsigned int nr_llcs;
+
+	WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
+	WARN_ON(!sd_llc->parent);
+
+	/*
+	 * For a single LLC per node, allow an
+	 * imbalance up to 12.5% of the node. This is
+	 * arbitrary cutoff based two factors -- SMT and
+	 * memory channels. For SMT-2, the intent is to
+	 * avoid premature sharing of HT resources but
+	 * SMT-4 or SMT-8 *may* benefit from a different
+	 * cutoff. For memory channels, this is a very
+	 * rough estimate of how many channels may be
+	 * active and is based on recent CPUs with
+	 * many cores.
+	 *
+	 * For multiple LLCs, allow an imbalance
+	 * until multiple tasks would share an LLC
+	 * on one node while LLCs on another node
+	 * remain idle. This assumes that there are
+	 * enough logical CPUs per LLC to avoid SMT
+	 * factors and that there is a correlation
+	 * between LLCs and memory channels.
+	 */
+	nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
+	if (nr_llcs == 1)
+		imb = sd_llc->parent->span_weight >> 3;
+	else
+		imb = nr_llcs;
+
+	imb = max(1U, imb);
+	sd_llc->parent->imb_numa_nr = imb;
+
+	/*
+	 * Set span based on the first NUMA domain.
+	 *
+	 * NUMA systems always add a NODE domain before
+	 * iterating the NUMA domains. Since this is before
+	 * degeneration, start from sd_llc's parent's
+	 * parent which is the lowest an SD_NUMA domain can
+	 * be relative to sd_llc.
+	 */
+	parent = sd_llc->parent->parent;
+	while (parent && !(parent->flags & SD_NUMA))
+		parent = parent->parent;
+
+	imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
+
+	/* Update the upper remainder of the topology */
+	parent = sd_llc->parent;
+	while (parent) {
+		int factor = max(1U, (parent->span_weight / imb_span));
+
+		parent->imb_numa_nr = imb * factor;
+		parent = parent->parent;
+	}
+}
+
+/*
  * Build sched domains for a given set of CPUs and attach the sched domains
  * to the individual CPUs
  */
@@ -2606,62 +2674,21 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
 
-	/*
-	 * Calculate an allowed NUMA imbalance such that LLCs do not get
-	 * imbalanced.
-	 */
 	for_each_cpu(i, cpu_map) {
-		unsigned int imb = 0;
-		unsigned int imb_span = 1;
+		sd = *per_cpu_ptr(d.sd, i);
+		if (!sd)
+			continue;
 
-		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
-			struct sched_domain *child = sd->child;
-
-			if (!(sd->flags & SD_SHARE_LLC) && child &&
-			    (child->flags & SD_SHARE_LLC)) {
-				struct sched_domain __rcu *top_p;
-				unsigned int nr_llcs;
-
-				/*
-				 * For a single LLC per node, allow an
-				 * imbalance up to 12.5% of the node. This is
-				 * arbitrary cutoff based two factors -- SMT and
-				 * memory channels. For SMT-2, the intent is to
-				 * avoid premature sharing of HT resources but
-				 * SMT-4 or SMT-8 *may* benefit from a different
-				 * cutoff. For memory channels, this is a very
-				 * rough estimate of how many channels may be
-				 * active and is based on recent CPUs with
-				 * many cores.
-				 *
-				 * For multiple LLCs, allow an imbalance
-				 * until multiple tasks would share an LLC
-				 * on one node while LLCs on another node
-				 * remain idle. This assumes that there are
-				 * enough logical CPUs per LLC to avoid SMT
-				 * factors and that there is a correlation
-				 * between LLCs and memory channels.
-				 */
-				nr_llcs = sd->span_weight / child->span_weight;
-				if (nr_llcs == 1)
-					imb = sd->span_weight >> 3;
-				else
-					imb = nr_llcs;
-				imb = max(1U, imb);
-				sd->imb_numa_nr = imb;
-
-				/* Set span based on the first NUMA domain. */
-				top_p = sd->parent;
-				while (top_p && !(top_p->flags & SD_NUMA)) {
-					top_p = top_p->parent;
-				}
-				imb_span = top_p ? top_p->span_weight : sd->span_weight;
-			} else {
-				int factor = max(1U, (sd->span_weight / imb_span));
+		/* First, find the topmost SD_SHARE_LLC domain */
+		while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+			sd = sd->parent;
 
-				sd->imb_numa_nr = imb * factor;
-			}
-		}
+		/*
+		 * In presence of higher domains, adjust the
+		 * NUMA imbalance stats for the hierarchy.
+		 */
+		if (IS_ENABLED(CONFIG_NUMA) && (sd->flags & SD_SHARE_LLC) && sd->parent)
+			adjust_numa_imbalance(sd);
 	}
 
 	/* Calculate CPU capacity for physical packages and nodes */