Subsequent changes to assign "sd->shared" from "s_data" would
necessitate finding the topmost SD_SHARE_LLC to assign shared object to.
This is very similar to the "imb_numa_nr" computation loop except that
"imb_numa_nr" cares about the first domain without the SD_SHARE_LLC flag
(immediate parent of sd_llc) whereas the "sd->shared" assignment would
require sd_llc itself.
Extract the "imb_numa_nr" calculation into a helper
adjust_numa_imbalance() and use the current loop in the
build_sched_domains() to find the sd_llc.
While at it, guard the call behind CONFIG_NUMA's status since
"imb_numa_nr" only makes sense on NUMA enabled configs with SD_NUMA
domains.
No functional changes intended.
Suggested-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
Changelog v3..v4:
o New patch based on the suggestion from Valentin and Chenyu in
https://lore.kernel.org/lkml/xhsmh343e43fd.mognet@vschneid-thinkpadt14sgen2i.remote.csb/
Notable deviation is moving the entire "imb_numa_nr" loop into the
adjust_numa_imbalance() helper to keep all the bits in one place
instead of passing "imb" and "imb_span" as references to the helper.
o Guarded the call behind CONFIG_NUMA's status to save overhead when
NUMA domains don't exist.
---
kernel/sched/topology.c | 133 ++++++++++++++++++++++++----------------
1 file changed, 80 insertions(+), 53 deletions(-)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 34b20b0e1867..7f25c784c038 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2551,6 +2551,74 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
return true;
}
+/*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+static void adjust_numa_imbalance(struct sched_domain *sd_llc)
+{
+ struct sched_domain *parent;
+ unsigned int imb_span = 1;
+ unsigned int imb = 0;
+ unsigned int nr_llcs;
+
+ WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
+ WARN_ON(!sd_llc->parent);
+
+ /*
+ * For a single LLC per node, allow an
+ * imbalance up to 12.5% of the node. This is
+ * arbitrary cutoff based two factors -- SMT and
+ * memory channels. For SMT-2, the intent is to
+ * avoid premature sharing of HT resources but
+ * SMT-4 or SMT-8 *may* benefit from a different
+ * cutoff. For memory channels, this is a very
+ * rough estimate of how many channels may be
+ * active and is based on recent CPUs with
+ * many cores.
+ *
+ * For multiple LLCs, allow an imbalance
+ * until multiple tasks would share an LLC
+ * on one node while LLCs on another node
+ * remain idle. This assumes that there are
+ * enough logical CPUs per LLC to avoid SMT
+ * factors and that there is a correlation
+ * between LLCs and memory channels.
+ */
+ nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
+ if (nr_llcs == 1)
+ imb = sd_llc->parent->span_weight >> 3;
+ else
+ imb = nr_llcs;
+
+ imb = max(1U, imb);
+ sd_llc->parent->imb_numa_nr = imb;
+
+ /*
+ * Set span based on the first NUMA domain.
+ *
+ * NUMA systems always add a NODE domain before
+ * iterating the NUMA domains. Since this is before
+ * degeneration, start from sd_llc's parent's
+ * parent which is the lowest an SD_NUMA domain can
+ * be relative to sd_llc.
+ */
+ parent = sd_llc->parent->parent;
+ while (parent && !(parent->flags & SD_NUMA))
+ parent = parent->parent;
+
+ imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
+
+ /* Update the upper remainder of the topology */
+ parent = sd_llc->parent;
+ while (parent) {
+ int factor = max(1U, (parent->span_weight / imb_span));
+
+ parent->imb_numa_nr = imb * factor;
+ parent = parent->parent;
+ }
+}
+
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
@@ -2608,62 +2676,21 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
- /*
- * Calculate an allowed NUMA imbalance such that LLCs do not get
- * imbalanced.
- */
for_each_cpu(i, cpu_map) {
- unsigned int imb = 0;
- unsigned int imb_span = 1;
+ sd = *per_cpu_ptr(d.sd, i);
+ if (!sd)
+ continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- struct sched_domain *child = sd->child;
-
- if (!(sd->flags & SD_SHARE_LLC) && child &&
- (child->flags & SD_SHARE_LLC)) {
- struct sched_domain __rcu *top_p;
- unsigned int nr_llcs;
-
- /*
- * For a single LLC per node, allow an
- * imbalance up to 12.5% of the node. This is
- * arbitrary cutoff based two factors -- SMT and
- * memory channels. For SMT-2, the intent is to
- * avoid premature sharing of HT resources but
- * SMT-4 or SMT-8 *may* benefit from a different
- * cutoff. For memory channels, this is a very
- * rough estimate of how many channels may be
- * active and is based on recent CPUs with
- * many cores.
- *
- * For multiple LLCs, allow an imbalance
- * until multiple tasks would share an LLC
- * on one node while LLCs on another node
- * remain idle. This assumes that there are
- * enough logical CPUs per LLC to avoid SMT
- * factors and that there is a correlation
- * between LLCs and memory channels.
- */
- nr_llcs = sd->span_weight / child->span_weight;
- if (nr_llcs == 1)
- imb = sd->span_weight >> 3;
- else
- imb = nr_llcs;
- imb = max(1U, imb);
- sd->imb_numa_nr = imb;
-
- /* Set span based on the first NUMA domain. */
- top_p = sd->parent;
- while (top_p && !(top_p->flags & SD_NUMA)) {
- top_p = top_p->parent;
- }
- imb_span = top_p ? top_p->span_weight : sd->span_weight;
- } else {
- int factor = max(1U, (sd->span_weight / imb_span));
+ /* First, find the topmost SD_SHARE_LLC domain */
+ while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+ sd = sd->parent;
- sd->imb_numa_nr = imb * factor;
- }
- }
+ /*
+ * In presence of higher domains, adjust the
+ * NUMA imbalance stats for the hierarchy.
+ */
+ if (IS_ENABLED(CONFIG_NUMA) && (sd->flags & SD_SHARE_LLC) && sd->parent)
+ adjust_numa_imbalance(sd);
}
/* Calculate CPU capacity for physical packages and nodes */
--
2.34.1
On 12.03.26 05:44, K Prateek Nayak wrote:
[...]
> +/*
> + * Calculate an allowed NUMA imbalance such that LLCs do not get
> + * imbalanced.
> + */
> +static void adjust_numa_imbalance(struct sched_domain *sd_llc)
> +{
> + struct sched_domain *parent;
> + unsigned int imb_span = 1;
> + unsigned int imb = 0;
> + unsigned int nr_llcs;
> +
> + WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
> + WARN_ON(!sd_llc->parent);
> +
> + /*
> + * For a single LLC per node, allow an
> + * imbalance up to 12.5% of the node. This is
> + * arbitrary cutoff based two factors -- SMT and
> + * memory channels. For SMT-2, the intent is to
> + * avoid premature sharing of HT resources but
> + * SMT-4 or SMT-8 *may* benefit from a different
> + * cutoff. For memory channels, this is a very
> + * rough estimate of how many channels may be
> + * active and is based on recent CPUs with
> + * many cores.
> + *
> + * For multiple LLCs, allow an imbalance
> + * until multiple tasks would share an LLC
> + * on one node while LLCs on another node
> + * remain idle. This assumes that there are
> + * enough logical CPUs per LLC to avoid SMT
> + * factors and that there is a correlation
> + * between LLCs and memory channels.
> + */
> + nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
> + if (nr_llcs == 1)
> + imb = sd_llc->parent->span_weight >> 3;
> + else
> + imb = nr_llcs;
> +
> + imb = max(1U, imb);
> + sd_llc->parent->imb_numa_nr = imb;
Here you set imb_numa_nr e.g. for PKG ...
> +
> + /*
> + * Set span based on the first NUMA domain.
> + *
> + * NUMA systems always add a NODE domain before
> + * iterating the NUMA domains. Since this is before
> + * degeneration, start from sd_llc's parent's
> + * parent which is the lowest an SD_NUMA domain can
> + * be relative to sd_llc.
> + */
> + parent = sd_llc->parent->parent;
> + while (parent && !(parent->flags & SD_NUMA))
> + parent = parent->parent;
> +
> + imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
> +
> + /* Update the upper remainder of the topology */
> + parent = sd_llc->parent;
> + while (parent) {
> + int factor = max(1U, (parent->span_weight / imb_span));
> +
> + parent->imb_numa_nr = imb * factor;
... and here again.
Shouldn't we only set it for 'if (parent->flags & SD_NUMA)'?
Not sure if there are cases in which PKG would persist in
... -> MC -> PKG -> NODE -> NUMA -> ... ?
Although access to sd->imb_numa_nr seems to be guarded by sd->flags &
SD_NUMA.
> + parent = parent->parent;
> + }
> +}
> +
[...]
Hello Dietmar,
On 3/16/2026 5:48 AM, Dietmar Eggemann wrote:
>> + /*
>> + * For a single LLC per node, allow an
>> + * imbalance up to 12.5% of the node. This is
>> + * arbitrary cutoff based two factors -- SMT and
>> + * memory channels. For SMT-2, the intent is to
>> + * avoid premature sharing of HT resources but
>> + * SMT-4 or SMT-8 *may* benefit from a different
>> + * cutoff. For memory channels, this is a very
>> + * rough estimate of how many channels may be
>> + * active and is based on recent CPUs with
>> + * many cores.
>> + *
>> + * For multiple LLCs, allow an imbalance
>> + * until multiple tasks would share an LLC
>> + * on one node while LLCs on another node
>> + * remain idle. This assumes that there are
>> + * enough logical CPUs per LLC to avoid SMT
>> + * factors and that there is a correlation
>> + * between LLCs and memory channels.
>> + */
>> + nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
>> + if (nr_llcs == 1)
>> + imb = sd_llc->parent->span_weight >> 3;
>> + else
>> + imb = nr_llcs;
>> +
>> + imb = max(1U, imb);
>> + sd_llc->parent->imb_numa_nr = imb;
>
> Here you set imb_numa_nr e.g. for PKG ...
Ack! That is indeed a redundant assign since it gets reassigned
in the bottom loop. For this commit, we have kept it 1:1 with the
loop that existed before in build_sched_domains().
>
>> +
>> + /*
>> + * Set span based on the first NUMA domain.
>> + *
>> + * NUMA systems always add a NODE domain before
>> + * iterating the NUMA domains. Since this is before
>> + * degeneration, start from sd_llc's parent's
>> + * parent which is the lowest an SD_NUMA domain can
>> + * be relative to sd_llc.
>> + */
>> + parent = sd_llc->parent->parent;
>> + while (parent && !(parent->flags & SD_NUMA))
>> + parent = parent->parent;
>> +
>> + imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
>> +
>> + /* Update the upper remainder of the topology */
>> + parent = sd_llc->parent;
>> + while (parent) {
>> + int factor = max(1U, (parent->span_weight / imb_span));
>> +
>> + parent->imb_numa_nr = imb * factor;
>
> ... and here again.
>
> Shouldn't we only set it for 'if (parent->flags & SD_NUMA)'?
>
> Not sure if there are case in which PKG would persist in
>
> ... -> MC -> PKG -> NODE -> NUMA -> ... ?
>
> Although access to sd->imb_numa_nr seems to be guarded by sd->flags &
> SD_NUMA.
Indeed! "imb_numa_nr" only makes sense when looking at NUMA domains
and having it assigned to 1 for lower domains is harmless
(but wasteful indeed). I'm 99% sure we can simply do:
(Only build tested)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 43150591914b..e9068a809dbc 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2623,9 +2623,6 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
else
imb = nr_llcs;
- imb = max(1U, imb);
- sd_llc->parent->imb_numa_nr = imb;
-
/*
* Set span based on the first NUMA domain.
*
@@ -2639,10 +2636,14 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
while (parent && !(parent->flags & SD_NUMA))
parent = parent->parent;
- imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
+ /* No NUMA domain to adjust imbalance for! */
+ if (!parent)
+ return;
+
+ imb = max(1U, imb);
+ imb_span = parent->span_weight;
/* Update the upper remainder of the topology */
- parent = sd_llc->parent;
while (parent) {
int factor = max(1U, (parent->span_weight / imb_span));
---
If we have NUMA domains, we definitely have NODE and NODE sets neither
SD_SHARE_LLC, nor SD_NUMA so likely sd->parent is PKG / NODE domain and
NUMA has to start at sd->parent->parent and it has to break at the first
SD_NUMA domains.
If it doesn't exist, we don't have any NUMA domains and nothing to worry
about, and if we do, the final loop will adjust the NUMA imbalance.
Thoughts? Again, this commit was kept 1:1 with the previous loop but we
can always improve :-)
>
>> + parent = parent->parent;
>> + }
>> +}
>> +
> [...]
--
Thanks and Regards,
Prateek
Hi Prateek,
On 16.03.26 04:41, K Prateek Nayak wrote:
> Hello Dietmar,
>
> On 3/16/2026 5:48 AM, Dietmar Eggemann wrote:
[...]
> Indeed! "imb_numa_nr" only makes sense when looking at NUMA domains
> and having it assigned to 1 for lower domains is harmless
> (but wasteful indeed). I'm 99% sure we can simply do:
>
> (Only build tested)
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 43150591914b..e9068a809dbc 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2623,9 +2623,6 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
> else
> imb = nr_llcs;
>
> - imb = max(1U, imb);
> - sd_llc->parent->imb_numa_nr = imb;
> -
> /*
> * Set span based on the first NUMA domain.
> *
> @@ -2639,10 +2636,14 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
> while (parent && !(parent->flags & SD_NUMA))
> parent = parent->parent;
>
> - imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
> + /* No NUMA domain to adjust imbalance for! */
> + if (!parent)
> + return;
> +
> + imb = max(1U, imb);
> + imb_span = parent->span_weight;
>
> /* Update the upper remainder of the topology */
> - parent = sd_llc->parent;
> while (parent) {
> int factor = max(1U, (parent->span_weight / imb_span));
>
> ---
>
> If we have NUMA domains, we definitely have NODE and NODE sets neither
> SD_SHARE_LLC, nor SD_NUMA so likely sd->parent is PKG / NODE domain and
> NUMA has to start at sd->parent->parent and it has to break at the first
> SD_NUMA domains.
>
> If it doesn't exist, we don't have any NUMA domains and nothing to worry
> about, and if we do, the final loop will adjust the NUMA imbalance.
>
> Thoughts? Again, this commit was kept 1:1 with the previous loop but we
> can always improve :-)
Ah, I see!
This would work, IMHO.
Tested on qemu-system-aarch64 w/
-smp 8,sockets=2,clusters=2,cores=2,threads=1
Are you aware of a setup in which PKG would survive between MC and
lowest NUMA?
Hello Dietmar,
On 3/16/2026 1:54 PM, Dietmar Eggemann wrote:
> Hi Prateek,
>
> On 16.03.26 04:41, K Prateek Nayak wrote:
>> Hello Dietmar,
>>
>> On 3/16/2026 5:48 AM, Dietmar Eggemann wrote:
>
> [...]
>
>> Indeed! "imb_numa_nr" only makes sense when looking at NUMA domains
>> and having it assigned to 1 for lower domains is harmless
>> (but wasteful indeed). I'm 99% sure we can simply do:
>>
>> (Only build tested)
>>
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index 43150591914b..e9068a809dbc 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -2623,9 +2623,6 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
>> else
>> imb = nr_llcs;
>>
>> - imb = max(1U, imb);
>> - sd_llc->parent->imb_numa_nr = imb;
>> -
>> /*
>> * Set span based on the first NUMA domain.
>> *
>> @@ -2639,10 +2636,14 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
>> while (parent && !(parent->flags & SD_NUMA))
>> parent = parent->parent;
>>
>> - imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
>> + /* No NUMA domain to adjust imbalance for! */
>> + if (!parent)
>> + return;
>> +
>> + imb = max(1U, imb);
>> + imb_span = parent->span_weight;
>>
>> /* Update the upper remainder of the topology */
>> - parent = sd_llc->parent;
>> while (parent) {
>> int factor = max(1U, (parent->span_weight / imb_span));
>>
>> ---
>>
>> If we have NUMA domains, we definitely have NODE and NODE sets neither
>> SD_SHARE_LLC, nor SD_NUMA so likely sd->parent is PKG / NODE domain and
>> NUMA has to start at sd->parent->parent and it has to break at the first
>> SD_NUMA domains.
>>
>> If it doesn't exist, we don't have any NUMA domains and nothing to worry
>> about, and if we do, the final loop will adjust the NUMA imbalance.
>>
>> Thoughts? Again, this commit was kept 1:1 with the previous loop but we
>> can always improve :-)
> Ah, I see!
>
> This would work, IMHO.
>
> Tested on qemu-system-aarch64 w/
>
> -smp 8,sockets=2,clusters=2,cores=2,threads=1
>
> Are you aware of a setup in which PKG would survive between MC and
> lowest NUMA?
On x86, you can have:
-smp 8,sockets=2,dies=2,cores=2,threads=1
and each "die" will appear as an MC within the socket so we get
NUMA { 0-7 }
NODE { 0-3 } { 4-7 }
PKG { 0-3 } { 4-7 }
MC {0,1} {2,3} {4,5} {6,7}
In the above case, NODE is degenerated since it matches with PKG
and MC, PKG, NUMA survive at the end.
--
Thanks and Regards,
Prateek
Hi Prateek,
kernel test robot noticed the following build warnings:
[auto build test WARNING on 54a66e431eeacf23e1dc47cb3507f2d0c068aaf0]
url: https://github.com/intel-lab-lkp/linux/commits/K-Prateek-Nayak/sched-topology-Compute-sd_weight-considering-cpuset-partitions/20260312-125021
base: 54a66e431eeacf23e1dc47cb3507f2d0c068aaf0
patch link: https://lore.kernel.org/r/20260312044434.1974-3-kprateek.nayak%40amd.com
patch subject: [PATCH v4 2/9] sched/topology: Extract "imb_numa_nr" calculation into a separate helper
config: nios2-randconfig-r131-20260312 (https://download.01.org/0day-ci/archive/20260312/202603122149.xyvcIkPY-lkp@intel.com/config)
compiler: nios2-linux-gcc (GCC) 8.5.0
sparse: v0.6.5-rc1
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260312/202603122149.xyvcIkPY-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603122149.xyvcIkPY-lkp@intel.com/
sparse warnings: (new ones prefixed by >>)
kernel/sched/build_utility.c: note: in included file:
kernel/sched/debug.c:730:17: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/debug.c:730:17: sparse: expected struct sched_domain *[assigned] sd
kernel/sched/debug.c:730:17: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/debug.c:1069:9: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct task_struct *tsk @@ got struct task_struct [noderef] __rcu *curr @@
kernel/sched/debug.c:1069:9: sparse: expected struct task_struct *tsk
kernel/sched/debug.c:1069:9: sparse: got struct task_struct [noderef] __rcu *curr
kernel/sched/debug.c:1069:9: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct task_struct *tsk @@ got struct task_struct [noderef] __rcu *curr @@
kernel/sched/debug.c:1069:9: sparse: expected struct task_struct *tsk
kernel/sched/debug.c:1069:9: sparse: got struct task_struct [noderef] __rcu *curr
kernel/sched/build_utility.c: note: in included file:
kernel/sched/stats.c:136:17: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/stats.c:136:17: sparse: expected struct sched_domain *[assigned] sd
kernel/sched/stats.c:136:17: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/build_utility.c: note: in included file:
kernel/sched/topology.c:116:56: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct sched_domain *sd @@ got struct sched_domain [noderef] __rcu *child @@
kernel/sched/topology.c:116:56: sparse: expected struct sched_domain *sd
kernel/sched/topology.c:116:56: sparse: got struct sched_domain [noderef] __rcu *child
kernel/sched/topology.c:135:60: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct sched_domain *sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:135:60: sparse: expected struct sched_domain *sd
kernel/sched/topology.c:135:60: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:158:20: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:158:20: sparse: expected struct sched_domain *sd
kernel/sched/topology.c:158:20: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:469:19: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct perf_domain *pd @@ got struct perf_domain [noderef] __rcu *pd @@
kernel/sched/topology.c:469:19: sparse: expected struct perf_domain *pd
kernel/sched/topology.c:469:19: sparse: got struct perf_domain [noderef] __rcu *pd
kernel/sched/topology.c:644:49: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct sched_domain *parent @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:644:49: sparse: expected struct sched_domain *parent
kernel/sched/topology.c:644:49: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:729:50: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct sched_domain *parent @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:729:50: sparse: expected struct sched_domain *parent
kernel/sched/topology.c:729:50: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:737:55: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain [noderef] __rcu *[noderef] __rcu child @@ got struct sched_domain *[assigned] tmp @@
kernel/sched/topology.c:737:55: sparse: expected struct sched_domain [noderef] __rcu *[noderef] __rcu child
kernel/sched/topology.c:737:55: sparse: got struct sched_domain *[assigned] tmp
kernel/sched/topology.c:750:29: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] tmp @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:750:29: sparse: expected struct sched_domain *[assigned] tmp
kernel/sched/topology.c:750:29: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:755:20: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:755:20: sparse: expected struct sched_domain *sd
kernel/sched/topology.c:755:20: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:776:13: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] tmp @@ got struct sched_domain [noderef] __rcu *sd @@
kernel/sched/topology.c:776:13: sparse: expected struct sched_domain *[assigned] tmp
kernel/sched/topology.c:776:13: sparse: got struct sched_domain [noderef] __rcu *sd
kernel/sched/topology.c:938:70: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct sched_domain *sd @@ got struct sched_domain [noderef] __rcu *child @@
kernel/sched/topology.c:938:70: sparse: expected struct sched_domain *sd
kernel/sched/topology.c:938:70: sparse: got struct sched_domain [noderef] __rcu *child
kernel/sched/topology.c:967:59: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct sched_domain *sd @@ got struct sched_domain [noderef] __rcu *child @@
kernel/sched/topology.c:967:59: sparse: expected struct sched_domain *sd
kernel/sched/topology.c:967:59: sparse: got struct sched_domain [noderef] __rcu *child
kernel/sched/topology.c:1013:57: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct sched_domain *sd @@ got struct sched_domain [noderef] __rcu *child @@
kernel/sched/topology.c:1013:57: sparse: expected struct sched_domain *sd
kernel/sched/topology.c:1013:57: sparse: got struct sched_domain [noderef] __rcu *child
kernel/sched/topology.c:1015:25: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *sibling @@ got struct sched_domain [noderef] __rcu *child @@
kernel/sched/topology.c:1015:25: sparse: expected struct sched_domain *sibling
kernel/sched/topology.c:1015:25: sparse: got struct sched_domain [noderef] __rcu *child
kernel/sched/topology.c:1023:55: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct sched_domain *sd @@ got struct sched_domain [noderef] __rcu *child @@
kernel/sched/topology.c:1023:55: sparse: expected struct sched_domain *sd
kernel/sched/topology.c:1023:55: sparse: got struct sched_domain [noderef] __rcu *child
kernel/sched/topology.c:1025:25: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *sibling @@ got struct sched_domain [noderef] __rcu *child @@
kernel/sched/topology.c:1025:25: sparse: expected struct sched_domain *sibling
kernel/sched/topology.c:1025:25: sparse: got struct sched_domain [noderef] __rcu *child
kernel/sched/topology.c:1095:62: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct sched_domain *sd @@ got struct sched_domain [noderef] __rcu *child @@
kernel/sched/topology.c:1095:62: sparse: expected struct sched_domain *sd
kernel/sched/topology.c:1095:62: sparse: got struct sched_domain [noderef] __rcu *child
kernel/sched/topology.c:1199:40: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct sched_domain *child @@ got struct sched_domain [noderef] __rcu *child @@
kernel/sched/topology.c:1199:40: sparse: expected struct sched_domain *child
kernel/sched/topology.c:1199:40: sparse: got struct sched_domain [noderef] __rcu *child
kernel/sched/topology.c:1337:9: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:1337:9: sparse: expected struct sched_domain *[assigned] sd
kernel/sched/topology.c:1337:9: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:1683:43: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct sched_domain [noderef] __rcu *child @@ got struct sched_domain *child @@
kernel/sched/topology.c:1683:43: sparse: expected struct sched_domain [noderef] __rcu *child
kernel/sched/topology.c:1683:43: sparse: got struct sched_domain *child
kernel/sched/topology.c:2478:31: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain [noderef] __rcu *parent @@ got struct sched_domain *sd @@
kernel/sched/topology.c:2478:31: sparse: expected struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:2478:31: sparse: got struct sched_domain *sd
>> kernel/sched/topology.c:2606:16: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *parent @@ got struct sched_domain [noderef] __rcu *[noderef] __rcu parent @@
kernel/sched/topology.c:2606:16: sparse: expected struct sched_domain *parent
kernel/sched/topology.c:2606:16: sparse: got struct sched_domain [noderef] __rcu *[noderef] __rcu parent
>> kernel/sched/topology.c:2608:24: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *parent @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:2608:24: sparse: expected struct sched_domain *parent
kernel/sched/topology.c:2608:24: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:2613:16: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *parent @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:2613:16: sparse: expected struct sched_domain *parent
kernel/sched/topology.c:2613:16: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:2618:24: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *parent @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:2618:24: sparse: expected struct sched_domain *parent
kernel/sched/topology.c:2618:24: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:2667:57: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:2667:57: sparse: expected struct sched_domain *[assigned] sd
kernel/sched/topology.c:2667:57: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:2686:28: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:2686:28: sparse: expected struct sched_domain *[assigned] sd
kernel/sched/topology.c:2686:28: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/topology.c:2701:57: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/topology.c:2701:57: sparse: expected struct sched_domain *[assigned] sd
kernel/sched/topology.c:2701:57: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/build_utility.c: note: in included file:
kernel/sched/build_utility.c: note: in included file:
kernel/sched/sched.h:2367:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
kernel/sched/sched.h:2367:25: sparse: struct task_struct [noderef] __rcu *
kernel/sched/sched.h:2367:25: sparse: struct task_struct *
vim +2606 kernel/sched/topology.c
2553
2554 /*
2555 * Calculate an allowed NUMA imbalance such that LLCs do not get
2556 * imbalanced.
2557 */
2558 static void adjust_numa_imbalance(struct sched_domain *sd_llc)
2559 {
2560 struct sched_domain *parent;
2561 unsigned int imb_span = 1;
2562 unsigned int imb = 0;
2563 unsigned int nr_llcs;
2564
2565 WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
2566 WARN_ON(!sd_llc->parent);
2567
2568 /*
2569 * For a single LLC per node, allow an
2570 * imbalance up to 12.5% of the node. This is
2571 * arbitrary cutoff based two factors -- SMT and
2572 * memory channels. For SMT-2, the intent is to
2573 * avoid premature sharing of HT resources but
2574 * SMT-4 or SMT-8 *may* benefit from a different
2575 * cutoff. For memory channels, this is a very
2576 * rough estimate of how many channels may be
2577 * active and is based on recent CPUs with
2578 * many cores.
2579 *
2580 * For multiple LLCs, allow an imbalance
2581 * until multiple tasks would share an LLC
2582 * on one node while LLCs on another node
2583 * remain idle. This assumes that there are
2584 * enough logical CPUs per LLC to avoid SMT
2585 * factors and that there is a correlation
2586 * between LLCs and memory channels.
2587 */
2588 nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
2589 if (nr_llcs == 1)
2590 imb = sd_llc->parent->span_weight >> 3;
2591 else
2592 imb = nr_llcs;
2593
2594 imb = max(1U, imb);
2595 sd_llc->parent->imb_numa_nr = imb;
2596
2597 /*
2598 * Set span based on the first NUMA domain.
2599 *
2600 * NUMA systems always add a NODE domain before
2601 * iterating the NUMA domains. Since this is before
2602 * degeneration, start from sd_llc's parent's
2603 * parent which is the lowest an SD_NUMA domain can
2604 * be relative to sd_llc.
2605 */
> 2606 parent = sd_llc->parent->parent;
2607 while (parent && !(parent->flags & SD_NUMA))
> 2608 parent = parent->parent;
2609
2610 imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
2611
2612 /* Update the upper remainder of the topology */
2613 parent = sd_llc->parent;
2614 while (parent) {
2615 int factor = max(1U, (parent->span_weight / imb_span));
2616
2617 parent->imb_numa_nr = imb * factor;
2618 parent = parent->parent;
2619 }
2620 }
2621
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On 3/12/2026 7:07 PM, kernel test robot wrote:
> sparse warnings: (new ones prefixed by >>)
> kernel/sched/build_utility.c: note: in included file:
> kernel/sched/debug.c:730:17: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@
So what is our official stance on sparse in the sched bits? Because I
can make this go away with:
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 963007d83216..7bf1f830067f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2591,7 +2591,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
*/
static void adjust_numa_imbalance(struct sched_domain *sd_llc)
{
- struct sched_domain *parent;
+ struct sched_domain __rcu *parent;
unsigned int imb_span = 1;
unsigned int imb = 0;
unsigned int nr_llcs;
---
But I can make a ton more go away by doing:
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 51c29581f15e..7d1efd981caf 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,8 +72,8 @@ struct sched_domain_shared {
struct sched_domain {
/* These fields must be setup */
- struct sched_domain __rcu *parent; /* top domain must be null terminated */
- struct sched_domain __rcu *child; /* bottom domain must be null terminated */
+ struct sched_domain *parent; /* top domain must be null terminated */
+ struct sched_domain *child; /* bottom domain must be null terminated */
struct sched_group *groups; /* the balancing groups of the domain */
unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */
---
"__rcu" evaluates to "noderef, address_space(__rcu)" but we do end up
dereferencing a bunch of these directly (like sd->parent->parent) but
noderef suggests that is illegal?
One place this probably helps is in spotting cases where a pointer *needs*
to be accessed via rcu_dereference*() but isn't - that is indeed nice
to have but ...
Then it also complains about using rcu_dereference*() on pointers that
aren't __rcu annotated, but perhaps that is solvable (although some of it
isn't very pretty, like "cpumask ** __rcu *sched_domains_numa_masks").
--
Thanks and Regards,
Prateek
On Thu, Mar 12, 2026 at 09:12:50PM +0530, K Prateek Nayak wrote: > On 3/12/2026 7:07 PM, kernel test robot wrote: > > sparse warnings: (new ones prefixed by >>) > > kernel/sched/build_utility.c: note: in included file: > > kernel/sched/debug.c:730:17: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@ > > So what is out official stance on sparse in the sched bits? Because I > can make this go away with: I take patches for correctness :-) I do not take patches that don't affect correctness but make the code unreadable -- there was a submission along those lines recently. I can be convinced to take patches in the middle provided they don't affect readability too much.
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 5a7b576b3ec1acc2694c5b58f80cd1d44a11b2c1
Gitweb: https://git.kernel.org/tip/5a7b576b3ec1acc2694c5b58f80cd1d44a11b2c1
Author: K Prateek Nayak <kprateek.nayak@amd.com>
AuthorDate: Thu, 12 Mar 2026 04:44:27
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 18 Mar 2026 09:06:48 +01:00
sched/topology: Extract "imb_numa_nr" calculation into a separate helper
Subsequent changes to assign "sd->shared" from "s_data" would
necessitate finding the topmost SD_SHARE_LLC to assign shared object to.
This is very similar to the "imb_numa_nr" computation loop except that
"imb_numa_nr" cares about the first domain without the SD_SHARE_LLC flag
(immediate parent of sd_llc) whereas the "sd->shared" assignment would
require sd_llc itself.
Extract the "imb_numa_nr" calculation into a helper
adjust_numa_imbalance() and use the current loop in the
build_sched_domains() to find the sd_llc.
While at it, guard the call behind CONFIG_NUMA's status since
"imb_numa_nr" only makes sense on NUMA enabled configs with SD_NUMA
domains.
No functional changes intended.
Suggested-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-3-kprateek.nayak@amd.com
---
kernel/sched/topology.c | 133 +++++++++++++++++++++++----------------
1 file changed, 80 insertions(+), 53 deletions(-)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79bab80..6303790 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2550,6 +2550,74 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
}
/*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+static void adjust_numa_imbalance(struct sched_domain *sd_llc)
+{
+ struct sched_domain *parent;
+ unsigned int imb_span = 1;
+ unsigned int imb = 0;
+ unsigned int nr_llcs;
+
+ WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
+ WARN_ON(!sd_llc->parent);
+
+ /*
+ * For a single LLC per node, allow an
+ * imbalance up to 12.5% of the node. This is
+ * arbitrary cutoff based two factors -- SMT and
+ * memory channels. For SMT-2, the intent is to
+ * avoid premature sharing of HT resources but
+ * SMT-4 or SMT-8 *may* benefit from a different
+ * cutoff. For memory channels, this is a very
+ * rough estimate of how many channels may be
+ * active and is based on recent CPUs with
+ * many cores.
+ *
+ * For multiple LLCs, allow an imbalance
+ * until multiple tasks would share an LLC
+ * on one node while LLCs on another node
+ * remain idle. This assumes that there are
+ * enough logical CPUs per LLC to avoid SMT
+ * factors and that there is a correlation
+ * between LLCs and memory channels.
+ */
+ nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
+ if (nr_llcs == 1)
+ imb = sd_llc->parent->span_weight >> 3;
+ else
+ imb = nr_llcs;
+
+ imb = max(1U, imb);
+ sd_llc->parent->imb_numa_nr = imb;
+
+ /*
+ * Set span based on the first NUMA domain.
+ *
+ * NUMA systems always add a NODE domain before
+ * iterating the NUMA domains. Since this is before
+ * degeneration, start from sd_llc's parent's
+ * parent which is the lowest an SD_NUMA domain can
+ * be relative to sd_llc.
+ */
+ parent = sd_llc->parent->parent;
+ while (parent && !(parent->flags & SD_NUMA))
+ parent = parent->parent;
+
+ imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
+
+ /* Update the upper remainder of the topology */
+ parent = sd_llc->parent;
+ while (parent) {
+ int factor = max(1U, (parent->span_weight / imb_span));
+
+ parent->imb_numa_nr = imb * factor;
+ parent = parent->parent;
+ }
+}
+
+/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
@@ -2606,62 +2674,21 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
- /*
- * Calculate an allowed NUMA imbalance such that LLCs do not get
- * imbalanced.
- */
for_each_cpu(i, cpu_map) {
- unsigned int imb = 0;
- unsigned int imb_span = 1;
+ sd = *per_cpu_ptr(d.sd, i);
+ if (!sd)
+ continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- struct sched_domain *child = sd->child;
-
- if (!(sd->flags & SD_SHARE_LLC) && child &&
- (child->flags & SD_SHARE_LLC)) {
- struct sched_domain __rcu *top_p;
- unsigned int nr_llcs;
-
- /*
- * For a single LLC per node, allow an
- * imbalance up to 12.5% of the node. This is
- * arbitrary cutoff based two factors -- SMT and
- * memory channels. For SMT-2, the intent is to
- * avoid premature sharing of HT resources but
- * SMT-4 or SMT-8 *may* benefit from a different
- * cutoff. For memory channels, this is a very
- * rough estimate of how many channels may be
- * active and is based on recent CPUs with
- * many cores.
- *
- * For multiple LLCs, allow an imbalance
- * until multiple tasks would share an LLC
- * on one node while LLCs on another node
- * remain idle. This assumes that there are
- * enough logical CPUs per LLC to avoid SMT
- * factors and that there is a correlation
- * between LLCs and memory channels.
- */
- nr_llcs = sd->span_weight / child->span_weight;
- if (nr_llcs == 1)
- imb = sd->span_weight >> 3;
- else
- imb = nr_llcs;
- imb = max(1U, imb);
- sd->imb_numa_nr = imb;
-
- /* Set span based on the first NUMA domain. */
- top_p = sd->parent;
- while (top_p && !(top_p->flags & SD_NUMA)) {
- top_p = top_p->parent;
- }
- imb_span = top_p ? top_p->span_weight : sd->span_weight;
- } else {
- int factor = max(1U, (sd->span_weight / imb_span));
+ /* First, find the topmost SD_SHARE_LLC domain */
+ while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+ sd = sd->parent;
- sd->imb_numa_nr = imb * factor;
- }
- }
+ /*
+ * In presence of higher domains, adjust the
+ * NUMA imbalance stats for the hierarchy.
+ */
+ if (IS_ENABLED(CONFIG_NUMA) && (sd->flags & SD_SHARE_LLC) && sd->parent)
+ adjust_numa_imbalance(sd);
}
/* Calculate CPU capacity for physical packages and nodes */
© 2016 - 2026 Red Hat, Inc.