Leon reported a topology_span_sane() warning in their guest deployment,
seen starting from v6.16-rc1 [1]. The debugging that followed showed
that tl->mask() for the NODE domain was being incorrectly resolved to
the mask of the highest NUMA domain.
tl->mask() for the NODE domain is set to sd_numa_mask(), which depends
on the global "sched_domains_curr_level" hack. "sched_domains_curr_level"
is set to "tl->numa_level" during the tl traversal in
build_sched_domains() when it calls sd_init(), but it is never reset
before topology_span_sane() runs. Since "sched_domains_curr_level" still
reflected the stale value from the previous build_sched_domains() pass,
tl->mask() for the NODE domain resolved to the mask of the highest NUMA
level, and topology_span_sane() trips when those NUMA spans overlap.
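For reference, this is roughly how the two pieces fit together (abridged
excerpt from kernel/sched/topology.c; exact shape may vary across kernel
versions):

	static int sched_domains_curr_level;

	static const struct cpumask *sd_numa_mask(int cpu)
	{
		return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
	}

	/* ... and sd_init(), called during build_sched_domains(), updates the global: */
	sched_domains_curr_level = tl->numa_level;

Nothing clears the global afterwards, so a later topology_span_sane()
call that goes through sd_numa_mask() for the NODE level sees whatever
level the last sd_init() left behind.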
Instead of replicating the "sched_domains_curr_level" hack, Valentin
suggested using the spans from the sched_domain objects constructed
during build_sched_domains(), which can also catch overlaps when the
domain spans have been fixed up by build_sched_domain().
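The fixup referred to here is the one applied when a child's span is not
a subset of its parent's; an abridged sketch of that path in
build_sched_domain() (modulo version drift):

	if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) {
		pr_err("BUG: arch topology borken\n");
		/* Fixup, ensure @sd has at least @child CPUs. */
		cpumask_or(sched_domain_span(sd),
			   sched_domain_span(sd),
			   sched_domain_span(child));
	}

Validating sched_domain_span(sd) therefore checks the spans the
scheduler will actually use, post-fixup, rather than the raw tl->mask()
output.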
Since build_sched_domain() is skipped for the levels above once a child
domain's span already covers the entire cpumap, use sd->private to skip
the domains that have not been initialized in the CPU's hierarchy.
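The skip in question is the early break in build_sched_domains(),
sketched here in abridged form (details elided):

	for_each_sd_topology(tl) {
		sd = build_sched_domain(tl, cpu_map, attr, sd, i);
		/* ... */
		if (cpumask_equal(cpu_map, sched_domain_span(sd)))
			break;	/* levels above @tl stay unbuilt for this CPU */
	}

The per-CPU sched_domain objects for those unbuilt levels come
zero-initialized from __sdt_alloc(), so sd->private being NULL
identifies a level that never went through sd_init().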
The original warning was reproducible on the following NUMA topology
reported by Leon:
$ sudo numactl -H
available: 5 nodes (0-4)
node 0 cpus: 0 1
node 0 size: 2927 MB
node 0 free: 1603 MB
node 1 cpus: 2 3
node 1 size: 3023 MB
node 1 free: 3008 MB
node 2 cpus: 4 5
node 2 size: 3023 MB
node 2 free: 3007 MB
node 3 cpus: 6 7
node 3 size: 3023 MB
node 3 free: 3002 MB
node 4 cpus: 8 9
node 4 size: 3022 MB
node 4 free: 2718 MB
node distances:
node   0   1   2   3   4
  0:  10  39  38  37  36
  1:  39  10  38  37  36
  2:  38  38  10  37  36
  3:  37  37  37  10  36
  4:  36  36  36  36  10
The above topology can be mimicked using the following QEMU command,
which was used to reproduce the warning and test the fix:
sudo qemu-system-x86_64 -enable-kvm -cpu host \
-m 20G -smp cpus=10,sockets=10 -machine q35 \
-object memory-backend-ram,size=4G,id=m0 \
-object memory-backend-ram,size=4G,id=m1 \
-object memory-backend-ram,size=4G,id=m2 \
-object memory-backend-ram,size=4G,id=m3 \
-object memory-backend-ram,size=4G,id=m4 \
-numa node,cpus=0-1,memdev=m0,nodeid=0 \
-numa node,cpus=2-3,memdev=m1,nodeid=1 \
-numa node,cpus=4-5,memdev=m2,nodeid=2 \
-numa node,cpus=6-7,memdev=m3,nodeid=3 \
-numa node,cpus=8-9,memdev=m4,nodeid=4 \
-numa dist,src=0,dst=1,val=39 \
-numa dist,src=0,dst=2,val=38 \
-numa dist,src=0,dst=3,val=37 \
-numa dist,src=0,dst=4,val=36 \
-numa dist,src=1,dst=0,val=39 \
-numa dist,src=1,dst=2,val=38 \
-numa dist,src=1,dst=3,val=37 \
-numa dist,src=1,dst=4,val=36 \
-numa dist,src=2,dst=0,val=38 \
-numa dist,src=2,dst=1,val=38 \
-numa dist,src=2,dst=3,val=37 \
-numa dist,src=2,dst=4,val=36 \
-numa dist,src=3,dst=0,val=37 \
-numa dist,src=3,dst=1,val=37 \
-numa dist,src=3,dst=2,val=37 \
-numa dist,src=3,dst=4,val=36 \
-numa dist,src=4,dst=0,val=36 \
-numa dist,src=4,dst=1,val=36 \
-numa dist,src=4,dst=2,val=36 \
-numa dist,src=4,dst=3,val=36 \
...
Suggested-by: Valentin Schneider <vschneid@redhat.com>
Reported-by: Leon Romanovsky <leon@kernel.org>
Closes: https://lore.kernel.org/lkml/20250610110701.GA256154@unreal/ [1]
Fixes: ccf74128d66c ("sched/topology: Assert non-NUMA topology masks don't (partially) overlap") # ce29a7da84cd, f55dac1dafb3
Reviewed-by: Steve Wahl <steve.wahl@hpe.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
Changes are based on tip:sched/urgent at commit fc975cfb3639
("sched/deadline: Fix dl_server runtime calculation formula")
Changelog v2..v3:
o Added a check to skip uninitialized sd that can cause dereference of
sdd->sd beyond the percpu boundary (reported by Boris).
Tested on the trivial case using the QEMU cmdline:
sudo qemu-system-x86_64 -enable-kvm -cpu host -m 20G \
-smp cpus=10,sockets=1,threads=10 -machine q35 \
-object memory-backend-ram,size=20G,id=m0 \
-numa node,cpus=0-9,memdev=m0,nodeid=0 \
...
o Collected tags from Steve and Valentin on v2 since the approach is
still the same. Only the check for uninitialized sd is new.
v2: https://lore.kernel.org/lkml/20250630061059.1547-1-kprateek.nayak@amd.com/
---
kernel/sched/topology.c | 25 +++++++++++++++++++------
1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b958fe48e020..e682bf991ce6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2403,6 +2403,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
 	id_seen = sched_domains_tmpmask2;
 
 	for_each_sd_topology(tl) {
+		struct sd_data *sdd = &tl->data;
 
 		/* NUMA levels are allowed to overlap */
 		if (tl->flags & SDTL_OVERLAP)
@@ -2418,22 +2419,34 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
 		 * breaks the linking done for an earlier span.
 		 */
 		for_each_cpu(cpu, cpu_map) {
-			const struct cpumask *tl_cpu_mask = tl->mask(cpu);
+			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+			struct cpumask *sd_span = sched_domain_span(sd);
 			int id;
 
+			/*
+			 * If a child level for a CPU has already covered
+			 * the entire cpumap, build_sched_domain() for the
+			 * domains above is skipped. Use sd->private to detect
+			 * levels that have not been initialized in the CPU's
+			 * hierarchy and skip them.
+			 */
+			if (!sd->private)
+				continue;
+
 			/* lowest bit set in this mask is used as a unique id */
-			id = cpumask_first(tl_cpu_mask);
+			id = cpumask_first(sd_span);
 
 			if (cpumask_test_cpu(id, id_seen)) {
-				/* First CPU has already been seen, ensure identical spans */
-				if (!cpumask_equal(tl->mask(id), tl_cpu_mask))
+				/* First CPU has already been seen, ensure identical sd spans */
+				sd = *per_cpu_ptr(sdd->sd, id);
+				if (!cpumask_equal(sched_domain_span(sd), sd_span))
 					return false;
 			} else {
 				/* First CPU hasn't been seen before, ensure it's a completely new span */
-				if (cpumask_intersects(tl_cpu_mask, covered))
+				if (cpumask_intersects(sd_span, covered))
 					return false;
-				cpumask_or(covered, covered, tl_cpu_mask);
+				cpumask_or(covered, covered, sd_span);
 				cpumask_set_cpu(id, id_seen);
 			}
 		}
 	}
base-commit: fc975cfb36393db1db517fbbe366e550bcdcff14
--
2.34.1
On 07/07/25 10:53, K Prateek Nayak wrote:
> Changelog v2..v3:
>
> o Added a check to skip uninitialized sd that can cause dereference of
>   sdd->sd beyond the percpu boundary (reported by Boris).
>
>   Tested on the trivial case using the QEMU cmdline:
>
>   sudo qemu-system-x86_64 -enable-kvm -cpu host -m 20G \
>   -smp cpus=10,sockets=1,threads=10 -machine q35 \
>   -object memory-backend-ram,size=20G,id=m0 \
>   -numa node,cpus=0-9,memdev=m0,nodeid=0 \
>   ...

Urgh, of course directly using @sdd is not like walking up the sd
hierarchy where we end up getting a NULL sentinel... Sorry for suggesting
that and not thinking about that "small" detail, and thanks for being on
top of it.

> @@ -2418,22 +2419,34 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
[..snip..]
> +			/*
> +			 * If a child level for a CPU has already covered
> +			 * the entire cpumap, build_sched_domain() for the
> +			 * domains above is skipped. Use sd->private to detect
> +			 * levels that have not been initialized in the CPU's
> +			 * hierarchy and skip them.
> +			 */
> +			if (!sd->private)
> +				continue;
> +

So this works, but how about using a cpumask_empty(sd_span) check instead?
It's IMO a bit more future proof than relying on how sd->private is used.
Hello Valentin,

Thank you for taking a look!

On 7/8/2025 5:14 PM, Valentin Schneider wrote:
>> +			if (!sd->private)
>> +				continue;
>> +
>
> So this works, but how about using a cpumask_empty(sd_span) check instead?
> It's IMO a bit more future proof than relying on how sd->private is used.

How about the following instead of cpumask_empty() to avoid two cpumask
operations in the common case where sd is initialized:

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e682bf991ce6..e889ae012c17 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2423,19 +2423,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
 			struct cpumask *sd_span = sched_domain_span(sd);
 			int id;
 
-			/*
-			 * If a child level for a CPU has already covered
-			 * the entire cpumap, build_sched_domain() for the
-			 * domains above is skipped. Use sd->private to detect
-			 * levels that have not been initialized in the CPU's
-			 * hierarchy and skip them.
-			 */
-			if (!sd->private)
-				continue;
-
 			/* lowest bit set in this mask is used as a unique id */
 			id = cpumask_first(sd_span);
 
+			/* Skip if span is empty */
+			if (id >= nr_cpu_ids)
+				continue;
+
 			if (cpumask_test_cpu(id, id_seen)) {
 				/* First CPU has already been seen, ensure identical sd spans */
 				sd = *per_cpu_ptr(sdd->sd, id);
---

--
Thanks and Regards,
Prateek
On 09/07/25 09:31, K Prateek Nayak wrote:
>
> How about the following instead of cpumask_empty() to avoid two cpumask
> operations in the common case where sd is initialized:
>
> [..snip..]
>
> +			/* Skip if span is empty */
> +			if (id >= nr_cpu_ids)
> +				continue;
> +

Oh yeah, even better. Just slap a comment like the below and ship it!

	/*
	 * Span can be empty if that topology level won't be used for this CPU,
	 * i.e. a lower level already fully describes the topology and
	 * build_sched_domain() stopped there.
	 */

> 			if (cpumask_test_cpu(id, id_seen)) {
> 				/* First CPU has already been seen, ensure identical sd spans */
> 				sd = *per_cpu_ptr(sdd->sd, id);
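[Editor's note: the id check doubles as an empty-span test because
cpumask_first() is find_first_bit() over nr_cpu_ids bits, which returns
the bitmap size when no bit is set. A minimal standalone userspace
analogue (hypothetical toy code, not the kernel implementation):]

	#include <stdio.h>

	#define NR_CPU_IDS 10u

	/* toy analogue of cpumask_first(): first set bit, or size if none */
	static unsigned int first_set_bit(unsigned int mask, unsigned int size)
	{
		for (unsigned int i = 0; i < size; i++)
			if (mask & (1u << i))
				return i;
		return size;
	}

	int main(void)
	{
		unsigned int empty = 0, span = 0x3; /* CPUs 0-1 */

		printf("%u\n", first_set_bit(span, NR_CPU_IDS));  /* 0 */
		printf("%u\n", first_set_bit(empty, NR_CPU_IDS)); /* 10 >= nr_cpu_ids */
		return 0;
	}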