Use a different approach to topology_span_sane(), that checks for the
same constraint of no partial overlaps for any two CPU sets for
non-NUMA topology levels, but does so in a way that is O(N) rather
than O(N^2).
Instead of comparing with all other masks to detect collisions, keep
one mask that includes all CPUs seen so far and detect collisions with
a single cpumask_intersects test.
If the current mask has no collisions with previously seen masks, it
should be a new mask, which can be uniquely identified by the lowest
bit set in this mask. Keep a pointer to this mask for future
reference (in an array indexed by the lowest bit set), and add the
CPUs in this mask to the list of those seen.
If the current mask does collide with previously seen masks, it should
be exactly equal to a mask seen before, looked up in the same array
indexed by the lowest bit set in the mask, a single comparison.
Move the topology_span_sane() check out of the existing topology level
loop, let it use its own loop so that the array allocation can be done
only once, shared across levels.
On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
the average time to take one processor offline is reduced from 2.18
seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
34m49.765s without this change, 16m10.038s with this change in place.)
Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
---
kernel/sched/topology.c | 79 ++++++++++++++++++++++++++++-------------
1 file changed, 54 insertions(+), 25 deletions(-)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9748a4c8d668..c150074033d3 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2356,36 +2356,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
/*
* Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
- * any two given CPUs at this (non-NUMA) topology level.
+ * any two given CPUs on non-NUMA topology levels.
*/
-static bool topology_span_sane(struct sched_domain_topology_level *tl,
- const struct cpumask *cpu_map, int cpu)
+static bool topology_span_sane(const struct cpumask *cpu_map)
{
- int i = cpu + 1;
+ struct sched_domain_topology_level *tl;
+ const struct cpumask **masks;
+ struct cpumask *covered;
+ int cpu, id;
+ bool ret = false;
- /* NUMA levels are allowed to overlap */
- if (tl->flags & SDTL_OVERLAP)
- return true;
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
+
+ masks = kmalloc_array(num_possible_cpus(), sizeof(struct cpumask *), GFP_KERNEL);
+ if (!masks)
+ return ret;
+
+ for_each_sd_topology(tl) {
+
+ /* NUMA levels are allowed to overlap */
+ if (tl->flags & SDTL_OVERLAP)
+ continue;
+
+ cpumask_clear(covered);
+ memset(masks, 0, num_possible_cpus() * sizeof(struct cpumask *));
- /*
- * Non-NUMA levels cannot partially overlap - they must be either
- * completely equal or completely disjoint. Otherwise we can end up
- * breaking the sched_group lists - i.e. a later get_group() pass
- * breaks the linking done for an earlier span.
- */
- for_each_cpu_from(i, cpu_map) {
/*
- * We should 'and' all those masks with 'cpu_map' to exactly
- * match the topology we're about to build, but that can only
- * remove CPUs, which only lessens our ability to detect
- * overlaps
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
*/
- if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
- cpumask_intersects(tl->mask(cpu), tl->mask(i)))
- return false;
+ for_each_cpu(cpu, cpu_map) {
+ /* lowest bit set in this mask is used as a unique id */
+ id = cpumask_first(tl->mask(cpu));
+
+ /* if this mask doesn't collide with what we've already seen */
+ if (!cpumask_intersects(tl->mask(cpu), covered)) {
+ /* this failing would be an error in this algorithm */
+ if (WARN_ON(masks[id]))
+ goto notsane;
+
+ /* record the mask we saw for this id */
+ masks[id] = tl->mask(cpu);
+ cpumask_or(covered, tl->mask(cpu), covered);
+ } else if ((!masks[id]) || !cpumask_equal(masks[id], tl->mask(cpu))) {
+ /*
+ * a collision with covered should have exactly matched
+ * a previously seen mask with the same id
+ */
+ goto notsane;
+ }
+ }
}
+ ret = true;
- return true;
+ notsane:
+ kfree(masks);
+ return ret;
}
/*
@@ -2417,9 +2446,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
sd = NULL;
for_each_sd_topology(tl) {
- if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
- goto error;
-
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
@@ -2433,6 +2459,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
+ if (WARN_ON(!topology_span_sane(cpu_map)))
+ goto error;
+
/* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
--
2.26.2
On 2024-10-10 21:21, Steve Wahl wrote:
> Use a different approach to topology_span_sane(), that checks for the
> same constraint of no partial overlaps for any two CPU sets for
> non-NUMA topology levels, but does so in a way that is O(N) rather
> than O(N^2).
>
> Instead of comparing with all other masks to detect collisions, keep
> one mask that includes all CPUs seen so far and detect collisions with
> a single cpumask_intersects test.
>
> If the current mask has no collisions with previously seen masks, it
> should be a new mask, which can be uniquely identified by the lowest
> bit set in this mask. Keep a pointer to this mask for future
> reference (in an array indexed by the lowest bit set), and add the
> CPUs in this mask to the list of those seen.
>
> If the current mask does collide with previously seen masks, it should
> be exactly equal to a mask seen before, looked up in the same array
> indexed by the lowest bit set in the mask, a single comparison.
>
> Move the topology_span_sane() check out of the existing topology level
> loop, let it use its own loop so that the array allocation can be done
> only once, shared across levels.
>
> On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
> the average time to take one processor offline is reduced from 2.18
> seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
> 34m49.765s without this change, 16m10.038s with this change in place.)
>
> Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
> ---
> kernel/sched/topology.c | 79 ++++++++++++++++++++++++++++-------------
> 1 file changed, 54 insertions(+), 25 deletions(-)
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9748a4c8d668..c150074033d3 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2356,36 +2356,65 @@ static struct sched_domain
> *build_sched_domain(struct sched_domain_topology_leve
>
> /*
> * Ensure topology masks are sane, i.e. there are no conflicts
> (overlaps) for
> - * any two given CPUs at this (non-NUMA) topology level.
> + * any two given CPUs on non-NUMA topology levels.
> */
> -static bool topology_span_sane(struct sched_domain_topology_level *tl,
> - const struct cpumask *cpu_map, int cpu)
> +static bool topology_span_sane(const struct cpumask *cpu_map)
> {
> - int i = cpu + 1;
> + struct sched_domain_topology_level *tl;
> + const struct cpumask **masks;
> + struct cpumask *covered;
> + int cpu, id;
> + bool ret = false;
>
> - /* NUMA levels are allowed to overlap */
> - if (tl->flags & SDTL_OVERLAP)
> - return true;
> + lockdep_assert_held(&sched_domains_mutex);
> + covered = sched_domains_tmpmask;
> +
> + masks = kmalloc_array(num_possible_cpus(), sizeof(struct cpumask *),
> GFP_KERNEL);
> + if (!masks)
> + return ret;
> +
> + for_each_sd_topology(tl) {
> +
> + /* NUMA levels are allowed to overlap */
> + if (tl->flags & SDTL_OVERLAP)
> + continue;
> +
> + cpumask_clear(covered);
> + memset(masks, 0, num_possible_cpus() * sizeof(struct cpumask *));
>
> - /*
> - * Non-NUMA levels cannot partially overlap - they must be either
> - * completely equal or completely disjoint. Otherwise we can end up
> - * breaking the sched_group lists - i.e. a later get_group() pass
> - * breaks the linking done for an earlier span.
> - */
> - for_each_cpu_from(i, cpu_map) {
> /*
> - * We should 'and' all those masks with 'cpu_map' to exactly
> - * match the topology we're about to build, but that can only
> - * remove CPUs, which only lessens our ability to detect
> - * overlaps
> + * Non-NUMA levels cannot partially overlap - they must be either
> + * completely equal or completely disjoint. Otherwise we can end up
> + * breaking the sched_group lists - i.e. a later get_group() pass
> + * breaks the linking done for an earlier span.
> */
> - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
> - cpumask_intersects(tl->mask(cpu), tl->mask(i)))
> - return false;
> + for_each_cpu(cpu, cpu_map) {
> + /* lowest bit set in this mask is used as a unique id */
> + id = cpumask_first(tl->mask(cpu));
> +
> + /* if this mask doesn't collide with what we've already seen */
> + if (!cpumask_intersects(tl->mask(cpu), covered)) {
> + /* this failing would be an error in this algorithm */
> + if (WARN_ON(masks[id]))
> + goto notsane;
> +
> + /* record the mask we saw for this id */
> + masks[id] = tl->mask(cpu);
> + cpumask_or(covered, tl->mask(cpu), covered);
> + } else if ((!masks[id]) || !cpumask_equal(masks[id],
> tl->mask(cpu))) {
> + /*
> + * a collision with covered should have exactly matched
> + * a previously seen mask with the same id
> + */
> + goto notsane;
> + }
> + }
> }
> + ret = true;
>
> - return true;
> + notsane:
> + kfree(masks);
> + return ret;
> }
>
> /*
> @@ -2417,9 +2446,6 @@ build_sched_domains(const struct cpumask
> *cpu_map, struct sched_domain_attr *att
> sd = NULL;
> for_each_sd_topology(tl) {
>
> - if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
> - goto error;
> -
> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>
> has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
> @@ -2433,6 +2459,9 @@ build_sched_domains(const struct cpumask
> *cpu_map, struct sched_domain_attr *att
> }
> }
>
> + if (WARN_ON(!topology_span_sane(cpu_map)))
> + goto error;
> +
> /* Build the groups for the domains */
> for_each_cpu(i, cpu_map) {
> for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
Hello,
I have verified this patch on PowerPC; below are the results for
"time ppc64_cpu --smt=off/4" mode.
Here is the data from 5 iterations of the "time ppc64_cpu --smt=off/4"
command (min, max, average, and std dev).
——————Without patch——————
————uname -a————
6.12.0-rc5
————lscpu————
lscpu
Architecture: ppc64le
Byte Order: Little Endian
CPU(s): 360
On-line CPU(s) list: 0-359
NUMA:
NUMA node(s): 4
NUMA node0 CPU(s): 0-95
NUMA node1 CPU(s): 96-191
NUMA node2 CPU(s): 192-271
NUMA node3 CPU(s): 272-359
Without Patch:
  Metric     SMT Off (s)   SMT 4 (s)
  Min           68.63         37.64
  Max           74.92         39.39
  Average       70.92         38.48
  Std Dev        2.22          0.63
——————With patch——————
————uname -a————
6.12.0-rc5-dirty
————lscpu————
lscpu
Architecture: ppc64le
Byte Order: Little Endian
CPU(s): 360
On-line CPU(s) list: 0-359
NUMA:
NUMA node(s): 4
NUMA node0 CPU(s): 0-95
NUMA node1 CPU(s): 96-191
NUMA node2 CPU(s): 192-271
NUMA node3 CPU(s): 272-359
With Patch:
  Metric     SMT Off (s)   SMT 4 (s)
  Min           68.748        33.442
  Max           72.954        38.042
  Average       70.309        36.206
  Std Dev        1.41          1.66
From the results there is no significant overall improvement; however,
with the patch applied, the SMT=4 case shows a lower average time
(36.21s vs. 38.48s), albeit with a higher standard deviation
(1.66s vs. 0.63s), compared to the unpatched kernel.
Thanks,
Samir
On Tue, Oct 29, 2024 at 11:04:52PM +0530, samir wrote:
>
> I have verified this patch on PowerPC and below are the results for "time
> ppc64_cpu —smt =off/4" mode,
> Here are the 5 iteration data for “time ppc64_cpu --smt=off/4” command(min,
> max, Average, and Std Dev).
>
> ——————Without patch——————
> ————uname -a————
> 6.12.0-rc5
>
> ————lscpu————
> lscpu
> Architecture: ppc64le
> Byte Order: Little Endian
> CPU(s): 360
> On-line CPU(s) list: 0-359
> NUMA:
> NUMA node(s): 4
> NUMA node0 CPU(s): 0-95
> NUMA node1 CPU(s): 96-191
> NUMA node2 CPU(s): 192-271
> NUMA node3 CPU(s): 272-359
>
> Without Patch:
> Metric SMT Off (s) SMT 4 (s)
> Min 68.63 37.64
> Max 74.92 39.39
> Average 70.92 38.48
> Std Dev 2.22 0.63
>
>
> ——————With patch——————
> ————uname -a————
> 6.12.0-rc5-dirty
>
> ————lscpu————
> lscpu
> Architecture: ppc64le
> Byte Order: Little Endian
> CPU(s): 360
> On-line CPU(s) list: 0-359
> NUMA:
> NUMA node(s): 4
> NUMA node0 CPU(s): 0-95
> NUMA node1 CPU(s): 96-191
> NUMA node2 CPU(s): 192-271
> NUMA node3 CPU(s): 272-359
>
> With Patch:
> Metric SMT Off (s) SMT 4 (s)
> Min 68.748 33.442
> Max 72.954 38.042
> Average 70.309 36.206
> Std Dev 1.41 1.66
>
> From the results it’s seen that there is no significant improvement,
> however, with the patch applied, the SMT=4 state shows a decrease in both
> average time, as reflected in the lower average (36.21s vs. 38.48s) and
> higher standard deviation (1.66s vs. 0.63s) compared to the previous without
> patch apply result.
Samir,
I found your results interesting. So I tried to compare with our
systems, and I get similar results. Around 300 processors, this patch
makes little difference. At higher counts, the topology_span_sane
function change has more influence.
I don't have PPC system access, but I tried to recreate similar
results on our x86_64 systems. I took an 8-socket, 60-core/socket,
2-thread/core system (960 CPUs) and limited it to 20 physical
cores/socket (320 CPUs) for comparison.
I'm using scripts from Intel's System Health Check,
"Set-Half-Of-The-Cores-Offline.sh" and "Set-All-Cores-Online.sh", but
similar results could be obtained with anything that manipulates
/sys/devices/system/cpu/cpu*/online.
I also found that the first offlining attempt after a reboot goes much
faster, so I threw out the first result after reboot and then measured
5 iterations. (The reason for this probably needs exploration, but it
happens for me on both patched and unpatched versions.)
All times in seconds.
With 20 cores / socket (320 CPUs counting hyperthreads):

Without patch:
              Half-Offline    All-Online
  min            21.47           30.76
  max            22.35           31.31
  avg            22.04           31.124
  std.dev.        0.3419795       0.2175545

With patch:
              Half-Offline    All-Online
  min            20.43           28.23
  max            21.93           29.76
  avg            20.786          28.874
  std.dev.        0.6435293       0.6366553
Not a huge difference at this level.
At 60 cores / socket (960 CPUs counting hyperthreads):

Without patch:
              Half-Offline    All-Online
  min           275.34          321.47
  max           288.05          331.89
  avg           282.964         326.884
  std.dev.        5.8835813       4.0268945

With patch:
              Half-Offline    All-Online
  min           208.9           247.17
  max           219.49          251.48
  avg           212.392         249.394
  std.dev.        4.1717586       1.6904526
Here it starts to make a difference, and as the number of CPUs goes
up, it gets worse: tripling the CPU count roughly triples the work of
the O(N) check, but grows the old O(N^2) check about nine-fold.
I should note that I made my measurements with v2 of the patch,
recently posted. Version 2 does remove a memory allocation, which
might have improved things.
Thanks,
--> Steve Wahl
--
Steve Wahl, Hewlett Packard Enterprise
On 10/10/24 21:21, Steve Wahl wrote:
> Use a different approach to topology_span_sane(), that checks for the
> same constraint of no partial overlaps for any two CPU sets for
> non-NUMA topology levels, but does so in a way that is O(N) rather
> than O(N^2).
>
> Instead of comparing with all other masks to detect collisions, keep
> one mask that includes all CPUs seen so far and detect collisions with
> a single cpumask_intersects test.
>
> If the current mask has no collisions with previously seen masks, it
> should be a new mask, which can be uniquely identified by the lowest
> bit set in this mask. Keep a pointer to this mask for future
> reference (in an array indexed by the lowest bit set), and add the
> CPUs in this mask to the list of those seen.
>
> If the current mask does collide with previously seen masks, it should
> be exactly equal to a mask seen before, looked up in the same array
> indexed by the lowest bit set in the mask, a single comparison.
>
> Move the topology_span_sane() check out of the existing topology level
> loop, let it use its own loop so that the array allocation can be done
> only once, shared across levels.
>
> On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
> the average time to take one processor offline is reduced from 2.18
> seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
> 34m49.765s without this change, 16m10.038s with this change in place.)
>
> Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
I was trying to go through this issue and observed the following.
It looks like the computations are repeated in the manner below.
Assume an SMT4 system:

  [ [0 2 4 6]  [1 3 5 7] ]    [ [8 10 12 14]  [9 11 13 15] ]
    <--SMT-->  <--SMT-->        <----SMT--->  <----SMT--->
  <----------PKG--------->    <------------PKG------------->
Let's say it happens for CPU0 at the SMT level; it will then do the
masking in the manner below.
 2: [0 2 4 6] & [0 2 4 6]
 4: [0 2 4 6] & [0 2 4 6]
 6: [0 2 4 6] & [0 2 4 6]
 1: [0 2 4 6] & [1 3 5 7]
 3: [0 2 4 6] & [1 3 5 7]
 5: [0 2 4 6] & [1 3 5 7]
 7: [0 2 4 6] & [1 3 5 7]
 8: [0 2 4 6] & [8 10 12 14]
10: [0 2 4 6] & [8 10 12 14]
12: [0 2 4 6] & [8 10 12 14]
14: [0 2 4 6] & [8 10 12 14]
 9: [0 2 4 6] & [9 11 13 15]
11: [0 2 4 6] & [9 11 13 15]
13: [0 2 4 6] & [9 11 13 15]
15: [0 2 4 6] & [9 11 13 15]
And when it happens for CPU2, it will do the exact same computation.
Maybe that can be avoided with something like the change below: do the
computation only if this is the first CPU in that topology level mask.
Not sure if it works in all scenarios; tested very lightly on a
Power10 system with SMT=4.
Please correct me if I got it all wrong.
------------------------------------------------------------------------
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9748a4c8d668..541631ca32bd 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2367,6 +2367,13 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl,
if (tl->flags & SDTL_OVERLAP)
return true;
+ /* Do the computation only if this CPU is the first CPU in the
+ * topology level mask; the same computation would just be
+ * repeated for the other CPUs in this mask. */
+ if (cpu != cpumask_first(tl->mask(cpu))) {
+ return true;
+ }
+
/*
* Non-NUMA levels cannot partially overlap - they must be either
* completely equal or completely disjoint. Otherwise we can end up
On Thu, Oct 10, 2024 at 10:51:11AM -0500, Steve Wahl wrote:
> Use a different approach to topology_span_sane(), that checks for the
> same constraint of no partial overlaps for any two CPU sets for
> non-NUMA topology levels, but does so in a way that is O(N) rather
> than O(N^2).
>
> Instead of comparing with all other masks to detect collisions, keep
> one mask that includes all CPUs seen so far and detect collisions with
> a single cpumask_intersects test.
>
> If the current mask has no collisions with previously seen masks, it
> should be a new mask, which can be uniquely identified by the lowest
> bit set in this mask. Keep a pointer to this mask for future
> reference (in an array indexed by the lowest bit set), and add the
> CPUs in this mask to the list of those seen.
>
> If the current mask does collide with previously seen masks, it should
> be exactly equal to a mask seen before, looked up in the same array
> indexed by the lowest bit set in the mask, a single comparison.
>
> Move the topology_span_sane() check out of the existing topology level
> loop, let it use its own loop so that the array allocation can be done
> only once, shared across levels.
>
> On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
> the average time to take one processor offline is reduced from 2.18
> seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
> 34m49.765s without this change, 16m10.038s with this change in place.)
>
> Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
> ---
> kernel/sched/topology.c | 79 ++++++++++++++++++++++++++++-------------
> 1 file changed, 54 insertions(+), 25 deletions(-)
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9748a4c8d668..c150074033d3 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2356,36 +2356,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
>
> /*
> * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
> - * any two given CPUs at this (non-NUMA) topology level.
> + * any two given CPUs on non-NUMA topology levels.
> */
> -static bool topology_span_sane(struct sched_domain_topology_level *tl,
> - const struct cpumask *cpu_map, int cpu)
> +static bool topology_span_sane(const struct cpumask *cpu_map)
> {
> - int i = cpu + 1;
> + struct sched_domain_topology_level *tl;
> + const struct cpumask **masks;
> + struct cpumask *covered;
> + int cpu, id;
> + bool ret = false;
>
> - /* NUMA levels are allowed to overlap */
> - if (tl->flags & SDTL_OVERLAP)
> - return true;
> + lockdep_assert_held(&sched_domains_mutex);
> + covered = sched_domains_tmpmask;
> +
> + masks = kmalloc_array(num_possible_cpus(), sizeof(struct cpumask *), GFP_KERNEL);
> + if (!masks)
> + return ret;
> +
> + for_each_sd_topology(tl) {
> +
> + /* NUMA levels are allowed to overlap */
> + if (tl->flags & SDTL_OVERLAP)
> + continue;
> +
> + cpumask_clear(covered);
> + memset(masks, 0, num_possible_cpus() * sizeof(struct cpumask *));
>
> - /*
> - * Non-NUMA levels cannot partially overlap - they must be either
> - * completely equal or completely disjoint. Otherwise we can end up
> - * breaking the sched_group lists - i.e. a later get_group() pass
> - * breaks the linking done for an earlier span.
> - */
> - for_each_cpu_from(i, cpu_map) {
> /*
> - * We should 'and' all those masks with 'cpu_map' to exactly
> - * match the topology we're about to build, but that can only
> - * remove CPUs, which only lessens our ability to detect
> - * overlaps
> + * Non-NUMA levels cannot partially overlap - they must be either
> + * completely equal or completely disjoint. Otherwise we can end up
> + * breaking the sched_group lists - i.e. a later get_group() pass
> + * breaks the linking done for an earlier span.
> */
> - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
> - cpumask_intersects(tl->mask(cpu), tl->mask(i)))
> - return false;
> + for_each_cpu(cpu, cpu_map) {
> + /* lowest bit set in this mask is used as a unique id */
> + id = cpumask_first(tl->mask(cpu));
> +
> + /* if this mask doesn't collide with what we've already seen */
> + if (!cpumask_intersects(tl->mask(cpu), covered)) {
> + /* this failing would be an error in this algorithm */
> + if (WARN_ON(masks[id]))
> + goto notsane;
> +
> + /* record the mask we saw for this id */
> + masks[id] = tl->mask(cpu);
> + cpumask_or(covered, tl->mask(cpu), covered);
> + } else if ((!masks[id]) || !cpumask_equal(masks[id], tl->mask(cpu))) {
> + /*
> + * a collision with covered should have exactly matched
> + * a previously seen mask with the same id
> + */
> + goto notsane;
> + }
> + }
> }
> + ret = true;
>
> - return true;
> + notsane:
> + kfree(masks);
> + return ret;
> }
>
> /*
> @@ -2417,9 +2446,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> sd = NULL;
> for_each_sd_topology(tl) {
>
> - if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
> - goto error;
> -
> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>
> has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
> @@ -2433,6 +2459,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> }
> }
>
> + if (WARN_ON(!topology_span_sane(cpu_map)))
> + goto error;
Hi Steve,
Is there any reason why the above check is done after initializing the
sched domain structs for all the CPUs in the cpu_map?
It looks to me that this check could be performed before the call to
__visit_domain_allocation_hell() in build_sched_domains(), resulting
in an early return if topology_span_sane() detects an incorrect
topology.
Also, the error path in the current code only cleans up the d->rd struct,
keeping all the work done by build_sched_domain() inside the loop and by
__sdt_alloc() called from __visit_domain_allocation_hell().
Is it because we need all that work to remain intact?
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map)
{
switch (what) {
case sa_rootdomain:
if (!atomic_read(&d->rd->refcount))
free_rootdomain(&d->rd->rcu);
fallthrough;
case sa_sd:
free_percpu(d->sd);
fallthrough;
case sa_sd_storage:
__sdt_free(cpu_map);
fallthrough;
case sa_none:
break;
}
}
> +
> /* Build the groups for the domains */
> for_each_cpu(i, cpu_map) {
> for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
> --
> 2.26.2
>
On Fri, Oct 18, 2024 at 05:05:43PM +0530, Vishal Chourasia wrote:
> On Thu, Oct 10, 2024 at 10:51:11AM -0500, Steve Wahl wrote:
> > Use a different approach to topology_span_sane(), that checks for the
> > same constraint of no partial overlaps for any two CPU sets for
> > non-NUMA topology levels, but does so in a way that is O(N) rather
> > than O(N^2).
> >
> > Instead of comparing with all other masks to detect collisions, keep
> > one mask that includes all CPUs seen so far and detect collisions with
> > a single cpumask_intersects test.
> >
> > If the current mask has no collisions with previously seen masks, it
> > should be a new mask, which can be uniquely identified by the lowest
> > bit set in this mask. Keep a pointer to this mask for future
> > reference (in an array indexed by the lowest bit set), and add the
> > CPUs in this mask to the list of those seen.
> >
> > If the current mask does collide with previously seen masks, it should
> > be exactly equal to a mask seen before, looked up in the same array
> > indexed by the lowest bit set in the mask, a single comparison.
> >
> > Move the topology_span_sane() check out of the existing topology level
> > loop, let it use its own loop so that the array allocation can be done
> > only once, shared across levels.
> >
> > On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
> > the average time to take one processor offline is reduced from 2.18
> > seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
> > 34m49.765s without this change, 16m10.038s with this change in place.)
> >
> > Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
> > ---
> > kernel/sched/topology.c | 79 ++++++++++++++++++++++++++++-------------
> > 1 file changed, 54 insertions(+), 25 deletions(-)
> >
> > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > index 9748a4c8d668..c150074033d3 100644
> > --- a/kernel/sched/topology.c
> > +++ b/kernel/sched/topology.c
> > @@ -2356,36 +2356,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
> >
> > /*
> > * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
> > - * any two given CPUs at this (non-NUMA) topology level.
> > + * any two given CPUs on non-NUMA topology levels.
> > */
> > -static bool topology_span_sane(struct sched_domain_topology_level *tl,
> > - const struct cpumask *cpu_map, int cpu)
> > +static bool topology_span_sane(const struct cpumask *cpu_map)
> > {
> > - int i = cpu + 1;
> > + struct sched_domain_topology_level *tl;
> > + const struct cpumask **masks;
> > + struct cpumask *covered;
> > + int cpu, id;
> > + bool ret = false;
> >
> > - /* NUMA levels are allowed to overlap */
> > - if (tl->flags & SDTL_OVERLAP)
> > - return true;
> > + lockdep_assert_held(&sched_domains_mutex);
> > + covered = sched_domains_tmpmask;
> > +
> > + masks = kmalloc_array(num_possible_cpus(), sizeof(struct cpumask *), GFP_KERNEL);
> > + if (!masks)
> > + return ret;
> > +
> > + for_each_sd_topology(tl) {
> > +
> > + /* NUMA levels are allowed to overlap */
> > + if (tl->flags & SDTL_OVERLAP)
> > + continue;
> > +
> > + cpumask_clear(covered);
> > + memset(masks, 0, num_possible_cpus() * sizeof(struct cpumask *));
> >
> > - /*
> > - * Non-NUMA levels cannot partially overlap - they must be either
> > - * completely equal or completely disjoint. Otherwise we can end up
> > - * breaking the sched_group lists - i.e. a later get_group() pass
> > - * breaks the linking done for an earlier span.
> > - */
> > - for_each_cpu_from(i, cpu_map) {
> > /*
> > - * We should 'and' all those masks with 'cpu_map' to exactly
> > - * match the topology we're about to build, but that can only
> > - * remove CPUs, which only lessens our ability to detect
> > - * overlaps
> > + * Non-NUMA levels cannot partially overlap - they must be either
> > + * completely equal or completely disjoint. Otherwise we can end up
> > + * breaking the sched_group lists - i.e. a later get_group() pass
> > + * breaks the linking done for an earlier span.
> > */
> > - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
> > - cpumask_intersects(tl->mask(cpu), tl->mask(i)))
> > - return false;
> > + for_each_cpu(cpu, cpu_map) {
> > + /* lowest bit set in this mask is used as a unique id */
> > + id = cpumask_first(tl->mask(cpu));
> > +
> > + /* if this mask doesn't collide with what we've already seen */
> > + if (!cpumask_intersects(tl->mask(cpu), covered)) {
> > + /* this failing would be an error in this algorithm */
> > + if (WARN_ON(masks[id]))
> > + goto notsane;
> > +
> > + /* record the mask we saw for this id */
> > + masks[id] = tl->mask(cpu);
> > + cpumask_or(covered, tl->mask(cpu), covered);
> > + } else if ((!masks[id]) || !cpumask_equal(masks[id], tl->mask(cpu))) {
> > + /*
> > + * a collision with covered should have exactly matched
> > + * a previously seen mask with the same id
> > + */
> > + goto notsane;
> > + }
> > + }
> > }
> > + ret = true;
> >
> > - return true;
> > + notsane:
> > + kfree(masks);
> > + return ret;
> > }
> >
> > /*
> > @@ -2417,9 +2446,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > sd = NULL;
> > for_each_sd_topology(tl) {
> >
> > - if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
> > - goto error;
> > -
> > sd = build_sched_domain(tl, cpu_map, attr, sd, i);
> >
> > has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
> > @@ -2433,6 +2459,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > }
> > }
> >
> > + if (WARN_ON(!topology_span_sane(cpu_map)))
> > + goto error;
> Hi Steve,
Vishal, thank you for taking the time to review.
> Is there any reason why above check is done after initializing
> sched domain struct for all the CPUs in the cpu_map?
The original check was done in the same for_each_sd_topology(tl) loop
that calls build_sched_domain(). I had trouble 100% convincing myself
that calls to build_sched_domain() on the previous levels couldn't
affect calls to tl->mask() in later levels, so I placed the new check
after all calls to build_sched_domain were complete.
> It looks to me, that this check can be performed before the call to
> __visit_domain_allocation_hell() in the build_sched_domains()
> resulting in early return if topology_span_sane() detects incorrect
> topology.
This might be OK to do. I would greatly appreciate somebody well
versed in this code area telling me for certain that it would work.
> Also, the error path in the current code only cleans up d->rd struct, keeping
> all the work done by build_sched_domain() inside the loop and __alloc_sdt()
> called from __visit_domain_allocation_hell()
>
> is it because we need all that work to remain intact?
I'm not seeing this. The return from __visit_domain_allocation_hell()
is stored in alloc_state and immediately checked to be == sa_rootdomain;
if not, the error path is taken, deallocating everything and
returning.
The rest of the function does not touch alloc_state, so any error from
that point on makes the call to __free_domain_allocs with what ==
sa_rootdomain, which seems to undo everything.
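Roughly, my reading of the relevant flow in build_sched_domains(),
heavily abridged:

	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
	if (alloc_state != sa_rootdomain)
		goto error;	/* frees only what got allocated so far */

	/* ... nothing below this point changes alloc_state ... */

error:
	__free_domain_allocs(&d, alloc_state, cpu_map);
	return ret;

so on any later error, alloc_state is still sa_rootdomain and the
fallthrough chain frees everything.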
Are you possibly missing the fallthroughs in __free_domain_allocs()
even though they're clearly emphasized?
> static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
> const struct cpumask *cpu_map)
> {
> switch (what) {
> case sa_rootdomain:
> if (!atomic_read(&d->rd->refcount))
> free_rootdomain(&d->rd->rcu);
> fallthrough;
> case sa_sd:
> free_percpu(d->sd);
> fallthrough;
> case sa_sd_storage:
> __sdt_free(cpu_map);
> fallthrough;
> case sa_none:
> break;
> }
> }
>
Thanks,
--> Steve Wahl
--
Steve Wahl, Hewlett Packard Enterprise
On Mon, Oct 21, 2024 at 11:20:58AM -0500, Steve Wahl wrote:
> On Fri, Oct 18, 2024 at 05:05:43PM +0530, Vishal Chourasia wrote:
> > On Thu, Oct 10, 2024 at 10:51:11AM -0500, Steve Wahl wrote:
> > @@ -2417,9 +2446,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > > sd = NULL;
> > > for_each_sd_topology(tl) {
> > >
> > > - if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
> > > - goto error;
> > > -
> > > sd = build_sched_domain(tl, cpu_map, attr, sd, i);
> > >
> > > has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
> > > @@ -2433,6 +2459,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > > }
> > > }
> > >
> > > + if (WARN_ON(!topology_span_sane(cpu_map)))
> > > + goto error;
> > Hi Steve,
>
> Vishal, thank you for taking the time to review.
>
> > Is there any reason why above check is done after initializing
> > sched domain struct for all the CPUs in the cpu_map?
>
> The original check was done in the same for_each_sd_topology(tl) loop
> that calls build_sched_domain(). I had trouble 100% convincing myself
> that calls to build_sched_domain() on the previous levels couldn't
> affect calls to tl->mask() in later levels, so I placed the new check
> after all calls to build_sched_domain were complete.
>
Yeah, I don't see build_sched_domain() modifying the cpumask
returned from tl->mask(cpu)
> > It looks to me, that this check can be performed before the call to
> > __visit_domain_allocation_hell() in the build_sched_domains()
> > resulting in early return if topology_span_sane() detects incorrect
> > topology.
>
> This might be OK to do. I would greatly appreciate somebody well
> versed in this code area telling me for certain that it would work.
>
Same.
> > Also, the error path in the current code only cleans up d->rd struct, keeping
> > all the work done by build_sched_domain() inside the loop and __alloc_sdt()
> > called from __visit_domain_allocation_hell()
> >
> > is it because we need all that work to remain intact?
>
> I'm not seeing this. The return from __visit_domain_allocation_hell()
> is stored in alloc_state immediately checked to be == sa_rootdomain;
> if not, the error path is taken, deallocating everything and
> returning.
>
> The rest of the function does not touch alloc_state, so any error from
> that point on makes the call to __free_domain_allocs with what ==
> sa_rootdomain, which seems to undo everything.
>
> Are you possibly missing the fallthroughs in __free_domain_allocs()
> even though they're clearly emphasized?
>
Yes, you are right. Thank you for pointing that out.
> > static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
> > const struct cpumask *cpu_map)
> > {
> > switch (what) {
> > case sa_rootdomain:
> > if (!atomic_read(&d->rd->refcount))
> > free_rootdomain(&d->rd->rcu);
> > fallthrough;
> > case sa_sd:
> > free_percpu(d->sd);
> > fallthrough;
> > case sa_sd_storage:
> > __sdt_free(cpu_map);
> > fallthrough;
> > case sa_none:
> > break;
> > }
> > }
> >
>
> Thanks,
>
> --> Steve Wahl
>
> --
> Steve Wahl, Hewlett Packard Enterprise
On 10/10/24 10:51, Steve Wahl wrote:
> Use a different approach to topology_span_sane(), that checks for the
> same constraint of no partial overlaps for any two CPU sets for
> non-NUMA topology levels, but does so in a way that is O(N) rather
> than O(N^2).
>
> Instead of comparing with all other masks to detect collisions, keep
> one mask that includes all CPUs seen so far and detect collisions with
> a single cpumask_intersects test.
>
> If the current mask has no collisions with previously seen masks, it
> should be a new mask, which can be uniquely identified by the lowest
> bit set in this mask. Keep a pointer to this mask for future
> reference (in an array indexed by the lowest bit set), and add the
> CPUs in this mask to the list of those seen.
>
> If the current mask does collide with previously seen masks, it should
> be exactly equal to a mask seen before, looked up in the same array
> indexed by the lowest bit set in the mask, a single comparison.
>
> Move the topology_span_sane() check out of the existing topology level
> loop, let it use its own loop so that the array allocation can be done
> only once, shared across levels.
>
> On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
> the average time to take one processor offline is reduced from 2.18
> seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
> 34m49.765s without this change, 16m10.038s with this change in place.)
>
This isn't the first complaint about topology_span_sane() vs big
systems. It might be worth disabling the check once it has scanned all
CPUs once - not necessarily at init, since some folks have their systems
boot with only a subset of the available CPUs and online them later on.
I'd have to think more about how this behaves vs the dynamic NUMA topology
code we got as of
0fb3978b0aac ("sched/numa: Fix NUMA topology for systems with CPU-less nodes")
(i.e. is scanning all possible CPUs enough to guarantee no overlaps when
having only a subset of online CPUs? I think so...)
but maybe something like so?
---
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9748a4c8d6685..bf95c3d4f6072 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2361,12 +2361,25 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
static bool topology_span_sane(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, int cpu)
{
+ static bool validated;
int i = cpu + 1;
+ if (validated)
+ return true;
+
/* NUMA levels are allowed to overlap */
if (tl->flags & SDTL_OVERLAP)
return true;
+ /*
+ * We're visiting all CPUs available in the system, no need to re-check
+ * things after that. Even if we end up finding overlaps here, we'll
+ * have issued a warning and can skip the per-CPU scan in later
+ * calls to this function.
+ */
+ if (cpumask_equal(cpu_map, cpu_possible_mask))
+ validated = true;
+
/*
* Non-NUMA levels cannot partially overlap - they must be either
* completely equal or completely disjoint. Otherwise we can end up
On Tue, Oct 15, 2024 at 04:37:35PM +0200, Valentin Schneider wrote:
> On 10/10/24 10:51, Steve Wahl wrote:
> > Use a different approach to topology_span_sane(), that checks for the
> > same constraint of no partial overlaps for any two CPU sets for
> > non-NUMA topology levels, but does so in a way that is O(N) rather
> > than O(N^2).
> >
> > Instead of comparing with all other masks to detect collisions, keep
> > one mask that includes all CPUs seen so far and detect collisions with
> > a single cpumask_intersects test.
> >
> > If the current mask has no collisions with previously seen masks, it
> > should be a new mask, which can be uniquely identified by the lowest
> > bit set in this mask. Keep a pointer to this mask for future
> > reference (in an array indexed by the lowest bit set), and add the
> > CPUs in this mask to the list of those seen.
> >
> > If the current mask does collide with previously seen masks, it should
> > be exactly equal to a mask seen before, looked up in the same array
> > indexed by the lowest bit set in the mask, a single comparison.
> >
> > Move the topology_span_sane() check out of the existing topology level
> > loop, let it use its own loop so that the array allocation can be done
> > only once, shared across levels.
> >
> > On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
> > the average time to take one processor offline is reduced from 2.18
> > seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
> > 34m49.765s without this change, 16m10.038s with this change in place.)
> >
>
> This isn't the first complaint about topology_span_sane() vs big
> systems. It might be worth to disable the check once it has scanned all
> CPUs once - not necessarily at init, since some folks have their systems
> boot with only a subset of the available CPUs and online them later on.
>
> I'd have to think more about how this behaves vs the dynamic NUMA topology
> code we got as of
>
> 0fb3978b0aac ("sched/numa: Fix NUMA topology for systems with CPU-less nodes")
>
> (i.e. is scanning all possible CPUs enough to guarantee no overlaps when
> having only a subset of online CPUs? I think so...)
>
> but maybe something like so?
> ---
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9748a4c8d6685..bf95c3d4f6072 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2361,12 +2361,25 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
> static bool topology_span_sane(struct sched_domain_topology_level *tl,
> const struct cpumask *cpu_map, int cpu)
> {
> + static bool validated;
> int i = cpu + 1;
>
> + if (validated)
> + return true;
> +
> /* NUMA levels are allowed to overlap */
> if (tl->flags & SDTL_OVERLAP)
> return true;
>
> + /*
> + * We're visiting all CPUs available in the system, no need to re-check
> + * things after that. Even if we end up finding overlaps here, we'll
> + * have issued a warning and can skip the per-CPU scan in later
> + * calls to this function.
> + */
> + if (cpumask_equal(cpu_map, cpu_possible_mask))
> + validated = true;
> +
> /*
> * Non-NUMA levels cannot partially overlap - they must be either
> * completely equal or completely disjoint. Otherwise we can end up
I tried adding this; surprisingly, I saw no effect on the time taken,
perhaps even a small slowdown, when combined with my patch. So at
this point I don't intend to add it to v2 of the patch.
--> Steve
--
Steve Wahl, Hewlett Packard Enterprise
On 25/10/24 10:06, Steve Wahl wrote:
> On Tue, Oct 15, 2024 at 04:37:35PM +0200, Valentin Schneider wrote:
>> On 10/10/24 10:51, Steve Wahl wrote:
>> > Use a different approach to topology_span_sane(), that checks for the
>> > same constraint of no partial overlaps for any two CPU sets for
>> > non-NUMA topology levels, but does so in a way that is O(N) rather
>> > than O(N^2).
>> >
>> > Instead of comparing with all other masks to detect collisions, keep
>> > one mask that includes all CPUs seen so far and detect collisions with
>> > a single cpumask_intersects test.
>> >
>> > If the current mask has no collisions with previously seen masks, it
>> > should be a new mask, which can be uniquely identified by the lowest
>> > bit set in this mask. Keep a pointer to this mask for future
>> > reference (in an array indexed by the lowest bit set), and add the
>> > CPUs in this mask to the list of those seen.
>> >
>> > If the current mask does collide with previously seen masks, it should
>> > be exactly equal to a mask seen before, looked up in the same array
>> > indexed by the lowest bit set in the mask, a single comparison.
>> >
>> > Move the topology_span_sane() check out of the existing topology level
>> > loop, let it use its own loop so that the array allocation can be done
>> > only once, shared across levels.
>> >
>> > On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
>> > the average time to take one processor offline is reduced from 2.18
>> > seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
>> > 34m49.765s without this change, 16m10.038s with this change in place.)
>> >
>>
>> This isn't the first complaint about topology_span_sane() vs big
>> systems. It might be worth to disable the check once it has scanned all
>> CPUs once - not necessarily at init, since some folks have their systems
>> boot with only a subset of the available CPUs and online them later on.
>>
>> I'd have to think more about how this behaves vs the dynamic NUMA topology
>> code we got as of
>>
>> 0fb3978b0aac ("sched/numa: Fix NUMA topology for systems with CPU-less nodes")
>>
>> (i.e. is scanning all possible CPUs enough to guarantee no overlaps when
>> having only a subset of online CPUs? I think so...)
>>
>> but maybe something like so?
>> ---
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index 9748a4c8d6685..bf95c3d4f6072 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -2361,12 +2361,25 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
>> static bool topology_span_sane(struct sched_domain_topology_level *tl,
>> const struct cpumask *cpu_map, int cpu)
>> {
>> + static bool validated;
>> int i = cpu + 1;
>>
>> + if (validated)
>> + return true;
>> +
>> /* NUMA levels are allowed to overlap */
>> if (tl->flags & SDTL_OVERLAP)
>> return true;
>>
>> + /*
>> + * We're visiting all CPUs available in the system, no need to re-check
>> + * things after that. Even if we end up finding overlaps here, we'll
>> + * have issued a warning and can skip the per-CPU scan in later
>> + * calls to this function.
>> + */
>> + if (cpumask_equal(cpu_map, cpu_possible_mask))
>> + validated = true;
>> +
>> /*
>> * Non-NUMA levels cannot partially overlap - they must be either
>> * completely equal or completely disjoint. Otherwise we can end up
>
> I tried adding this, surprisingly I saw no effect on the time taken,
> perhaps even a small slowdown, when combined with my patch. So at
> this point I don't intend to add it to v2 of the patch.
>
Thanks for testing. I assume your cpu_possible_mask reports more CPUs than
you have physically plugged in... I guess it would make sense to
short-circuit the function when cpu_map is a subset of what we've
previously checked, and then re-kick the testing once new CPU(s) are
plugged in. Something like the untested below?
Optimisations notwithstanding, IMO we shouldn't be repeating checks if we
can avoid it.
---
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9748a4c8d6685..87ba730c34800 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2354,6 +2354,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
return sd;
}
+static cpumask_var_t topology_sane_cpus;
+
/*
* Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
* any two given CPUs at this (non-NUMA) topology level.
@@ -2367,6 +2369,11 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl,
if (tl->flags & SDTL_OVERLAP)
return true;
+ if (cpumask_subset(cpu_map, topology_sane_cpus))
+ return true;
+
+ cpumask_or(topology_sane_cpus, cpu_map, topology_sane_cpus);
+
/*
* Non-NUMA levels cannot partially overlap - they must be either
* completely equal or completely disjoint. Otherwise we can end up
@@ -2607,6 +2614,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+ zalloc_cpumask_var(&topology_sane_cpus, GFP_KERNEL);
arch_update_cpu_topology();
asym_cpu_capacity_scan();
On 15/10/24 16:37, Valentin Schneider wrote:
> On 10/10/24 10:51, Steve Wahl wrote:
>> Use a different approach to topology_span_sane(), that checks for the
>> same constraint of no partial overlaps for any two CPU sets for
>> non-NUMA topology levels, but does so in a way that is O(N) rather
>> than O(N^2).
>>
>> Instead of comparing with all other masks to detect collisions, keep
>> one mask that includes all CPUs seen so far and detect collisions with
>> a single cpumask_intersects test.
>>
>> If the current mask has no collisions with previously seen masks, it
>> should be a new mask, which can be uniquely identified by the lowest
>> bit set in this mask. Keep a pointer to this mask for future
>> reference (in an array indexed by the lowest bit set), and add the
>> CPUs in this mask to the list of those seen.
>>
>> If the current mask does collide with previously seen masks, it should
>> be exactly equal to a mask seen before, looked up in the same array
>> indexed by the lowest bit set in the mask, a single comparison.
>>
>> Move the topology_span_sane() check out of the existing topology level
>> loop, let it use its own loop so that the array allocation can be done
>> only once, shared across levels.
>>
>> On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
>> the average time to take one processor offline is reduced from 2.18
>> seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
>> 34m49.765s without this change, 16m10.038s with this change in place.)
>>
>
> This isn't the first complaint about topology_span_sane() vs big
> systems. It might be worth to disable the check once it has scanned all
> CPUs once - not necessarily at init, since some folks have their systems
> boot with only a subset of the available CPUs and online them later on.
>
> I'd have to think more about how this behaves vs the dynamic NUMA topology
> code we got as of
>
> 0fb3978b0aac ("sched/numa: Fix NUMA topology for systems with CPU-less nodes")
>
> (i.e. is scanning all possible CPUs enough to guarantee no overlaps when
> having only a subset of online CPUs? I think so...)
>
> but maybe something like so?
I'd also be tempted to shove this under SCHED_DEBUG + sched_verbose, like
the sched_domain debug fluff. Most distros ship with SCHED_DEBUG anyway, so
if there is suspicion of topology mask fail, they can slap the extra
cmdline argument and have it checked.
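Concretely, I'm thinking of something like a gate at the top of
topology_span_sane() (untested sketch; sched_debug_verbose is what the
sched_verbose cmdline flips under CONFIG_SCHED_DEBUG):

	/* Only run the (expensive) span check when sched_verbose is set */
	if (!sched_debug_verbose)
		return true;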
On Wed, Oct 16, 2024 at 10:10:11AM +0200, Valentin Schneider wrote:
> On 15/10/24 16:37, Valentin Schneider wrote:
> > On 10/10/24 10:51, Steve Wahl wrote:
> >> Use a different approach to topology_span_sane(), that checks for the
> >> same constraint of no partial overlaps for any two CPU sets for
> >> non-NUMA topology levels, but does so in a way that is O(N) rather
> >> than O(N^2).
> >>
> >> Instead of comparing with all other masks to detect collisions, keep
> >> one mask that includes all CPUs seen so far and detect collisions with
> >> a single cpumask_intersects test.
> >>
> >> If the current mask has no collisions with previously seen masks, it
> >> should be a new mask, which can be uniquely identified by the lowest
> >> bit set in this mask. Keep a pointer to this mask for future
> >> reference (in an array indexed by the lowest bit set), and add the
> >> CPUs in this mask to the list of those seen.
> >>
> >> If the current mask does collide with previously seen masks, it should
> >> be exactly equal to a mask seen before, looked up in the same array
> >> indexed by the lowest bit set in the mask, a single comparison.
> >>
> >> Move the topology_span_sane() check out of the existing topology level
> >> loop, let it use its own loop so that the array allocation can be done
> >> only once, shared across levels.
> >>
> >> On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
> >> the average time to take one processor offline is reduced from 2.18
> >> seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
> >> 34m49.765s without this change, 16m10.038s with this change in place.)
> >>
> >
> > This isn't the first complaint about topology_span_sane() vs big
> > systems. It might be worth to disable the check once it has scanned all
> > CPUs once - not necessarily at init, since some folks have their systems
> > boot with only a subset of the available CPUs and online them later on.
> >
> > I'd have to think more about how this behaves vs the dynamic NUMA topology
> > code we got as of
> >
> > 0fb3978b0aac ("sched/numa: Fix NUMA topology for systems with CPU-less nodes")
> >
> > (i.e. is scanning all possible CPUs enough to guarantee no overlaps when
> > having only a subset of online CPUs? I think so...)
> >
> > but maybe something like so?
>
>
> I'd also be tempted to shove this under SCHED_DEBUG + sched_verbose, like
> the sched_domain debug fluff. Most distros ship with SCHED_DEBUG anyway, so
> if there is suspicion of topology mask fail, they can slap the extra
> cmdline argument and have it checked.
I understand the idea, but as I read closer, things under SCHED_DEBUG
and sched_verbose currently don't change any actions; they just add
additional information output, plus some checks that may or may not
print things but do not otherwise change what gets executed.
However, topology_span_sane failing currently causes
build_sched_domains to abort and return an error. I don't think I
should change the action / skip the test if SCHED_DEBUG isn't on,
because I believe the debug version should always take the same paths
as non-debug.
So there's not much that could be removed when SCHED_DEBUG isn't on or
sched_verbose isn't set.
--> Steve
--
Steve Wahl, Hewlett Packard Enterprise
Hello Steve,
On 10/10/2024 9:21 PM, Steve Wahl wrote:
> Use a different approach to topology_span_sane(), that checks for the
> same constraint of no partial overlaps for any two CPU sets for
> non-NUMA topology levels, but does so in a way that is O(N) rather
> than O(N^2).
>
> Instead of comparing with all other masks to detect collisions, keep
> one mask that includes all CPUs seen so far and detect collisions with
> a single cpumask_intersects test.
>
> If the current mask has no collisions with previously seen masks, it
> should be a new mask, which can be uniquely identified by the lowest
> bit set in this mask. Keep a pointer to this mask for future
> reference (in an array indexed by the lowest bit set), and add the
> CPUs in this mask to the list of those seen.
>
> If the current mask does collide with previously seen masks, it should
> be exactly equal to a mask seen before, looked up in the same array
> indexed by the lowest bit set in the mask, a single comparison.
>
> Move the topology_span_sane() check out of the existing topology level
> loop, let it use its own loop so that the array allocation can be done
> only once, shared across levels.
>
> On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
> the average time to take one processor offline is reduced from 2.18
> seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
> 34m49.765s without this change, 16m10.038s with this change in place.)
>
> Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
> ---
> kernel/sched/topology.c | 79 ++++++++++++++++++++++++++++-------------
> 1 file changed, 54 insertions(+), 25 deletions(-)
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9748a4c8d668..c150074033d3 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2356,36 +2356,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
>
> /*
> * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
> - * any two given CPUs at this (non-NUMA) topology level.
> + * any two given CPUs on non-NUMA topology levels.
> */
> -static bool topology_span_sane(struct sched_domain_topology_level *tl,
> - const struct cpumask *cpu_map, int cpu)
> +static bool topology_span_sane(const struct cpumask *cpu_map)
> {
> - int i = cpu + 1;
> + struct sched_domain_topology_level *tl;
> + const struct cpumask **masks;
> + struct cpumask *covered;
> + int cpu, id;
> + bool ret = false;
>
> - /* NUMA levels are allowed to overlap */
> - if (tl->flags & SDTL_OVERLAP)
> - return true;
> + lockdep_assert_held(&sched_domains_mutex);
> + covered = sched_domains_tmpmask;
> +
> + masks = kmalloc_array(num_possible_cpus(), sizeof(struct cpumask *), GFP_KERNEL);
> + if (!masks)
> + return ret;
That looks like a very large array that seems unnecessary. Instead, is
it possible to use "tl->mask(id)" down below to check for equality? (I'll
elaborate more below)
> +
> + for_each_sd_topology(tl) {
> +
> + /* NUMA levels are allowed to overlap */
> + if (tl->flags & SDTL_OVERLAP)
> + continue;
> +
> + cpumask_clear(covered);
> + memset(masks, 0, num_possible_cpus() * sizeof(struct cpumask *));
>
> - /*
> - * Non-NUMA levels cannot partially overlap - they must be either
> - * completely equal or completely disjoint. Otherwise we can end up
> - * breaking the sched_group lists - i.e. a later get_group() pass
> - * breaks the linking done for an earlier span.
> - */
> - for_each_cpu_from(i, cpu_map) {
> /*
> - * We should 'and' all those masks with 'cpu_map' to exactly
> - * match the topology we're about to build, but that can only
> - * remove CPUs, which only lessens our ability to detect
> - * overlaps
> + * Non-NUMA levels cannot partially overlap - they must be either
> + * completely equal or completely disjoint. Otherwise we can end up
> + * breaking the sched_group lists - i.e. a later get_group() pass
> + * breaks the linking done for an earlier span.
> */
> - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
> - cpumask_intersects(tl->mask(cpu), tl->mask(i)))
> - return false;
> + for_each_cpu(cpu, cpu_map) {
> + /* lowest bit set in this mask is used as a unique id */
> + id = cpumask_first(tl->mask(cpu));
nit. "id" can be declared in this scope since it is not used anywhere
outside. Also perhaps you can store "tl->mask(cpu)" instead of calling
the function multiple times below. Thoughts?
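
i.e., something along these lines (untested; "tl_cpu_mask" is just a
placeholder name):

        for_each_cpu(cpu, cpu_map) {
                const struct cpumask *tl_cpu_mask = tl->mask(cpu);
                int id = cpumask_first(tl_cpu_mask);

                /* ... use tl_cpu_mask below instead of repeated tl->mask(cpu) calls ... */
        }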
> +
> + /* if this mask doesn't collide with what we've already seen */
> + if (!cpumask_intersects(tl->mask(cpu), covered)) {
> + /* this failing would be an error in this algorithm */
> + if (WARN_ON(masks[id]))
> + goto notsane;
> +
> + /* record the mask we saw for this id */
> + masks[id] = tl->mask(cpu);
> + cpumask_or(covered, tl->mask(cpu), covered);
> + } else if ((!masks[id]) || !cpumask_equal(masks[id], tl->mask(cpu))) {
Elaborating on the above point: instead of caching the cpumask in an array with:

        masks[id] = tl->mask(cpu);

how about having a single "struct cpumask *id_seen" that is cleared at
the same time as the "covered" cpumask, in which a bit is set for every
"id" seen so far? If there is an intersection with "covered", and either
the bit for this "id" is not set or "tl->mask(cpu)" is not equal to
"tl->mask(id)", the spans partially overlap. This requires much less
space than an array of cpumask pointers, but I'm not sure how much more
expensive it is to access "tl->mask(id)" compared to reading it from the
array of cpumasks.
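
A rough, untested sketch of that idea (the "id_seen" name is a placeholder;
perhaps sched_domains_tmpmask2 could serve as that second scratch mask):

        cpumask_clear(covered);
        cpumask_clear(id_seen);

        for_each_cpu(cpu, cpu_map) {
                const struct cpumask *tl_cpu_mask = tl->mask(cpu);
                int id = cpumask_first(tl_cpu_mask);

                if (!cpumask_intersects(tl_cpu_mask, covered)) {
                        /* first CPU of a new group: its id must not have been seen yet */
                        if (WARN_ON(cpumask_test_cpu(id, id_seen)))
                                return false;

                        cpumask_set_cpu(id, id_seen);
                        cpumask_or(covered, covered, tl_cpu_mask);
                } else if (!cpumask_test_cpu(id, id_seen) ||
                           !cpumask_equal(tl->mask(id), tl_cpu_mask)) {
                        /* overlaps "covered" without exactly matching the group with this id */
                        return false;
                }
        }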
--
Thanks and Regards,
Prateek
> + /*
> + * a collision with covered should have exactly matched
> + * a previously seen mask with the same id
> + */
> + goto notsane;
> + }
> + }
> }
> + ret = true;
>
> - return true;
> + notsane:
> + kfree(masks);
> + return ret;
> }
>
> /*
> @@ -2417,9 +2446,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> sd = NULL;
> for_each_sd_topology(tl) {
>
> - if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
> - goto error;
> -
> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>
> has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
> @@ -2433,6 +2459,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> }
> }
>
> + if (WARN_ON(!topology_span_sane(cpu_map)))
> + goto error;
> +
> /* Build the groups for the domains */
> for_each_cpu(i, cpu_map) {
> for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
On Tue, Oct 15, 2024 at 03:50:49PM +0530, K Prateek Nayak wrote:
> Hello Steve,
>
> On 10/10/2024 9:21 PM, Steve Wahl wrote:
> > Use a different approach to topology_span_sane(), that checks for the
> > same constraint of no partial overlaps for any two CPU sets for
> > non-NUMA topology levels, but does so in a way that is O(N) rather
> > than O(N^2).
> >
> > Instead of comparing with all other masks to detect collisions, keep
> > one mask that includes all CPUs seen so far and detect collisions with
> > a single cpumask_intersects test.
> >
> > If the current mask has no collisions with previously seen masks, it
> > should be a new mask, which can be uniquely identified by the lowest
> > bit set in this mask. Keep a pointer to this mask for future
> > reference (in an array indexed by the lowest bit set), and add the
> > CPUs in this mask to the list of those seen.
> >
> > If the current mask does collide with previously seen masks, it should
> > be exactly equal to a mask seen before, looked up in the same array
> > indexed by the lowest bit set in the mask, a single comparison.
> >
> > Move the topology_span_sane() check out of the existing topology level
> > loop, let it use its own loop so that the array allocation can be done
> > only once, shared across levels.
> >
> > On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
> > the average time to take one processor offline is reduced from 2.18
> > seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
> > 34m49.765s without this change, 16m10.038s with this change in place.)
> >
> > Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
> > ---
> > kernel/sched/topology.c | 79 ++++++++++++++++++++++++++++-------------
> > 1 file changed, 54 insertions(+), 25 deletions(-)
> >
> > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > index 9748a4c8d668..c150074033d3 100644
> > --- a/kernel/sched/topology.c
> > +++ b/kernel/sched/topology.c
> > @@ -2356,36 +2356,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
> > /*
> > * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
> > - * any two given CPUs at this (non-NUMA) topology level.
> > + * any two given CPUs on non-NUMA topology levels.
> > */
> > -static bool topology_span_sane(struct sched_domain_topology_level *tl,
> > - const struct cpumask *cpu_map, int cpu)
> > +static bool topology_span_sane(const struct cpumask *cpu_map)
> > {
> > - int i = cpu + 1;
> > + struct sched_domain_topology_level *tl;
> > + const struct cpumask **masks;
> > + struct cpumask *covered;
> > + int cpu, id;
> > + bool ret = false;
> > - /* NUMA levels are allowed to overlap */
> > - if (tl->flags & SDTL_OVERLAP)
> > - return true;
> > + lockdep_assert_held(&sched_domains_mutex);
> > + covered = sched_domains_tmpmask;
> > +
> > + masks = kmalloc_array(num_possible_cpus(), sizeof(struct cpumask *), GFP_KERNEL);
> > + if (!masks)
> > + return ret;
>
> That looks like a very large array that seems unnecessary. Instead, is
> it possible to use "tl->mask(id)" down below to check for equality? (I'll
> elaborate more below)
>
> > +
> > + for_each_sd_topology(tl) {
> > +
> > + /* NUMA levels are allowed to overlap */
> > + if (tl->flags & SDTL_OVERLAP)
> > + continue;
> > +
> > + cpumask_clear(covered);
> > + memset(masks, 0, num_possible_cpus() * sizeof(struct cpumask *));
> > - /*
> > - * Non-NUMA levels cannot partially overlap - they must be either
> > - * completely equal or completely disjoint. Otherwise we can end up
> > - * breaking the sched_group lists - i.e. a later get_group() pass
> > - * breaks the linking done for an earlier span.
> > - */
> > - for_each_cpu_from(i, cpu_map) {
> > /*
> > - * We should 'and' all those masks with 'cpu_map' to exactly
> > - * match the topology we're about to build, but that can only
> > - * remove CPUs, which only lessens our ability to detect
> > - * overlaps
> > + * Non-NUMA levels cannot partially overlap - they must be either
> > + * completely equal or completely disjoint. Otherwise we can end up
> > + * breaking the sched_group lists - i.e. a later get_group() pass
> > + * breaks the linking done for an earlier span.
> > */
> > - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
> > - cpumask_intersects(tl->mask(cpu), tl->mask(i)))
> > - return false;
> > + for_each_cpu(cpu, cpu_map) {
> > + /* lowest bit set in this mask is used as a unique id */
> > + id = cpumask_first(tl->mask(cpu));
>
> nit. "id" can be declared in this scope since it is not used anywhere
> outside. Also perhaps you can store "tl->mask(cpu)" instead of calling
> the function multiple times below. Thoughts?
I think that would work. I'll see if I can work it in.
> > +
> > + /* if this mask doesn't collide with what we've already seen */
> > + if (!cpumask_intersects(tl->mask(cpu), covered)) {
> > + /* this failing would be an error in this algorithm */
> > + if (WARN_ON(masks[id]))
> > + goto notsane;
> > +
> > + /* record the mask we saw for this id */
> > + masks[id] = tl->mask(cpu);
> > + cpumask_or(covered, tl->mask(cpu), covered);
> > + } else if ((!masks[id]) || !cpumask_equal(masks[id], tl->mask(cpu))) {
>
> Elaborating on the above point: instead of caching the cpumask in an array with:
> 
>         masks[id] = tl->mask(cpu);
> 
> how about having a single "struct cpumask *id_seen" that is cleared at
> the same time as the "covered" cpumask, in which a bit is set for every
> "id" seen so far? If there is an intersection with "covered", and either
> the bit for this "id" is not set or "tl->mask(cpu)" is not equal to
> "tl->mask(id)", the spans partially overlap. This requires much less
> space than an array of cpumask pointers, but I'm not sure how much more
> expensive it is to access "tl->mask(id)" compared to reading it from the
> array of cpumasks.
So, I was at first skeptical that this would work; I thought it might
miss some error cases. But after a lot of thought I think I was wrong,
and it will work. And it lets me get rid of the array, and with it the
num_possible_cpus() vs. nr_cpu_ids question that Peter Z. commented
about.
Thanks, I will re-work and post a new version.
--> Steve
--
Steve Wahl, Hewlett Packard Enterprise
On Tue, Oct 15, 2024 at 03:50:49PM +0530, K Prateek Nayak wrote:
> > + masks = kmalloc_array(num_possible_cpus(), sizeof(struct cpumask *), GFP_KERNEL);
> > + if (!masks)
> > + return ret;
> 
> That looks like a very large array that seems unnecessary. Instead, is
> it possible to use "tl->mask(id)" down below to check for equality? (I'll
> elaborate more below)

drive-by comments, haven't got time atm to read this, but
num_possible_cpus() is wrong, this should be nr_cpu_ids.

The CPU mask must not be assumed dense.
On Tue, Oct 15, 2024 at 01:05:44PM +0200, Peter Zijlstra wrote:
> On Tue, Oct 15, 2024 at 03:50:49PM +0530, K Prateek Nayak wrote:
> > > + masks = kmalloc_array(num_possible_cpus(), sizeof(struct cpumask *), GFP_KERNEL);
> > > + if (!masks)
> > > + return ret;
> > 
> > That looks like a very large array that seems unnecessary. Instead, is
> > it possible to use "tl->mask(id)" down below to check for equality? (I'll
> > elaborate more below)
> 
> drive-by comments, haven't got time atm to read this, but
> num_possible_cpus() is wrong, this should be nr_cpu_ids.
> 
> The CPU mask must not be assumed dense.

You're absolutely right; not sure how I did this, but Prateek's
suggestion will remove the array and the need to size it, so this
problem will be gone in the next iteration.

--> Steve

-- 
Steve Wahl, Hewlett Packard Enterprise
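
For illustration of the distinction (hypothetical sparse layout, not taken
from this thread): with cpu_possible_mask = { 0, 4, 8, 12 },
num_possible_cpus() is 4, while CPU ids run up to 12 and nr_cpu_ids is 13.
An array indexed by cpumask_first(tl->mask(cpu)) can be indexed as high as
12 here, so if such an array were kept it would need nr_cpu_ids entries:

        /* sized by the highest possible CPU id, not by the count of possible CPUs */
        masks = kmalloc_array(nr_cpu_ids, sizeof(struct cpumask *), GFP_KERNEL);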