From: Li Chen <chenl311@chinatelecom.cn>
Currently, the SMT domain is added into sched_domain_topology by default.
If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight
is just 1, it will destroy it.
On a large machine, such as one with 512 cores, this results in
512 redundant domain attach/destroy operations.
Avoid these unnecessary operations by simply checking
cpu_smt_num_threads and remove SMT domain if the SMT domain is not
enabled, and adjust the PKG index accordingly if NUMA-in-package
invalidates that level as well.
Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
changelog:
v2: fix wording issue as suggested by Thomas [1]
v3: remove pointless memset and adjust PKG index accordingly,
as suggested by Thomas [2]
[1]: https://lore.kernel.org/all/87msa2r018.ffs@tglx/
[2]: https://lore.kernel.org/all/875xglntx1.ffs@tglx/
arch/x86/kernel/smpboot.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7d202f9785362..4b6daa1545445 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -494,13 +494,29 @@ static struct sched_domain_topology_level x86_topology[] = {
static void __init build_sched_topology(void)
{
+ bool smt_dropped = false;
+
+ if (cpu_smt_num_threads <= 1) {
+ /*
+ * SMT level is x86_topology[0]. Shift the array left by one,
+ */
+ memmove(&x86_topology[0], &x86_topology[1],
+ sizeof(x86_topology) - sizeof(x86_topology[0]));
+ smt_dropped = true;
+ }
+
/*
* When there is NUMA topology inside the package invalidate the
* PKG domain since the NUMA domains will auto-magically create the
* right spanning domains based on the SLIT.
*/
if (x86_has_numa_in_package) {
- unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
+ unsigned int pkgdom;
+
+ if (smt_dropped)
+ pkgdom = ARRAY_SIZE(x86_topology) - 3;
+ else
+ pkgdom = ARRAY_SIZE(x86_topology) - 2;
memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
}
--
2.49.0
On Wed, Jun 25, 2025 at 11:45:50AM +0800, Li Chen wrote: > From: Li Chen <chenl311@chinatelecom.cn> > > Currently, the SMT domain is added into sched_domain_topology by default. > > If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight > is just 1, it will destroy it. > > On a large machine, such as one with 512 cores, this results in > 512 redundant domain attach/destroy operations. > > Avoid these unnecessary operations by simply checking > cpu_smt_num_threads and remove SMT domain if the SMT domain is not > enabled, and adjust the PKG index accordingly if NUMA-in-package > invalidates that level as well. > > Signed-off-by: Li Chen <chenl311@chinatelecom.cn> > --- > changelog: > v2: fix wording issue as suggested by Thomas [1] > v3: remove pointless memset and adjust PKG index accordingly, > as suggested by Thomas [2] > > [1]: https://lore.kernel.org/all/87msa2r018.ffs@tglx/ > [2]: https://lore.kernel.org/all/875xglntx1.ffs@tglx/ > > arch/x86/kernel/smpboot.c | 18 +++++++++++++++++- > 1 file changed, 17 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c > index 7d202f9785362..4b6daa1545445 100644 > --- a/arch/x86/kernel/smpboot.c > +++ b/arch/x86/kernel/smpboot.c > @@ -494,13 +494,29 @@ static struct sched_domain_topology_level x86_topology[] = { > > static void __init build_sched_topology(void) > { > + bool smt_dropped = false; > + > + if (cpu_smt_num_threads <= 1) { > + /* > + * SMT level is x86_topology[0]. Shift the array left by one, > + */ > + memmove(&x86_topology[0], &x86_topology[1], > + sizeof(x86_topology) - sizeof(x86_topology[0])); > + smt_dropped = true; > + } > + > /* > * When there is NUMA topology inside the package invalidate the > * PKG domain since the NUMA domains will auto-magically create the > * right spanning domains based on the SLIT. > */ > if (x86_has_numa_in_package) { > - unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2; > + unsigned int pkgdom; > + > + if (smt_dropped) > + pkgdom = ARRAY_SIZE(x86_topology) - 3; > + else > + pkgdom = ARRAY_SIZE(x86_topology) - 2; > > memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); > } Oh gawd, and you all really think this is better than dynamically creating that array? This is quite terrible.
Hi Peter, ---- On Wed, 25 Jun 2025 16:29:32 +0800 Peter Zijlstra <peterz@infradead.org> wrote --- > On Wed, Jun 25, 2025 at 11:45:50AM +0800, Li Chen wrote: > > From: Li Chen <chenl311@chinatelecom.cn> > > > > Currently, the SMT domain is added into sched_domain_topology by default. > > > > If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight > > is just 1, it will destroy it. > > > > On a large machine, such as one with 512 cores, this results in > > 512 redundant domain attach/destroy operations. > > > > Avoid these unnecessary operations by simply checking > > cpu_smt_num_threads and remove SMT domain if the SMT domain is not > > enabled, and adjust the PKG index accordingly if NUMA-in-package > > invalidates that level as well. > > > > Signed-off-by: Li Chen <chenl311@chinatelecom.cn> > > --- > > changelog: > > v2: fix wording issue as suggested by Thomas [1] > > v3: remove pointless memset and adjust PKG index accordingly, > > as suggested by Thomas [2] > > > > [1]: https://lore.kernel.org/all/87msa2r018.ffs@tglx/ > > [2]: https://lore.kernel.org/all/875xglntx1.ffs@tglx/ > > > > arch/x86/kernel/smpboot.c | 18 +++++++++++++++++- > > 1 file changed, 17 insertions(+), 1 deletion(-) > > > > diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c > > index 7d202f9785362..4b6daa1545445 100644 > > --- a/arch/x86/kernel/smpboot.c > > +++ b/arch/x86/kernel/smpboot.c > > @@ -494,13 +494,29 @@ static struct sched_domain_topology_level x86_topology[] = { > > > > static void __init build_sched_topology(void) > > { > > + bool smt_dropped = false; > > + > > + if (cpu_smt_num_threads <= 1) { > > + /* > > + * SMT level is x86_topology[0]. Shift the array left by one, > > + */ > > + memmove(&x86_topology[0], &x86_topology[1], > > + sizeof(x86_topology) - sizeof(x86_topology[0])); > > + smt_dropped = true; > > + } > > + > > /* > > * When there is NUMA topology inside the package invalidate the > > * PKG domain since the NUMA domains will auto-magically create the > > * right spanning domains based on the SLIT. > > */ > > if (x86_has_numa_in_package) { > > - unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2; > > + unsigned int pkgdom; > > + > > + if (smt_dropped) > > + pkgdom = ARRAY_SIZE(x86_topology) - 3; > > + else > > + pkgdom = ARRAY_SIZE(x86_topology) - 2; > > > > memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); > > } > > Oh gawd, and you all really think this is better than dynamically > creating that array? > > This is quite terrible. > Do Thomas and you still need to agree on switching to a static array? I'm unsure of the rule. BTW, initially, I had a patch that didn't include that dynamic/static array change. I don't know if this is ok for you: https://lore.kernel.org/all/1965cae22a0.12ab5a70c833868.7155412488566097801@linux.beauty/ Regards, Li
Hello Li, On 6/25/2025 9:15 AM, Li Chen wrote: > From: Li Chen <chenl311@chinatelecom.cn> > > Currently, the SMT domain is added into sched_domain_topology by default. > > If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight > is just 1, it will destroy it. > > On a large machine, such as one with 512 cores, this results in > 512 redundant domain attach/destroy operations. > > Avoid these unnecessary operations by simply checking > cpu_smt_num_threads and remove SMT domain if the SMT domain is not > enabled, and adjust the PKG index accordingly if NUMA-in-package > invalidates that level as well. > > Signed-off-by: Li Chen <chenl311@chinatelecom.cn> > --- > changelog: > v2: fix wording issue as suggested by Thomas [1] > v3: remove pointless memset and adjust PKG index accordingly, > as suggested by Thomas [2] > > [1]: https://lore.kernel.org/all/87msa2r018.ffs@tglx/ > [2]: https://lore.kernel.org/all/875xglntx1.ffs@tglx/ > > arch/x86/kernel/smpboot.c | 18 +++++++++++++++++- > 1 file changed, 17 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c > index 7d202f9785362..4b6daa1545445 100644 > --- a/arch/x86/kernel/smpboot.c > +++ b/arch/x86/kernel/smpboot.c > @@ -494,13 +494,29 @@ static struct sched_domain_topology_level x86_topology[] = { > > static void __init build_sched_topology(void) > { > + bool smt_dropped = false; > + > + if (cpu_smt_num_threads <= 1) { > + /* > + * SMT level is x86_topology[0]. Shift the array left by one, > + */ > + memmove(&x86_topology[0], &x86_topology[1], > + sizeof(x86_topology) - sizeof(x86_topology[0])); > + smt_dropped = true; > + } > + Instead of this memmove, wouldn't just passing the "&x86_topology[1]" to set_sched_topology() when "cpu_smt_num_threads <= 1" work? Something like below: (Only tested on a QEMU based VM) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3818b16c19e1..1793248d28cd 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -494,6 +494,8 @@ static struct sched_domain_topology_level x86_topology[] = { static void __init build_sched_topology(void) { + struct sched_domain_topology_level *topology = x86_topology; + /* * When there is NUMA topology inside the package invalidate the * PKG domain since the NUMA domains will auto-magically create the @@ -504,7 +506,15 @@ static void __init build_sched_topology(void) memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); } - set_sched_topology(x86_topology); + + /* + * Drop the SMT domains if there is only one thread per-core + * since it'll get degenerated by the scheduler anyways. + */ + if (cpu_smt_num_threads <= 1) + ++topology; + + set_sched_topology(topology); } void set_cpu_sibling_map(int cpu) > /* > * When there is NUMA topology inside the package invalidate the > * PKG domain since the NUMA domains will auto-magically create the > * right spanning domains based on the SLIT. > */ > if (x86_has_numa_in_package) { > - unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2; > + unsigned int pkgdom; > + > + if (smt_dropped) > + pkgdom = ARRAY_SIZE(x86_topology) - 3; > + else > + pkgdom = ARRAY_SIZE(x86_topology) - 2; > > memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); > } -- Thanks and Regards, Prateek
Hi K, ---- On Wed, 25 Jun 2025 13:45:22 +0800 K Prateek Nayak <kprateek.nayak@amd.com> wrote --- > Hello Li, > > On 6/25/2025 9:15 AM, Li Chen wrote: > > From: Li Chen <chenl311@chinatelecom.cn> > > > > Currently, the SMT domain is added into sched_domain_topology by default. > > > > If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight > > is just 1, it will destroy it. > > > > On a large machine, such as one with 512 cores, this results in > > 512 redundant domain attach/destroy operations. > > > > Avoid these unnecessary operations by simply checking > > cpu_smt_num_threads and remove SMT domain if the SMT domain is not > > enabled, and adjust the PKG index accordingly if NUMA-in-package > > invalidates that level as well. > > > > Signed-off-by: Li Chen <chenl311@chinatelecom.cn> > > --- > > changelog: > > v2: fix wording issue as suggested by Thomas [1] > > v3: remove pointless memset and adjust PKG index accordingly, > > as suggested by Thomas [2] > > > > [1]: https://lore.kernel.org/all/87msa2r018.ffs@tglx/ > > [2]: https://lore.kernel.org/all/875xglntx1.ffs@tglx/ > > > > arch/x86/kernel/smpboot.c | 18 +++++++++++++++++- > > 1 file changed, 17 insertions(+), 1 deletion(-) > > > > diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c > > index 7d202f9785362..4b6daa1545445 100644 > > --- a/arch/x86/kernel/smpboot.c > > +++ b/arch/x86/kernel/smpboot.c > > @@ -494,13 +494,29 @@ static struct sched_domain_topology_level x86_topology[] = { > > > > static void __init build_sched_topology(void) > > { > > + bool smt_dropped = false; > > + > > + if (cpu_smt_num_threads <= 1) { > > + /* > > + * SMT level is x86_topology[0]. Shift the array left by one, > > + */ > > + memmove(&x86_topology[0], &x86_topology[1], > > + sizeof(x86_topology) - sizeof(x86_topology[0])); > > + smt_dropped = true; > > + } > > + > > Instead of this memmove, wouldn't just passing the "&x86_topology[1]" > to set_sched_topology() when "cpu_smt_num_threads <= 1" work? > > Something like below: > > (Only tested on a QEMU based VM) > > diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c > index 3818b16c19e1..1793248d28cd 100644 > --- a/arch/x86/kernel/smpboot.c > +++ b/arch/x86/kernel/smpboot.c > @@ -494,6 +494,8 @@ static struct sched_domain_topology_level x86_topology[] = { > > static void __init build_sched_topology(void) > { > + struct sched_domain_topology_level *topology = x86_topology; > + > /* > * When there is NUMA topology inside the package invalidate the > * PKG domain since the NUMA domains will auto-magically create the > @@ -504,7 +506,15 @@ static void __init build_sched_topology(void) > > memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); > } > - set_sched_topology(x86_topology); > + > + /* > + * Drop the SMT domains if there is only one thread per-core > + * since it'll get degenerated by the scheduler anyways. > + */ > + if (cpu_smt_num_threads <= 1) > + ++topology; > + > + set_sched_topology(topology); > } > > void set_cpu_sibling_map(int cpu) Yes, I also agree. I would switch to it in the next series and add your signoff. Thanks! > > /* > > * When there is NUMA topology inside the package invalidate the > > * PKG domain since the NUMA domains will auto-magically create the > > * right spanning domains based on the SLIT. > > */ > > if (x86_has_numa_in_package) { > > - unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2; > > + unsigned int pkgdom; > > + > > + if (smt_dropped) > > + pkgdom = ARRAY_SIZE(x86_topology) - 3; > > + else > > + pkgdom = ARRAY_SIZE(x86_topology) - 2; > > > > memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); > > } > > -- > Thanks and Regards, > Prateek > > Regards, Li
On Wed, Jun 25, 2025 at 11:15:22AM +0530, K Prateek Nayak wrote: > diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c > index 3818b16c19e1..1793248d28cd 100644 > --- a/arch/x86/kernel/smpboot.c > +++ b/arch/x86/kernel/smpboot.c > @@ -494,6 +494,8 @@ static struct sched_domain_topology_level x86_topology[] = { > static void __init build_sched_topology(void) > { > + struct sched_domain_topology_level *topology = x86_topology; > + > /* > * When there is NUMA topology inside the package invalidate the > * PKG domain since the NUMA domains will auto-magically create the > @@ -504,7 +506,15 @@ static void __init build_sched_topology(void) > memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); > } > - set_sched_topology(x86_topology); > + > + /* > + * Drop the SMT domains if there is only one thread per-core > + * since it'll get degenerated by the scheduler anyways. > + */ > + if (cpu_smt_num_threads <= 1) > + ++topology; > + > + set_sched_topology(topology); > } > void set_cpu_sibling_map(int cpu) Yes, that is far saner.
© 2016 - 2025 Red Hat, Inc.