From: Li Chen <chenl311@chinatelecom.cn>
Currently, the SMT domain is added into sched_domain_topology by default.
If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight
is just 1, it will destroy it.
On a large machine, such as one with 512 cores, this results in
512 redundant domain attach/destroy operations.
Avoid these unnecessary operations by simply checking
cpu_smt_num_threads and remove SMT domain if the SMT domain is not
enabled, and adjust the PKG index accordingly if NUMA-in-package
invalidates that level as well.
Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
changelog:
v2: fix wording issue as suggested by Thomas [1]
v3: remove pointless memset and adjust PKG index accordingly,
as suggested by Thomas [2]
[1]: https://lore.kernel.org/all/87msa2r018.ffs@tglx/
[2]: https://lore.kernel.org/all/875xglntx1.ffs@tglx/
arch/x86/kernel/smpboot.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7d202f9785362..4b6daa1545445 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -494,13 +494,29 @@ static struct sched_domain_topology_level x86_topology[] = {
static void __init build_sched_topology(void)
{
+ bool smt_dropped = false;
+
+ if (cpu_smt_num_threads <= 1) {
+ /*
+ * SMT level is x86_topology[0]. Shift the array left by one,
+ */
+ memmove(&x86_topology[0], &x86_topology[1],
+ sizeof(x86_topology) - sizeof(x86_topology[0]));
+ smt_dropped = true;
+ }
+
/*
* When there is NUMA topology inside the package invalidate the
* PKG domain since the NUMA domains will auto-magically create the
* right spanning domains based on the SLIT.
*/
if (x86_has_numa_in_package) {
- unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
+ unsigned int pkgdom;
+
+ if (smt_dropped)
+ pkgdom = ARRAY_SIZE(x86_topology) - 3;
+ else
+ pkgdom = ARRAY_SIZE(x86_topology) - 2;
memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
}
--
2.49.0
On Wed, Jun 25, 2025 at 11:45:50AM +0800, Li Chen wrote:
> From: Li Chen <chenl311@chinatelecom.cn>
>
> Currently, the SMT domain is added into sched_domain_topology by default.
>
> If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight
> is just 1, it will destroy it.
>
> On a large machine, such as one with 512 cores, this results in
> 512 redundant domain attach/destroy operations.
>
> Avoid these unnecessary operations by simply checking
> cpu_smt_num_threads and remove SMT domain if the SMT domain is not
> enabled, and adjust the PKG index accordingly if NUMA-in-package
> invalidates that level as well.
>
> Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
> ---
> changelog:
> v2: fix wording issue as suggested by Thomas [1]
> v3: remove pointless memset and adjust PKG index accordingly,
> as suggested by Thomas [2]
>
> [1]: https://lore.kernel.org/all/87msa2r018.ffs@tglx/
> [2]: https://lore.kernel.org/all/875xglntx1.ffs@tglx/
>
> arch/x86/kernel/smpboot.c | 18 +++++++++++++++++-
> 1 file changed, 17 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 7d202f9785362..4b6daa1545445 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -494,13 +494,29 @@ static struct sched_domain_topology_level x86_topology[] = {
>
> static void __init build_sched_topology(void)
> {
> + bool smt_dropped = false;
> +
> + if (cpu_smt_num_threads <= 1) {
> + /*
> + * SMT level is x86_topology[0]. Shift the array left by one,
> + */
> + memmove(&x86_topology[0], &x86_topology[1],
> + sizeof(x86_topology) - sizeof(x86_topology[0]));
> + smt_dropped = true;
> + }
> +
> /*
> * When there is NUMA topology inside the package invalidate the
> * PKG domain since the NUMA domains will auto-magically create the
> * right spanning domains based on the SLIT.
> */
> if (x86_has_numa_in_package) {
> - unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
> + unsigned int pkgdom;
> +
> + if (smt_dropped)
> + pkgdom = ARRAY_SIZE(x86_topology) - 3;
> + else
> + pkgdom = ARRAY_SIZE(x86_topology) - 2;
>
> memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
> }
Oh gawd, and you all really think this is better than dynamically
creating that array?
This is quite terrible.
Hi Peter,
---- On Wed, 25 Jun 2025 16:29:32 +0800 Peter Zijlstra <peterz@infradead.org> wrote ---
> On Wed, Jun 25, 2025 at 11:45:50AM +0800, Li Chen wrote:
> > From: Li Chen <chenl311@chinatelecom.cn>
> >
> > Currently, the SMT domain is added into sched_domain_topology by default.
> >
> > If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight
> > is just 1, it will destroy it.
> >
> > On a large machine, such as one with 512 cores, this results in
> > 512 redundant domain attach/destroy operations.
> >
> > Avoid these unnecessary operations by simply checking
> > cpu_smt_num_threads and remove SMT domain if the SMT domain is not
> > enabled, and adjust the PKG index accordingly if NUMA-in-package
> > invalidates that level as well.
> >
> > Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
> > ---
> > changelog:
> > v2: fix wording issue as suggested by Thomas [1]
> > v3: remove pointless memset and adjust PKG index accordingly,
> > as suggested by Thomas [2]
> >
> > [1]: https://lore.kernel.org/all/87msa2r018.ffs@tglx/
> > [2]: https://lore.kernel.org/all/875xglntx1.ffs@tglx/
> >
> > arch/x86/kernel/smpboot.c | 18 +++++++++++++++++-
> > 1 file changed, 17 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> > index 7d202f9785362..4b6daa1545445 100644
> > --- a/arch/x86/kernel/smpboot.c
> > +++ b/arch/x86/kernel/smpboot.c
> > @@ -494,13 +494,29 @@ static struct sched_domain_topology_level x86_topology[] = {
> >
> > static void __init build_sched_topology(void)
> > {
> > + bool smt_dropped = false;
> > +
> > + if (cpu_smt_num_threads <= 1) {
> > + /*
> > + * SMT level is x86_topology[0]. Shift the array left by one,
> > + */
> > + memmove(&x86_topology[0], &x86_topology[1],
> > + sizeof(x86_topology) - sizeof(x86_topology[0]));
> > + smt_dropped = true;
> > + }
> > +
> > /*
> > * When there is NUMA topology inside the package invalidate the
> > * PKG domain since the NUMA domains will auto-magically create the
> > * right spanning domains based on the SLIT.
> > */
> > if (x86_has_numa_in_package) {
> > - unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
> > + unsigned int pkgdom;
> > +
> > + if (smt_dropped)
> > + pkgdom = ARRAY_SIZE(x86_topology) - 3;
> > + else
> > + pkgdom = ARRAY_SIZE(x86_topology) - 2;
> >
> > memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
> > }
>
> Oh gawd, and you all really think this is better than dynamically
> creating that array?
>
> This is quite terrible.
>
Do Thomas and you still need to agree on switching to a static array? I'm unsure of the rule.
BTW, initially, I had a patch that didn't include that dynamic/static array change. I don't know if this is ok for you:
https://lore.kernel.org/all/1965cae22a0.12ab5a70c833868.7155412488566097801@linux.beauty/
Regards,
Li
Hello Li,
On 6/25/2025 9:15 AM, Li Chen wrote:
> From: Li Chen <chenl311@chinatelecom.cn>
>
> Currently, the SMT domain is added into sched_domain_topology by default.
>
> If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight
> is just 1, it will destroy it.
>
> On a large machine, such as one with 512 cores, this results in
> 512 redundant domain attach/destroy operations.
>
> Avoid these unnecessary operations by simply checking
> cpu_smt_num_threads and remove SMT domain if the SMT domain is not
> enabled, and adjust the PKG index accordingly if NUMA-in-package
> invalidates that level as well.
>
> Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
> ---
> changelog:
> v2: fix wording issue as suggested by Thomas [1]
> v3: remove pointless memset and adjust PKG index accordingly,
> as suggested by Thomas [2]
>
> [1]: https://lore.kernel.org/all/87msa2r018.ffs@tglx/
> [2]: https://lore.kernel.org/all/875xglntx1.ffs@tglx/
>
> arch/x86/kernel/smpboot.c | 18 +++++++++++++++++-
> 1 file changed, 17 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 7d202f9785362..4b6daa1545445 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -494,13 +494,29 @@ static struct sched_domain_topology_level x86_topology[] = {
>
> static void __init build_sched_topology(void)
> {
> + bool smt_dropped = false;
> +
> + if (cpu_smt_num_threads <= 1) {
> + /*
> + * SMT level is x86_topology[0]. Shift the array left by one,
> + */
> + memmove(&x86_topology[0], &x86_topology[1],
> + sizeof(x86_topology) - sizeof(x86_topology[0]));
> + smt_dropped = true;
> + }
> +
Instead of this memmove, wouldn't just passing the "&x86_topology[1]"
to set_sched_topology() when "cpu_smt_num_threads <= 1" work?
Something like below:
(Only tested on a QEMU based VM)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3818b16c19e1..1793248d28cd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -494,6 +494,8 @@ static struct sched_domain_topology_level x86_topology[] = {
static void __init build_sched_topology(void)
{
+ struct sched_domain_topology_level *topology = x86_topology;
+
/*
* When there is NUMA topology inside the package invalidate the
* PKG domain since the NUMA domains will auto-magically create the
@@ -504,7 +506,15 @@ static void __init build_sched_topology(void)
memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
}
- set_sched_topology(x86_topology);
+
+ /*
+ * Drop the SMT domains if there is only one thread per-core
+ * since it'll get degenerated by the scheduler anyways.
+ */
+ if (cpu_smt_num_threads <= 1)
+ ++topology;
+
+ set_sched_topology(topology);
}
void set_cpu_sibling_map(int cpu)
> /*
> * When there is NUMA topology inside the package invalidate the
> * PKG domain since the NUMA domains will auto-magically create the
> * right spanning domains based on the SLIT.
> */
> if (x86_has_numa_in_package) {
> - unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
> + unsigned int pkgdom;
> +
> + if (smt_dropped)
> + pkgdom = ARRAY_SIZE(x86_topology) - 3;
> + else
> + pkgdom = ARRAY_SIZE(x86_topology) - 2;
>
> memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
> }
--
Thanks and Regards,
Prateek
Hi K,
---- On Wed, 25 Jun 2025 13:45:22 +0800 K Prateek Nayak <kprateek.nayak@amd.com> wrote ---
> Hello Li,
>
> On 6/25/2025 9:15 AM, Li Chen wrote:
> > From: Li Chen <chenl311@chinatelecom.cn>
> >
> > Currently, the SMT domain is added into sched_domain_topology by default.
> >
> > If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight
> > is just 1, it will destroy it.
> >
> > On a large machine, such as one with 512 cores, this results in
> > 512 redundant domain attach/destroy operations.
> >
> > Avoid these unnecessary operations by simply checking
> > cpu_smt_num_threads and remove SMT domain if the SMT domain is not
> > enabled, and adjust the PKG index accordingly if NUMA-in-package
> > invalidates that level as well.
> >
> > Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
> > ---
> > changelog:
> > v2: fix wording issue as suggested by Thomas [1]
> > v3: remove pointless memset and adjust PKG index accordingly,
> > as suggested by Thomas [2]
> >
> > [1]: https://lore.kernel.org/all/87msa2r018.ffs@tglx/
> > [2]: https://lore.kernel.org/all/875xglntx1.ffs@tglx/
> >
> > arch/x86/kernel/smpboot.c | 18 +++++++++++++++++-
> > 1 file changed, 17 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> > index 7d202f9785362..4b6daa1545445 100644
> > --- a/arch/x86/kernel/smpboot.c
> > +++ b/arch/x86/kernel/smpboot.c
> > @@ -494,13 +494,29 @@ static struct sched_domain_topology_level x86_topology[] = {
> >
> > static void __init build_sched_topology(void)
> > {
> > + bool smt_dropped = false;
> > +
> > + if (cpu_smt_num_threads <= 1) {
> > + /*
> > + * SMT level is x86_topology[0]. Shift the array left by one,
> > + */
> > + memmove(&x86_topology[0], &x86_topology[1],
> > + sizeof(x86_topology) - sizeof(x86_topology[0]));
> > + smt_dropped = true;
> > + }
> > +
>
> Instead of this memmove, wouldn't just passing the "&x86_topology[1]"
> to set_sched_topology() when "cpu_smt_num_threads <= 1" work?
>
> Something like below:
>
> (Only tested on a QEMU based VM)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 3818b16c19e1..1793248d28cd 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -494,6 +494,8 @@ static struct sched_domain_topology_level x86_topology[] = {
>
> static void __init build_sched_topology(void)
> {
> + struct sched_domain_topology_level *topology = x86_topology;
> +
> /*
> * When there is NUMA topology inside the package invalidate the
> * PKG domain since the NUMA domains will auto-magically create the
> @@ -504,7 +506,15 @@ static void __init build_sched_topology(void)
>
> memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
> }
> - set_sched_topology(x86_topology);
> +
> + /*
> + * Drop the SMT domains if there is only one thread per-core
> + * since it'll get degenerated by the scheduler anyways.
> + */
> + if (cpu_smt_num_threads <= 1)
> + ++topology;
> +
> + set_sched_topology(topology);
> }
>
> void set_cpu_sibling_map(int cpu)
Yes, I also agree. I would switch to it in the next series and add your signoff.
Thanks!
> > /*
> > * When there is NUMA topology inside the package invalidate the
> > * PKG domain since the NUMA domains will auto-magically create the
> > * right spanning domains based on the SLIT.
> > */
> > if (x86_has_numa_in_package) {
> > - unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
> > + unsigned int pkgdom;
> > +
> > + if (smt_dropped)
> > + pkgdom = ARRAY_SIZE(x86_topology) - 3;
> > + else
> > + pkgdom = ARRAY_SIZE(x86_topology) - 2;
> >
> > memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
> > }
>
> --
> Thanks and Regards,
> Prateek
>
>
Regards,
Li
On Wed, Jun 25, 2025 at 11:15:22AM +0530, K Prateek Nayak wrote:
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 3818b16c19e1..1793248d28cd 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -494,6 +494,8 @@ static struct sched_domain_topology_level x86_topology[] = {
> static void __init build_sched_topology(void)
> {
> + struct sched_domain_topology_level *topology = x86_topology;
> +
> /*
> * When there is NUMA topology inside the package invalidate the
> * PKG domain since the NUMA domains will auto-magically create the
> @@ -504,7 +506,15 @@ static void __init build_sched_topology(void)
> memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
> }
> - set_sched_topology(x86_topology);
> +
> + /*
> + * Drop the SMT domains if there is only one thread per-core
> + * since it'll get degenerated by the scheduler anyways.
> + */
> + if (cpu_smt_num_threads <= 1)
> + ++topology;
> +
> + set_sched_topology(topology);
> }
> void set_cpu_sibling_map(int cpu)
Yes, that is far saner.
© 2016 - 2026 Red Hat, Inc.