From: Li Chen <chenl311@chinatelecom.cn>
Define a small SDTL_INIT(maskfn, flagsfn, name) macro and use it to build the
sched_domain_topology_level array. Purely a cleanup; behaviour is unchanged.
Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
arch/powerpc/kernel/smp.c | 34 +++++++++++++---------------------
arch/s390/kernel/topology.c | 10 +++++-----
arch/x86/kernel/smpboot.c | 21 ++++++---------------
include/linux/sched/topology.h | 4 ++--
kernel/sched/topology.c | 24 ++++++++----------------
5 files changed, 34 insertions(+), 59 deletions(-)
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5ac7084eebc0b..0b7ab7d2eb142 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1700,28 +1700,20 @@ static void __init build_sched_topology(void)
#ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
- };
- } else {
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
- };
- }
+ powerpc_topology[i++] =
+ SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
+ } else
+ powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
#endif
- if (shared_caches) {
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE)
- };
- }
- if (has_coregroup_support()) {
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC)
- };
- }
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG)
- };
+ if (shared_caches)
+ powerpc_topology[i++] =
+ SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE);
+
+ if (has_coregroup_support())
+ powerpc_topology[i++] =
+ SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC);
+
+ powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG);
/* There must be one trailing NULL entry left. */
BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 3df048e190b11..46569b8e47dde 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -531,11 +531,11 @@ static const struct cpumask *cpu_drawer_mask(int cpu)
}
static struct sched_domain_topology_level s390_topology[] = {
- { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
- { cpu_book_mask, SD_INIT_NAME(BOOK) },
- { cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
- { cpu_cpu_mask, SD_INIT_NAME(PKG) },
+ SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT),
+ SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
+ SDTL_INIT(cpu_book_mask, NULL, BOOK),
+ SDTL_INIT(cpu_drawer_mask, NULL, DRAWER),
+ SDTL_INIT(cpu_cpu_mask, NULL, PKG),
{ NULL, },
};
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58ede3fa6a75b..445127df2cb19 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -485,35 +485,26 @@ static void __init build_sched_topology(void)
int i = 0;
#ifdef CONFIG_SCHED_SMT
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
- };
+ x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT);
#endif
#ifdef CONFIG_SCHED_CLUSTER
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
- };
+ x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS);
#endif
#ifdef CONFIG_SCHED_MC
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
- };
+ x86_topology[i++] = SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC);
#endif
/*
* When there is NUMA topology inside the package skip the PKG domain
* since the NUMA domains will auto-magically create the right spanning
* domains based on the SLIT.
*/
- if (!x86_has_numa_in_package) {
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG)
- };
- }
+ if (!x86_has_numa_in_package)
+ x86_topology[i++] = SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG);
/*
* There must be one trailing NULL entry left.
*/
- BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+ BUG_ON(i >= ARRAY_SIZE(x86_topology) - 1);
set_sched_topology(x86_topology);
}
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index e54e7fa76ba63..0d5daaa277b75 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -196,8 +196,8 @@ struct sched_domain_topology_level {
extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio);
-
-# define SD_INIT_NAME(type) .name = #type
+#define SDTL_INIT(maskfn, flagsfn, dname) ((struct sched_domain_topology_level) \
+ { .mask = maskfn, .sd_flags = flagsfn, .name = #dname })
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
extern void rebuild_sched_domains_energy(void);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8e06b1d22e91e..d01f5a49f2e7a 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1737,17 +1737,17 @@ sd_init(struct sched_domain_topology_level *tl,
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
#endif
#ifdef CONFIG_SCHED_CLUSTER
- { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+ SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
#endif
- { cpu_cpu_mask, SD_INIT_NAME(PKG) },
+ SDTL_INIT(cpu_cpu_mask, NULL, PKG),
{ NULL, },
};
@@ -2008,23 +2008,15 @@ void sched_init_numa(int offline_node)
/*
* Add the NUMA identity distance, aka single NODE.
*/
- tl[i++] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .numa_level = 0,
- SD_INIT_NAME(NODE)
- };
+ tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE);
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 1; j < nr_levels; i++, j++) {
- tl[i] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .sd_flags = cpu_numa_flags,
- .flags = SDTL_OVERLAP,
- .numa_level = j,
- SD_INIT_NAME(NUMA)
- };
+ tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
+ tl[i].numa_level = j;
+ tl[i].flags = SDTL_OVERLAP;
}
sched_domain_topology_saved = sched_domain_topology;
--
2.50.0
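(For readers skimming the series: SDTL_INIT() is a plain compound
literal, so an entry such as SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT)
expands to roughly the following -- a sketch written out by hand from
the macro in include/linux/sched/topology.h above:)

	/* What SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT) becomes: */
	(struct sched_domain_topology_level){
		.mask     = cpu_smt_mask,	/* sched_domain_mask_f: span at this level */
		.sd_flags = cpu_smt_flags,	/* sched_domain_flags_f: flags callback, may be NULL */
		.name     = "SMT",		/* from #dname stringification */
	}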
On Thu, Jul 10, 2025 at 06:57:07PM +0800, Li Chen wrote:
> From: Li Chen <chenl311@chinatelecom.cn>
>
> Define a small SDTL_INIT(maskfn, flagsfn, name) macro and use it to build the
> sched_domain_topology_level array. Purely a cleanup; behaviour is unchanged.
>
> Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> ---
> arch/powerpc/kernel/smp.c | 34 +++++++++++++---------------------
> arch/s390/kernel/topology.c | 10 +++++-----
> arch/x86/kernel/smpboot.c | 21 ++++++---------------
> include/linux/sched/topology.h | 4 ++--
> kernel/sched/topology.c | 24 ++++++++----------------
> 5 files changed, 34 insertions(+), 59 deletions(-)
>
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 5ac7084eebc0b..0b7ab7d2eb142 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1700,28 +1700,20 @@ static void __init build_sched_topology(void)
> #ifdef CONFIG_SCHED_SMT
> if (has_big_cores) {
> pr_info("Big cores detected but using small core scheduling\n");
> - powerpc_topology[i++] = (struct sched_domain_topology_level){
> - smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
> - };
> - } else {
> - powerpc_topology[i++] = (struct sched_domain_topology_level){
> - cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
> - };
> - }
> + powerpc_topology[i++] =
> + SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
> + } else
> + powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
> #endif
> - if (shared_caches) {
> - powerpc_topology[i++] = (struct sched_domain_topology_level){
> - shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE)
> - };
> - }
> - if (has_coregroup_support()) {
> - powerpc_topology[i++] = (struct sched_domain_topology_level){
> - cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC)
> - };
> - }
> - powerpc_topology[i++] = (struct sched_domain_topology_level){
> - cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG)
> - };
> + if (shared_caches)
> + powerpc_topology[i++] =
> + SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE);
> +
> + if (has_coregroup_support())
> + powerpc_topology[i++] =
> + SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC);
> +
> + powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG);
You wrecked the coding style here and lost a bunch of curlies. I've
fixed that up for you.
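(The rule Peter is invoking is the one from
Documentation/process/coding-style.rst: when any branch of a conditional
needs braces, every branch gets them. The fixed-up SMT hunk, as it later
landed, reads:)

	if (has_big_cores) {
		pr_info("Big cores detected but using small core scheduling\n");
		powerpc_topology[i++] =
			SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
	} else {
		powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
	}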
On 7/10/2025 4:27 PM, Li Chen wrote:
> /*
> * .. and append 'j' levels of NUMA goodness.
> */
> for (j = 1; j < nr_levels; i++, j++) {
> - tl[i] = (struct sched_domain_topology_level){
> - .mask = sd_numa_mask,
> - .sd_flags = cpu_numa_flags,
> - .flags = SDTL_OVERLAP,
> - .numa_level = j,
> - SD_INIT_NAME(NUMA)
> - };
> + tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
> + tl[i].numa_level = j;
> + tl[i].flags = SDTL_OVERLAP;
Tangential discussion: I was looking at this and was wondering why we
need a "tl->flags" when there is already sd_flags() function and we can
simply add SD_OVERLAP to sd_numa_flags().
I think "tl->flags" was needed when the idea of overlap domains was
added in commit e3589f6c81e4 ("sched: Allow for overlapping sched_domain
spans") when it depended on "FORCE_SD_OVERLAP" sched_feat() which
allowed toggling this off but that was done away with in commit
af85596c74de ("sched/topology: Remove FORCE_SD_OVERLAP") so perhaps we
can get rid of it now?
Relying on SD_NUMA should be enough currently. Peter, Valentin, what do
you think of something like below?
(Build and boot tested on top of this series on tip:sched/core)
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index b04a5d04dee9..42839cfa2778 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -153,14 +153,6 @@ SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)
*/
SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)
-/*
- * sched_groups of this level overlap
- *
- * SHARED_PARENT: Set for all NUMA levels above NODE.
- * NEEDS_GROUPS: Overlaps can only exist with more than one group.
- */
-SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
-
/*
* Cross-node balancing
*
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 0d5daaa277b7..5263746b63e8 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -175,8 +175,6 @@ bool cpus_share_resources(int this_cpu, int that_cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);
-#define SDTL_OVERLAP 0x01
-
struct sd_data {
struct sched_domain *__percpu *sd;
struct sched_domain_shared *__percpu *sds;
@@ -187,7 +185,6 @@ struct sd_data {
struct sched_domain_topology_level {
sched_domain_mask_f mask;
sched_domain_flags_f sd_flags;
- int flags;
int numa_level;
struct sd_data data;
char *name;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 20a845697c1d..b9b4bbbf0af6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9926,9 +9926,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
min_capacity = ULONG_MAX;
max_capacity = 0;
- if (child->flags & SD_OVERLAP) {
+ if (child->flags & SD_NUMA) {
/*
- * SD_OVERLAP domains cannot assume that child groups
+ * SD_NUMA domains cannot assume that child groups
* span the current group.
*/
@@ -9941,7 +9941,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
} else {
/*
- * !SD_OVERLAP domains can assume that child groups
+ * !SD_NUMA domains can assume that child groups
* span the current group.
*/
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d01f5a49f2e7..977e133bb8a4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -89,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!(sd->flags & SD_OVERLAP) &&
+ if (!(sd->flags & SD_NUMA) &&
cpumask_intersects(groupmask, sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
@@ -102,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
group->sgc->id,
cpumask_pr_args(sched_group_span(group)));
- if ((sd->flags & SD_OVERLAP) &&
+ if ((sd->flags & SD_NUMA) &&
!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
printk(KERN_CONT " mask=%*pbl",
cpumask_pr_args(group_balance_mask(group)));
@@ -1344,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
* "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
* which is shared by all the overlapping groups.
*/
- WARN_ON_ONCE(sd->flags & SD_OVERLAP);
+ WARN_ON_ONCE(sd->flags & SD_NUMA);
sg = sd->groups;
if (cpu != sg->asym_prefer_cpu) {
@@ -2016,7 +2016,6 @@ void sched_init_numa(int offline_node)
for (j = 1; j < nr_levels; i++, j++) {
tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
tl[i].numa_level = j;
- tl[i].flags = SDTL_OVERLAP;
}
sched_domain_topology_saved = sched_domain_topology;
@@ -2327,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sd) {
sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
+ if (sd && (sd->flags & SD_NUMA))
free_sched_groups(sd->groups, 0);
kfree(*per_cpu_ptr(sdd->sd, j));
}
@@ -2393,9 +2392,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
id_seen = sched_domains_tmpmask2;
for_each_sd_topology(tl) {
+ int tl_common_flags = 0;
+
+ if (tl->sd_flags)
+ tl_common_flags = (*tl->sd_flags)();
/* NUMA levels are allowed to overlap */
- if (tl->flags & SDTL_OVERLAP)
+ if (tl_common_flags & SD_NUMA)
continue;
cpumask_clear(covered);
@@ -2466,8 +2469,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP)
- sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
@@ -2480,7 +2481,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
- if (sd->flags & SD_OVERLAP) {
+ if (sd->flags & SD_NUMA) {
if (build_overlap_sched_groups(sd, i))
goto error;
} else {
---
We can also keep SD_OVERLAP and only remove SDTL_OVERLAP and tl->flags
if that is preferred, or keep both if you see future !NUMA use cases
for overlapping domains.
> }
>
> sched_domain_topology_saved = sched_domain_topology;
--
Thanks and Regards,
Prateek
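(A note on why keying on SD_NUMA suffices for the topology_span_sane()
hunk above: the NUMA levels' flags callback already reports it --
cpu_numa_flags() in include/linux/sched/topology.h is simply:)

#ifdef CONFIG_NUMA
static inline int cpu_numa_flags(void)
{
	/* the only flag the NUMA topology levels' callback returns */
	return SD_NUMA;
}
#endif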
On 11/07/25 11:20, K Prateek Nayak wrote:
> Tangential discussion: I was looking at this and was wondering why we
> need a "tl->flags" when there is already sd_flags() function and we can
> simply add SD_OVERLAP to sd_numa_flags().
>
> I think "tl->flags" was needed when the idea of overlap domains was
> added in commit e3589f6c81e4 ("sched: Allow for overlapping sched_domain
> spans") when it depended on "FORCE_SD_OVERLAP" sched_feat() which
> allowed toggling this off but that was done away with in commit
> af85596c74de ("sched/topology: Remove FORCE_SD_OVERLAP") so perhaps we
> can get rid of it now?
>
> Relying on SD_NUMA should be enough currently. Peter, Valentin, what do
> you think of something like below?
>
I remember asking myself the same question when I mucked about with the
SD flags; I ended up convincing myself to let it be, but I couldn't
find any note as to why. Looking at things in their current state, I
agree with you, we could just bin it.
On Fri, Jul 11, 2025 at 11:20:30AM +0530, K Prateek Nayak wrote:
> On 7/10/2025 4:27 PM, Li Chen wrote:
> > /*
> > * .. and append 'j' levels of NUMA goodness.
> > */
> > for (j = 1; j < nr_levels; i++, j++) {
> > - tl[i] = (struct sched_domain_topology_level){
> > - .mask = sd_numa_mask,
> > - .sd_flags = cpu_numa_flags,
> > - .flags = SDTL_OVERLAP,
> > - .numa_level = j,
> > - SD_INIT_NAME(NUMA)
> > - };
> > + tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
> > + tl[i].numa_level = j;
> > + tl[i].flags = SDTL_OVERLAP;
>
> Tangential discussion: I was looking at this and was wondering why we
> need a "tl->flags" when there is already sd_flags() function and we can
> simply add SD_OVERLAP to sd_numa_flags().
>
> I think "tl->flags" was needed when the idea of overlap domains was
> added in commit e3589f6c81e4 ("sched: Allow for overlapping sched_domain
> spans") when it depended on "FORCE_SD_OVERLAP" sched_feat() which
> allowed toggling this off but that was done away with in commit
> af85596c74de ("sched/topology: Remove FORCE_SD_OVERLAP") so perhaps we
> can get rid of it now?
>
> Relying on SD_NUMA should be enough currently. Peter, Valentin, what do
> you think of something like below?
I think you're right. SD_NUMA appears to be the one and only case that
also has SDTL_OVERLAP which then results in setting SD_OVERLAP, making
SD_NUMA and SD_OVERLAP equivalent and SDTL_OVERLAP redundant.
I'll presume you're okay with me adding your SoB to things, and I'll
push out all 5 patches to queue/sched/core to let the robots have a go
at things.
(trimming the cc to only kernel/sched folks to reduce the noise)
On 7/11/2025 6:36 PM, Peter Zijlstra wrote:
> On Fri, Jul 11, 2025 at 11:20:30AM +0530, K Prateek Nayak wrote:
>> On 7/10/2025 4:27 PM, Li Chen wrote:
>>> /*
>>> * .. and append 'j' levels of NUMA goodness.
>>> */
>>> for (j = 1; j < nr_levels; i++, j++) {
>>> - tl[i] = (struct sched_domain_topology_level){
>>> - .mask = sd_numa_mask,
>>> - .sd_flags = cpu_numa_flags,
>>> - .flags = SDTL_OVERLAP,
>>> - .numa_level = j,
>>> - SD_INIT_NAME(NUMA)
>>> - };
>>> + tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
>>> + tl[i].numa_level = j;
>>> + tl[i].flags = SDTL_OVERLAP;
>>
>> Tangential discussion: I was looking at this and was wondering why we
>> need a "tl->flags" when there is already sd_flags() function and we can
>> simply add SD_OVERLAP to sd_numa_flags().
>>
>> I think "tl->flags" was needed when the idea of overlap domains was
>> added in commit e3589f6c81e4 ("sched: Allow for overlapping sched_domain
>> spans") when it depended on "FORCE_SD_OVERLAP" sched_feat() which
>> allowed toggling this off but that was done away with in commit
>> af85596c74de ("sched/topology: Remove FORCE_SD_OVERLAP") so perhaps we
>> can get rid of it now?
>>
>> Relying on SD_NUMA should be enough currently. Peter, Valentin, what do
>> you think of something like below?
>
> I think you're right. SD_NUMA appears to be the one and only case that
> also has SDTL_OVERLAP which then results in setting SD_OVERLAP, making
> SD_NUMA and SD_OVERLAP equivalent and SDTL_OVERLAP redundant.
>
> I'll presume you're okay with me adding your SoB to things, and I'll
> push out all 5 patches to queue/sched/core to let the robots have a go
> at things.
Works for me! If you need a formal commit message:
Support for overlapping domains added in commit e3589f6c81e4 ("sched:
Allow for overlapping sched_domain spans") also allowed forcefully
setting SD_OVERLAP for !NUMA domains via FORCE_SD_OVERLAP sched_feat().
Since NUMA domains had to be presumed overlapping to ensure correct
behavior, "sched_domain_topology_level::flags" was introduced. NUMA
domains added the SDTL_OVERLAP flag, which ensured SD_OVERLAP was always
added during build_sched_domains() for these domains, even when
FORCE_SD_OVERLAP was off.
The condition for adding the SD_OVERLAP flag in the aforementioned
commit was as follows:
	if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
		sd->flags |= SD_OVERLAP;
The FORCE_SD_OVERLAP debug feature was removed in commit af85596c74de
("sched/topology: Remove FORCE_SD_OVERLAP") which left the NUMA domains
as the exclusive users of SDTL_OVERLAP, SD_OVERLAP, and SD_NUMA flags.
Get rid of SDTL_OVERLAP and SD_OVERLAP as they have become redundant
and instead rely on SD_NUMA to detect the only overlapping domain
currently supported. Since SDTL_OVERLAP was the only user of
"tl->flags", get rid of "sched_domain_topology_level::flags" too.
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
P.S. Are we still considering the following for the v6.16 cycle?
https://lore.kernel.org/lkml/20250709161917.14298-1-kprateek.nayak@amd.com/
If not, I can rebase it on top of queue:sched/core and send it out with
the conflicts resolved to save you a couple of edits :)
--
Thanks and Regards,
Prateek
On Mon, Jul 14, 2025 at 09:33:42AM +0530, K Prateek Nayak wrote:
> Works for me! If you need a formal commit message:

Thanks, much better than the badly edited thing I put in place.

> P.S. Are we still considering the following for the v6.16 cycle?
> https://lore.kernel.org/lkml/20250709161917.14298-1-kprateek.nayak@amd.com/
>
> If not, I can rebase it on top of queue:sched/core and send it out with
> the conflicts resolved to save you a couple of edits :)

Already done, about to push out.
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 1eec89a671413ce38df9fe9e70f5130a9eb79a59
Gitweb: https://git.kernel.org/tip/1eec89a671413ce38df9fe9e70f5130a9eb79a59
Author: K Prateek Nayak <kprateek.nayak@amd.com>
AuthorDate: Fri, 11 Jul 2025 11:20:30 +05:30
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 14 Jul 2025 10:59:35 +02:00
sched/topology: Remove sched_domain_topology_level::flags
Support for overlapping domains added in commit e3589f6c81e4 ("sched:
Allow for overlapping sched_domain spans") also allowed forcefully
setting SD_OVERLAP for !NUMA domains via FORCE_SD_OVERLAP sched_feat().
Since NUMA domains had to be presumed overlapping to ensure correct
behavior, "sched_domain_topology_level::flags" was introduced. NUMA
domains added the SDTL_OVERLAP flag, which ensured SD_OVERLAP was always
added during build_sched_domains() for these domains, even when
FORCE_SD_OVERLAP was off.
The condition for adding the SD_OVERLAP flag in the aforementioned
commit was as follows:
	if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
		sd->flags |= SD_OVERLAP;
The FORCE_SD_OVERLAP debug feature was removed in commit af85596c74de
("sched/topology: Remove FORCE_SD_OVERLAP") which left the NUMA domains
as the exclusive users of SDTL_OVERLAP, SD_OVERLAP, and SD_NUMA flags.
Get rid of SDTL_OVERLAP and SD_OVERLAP as they have become redundant
and instead rely on SD_NUMA to detect the only overlapping domain
currently supported. Since SDTL_OVERLAP was the only user of
"tl->flags", get rid of "sched_domain_topology_level::flags" too.
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/ba4dbdf8-bc37-493d-b2e0-2efb00ea3e19@amd.com
---
include/linux/sched/sd_flags.h | 8 --------
include/linux/sched/topology.h | 3 ---
kernel/sched/fair.c | 6 +++---
kernel/sched/topology.c | 19 ++++++++++---------
4 files changed, 13 insertions(+), 23 deletions(-)
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index b04a5d0..42839cf 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -154,14 +154,6 @@ SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)
SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)
/*
- * sched_groups of this level overlap
- *
- * SHARED_PARENT: Set for all NUMA levels above NODE.
- * NEEDS_GROUPS: Overlaps can only exist with more than one group.
- */
-SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
-
-/*
* Cross-node balancing
*
* SHARED_PARENT: Set for all NUMA levels above NODE.
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 0d5daaa..5263746 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -175,8 +175,6 @@ bool cpus_share_resources(int this_cpu, int that_cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);
-#define SDTL_OVERLAP 0x01
-
struct sd_data {
struct sched_domain *__percpu *sd;
struct sched_domain_shared *__percpu *sds;
@@ -187,7 +185,6 @@ struct sd_data {
struct sched_domain_topology_level {
sched_domain_mask_f mask;
sched_domain_flags_f sd_flags;
- int flags;
int numa_level;
struct sd_data data;
char *name;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 20a8456..b9b4bbb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9926,9 +9926,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
min_capacity = ULONG_MAX;
max_capacity = 0;
- if (child->flags & SD_OVERLAP) {
+ if (child->flags & SD_NUMA) {
/*
- * SD_OVERLAP domains cannot assume that child groups
+ * SD_NUMA domains cannot assume that child groups
* span the current group.
*/
@@ -9941,7 +9941,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
} else {
/*
- * !SD_OVERLAP domains can assume that child groups
+ * !SD_NUMA domains can assume that child groups
* span the current group.
*/
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d01f5a4..977e133 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -89,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!(sd->flags & SD_OVERLAP) &&
+ if (!(sd->flags & SD_NUMA) &&
cpumask_intersects(groupmask, sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
@@ -102,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
group->sgc->id,
cpumask_pr_args(sched_group_span(group)));
- if ((sd->flags & SD_OVERLAP) &&
+ if ((sd->flags & SD_NUMA) &&
!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
printk(KERN_CONT " mask=%*pbl",
cpumask_pr_args(group_balance_mask(group)));
@@ -1344,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
* "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
* which is shared by all the overlapping groups.
*/
- WARN_ON_ONCE(sd->flags & SD_OVERLAP);
+ WARN_ON_ONCE(sd->flags & SD_NUMA);
sg = sd->groups;
if (cpu != sg->asym_prefer_cpu) {
@@ -2016,7 +2016,6 @@ void sched_init_numa(int offline_node)
for (j = 1; j < nr_levels; i++, j++) {
tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
tl[i].numa_level = j;
- tl[i].flags = SDTL_OVERLAP;
}
sched_domain_topology_saved = sched_domain_topology;
@@ -2327,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sd) {
sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
+ if (sd && (sd->flags & SD_NUMA))
free_sched_groups(sd->groups, 0);
kfree(*per_cpu_ptr(sdd->sd, j));
}
@@ -2393,9 +2392,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
id_seen = sched_domains_tmpmask2;
for_each_sd_topology(tl) {
+ int tl_common_flags = 0;
+
+ if (tl->sd_flags)
+ tl_common_flags = (*tl->sd_flags)();
/* NUMA levels are allowed to overlap */
- if (tl->flags & SDTL_OVERLAP)
+ if (tl_common_flags & SD_NUMA)
continue;
cpumask_clear(covered);
@@ -2466,8 +2469,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP)
- sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
@@ -2480,7 +2481,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
- if (sd->flags & SD_OVERLAP) {
+ if (sd->flags & SD_NUMA) {
if (build_overlap_sched_groups(sd, i))
goto error;
} else {
The following commit has been merged into the sched/core branch of tip:
Commit-ID: e075f4360931263f5ec006ea5dadc065e5e98eb8
Gitweb: https://git.kernel.org/tip/e075f4360931263f5ec006ea5dadc065e5e98eb8
Author: Li Chen <chenl311@chinatelecom.cn>
AuthorDate: Thu, 10 Jul 2025 18:57:07 +08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 14 Jul 2025 10:59:34 +02:00
smpboot: introduce SDTL_INIT() helper to tidy sched topology setup
Define a small SDTL_INIT(maskfn, flagsfn, name) macro and use it to build the
sched_domain_topology_level array. Purely a cleanup; behaviour is unchanged.
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20250710105715.66594-2-me@linux.beauty
---
arch/powerpc/kernel/smp.c | 25 ++++++++++---------------
arch/s390/kernel/topology.c | 10 +++++-----
arch/x86/kernel/smpboot.c | 21 ++++++---------------
include/linux/sched/topology.h | 4 ++--
kernel/sched/topology.c | 24 ++++++++----------------
5 files changed, 31 insertions(+), 53 deletions(-)
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5ac7084..f59e4b9 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1700,28 +1700,23 @@ static void __init build_sched_topology(void)
#ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
- };
+ powerpc_topology[i++] =
+ SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
} else {
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
- };
+ powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
}
#endif
if (shared_caches) {
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE)
- };
+ powerpc_topology[i++] =
+ SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE);
}
+
if (has_coregroup_support()) {
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC)
- };
+ powerpc_topology[i++] =
+ SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC);
}
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG)
- };
+
+ powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG);
/* There must be one trailing NULL entry left. */
BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 3df048e..46569b8 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -531,11 +531,11 @@ static const struct cpumask *cpu_drawer_mask(int cpu)
}
static struct sched_domain_topology_level s390_topology[] = {
- { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
- { cpu_book_mask, SD_INIT_NAME(BOOK) },
- { cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
- { cpu_cpu_mask, SD_INIT_NAME(PKG) },
+ SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT),
+ SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
+ SDTL_INIT(cpu_book_mask, NULL, BOOK),
+ SDTL_INIT(cpu_drawer_mask, NULL, DRAWER),
+ SDTL_INIT(cpu_cpu_mask, NULL, PKG),
{ NULL, },
};
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fc78c23..e0adf75 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -485,35 +485,26 @@ static void __init build_sched_topology(void)
int i = 0;
#ifdef CONFIG_SCHED_SMT
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
- };
+ x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT);
#endif
#ifdef CONFIG_SCHED_CLUSTER
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
- };
+ x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS);
#endif
#ifdef CONFIG_SCHED_MC
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
- };
+ x86_topology[i++] = SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC);
#endif
/*
* When there is NUMA topology inside the package skip the PKG domain
* since the NUMA domains will auto-magically create the right spanning
* domains based on the SLIT.
*/
- if (!x86_has_numa_in_package) {
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG)
- };
- }
+ if (!x86_has_numa_in_package)
+ x86_topology[i++] = SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG);
/*
* There must be one trailing NULL entry left.
*/
- BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+ BUG_ON(i >= ARRAY_SIZE(x86_topology) - 1);
set_sched_topology(x86_topology);
}
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index e54e7fa..0d5daaa 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -196,8 +196,8 @@ struct sched_domain_topology_level {
extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio);
-
-# define SD_INIT_NAME(type) .name = #type
+#define SDTL_INIT(maskfn, flagsfn, dname) ((struct sched_domain_topology_level) \
+ { .mask = maskfn, .sd_flags = flagsfn, .name = #dname })
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
extern void rebuild_sched_domains_energy(void);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8e06b1d..d01f5a4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1737,17 +1737,17 @@ sd_init(struct sched_domain_topology_level *tl,
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
#endif
#ifdef CONFIG_SCHED_CLUSTER
- { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+ SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
#endif
- { cpu_cpu_mask, SD_INIT_NAME(PKG) },
+ SDTL_INIT(cpu_cpu_mask, NULL, PKG),
{ NULL, },
};
@@ -2008,23 +2008,15 @@ void sched_init_numa(int offline_node)
/*
* Add the NUMA identity distance, aka single NODE.
*/
- tl[i++] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .numa_level = 0,
- SD_INIT_NAME(NODE)
- };
+ tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE);
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 1; j < nr_levels; i++, j++) {
- tl[i] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .sd_flags = cpu_numa_flags,
- .flags = SDTL_OVERLAP,
- .numa_level = j,
- SD_INIT_NAME(NUMA)
- };
+ tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
+ tl[i].numa_level = j;
+ tl[i].flags = SDTL_OVERLAP;
}
sched_domain_topology_saved = sched_domain_topology;