[PATCH v4 1/2] sched: Create architecture specific sched domain distances

Tim Chen posted 2 patches 1 week, 5 days ago
[PATCH v4 1/2] sched: Create architecture specific sched domain distances
Posted by Tim Chen 1 week, 5 days ago
Allow architecture specific sched domain NUMA distances that are
modified from actual NUMA node distances for the purpose of building
NUMA sched domains.

Keep actual NUMA distances separately if modified distances
are used for building sched domains. Such distances
are still needed as NUMA balancing benefits from finding the
NUMA nodes that are actually closer to a task numa_group.

Consolidate the recording of unique NUMA distances in an array to
sched_record_numa_dist() so the function can be reused to record NUMA
distances when the NUMA distance metric is changed.

No functional change and additional distance array
allocated if there're no arch specific NUMA distances
being defined.

Co-developed-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 include/linux/sched/topology.h |   1 +
 kernel/sched/topology.c        | 117 ++++++++++++++++++++++++++-------
 2 files changed, 96 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 5263746b63e8..2d2d29553df8 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -56,6 +56,7 @@ static inline int cpu_numa_flags(void)
 {
 	return SD_NUMA;
 }
+extern int arch_sched_node_distance(int from, int to);
 #endif
 
 extern int arch_asym_cpu_priority(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6e2f54169e66..f25e4402c63e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1591,10 +1591,12 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 enum numa_topology_type sched_numa_topology_type;
 
 static int			sched_domains_numa_levels;
+static int			sched_numa_node_levels;
 static int			sched_domains_curr_level;
 
 int				sched_max_numa_distance;
 static int			*sched_domains_numa_distance;
+static int			*sched_numa_node_distance;
 static struct cpumask		***sched_domains_numa_masks;
 #endif /* CONFIG_NUMA */
 
@@ -1808,10 +1810,10 @@ bool find_numa_distance(int distance)
 		return true;
 
 	rcu_read_lock();
-	distances = rcu_dereference(sched_domains_numa_distance);
+	distances = rcu_dereference(sched_numa_node_distance);
 	if (!distances)
 		goto unlock;
-	for (i = 0; i < sched_domains_numa_levels; i++) {
+	for (i = 0; i < sched_numa_node_levels; i++) {
 		if (distances[i] == distance) {
 			found = true;
 			break;
@@ -1887,14 +1889,48 @@ static void init_numa_topology_type(int offline_node)
 
 #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
 
-void sched_init_numa(int offline_node)
+/*
+ * An architecture could modify its NUMA distance, to change
+ * grouping of NUMA nodes and number of NUMA levels when creating
+ * NUMA level sched domains.
+ *
+ * A NUMA level is created for each unique
+ * arch_sched_node_distance.
+ */
+static bool __modified_sched_node_dist = true;
+
+int __weak arch_sched_node_distance(int from, int to)
 {
-	struct sched_domain_topology_level *tl;
-	unsigned long *distance_map;
+	if (__modified_sched_node_dist)
+		__modified_sched_node_dist = false;
+
+	return node_distance(from, to);
+}
+
+static bool modified_sched_node_distance(void)
+{
+	/*
+	 * Call arch_sched_node_distance()
+	 * to determine if arch_sched_node_distance
+	 * has been modified from node_distance()
+	 * to arch specific distance.
+	 */
+	arch_sched_node_distance(0, 0);
+	return __modified_sched_node_dist;
+}
+
+static int numa_node_dist(int i, int j)
+{
+	return node_distance(i, j);
+}
+
+static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
+				  int **dist, int *levels)
+{
+	unsigned long *distance_map __free(bitmap) = NULL;
 	int nr_levels = 0;
 	int i, j;
 	int *distances;
-	struct cpumask ***masks;
 
 	/*
 	 * O(nr_nodes^2) de-duplicating selection sort -- in order to find the
@@ -1902,17 +1938,16 @@ void sched_init_numa(int offline_node)
 	 */
 	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
 	if (!distance_map)
-		return;
+		return -ENOMEM;
 
 	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
 	for_each_cpu_node_but(i, offline_node) {
 		for_each_cpu_node_but(j, offline_node) {
-			int distance = node_distance(i, j);
+			int distance = n_dist(i, j);
 
 			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
 				sched_numa_warn("Invalid distance value range");
-				bitmap_free(distance_map);
-				return;
+				return -EINVAL;
 			}
 
 			bitmap_set(distance_map, distance, 1);
@@ -1925,18 +1960,46 @@ void sched_init_numa(int offline_node)
 	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
 
 	distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
-	if (!distances) {
-		bitmap_free(distance_map);
-		return;
-	}
+	if (!distances)
+		return -ENOMEM;
 
 	for (i = 0, j = 0; i < nr_levels; i++, j++) {
 		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
 		distances[i] = j;
 	}
-	rcu_assign_pointer(sched_domains_numa_distance, distances);
+	*dist = distances;
+	*levels = nr_levels;
+
+	return 0;
+}
+
+void sched_init_numa(int offline_node)
+{
+	struct sched_domain_topology_level *tl;
+	int nr_levels, nr_node_levels;
+	int i, j;
+	int *distances, *domain_distances;
+	struct cpumask ***masks;
+
+	/* Record the NUMA distances from SLIT table */
+	if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
+				   &nr_node_levels))
+		return;
 
-	bitmap_free(distance_map);
+	/* Record modified NUMA distances for building sched domains */
+	if (modified_sched_node_distance()) {
+		if (sched_record_numa_dist(offline_node, arch_sched_node_distance,
+					   &domain_distances, &nr_levels)) {
+			kfree(distances);
+			return;
+		}
+	} else {
+		domain_distances = distances;
+		nr_levels = nr_node_levels;
+	}
+	rcu_assign_pointer(sched_numa_node_distance, distances);
+	WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]);
+	WRITE_ONCE(sched_numa_node_levels, nr_node_levels);
 
 	/*
 	 * 'nr_levels' contains the number of unique distances
@@ -1954,6 +2017,8 @@ void sched_init_numa(int offline_node)
 	 *
 	 * We reset it to 'nr_levels' at the end of this function.
 	 */
+	rcu_assign_pointer(sched_domains_numa_distance, domain_distances);
+
 	sched_domains_numa_levels = 0;
 
 	masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
@@ -1979,10 +2044,13 @@ void sched_init_numa(int offline_node)
 			masks[i][j] = mask;
 
 			for_each_cpu_node_but(k, offline_node) {
-				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+				if (sched_debug() &&
+				    (arch_sched_node_distance(j, k) !=
+				     arch_sched_node_distance(k, j)))
 					sched_numa_warn("Node-distance not symmetric");
 
-				if (node_distance(j, k) > sched_domains_numa_distance[i])
+				if (arch_sched_node_distance(j, k) >
+				    sched_domains_numa_distance[i])
 					continue;
 
 				cpumask_or(mask, mask, cpumask_of_node(k));
@@ -2022,7 +2090,6 @@ void sched_init_numa(int offline_node)
 	sched_domain_topology = tl;
 
 	sched_domains_numa_levels = nr_levels;
-	WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
 
 	init_numa_topology_type(offline_node);
 }
@@ -2030,14 +2097,18 @@ void sched_init_numa(int offline_node)
 
 static void sched_reset_numa(void)
 {
-	int nr_levels, *distances;
+	int nr_levels, *distances, *dom_distances = NULL;
 	struct cpumask ***masks;
 
 	nr_levels = sched_domains_numa_levels;
+	sched_numa_node_levels = 0;
 	sched_domains_numa_levels = 0;
 	sched_max_numa_distance = 0;
 	sched_numa_topology_type = NUMA_DIRECT;
-	distances = sched_domains_numa_distance;
+	distances = sched_numa_node_distance;
+	if (sched_numa_node_distance != sched_domains_numa_distance)
+		dom_distances = sched_domains_numa_distance;
+	rcu_assign_pointer(sched_numa_node_distance, NULL);
 	rcu_assign_pointer(sched_domains_numa_distance, NULL);
 	masks = sched_domains_numa_masks;
 	rcu_assign_pointer(sched_domains_numa_masks, NULL);
@@ -2046,6 +2117,7 @@ static void sched_reset_numa(void)
 
 		synchronize_rcu();
 		kfree(distances);
+		kfree(dom_distances);
 		for (i = 0; i < nr_levels && masks; i++) {
 			if (!masks[i])
 				continue;
@@ -2092,7 +2164,8 @@ void sched_domains_numa_masks_set(unsigned int cpu)
 				continue;
 
 			/* Set ourselves in the remote node's masks */
-			if (node_distance(j, node) <= sched_domains_numa_distance[i])
+			if (arch_sched_node_distance(j, node) <=
+			    sched_domains_numa_distance[i])
 				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
 		}
 	}
-- 
2.32.0
Re: [PATCH v4 1/2] sched: Create architecture specific sched domain distances
Posted by Chen, Yu C 4 days, 17 hours ago
On 9/20/2025 1:50 AM, Tim Chen wrote:
> Allow architecture specific sched domain NUMA distances that are
> modified from actual NUMA node distances for the purpose of building
> NUMA sched domains.
> 
> Keep actual NUMA distances separately if modified distances
> are used for building sched domains. Such distances
> are still needed as NUMA balancing benefits from finding the
> NUMA nodes that are actually closer to a task numa_group.
> 
> Consolidate the recording of unique NUMA distances in an array to
> sched_record_numa_dist() so the function can be reused to record NUMA
> distances when the NUMA distance metric is changed.
> 
> No functional change and additional distance array
> allocated if there're no arch specific NUMA distances
> being defined.
> 
> Co-developed-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>

[snip]

> @@ -1591,10 +1591,12 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
>   enum numa_topology_type sched_numa_topology_type;
>   
>   static int			sched_domains_numa_levels;
> +static int			sched_numa_node_levels;

I agree that the benefit of maintaining two NUMA distances - one for the
sched_domain and another for the NUMA balancing/page allocation policy - is
to avoid complicating the sched_domain hierarchy while preserving the
advantages of NUMA locality.

Meanwhile, I wonder if we could also add a "orig" prefix to the original
NUMA distance. This way, we can quickly understand its meaning later.
For example,
sched_orig_node_levels
sched_orig_node_distance

>   static int			sched_domains_curr_level;
>   
>   int				sched_max_numa_distance;
>   static int			*sched_domains_numa_distance;
> +static int			*sched_numa_node_distance;
>   static struct cpumask		***sched_domains_numa_masks;
>   #endif /* CONFIG_NUMA */
>   
> @@ -1808,10 +1810,10 @@ bool find_numa_distance(int distance)
>   		return true;
>   
>   	rcu_read_lock();
> -	distances = rcu_dereference(sched_domains_numa_distance);
> +	distances = rcu_dereference(sched_numa_node_distance);
>   	if (!distances)
>   		goto unlock;
> -	for (i = 0; i < sched_domains_numa_levels; i++) {
> +	for (i = 0; i < sched_numa_node_levels; i++) {
>   		if (distances[i] == distance) {
>   			found = true;
>   			break;
> @@ -1887,14 +1889,48 @@ static void init_numa_topology_type(int offline_node)
>   
>   #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
>   
> -void sched_init_numa(int offline_node)
> +/*
> + * An architecture could modify its NUMA distance, to change
> + * grouping of NUMA nodes and number of NUMA levels when creating
> + * NUMA level sched domains.
> + *
> + * A NUMA level is created for each unique
> + * arch_sched_node_distance.
> + */
> +static bool __modified_sched_node_dist = true;
> +
> +int __weak arch_sched_node_distance(int from, int to)
>   {
> -	struct sched_domain_topology_level *tl;
> -	unsigned long *distance_map;
> +	if (__modified_sched_node_dist)
> +		__modified_sched_node_dist = false;
> +
> +	return node_distance(from, to);
> +}
> +
> +static bool modified_sched_node_distance(void)
> +{
> +	/*
> +	 * Call arch_sched_node_distance()
> +	 * to determine if arch_sched_node_distance
> +	 * has been modified from node_distance()
> +	 * to arch specific distance.
> +	 */
> +	arch_sched_node_distance(0, 0);
> +	return __modified_sched_node_dist;
> +}
> +

If our goal is to figure out whether the arch_sched_node_distance()
has been overridden, how about the following alias?

int __weak arch_sched_node_distance(int from, int to)
{
	return __node_distance(from, to);
}
int arch_sched_node_distance_original(int from, int to) __weak 
__alias(arch_sched_node_distance);

static bool arch_sched_node_distance_is_overridden(void)
{
	return arch_sched_node_distance != arch_sched_node_distance_original;
}

so arch_sched_node_distance_is_overridden() can replace 
modified_sched_node_distance()

thanks,
Chenyu
Re: [PATCH v4 1/2] sched: Create architecture specific sched domain distances
Posted by Tim Chen 2 days, 7 hours ago
On Sat, 2025-09-27 at 20:34 +0800, Chen, Yu C wrote:
> [snip]
> 
> > @@ -1591,10 +1591,12 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
> >   enum numa_topology_type sched_numa_topology_type;
> >   
> >   static int			sched_domains_numa_levels;
> > +static int			sched_numa_node_levels;
> 
> I agree that the benefit of maintaining two NUMA distances - one for the
> sched_domain and another for the NUMA balancing/page allocation policy - is
> to avoid complicating the sched_domain hierarchy while preserving the
> advantages of NUMA locality.
> 
> Meanwhile, I wonder if we could also add a "orig" prefix to the original
> NUMA distance. This way, we can quickly understand its meaning later.
> For example,
> sched_orig_node_levels
> sched_orig_node_distance

I am not sure adding orig will make the meaning any clearer.
I can add comments to note that

sched_numa_node_distance mean the node distance between numa nodes
sched_numa_nodel_levels  mean the number of unique distances between numa nodes

> 
> >   static int			sched_domains_curr_level;
> >   
> >   int				sched_max_numa_distance;
> >   static int			*sched_domains_numa_distance;
> > +static int			*sched_numa_node_distance;
> >   static struct cpumask		***sched_domains_numa_masks;
> >   #endif /* CONFIG_NUMA */
> >   
> > @@ -1808,10 +1810,10 @@ bool find_numa_distance(int distance)
> >   		return true;
> >   
> >   	rcu_read_lock();
> > -	distances = rcu_dereference(sched_domains_numa_distance);
> > +	distances = rcu_dereference(sched_numa_node_distance);
> >   	if (!distances)
> >   		goto unlock;
> > -	for (i = 0; i < sched_domains_numa_levels; i++) {
> > +	for (i = 0; i < sched_numa_node_levels; i++) {
> >   		if (distances[i] == distance) {
> >   			found = true;
> >   			break;
> > @@ -1887,14 +1889,48 @@ static void init_numa_topology_type(int offline_node)
> >   
> >   #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
> >   
> > -void sched_init_numa(int offline_node)
> > +/*
> > + * An architecture could modify its NUMA distance, to change
> > + * grouping of NUMA nodes and number of NUMA levels when creating
> > + * NUMA level sched domains.
> > + *
> > + * A NUMA level is created for each unique
> > + * arch_sched_node_distance.
> > + */
> > +static bool __modified_sched_node_dist = true;
> > +
> > +int __weak arch_sched_node_distance(int from, int to)
> >   {
> > -	struct sched_domain_topology_level *tl;
> > -	unsigned long *distance_map;
> > +	if (__modified_sched_node_dist)
> > +		__modified_sched_node_dist = false;
> > +
> > +	return node_distance(from, to);
> > +}
> > +
> > +static bool modified_sched_node_distance(void)
> > +{
> > +	/*
> > +	 * Call arch_sched_node_distance()
> > +	 * to determine if arch_sched_node_distance
> > +	 * has been modified from node_distance()
> > +	 * to arch specific distance.
> > +	 */
> > +	arch_sched_node_distance(0, 0);
> > +	return __modified_sched_node_dist;
> > +}
> > +
> 
> If our goal is to figure out whether the arch_sched_node_distance()
> has been overridden, how about the following alias?
> 
> int __weak arch_sched_node_distance(int from, int to)
> {
> 	return __node_distance(from, to);
> }
> int arch_sched_node_distance_original(int from, int to) __weak 
> __alias(arch_sched_node_distance);
> 
> static bool arch_sched_node_distance_is_overridden(void)
> {
> 	return arch_sched_node_distance != arch_sched_node_distance_original;
> }
> 
> so arch_sched_node_distance_is_overridden() can replace 
> modified_sched_node_distance()
> 

I think that the alias version will still point to the replaced function and not
the originally defined one.

How about not using __weak and just explicitly define arch_sched_node_distance
as a function pointer.  Change the code like below.

Tim

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d6b772990ec2..12db78af09d5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -545,7 +545,7 @@ static int avg_remote_numa_distance(void)
 	return sched_avg_remote_distance;
 }
 
-int arch_sched_node_distance(int from, int to)
+static int x86_arch_sched_node_distance(int from, int to)
 {
 	int d = node_distance(from, to);
 
@@ -918,6 +918,9 @@ static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
 	/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
 	if (apic->wakeup_secondary_cpu_64)
 		start_ip = real_mode_header->trampoline_start64;
+#endif
+#ifdef CONFIG_NUMA
+	arch_sched_node_distance = x86_arch_sched_node_distance;
 #endif
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
 	initial_code = (unsigned long)start_secondary;
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2d2d29553df8..3549c4a19816 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -56,7 +56,7 @@ static inline int cpu_numa_flags(void)
 {
 	return SD_NUMA;
 }
-extern int arch_sched_node_distance(int from, int to);
+extern int (*arch_sched_node_distance)(int, int);
 #endif
 
 extern int arch_asym_cpu_priority(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f25e4402c63e..7cfb7422e9d4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1897,26 +1897,17 @@ static void init_numa_topology_type(int offline_node)
  * A NUMA level is created for each unique
  * arch_sched_node_distance.
  */
-static bool __modified_sched_node_dist = true;
 
-int __weak arch_sched_node_distance(int from, int to)
+static int default_sched_node_distance(int from, int to)
 {
-	if (__modified_sched_node_dist)
-		__modified_sched_node_dist = false;
-
 	return node_distance(from, to);
 }
 
+int (*arch_sched_node_distance)(int, int) = default_sched_node_distance;
+
 static bool modified_sched_node_distance(void)
 {
-	/*
-	 * Call arch_sched_node_distance()
-	 * to determine if arch_sched_node_distance
-	 * has been modified from node_distance()
-	 * to arch specific distance.
-	 */
-	arch_sched_node_distance(0, 0);
-	return __modified_sched_node_dist;
+	return arch_sched_node_distance != default_sched_node_distance;
 }
 
 static int numa_node_dist(int i, int j)
Re: [PATCH v4 1/2] sched: Create architecture specific sched domain distances
Posted by Chen, Yu C 2 days, 3 hours ago
On 9/30/2025 6:18 AM, Tim Chen wrote:
> On Sat, 2025-09-27 at 20:34 +0800, Chen, Yu C wrote:
>> [snip]
>>
>>> @@ -1591,10 +1591,12 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
>>>    enum numa_topology_type sched_numa_topology_type;
>>>    
>>>    static int			sched_domains_numa_levels;
>>> +static int			sched_numa_node_levels;
>>
>> I agree that the benefit of maintaining two NUMA distances - one for the
>> sched_domain and another for the NUMA balancing/page allocation policy - is
>> to avoid complicating the sched_domain hierarchy while preserving the
>> advantages of NUMA locality.
>>
>> Meanwhile, I wonder if we could also add a "orig" prefix to the original
>> NUMA distance. This way, we can quickly understand its meaning later.
>> For example,
>> sched_orig_node_levels
>> sched_orig_node_distance
> 
> I am not sure adding orig will make the meaning any clearer.
> I can add comments to note that
> 
> sched_numa_node_distance mean the node distance between numa nodes
> sched_numa_nodel_levels  mean the number of unique distances between numa nodes
> 

OK, looks good to me.

>>
>>>    static int			sched_domains_curr_level;
>>>    
>>>    int				sched_max_numa_distance;
>>>    static int			*sched_domains_numa_distance;
>>> +static int			*sched_numa_node_distance;
>>>    static struct cpumask		***sched_domains_numa_masks;
>>>    #endif /* CONFIG_NUMA */
>>>    
>>> @@ -1808,10 +1810,10 @@ bool find_numa_distance(int distance)
>>>    		return true;
>>>    
>>>    	rcu_read_lock();
>>> -	distances = rcu_dereference(sched_domains_numa_distance);
>>> +	distances = rcu_dereference(sched_numa_node_distance);
>>>    	if (!distances)
>>>    		goto unlock;
>>> -	for (i = 0; i < sched_domains_numa_levels; i++) {
>>> +	for (i = 0; i < sched_numa_node_levels; i++) {
>>>    		if (distances[i] == distance) {
>>>    			found = true;
>>>    			break;
>>> @@ -1887,14 +1889,48 @@ static void init_numa_topology_type(int offline_node)
>>>    
>>>    #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
>>>    
>>> -void sched_init_numa(int offline_node)
>>> +/*
>>> + * An architecture could modify its NUMA distance, to change
>>> + * grouping of NUMA nodes and number of NUMA levels when creating
>>> + * NUMA level sched domains.
>>> + *
>>> + * A NUMA level is created for each unique
>>> + * arch_sched_node_distance.
>>> + */
>>> +static bool __modified_sched_node_dist = true;
>>> +
>>> +int __weak arch_sched_node_distance(int from, int to)
>>>    {
>>> -	struct sched_domain_topology_level *tl;
>>> -	unsigned long *distance_map;
>>> +	if (__modified_sched_node_dist)
>>> +		__modified_sched_node_dist = false;
>>> +
>>> +	return node_distance(from, to);
>>> +}
>>> +
>>> +static bool modified_sched_node_distance(void)
>>> +{
>>> +	/*
>>> +	 * Call arch_sched_node_distance()
>>> +	 * to determine if arch_sched_node_distance
>>> +	 * has been modified from node_distance()
>>> +	 * to arch specific distance.
>>> +	 */
>>> +	arch_sched_node_distance(0, 0);
>>> +	return __modified_sched_node_dist;
>>> +}
>>> +
>>
>> If our goal is to figure out whether the arch_sched_node_distance()
>> has been overridden, how about the following alias?
>>
>> int __weak arch_sched_node_distance(int from, int to)
>> {
>> 	return __node_distance(from, to);
>> }
>> int arch_sched_node_distance_original(int from, int to) __weak
>> __alias(arch_sched_node_distance);
>>
>> static bool arch_sched_node_distance_is_overridden(void)
>> {
>> 	return arch_sched_node_distance != arch_sched_node_distance_original;
>> }
>>
>> so arch_sched_node_distance_is_overridden() can replace
>> modified_sched_node_distance()
>>
> 
> I think that the alias version will still point to the replaced function and not
> the originally defined one.
> 
> How about not using __weak and just explicitly define arch_sched_node_distance
> as a function pointer.  Change the code like below.
> 

The arch_sched_node_distance_original is defined as __weak, so it
should point to the old function even if the function has been
overridden. I did a test on a X86 VM and it seems to be so.
But using the arch_sched_node_distance as a function point
should also be OK.
> Tim
> 
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index d6b772990ec2..12db78af09d5 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -545,7 +545,7 @@ static int avg_remote_numa_distance(void)
>   	return sched_avg_remote_distance;
>   }
>   
> -int arch_sched_node_distance(int from, int to)
> +static int x86_arch_sched_node_distance(int from, int to)
>   {
>   	int d = node_distance(from, to);
>   
> @@ -918,6 +918,9 @@ static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
>   	/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
>   	if (apic->wakeup_secondary_cpu_64)
>   		start_ip = real_mode_header->trampoline_start64;
> +#endif
> +#ifdef CONFIG_NUMA
> +	arch_sched_node_distance = x86_arch_sched_node_distance;
>   #endif

Above might be called for several APs, maybe we can just call it
once in smp_prepare_cpus_common().

thanks,
Chenyu

>   	idle->thread.sp = (unsigned long)task_pt_regs(idle);
>   	initial_code = (unsigned long)start_secondary;
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 2d2d29553df8..3549c4a19816 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -56,7 +56,7 @@ static inline int cpu_numa_flags(void)
>   {
>   	return SD_NUMA;
>   }
> -extern int arch_sched_node_distance(int from, int to);
> +extern int (*arch_sched_node_distance)(int, int);
>   #endif
>   
>   extern int arch_asym_cpu_priority(int cpu);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index f25e4402c63e..7cfb7422e9d4 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1897,26 +1897,17 @@ static void init_numa_topology_type(int offline_node)
>    * A NUMA level is created for each unique
>    * arch_sched_node_distance.
>    */
> -static bool __modified_sched_node_dist = true;
>   
> -int __weak arch_sched_node_distance(int from, int to)
> +static int default_sched_node_distance(int from, int to)
>   {
> -	if (__modified_sched_node_dist)
> -		__modified_sched_node_dist = false;
> -
>   	return node_distance(from, to);
>   }
>   
> +int (*arch_sched_node_distance)(int, int) = default_sched_node_distance;
> +
>   static bool modified_sched_node_distance(void)
>   {
> -	/*
> -	 * Call arch_sched_node_distance()
> -	 * to determine if arch_sched_node_distance
> -	 * has been modified from node_distance()
> -	 * to arch specific distance.
> -	 */
> -	arch_sched_node_distance(0, 0);
> -	return __modified_sched_node_dist;
> +	return arch_sched_node_distance != default_sched_node_distance;
>   }
>   
>   static int numa_node_dist(int i, int j)
Re: [PATCH v4 1/2] sched: Create architecture specific sched domain distances
Posted by Tim Chen 1 day, 12 hours ago
On Tue, 2025-09-30 at 10:28 +0800, Chen, Yu C wrote:
> On 9/30/2025 6:18 AM, Tim Chen wrote:
> > On Sat, 2025-09-27 at 20:34 +0800, Chen, Yu C wrote:
> > > 

[snip]

> > > 
> > > If our goal is to figure out whether the arch_sched_node_distance()
> > > has been overridden, how about the following alias?
> > > 
> > > int __weak arch_sched_node_distance(int from, int to)
> > > {
> > > 	return __node_distance(from, to);
> > > }
> > > int arch_sched_node_distance_original(int from, int to) __weak
> > > __alias(arch_sched_node_distance);
> > > 
> > > static bool arch_sched_node_distance_is_overridden(void)
> > > {
> > > 	return arch_sched_node_distance != arch_sched_node_distance_original;
> > > }
> > > 
> > > so arch_sched_node_distance_is_overridden() can replace
> > > modified_sched_node_distance()
> > > 
> > 
> > I think that the alias version will still point to the replaced function and not
> > the originally defined one.
> > 
> > How about not using __weak and just explicitly define arch_sched_node_distance
> > as a function pointer.  Change the code like below.
> > 
> 
> The arch_sched_node_distance_original is defined as __weak, so it
> should point to the old function even if the function has been
> overridden. I did a test on a X86 VM and it seems to be so.
> But using the arch_sched_node_distance as a function point
> should also be OK.
> 

How about changing the code as follow. I think this change is cleaner.
I tested it in my VM and works for detecting sched distance substitution.
Thanks.

Tim

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f25e4402c63e..3dc941258df3 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1897,31 +1897,17 @@ static void init_numa_topology_type(int offline_node)
  * A NUMA level is created for each unique
  * arch_sched_node_distance.
  */
-static bool __modified_sched_node_dist = true;
-
-int __weak arch_sched_node_distance(int from, int to)
+static int numa_node_dist(int i, int j)
 {
-	if (__modified_sched_node_dist)
-		__modified_sched_node_dist = false;
-
-	return node_distance(from, to);
+	return node_distance(i, j);
 }
 
-static bool modified_sched_node_distance(void)
-{
-	/*
-	 * Call arch_sched_node_distance()
-	 * to determine if arch_sched_node_distance
-	 * has been modified from node_distance()
-	 * to arch specific distance.
-	 */
-	arch_sched_node_distance(0, 0);
-	return __modified_sched_node_dist;
-}
+int arch_sched_node_distance(int from, int to)
+			     __weak __alias(numa_node_dist);
 
-static int numa_node_dist(int i, int j)
+static bool modified_sched_node_distance(void)
 {
-	return node_distance(i, j);
+	return numa_node_dist != arch_sched_node_distance;
 }
 
 static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
Re: [PATCH v4 1/2] sched: Create architecture specific sched domain distances
Posted by Chen, Yu C 1 day, 5 hours ago
On 10/1/2025 1:30 AM, Tim Chen wrote:
> On Tue, 2025-09-30 at 10:28 +0800, Chen, Yu C wrote:
>> On 9/30/2025 6:18 AM, Tim Chen wrote:
>>> On Sat, 2025-09-27 at 20:34 +0800, Chen, Yu C wrote:
>>>>
> 
> [snip]
> 
>>>>
>>>> If our goal is to figure out whether the arch_sched_node_distance()
>>>> has been overridden, how about the following alias?
>>>>
>>>> int __weak arch_sched_node_distance(int from, int to)
>>>> {
>>>> 	return __node_distance(from, to);
>>>> }
>>>> int arch_sched_node_distance_original(int from, int to) __weak
>>>> __alias(arch_sched_node_distance);
>>>>
>>>> static bool arch_sched_node_distance_is_overridden(void)
>>>> {
>>>> 	return arch_sched_node_distance != arch_sched_node_distance_original;
>>>> }
>>>>
>>>> so arch_sched_node_distance_is_overridden() can replace
>>>> modified_sched_node_distance()
>>>>
>>>
>>> I think that the alias version will still point to the replaced function and not
>>> the originally defined one.
>>>
>>> How about not using __weak and just explicitly define arch_sched_node_distance
>>> as a function pointer.  Change the code like below.
>>>
>>
>> The arch_sched_node_distance_original is defined as __weak, so it
>> should point to the old function even if the function has been
>> overridden. I did a test on a X86 VM and it seems to be so.
>> But using the arch_sched_node_distance as a function point
>> should also be OK.
>>
> 
> How about changing the code as follow. I think this change is cleaner.
> I tested it in my VM and works for detecting sched distance substitution.
> Thanks.
> 

Yes, the following change looks good to me.

Thanks,
Chenyu> Tim
> 
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index f25e4402c63e..3dc941258df3 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1897,31 +1897,17 @@ static void init_numa_topology_type(int offline_node)
>    * A NUMA level is created for each unique
>    * arch_sched_node_distance.
>    */
> -static bool __modified_sched_node_dist = true;
> -
> -int __weak arch_sched_node_distance(int from, int to)
> +static int numa_node_dist(int i, int j)
>   {
> -	if (__modified_sched_node_dist)
> -		__modified_sched_node_dist = false;
> -
> -	return node_distance(from, to);
> +	return node_distance(i, j);
>   }
>   
> -static bool modified_sched_node_distance(void)
> -{
> -	/*
> -	 * Call arch_sched_node_distance()
> -	 * to determine if arch_sched_node_distance
> -	 * has been modified from node_distance()
> -	 * to arch specific distance.
> -	 */
> -	arch_sched_node_distance(0, 0);
> -	return __modified_sched_node_dist;
> -}
> +int arch_sched_node_distance(int from, int to)
> +			     __weak __alias(numa_node_dist);
>   
> -static int numa_node_dist(int i, int j)
> +static bool modified_sched_node_distance(void)
>   {
> -	return node_distance(i, j);
> +	return numa_node_dist != arch_sched_node_distance;
>   }
>   
>   static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
>