[PATCH v2 1/2] sched: Create architecture specific sched domain distances

Tim Chen posted 2 patches 4 days, 10 hours ago
[PATCH v2 1/2] sched: Create architecture specific sched domain distances
Posted by Tim Chen 4 days, 10 hours ago
Allow architectures to supply their own sched domain NUMA distances,
which may differ from the actual NUMA node distances, for the purpose
of building NUMA sched domains.

The actual NUMA node distances are kept separately.  This allows the
NUMA domain levels to be modified when building sched domains for
specific architectures.

Consolidate the recording of unique NUMA distances in an array to
sched_record_numa_dist() so the function can be reused to record NUMA
distances when the NUMA distance metric is changed.

No functional change when no arch specific NUMA distances are
defined.

Co-developed-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 include/linux/sched/topology.h |   2 +
 kernel/sched/topology.c        | 118 ++++++++++++++++++++++++++++-----
 2 files changed, 103 insertions(+), 17 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 5263746b63e8..4f58e78ca52e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -59,6 +59,8 @@ static inline int cpu_numa_flags(void)
 #endif
 
 extern int arch_asym_cpu_priority(int cpu);
+extern int arch_sched_node_distance(int from, int to);
+extern int sched_avg_remote_numa_distance;
 
 struct sched_domain_attr {
 	int relax_domain_level;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 977e133bb8a4..1f08dfef2ea5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1591,10 +1591,13 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 enum numa_topology_type sched_numa_topology_type;
 
 static int			sched_domains_numa_levels;
+static int			sched_numa_node_levels;
 static int			sched_domains_curr_level;
 
 int				sched_max_numa_distance;
+int				sched_avg_remote_numa_distance;
 static int			*sched_domains_numa_distance;
+static int			*sched_numa_node_distance;
 static struct cpumask		***sched_domains_numa_masks;
 #endif /* CONFIG_NUMA */
 
@@ -1808,10 +1811,10 @@ bool find_numa_distance(int distance)
 		return true;
 
 	rcu_read_lock();
-	distances = rcu_dereference(sched_domains_numa_distance);
+	distances = rcu_dereference(sched_numa_node_distance);
 	if (!distances)
 		goto unlock;
-	for (i = 0; i < sched_domains_numa_levels; i++) {
+	for (i = 0; i < sched_numa_node_levels; i++) {
 		if (distances[i] == distance) {
 			found = true;
 			break;
@@ -1887,14 +1890,29 @@ static void init_numa_topology_type(int offline_node)
 
 #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
 
-void sched_init_numa(int offline_node)
+/*
+ * An architecture may override this to simplify NUMA distances and
+ * avoid creating too many NUMA levels.
+ */
+int __weak arch_sched_node_distance(int from, int to)
+{
+	return node_distance(from, to);
+}
+
+static int numa_node_dist(int i, int j)
+{
+	return node_distance(i, j);
+}
+
+static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
+		int **dist, int *maximum_dist, int *levels)
+
 {
-	struct sched_domain_topology_level *tl;
 	unsigned long *distance_map;
 	int nr_levels = 0;
 	int i, j;
 	int *distances;
-	struct cpumask ***masks;
+	int max_dist = 0;
 
 	/*
 	 * O(nr_nodes^2) de-duplicating selection sort -- in order to find the
@@ -1902,17 +1920,20 @@ void sched_init_numa(int offline_node)
 	 */
 	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
 	if (!distance_map)
-		return;
+		return -ENOMEM;
 
 	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
 	for_each_cpu_node_but(i, offline_node) {
 		for_each_cpu_node_but(j, offline_node) {
-			int distance = node_distance(i, j);
+			int distance = n_dist(i, j);
+
+			if (distance > max_dist)
+				max_dist = distance;
 
 			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
 				sched_numa_warn("Invalid distance value range");
 				bitmap_free(distance_map);
-				return;
+				return -EINVAL;
 			}
 
 			bitmap_set(distance_map, distance, 1);
@@ -1927,17 +1948,70 @@ void sched_init_numa(int offline_node)
 	distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
 	if (!distances) {
 		bitmap_free(distance_map);
-		return;
+		return -ENOMEM;
 	}
-
 	for (i = 0, j = 0; i < nr_levels; i++, j++) {
 		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
 		distances[i] = j;
 	}
-	rcu_assign_pointer(sched_domains_numa_distance, distances);
+	*dist = distances;
+	if (levels)
+		*levels = nr_levels;
+
+	if (maximum_dist)
+		*maximum_dist = max_dist;
 
 	bitmap_free(distance_map);
 
+	return 0;
+}
+
+static int avg_remote_numa_distance(int offline_node)
+{
+	int i, j;
+	int distance, nr_remote = 0, total_distance = 0;
+
+	for_each_cpu_node_but(i, offline_node) {
+		for_each_cpu_node_but(j, offline_node) {
+			distance = node_distance(i, j);
+
+			if (distance >= REMOTE_DISTANCE) {
+				nr_remote++;
+				total_distance += distance;
+			}
+		}
+	}
+	if (nr_remote)
+		return total_distance / nr_remote;
+	else
+		return REMOTE_DISTANCE;
+}
+
+void sched_init_numa(int offline_node)
+{
+	struct sched_domain_topology_level *tl;
+	int nr_levels, nr_node_levels;
+	int i, j;
+	int *distances, *domain_distances;
+	int max_dist;
+	struct cpumask ***masks;
+
+	if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
+				   &max_dist, &nr_node_levels))
+		return;
+
+	WRITE_ONCE(sched_avg_remote_numa_distance,
+		   avg_remote_numa_distance(offline_node));
+
+	if (sched_record_numa_dist(offline_node,
+				   arch_sched_node_distance, &domain_distances,
+				   NULL, &nr_levels)) {
+		kfree(distances);
+		return;
+	}
+	rcu_assign_pointer(sched_numa_node_distance, distances);
+	WRITE_ONCE(sched_numa_node_levels, nr_node_levels);
+
 	/*
 	 * 'nr_levels' contains the number of unique distances
 	 *
@@ -1954,6 +2028,8 @@ void sched_init_numa(int offline_node)
 	 *
 	 * We reset it to 'nr_levels' at the end of this function.
 	 */
+	rcu_assign_pointer(sched_domains_numa_distance, domain_distances);
+
 	sched_domains_numa_levels = 0;
 
 	masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
@@ -1979,10 +2055,13 @@ void sched_init_numa(int offline_node)
 			masks[i][j] = mask;
 
 			for_each_cpu_node_but(k, offline_node) {
-				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+				if (sched_debug() &&
+				    (arch_sched_node_distance(j, k) !=
+				     arch_sched_node_distance(k, j)))
 					sched_numa_warn("Node-distance not symmetric");
 
-				if (node_distance(j, k) > sched_domains_numa_distance[i])
+				if (arch_sched_node_distance(j, k) >
+					sched_domains_numa_distance[i])
 					continue;
 
 				cpumask_or(mask, mask, cpumask_of_node(k));
@@ -2022,7 +2101,7 @@ void sched_init_numa(int offline_node)
 	sched_domain_topology = tl;
 
 	sched_domains_numa_levels = nr_levels;
-	WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
+	WRITE_ONCE(sched_max_numa_distance, max_dist);
 
 	init_numa_topology_type(offline_node);
 }
@@ -2030,14 +2109,18 @@ void sched_init_numa(int offline_node)
 
 static void sched_reset_numa(void)
 {
-	int nr_levels, *distances;
+	int nr_levels, *distances, *dom_distances;
 	struct cpumask ***masks;
 
 	nr_levels = sched_domains_numa_levels;
+	sched_numa_node_levels = 0;
 	sched_domains_numa_levels = 0;
 	sched_max_numa_distance = 0;
+	sched_avg_remote_numa_distance = 0;
 	sched_numa_topology_type = NUMA_DIRECT;
-	distances = sched_domains_numa_distance;
+	distances = sched_numa_node_distance;
+	dom_distances = sched_domains_numa_distance;
+	rcu_assign_pointer(sched_numa_node_distance, NULL);
 	rcu_assign_pointer(sched_domains_numa_distance, NULL);
 	masks = sched_domains_numa_masks;
 	rcu_assign_pointer(sched_domains_numa_masks, NULL);
@@ -2054,6 +2137,7 @@ static void sched_reset_numa(void)
 			kfree(masks[i]);
 		}
 		kfree(masks);
+		kfree(dom_distances);
 	}
 	if (sched_domain_topology_saved) {
 		kfree(sched_domain_topology);
@@ -2092,7 +2176,7 @@ void sched_domains_numa_masks_set(unsigned int cpu)
 				continue;
 
 			/* Set ourselves in the remote node's masks */
-			if (node_distance(j, node) <= sched_domains_numa_distance[i])
+			if (arch_sched_node_distance(j, node) <= sched_domains_numa_distance[i])
 				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
 		}
 	}
-- 
2.32.0
Re: [PATCH v2 1/2] sched: Create architecture specific sched domain distances
Posted by Chen, Yu C 2 days, 12 hours ago
On 9/6/2025 2:36 AM, Tim Chen wrote:
> Allow architecture specific sched domain NUMA distances that can be
> modified from NUMA node distances for the purpose of building NUMA
> sched domains.
> 
> The actual NUMA distances are kept separately.  This allows for NUMA
> domain levels modification when building sched domains for specific
> architectures.
> 
> Consolidate the recording of unique NUMA distances in an array to
> sched_record_numa_dist() so the function can be reused to record NUMA
> distances when the NUMA distance metric is changed.
> 
> No functional change if there's no arch specific NUMA distances
> are being defined.
> 
> Co-developed-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> ---
>   include/linux/sched/topology.h |   2 +
>   kernel/sched/topology.c        | 118 ++++++++++++++++++++++++++++-----
>   2 files changed, 103 insertions(+), 17 deletions(-)
> 
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 5263746b63e8..4f58e78ca52e 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -59,6 +59,8 @@ static inline int cpu_numa_flags(void)
>   #endif
>   
>   extern int arch_asym_cpu_priority(int cpu);
> +extern int arch_sched_node_distance(int from, int to);
> +extern int sched_avg_remote_numa_distance;
>   
>   struct sched_domain_attr {
>   	int relax_domain_level;
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 977e133bb8a4..1f08dfef2ea5 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1591,10 +1591,13 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
>   enum numa_topology_type sched_numa_topology_type;
>   
>   static int			sched_domains_numa_levels;
> +static int			sched_numa_node_levels;
>   static int			sched_domains_curr_level;
>   
>   int				sched_max_numa_distance;
> +int				sched_avg_remote_numa_distance;
>   static int			*sched_domains_numa_distance;
> +static int			*sched_numa_node_distance;
>   static struct cpumask		***sched_domains_numa_masks;
>   #endif /* CONFIG_NUMA */
>   
> @@ -1808,10 +1811,10 @@ bool find_numa_distance(int distance)
>   		return true;
>   
>   	rcu_read_lock();
> -	distances = rcu_dereference(sched_domains_numa_distance);
> +	distances = rcu_dereference(sched_numa_node_distance);
>   	if (!distances)
>   		goto unlock;
> -	for (i = 0; i < sched_domains_numa_levels; i++) {
> +	for (i = 0; i < sched_numa_node_levels; i++) {
>   		if (distances[i] == distance) {
>   			found = true;
>   			break;
> @@ -1887,14 +1890,29 @@ static void init_numa_topology_type(int offline_node)
>   
>   #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
>   
> -void sched_init_numa(int offline_node)
> +/*
> + * Architecture could simplify NUMA distance, to avoid
> + * creating too many NUMA levels.
> + */
> +int __weak arch_sched_node_distance(int from, int to)
> +{
> +	return node_distance(from, to);
> +}
> +
> +static int numa_node_dist(int i, int j)
> +{
> +	return node_distance(i, j);
> +}
> +

numa_node_dist() seems to be used only once, as the callback passed to
sched_record_numa_dist(). Would it be possible to pass node_distance()
directly instead, i.e.:

sched_record_numa_dist(offline_node, node_distance, &distances,
				   &max_dist, &nr_node_levels)?

> +static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
> +		int **dist, int *maximum_dist, int *levels)
> +
>   {
> -	struct sched_domain_topology_level *tl;
>   	unsigned long *distance_map;
>   	int nr_levels = 0;
>   	int i, j;
>   	int *distances;
> -	struct cpumask ***masks;
> +	int max_dist = 0;
>   
>   	/*
>   	 * O(nr_nodes^2) de-duplicating selection sort -- in order to find the
> @@ -1902,17 +1920,20 @@ void sched_init_numa(int offline_node)
>   	 */
>   	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
>   	if (!distance_map)
> -		return;
> +		return -ENOMEM;
>   
>   	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
>   	for_each_cpu_node_but(i, offline_node) {
>   		for_each_cpu_node_but(j, offline_node) {
> -			int distance = node_distance(i, j);
> +			int distance = n_dist(i, j);
> +
> +			if (distance > max_dist)
> +				max_dist = distance;
>   
>   			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
>   				sched_numa_warn("Invalid distance value range");
>   				bitmap_free(distance_map);
> -				return;
> +				return -EINVAL;
>   			}
>   
>   			bitmap_set(distance_map, distance, 1);
> @@ -1927,17 +1948,70 @@ void sched_init_numa(int offline_node)
>   	distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
>   	if (!distances) {
>   		bitmap_free(distance_map);
> -		return;
> +		return -ENOMEM;
>   	}
> -
>   	for (i = 0, j = 0; i < nr_levels; i++, j++) {
>   		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
>   		distances[i] = j;
>   	}
> -	rcu_assign_pointer(sched_domains_numa_distance, distances);
> +	*dist = distances;
> +	if (levels)
> +		*levels = nr_levels;
> +
> +	if (maximum_dist)
> +		*maximum_dist = max_dist;
>   
>   	bitmap_free(distance_map);
>   
> +	return 0;
> +}
> +
> +static int avg_remote_numa_distance(int offline_node)
> +{
> +	int i, j;
> +	int distance, nr_remote = 0, total_distance = 0;
> +
> +	for_each_cpu_node_but(i, offline_node) {
> +		for_each_cpu_node_but(j, offline_node) {
> +			distance = node_distance(i, j);
> +
> +			if (distance >= REMOTE_DISTANCE) {
> +				nr_remote++;
> +				total_distance += distance;
> +			}
> +		}
> +	}
> +	if (nr_remote)
> +		return total_distance / nr_remote;
> +	else
> +		return REMOTE_DISTANCE;
> +}
> +
> +void sched_init_numa(int offline_node)
> +{
> +	struct sched_domain_topology_level *tl;
> +	int nr_levels, nr_node_levels;
> +	int i, j;
> +	int *distances, *domain_distances;
> +	int max_dist;
> +	struct cpumask ***masks;
> +
> +	if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
> +				   &max_dist, &nr_node_levels))
> +		return;
> +
> +	WRITE_ONCE(sched_avg_remote_numa_distance,
> +		   avg_remote_numa_distance(offline_node));
> +
> +	if (sched_record_numa_dist(offline_node,
> +				   arch_sched_node_distance, &domain_distances,
> +				   NULL, &nr_levels)) {
> +		kfree(distances);
> +		return;
> +	}
> +	rcu_assign_pointer(sched_numa_node_distance, distances);
> +	WRITE_ONCE(sched_numa_node_levels, nr_node_levels);
> +
>   	/*
>   	 * 'nr_levels' contains the number of unique distances
>   	 *
> @@ -1954,6 +2028,8 @@ void sched_init_numa(int offline_node)
>   	 *
>   	 * We reset it to 'nr_levels' at the end of this function.
>   	 */
> +	rcu_assign_pointer(sched_domains_numa_distance, domain_distances);
> +
>   	sched_domains_numa_levels = 0;
>   
>   	masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
> @@ -1979,10 +2055,13 @@ void sched_init_numa(int offline_node)
>   			masks[i][j] = mask;
>   
>   			for_each_cpu_node_but(k, offline_node) {
> -				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
> +				if (sched_debug() &&
> +				    (arch_sched_node_distance(j, k) !=
> +				     arch_sched_node_distance(k, j)))
>   					sched_numa_warn("Node-distance not symmetric");
>   
> -				if (node_distance(j, k) > sched_domains_numa_distance[i])
> +				if (arch_sched_node_distance(j, k) >
> +					sched_domains_numa_distance[i])
>   					continue;
>   
>   				cpumask_or(mask, mask, cpumask_of_node(k));
> @@ -2022,7 +2101,7 @@ void sched_init_numa(int offline_node)
>   	sched_domain_topology = tl;
>   
>   	sched_domains_numa_levels = nr_levels;
> -	WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
> +	WRITE_ONCE(sched_max_numa_distance, max_dist);
>   

Would it be possible to use
WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]);
so that we can simplify the code by removing the newly introduced
'max_dist' from both sched_record_numa_dist() and sched_init_numa()?

thanks,
Chenyu

>   	init_numa_topology_type(offline_node);
>   }
> @@ -2030,14 +2109,18 @@ void sched_init_numa(int offline_node)
>   
>   static void sched_reset_numa(void)
>   {
> -	int nr_levels, *distances;
> +	int nr_levels, *distances, *dom_distances;
>   	struct cpumask ***masks;
>   
>   	nr_levels = sched_domains_numa_levels;
> +	sched_numa_node_levels = 0;
>   	sched_domains_numa_levels = 0;
>   	sched_max_numa_distance = 0;
> +	sched_avg_remote_numa_distance = 0;
>   	sched_numa_topology_type = NUMA_DIRECT;
> -	distances = sched_domains_numa_distance;
> +	distances = sched_numa_node_distance;
> +	dom_distances = sched_domains_numa_distance;
> +	rcu_assign_pointer(sched_numa_node_distance, NULL);
>   	rcu_assign_pointer(sched_domains_numa_distance, NULL);
>   	masks = sched_domains_numa_masks;
>   	rcu_assign_pointer(sched_domains_numa_masks, NULL);
> @@ -2054,6 +2137,7 @@ static void sched_reset_numa(void)
>   			kfree(masks[i]);
>   		}
>   		kfree(masks);
> +		kfree(dom_distances);
>   	}
>   	if (sched_domain_topology_saved) {
>   		kfree(sched_domain_topology);
> @@ -2092,7 +2176,7 @@ void sched_domains_numa_masks_set(unsigned int cpu)
>   				continue;
>   
>   			/* Set ourselves in the remote node's masks */
> -			if (node_distance(j, node) <= sched_domains_numa_distance[i])
> +			if (arch_sched_node_distance(j, node) <= sched_domains_numa_distance[i])
>   				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
>   		}
>   	}
Re: [PATCH v2 1/2] sched: Create architecture specific sched domain distances
Posted by Tim Chen 1 day, 10 hours ago
On Mon, 2025-09-08 at 00:28 +0800, Chen, Yu C wrote:
> On 9/6/2025 2:36 AM, Tim Chen wrote:

... snip ...
> > -void sched_init_numa(int offline_node)
> > +/*
> > + * Architecture could simplify NUMA distance, to avoid
> > + * creating too many NUMA levels.
> > + */
> > +int __weak arch_sched_node_distance(int from, int to)
> > +{
> > +	return node_distance(from, to);
> > +}
> > +
> > +static int numa_node_dist(int i, int j)
> > +{
> > +	return node_distance(i, j);
> > +}
> > +
> 
> numa_node_dist() seems to be used only once by
> sched_record_numa_dist(), would it be possible to
> use node_distance() directly
> sched_record_numa_dist(offline_node, node_distance, &distances,
> 				   &max_dist, &nr_node_levels))?

Otherwise I would need to pass a flag to sched_record_numa_dist() to
choose which distance to use.  I am okay either way; I chose the
current method because it keeps sched_record_numa_dist() simpler.


> 
> > +static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
> > +		int **dist, int *maximum_dist, int *levels)
> > +
> >   {
> > -	struct sched_domain_topology_level *tl;
> >   	unsigned long *distance_map;
> >   	int nr_levels = 0;
> >   	int i, j;
> >   	int *distances;
> > -	struct cpumask ***masks;
> > +	int max_dist = 0;
> >   
> > 
... snip ...

> > +static int avg_remote_numa_distance(int offline_node)
> > +{
> > +	int i, j;
> > +	int distance, nr_remote = 0, total_distance = 0;
> > +
> > +	for_each_cpu_node_but(i, offline_node) {
> > +		for_each_cpu_node_but(j, offline_node) {
> > +			distance = node_distance(i, j);
> > +
> > +			if (distance >= REMOTE_DISTANCE) {
> > +				nr_remote++;
> > +				total_distance += distance;
> > +			}
> > +		}
> > +	}
> > +	if (nr_remote)
> > +		return total_distance / nr_remote;
> > +	else
> > +		return REMOTE_DISTANCE;
> > +}
> > +
> > +void sched_init_numa(int offline_node)
> > +{
> > +	struct sched_domain_topology_level *tl;
> > +	int nr_levels, nr_node_levels;
> > +	int i, j;
> > +	int *distances, *domain_distances;
> > +	int max_dist;
> > +	struct cpumask ***masks;
> > +
> > +	if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
> > +				   &max_dist, &nr_node_levels))
> > +		return;
> > +
> > +	WRITE_ONCE(sched_avg_remote_numa_distance,
> > +		   avg_remote_numa_distance(offline_node));
> > +
> > +	if (sched_record_numa_dist(offline_node,
> > +				   arch_sched_node_distance, &domain_distances,
> > +				   NULL, &nr_levels)) {
> > +		kfree(distances);
> > +		return;
> > +	}
> > +	rcu_assign_pointer(sched_numa_node_distance, distances);
> > +	WRITE_ONCE(sched_numa_node_levels, nr_node_levels);
> > +
> >   	/*
> >   	 * 'nr_levels' contains the number of unique distances
> >   	 *
> > @@ -1954,6 +2028,8 @@ void sched_init_numa(int offline_node)
> >   	 *
> >   	 * We reset it to 'nr_levels' at the end of this function.
> >   	 */
> > +	rcu_assign_pointer(sched_domains_numa_distance, domain_distances);
> > +
> >   	sched_domains_numa_levels = 0;
> >   
> >   	masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
> > @@ -1979,10 +2055,13 @@ void sched_init_numa(int offline_node)
> >   			masks[i][j] = mask;
> >   
> >   			for_each_cpu_node_but(k, offline_node) {
> > -				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
> > +				if (sched_debug() &&
> > +				    (arch_sched_node_distance(j, k) !=
> > +				     arch_sched_node_distance(k, j)))
> >   					sched_numa_warn("Node-distance not symmetric");
> >   
> > -				if (node_distance(j, k) > sched_domains_numa_distance[i])
> > +				if (arch_sched_node_distance(j, k) >
> > +					sched_domains_numa_distance[i])
> >   					continue;
> >   
> >   				cpumask_or(mask, mask, cpumask_of_node(k));
> > @@ -2022,7 +2101,7 @@ void sched_init_numa(int offline_node)
> >   	sched_domain_topology = tl;
> >   
> >   	sched_domains_numa_levels = nr_levels;
> > -	WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
> > +	WRITE_ONCE(sched_max_numa_distance, max_dist);
> >   
> 
> Would it be possible to use
> WRITE_ONCE(sched_max_numa_distance, distance[nr_node_levels - 1]);
> so we can simplify the code by removing the introduced 'max_dist'
> both in sched_record_numa_dist() and sched_init_numa().

Sure, I think that simplifies sched_record_numa_dist().


Tim