[PATCH v4] sched/fair: Cache NUMA node statistics to avoid O(N) scanning

Posted by Qiliang Yuan 1 week, 2 days ago
Optimize update_numa_stats() by reducing the algorithmic complexity of
node statistics aggregation from O(N_cpus_per_node) to O(1) for the source
node.

Currently, the NUMA balancing path performs an O(N) scan across all CPUs
in a node to aggregate load, utilization, and runnable statistics. This
walk over every per-CPU runqueue in the node is repeated each time task
placement is evaluated, which incurs significant overhead on systems
with many CPUs per node.

Introduce a per-node statistics cache in the root_domain and update it
from the load-balancing hot path (update_sg_lb_stats()). Use these
pre-aggregated stats to let update_numa_stats() bypass the O(N) scan for
the source node, achieving O(1) complexity for that lookup. Guard the
multi-field snapshot with a seqcount/spinlock pair so concurrent readers
never observe a torn update. This amortizes the cost of data collection
over work the load balancer already performs and reduces scheduling
overhead in the NUMA placement hot path.
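
The cached fields are published with the standard seqcount pattern: the
writer serializes on a spinlock and wraps the update in
write_seqcount_begin()/write_seqcount_end(); readers retry until they
observe an unchanged sequence. In outline (field list abbreviated here,
see the hunks below for the full set):

	/* Writer, in update_sg_lb_stats(), serialized by cache->lock */
	spin_lock(&cache->lock);
	write_seqcount_begin(&cache->seq);
	cache->load = sgs->group_load;
	cache->last_update = jiffies;
	write_seqcount_end(&cache->seq);
	spin_unlock(&cache->lock);

	/* Reader, in update_numa_stats(), lockless with retry */
	do {
		seq = read_seqcount_begin(&cache->seq);
		ns->load = cache->load;
		ns->compute_capacity = cache->capacity;
	} while (read_seqcount_retry(&cache->seq, seq));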

Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
---
v4:
 - Implement seqlock (seqcount_t + spinlock_t) to ensure multi-variable consistency and prevent tearing.
 - Move cache to 'root_domain' with dynamic allocation to handle partitioning and save memory on non-NUMA.
 - Add #ifdef CONFIG_NUMA guards and an error path for the cache allocation.
v3:
 - Optimize update_numa_stats() to O(1) for source node by caching stats during load balancing.
 - Introduce a global cache (later refined in v4).
v2:
 - Use cached stats from the first group in the hierarchy.
v1:
 - Initial implementation of NUMA stats caching.

 kernel/sched/fair.c     | 64 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h    | 17 +++++++++++
 kernel/sched/topology.c | 20 +++++++++++++
 3 files changed, 101 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e71302282671..eafe1cfd7ff6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -48,6 +48,7 @@
 #include <linux/psi.h>
 #include <linux/ratelimit.h>
 #include <linux/task_work.h>
+#include <linux/seqlock.h>
 #include <linux/rbtree_augmented.h>
 
 #include <asm/switch_to.h>
@@ -2104,6 +2105,43 @@ static void update_numa_stats(struct task_numa_env *env,
 	ns->idle_cpu = -1;
 
 	rcu_read_lock();
+	/*
+	 * Algorithmic Optimization: Avoid O(N) scan by using cached stats.
+	 * Only applicable for the source node where we don't need to find
+	 * an idle CPU. This skip is safe because we only need the totals
+	 * for the source node's load, util, and nr_running to check for
+	 * relative imbalance.
+	 */
+	if (!find_idle && nid == env->src_nid) {
+		struct root_domain *rd = rcu_dereference(cpu_rq(env->src_cpu)->rd);
+		struct numa_stats_cache *cache;
+		unsigned int seq;
+
+		if (rd && rd->node_stats_cache) {
+			cache = &rd->node_stats_cache[nid];
+			do {
+				seq = read_seqcount_begin(&cache->seq);
+				if (!time_before(jiffies, cache->last_update + msecs_to_jiffies(10))) {
+					ns->compute_capacity = 0;
+					break;
+				}
+
+				ns->load = cache->load;
+				ns->runnable = cache->runnable;
+				ns->util = cache->util;
+				ns->nr_running = cache->nr_running;
+				ns->compute_capacity = cache->capacity;
+			} while (read_seqcount_retry(&cache->seq, seq));
+
+			if (ns->compute_capacity)
+				goto skip_scan;
+
+			/* Reset stats before O(N) scan if cache was stale or torn */
+			memset(ns, 0, sizeof(*ns));
+			ns->idle_cpu = -1;
+		}
+	}
+
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
 		struct rq *rq = cpu_rq(cpu);
 
@@ -2124,6 +2162,8 @@ static void update_numa_stats(struct task_numa_env *env,
 			idle_core = numa_idle_core(idle_core, cpu);
 		}
 	}
+
+skip_scan:
 	rcu_read_unlock();
 
 	ns->weight = cpumask_weight(cpumask_of_node(nid));
@@ -10488,6 +10528,30 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	if (sgs->group_type == group_overloaded)
 		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
 				sgs->group_capacity;
+
+	/* Algorithmic Optimization: Cache node stats for O(1) NUMA lookups */
+#ifdef CONFIG_NUMA
+	if (env->sd->flags & SD_NUMA) {
+		int nid = cpu_to_node(cpumask_first(sched_group_span(group)));
+		struct root_domain *rd = env->dst_rq->rd;
+		struct numa_stats_cache *cache;
+
+		if (rd && rd->node_stats_cache) {
+			cache = &rd->node_stats_cache[nid];
+
+			spin_lock(&cache->lock);
+			write_seqcount_begin(&cache->seq);
+			cache->nr_running = sgs->sum_h_nr_running;
+			cache->load = sgs->group_load;
+			cache->util = sgs->group_util;
+			cache->runnable = sgs->group_runnable;
+			cache->capacity = sgs->group_capacity;
+			cache->last_update = jiffies;
+			write_seqcount_end(&cache->seq);
+			spin_unlock(&cache->lock);
+		}
+	}
+#endif
 }
 
 /**
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d30cca6870f5..643e6c743fc8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -974,6 +974,19 @@ struct perf_domain {
 	struct rcu_head rcu;
 };
 
+#ifdef CONFIG_NUMA
+struct numa_stats_cache {
+	spinlock_t lock;
+	seqcount_t seq;
+	unsigned long load;
+	unsigned long runnable;
+	unsigned long util;
+	unsigned long capacity;
+	unsigned int nr_running;
+	unsigned long last_update;
+} ____cacheline_aligned;
+#endif
+
 /*
  * We add the notion of a root-domain which will be used to define per-domain
  * variables. Each exclusive cpuset essentially defines an island domain by
@@ -1042,6 +1055,10 @@ struct root_domain {
 	 * CPUs of the rd. Protected by RCU.
 	 */
 	struct perf_domain __rcu *pd;
+
+#ifdef CONFIG_NUMA
+	struct numa_stats_cache *node_stats_cache;
+#endif
 };
 
 extern void init_defrootdomain(void);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..094a4f197af3 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -5,6 +5,7 @@
 
 #include <linux/sched/isolation.h>
 #include <linux/bsearch.h>
+#include <linux/slab.h>
 #include "sched.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
@@ -466,6 +467,9 @@ static void free_rootdomain(struct rcu_head *rcu)
 	free_cpumask_var(rd->online);
 	free_cpumask_var(rd->span);
 	free_pd(rd->pd);
+#ifdef CONFIG_NUMA
+	kfree(rd->node_stats_cache);
+#endif
 	kfree(rd);
 }
 
@@ -551,8 +555,24 @@ static int init_rootdomain(struct root_domain *rd)
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
+
+#ifdef CONFIG_NUMA
+	rd->node_stats_cache = kcalloc(nr_node_ids, sizeof(struct numa_stats_cache),
+				      GFP_KERNEL);
+	if (!rd->node_stats_cache)
+		goto free_cpupri;
+
+	int i;
+	for (i = 0; i < nr_node_ids; i++) {
+		spin_lock_init(&rd->node_stats_cache[i].lock);
+		seqcount_init(&rd->node_stats_cache[i].seq);
+	}
+#endif
+
 	return 0;
 
+free_cpupri:
+	cpupri_cleanup(&rd->cpupri);
 free_cpudl:
 	cpudl_cleanup(&rd->cpudl);
 free_rto_mask:
-- 
2.51.0
Re: [PATCH v4] sched/fair: Cache NUMA node statistics to avoid O(N) scanning
Posted by kernel test robot 1 week, 2 days ago
Hi Qiliang,

kernel test robot noticed the following build warnings:

[auto build test WARNING on tip/sched/core]
[also build test WARNING on tip/master peterz-queue/sched/core linus/master v6.19-rc7 next-20260128]
[cannot apply to tip/auto-latest]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Qiliang-Yuan/sched-fair-Cache-NUMA-node-statistics-to-avoid-O-N-scanning/20260129-112302
base:   tip/sched/core
patch link:    https://lore.kernel.org/r/20260129031948.2384178-1-realwujing%40gmail.com
patch subject: [PATCH v4] sched/fair: Cache NUMA node statistics to avoid O(N) scanning
config: arc-allnoconfig (https://download.01.org/0day-ci/archive/20260129/202601291924.phVnpnOi-lkp@intel.com/config)
compiler: arc-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260129/202601291924.phVnpnOi-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601291924.phVnpnOi-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from kernel/sched/build_utility.c:86:
   kernel/sched/topology.c: In function 'init_rootdomain':
>> kernel/sched/topology.c:574:1: warning: label 'free_cpupri' defined but not used [-Wunused-label]
     574 | free_cpupri:
         | ^~~~~~~~~~~


vim +/free_cpupri +574 kernel/sched/topology.c

   571	
   572		return 0;
   573	
 > 574	free_cpupri:
   575		cpupri_cleanup(&rd->cpupri);
   576	free_cpudl:
   577		cpudl_cleanup(&rd->cpudl);
   578	free_rto_mask:
   579		free_cpumask_var(rd->rto_mask);
   580	free_dlo_mask:
   581		free_cpumask_var(rd->dlo_mask);
   582	free_online:
   583		free_cpumask_var(rd->online);
   584	free_span:
   585		free_cpumask_var(rd->span);
   586	out:
   587		return -ENOMEM;
   588	}
   589	
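
The goto that targets this label is added under #ifdef CONFIG_NUMA,
while the label itself is unconditional, so !CONFIG_NUMA builds (such as
the arc-allnoconfig config above) see a defined-but-unused label. One
possible fix, sketched here and untested, is to place the label and its
cleanup under the same guard:

#ifdef CONFIG_NUMA
free_cpupri:
	cpupri_cleanup(&rd->cpupri);
#endif
free_cpudl:
	cpudl_cleanup(&rd->cpudl);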

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki