[PATCH v3 3/3] sched/fair: Allocate both cfs_rq and sched_entity with per-cpu

There is a newer version of this series.
Posted by Zecheng Li 3 months, 1 week ago
To remove the cfs_rq pointer array in task_group, allocate the combined
cfs_rq and sched_entity using the per-cpu allocator.

This patch implements the following:

- Changes task_group->cfs_rq from struct cfs_rq ** to struct cfs_rq
__percpu *.

- Updates memory allocation in alloc_fair_sched_group() and
free_fair_sched_group() to use alloc_percpu() and free_percpu()
respectively.

- Uses the inline accessor tg_cfs_rq(tg, cpu) with per_cpu_ptr() to
retrieve the pointer to cfs_rq for the given task group and CPU.

- Replaces direct accesses to tg->cfs_rq[cpu] with the new
tg_cfs_rq(tg, cpu) helper.

- Handles the root_task_group: since struct rq is already a per-cpu
variable (runqueues), its embedded cfs_rq (rq->cfs) is also per-cpu.
Therefore, we assign root_task_group.cfs_rq = &runqueues.cfs (see the
sketch below).

- Cleans up the root task group initialization code.

This change places each CPU's cfs_rq and sched_entity in that CPU's
local per-cpu memory area, removing the per-task_group pointer arrays.

Signed-off-by: Zecheng Li <zecheng@google.com>
---
 kernel/sched/core.c  | 35 ++++++++++----------------
 kernel/sched/fair.c  | 59 +++++++++++++++++---------------------------
 kernel/sched/sched.h | 13 +++++++---
 3 files changed, 45 insertions(+), 62 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2efa7e9590c7..377361fae8e8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8508,7 +8508,7 @@ static struct kmem_cache *task_group_cache __ro_after_init;
 
 void __init sched_init(void)
 {
-	unsigned long ptr = 0;
+	unsigned long __maybe_unused ptr = 0;
 	int i;
 
 	/* Make sure the linker didn't screw up */
@@ -8526,33 +8526,24 @@ void __init sched_init(void)
 	wait_bit_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	ptr += nr_cpu_ids * sizeof(void **);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-	ptr += 2 * nr_cpu_ids * sizeof(void **);
-#endif
-	if (ptr) {
-		ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
+	root_task_group.cfs_rq = &runqueues.cfs;
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
-		ptr += nr_cpu_ids * sizeof(void **);
-
-		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
-		init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
+	root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+	init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_EXT_GROUP_SCHED
-		scx_tg_init(&root_task_group);
+	scx_tg_init(&root_task_group);
 #endif /* CONFIG_EXT_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
-		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
-		ptr += nr_cpu_ids * sizeof(void **);
+	ptr += 2 * nr_cpu_ids * sizeof(void **);
+	ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
+	root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+	ptr += nr_cpu_ids * sizeof(void **);
 
-		root_task_group.rt_rq = (struct rt_rq **)ptr;
-		ptr += nr_cpu_ids * sizeof(void **);
+	root_task_group.rt_rq = (struct rt_rq **)ptr;
+	ptr += nr_cpu_ids * sizeof(void **);
 
 #endif /* CONFIG_RT_GROUP_SCHED */
-	}
 
 #ifdef CONFIG_SMP
 	init_defrootdomain();
@@ -9497,7 +9488,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 	}
 
 	for_each_online_cpu(i) {
-		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+		struct cfs_rq *cfs_rq = tg_cfs_rq(tg, i);
 		struct rq *rq = cfs_rq->rq;
 
 		guard(rq_lock_irq)(rq);
@@ -9745,7 +9736,7 @@ static u64 throttled_time_self(struct task_group *tg)
 	u64 total = 0;
 
 	for_each_possible_cpu(i) {
-		total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+		total += READ_ONCE(tg_cfs_rq(tg, i)->throttled_clock_self_time);
 	}
 
 	return total;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 244b20222eb5..37d6b00b3a3b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -329,7 +329,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 	 * to a tree or when we reach the top of the tree
 	 */
 	if (cfs_rq->tg->parent &&
-	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+	    tg_cfs_rq(cfs_rq->tg->parent, cpu)->on_list) {
 		/*
 		 * If parent is already on the list, we add the child
 		 * just before. Thanks to circular linked property of
@@ -337,7 +337,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 		 * of the list that starts by parent.
 		 */
 		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
-			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+			&(tg_cfs_rq(cfs_rq->tg->parent, cpu)->leaf_cfs_rq_list));
 		/*
 		 * The branch is now connected to its tree so we can
 		 * reset tmp_alone_branch to the beginning of the
@@ -4180,7 +4180,7 @@ static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(tg, &task_groups, list) {
-		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+		struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
 
 		clear_tg_load_avg(cfs_rq);
 	}
@@ -5828,8 +5828,8 @@ static inline int throttled_lb_pair(struct task_group *tg,
 {
 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
 
-	src_cfs_rq = tg->cfs_rq[src_cpu];
-	dest_cfs_rq = tg->cfs_rq[dest_cpu];
+	src_cfs_rq = tg_cfs_rq(tg, src_cpu);
+	dest_cfs_rq = tg_cfs_rq(tg, dest_cpu);
 
 	return throttled_hierarchy(src_cfs_rq) ||
 	       throttled_hierarchy(dest_cfs_rq);
@@ -5838,7 +5838,7 @@ static inline int throttled_lb_pair(struct task_group *tg,
 static int tg_unthrottle_up(struct task_group *tg, void *data)
 {
 	struct rq *rq = data;
-	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+	struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
 
 	cfs_rq->throttle_count--;
 	if (!cfs_rq->throttle_count) {
@@ -5867,7 +5867,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 static int tg_throttle_down(struct task_group *tg, void *data)
 {
 	struct rq *rq = data;
-	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+	struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
 
 	/* group is entering throttled state, stop time */
 	if (!cfs_rq->throttle_count) {
@@ -6454,8 +6454,8 @@ static void sync_throttle(struct task_group *tg, int cpu)
 	if (!tg->parent)
 		return;
 
-	cfs_rq = tg->cfs_rq[cpu];
-	pcfs_rq = tg->parent->cfs_rq[cpu];
+	cfs_rq = tg_cfs_rq(tg, cpu);
+	pcfs_rq = tg_cfs_rq(tg->parent, cpu);
 
 	cfs_rq->throttle_count = pcfs_rq->throttle_count;
 	cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
@@ -6640,7 +6640,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
 	rcu_read_lock();
 	list_for_each_entry_rcu(tg, &task_groups, list) {
 		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
-		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+		struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
 
 		raw_spin_lock(&cfs_b->lock);
 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
@@ -6669,7 +6669,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(tg, &task_groups, list) {
-		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+		struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
 
 		if (!cfs_rq->runtime_enabled)
 			continue;
@@ -9378,7 +9378,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
 	struct cfs_rq *dst_cfs_rq;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu];
+	dst_cfs_rq = tg_cfs_rq(task_group(p), dest_cpu);
 #else
 	dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
 #endif
@@ -13095,7 +13095,7 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu)
 	struct cfs_rq *cfs_rq;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	cfs_rq = task_group(p)->cfs_rq[cpu];
+	cfs_rq = tg_cfs_rq(task_group(p), cpu);
 #else
 	cfs_rq = &cpu_rq(cpu)->cfs;
 #endif
@@ -13351,42 +13351,31 @@ static void task_change_group_fair(struct task_struct *p)
 
 void free_fair_sched_group(struct task_group *tg)
 {
-	int i;
-
-	for_each_possible_cpu(i) {
-		if (tg->cfs_rq && tg->cfs_rq[i]) {
-			struct cfs_rq_with_se *combined =
-				container_of(tg->cfs_rq[i], struct cfs_rq_with_se, cfs_rq);
-			kfree(combined);
-		}
-	}
-
-	kfree(tg->cfs_rq);
+	free_percpu(tg->cfs_rq);
 }
 
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
-	struct cfs_rq_with_se *combined;
+	struct cfs_rq_with_se __percpu *combined;
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
 	int i;
 
-	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
-	if (!tg->cfs_rq)
+	combined = alloc_percpu_gfp(struct cfs_rq_with_se, GFP_KERNEL);
+	if (!combined)
 		goto err;
 
+	tg->cfs_rq = &combined->cfs_rq;
 	tg->shares = NICE_0_LOAD;
 
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
 
 	for_each_possible_cpu(i) {
-		combined = kzalloc_node(sizeof(struct cfs_rq_with_se),
-				      GFP_KERNEL, cpu_to_node(i));
-		if (!combined)
+		cfs_rq = tg_cfs_rq(tg, i);
+		if (!cfs_rq)
 			goto err;
 
-		cfs_rq = &combined->cfs_rq;
-		se = &combined->se;
+		se = tg_se(tg, i);
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, tg_se(parent, i));
 		init_entity_runnable_average(se);
@@ -13423,7 +13412,7 @@ void unregister_fair_sched_group(struct task_group *tg)
 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(cpu) {
-		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+		struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu);
 		struct sched_entity *se = tg_se(tg, cpu);
 		struct rq *rq = cpu_rq(cpu);
 
@@ -13460,8 +13449,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	cfs_rq->rq = rq;
 	init_cfs_rq_runtime(cfs_rq);
 
-	tg->cfs_rq[cpu] = cfs_rq;
-
 	/* se could be NULL for root_task_group */
 	if (!se)
 		return;
@@ -13554,7 +13541,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 	for_each_possible_cpu(i) {
 		struct rq *rq = cpu_rq(i);
 		struct sched_entity *se = tg_se(tg, i);
-		struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+		struct cfs_rq *grp_cfs_rq = tg_cfs_rq(tg, i);
 		bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
 		long idle_task_delta;
 		struct rq_flags rf;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3fdcdcdab76c..a794bec99604 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -438,7 +438,7 @@ struct task_group {
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* runqueue "owned" by this group on each CPU */
-	struct cfs_rq		**cfs_rq;
+	struct cfs_rq __percpu	*cfs_rq;
 	unsigned long		shares;
 #ifdef	CONFIG_SMP
 	/*
@@ -1592,6 +1592,11 @@ static inline struct task_struct *task_of(struct sched_entity *se)
 	WARN_ON_ONCE(!entity_is_task(se));
 	return container_of(se, struct task_struct, se);
 }
+/* Access a specific CPU's cfs_rq from a task group */
+static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu)
+{
+	return per_cpu_ptr(tg->cfs_rq, cpu);
+}
 
 static inline struct sched_entity *tg_se(struct task_group *tg, int cpu)
 {
@@ -1599,7 +1604,7 @@ static inline struct sched_entity *tg_se(struct task_group *tg, int cpu)
 		return NULL;
 
 	struct cfs_rq_with_se *combined =
-		container_of(tg->cfs_rq[cpu], struct cfs_rq_with_se, cfs_rq);
+		container_of(tg_cfs_rq(tg, cpu), struct cfs_rq_with_se, cfs_rq);
 	return &combined->se;
 }
 
@@ -2185,8 +2190,8 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
-	p->se.cfs_rq = tg->cfs_rq[cpu];
+	set_task_rq_fair(&p->se, p->se.cfs_rq, tg_cfs_rq(tg, cpu));
+	p->se.cfs_rq = tg_cfs_rq(tg, cpu);
 	p->se.parent = tg_se(tg, cpu);
 	p->se.depth = p->se.parent ? p->se.parent->depth + 1 : 0;
 #endif
-- 
2.50.0
Re: [PATCH v3 3/3] sched/fair: Allocate both cfs_rq and sched_entity with per-cpu
Posted by kernel test robot 2 months, 3 weeks ago

Hello,

kernel test robot noticed an 8.8% improvement of stress-ng.session.ops_per_sec on:


commit: ac215b990e70e247344522e1736fd878cb3c25b6 ("[PATCH v3 3/3] sched/fair: Allocate both cfs_rq and sched_entity with per-cpu")
url: https://github.com/intel-lab-lkp/linux/commits/Zecheng-Li/sched-fair-Co-locate-cfs_rq-and-sched_entity/20250702-050528
patch link: https://lore.kernel.org/all/20250701210230.2985885-4-zecheng@google.com/
patch subject: [PATCH v3 3/3] sched/fair: Allocate both cfs_rq and sched_entity with per-cpu

testcase: stress-ng
config: x86_64-rhel-9.4
compiler: gcc-12
test machine: 192 threads 2 sockets Intel(R) Xeon(R) 6740E  CPU @ 2.4GHz (Sierra Forest) with 256G memory
parameters:

	nr_threads: 100%
	testtime: 60s
	test: session
	cpufreq_governor: performance

Details are as below:
-------------------------------------------------------------------------------------------------->


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20250716/202507161052.ed3213f4-lkp@intel.com

=========================================================================================
compiler/cpufreq_governor/kconfig/nr_threads/rootfs/tbox_group/test/testcase/testtime:
  gcc-12/performance/x86_64-rhel-9.4/100%/debian-12-x86_64-20240206.cgz/lkp-srf-2sp3/session/stress-ng/60s

commit: 
  10d69ea6ba ("sched/fair: Remove task_group->se pointer array")
  ac215b990e ("sched/fair: Allocate both cfs_rq and sched_entity with per-cpu")

10d69ea6ba7641d0 ac215b990e70e247344522e1736 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      0.00 ± 33%      -0.0        0.00        mpstat.cpu.all.iowait%
      4.46            -1.9        2.60        mpstat.cpu.all.soft%
    119077           -40.8%      70550        numa-vmstat.node0.nr_slab_unreclaimable
    108930 ±  2%     -47.0%      57734        numa-vmstat.node1.nr_slab_unreclaimable
    182080            +6.2%     193357        vmstat.system.cs
    523378            +2.8%     538219        vmstat.system.in
    492392 ±  2%     +13.0%     556325 ±  4%  sched_debug.cfs_rq:/.avg_vruntime.min
    492392 ±  2%     +13.0%     556325 ±  4%  sched_debug.cfs_rq:/.min_vruntime.min
    138.58 ±  5%      -7.4%     128.29 ±  2%  sched_debug.cfs_rq:/.util_est.stddev
    472216           -40.6%     280726        numa-meminfo.node0.SUnreclaim
    574944 ±  4%     -37.1%     361520 ±  7%  numa-meminfo.node0.Slab
    432134 ±  2%     -46.3%     231873 ±  2%  numa-meminfo.node1.SUnreclaim
    479462 ±  5%     -37.2%     301105 ±  9%  numa-meminfo.node1.Slab
  10287656           -10.7%    9189229        meminfo.Memused
    135165          +249.4%     472245 ±  2%  meminfo.Percpu
    904298           -43.4%     512235        meminfo.SUnreclaim
   1054350           -37.2%     662242        meminfo.Slab
  10568312           -11.6%    9345843        meminfo.max_used_kB
   1207478            +8.7%    1312849        stress-ng.session.ops
     20155            +8.8%      21919        stress-ng.session.ops_per_sec
 1.019e+08 ±  2%      +6.9%   1.09e+08        stress-ng.time.minor_page_faults
     16524            +2.1%      16869        stress-ng.time.percent_of_cpu_this_job_got
      9893            +2.1%      10097        stress-ng.time.system_time
   4781984            +7.2%    5125238        stress-ng.time.voluntary_context_switches
    481639            +2.6%     494200        proc-vmstat.nr_active_anon
   1169838            +1.0%    1181222        proc-vmstat.nr_file_pages
     41310            +2.1%      42196        proc-vmstat.nr_kernel_stack
    279620            +4.1%     291007        proc-vmstat.nr_shmem
    226450           -43.4%     128195        proc-vmstat.nr_slab_unreclaimable
    481639            +2.6%     494200        proc-vmstat.nr_zone_active_anon
  89251914 ±  3%      +7.3%   95746015 ±  3%  proc-vmstat.numa_hit
  87248173 ±  3%      +7.5%   93770181 ±  3%  proc-vmstat.numa_local
 1.169e+08 ±  2%     -15.8%   98422922 ±  3%  proc-vmstat.pgalloc_normal
 1.024e+08 ±  2%      +6.9%  1.095e+08        proc-vmstat.pgfault
 1.161e+08 ±  2%     -16.0%   97515011 ±  3%  proc-vmstat.pgfree
      0.73            +0.0        0.76        perf-stat.i.branch-miss-rate%
 1.795e+08            +5.7%  1.897e+08        perf-stat.i.branch-misses
     39.65            -2.8       36.83        perf-stat.i.cache-miss-rate%
 1.485e+09            +7.2%  1.592e+09        perf-stat.i.cache-references
    188846            +7.1%     202265        perf-stat.i.context-switches
      4.35            -1.1%       4.30        perf-stat.i.cpi
     50615            +8.6%      54987        perf-stat.i.cpu-migrations
     18.17            +8.4%      19.69        perf-stat.i.metric.K/sec
   1682096 ±  2%      +6.7%    1795278        perf-stat.i.minor-faults
   1682116 ±  2%      +6.7%    1795306        perf-stat.i.page-faults
      0.71            +0.0        0.74        perf-stat.overall.branch-miss-rate%
     39.60            -2.9       36.74        perf-stat.overall.cache-miss-rate%
      4.38            -1.1%       4.33        perf-stat.overall.cpi
      0.23            +1.1%       0.23        perf-stat.overall.ipc
  1.75e+08            +5.6%  1.847e+08        perf-stat.ps.branch-misses
 1.454e+09            +6.9%  1.555e+09        perf-stat.ps.cache-references
    184886            +6.9%     197661        perf-stat.ps.context-switches
     49237            +8.5%      53411        perf-stat.ps.cpu-migrations
   1646954 ±  2%      +6.5%    1754401        perf-stat.ps.minor-faults
   1646973 ±  2%      +6.5%    1754428        perf-stat.ps.page-faults
      0.14 ± 12%    -100.0%       0.00        perf-sched.sch_delay.avg.ms.__cond_resched.__kmalloc_cache_node_noprof.alloc_fair_sched_group.sched_create_group.sched_autogroup_create_attach
      1.86 ± 55%     -95.6%       0.08 ± 56%  perf-sched.sch_delay.avg.ms.__cond_resched.__kmalloc_node_noprof.alloc_slab_obj_exts.allocate_slab.___slab_alloc
      0.07 ±115%    -100.0%       0.00        perf-sched.sch_delay.avg.ms.__cond_resched.__kmalloc_noprof.alloc_fair_sched_group.sched_create_group.sched_autogroup_create_attach
      0.03 ± 30%     -89.1%       0.00 ± 30%  perf-sched.sch_delay.avg.ms.__cond_resched.__put_anon_vma.unlink_anon_vmas.free_pgtables.exit_mmap
      0.02 ± 43%     -69.1%       0.01 ± 33%  perf-sched.sch_delay.avg.ms.__cond_resched.__tlb_batch_free_encoded_pages.tlb_finish_mmu.exit_mmap.__mmput
      0.47 ±  8%     -16.7%       0.39 ±  5%  perf-sched.sch_delay.avg.ms.__cond_resched.__wait_for_common.affine_move_task.__set_cpus_allowed_ptr.__sched_setaffinity
      0.27 ± 47%     -69.6%       0.08 ± 97%  perf-sched.sch_delay.avg.ms.__cond_resched.__wait_for_common.wait_for_completion_state.kernel_clone.__x64_sys_vfork
      0.26 ± 27%     -40.1%       0.15 ±  7%  perf-sched.sch_delay.avg.ms.__cond_resched.copy_page_range.dup_mmap.dup_mm.constprop
      0.27 ± 16%     -43.3%       0.15 ± 26%  perf-sched.sch_delay.avg.ms.__cond_resched.down_write.anon_vma_clone.anon_vma_fork.dup_mmap
      0.23 ± 10%     -26.2%       0.17 ±  9%  perf-sched.sch_delay.avg.ms.__cond_resched.down_write.dup_mmap.dup_mm.constprop
      0.03 ± 41%     -86.7%       0.00 ± 32%  perf-sched.sch_delay.avg.ms.__cond_resched.down_write.unlink_anon_vmas.free_pgtables.exit_mmap
      0.04 ± 69%     -93.6%       0.00 ± 62%  perf-sched.sch_delay.avg.ms.__cond_resched.down_write.unlink_file_vma_batch_process.unlink_file_vma_batch_add.free_pgtables
      0.56 ± 49%     -75.3%       0.14 ± 39%  perf-sched.sch_delay.avg.ms.__cond_resched.kmem_cache_alloc_noprof.alloc_pid.copy_process.kernel_clone
      0.25 ±  7%     -21.0%       0.19 ±  8%  perf-sched.sch_delay.avg.ms.__cond_resched.kmem_cache_alloc_noprof.anon_vma_fork.dup_mmap.dup_mm
      0.43 ± 44%     -86.3%       0.06 ± 55%  perf-sched.sch_delay.avg.ms.__cond_resched.mutex_lock_killable.pcpu_alloc_noprof.mm_init.dup_mm
      0.01 ± 76%  +13586.7%       1.03 ± 72%  perf-sched.sch_delay.avg.ms.__cond_resched.process_one_work.worker_thread.kthread.ret_from_fork
      0.09 ± 44%     -76.0%       0.02 ± 26%  perf-sched.sch_delay.avg.ms.__cond_resched.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
      0.39 ± 52%     -75.9%       0.10 ± 20%  perf-sched.sch_delay.avg.ms.__cond_resched.wp_page_copy.__handle_mm_fault.handle_mm_fault.do_user_addr_fault
      0.09 ± 21%     -30.0%       0.06 ± 20%  perf-sched.sch_delay.avg.ms.anon_pipe_read.fifo_pipe_read.vfs_read.ksys_read
      0.02 ±  4%     -47.3%       0.01        perf-sched.sch_delay.avg.ms.do_task_dead.do_exit.do_group_exit.__x64_sys_exit_group.x64_sys_call
      0.20           -12.8%       0.17 ±  3%  perf-sched.sch_delay.avg.ms.do_wait.kernel_wait4.do_syscall_64.entry_SYSCALL_64_after_hwframe
      0.22 ± 26%     -52.3%       0.10 ± 29%  perf-sched.sch_delay.avg.ms.schedule_hrtimeout_range_clock.poll_schedule_timeout.constprop.0.do_poll
      0.02 ± 12%     -55.8%       0.01 ± 16%  perf-sched.sch_delay.avg.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.__put_anon_vma
      0.07 ± 10%     -24.5%       0.05 ± 14%  perf-sched.sch_delay.avg.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.anon_vma_clone
      0.07 ±  6%     -22.6%       0.05 ± 13%  perf-sched.sch_delay.avg.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.anon_vma_fork
      0.01 ± 20%     -57.3%       0.01 ±  6%  perf-sched.sch_delay.avg.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.unlink_anon_vmas
      0.10 ± 18%     -27.5%       0.07 ± 10%  perf-sched.sch_delay.avg.ms.schedule_timeout.rcu_gp_fqs_loop.rcu_gp_kthread.kthread
      0.44 ±  4%     -20.2%       0.35 ±  2%  perf-sched.sch_delay.avg.ms.worker_thread.kthread.ret_from_fork.ret_from_fork_asm
      8.47 ± 85%    -100.0%       0.00        perf-sched.sch_delay.max.ms.__cond_resched.__kmalloc_cache_node_noprof.alloc_fair_sched_group.sched_create_group.sched_autogroup_create_attach
     10.95 ± 57%     -95.9%       0.45 ± 59%  perf-sched.sch_delay.max.ms.__cond_resched.__kmalloc_node_noprof.alloc_slab_obj_exts.allocate_slab.___slab_alloc
      0.34 ±160%    -100.0%       0.00        perf-sched.sch_delay.max.ms.__cond_resched.__kmalloc_noprof.alloc_fair_sched_group.sched_create_group.sched_autogroup_create_attach
     17.30 ± 16%     -86.0%       2.42 ± 65%  perf-sched.sch_delay.max.ms.__cond_resched.__put_anon_vma.unlink_anon_vmas.free_pgtables.exit_mmap
      8.99 ± 57%     -81.2%       1.69 ± 22%  perf-sched.sch_delay.max.ms.__cond_resched.__tlb_batch_free_encoded_pages.tlb_finish_mmu.exit_mmap.__mmput
     36.94 ± 97%     -90.9%       3.35 ±  9%  perf-sched.sch_delay.max.ms.__cond_resched.__wait_for_common.affine_move_task.__set_cpus_allowed_ptr.__sched_setaffinity
      0.67 ± 47%     -75.4%       0.16 ±148%  perf-sched.sch_delay.max.ms.__cond_resched.__wait_for_common.wait_for_completion_state.kernel_clone.__x64_sys_vfork
     26.40 ± 89%     -69.8%       7.96 ± 67%  perf-sched.sch_delay.max.ms.__cond_resched.copy_page_range.dup_mmap.dup_mm.constprop
     36.26 ± 33%     -70.8%      10.58 ±109%  perf-sched.sch_delay.max.ms.__cond_resched.down_write.anon_vma_clone.anon_vma_fork.dup_mmap
     10.02 ± 42%     -86.2%       1.38 ±  6%  perf-sched.sch_delay.max.ms.__cond_resched.down_write.unlink_anon_vmas.free_pgtables.exit_mmap
     11.63 ± 51%     -92.3%       0.89 ± 39%  perf-sched.sch_delay.max.ms.__cond_resched.down_write.unlink_file_vma_batch_process.unlink_file_vma_batch_add.free_pgtables
     13.19 ± 50%     -88.0%       1.58 ± 44%  perf-sched.sch_delay.max.ms.__cond_resched.kmem_cache_alloc_noprof.alloc_pid.copy_process.kernel_clone
      9.58 ± 68%     -91.3%       0.83 ± 49%  perf-sched.sch_delay.max.ms.__cond_resched.mutex_lock_killable.pcpu_alloc_noprof.mm_init.dup_mm
      0.01 ± 83%  +17570.0%       1.47 ± 76%  perf-sched.sch_delay.max.ms.__cond_resched.process_one_work.worker_thread.kthread.ret_from_fork
     13.57 ± 16%     -89.9%       1.37 ± 65%  perf-sched.sch_delay.max.ms.__cond_resched.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
     13.09 ± 35%     -80.8%       2.51 ± 59%  perf-sched.sch_delay.max.ms.__cond_resched.wp_page_copy.__handle_mm_fault.handle_mm_fault.do_user_addr_fault
     23.68 ± 15%     -82.9%       4.05 ± 33%  perf-sched.sch_delay.max.ms.do_task_dead.do_exit.do_group_exit.__x64_sys_exit_group.x64_sys_call
      3.16 ± 15%     +26.9%       4.01 ±  7%  perf-sched.sch_delay.max.ms.io_schedule.folio_wait_bit_common.filemap_fault.__do_fault
      4.74 ±147%    +499.9%      28.44 ± 59%  perf-sched.sch_delay.max.ms.schedule_preempt_disabled.__mutex_lock.constprop.0.pcpu_alloc_noprof
     22.00 ± 24%     -86.8%       2.90 ± 43%  perf-sched.sch_delay.max.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.__put_anon_vma
     19.90 ±  9%     -85.2%       2.95 ± 29%  perf-sched.sch_delay.max.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.unlink_anon_vmas
     26.25 ± 10%     -75.7%       6.37 ± 12%  perf-sched.sch_delay.max.ms.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
     29.41 ± 22%     -78.6%       6.29 ± 19%  perf-sched.sch_delay.max.ms.worker_thread.kthread.ret_from_fork.ret_from_fork_asm
      0.13 ±  3%     -14.3%       0.11 ±  3%  perf-sched.total_sch_delay.average.ms
    554.44 ± 80%     -84.6%      85.42 ± 13%  perf-sched.total_sch_delay.max.ms
      2379 ± 19%     +49.8%       3565 ± 24%  perf-sched.total_wait_and_delay.max.ms
      2379 ± 19%     +49.8%       3564 ± 24%  perf-sched.total_wait_time.max.ms
      0.08 ± 30%     -87.6%       0.01 ± 15%  perf-sched.wait_and_delay.avg.ms.__cond_resched.__put_anon_vma.unlink_anon_vmas.free_pgtables.exit_mmap
      4.97 ±  2%      -8.4%       4.56        perf-sched.wait_and_delay.avg.ms.__cond_resched.kmem_cache_alloc_noprof.anon_vma_fork.dup_mmap.dup_mm
      0.04 ±  4%     -46.9%       0.02 ±  3%  perf-sched.wait_and_delay.avg.ms.do_task_dead.do_exit.do_group_exit.__x64_sys_exit_group.x64_sys_call
      0.10 ±  4%     -33.6%       0.06 ±  4%  perf-sched.wait_and_delay.avg.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.__put_anon_vma
      0.08 ± 10%     -35.0%       0.05        perf-sched.wait_and_delay.avg.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.unlink_anon_vmas
      5676 ±  2%     -10.9%       5057        perf-sched.wait_and_delay.count.__cond_resched.__put_anon_vma.unlink_anon_vmas.free_pgtables.exit_mmap
      5076           -11.4%       4498 ±  2%  perf-sched.wait_and_delay.count.__cond_resched.kmem_cache_alloc_noprof.anon_vma_fork.dup_mmap.dup_mm
      1072 ± 15%     -57.1%     460.33 ± 13%  perf-sched.wait_and_delay.count.__cond_resched.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
      4236 ±  8%     +29.2%       5471 ± 10%  perf-sched.wait_and_delay.count.io_schedule.folio_wait_bit_common.filemap_fault.__do_fault
    102.00 ± 29%     +39.7%     142.50 ± 15%  perf-sched.wait_and_delay.count.schedule_hrtimeout_range_clock.poll_schedule_timeout.constprop.0.do_poll
      2.17 ±223%  +2.1e+05%       4619 ± 12%  perf-sched.wait_and_delay.count.schedule_preempt_disabled.__mutex_lock.constprop.0.pcpu_alloc_noprof
     11630 ±  5%     -12.4%      10192 ±  7%  perf-sched.wait_and_delay.count.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.__put_anon_vma
     24971 ±  4%      -9.2%      22682 ±  2%  perf-sched.wait_and_delay.count.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.anon_vma_clone
     22567 ±  2%      -9.3%      20475 ±  2%  perf-sched.wait_and_delay.count.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.unlink_anon_vmas
     37.53 ± 12%     -84.8%       5.69 ± 48%  perf-sched.wait_and_delay.max.ms.__cond_resched.__put_anon_vma.unlink_anon_vmas.free_pgtables.exit_mmap
      7.92 ±223%    +839.2%      74.40 ± 30%  perf-sched.wait_and_delay.max.ms.schedule_preempt_disabled.__mutex_lock.constprop.0.pcpu_alloc_noprof
     45.50 ± 23%     -86.0%       6.35 ± 35%  perf-sched.wait_and_delay.max.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.__put_anon_vma
     39.97 ±  9%     -83.3%       6.69 ± 22%  perf-sched.wait_and_delay.max.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.unlink_anon_vmas
      4.05 ± 50%     -77.4%       0.92 ± 64%  perf-sched.wait_time.avg.ms.__cond_resched.__kmalloc_cache_node_noprof.__get_vm_area_node.__vmalloc_node_range_noprof.__vmalloc_node_noprof
      0.01 ± 42%    -100.0%       0.00        perf-sched.wait_time.avg.ms.__cond_resched.__kmalloc_cache_node_noprof.alloc_fair_sched_group.sched_create_group.sched_autogroup_create_attach
      0.05 ± 29%     -86.9%       0.01 ± 11%  perf-sched.wait_time.avg.ms.__cond_resched.__put_anon_vma.unlink_anon_vmas.free_pgtables.exit_mmap
      0.04 ± 30%     -73.8%       0.01 ± 20%  perf-sched.wait_time.avg.ms.__cond_resched.__tlb_batch_free_encoded_pages.tlb_finish_mmu.exit_mmap.__mmput
      0.05 ± 34%     -85.3%       0.01 ± 36%  perf-sched.wait_time.avg.ms.__cond_resched.down_write.unlink_anon_vmas.free_pgtables.exit_mmap
      0.07 ± 50%     -89.2%       0.01 ± 27%  perf-sched.wait_time.avg.ms.__cond_resched.down_write.unlink_file_vma_batch_process.unlink_file_vma_batch_add.free_pgtables
      2.74 ± 49%     +78.9%       4.90 ± 24%  perf-sched.wait_time.avg.ms.__cond_resched.kmem_cache_alloc_noprof.mas_dup_build.constprop.0
      1.52 ± 12%     +32.6%       2.02 ± 14%  perf-sched.wait_time.avg.ms.__cond_resched.kmem_cache_alloc_noprof.prepare_creds.copy_creds.copy_process
      1.29 ±  9%     -21.5%       1.01 ±  7%  perf-sched.wait_time.avg.ms.__cond_resched.mutex_lock.perf_event_exit_task.do_exit.do_group_exit
      3.12 ±  4%      +9.4%       3.41 ±  3%  perf-sched.wait_time.avg.ms.__cond_resched.mutex_lock_killable.pcpu_alloc_noprof.__percpu_counter_init_many.mm_init
      0.02 ±  4%     -48.1%       0.01 ±  4%  perf-sched.wait_time.avg.ms.do_task_dead.do_exit.do_group_exit.__x64_sys_exit_group.x64_sys_call
      0.55 ± 19%     -38.1%       0.34 ± 41%  perf-sched.wait_time.avg.ms.irqentry_exit_to_user_mode.asm_sysvec_reschedule_ipi.[unknown]
      0.08 ±  4%     -28.8%       0.06 ±  4%  perf-sched.wait_time.avg.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.__put_anon_vma
      0.06 ±  9%     -29.7%       0.04        perf-sched.wait_time.avg.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.unlink_anon_vmas
     34.86 ± 44%     -80.5%       6.80 ± 49%  perf-sched.wait_time.max.ms.__cond_resched.__kmalloc_cache_node_noprof.__get_vm_area_node.__vmalloc_node_range_noprof.__vmalloc_node_noprof
      3.98 ± 62%    -100.0%       0.00        perf-sched.wait_time.max.ms.__cond_resched.__kmalloc_cache_node_noprof.alloc_fair_sched_group.sched_create_group.sched_autogroup_create_attach
     23.75 ± 25%     -81.5%       4.40 ± 23%  perf-sched.wait_time.max.ms.__cond_resched.__put_anon_vma.unlink_anon_vmas.free_pgtables.exit_mmap
     12.62 ± 54%     -83.7%       2.06 ± 20%  perf-sched.wait_time.max.ms.__cond_resched.__tlb_batch_free_encoded_pages.tlb_finish_mmu.exit_mmap.__mmput
     12.65 ± 29%     -74.1%       3.27 ± 36%  perf-sched.wait_time.max.ms.__cond_resched.down_write.unlink_anon_vmas.free_pgtables.exit_mmap
     14.23 ± 33%     -85.1%       2.12 ± 49%  perf-sched.wait_time.max.ms.__cond_resched.down_write.unlink_file_vma_batch_process.unlink_file_vma_batch_add.free_pgtables
     26.52 ± 54%     -58.3%      11.06 ± 10%  perf-sched.wait_time.max.ms.__cond_resched.kmem_cache_alloc_noprof.alloc_pid.copy_process.kernel_clone
     21.46 ± 31%     -79.3%       4.44 ± 50%  perf-sched.wait_time.max.ms.__cond_resched.mutex_lock.perf_event_exit_task.do_exit.do_group_exit
     36.27 ± 38%     -61.4%      14.00 ± 27%  perf-sched.wait_time.max.ms.__cond_resched.mutex_lock_killable.pcpu_alloc_noprof.__percpu_counter_init_many.mm_init
     43.85 ± 20%     -52.4%      20.88 ± 63%  perf-sched.wait_time.max.ms.__cond_resched.uprobe_start_dup_mmap.dup_mm.constprop.0
      1.21 ± 30%     -65.9%       0.41 ± 86%  perf-sched.wait_time.max.ms.exit_to_user_mode_loop.ret_from_fork.ret_from_fork_asm.[unknown]
      3.06 ± 38%    +816.6%      28.04 ± 89%  perf-sched.wait_time.max.ms.io_schedule.folio_wait_bit_common.filemap_fault.__do_fault
     11.39 ± 63%    +393.7%      56.25 ± 18%  perf-sched.wait_time.max.ms.schedule_preempt_disabled.__mutex_lock.constprop.0.pcpu_alloc_noprof
     26.67 ± 14%     -83.3%       4.45 ± 13%  perf-sched.wait_time.max.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.__put_anon_vma
     24.94 ± 26%     -82.1%       4.48 ±  5%  perf-sched.wait_time.max.ms.schedule_preempt_disabled.rwsem_down_write_slowpath.down_write.unlink_anon_vmas

Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v3 3/3] sched/fair: Allocate both cfs_rq and sched_entity with per-cpu
Posted by Zecheng Li 2 months, 2 weeks ago
Gentle ping on this patch series.

I'll be communicating from a new email address, zli94@ncsu.edu, as I'll
soon lose access to my current corp account.

On Wed, Jul 16, 2025 at 4:51 AM kernel test robot <oliver.sang@intel.com> wrote:
>
>
>
> Hello,
>
> kernel test robot noticed an 8.8% improvement of stress-ng.session.ops_per_sec on:
>
>
> commit: ac215b990e70e247344522e1736fd878cb3c25b6 ("[PATCH v3 3/3] sched/fair: Allocate both cfs_rq and sched_entity with per-cpu")
> url: https://github.com/intel-lab-lkp/linux/commits/Zecheng-Li/sched-fair-Co-locate-cfs_rq-and-sched_entity/20250702-050528
> patch link: https://lore.kernel.org/all/20250701210230.2985885-4-zecheng@google.com/
> patch subject: [PATCH v3 3/3] sched/fair: Allocate both cfs_rq and sched_entity with per-cpu

It's great to see that the kernel test robot can measure the
improvement from this patch.