To improve data locality and avoid pointer chasing, embed struct
sched_entity within struct cfs_rq. This co-locates the runqueue state
(cfs_rq) and the entity's scheduling state (se).
This patch implements the following:

- Adds a struct sched_entity field to struct cfs_rq.

- Modifies alloc_fair_sched_group() and free_fair_sched_group() to
  drop the separate allocation and freeing of per-CPU sched_entity
  objects.

- Retains the task_group->se pointer array (struct sched_entity **se);
  each per-CPU pointer is updated to refer to the embedded
  &cfs_rq->se (see the sketch after this list).
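
Roughly, the per-CPU wiring ends up as in the fragment below. This is
an illustrative sketch, not the literal diff; in the tree the actual
hookup lives in init_tg_cfs_entry():

	struct sched_entity *se = &cfs_rq->se;	/* embedded, nothing to kfree() */

	tg->se[i] = se;		/* pointer array kept; existing users unaffected */
	se->my_q  = cfs_rq;	/* entity still represents this group runqueue */
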
Signed-off-by: Zecheng Li <zecheng@google.com>
---
kernel/sched/fair.c | 10 +---------
kernel/sched/sched.h | 4 ++++
2 files changed, 5 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fb9bf995a47..c2af9896eef4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13343,8 +13343,6 @@ void free_fair_sched_group(struct task_group *tg)
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
- if (tg->se)
- kfree(tg->se[i]);
}
kfree(tg->cfs_rq);
@@ -13374,11 +13372,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!cfs_rq)
goto err;
- se = kzalloc_node(sizeof(struct sched_entity_stats),
- GFP_KERNEL, cpu_to_node(i));
- if (!se)
- goto err_free_rq;
-
+ se = &cfs_rq->se;
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
@@ -13386,8 +13380,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
-err_free_rq:
- kfree(cfs_rq);
err:
return 0;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 47972f34ea70..6e26b7d59c13 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -738,6 +738,10 @@ struct cfs_rq {
struct list_head throttled_csd_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* sched_entity on parent runqueue */
+ struct sched_entity se ____cacheline_aligned;
+#endif
};
#ifdef CONFIG_SCHED_CLASS_EXT
--
2.50.0
On Wed, Jun 04, 2025 at 07:58:41PM +0000, Zecheng Li wrote:
> [...]
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 47972f34ea70..6e26b7d59c13 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -738,6 +738,10 @@ struct cfs_rq {
> struct list_head throttled_csd_list;
> #endif /* CONFIG_CFS_BANDWIDTH */
> #endif /* CONFIG_FAIR_GROUP_SCHED */
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> + /* sched_entity on parent runqueue */
> + struct sched_entity se ____cacheline_aligned;
> +#endif
> };
This also blows up struct rq for no reason.
I would much rather you all investigate if you can make the flattened
cgroup stuff from Rik work.
Hi Peter,
On Thu, Jun 5, 2025 at 10:28 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> This also blows up struct rq for no reason.
I'm working on a v2 that avoids this overhead by allocating struct
cfs_rq and struct sched_entity together as a single combined struct
(e.g., struct { struct cfs_rq cfs_rq; struct sched_entity se; }) for
non-root task groups only, so the root cfs_rq embedded in struct rq
would not carry the extra sched_entity.
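
A rough sketch of that idea (the type name is illustrative, not final):

	/*
	 * Allocate the group runqueue and its entity as one object for
	 * non-root task groups, so the root cfs_rq embedded in struct rq
	 * does not grow.
	 */
	struct cfs_rq_with_se {
		struct cfs_rq		cfs_rq;
		/* sched_entity on the parent runqueue */
		struct sched_entity	se ____cacheline_aligned;
	};

	/* in alloc_fair_sched_group(), per CPU: */
	struct cfs_rq_with_se *combined;

	combined = kzalloc_node(sizeof(*combined), GFP_KERNEL,
				cpu_to_node(i));
	if (!combined)
		goto err;
	cfs_rq = &combined->cfs_rq;
	se = &combined->se;
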
> I would much rather you all investigate if you can make the flattened
> cgroup stuff from Rik work.
The flattened cgroup work looks like a separate, larger architectural
change. This patch series is limited to cache-locality optimizations
of the existing memory layout and does not change functionality.
Regards,
Zecheng