[RFC PATCH v3 18/24] sched/deadline: Allow deeper hierarchies of RT cgroups

Yuri Andriaccio posted 24 patches 4 months, 1 week ago
There is a newer version of this series
[RFC PATCH v3 18/24] sched/deadline: Allow deeper hierarchies of RT cgroups
Posted by Yuri Andriaccio 4 months, 1 week ago
From: luca abeni <luca.abeni@santannapisa.it>

Allow creation of cgroup hierarchies with depth greater than two.
Add a check to prevent attaching tasks to a child cgroup of an active cgroup
(i.e. one with a running FIFO/RR task).
Add a check to prevent attaching tasks to cgroups which have children with
non-zero runtime.
Update rt-cgroups allocated bandwidth accounting for nested cgroup hierarchies.

Co-developed-by: Yuri Andriaccio <yurand2000@gmail.com>
Signed-off-by: Yuri Andriaccio <yurand2000@gmail.com>
Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
---
 kernel/sched/core.c     |  6 -----
 kernel/sched/deadline.c | 51 +++++++++++++++++++++++++++++++++++++----
 kernel/sched/rt.c       | 16 ++++++++++---
 kernel/sched/sched.h    |  3 ++-
 4 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6f516cdc7bb..d1d7215c4a2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9281,12 +9281,6 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		return &root_task_group.css;
 	}
 
-	/* Do not allow cpu_cgroup hierachies with depth greater than 2. */
-#ifdef CONFIG_RT_GROUP_SCHED
-	if (parent != &root_task_group)
-		return ERR_PTR(-EINVAL);
-#endif
-
 	tg = sched_create_group(parent);
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5d93b3ca030..abe11985c41 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -388,11 +388,42 @@ int dl_check_tg(unsigned long total)
 	return 1;
 }
 
-void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
+bool is_active_sched_group(struct task_group *tg)
 {
+	struct task_group *child;
+	bool is_active = 1;
+
+	// if there are no children, this is a leaf group, thus it is active
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		if (child->dl_bandwidth.dl_runtime > 0) {
+			is_active = 0;
+		}
+	}
+	return is_active;
+}
+
+static inline bool sched_group_has_active_siblings(struct task_group *tg)
+{
+	struct task_group *child;
+	bool has_active_siblings = 0;
+
+	// if there are no children, this is a leaf group, thus it is active
+	list_for_each_entry_rcu(child, &tg->parent->children, siblings) {
+		if (child != tg && child->dl_bandwidth.dl_runtime > 0) {
+			has_active_siblings = 1;
+		}
+	}
+	return has_active_siblings;
+}
+
+void dl_init_tg(struct task_group *tg, int cpu, u64 rt_runtime, u64 rt_period)
+{
+	struct sched_dl_entity *dl_se = tg->dl_se[cpu];
 	struct rq *rq = container_of(dl_se->dl_rq, struct rq, dl);
-	int is_active;
-	u64 new_bw;
+	int is_active, is_active_group;
+	u64 old_runtime, new_bw;
+
+	is_active_group = is_active_sched_group(tg);
 
 	raw_spin_rq_lock_irq(rq);
 	is_active = dl_se->my_q->rt.rt_nr_running > 0;
@@ -400,8 +431,10 @@ void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
 	update_rq_clock(rq);
 	dl_server_stop(dl_se);
 
+	old_runtime = dl_se->dl_runtime;
 	new_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-	dl_rq_change_utilization(rq, dl_se, new_bw);
+	if (is_active_group)
+		dl_rq_change_utilization(rq, dl_se, new_bw);
 
 	dl_se->dl_runtime  = rt_runtime;
 	dl_se->dl_deadline = rt_period;
@@ -413,6 +446,16 @@ void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
 	dl_se->dl_bw = new_bw;
 	dl_se->dl_density = new_bw;
 
+	// add/remove the parent's bw
+	if (tg->parent && tg->parent != &root_task_group)
+	{
+		if (rt_runtime == 0 && old_runtime != 0 && !sched_group_has_active_siblings(tg)) {
+			__add_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
+		} else if (rt_runtime != 0 && old_runtime == 0 && !sched_group_has_active_siblings(tg)) {
+			__sub_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
+		}
+	}
+
 	if (is_active)
 		dl_server_start(dl_se);
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1cdc699bbb7..17ad91261cb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -114,7 +114,8 @@ void free_rt_sched_group(struct task_group *tg)
 		 * Fix this issue by changing the group runtime
 		 * to 0 immediately before freeing it.
 		 */
-		dl_init_tg(tg->dl_se[i], 0, tg->dl_se[i]->dl_period);
+		if (tg->dl_se[i]->dl_runtime)
+			dl_init_tg(tg, i, 0, tg->dl_se[i]->dl_period);
 
 		raw_spin_rq_lock_irqsave(cpu_rq(i), flags);
 		BUG_ON(tg->rt_rq[i]->rt_nr_running);
@@ -2122,6 +2123,14 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 	static DEFINE_MUTEX(rt_constraints_mutex);
 	int i, err = 0;
 
+	/*
+	 * Do not allow to set a RT runtime > 0 if the parent has RT tasks
+	 * (and is not the root group)
+	 */
+	if (rt_runtime && (tg != &root_task_group) && (tg->parent != &root_task_group) && tg_has_rt_tasks(tg->parent)) {
+		return -EINVAL;
+	}
+
 	/* No period doesn't make any sense. */
 	if (rt_period == 0)
 		return -EINVAL;
@@ -2145,7 +2154,7 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 		return 0;
 
 	for_each_possible_cpu(i) {
-		dl_init_tg(tg->dl_se[i], rt_runtime, rt_period);
+		dl_init_tg(tg, i, rt_runtime, rt_period);
 	}
 
 	return 0;
@@ -2216,7 +2225,8 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 	if (rt_group_sched_enabled() && tg->dl_bandwidth.dl_runtime == 0)
 		return 0;
 
-	return 1;
+	/* tasks can be attached only if the taskgroup has no active children. */
+	return (int)is_active_sched_group(tg);
 }
 
 #else /* !CONFIG_RT_GROUP_SCHED: */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fddb171145e..55631d93e02 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -386,7 +386,8 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq,
 		    dl_server_pick_f pick_task);
 extern void sched_init_dl_servers(void);
 extern int dl_check_tg(unsigned long total);
-extern void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period);
+extern void dl_init_tg(struct task_group *tg, int cpu, u64 rt_runtime, u64 rt_period);
+extern bool is_active_sched_group(struct task_group *tg);
 
 extern void dl_server_update_idle_time(struct rq *rq,
 		    struct task_struct *p);
-- 
2.51.0
Re: [RFC PATCH v3 18/24] sched/deadline: Allow deeper hierarchies of RT cgroups
Posted by Juri Lelli 3 months, 3 weeks ago
Hello,

On 29/09/25 11:22, Yuri Andriaccio wrote:
> From: luca abeni <luca.abeni@santannapisa.it>
> 
> Allow creation of cgroup hierachies with depth greater than two.
> Add check to prevent attaching tasks to a child cgroup of an active cgroup (i.e.
> with a running FIFO/RR task).
> Add check to prevent attaching tasks to cgroups which have children with
> non-zero runtime.
> Update rt-cgroups allocated bandwidth accounting for nested cgroup hierachies.
> 
> Co-developed-by: Yuri Andriaccio <yurand2000@gmail.com>
> Signed-off-by: Yuri Andriaccio <yurand2000@gmail.com>
> Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
> ---
>  kernel/sched/core.c     |  6 -----
>  kernel/sched/deadline.c | 51 +++++++++++++++++++++++++++++++++++++----
>  kernel/sched/rt.c       | 16 ++++++++++---
>  kernel/sched/sched.h    |  3 ++-
>  4 files changed, 62 insertions(+), 14 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 6f516cdc7bb..d1d7215c4a2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -9281,12 +9281,6 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>  		return &root_task_group.css;
>  	}
>  
> -	/* Do not allow cpu_cgroup hierachies with depth greater than 2. */
> -#ifdef CONFIG_RT_GROUP_SCHED
> -	if (parent != &root_task_group)
> -		return ERR_PTR(-EINVAL);
> -#endif
> -
>  	tg = sched_create_group(parent);
>  	if (IS_ERR(tg))
>  		return ERR_PTR(-ENOMEM);
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 5d93b3ca030..abe11985c41 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -388,11 +388,42 @@ int dl_check_tg(unsigned long total)
>  	return 1;
>  }
>  
> -void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
> +bool is_active_sched_group(struct task_group *tg)

I wonder if the function name could be misleading, as this checks runtime
and not if there are tasks in the group.

>  {
> +	struct task_group *child;
> +	bool is_active = 1;
> +
> +	// if there are no children, this is a leaf group, thus it is active
> +	list_for_each_entry_rcu(child, &tg->children, siblings) {
> +		if (child->dl_bandwidth.dl_runtime > 0) {
> +			is_active = 0;
> +		}
> +	}
> +	return is_active;
> +}
> +
> +static inline bool sched_group_has_active_siblings(struct task_group *tg)
> +{
> +	struct task_group *child;
> +	bool has_active_siblings = 0;
> +
> +	// if there are no children, this is a leaf group, thus it is active

Copy-pasta from above? :) Also not the correct comment style.

> +	list_for_each_entry_rcu(child, &tg->parent->children, siblings) {
> +		if (child != tg && child->dl_bandwidth.dl_runtime > 0) {
> +			has_active_siblings = 1;
> +		}
> +	}
> +	return has_active_siblings;
> +}
> +
> +void dl_init_tg(struct task_group *tg, int cpu, u64 rt_runtime, u64 rt_period)
> +{
> +	struct sched_dl_entity *dl_se = tg->dl_se[cpu];
>  	struct rq *rq = container_of(dl_se->dl_rq, struct rq, dl);
> -	int is_active;
> -	u64 new_bw;
> +	int is_active, is_active_group;
> +	u64 old_runtime, new_bw;
> +
> +	is_active_group = is_active_sched_group(tg);
>  
>  	raw_spin_rq_lock_irq(rq);
>  	is_active = dl_se->my_q->rt.rt_nr_running > 0;
> @@ -400,8 +431,10 @@ void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
>  	update_rq_clock(rq);
>  	dl_server_stop(dl_se);
>  
> +	old_runtime = dl_se->dl_runtime;
>  	new_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
> -	dl_rq_change_utilization(rq, dl_se, new_bw);
> +	if (is_active_group)
> +		dl_rq_change_utilization(rq, dl_se, new_bw);
>  
>  	dl_se->dl_runtime  = rt_runtime;
>  	dl_se->dl_deadline = rt_period;
> @@ -413,6 +446,16 @@ void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
>  	dl_se->dl_bw = new_bw;
>  	dl_se->dl_density = new_bw;
>  
> +	// add/remove the parent's bw

Comment style is not correct. Also the comment itself is not very much
informative. What about something like (IIUC)

 /*
  * Handle parent bandwidth accounting when child runtime changes:
  * - Disabling the last active child: parent becomes a leaf group,
  *   so add the parent's bandwidth back to active accounting
  * - Enabling the first child: parent becomes a non-leaf group,
  *   so remove the parent's bandwidth from active accounting
  * Only leaf groups (those without active children) should have
  * non-zero bandwidth.
  */

> +	if (tg->parent && tg->parent != &root_task_group)
> +	{
> +		if (rt_runtime == 0 && old_runtime != 0 && !sched_group_has_active_siblings(tg)) {
> +			__add_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
> +		} else if (rt_runtime != 0 && old_runtime == 0 && !sched_group_has_active_siblings(tg)) {
> +			__sub_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
> +		}
> +	}
> +

Thanks,
Juri