[RFC PATCH v4 19/28] sched/deadline: Allow deeper hierarchies of RT cgroups

From: luca abeni <luca.abeni@santannapisa.it>

Remove the check that restricts RT cgroup hierarchies to a depth of two.

Introduce the concept of live and active groups:
- A group is live if it is a leaf group or if all of its children have
  zero runtime (see the example after this list).
- A live group with non-zero runtime can be used to schedule tasks.
- A live group with running tasks is deemed active.
- A non-live group cannot run tasks; it is used only for bandwidth
  accounting, i.e. the sum of its children's bandwidths must be less
  than or equal to its own bandwidth. This allows cgroups to be used
  for bandwidth management across different users.
- While the root cgroup specifies the total allocatable bandwidth of RT
  cgroups, additional accounting keeps track of the live bandwidth,
  i.e. the sum of the bandwidths of live groups. The hierarchy
  invariant requires that the live bandwidth is always less than or
  equal to the total allocatable bandwidth.
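
A purely illustrative example (group names and runtimes are made up; all
groups use a 1000000 us period):

  root                              allocatable RT bandwidth: 950000 us
  `-- users  (runtime 800000)       non-live: children have runtime > 0
      |-- alice  (runtime 300000)   live leaf, may run RT tasks
      `-- bob    (runtime 400000)   live leaf, may run RT tasks

  per-parent check:  300000 + 400000 <= 800000 (users' runtime)
  live bandwidth:    300000 + 400000 <= 950000 (total allocatable)
  (users' 800000 is not accounted while the group is non-live)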

Add is_live_sched_group() and sched_group_has_live_siblings() to
deadline.c. These utility functions are used by dl_init_tg() to perform
updates only when necessary:
- Only live groups may update the active dl bandwidth of dl entities
  (the call to dl_rq_change_utilization()); non-live groups must not use
  servers and thus must not change the active dl bandwidth.
- The total bandwidth accounting must follow the live/non-live rules
  (see the condensed sketch after this list):
  - When disabling (runtime zero) the last child of a group, the parent
    becomes a live group, so the parent's bandwidth must be accounted
    back.
  - When enabling (runtime non-zero) the first child, the parent becomes
    a non-live group, so the parent's bandwidth must be removed.
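
In condensed form (the deadline.c hunk below repeats the sibling check in
each branch), the parent accounting update is roughly:

	if (tg->parent && tg->parent != &root_task_group &&
	    !sched_group_has_live_siblings(tg)) {
		if (rt_runtime == 0 && old_runtime != 0) {
			/* last child with runtime disabled: parent is live again */
			__add_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
		} else if (rt_runtime != 0 && old_runtime == 0) {
			/* first child with runtime enabled: parent is no longer live */
			__sub_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
		}
	}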

Update free_rt_sched_group() to zero out the runtime only of servers
whose runtime is not already zero. This is also needed to trigger the
bandwidth accounting updates for live groups.

Update tg_set_rt_bandwidth() to allow changing the runtime of a group to
a non-zero value only if its parent is inactive, since this forces the
parent to become non-live if it was previously live (it would already be
non-live if a sibling cgroup were live).
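
In the rt.c hunk below, "parent is inactive" translates to
tg_has_rt_tasks(tg->parent) being false: the new runtime is otherwise
rejected with

	if (rt_runtime && tg != &root_task_group &&
	    tg->parent != &root_task_group && tg_has_rt_tasks(tg->parent))
		return -EINVAL;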

Update sched_rt_can_attach() to allow attaching tasks only to live groups.

Update dl_init_tg() to take a task_group pointer and a CPU id rather
than a pointer to the CPU's deadline server. The task_group pointer is
needed to check and update the live bandwidth accounting.

Co-developed-by: Yuri Andriaccio <yurand2000@gmail.com>
Signed-off-by: Yuri Andriaccio <yurand2000@gmail.com>
Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
---
 kernel/sched/core.c     |  6 ----
 kernel/sched/deadline.c | 61 ++++++++++++++++++++++++++++++++++++++---
 kernel/sched/rt.c       | 16 +++++++++--
 kernel/sched/sched.h    |  3 +-
 4 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cfb39050a2..983cd1b478 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9253,12 +9253,6 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		return &root_task_group.css;
 	}

-	/* Do not allow cpu_cgroup hierachies with depth greater than 2. */
-#ifdef CONFIG_RT_GROUP_SCHED
-	if (parent != &root_task_group)
-		return ERR_PTR(-EINVAL);
-#endif
-
 	tg = sched_create_group(parent);
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7ed157dfa6..082bccc30b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -388,11 +388,44 @@ int dl_check_tg(unsigned long total)
 	return 1;
 }

-void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
+/*
+ * A cgroup is deemed live if:
+ * - It is a leaf cgroup, or
+ * - All of its children have zero runtime.
+ */
+bool is_live_sched_group(struct task_group *tg)
+{
+	struct task_group *child;
+	bool is_live = true;
+
+	/* if there are no children, this is a leaf group, thus it is live */
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		if (child->dl_bandwidth.dl_runtime > 0)
+			is_live = false;
+	}
+	return is_live;
+}
+
+static inline bool sched_group_has_live_siblings(struct task_group *tg)
+{
+	struct task_group *child;
+	bool has_live_siblings = false;
+
+	list_for_each_entry_rcu(child, &tg->parent->children, siblings) {
+		if (child != tg && child->dl_bandwidth.dl_runtime > 0)
+			has_live_siblings = true;
+	}
+	return has_live_siblings;
+}
+
+void dl_init_tg(struct task_group *tg, int cpu, u64 rt_runtime, u64 rt_period)
 {
+	struct sched_dl_entity *dl_se = tg->dl_se[cpu];
 	struct rq *rq = container_of(dl_se->dl_rq, struct rq, dl);
-	int is_active;
-	u64 new_bw;
+	int is_active, is_live_group;
+	u64 old_runtime, new_bw;
+
+	is_live_group = is_live_sched_group(tg);

 	guard(raw_spin_rq_lock_irq)(rq);
 	is_active = dl_se->my_q->rt.rt_nr_running > 0;
@@ -400,8 +433,10 @@ void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
 	update_rq_clock(rq);
 	dl_server_stop(dl_se);

+	old_runtime = dl_se->dl_runtime;
 	new_bw = to_ratio(rt_period, rt_runtime);
-	dl_rq_change_utilization(rq, dl_se, new_bw);
+	if (is_live_group)
+		dl_rq_change_utilization(rq, dl_se, new_bw);

 	dl_se->dl_runtime  = rt_runtime;
 	dl_se->dl_deadline = rt_period;
@@ -413,6 +448,24 @@ void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
 	dl_se->dl_bw = new_bw;
 	dl_se->dl_density = new_bw;

+	/*
+	 * Handle parent bandwidth accounting when a child's runtime changes:
+	 * - When disabling the last child with runtime, the parent becomes
+	 *   a live group, so its bandwidth must be accounted back.
+	 * - When enabling the first child, the parent becomes a non-live
+	 *   group, so its bandwidth must be removed.
+	 * Only live groups have their bandwidth accounted on the runqueue.
+	 */
+	if (tg->parent && tg->parent != &root_task_group) {
+		if (rt_runtime == 0 && old_runtime != 0 &&
+		    !sched_group_has_live_siblings(tg)) {
+			__add_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
+		} else if (rt_runtime != 0 && old_runtime == 0 &&
+			   !sched_group_has_live_siblings(tg)) {
+			__sub_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
+		}
+	}
+
 	if (is_active)
 		dl_server_start(dl_se);
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 928f53c1b0..a2084e9dc5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -113,7 +113,8 @@ void free_rt_sched_group(struct task_group *tg)
 		 * Fix this issue by changing the group runtime
 		 * to 0 immediately before freeing it.
 		 */
-		dl_init_tg(tg->dl_se[i], 0, tg->dl_se[i]->dl_period);
+		if (tg->dl_se[i]->dl_runtime)
+			dl_init_tg(tg, i, 0, tg->dl_se[i]->dl_period);

 		raw_spin_rq_lock_irqsave(cpu_rq(i), flags);
 		hrtimer_cancel(&tg->dl_se[i]->dl_timer);
@@ -2134,6 +2135,14 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 	static DEFINE_MUTEX(rt_constraints_mutex);
 	int i, err = 0;

+	/*
+	 * Do not allow setting an RT runtime > 0 if the parent
+	 * (other than the root group) has RT tasks.
+	 */
+	if (rt_runtime && tg != &root_task_group &&
+	    tg->parent != &root_task_group && tg_has_rt_tasks(tg->parent))
+		return -EINVAL;
+
 	/* No period doesn't make any sense. */
 	if (rt_period == 0)
 		return -EINVAL;
@@ -2157,7 +2166,7 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 		return 0;

 	for_each_possible_cpu(i) {
-		dl_init_tg(tg->dl_se[i], rt_runtime, rt_period);
+		dl_init_tg(tg, i, rt_runtime, rt_period);
 	}

 	return 0;
@@ -2228,7 +2237,8 @@ int sched_rt_can_attach(struct task_group *tg)
 	if (rt_group_sched_enabled() && tg->dl_bandwidth.dl_runtime == 0)
 		return 0;

-	return 1;
+	/* Tasks can be attached only to live groups (no child has runtime). */
+	return is_live_sched_group(tg);
 }

 #else /* !CONFIG_RT_GROUP_SCHED */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4b65775ada..6c3fbfe84f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -411,7 +411,8 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq,
 		    dl_server_pick_f pick_task);
 extern void sched_init_dl_servers(void);
 extern int dl_check_tg(unsigned long total);
-extern void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period);
+extern void dl_init_tg(struct task_group *tg, int cpu, u64 rt_runtime, u64 rt_period);
+extern bool is_live_sched_group(struct task_group *tg);

 extern void dl_server_update_idle_time(struct rq *rq,
 		    struct task_struct *p);
--
2.51.0