[RFC PATCH v4 14/28] sched/rt: Update rt-cgroup schedulability checks

From: luca abeni <luca.abeni@santannapisa.it>

Update sched_group_rt_runtime/period and sched_group_set_rt_runtime/period
to use the newly defined data structures and to perform the necessary
checks when updating the runtime and period of a given group.
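
For context, these set/get pairs back the cpu controller's rt_runtime_us
and rt_period_us interface files; an illustrative userspace interaction
(paths and group name are examples only):

  # echo 1000000 > /sys/fs/cgroup/cpu/<g>/cpu.rt_period_us
  # echo 500000  > /sys/fs/cgroup/cpu/<g>/cpu.rt_runtime_us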

The set functions call tg_set_rt_bandwidth() which is also updated:
- Use the newly added HCBS dl_bandwidth structure instead of rt_bandwidth.
- Update __rt_schedulable() to check for numerical issues (see the worked
  example after this list):
  - Reject a non-zero runtime that is too small, since a very small
    runtime would make the servers behave as if they had zero runtime.
  - Since some computations use signed integers, reject a period so big
    that it becomes negative when read as a signed integer. If the period
    satisfies this constraint, so does the runtime, since the runtime is
    always less than or equal to the period.
- Update tg_rt_schedulable(), used when walking the cgroup tree to check
  that all invariants are met:
  - Update most accesses to read from the newly added data structure
    (dl_bandwidth).
  - If the task group is the root group, run a total bandwidth check with
    the newly added dl_check_tg() function.
- Once all checks have succeeded, if the changed group is not the root
  cgroup, propagate the updated runtime and period to all of the group's
  local deadline servers.
- Additionally, use mutex and spinlock guards instead of manual
  lock/unlock calls.
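
As a worked illustration of the numerical checks (DL_SCALE is 10 in the
current sources; the numbers below are examples, not part of the patch):

  runtime = 500 ns   -> rejected: 500 < (1ULL << DL_SCALE) = 1024, i.e.
                        small enough to vanish in DL_SCALE truncation
  period  = 1ULL<<63 -> rejected: the MSB is set, so the period would
                        read as negative in signed arithmetic (2^63 ns
                        is roughly 292 years, so nothing usable is lost)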

Add dl_check_tg(), which performs an admission control test similar to
__dl_overflow(), but here we are updating the cgroup's total bandwidth
rather than admitting a new SCHED_DEADLINE task or updating a non-cgroup
deadline server.
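
Sketching the test with illustrative numbers (BW_SHIFT is 20 in the
current sources): for a CPU at full capacity (cap == 1024),
cap_scale(v, cap) == v, so with the default 95% limit dl_b->bw ~=
0.95 * 2^20 = 996147. If dl_b->total_bw = 0.5 * 2^20 = 524288 is already
admitted and the cgroup update asks for another total = 524288, then

  996147 < 524288 + 524288 = 1048576

so dl_check_tg() returns 0 and tg_rt_schedulable() fails with -EBUSY.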

Finally, prevent the creation of cgroup hierarchies with depth greater
than two; deeper hierarchies will be supported by a future patch. A
depth-two hierarchy is sufficient for now for testing the patchset.
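
Illustratively (group names are placeholders), after this patch:

  /sys/fs/cgroup/cpu/<g>      -> created: parent is root_task_group
  /sys/fs/cgroup/cpu/<g>/<g2> -> rejected: css_alloc() returns -EINVAL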

Co-developed-by: Alessio Balsini <a.balsini@sssup.it>
Signed-off-by: Alessio Balsini <a.balsini@sssup.it>
Co-developed-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Andrea Parri <parri.andrea@gmail.com>
Co-developed-by: Yuri Andriaccio <yurand2000@gmail.com>
Signed-off-by: Yuri Andriaccio <yurand2000@gmail.com>
Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
---
 kernel/sched/core.c     |  6 ++++
 kernel/sched/deadline.c | 46 +++++++++++++++++++++++----
 kernel/sched/rt.c       | 70 +++++++++++++++++++++++------------------
 kernel/sched/sched.h    |  1 +
 4 files changed, 87 insertions(+), 36 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d7fc83cdae..bdf1bebe52 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9253,6 +9253,12 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		return &root_task_group.css;
 	}
 
+	/* Do not allow cpu_cgroup hierarchies with depth greater than 2. */
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (parent != &root_task_group)
+		return ERR_PTR(-EINVAL);
+#endif
+
 	tg = sched_create_group(parent);
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b890fdd4b2..7ed157dfa6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -347,7 +347,47 @@ void cancel_inactive_timer(struct sched_dl_entity *dl_se)
 	cancel_dl_timer(dl_se, &dl_se->inactive_timer);
 }
 
+/*
+ * Used for dl_bw check and update, used under sched_rt_handler()::mutex and
+ * sched_domains_mutex.
+ */
+u64 dl_cookie;
+
 #ifdef CONFIG_RT_GROUP_SCHED
+int dl_check_tg(unsigned long total)
+{
+	unsigned long flags;
+	int which_cpu;
+	int cap;
+	struct dl_bw *dl_b;
+	u64 gen = ++dl_cookie;
+
+	for_each_possible_cpu(which_cpu) {
+		rcu_read_lock_sched();
+
+		if (!dl_bw_visited(which_cpu, gen)) {
+			cap = dl_bw_capacity(which_cpu);
+			dl_b = dl_bw_of(which_cpu);
+
+			raw_spin_lock_irqsave(&dl_b->lock, flags);
+
+			if (dl_b->bw != -1 &&
+			    cap_scale(dl_b->bw, cap) < dl_b->total_bw + cap_scale(total, cap)) {
+				raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+				rcu_read_unlock_sched();
+
+				return 0;
+			}
+
+			raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+		}
+
+		rcu_read_unlock_sched();
+	}
+
+	return 1;
+}
+
 void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
 {
 	struct rq *rq = container_of(dl_se->dl_rq, struct rq, dl);
@@ -3150,12 +3190,6 @@ DEFINE_SCHED_CLASS(dl) = {
 #endif
 };
 
-/*
- * Used for dl_bw check and update, used under sched_rt_handler()::mutex and
- * sched_domains_mutex.
- */
-u64 dl_cookie;
-
 int sched_dl_global_validate(void)
 {
 	u64 runtime = global_rt_runtime();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2b7c4b7754..b0a6da20b5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2007,11 +2007,6 @@ DEFINE_SCHED_CLASS(rt) = {
 };
 
 #ifdef CONFIG_RT_GROUP_SCHED
-/*
- * Ensure that the real time constraints are schedulable.
- */
-static DEFINE_MUTEX(rt_constraints_mutex);
-
 static inline int tg_has_rt_tasks(struct task_group *tg)
 {
 	struct task_struct *task;
@@ -2045,8 +2040,8 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 	unsigned long total, sum = 0;
 	u64 period, runtime;
 
-	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	runtime = tg->rt_bandwidth.rt_runtime;
+	period  = tg->dl_bandwidth.dl_period;
+	runtime = tg->dl_bandwidth.dl_runtime;
 
 	if (tg == d->tg) {
 		period = d->rt_period;
@@ -2062,8 +2057,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 	/*
 	 * Ensure we don't starve existing RT tasks if runtime turns zero.
 	 */
-	if (rt_bandwidth_enabled() && !runtime &&
-	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
+	if (dl_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 
 	if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group))
@@ -2077,12 +2071,17 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
 		return -EINVAL;
 
+	if (tg == &root_task_group) {
+		if (!dl_check_tg(total))
+			return -EBUSY;
+	}
+
 	/*
 	 * The sum of our children's runtime should not exceed our own.
 	 */
 	list_for_each_entry_rcu(child, &tg->children, siblings) {
-		period = ktime_to_ns(child->rt_bandwidth.rt_period);
-		runtime = child->rt_bandwidth.rt_runtime;
+		period  = child->dl_bandwidth.dl_period;
+		runtime = child->dl_bandwidth.dl_runtime;
 
 		if (child == d->tg) {
 			period = d->rt_period;
@@ -2108,6 +2107,20 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 		.rt_runtime = runtime,
 	};
 
+	/*
+	 * Since we truncate DL_SCALE bits, make sure we're at least
+	 * that big.
+	 */
+	if (runtime != 0 && runtime < (1ULL << DL_SCALE))
+		return -EINVAL;
+
+	/*
+	 * Since we use the MSB for wrap-around and sign issues, make
+	 * sure it's not set (mind that period can be equal to zero).
+	 */
+	if (period & (1ULL << 63))
+		return -EINVAL;
+
 	rcu_read_lock();
 	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
 	rcu_read_unlock();
@@ -2118,6 +2131,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 static int tg_set_rt_bandwidth(struct task_group *tg,
 		u64 rt_period, u64 rt_runtime)
 {
+	static DEFINE_MUTEX(rt_constraints_mutex);
 	int i, err = 0;
 
 	/*
@@ -2137,34 +2151,30 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 	if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
 		return -EINVAL;
 
-	mutex_lock(&rt_constraints_mutex);
+	guard(mutex)(&rt_constraints_mutex);
 	err = __rt_schedulable(tg, rt_period, rt_runtime);
 	if (err)
-		goto unlock;
+		return err;
 
-	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
-	tg->rt_bandwidth.rt_runtime = rt_runtime;
+	guard(raw_spinlock_irq)(&tg->dl_bandwidth.dl_runtime_lock);
+	tg->dl_bandwidth.dl_period  = rt_period;
+	tg->dl_bandwidth.dl_runtime = rt_runtime;
 
-	for_each_possible_cpu(i) {
-		struct rt_rq *rt_rq = tg->rt_rq[i];
+	if (tg == &root_task_group)
+		return 0;
 
-		raw_spin_lock(&rt_rq->rt_runtime_lock);
-		rt_rq->rt_runtime = rt_runtime;
-		raw_spin_unlock(&rt_rq->rt_runtime_lock);
+	for_each_possible_cpu(i) {
+		dl_init_tg(tg->dl_se[i], rt_runtime, rt_period);
 	}
-	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-unlock:
-	mutex_unlock(&rt_constraints_mutex);
 
-	return err;
+	return 0;
 }
 
 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
 	u64 rt_runtime, rt_period;
 
-	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	rt_period  = tg->dl_bandwidth.dl_period;
 	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
 	if (rt_runtime_us < 0)
 		rt_runtime = RUNTIME_INF;
@@ -2178,10 +2188,10 @@ long sched_group_rt_runtime(struct task_group *tg)
 {
 	u64 rt_runtime_us;
 
-	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
+	if (tg->dl_bandwidth.dl_runtime == RUNTIME_INF)
 		return -1;
 
-	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
+	rt_runtime_us = tg->dl_bandwidth.dl_runtime;
 	do_div(rt_runtime_us, NSEC_PER_USEC);
 	return rt_runtime_us;
 }
@@ -2194,7 +2204,7 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
 		return -EINVAL;
 
 	rt_period = rt_period_us * NSEC_PER_USEC;
-	rt_runtime = tg->rt_bandwidth.rt_runtime;
+	rt_runtime = tg->dl_bandwidth.dl_runtime;
 
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
@@ -2203,7 +2213,7 @@ long sched_group_rt_period(struct task_group *tg)
 {
 	u64 rt_period_us;
 
-	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	rt_period_us = tg->dl_bandwidth.dl_period;
 	do_div(rt_period_us, NSEC_PER_USEC);
 	return rt_period_us;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bc3ed02e40..334ab6d597 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -419,6 +419,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq,
 		    struct rq *served_rq,
 		    dl_server_pick_f pick_task);
 extern void sched_init_dl_servers(void);
+extern int dl_check_tg(unsigned long total);
 extern void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period);
 
 extern void dl_server_update_idle_time(struct rq *rq,
-- 
2.51.0