[RFC PATCH v3 13/24] sched/rt: Update rt-cgroup schedulability checks

Posted by Yuri Andriaccio 4 months, 1 week ago
From: luca abeni <luca.abeni@santannapisa.it>

Update schedulability checks and setup of runtime/period for rt-cgroups.

Co-developed-by: Alessio Balsini <a.balsini@sssup.it>
Signed-off-by: Alessio Balsini <a.balsini@sssup.it>
Co-developed-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Andrea Parri <parri.andrea@gmail.com>
Co-developed-by: Yuri Andriaccio <yurand2000@gmail.com>
Signed-off-by: Yuri Andriaccio <yurand2000@gmail.com>
Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
---
 kernel/sched/core.c     |  6 ++++
 kernel/sched/deadline.c | 46 +++++++++++++++++++++++----
 kernel/sched/rt.c       | 70 +++++++++++++++++++++++------------------
 kernel/sched/sched.h    |  1 +
 4 files changed, 87 insertions(+), 36 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2cfbe3b7b17..1217f714dd2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9281,6 +9281,12 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		return &root_task_group.css;
 	}
 
+	/* Do not allow cpu_cgroup hierarchies with depth greater than 2. */
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (parent != &root_task_group)
+		return ERR_PTR(-EINVAL);
+#endif
+
 	tg = sched_create_group(parent);
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1293b9a252b..5d93b3ca030 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -347,7 +347,47 @@ void cancel_inactive_timer(struct sched_dl_entity *dl_se)
 	cancel_dl_timer(dl_se, &dl_se->inactive_timer);
 }
 
+/*
+ * Used for dl_bw check and update, used under sched_rt_handler()::mutex and
+ * sched_domains_mutex.
+ */
+u64 dl_cookie;
+
 #ifdef CONFIG_RT_GROUP_SCHED
+int dl_check_tg(unsigned long total)
+{
+	unsigned long flags;
+	int which_cpu;
+	int cpus;
+	struct dl_bw *dl_b;
+	u64 gen = ++dl_cookie;
+
+	for_each_possible_cpu(which_cpu) {
+		rcu_read_lock_sched();
+
+		if (!dl_bw_visited(which_cpu, gen)) {
+			cpus = dl_bw_cpus(which_cpu);
+			dl_b = dl_bw_of(which_cpu);
+
+			raw_spin_lock_irqsave(&dl_b->lock, flags);
+
+			if (dl_b->bw != -1 &&
+			    dl_b->bw * cpus < dl_b->total_bw + total * cpus) {
+				raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+				rcu_read_unlock_sched();
+
+				return 0;
+			}
+
+			raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+		}
+
+		rcu_read_unlock_sched();
+	}
+
+	return 1;
+}
+
 void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
 {
 	struct rq *rq = container_of(dl_se->dl_rq, struct rq, dl);
@@ -3153,12 +3193,6 @@ DEFINE_SCHED_CLASS(dl) = {
 #endif
 };
 
-/*
- * Used for dl_bw check and update, used under sched_rt_handler()::mutex and
- * sched_domains_mutex.
- */
-u64 dl_cookie;
-
 int sched_dl_global_validate(void)
 {
 	u64 runtime = global_rt_runtime();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ce114823fe7..7c7622303e8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1996,11 +1996,6 @@ DEFINE_SCHED_CLASS(rt) = {
 };
 
 #ifdef CONFIG_RT_GROUP_SCHED
-/*
- * Ensure that the real time constraints are schedulable.
- */
-static DEFINE_MUTEX(rt_constraints_mutex);
-
 static inline int tg_has_rt_tasks(struct task_group *tg)
 {
 	struct task_struct *task;
@@ -2034,8 +2029,8 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 	unsigned long total, sum = 0;
 	u64 period, runtime;
 
-	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	runtime = tg->rt_bandwidth.rt_runtime;
+	period  = tg->dl_bandwidth.dl_period;
+	runtime = tg->dl_bandwidth.dl_runtime;
 
 	if (tg == d->tg) {
 		period = d->rt_period;
@@ -2051,8 +2046,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 	/*
 	 * Ensure we don't starve existing RT tasks if runtime turns zero.
 	 */
-	if (rt_bandwidth_enabled() && !runtime &&
-	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
+	if (dl_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 
 	if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group))
@@ -2066,12 +2060,17 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
 		return -EINVAL;
 
+	if (tg == &root_task_group) {
+		if (!dl_check_tg(total))
+			return -EBUSY;
+	}
+
 	/*
 	 * The sum of our children's runtime should not exceed our own.
 	 */
 	list_for_each_entry_rcu(child, &tg->children, siblings) {
-		period = ktime_to_ns(child->rt_bandwidth.rt_period);
-		runtime = child->rt_bandwidth.rt_runtime;
+		period  = child->dl_bandwidth.dl_period;
+		runtime = child->dl_bandwidth.dl_runtime;
 
 		if (child == d->tg) {
 			period = d->rt_period;
@@ -2097,6 +2096,20 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 		.rt_runtime = runtime,
 	};
 
+	/*
+	* Since we truncate DL_SCALE bits, make sure we're at least
+	* that big.
+	*/
+	if (runtime != 0 && runtime < (1ULL << DL_SCALE))
+		return -EINVAL;
+
+	/*
+	* Since we use the MSB for wrap-around and sign issues, make
+	* sure it's not set (mind that period can be equal to zero).
+	*/
+	if (period & (1ULL << 63))
+		return -EINVAL;
+
 	rcu_read_lock();
 	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
 	rcu_read_unlock();
@@ -2107,6 +2120,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 static int tg_set_rt_bandwidth(struct task_group *tg,
 		u64 rt_period, u64 rt_runtime)
 {
+	static DEFINE_MUTEX(rt_constraints_mutex);
 	int i, err = 0;
 
 	/*
@@ -2126,34 +2140,30 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 	if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
 		return -EINVAL;
 
-	mutex_lock(&rt_constraints_mutex);
+	guard(mutex)(&rt_constraints_mutex);
 	err = __rt_schedulable(tg, rt_period, rt_runtime);
 	if (err)
-		goto unlock;
+		return err;
 
-	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
-	tg->rt_bandwidth.rt_runtime = rt_runtime;
+	guard(raw_spinlock_irq)(&tg->dl_bandwidth.dl_runtime_lock);
+	tg->dl_bandwidth.dl_period  = rt_period;
+	tg->dl_bandwidth.dl_runtime = rt_runtime;
 
-	for_each_possible_cpu(i) {
-		struct rt_rq *rt_rq = tg->rt_rq[i];
+	if (tg == &root_task_group)
+		return 0;
 
-		raw_spin_lock(&rt_rq->rt_runtime_lock);
-		rt_rq->rt_runtime = rt_runtime;
-		raw_spin_unlock(&rt_rq->rt_runtime_lock);
+	for_each_possible_cpu(i) {
+		dl_init_tg(tg->dl_se[i], rt_runtime, rt_period);
 	}
-	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-unlock:
-	mutex_unlock(&rt_constraints_mutex);
 
-	return err;
+	return 0;
 }
 
 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
 	u64 rt_runtime, rt_period;
 
-	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	rt_period  = tg->dl_bandwidth.dl_period;
 	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
 	if (rt_runtime_us < 0)
 		rt_runtime = RUNTIME_INF;
@@ -2167,10 +2177,10 @@ long sched_group_rt_runtime(struct task_group *tg)
 {
 	u64 rt_runtime_us;
 
-	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
+	if (tg->dl_bandwidth.dl_runtime == RUNTIME_INF)
 		return -1;
 
-	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
+	rt_runtime_us = tg->dl_bandwidth.dl_runtime;
 	do_div(rt_runtime_us, NSEC_PER_USEC);
 	return rt_runtime_us;
 }
@@ -2183,7 +2193,7 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
 		return -EINVAL;
 
 	rt_period = rt_period_us * NSEC_PER_USEC;
-	rt_runtime = tg->rt_bandwidth.rt_runtime;
+	rt_runtime = tg->dl_bandwidth.dl_runtime;
 
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
@@ -2192,7 +2202,7 @@ long sched_group_rt_period(struct task_group *tg)
 {
 	u64 rt_period_us;
 
-	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	rt_period_us = tg->dl_bandwidth.dl_period;
 	do_div(rt_period_us, NSEC_PER_USEC);
 	return rt_period_us;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2c87541257..97e1e779df9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -394,6 +394,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq,
 		    dl_server_has_tasks_f has_tasks,
 		    dl_server_pick_f pick_task);
 extern void sched_init_dl_servers(void);
+extern int dl_check_tg(unsigned long total);
 extern void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period);
 
 extern void dl_server_update_idle_time(struct rq *rq,
-- 
2.51.0
Re: [RFC PATCH v3 13/24] sched/rt: Update rt-cgroup schedulability checks
Posted by Juri Lelli 4 months ago
Hello,

On 29/09/25 11:22, Yuri Andriaccio wrote:
> From: luca abeni <luca.abeni@santannapisa.it>
> 
> Update schedulability checks and setup of runtime/period for rt-cgroups.

So, it looks like the changelogs are all too minimal and dry. A more
comprehensive (but still concise) description of what the patch does, why it
does it and how, will help with review and will be essential in the future
for tracing back design decisions and issues. Please consider this for the
current and the previous/subsequent patches.

> Co-developed-by: Alessio Balsini <a.balsini@sssup.it>
> Signed-off-by: Alessio Balsini <a.balsini@sssup.it>
> Co-developed-by: Andrea Parri <parri.andrea@gmail.com>
> Signed-off-by: Andrea Parri <parri.andrea@gmail.com>
> Co-developed-by: Yuri Andriaccio <yurand2000@gmail.com>
> Signed-off-by: Yuri Andriaccio <yurand2000@gmail.com>
> Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
> ---
>  kernel/sched/core.c     |  6 ++++
>  kernel/sched/deadline.c | 46 +++++++++++++++++++++++----
>  kernel/sched/rt.c       | 70 +++++++++++++++++++++++------------------
>  kernel/sched/sched.h    |  1 +
>  4 files changed, 87 insertions(+), 36 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 2cfbe3b7b17..1217f714dd2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -9281,6 +9281,12 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>  		return &root_task_group.css;
>  	}
>  
> +	/* Do not allow cpu_cgroup hierarchies with depth greater than 2. */

I believe this limit gets removed with later patches, but since we have
it here, we should state why we have the limit in place.

> +#ifdef CONFIG_RT_GROUP_SCHED
> +	if (parent != &root_task_group)
> +		return ERR_PTR(-EINVAL);
> +#endif
> +
>  	tg = sched_create_group(parent);
>  	if (IS_ERR(tg))
>  		return ERR_PTR(-ENOMEM);
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 1293b9a252b..5d93b3ca030 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -347,7 +347,47 @@ void cancel_inactive_timer(struct sched_dl_entity *dl_se)
>  	cancel_dl_timer(dl_se, &dl_se->inactive_timer);
>  }
>  
> +/*
> + * Used for dl_bw check and update, used under sched_rt_handler()::mutex and
> + * sched_domains_mutex.
> + */
> +u64 dl_cookie;
> +
>  #ifdef CONFIG_RT_GROUP_SCHED
> +int dl_check_tg(unsigned long total)
> +{
> +	unsigned long flags;
> +	int which_cpu;
> +	int cpus;
> +	struct dl_bw *dl_b;
> +	u64 gen = ++dl_cookie;
> +
> +	for_each_possible_cpu(which_cpu) {
> +		rcu_read_lock_sched();
> +
> +		if (!dl_bw_visited(which_cpu, gen)) {
> +			cpus = dl_bw_cpus(which_cpu);
> +			dl_b = dl_bw_of(which_cpu);
> +
> +			raw_spin_lock_irqsave(&dl_b->lock, flags);
> +
> +			if (dl_b->bw != -1 &&
> +			    dl_b->bw * cpus < dl_b->total_bw + total * cpus) {

Does this need to use cap_scale()?
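
For comparison, the existing admission-control helper scales the per-CPU
limit by the root domain's capacity instead of multiplying by the CPU count;
it reads roughly as follows (paraphrased from kernel/sched/sched.h, not part
of this patch):

	static inline bool
	__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw)
	{
		/* 'cap' is the summed capacity of the CPUs in the root domain. */
		return dl_b->bw != -1 &&
		       cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
	}

so a capacity-aware variant of the check above would presumably compare
against cap_scale(dl_b->bw, dl_bw_capacity(which_cpu)) rather than
dl_b->bw * cpus.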

> +				raw_spin_unlock_irqrestore(&dl_b->lock, flags);
> +				rcu_read_unlock_sched();
> +
> +				return 0;
> +			}
> +
> +			raw_spin_unlock_irqrestore(&dl_b->lock, flags);
> +		}
> +
> +		rcu_read_unlock_sched();
> +	}
> +
> +	return 1;
> +}
> +

...

> @@ -2034,8 +2029,8 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
>  	unsigned long total, sum = 0;
>  	u64 period, runtime;
>  
> -	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
> -	runtime = tg->rt_bandwidth.rt_runtime;
> +	period  = tg->dl_bandwidth.dl_period;
> +	runtime = tg->dl_bandwidth.dl_runtime;

Just as an example, this is the kind of important change (rt_bandwidth
-> dl_bandwidth) that usually deserves to be explicitly mentioned in the
changelog.

...

> @@ -2097,6 +2096,20 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
>  		.rt_runtime = runtime,
>  	};
>  
> +	/*
> +	* Since we truncate DL_SCALE bits, make sure we're at least
> +	* that big.
> +	*/
> +	if (runtime != 0 && runtime < (1ULL << DL_SCALE))
> +		return -EINVAL;
> +
> +	/*
        ^
Nit, fix alignment.

> +	* Since we use the MSB for wrap-around and sign issues, make
> +	* sure it's not set (mind that period can be equal to zero).
> +	*/
> +	if (period & (1ULL << 63))
> +		return -EINVAL;
> +

Thanks,
Juri
Re: [RFC PATCH v3 13/24] sched/rt: Update rt-cgroup schedulability checks
Posted by Markus Elfring 4 months, 1 week ago
…
> +++ b/kernel/sched/deadline.c
> @@ -340,6 +340,39 @@ void cancel_inactive_timer(struct sched_dl_entity *dl_se)
…
> +#ifdef CONFIG_RT_GROUP_SCHED
> +void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
> +{
…
> +	u64 new_bw;
> +
> +	raw_spin_rq_lock_irq(rq);
…
> +	raw_spin_rq_unlock_irq(rq);
> +}
> +#endif
…

What do you think about defining and using a corresponding lock guard?
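
A minimal sketch of what that could look like, using the generic
DEFINE_LOCK_GUARD_1() helper; the guard name below is made up for
illustration and is not an existing guard in sched.h:

	/* Hypothetical guard pairing raw_spin_rq_lock_irq()/raw_spin_rq_unlock_irq(). */
	DEFINE_LOCK_GUARD_1(rq_raw_lock_irq, struct rq,
			    raw_spin_rq_lock_irq(_T->lock),
			    raw_spin_rq_unlock_irq(_T->lock))

	void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
	{
		struct rq *rq = container_of(dl_se->dl_rq, struct rq, dl);

		guard(rq_raw_lock_irq)(rq);
		/* ... body runs with the rq lock held and IRQs off; unlock on return ... */
	}

which would drop the explicit raw_spin_rq_unlock_irq() calls on every exit
path.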

Regards,
Markus