[v3] Hierarchical Constant Bandwidth Server

[RFC PATCH v3 09/24] sched/rt: Add {alloc/free}_rt_sched_group

Posted by Yuri Andriaccio 4 months, 1 week ago

From: luca abeni <luca.abeni@santannapisa.it>

Add allocation and deallocation code for rt-cgroups.
Declare dl_server specific functions (no implementation yet), needed by
the allocation code.

Co-developed-by: Alessio Balsini <a.balsini@sssup.it>
Signed-off-by: Alessio Balsini <a.balsini@sssup.it>
Co-developed-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Andrea Parri <parri.andrea@gmail.com>
Co-developed-by: Yuri Andriaccio <yurand2000@gmail.com>
Signed-off-by: Yuri Andriaccio <yurand2000@gmail.com>
Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
---
 kernel/sched/rt.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 84dbb4853b6..3094f59d0c8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -93,8 +93,43 @@ void unregister_rt_sched_group(struct task_group *tg)
 
 void free_rt_sched_group(struct task_group *tg)
 {
+	int i;
+	unsigned long flags;
+	struct rq *served_rq;
+
 	if (!rt_group_sched_enabled())
 		return;
+
+	if (!tg->dl_se || !tg->rt_rq)
+		return;
+
+	for_each_possible_cpu(i) {
+		if (!tg->dl_se[i] || !tg->rt_rq[i])
+			continue;
+
+		/*
+		 * Shutdown the dl_server and free it
+		 *
+		 * Since the dl timer is going to be cancelled,
+		 * we risk to never decrease the running bw...
+		 * Fix this issue by changing the group runtime
+		 * to 0 immediately before freeing it.
+		 */
+		dl_init_tg(tg->dl_se[i], 0, tg->dl_se[i]->dl_period);
+
+		raw_spin_rq_lock_irqsave(cpu_rq(i), flags);
+		BUG_ON(tg->rt_rq[i]->rt_nr_running);
+		hrtimer_cancel(&tg->dl_se[i]->dl_timer);
+		raw_spin_rq_unlock_irqrestore(cpu_rq(i), flags);
+		kfree(tg->dl_se[i]);
+
+		/* Free the local per-cpu runqueue */
+		served_rq = container_of(tg->rt_rq[i], struct rq, rt);
+		kfree(served_rq);
+	}
+
+	kfree(tg->rt_rq);
+	kfree(tg->dl_se);
 }
 
 void init_tg_rt_entry(struct task_group *tg, struct rq *served_rq,
@@ -109,11 +144,65 @@ void init_tg_rt_entry(struct task_group *tg, struct rq *served_rq,
 	tg->dl_se[cpu] = dl_se;
 }
 
+static bool rt_server_has_tasks(struct sched_dl_entity *dl_se)
+{
+	return false;
+}
+
+static struct task_struct *rt_server_pick(struct sched_dl_entity *dl_se)
+{
+	return NULL;
+}
+
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
+	struct rq *s_rq;
+	struct sched_dl_entity *dl_se;
+	int i;
+
 	if (!rt_group_sched_enabled())
 		return 1;
 
+	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(struct rt_rq *), GFP_KERNEL);
+	if (!tg->rt_rq)
+		return 0;
+
+	tg->dl_se = kcalloc(nr_cpu_ids, sizeof(dl_se), GFP_KERNEL);
+	if (!tg->dl_se) {
+		kfree(tg->rt_rq);
+		tg->rt_rq = NULL;
+		return 0;
+	}
+
+	init_dl_bandwidth(&tg->dl_bandwidth, 0, 0);
+
+	for_each_possible_cpu(i) {
+		s_rq = kzalloc_node(sizeof(struct rq),
+				     GFP_KERNEL, cpu_to_node(i));
+		if (!s_rq)
+			return 0;
+
+		dl_se = kzalloc_node(sizeof(struct sched_dl_entity),
+				     GFP_KERNEL, cpu_to_node(i));
+		if (!dl_se) {
+			kfree(s_rq);
+			return 0;
+		}
+
+		init_rt_rq(&s_rq->rt);
+		init_dl_entity(dl_se);
+		dl_se->dl_runtime = tg->dl_bandwidth.dl_runtime;
+		dl_se->dl_period = tg->dl_bandwidth.dl_period;
+		dl_se->dl_deadline = dl_se->dl_period;
+		dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+		dl_se->dl_density = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+		dl_se->dl_server = 1;
+
+		dl_server_init(dl_se, &cpu_rq(i)->dl, s_rq, rt_server_has_tasks, rt_server_pick);
+
+		init_tg_rt_entry(tg, s_rq, dl_se, i, parent->dl_se[i]);
+	}
+
 	return 1;
 }
 
-- 
2.51.0

Re: [RFC PATCH v3 09/24] sched/rt: Add {alloc/free}_rt_sched_group

Posted by Juri Lelli 4 months ago

Hello,

On 29/09/25 11:22, Yuri Andriaccio wrote:
> From: luca abeni <luca.abeni@santannapisa.it>
> 
> Add allocation and deallocation code for rt-cgroups.
> Declare dl_server specific functions (no implementation yet), needed by
> the allocation code.
> 
> Co-developed-by: Alessio Balsini <a.balsini@sssup.it>
> Signed-off-by: Alessio Balsini <a.balsini@sssup.it>
> Co-developed-by: Andrea Parri <parri.andrea@gmail.com>
> Signed-off-by: Andrea Parri <parri.andrea@gmail.com>
> Co-developed-by: Yuri Andriaccio <yurand2000@gmail.com>
> Signed-off-by: Yuri Andriaccio <yurand2000@gmail.com>
> Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
> ---
>  kernel/sched/rt.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 89 insertions(+)
> 
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 84dbb4853b6..3094f59d0c8 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -93,8 +93,43 @@ void unregister_rt_sched_group(struct task_group *tg)
>  
>  void free_rt_sched_group(struct task_group *tg)
>  {
> +	int i;
> +	unsigned long flags;
> +	struct rq *served_rq;
> +
>  	if (!rt_group_sched_enabled())
>  		return;
> +
> +	if (!tg->dl_se || !tg->rt_rq)
> +		return;
> +
> +	for_each_possible_cpu(i) {
> +		if (!tg->dl_se[i] || !tg->rt_rq[i])
> +			continue;
> +
> +		/*
> +		 * Shutdown the dl_server and free it
> +		 *
> +		 * Since the dl timer is going to be cancelled,
> +		 * we risk to never decrease the running bw...
> +		 * Fix this issue by changing the group runtime
> +		 * to 0 immediately before freeing it.
> +		 */
> +		dl_init_tg(tg->dl_se[i], 0, tg->dl_se[i]->dl_period);
> +
> +		raw_spin_rq_lock_irqsave(cpu_rq(i), flags);
> +		BUG_ON(tg->rt_rq[i]->rt_nr_running);

Crashing is probably a bit harsh. What didn't happen that should have
happened if there are still tasks in the group at this point? Can we
maybe WARN and try to recover w/o crashing?

> +		hrtimer_cancel(&tg->dl_se[i]->dl_timer);
> +		raw_spin_rq_unlock_irqrestore(cpu_rq(i), flags);
> +		kfree(tg->dl_se[i]);
> +
> +		/* Free the local per-cpu runqueue */
> +		served_rq = container_of(tg->rt_rq[i], struct rq, rt);
> +		kfree(served_rq);
> +	}
> +
> +	kfree(tg->rt_rq);
> +	kfree(tg->dl_se);

...

>  int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
>  {
> +	struct rq *s_rq;
> +	struct sched_dl_entity *dl_se;
> +	int i;
> +
>  	if (!rt_group_sched_enabled())
>  		return 1;
>  
> +	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(struct rt_rq *), GFP_KERNEL);
> +	if (!tg->rt_rq)
> +		return 0;
> +
> +	tg->dl_se = kcalloc(nr_cpu_ids, sizeof(dl_se), GFP_KERNEL);

Nit. Maybe sizeof(struct sched_dl_entity *) is more clear and consistent
with the above (current form is still correct, of course).

> +	if (!tg->dl_se) {
> +		kfree(tg->rt_rq);
> +		tg->rt_rq = NULL;
> +		return 0;
> +	}
> +
> +	init_dl_bandwidth(&tg->dl_bandwidth, 0, 0);
> +
> +	for_each_possible_cpu(i) {
> +		s_rq = kzalloc_node(sizeof(struct rq),
> +				     GFP_KERNEL, cpu_to_node(i));
> +		if (!s_rq)
> +			return 0;
> +
> +		dl_se = kzalloc_node(sizeof(struct sched_dl_entity),
> +				     GFP_KERNEL, cpu_to_node(i));
> +		if (!dl_se) {
> +			kfree(s_rq);
> +			return 0;
> +		}

Are we not leaking if allocation failure happens mid way during this for
loop? Do we free data structures allocated for previous CPUs?

> +
> +		init_rt_rq(&s_rq->rt);
> +		init_dl_entity(dl_se);
> +		dl_se->dl_runtime = tg->dl_bandwidth.dl_runtime;
> +		dl_se->dl_period = tg->dl_bandwidth.dl_period;
> +		dl_se->dl_deadline = dl_se->dl_period;
> +		dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
> +		dl_se->dl_density = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
> +		dl_se->dl_server = 1;
> +
> +		dl_server_init(dl_se, &cpu_rq(i)->dl, s_rq, rt_server_has_tasks, rt_server_pick);
> +
> +		init_tg_rt_entry(tg, s_rq, dl_se, i, parent->dl_se[i]);
> +	}
> +
>  	return 1;
>  }

Thanks,
Juri