[v3] Hierarchical Constant Bandwidth Server

[RFC PATCH v3 11/24] sched/rt: Add rt-cgroups' dl-servers operations.

Posted by Yuri Andriaccio 4 months, 1 week ago

Implement the servers' functions that pick the next eligible task to run.
Enable/Disable dl-servers on task enqueue/dequeue when necessary.
Update dl-servers on task update.
Account the number of active rt-tasks in the cgroups' specific runqueue.
Account the number of active rt-tasks on the global counter of active tasks when
a cgroup is enqueued/dequeued (dl-server started/stopped).
Update rq's cpupri only if the cgroup is the root control group.
Record which dl_server is managing a task when it changes runqueue.

Co-developed-by: Alessio Balsini <a.balsini@sssup.it>
Signed-off-by: Alessio Balsini <a.balsini@sssup.it>
Co-developed-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Andrea Parri <parri.andrea@gmail.com>
Co-developed-by: luca abeni <luca.abeni@santannapisa.it>
Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
Signed-off-by: Yuri Andriaccio <yurand2000@gmail.com>
---
 kernel/sched/deadline.c | 16 ++++++---
 kernel/sched/rt.c       | 79 ++++++++++++++++++++++++++++++++++++-----
 kernel/sched/sched.h    |  3 +-
 3 files changed, 85 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 754bfe231b4..1293b9a252b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1869,9 +1869,13 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 	u64 deadline = dl_se->deadline;
 
 	dl_rq->dl_nr_running++;
-
-	if (!dl_server(dl_se))
+	if (!dl_server(dl_se)) {
 		add_nr_running(rq_of_dl_rq(dl_rq), 1);
+	} else if (dl_se != &rq_of_dl_rq(dl_rq)->fair_server) {
+		struct rt_rq *rt_rq = &dl_se->my_q->rt;
+
+		add_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
+	}
 
 	inc_dl_deadline(dl_rq, deadline);
 }
@@ -1881,9 +1885,13 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
 	WARN_ON(!dl_rq->dl_nr_running);
 	dl_rq->dl_nr_running--;
-
-	if (!dl_server(dl_se))
+	if (!dl_server(dl_se)) {
 		sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+	} else if (dl_se != &rq_of_dl_rq(dl_rq)->fair_server) {
+		struct rt_rq *rt_rq = &dl_se->my_q->rt;
+
+		sub_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
+	}
 
 	dec_dl_deadline(dl_rq, dl_se->deadline);
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3094f59d0c8..d9442f64c6b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -144,14 +144,27 @@ void init_tg_rt_entry(struct task_group *tg, struct rq *served_rq,
 	tg->dl_se[cpu] = dl_se;
 }
 
+static struct task_struct *_pick_next_task_rt(struct rt_rq *rt_rq);
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first);
+
 static bool rt_server_has_tasks(struct sched_dl_entity *dl_se)
 {
-	return false;
+	return !!dl_se->my_q->rt.rt_nr_running;
 }
 
 static struct task_struct *rt_server_pick(struct sched_dl_entity *dl_se)
 {
-	return NULL;
+	struct rt_rq *rt_rq = &dl_se->my_q->rt;
+	struct rq *rq = rq_of_rt_rq(rt_rq);
+	struct task_struct *p;
+
+	if (dl_se->my_q->rt.rt_nr_running == 0)
+		return NULL;
+
+	p = _pick_next_task_rt(rt_rq);
+	set_next_task_rt(rq, p, true);
+
+	return p;
 }
 
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
@@ -416,6 +429,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 static void update_curr_rt(struct rq *rq)
 {
 	struct task_struct *donor = rq->donor;
+	struct rt_rq *rt_rq;
 	s64 delta_exec;
 
 	if (donor->sched_class != &rt_sched_class)
@@ -425,8 +439,18 @@ static void update_curr_rt(struct rq *rq)
 	if (unlikely(delta_exec <= 0))
 		return;
 
-	if (!rt_bandwidth_enabled())
+	if (!rt_group_sched_enabled())
+		return;
+
+	if (!dl_bandwidth_enabled())
 		return;
+
+	rt_rq = rt_rq_of_se(&donor->rt);
+	if (is_dl_group(rt_rq)) {
+		struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+		dl_server_update(dl_se, delta_exec);
+	}
 }
 
 static void
@@ -437,7 +461,7 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 	/*
 	 * Change rq's cpupri only if rt_rq is the top queue.
 	 */
-	if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
+	if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && is_dl_group(rt_rq))
 		return;
 
 	if (rq->online && prio < prev_prio)
@@ -452,7 +476,7 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 	/*
 	 * Change rq's cpupri only if rt_rq is the top queue.
 	 */
-	if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
+	if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && is_dl_group(rt_rq))
 		return;
 
 	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
@@ -521,6 +545,15 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
 
 	inc_rt_prio(rt_rq, rt_se_prio(rt_se));
+
+	if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && is_dl_group(rt_rq)) {
+		struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+		if (!dl_se->dl_throttled)
+			add_nr_running(rq_of_rt_rq(rt_rq), 1);
+	} else {
+		add_nr_running(rq_of_rt_rq(rt_rq), 1);
+	}
 }
 
 static inline
@@ -531,6 +564,15 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
 
 	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+
+	if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && is_dl_group(rt_rq)) {
+		struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+		if (!dl_se->dl_throttled)
+			sub_nr_running(rq_of_rt_rq(rt_rq), 1);
+	} else {
+		sub_nr_running(rq_of_rt_rq(rt_rq), 1);
+	}
 }
 
 /*
@@ -712,6 +754,14 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 	check_schedstat_required();
 	update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
 
+	/* Task arriving in an idle group of tasks. */
+	if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) &&
+	    is_dl_group(rt_rq) && rt_rq->rt_nr_running == 0) {
+		struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+		dl_server_start(dl_se);
+	}
+
 	enqueue_rt_entity(rt_se, flags);
 
 	if (task_is_blocked(p))
@@ -731,6 +781,14 @@ static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
 	dequeue_pushable_task(rt_rq, p);
 
+	/* Last task of the task group. */
+	if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) &&
+	    is_dl_group(rt_rq) && rt_rq->rt_nr_running == 0) {
+		struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+		dl_server_stop(dl_se);
+	}
+
 	return true;
 }
 
@@ -953,9 +1011,14 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
 	return next;
 }
 
-static struct task_struct *_pick_next_task_rt(struct rq *rq)
+static struct task_struct *_pick_next_task_rt(struct rt_rq *rt_rq)
 {
-	return NULL;
+	struct sched_rt_entity *rt_se;
+
+	rt_se = pick_next_rt_entity(rt_rq);
+	BUG_ON(!rt_se);
+
+	return rt_task_of(rt_se);
 }
 
 static struct task_struct *pick_task_rt(struct rq *rq)
@@ -965,7 +1028,7 @@ static struct task_struct *pick_task_rt(struct rq *rq)
 	if (!sched_rt_runnable(rq))
 		return NULL;
 
-	p = _pick_next_task_rt(rq);
+	p = _pick_next_task_rt(&rq->rt);
 
 	return p;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9853f321363..b2c87541257 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2170,7 +2170,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 	if (!rt_group_sched_enabled())
 		tg = &root_task_group;
 	p->rt.rt_rq  = tg->rt_rq[cpu];
-	p->rt.parent = tg->rt_se[cpu];
+	p->dl.dl_rq  = &cpu_rq(cpu)->dl;
 #endif /* CONFIG_RT_GROUP_SCHED */
 }
 
@@ -2726,6 +2726,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 
 static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
+	BUG_ON(rq->nr_running < count);
 	rq->nr_running -= count;
 	if (trace_sched_update_nr_running_tp_enabled()) {
 		call_trace_sched_update_nr_running(rq, -count);
-- 
2.51.0

Re: [RFC PATCH v3 11/24] sched/rt: Add rt-cgroups' dl-servers operations.

Posted by Juri Lelli 4 months ago

Hello,

On 29/09/25 11:22, Yuri Andriaccio wrote:
> Implement the servers' functions that pick the next eligible task to run.
> Enable/Disable dl-servers on task enqueue/dequeue when necessary.
> Update dl-servers on task update.
> Account the number of active rt-tasks in the cgroups' specific runqueue.
> Account the number of active rt-tasks on the global counter of active tasks when
> a cgroup is enqueued/dequeued (dl-server started/stopped).
> Update rq's cpuprio only if the cgroup's is root control group.
> Record which dl_server is managing a task when it changes runqueue.

Changelog looks a little dry. Claude suggests a small rewrite like:

---
sched/rt: Implement dl-server operations for rt-cgroups

Implement the dl-server backend that enables rt-cgroups to run as
deadline servers. This allows RT tasks within a cgroup to be scheduled
according to the cgroup's allocated bandwidth using deadline scheduling.

The implementation consists of three main parts:

1) Server task selection callbacks:
   - rt_server_has_tasks(): Check if the rt_rq has runnable tasks
   - rt_server_pick(): Pick and set the next RT task from the cgroup's
     rt_rq when the server gets CPU time

2) Server lifecycle management:
   - Start the dl-server when the first RT task enqueues to an idle
     rt-cgroup
   - Stop the dl-server when the last RT task dequeues from an rt-cgroup
   - Update the server's consumed runtime in update_curr_rt() via
     dl_server_update()

3) Per-cpu priority and nr_running accounting:
   - Only update rq->cpupri for the root rt_rq (not for cgroup rt_rqs)
     since cgroups are scheduled via their dl-server priority
   - For cgroup rt_rqs, update global nr_running only when the dl-server
     is active (not throttled), as the server acts as the runnable entity
   - Bulk update nr_running when the server starts/stops based on the
     rt_rq's current rt_nr_running count

The rt.parent field is removed as the new implementation doesn't use
hierarchical RT scheduling entities. Instead, tasks record their dl_rq
to track which dl-server manages them.
---

Which seems to correspond to what this patch does. If that's the case,
however, I wonder if we are maybe doing too many things at once?

> Co-developed-by: Alessio Balsini <a.balsini@sssup.it>
> Signed-off-by: Alessio Balsini <a.balsini@sssup.it>
> Co-developed-by: Andrea Parri <parri.andrea@gmail.com>
> Signed-off-by: Andrea Parri <parri.andrea@gmail.com>
> Co-developed-by: luca abeni <luca.abeni@santannapisa.it>
> Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
> Signed-off-by: Yuri Andriaccio <yurand2000@gmail.com>
> ---
>  kernel/sched/deadline.c | 16 ++++++---
>  kernel/sched/rt.c       | 79 ++++++++++++++++++++++++++++++++++++-----
>  kernel/sched/sched.h    |  3 +-
>  3 files changed, 85 insertions(+), 13 deletions(-)
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 754bfe231b4..1293b9a252b 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1869,9 +1869,13 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>  	u64 deadline = dl_se->deadline;
>  
>  	dl_rq->dl_nr_running++;
> -
> -	if (!dl_server(dl_se))
> +	if (!dl_server(dl_se)) {
>  		add_nr_running(rq_of_dl_rq(dl_rq), 1);
> +	} else if (dl_se != &rq_of_dl_rq(dl_rq)->fair_server) {

I fear this condition might get unwieldy with the addition of new
servers (e.g. sched_scx).

> +		struct rt_rq *rt_rq = &dl_se->my_q->rt;
> +
> +		add_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
> +	}
>  
>  	inc_dl_deadline(dl_rq, deadline);
>  }
> @@ -1881,9 +1885,13 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>  {
>  	WARN_ON(!dl_rq->dl_nr_running);
>  	dl_rq->dl_nr_running--;
> -
> -	if (!dl_server(dl_se))
> +	if (!dl_server(dl_se)) {
>  		sub_nr_running(rq_of_dl_rq(dl_rq), 1);
> +	} else if (dl_se != &rq_of_dl_rq(dl_rq)->fair_server) {

Ditto.

> +		struct rt_rq *rt_rq = &dl_se->my_q->rt;
> +
> +		sub_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
> +	}
>  
>  	dec_dl_deadline(dl_rq, dl_se->deadline);
>  }
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 3094f59d0c8..d9442f64c6b 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -144,14 +144,27 @@ void init_tg_rt_entry(struct task_group *tg, struct rq *served_rq,
>  	tg->dl_se[cpu] = dl_se;
>  }
>  
> +static struct task_struct *_pick_next_task_rt(struct rt_rq *rt_rq);
> +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first);
> +
>  static bool rt_server_has_tasks(struct sched_dl_entity *dl_se)
>  {
> -	return false;
> +	return !!dl_se->my_q->rt.rt_nr_running;
>  }
>  
>  static struct task_struct *rt_server_pick(struct sched_dl_entity *dl_se)
>  {
> -	return NULL;
> +	struct rt_rq *rt_rq = &dl_se->my_q->rt;
> +	struct rq *rq = rq_of_rt_rq(rt_rq);
> +	struct task_struct *p;
> +
> +	if (dl_se->my_q->rt.rt_nr_running == 0)

Can't we use rt_server_has_tasks()?

> +		return NULL;
> +
> +	p = _pick_next_task_rt(rt_rq);
> +	set_next_task_rt(rq, p, true);
> +
> +	return p;

...

> @@ -953,9 +1011,14 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
>  	return next;
>  }
>  
> -static struct task_struct *_pick_next_task_rt(struct rq *rq)
> +static struct task_struct *_pick_next_task_rt(struct rt_rq *rt_rq)
>  {
> -	return NULL;
> +	struct sched_rt_entity *rt_se;
> +
> +	rt_se = pick_next_rt_entity(rt_rq);
> +	BUG_ON(!rt_se);

Can we WARN and recover somehow?

> +
> +	return rt_task_of(rt_se);
>  }
>  
>  static struct task_struct *pick_task_rt(struct rq *rq)
> @@ -965,7 +1028,7 @@ static struct task_struct *pick_task_rt(struct rq *rq)
>  	if (!sched_rt_runnable(rq))
>  		return NULL;
>  
> -	p = _pick_next_task_rt(rq);
> +	p = _pick_next_task_rt(&rq->rt);
>  
>  	return p;
>  }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 9853f321363..b2c87541257 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2170,7 +2170,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
>  	if (!rt_group_sched_enabled())
>  		tg = &root_task_group;
>  	p->rt.rt_rq  = tg->rt_rq[cpu];
> -	p->rt.parent = tg->rt_se[cpu];
> +	p->dl.dl_rq  = &cpu_rq(cpu)->dl;

Guess rt.parent is then removed in a subsequent patch? Do we want to
consolidate the cleanup?

>  #endif /* CONFIG_RT_GROUP_SCHED */
>  }
>  
> @@ -2726,6 +2726,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
>  
>  static inline void sub_nr_running(struct rq *rq, unsigned count)
>  {
> +	BUG_ON(rq->nr_running < count);

Can we WARN and recover somehow?

>  	rq->nr_running -= count;
>  	if (trace_sched_update_nr_running_tp_enabled()) {
>  		call_trace_sched_update_nr_running(rq, -count);

Thanks,
Juri