From: Peter Zijlstra <peterz@infradead.org>
Now that fair_server exists, we no longer need RT bandwidth control
unless RT_GROUP_SCHED is enabled.
Enable fair_server with parameters equivalent to RT throttling.
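In numbers, assuming the stock defaults (sched_rt_runtime_us=950000,
sched_rt_period_us=1000000), the equivalence is plain arithmetic:

  RT throttling left for fair tasks:  1 - 950000us/1000000us = 5%
  fair_server per CPU:                50ms / 1000ms          = 5%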
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
---
kernel/sched/core.c | 9 +-
kernel/sched/deadline.c | 5 +-
kernel/sched/debug.c | 3 +
kernel/sched/rt.c | 242 ++++++++++++++++++----------------------
kernel/sched/sched.h | 3 +-
5 files changed, 120 insertions(+), 142 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 01336277eac9..8439c2f992db 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9987,8 +9987,6 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
- init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
-
#ifdef CONFIG_SMP
init_defrootdomain();
#endif
@@ -10043,8 +10041,13 @@ void __init sched_init(void)
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * This is required for init cpu because rt.c:__enable_runtime()
+ * starts working after scheduler_running, which is not the case
+ * yet.
+ */
+ rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
#ifdef CONFIG_SMP
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0dbb42cf7fe6..7df8179bfa08 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1554,6 +1554,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
if (dl_se == &rq->fair_server)
return;
+#ifdef CONFIG_RT_GROUP_SCHED
/*
* Because -- for now -- we share the rt bandwidth, we need to
* account our runtime there too, otherwise actual rt tasks
@@ -1578,6 +1579,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
+#endif
}
/*
@@ -1632,8 +1634,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
* this before getting generic.
*/
if (!dl_server(dl_se)) {
- /* Disabled */
- u64 runtime = 0;
+ u64 runtime = 50 * NSEC_PER_MSEC;
u64 period = 1000 * NSEC_PER_MSEC;
dl_server_apply_params(dl_se, runtime, period, 1);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index b14ffb100867..2d5851d65c67 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -889,9 +889,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
PU(rt_nr_running);
+
+#ifdef CONFIG_RT_GROUP_SCHED
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
+#endif
#undef PN
#undef PU
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aa4c1c874fa4..fb591b98d71f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,10 +8,6 @@ int sched_rr_timeslice = RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
-
-struct rt_bandwidth def_rt_bandwidth;
-
/*
* period over which we measure -rt task CPU usage in us.
* default: 1s
@@ -66,6 +62,40 @@ static int __init sched_rt_sysctl_init(void)
late_initcall(sched_rt_sysctl_init);
#endif
+void init_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array;
+ int i;
+
+ array = &rt_rq->active;
+ for (i = 0; i < MAX_RT_PRIO; i++) {
+ INIT_LIST_HEAD(array->queue + i);
+ __clear_bit(i, array->bitmap);
+ }
+ /* delimiter for bitsearch: */
+ __set_bit(MAX_RT_PRIO, array->bitmap);
+
+#if defined CONFIG_SMP
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
+ rt_rq->highest_prio.next = MAX_RT_PRIO-1;
+ rt_rq->overloaded = 0;
+ plist_head_init(&rt_rq->pushable_tasks);
+#endif /* CONFIG_SMP */
+ /* We start in dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
+ rt_rq->rt_runtime = 0;
+ raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+#endif
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
+
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
struct rt_bandwidth *rt_b =
@@ -130,35 +160,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
do_start_rt_bandwidth(rt_b);
}
-void init_rt_rq(struct rt_rq *rt_rq)
-{
- struct rt_prio_array *array;
- int i;
-
- array = &rt_rq->active;
- for (i = 0; i < MAX_RT_PRIO; i++) {
- INIT_LIST_HEAD(array->queue + i);
- __clear_bit(i, array->bitmap);
- }
- /* delimiter for bitsearch: */
- __set_bit(MAX_RT_PRIO, array->bitmap);
-
-#if defined CONFIG_SMP
- rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
- rt_rq->highest_prio.next = MAX_RT_PRIO-1;
- rt_rq->overloaded = 0;
- plist_head_init(&rt_rq->pushable_tasks);
-#endif /* CONFIG_SMP */
- /* We start is dequeued state, because no RT tasks are queued */
- rt_rq->rt_queued = 0;
-
- rt_rq->rt_time = 0;
- rt_rq->rt_throttled = 0;
- rt_rq->rt_runtime = 0;
- raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-}
-
-#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
hrtimer_cancel(&rt_b->rt_period_timer);
@@ -195,7 +196,6 @@ void unregister_rt_sched_group(struct task_group *tg)
{
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
-
}
void free_rt_sched_group(struct task_group *tg)
@@ -253,8 +253,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!tg->rt_se)
goto err;
- init_rt_bandwidth(&tg->rt_bandwidth,
- ktime_to_ns(def_rt_bandwidth.rt_period), 0);
+ init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0);
for_each_possible_cpu(i) {
rt_rq = kzalloc_node(sizeof(struct rt_rq),
@@ -604,70 +603,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
return &rt_rq->tg->rt_bandwidth;
}
-#else /* !CONFIG_RT_GROUP_SCHED */
-
-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_runtime;
-}
-
-static inline u64 sched_rt_period(struct rt_rq *rt_rq)
-{
- return ktime_to_ns(def_rt_bandwidth.rt_period);
-}
-
-typedef struct rt_rq *rt_rq_iter_t;
-
-#define for_each_rt_rq(rt_rq, iter, rq) \
- for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
-#define for_each_sched_rt_entity(rt_se) \
- for (; rt_se; rt_se = NULL)
-
-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
-{
- return NULL;
-}
-
-static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
-{
- struct rq *rq = rq_of_rt_rq(rt_rq);
-
- if (!rt_rq->rt_nr_running)
- return;
-
- enqueue_top_rt_rq(rt_rq);
- resched_curr(rq);
-}
-
-static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
-{
- dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
-}
-
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled;
-}
-
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-
-static inline
-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
-{
- return &cpu_rq(cpu)->rt;
-}
-
-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
-{
- return &def_rt_bandwidth;
-}
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-
bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -859,7 +794,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
const struct cpumask *span;
span = sched_rt_period_mask();
-#ifdef CONFIG_RT_GROUP_SCHED
+
/*
* FIXME: isolated CPUs should really leave the root task group,
* whether they are isolcpus or were isolated via cpusets, lest
@@ -871,7 +806,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
*/
if (rt_b == &root_task_group.rt_bandwidth)
span = cpu_online_mask;
-#endif
+
for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
@@ -938,18 +873,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
return idle;
}
-static inline int rt_se_prio(struct sched_rt_entity *rt_se)
-{
-#ifdef CONFIG_RT_GROUP_SCHED
- struct rt_rq *rt_rq = group_rt_rq(rt_se);
-
- if (rt_rq)
- return rt_rq->highest_prio.curr;
-#endif
-
- return rt_task_of(rt_se)->prio;
-}
-
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
@@ -993,6 +916,72 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
}
+#else /* !CONFIG_RT_GROUP_SCHED */
+
+typedef struct rt_rq *rt_rq_iter_t;
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
+#define for_each_sched_rt_entity(rt_se) \
+ for (; rt_se; rt_se = NULL)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return NULL;
+}
+
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ enqueue_top_rt_rq(rt_rq);
+ resched_curr(rq);
+}
+
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return false;
+}
+
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+ return cpu_online_mask;
+}
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+ return &cpu_rq(cpu)->rt;
+}
+
+#ifdef CONFIG_SMP
+static void __enable_runtime(struct rq *rq) { }
+static void __disable_runtime(struct rq *rq) { }
+#endif
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static inline int rt_se_prio(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq)
+ return rt_rq->highest_prio.curr;
+#endif
+
+ return rt_task_of(rt_se)->prio;
+}
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -1000,7 +989,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
- struct sched_rt_entity *rt_se = &curr->rt;
s64 delta_exec;
if (curr->sched_class != &rt_sched_class)
@@ -1010,6 +998,9 @@ static void update_curr_rt(struct rq *rq)
if (unlikely(delta_exec <= 0))
return;
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct sched_rt_entity *rt_se = &curr->rt;
+
if (!rt_bandwidth_enabled())
return;
@@ -1028,6 +1019,7 @@ static void update_curr_rt(struct rq *rq)
do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
+#endif
}
static void
@@ -1184,7 +1176,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
- start_rt_bandwidth(&def_rt_bandwidth);
}
static inline
@@ -2912,19 +2903,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
- unsigned long flags;
- int i;
-
- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = global_rt_runtime();
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
-
return 0;
}
#endif /* CONFIG_SYSCTL */
@@ -2944,12 +2922,6 @@ static int sched_rt_global_validate(void)
static void sched_rt_do_global(void)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
- def_rt_bandwidth.rt_runtime = global_rt_runtime();
- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}
static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3b8684b5ec8e..c93de331171b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -729,13 +729,13 @@ struct rt_rq {
#endif /* CONFIG_SMP */
int rt_queued;
+#ifdef CONFIG_RT_GROUP_SCHED
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
-#ifdef CONFIG_RT_GROUP_SCHED
unsigned int rt_nr_boosted;
struct rq *rq;
@@ -2478,7 +2478,6 @@ extern void reweight_task(struct task_struct *p, int prio);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
-extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
--
2.45.1
Hello.
(I'm replying now as I installed v6.12 and this message has the
context.)
On Mon, May 27, 2024 at 02:06:55PM GMT, Daniel Bristot de Oliveira <bristot@kernel.org> wrote:
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 0dbb42cf7fe6..7df8179bfa08 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1554,6 +1554,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
> if (dl_se == &rq->fair_server)
> return;
>
> +#ifdef CONFIG_RT_GROUP_SCHED
> /*
> * Because -- for now -- we share the rt bandwidth, we need to
> * account our runtime there too, otherwise actual rt tasks
> @@ -1578,6 +1579,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
> rt_rq->rt_time += delta_exec;
> raw_spin_unlock(&rt_rq->rt_runtime_lock);
> }
> +#endif
> }
>
> /*
> @@ -1632,8 +1634,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
> * this before getting generic.
> */
> if (!dl_server(dl_se)) {
> - /* Disabled */
> - u64 runtime = 0;
> + u64 runtime = 50 * NSEC_PER_MSEC;
> u64 period = 1000 * NSEC_PER_MSEC;
>
> dl_server_apply_params(dl_se, runtime, period, 1);
The global_rt_runtime() also applies to deadline class when CPU's DL
bandwidth is init'd in init_dl_rq_bw_ratio().
The default DL bandwidth is thus 95%. The fair server is given 5%.
Is that 5% of those 95%?
Or is it meant to be complementary? (Perhaps not as I can configure
rt_runtime_us/rt_period_us > 95% without an error. But then I don't
understand what the global rt_runtime_us (w/out CONFIG_RT_GROUP_SCHED)
configures.)
Thanks for some hints,
Michal
Hi Michal,
On 27/11/24 11:55, Michal Koutný wrote:
> Hello.
>
> (I'm replying now as I installed v6.12 and this message has the
> context.)
>
> On Mon, May 27, 2024 at 02:06:55PM GMT, Daniel Bristot de Oliveira <bristot@kernel.org> wrote:
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index 0dbb42cf7fe6..7df8179bfa08 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -1554,6 +1554,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
> > if (dl_se == &rq->fair_server)
> > return;
> >
> > +#ifdef CONFIG_RT_GROUP_SCHED
> > /*
> > * Because -- for now -- we share the rt bandwidth, we need to
> > * account our runtime there too, otherwise actual rt tasks
> > @@ -1578,6 +1579,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
> > rt_rq->rt_time += delta_exec;
> > raw_spin_unlock(&rt_rq->rt_runtime_lock);
> > }
> > +#endif
> > }
> >
> > /*
> > @@ -1632,8 +1634,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
> > * this before getting generic.
> > */
> > if (!dl_server(dl_se)) {
> > - /* Disabled */
> > - u64 runtime = 0;
> > + u64 runtime = 50 * NSEC_PER_MSEC;
> > u64 period = 1000 * NSEC_PER_MSEC;
> >
> > dl_server_apply_params(dl_se, runtime, period, 1);
>
> The global_rt_runtime() also applies to deadline class when CPU's DL
> bandwidth is init'd in init_dl_rq_bw_ratio().
>
> The default DL bandwidth is thus 95%. The fair server is given 5%.
> Is that 5% of those 95%?
Yes, it is indeed.
Best,
Juri
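In numbers, with the default 950000/1000000 us sysctls and the 50ms/1s
fair server above, the rough per-CPU picture is as follows (admission
control actually sums bandwidth over the whole root domain, so take
this only as intuition):

  DL admission cap:                950000 / 1000000 = 95%
  fair server (per CPU):           50ms / 1000ms    =  5%  (charged against the 95%)
  left for user -deadline tasks:   95% - 5%         = 90%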
On Wed, Nov 27, 2024 at 04:35:39PM GMT, Juri Lelli <juri.lelli@redhat.com> wrote:
> > The default DL bandwidth is thus 95%. The fair server is given 5%.
> > Is that 5% of those 95%?
>
> Yes, it is indeed.
Thanks for navigating me. I have followup questions about
/proc/sys/kernel/sched_rt_runtime_us / /proc/sys/kernel/sched_rt_period_us
(a ratio, without CONFIG_RT_GROUP_SCHED)
- 0
  - disables DL, (not RT, so they can monopolize a CPU)
- 1
  - DL tasks can monopolize CPU, SCHED_NORMAL have 5% thanks to
    fair_server
- 1-Δ
  - SCHED_NORMAL tasks have
    - (1-Δ)*5% on behalf of DL (above RT)
    - Δ regularly (below RT)
Is this breakdown correct?
I'm wondering if different values of Δ mean anything or how they can be
used.
Regards,
Michal
On 29/11/24 11:02, Michal Koutný wrote:
> On Wed, Nov 27, 2024 at 04:35:39PM GMT, Juri Lelli <juri.lelli@redhat.com> wrote:
> > > The default DL bandwidth is thus 95%. The fair server is given 5%.
> > > Is that 5% of those 95%?
> >
> > Yes, it is indeed.
>
> Thanks for navigating me. I have followup questions about
> /proc/sys/kernel/sched_rt_runtime_us / /proc/sys/kernel/sched_rt_period_us
> (a ratio, without CONFIG_RT_GROUP_SCHED)
>
> - 0
>   - disables DL, (not RT, so they can monopolize a CPU)
> - 1
>   - DL tasks can monopolize CPU, SCHED_NORMAL have 5% thanks to
>     fair_server
> - 1-Δ
>   - SCHED_NORMAL tasks have
>     - (1-Δ)*5% on behalf of DL (above RT)
>     - Δ regularly (below RT)
>
> Is this breakdown correct?

So, sched_rt_runtime_us/sched_rt_period_us only applies to admission
control (for DEADLINE tasks, including dl_servers). The actual
parameters controlling the dl_server for SCHED_NORMAL are under
sched/debug as per-cpu values, e.g.:

# grep . /sys/kernel/debug/sched/fair_server/cpu*/*
/sys/kernel/debug/sched/fair_server/cpu0/period:1000000000
/sys/kernel/debug/sched/fair_server/cpu0/runtime:50000000
/sys/kernel/debug/sched/fair_server/cpu1/period:1000000000
/sys/kernel/debug/sched/fair_server/cpu1/runtime:50000000
/sys/kernel/debug/sched/fair_server/cpu2/period:1000000000
/sys/kernel/debug/sched/fair_server/cpu2/runtime:50000000
/sys/kernel/debug/sched/fair_server/cpu3/period:1000000000
/sys/kernel/debug/sched/fair_server/cpu3/runtime:50000000
/sys/kernel/debug/sched/fair_server/cpu4/period:1000000000
/sys/kernel/debug/sched/fair_server/cpu4/runtime:50000000
/sys/kernel/debug/sched/fair_server/cpu5/period:1000000000
/sys/kernel/debug/sched/fair_server/cpu5/runtime:50000000
/sys/kernel/debug/sched/fair_server/cpu6/period:1000000000
/sys/kernel/debug/sched/fair_server/cpu6/runtime:50000000
/sys/kernel/debug/sched/fair_server/cpu7/period:1000000000
/sys/kernel/debug/sched/fair_server/cpu7/runtime:50000000

You can disable admission control by echoing -1 in sched_rt_runtime_us,
but still have the dl_server working for SCHED_NORMAL tasks. By
disabling admission control SCHED_DEADLINE can indeed monopolize CPU
(over subscription).

Best,
Juri
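As a minimal sketch of using the files above (values are in ns; this
assumes the debugfs layout from the grep output and that a runtime
larger than the period is rejected):

# bump CPU0's fair server from the default 50ms/1s to 100ms/1s
echo 1000000000 > /sys/kernel/debug/sched/fair_server/cpu0/period
echo 100000000 > /sys/kernel/debug/sched/fair_server/cpu0/runtime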
On Fri, Nov 29, 2024 at 03:44:53PM GMT, Juri Lelli <juri.lelli@redhat.com> wrote:
> You can disable admission control by echoing -1 in sched_rt_runtime_us,
> but still have the dl_server working for SCHED_NORMAL tasks. By
> disabling admission control SCHED_DEADLINE can indeed monopolize CPU
> (over subscription).
| I'm wondering if different values of Δ mean anything or how they can
| be used.
Aha, the knob therefore remains relevant for the DL admission control.
So if I put it together, I will write it down like this:
-- 8< --
Subject: [PATCH] sched/RT: Update paragraphs about RT bandwidth control
This has slightly changed with the introduction of fair_server.
Update the most relevant parts.
Link: https://lore.kernel.org/r/Z0c8S8i3qt7SEU14@jlelli-thinkpadt14gen4.remote.csb/
Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
Documentation/scheduler/sched-deadline.rst | 13 +++++++------
Documentation/scheduler/sched-rt-group.rst | 8 ++++----
2 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst
index 22838ed8e13aa..a727827b8dd52 100644
--- a/Documentation/scheduler/sched-deadline.rst
+++ b/Documentation/scheduler/sched-deadline.rst
@@ -591,12 +591,13 @@ Deadline Task Scheduling
The system wide settings are configured under the /proc virtual file system.
- For now the -rt knobs are used for -deadline admission control and the
- -deadline runtime is accounted against the -rt runtime. We realize that this
- isn't entirely desirable; however, it is better to have a small interface for
- now, and be able to change it easily later. The ideal situation (see 5.) is to
- run -rt tasks from a -deadline server; in which case the -rt bandwidth is a
- direct subset of dl_bw.
+ For now the -rt knobs are used for -deadline admission control and with
+ CONFIG_RT_GROUP_SCHED the -deadline runtime is accounted against the (root)
+ -rt runtime. With !CONFIG_RT_GROUP_SCHED the knob only serves for the -dl
+ admission control. We realize that this isn't entirely desirable; however, it
+ is better to have a small interface for now, and be able to change it easily
+ later. The ideal situation (see 5.) is to run -rt tasks from a -deadline
+ server; in which case the -rt bandwidth is a direct subset of dl_bw.
This means that, for a root_domain comprising M CPUs, -deadline tasks
can be created while the sum of their bandwidths stays below:
diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst
index d685609ed3d7a..80b05a3009ea2 100644
--- a/Documentation/scheduler/sched-rt-group.rst
+++ b/Documentation/scheduler/sched-rt-group.rst
@@ -92,10 +92,10 @@ The system wide settings are configured under the /proc virtual file system:
/proc/sys/kernel/sched_rt_runtime_us:
A global limit on how much time real-time scheduling may use. This is always
less or equal to the period_us, as it denotes the time allocated from the
- period_us for the real-time tasks. Even without CONFIG_RT_GROUP_SCHED enabled,
- this will limit time reserved to real-time processes. With
- CONFIG_RT_GROUP_SCHED=y it signifies the total bandwidth available to all
- real-time groups.
+ period_us for the real-time tasks. Without CONFIG_RT_GROUP_SCHED enabled,
+ this only serves for admission control of deadline tasks. With
+ CONFIG_RT_GROUP_SCHED=y it also signifies the total bandwidth available to
+ all real-time groups.
* Time is specified in us because the interface is s32. This gives an
operating range from 1us to about 35 minutes.
--
2.47.0
On 29/11/24 21:21, Michal Koutný wrote:
> On Fri, Nov 29, 2024 at 03:44:53PM GMT, Juri Lelli <juri.lelli@redhat.com> wrote:
> > You can disable admission control by echoing -1 in sched_rt_runtime_us,
> > but still have the dl_server working for SCHED_NORMAL tasks. By
> > disabling admission control SCHED_DEADLINE can indeed monopolize CPU
> > (over subscription).
>
> | I'm wondering if different values of Δ mean anything or how they can
> | be used.
>
> Aha, the knob therefore remains relevant for the DL admission control.
>
> So if I put it together, I will write it down like this:

Ah, yes sounds good to me, thanks for writing it down properly! :)

If you send it out as a separate patch, please feel free to add

Acked-by: Juri Lelli <juri.lelli@redhat.com>

Best,
Juri