From: Peter Zijlstra <peterz@infradead.org>
All classes use sched_entity::exec_start to track runtime and have
copies of the exact same code around to compute runtime.
Collapse all that.
Cc: Joel Fernandes <joelaf@google.com>
Cc: Qais Yousef <qyousef@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Zimuzo Ezeozue <zezeozue@google.com>
Cc: Youssef Esmat <youssefesmat@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: kernel-team@android.com
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[fix conflicts, fold in update_current_exec_runtime]
Signed-off-by: Connor O'Brien <connoro@google.com>
[jstultz: rebased, resolving minor conflicts]
Signed-off-by: John Stultz <jstultz@google.com>
---
NOTE: This patch is a general cleanup and, if no one objects,
could be merged at this point. If needed, I'll resend separately
if it isn't picked up on its own.
---
include/linux/sched.h | 2 +-
kernel/sched/deadline.c | 13 +++-------
kernel/sched/fair.c | 56 ++++++++++++++++++++++++++++++----------
kernel/sched/rt.c | 13 +++-------
kernel/sched/sched.h | 12 ++-------
kernel/sched/stop_task.c | 13 +---------
6 files changed, 52 insertions(+), 57 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..4f5b0710c0f1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -520,7 +520,7 @@ struct sched_statistics {
u64 block_max;
s64 sum_block_runtime;
- u64 exec_max;
+ s64 exec_max;
u64 slice_max;
u64 nr_migrations_cold;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 58b542bf2893..9522e6607754 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1299,9 +1299,8 @@ static void update_curr_dl(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_dl_entity *dl_se = &curr->dl;
- u64 delta_exec, scaled_delta_exec;
+ s64 delta_exec, scaled_delta_exec;
int cpu = cpu_of(rq);
- u64 now;
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
@@ -1314,21 +1313,15 @@ static void update_curr_dl(struct rq *rq)
* natural solution, but the full ramifications of this
* approach need further study.
*/
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0)) {
+ delta_exec = update_curr_common(rq);
+ if (unlikely(delta_exec <= 0)) {
if (unlikely(dl_se->dl_yielded))
goto throttle;
return;
}
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
trace_sched_stat_runtime(curr, delta_exec, 0);
- update_current_exec_runtime(curr, now, delta_exec);
-
if (dl_entity_is_special(dl_se))
return;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df348aa55d3c..c919633acd3d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1144,23 +1144,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */
-/*
- * Update the current task's runtime statistics.
- */
-static void update_curr(struct cfs_rq *cfs_rq)
+static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
{
- struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_clock_task(rq_of(cfs_rq));
- u64 delta_exec;
-
- if (unlikely(!curr))
- return;
+ u64 now = rq_clock_task(rq);
+ s64 delta_exec;
delta_exec = now - curr->exec_start;
- if (unlikely((s64)delta_exec <= 0))
- return;
+ if (unlikely(delta_exec <= 0))
+ return delta_exec;
curr->exec_start = now;
+ curr->sum_exec_runtime += delta_exec;
if (schedstat_enabled()) {
struct sched_statistics *stats;
@@ -1170,9 +1164,43 @@ static void update_curr(struct cfs_rq *cfs_rq)
max(delta_exec, stats->exec_max));
}
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq->exec_clock, delta_exec);
+ return delta_exec;
+}
+
+/*
+ * Used by other classes to account runtime.
+ */
+s64 update_curr_common(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ s64 delta_exec;
+ delta_exec = update_curr_se(rq, &curr->se);
+ if (unlikely(delta_exec <= 0))
+ return delta_exec;
+
+ account_group_exec_runtime(curr, delta_exec);
+ cgroup_account_cputime(curr, delta_exec);
+
+ return delta_exec;
+}
+
+/*
+ * Update the current task's runtime statistics.
+ */
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 delta_exec;
+
+ if (unlikely(!curr))
+ return;
+
+ delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+ if (unlikely(delta_exec <= 0))
+ return;
+
+ schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0597ba0f85ff..327ae4148aec 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1046,24 +1046,17 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
- u64 delta_exec;
- u64 now;
+ s64 delta_exec;
if (curr->sched_class != &rt_sched_class)
return;
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ delta_exec = update_curr_common(rq);
+ if (unlikely(delta_exec < 0))
return;
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
trace_sched_stat_runtime(curr, delta_exec, 0);
- update_current_exec_runtime(curr, now, delta_exec);
-
if (!rt_bandwidth_enabled())
return;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04846272409c..1def5b7fa1df 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2228,6 +2228,8 @@ struct affinity_context {
unsigned int flags;
};
+extern s64 update_curr_common(struct rq *rq);
+
struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
@@ -3280,16 +3282,6 @@ extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
-static inline void update_current_exec_runtime(struct task_struct *curr,
- u64 now, u64 delta_exec)
-{
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = now;
- cgroup_account_cputime(curr, delta_exec);
-}
-
#ifdef CONFIG_SCHED_MM_CID
#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 85590599b4d6..7595494ceb6d 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *curr = rq->curr;
- u64 now, delta_exec;
-
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
-
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
- update_current_exec_runtime(curr, now, delta_exec);
+ update_curr_common(rq);
}
/*
--
2.42.0.869.gea05f2083d-goog
On 11/06/23 19:34, John Stultz wrote:
> From: Peter Zijlstra <peterz@infradead.org>
>
> All classes use sched_entity::exec_start to track runtime and have
> copies of the exact same code around to compute runtime.
>
> Collapse all that.
>
> Cc: Joel Fernandes <joelaf@google.com>
> Cc: Qais Yousef <qyousef@google.com>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Juri Lelli <juri.lelli@redhat.com>
> Cc: Vincent Guittot <vincent.guittot@linaro.org>
> Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
> Cc: Valentin Schneider <vschneid@redhat.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Ben Segall <bsegall@google.com>
> Cc: Zimuzo Ezeozue <zezeozue@google.com>
> Cc: Youssef Esmat <youssefesmat@google.com>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Waiman Long <longman@redhat.com>
> Cc: Boqun Feng <boqun.feng@gmail.com>
> Cc: "Paul E . McKenney" <paulmck@kernel.org>
> Cc: kernel-team@android.com
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> [fix conflicts, fold in update_current_exec_runtime]
> Signed-off-by: Connor O'Brien <connoro@google.com>
> [jstultz: rebased, resolving minor conflicts]
> Signed-off-by: John Stultz <jstultz@google.com>
> ---
> NOTE: This patch is a general cleanup and, if no one objects,
> could be merged at this point. If needed, I'll resend separately
> if it isn't picked up on its own.
Looks like this actually got merged into tip via the deadline server work :-)
Though not sure if I caught a bug here
> diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
> index 85590599b4d6..7595494ceb6d 100644
> --- a/kernel/sched/stop_task.c
> +++ b/kernel/sched/stop_task.c
> @@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq)
>
> static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
> {
> - struct task_struct *curr = rq->curr;
> - u64 now, delta_exec;
> -
> - now = rq_clock_task(rq);
> - delta_exec = now - curr->se.exec_start;
> - if (unlikely((s64)delta_exec < 0))
> - delta_exec = 0;
If negative, instead of returning for the stopper task, we set delta_exec to 0
> -
> - schedstat_set(curr->stats.exec_max,
> - max(curr->stats.exec_max, delta_exec));
> -
> - update_current_exec_runtime(curr, now, delta_exec);
And carry on to do time accounting
> + update_curr_common(rq);
But the new function will return early without doing accounting. Wouldn't this
re-introduce 8f6189684eb4 ("sched: Fix migration thread runtime bogosity")?
> }
Cheers
--
Qais Yousef
On Sun, Dec 17, 2023 at 8:19 AM Qais Yousef <qyousef@layalina.io> wrote:
> On 11/06/23 19:34, John Stultz wrote:
> > From: Peter Zijlstra <peterz@infradead.org>
> >
> > All classes use sched_entity::exec_start to track runtime and have
> > copies of the exact same code around to compute runtime.
> >
> > Collapse all that.
> >
...
> Looks like this actually got merged into tip via the deadline server work :-)
Oh! That's great to see! The patch has been floating around for a while.
> Though not sure if I caught a bug here
>
> > diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
> > index 85590599b4d6..7595494ceb6d 100644
> > --- a/kernel/sched/stop_task.c
> > +++ b/kernel/sched/stop_task.c
> > @@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq)
> >
> > static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
> > {
> > - struct task_struct *curr = rq->curr;
> > - u64 now, delta_exec;
> > -
> > - now = rq_clock_task(rq);
> > - delta_exec = now - curr->se.exec_start;
> > - if (unlikely((s64)delta_exec < 0))
> > - delta_exec = 0;
>
> If negative, instead of returning for the stopper task, we set delta_exec to 0
>
> > -
> > - schedstat_set(curr->stats.exec_max,
> > - max(curr->stats.exec_max, delta_exec));
> > -
> > - update_current_exec_runtime(curr, now, delta_exec);
>
> And carry on to do time accounting
>
> > + update_curr_common(rq);
>
> But the new function will return early without doing accounting. Wouldn't this
> re-introduce 8f6189684eb4 ("sched: Fix migration thread runtime bogosity")?
Hrm. So first, good eye for catching this!
Looking through the code, much of the accounting logic we end up
skipping doesn't have much effect when delta_exec = 0, so it seems
mostly harmless to return early without the accounting.
Though, there is one side-effect that does get skipped, which is that the
removed update_current_exec_runtime() unconditionally sets:
curr->se.exec_start = now;
Which basically resets the accounting window.
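To make that concrete, here is the old stop-class path with the removed
update_current_exec_runtime() folded in, reconstructed from the hunks above
purely as a sketch for discussion:

static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
	struct task_struct *curr = rq->curr;
	u64 now, delta_exec;

	now = rq_clock_task(rq);
	delta_exec = now - curr->se.exec_start;
	if (unlikely((s64)delta_exec < 0))
		delta_exec = 0;				/* clamp, but keep going */

	schedstat_set(curr->stats.exec_max,
		      max(curr->stats.exec_max, delta_exec));

	/*
	 * formerly update_current_exec_runtime(curr, now, delta_exec);
	 * in the negative-delta case delta_exec is 0 here:
	 */
	curr->se.sum_exec_runtime += delta_exec;	/* += 0, no effect */
	account_group_exec_runtime(curr, delta_exec);	/* charges 0 */
	curr->se.exec_start = now;			/* resets the window */
	cgroup_account_cputime(curr, delta_exec);	/* charges 0 */
}

With delta_exec clamped to 0, the sums and charges are no-ops; the
exec_start assignment is the only thing the new early return skips.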
From the commit, it's unclear how intentional this side-effect is for
the edge case where the interval is negative.
I can't say I've really wrapped my head around the cases where the
se.exec_start would get ahead of the rq_clock_task(), so it's not
clear in which cases we would want to reset the accounting window vs
wait for the rq_clock_task() to catch up. But as this is getting
called from put_prev_task_stop(), it seems we're closing the
accounting window here anyway, and later set_next_task_stop() would be
called (which sets se.exec_start, resetting the accounting) to start
the accounting window again.
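Roughly the sequence I have in mind (the set_next_task_stop() body here is
from memory of current stop_task.c, so treat it as an approximation):

/* closing the window: with this patch a stale/negative delta just bails */
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
	update_curr_common(rq);		/* returns early if delta_exec <= 0 */
}

/* reopening the window the next time the stop task is picked */
static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool first)
{
	stop->se.exec_start = rq_clock_task(rq);
}

So even when the early return leaves se.exec_start alone,
set_next_task_stop() re-arms it before any further delta is computed.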
So you are right that there is a practical change in behavior, but I
don't think I see it having an effect.
But I've added Mike and Daniel to the CC in case I'm missing something.
thanks
-john
On 12/18/23 12:23, John Stultz wrote:
> On Sun, Dec 17, 2023 at 8:19 AM Qais Yousef <qyousef@layalina.io> wrote:
> > On 11/06/23 19:34, John Stultz wrote:
> > > From: Peter Zijlstra <peterz@infradead.org>
> > >
> > > All classes use sched_entity::exec_start to track runtime and have
> > > copies of the exact same code around to compute runtime.
> > >
> > > Collapse all that.
> > >
> ...
> > Looks like this actually got merged into tip via the deadline server work :-)
>
> Oh! That's great to see! The patch has been floating around for a while.
>
> > Though not sure if I caught a bug here
> >
> > > diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
> > > index 85590599b4d6..7595494ceb6d 100644
> > > --- a/kernel/sched/stop_task.c
> > > +++ b/kernel/sched/stop_task.c
> > > @@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq)
> > >
> > > static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
> > > {
> > > - struct task_struct *curr = rq->curr;
> > > - u64 now, delta_exec;
> > > -
> > > - now = rq_clock_task(rq);
> > > - delta_exec = now - curr->se.exec_start;
> > > - if (unlikely((s64)delta_exec < 0))
> > > - delta_exec = 0;
> >
> > If negative, instead of returning for the stopper task, we set delta_exec to 0
> >
> > > -
> > > - schedstat_set(curr->stats.exec_max,
> > > - max(curr->stats.exec_max, delta_exec));
> > > -
> > > - update_current_exec_runtime(curr, now, delta_exec);
> >
> > And carry on to do time accounting
> >
> > > + update_curr_common(rq);
> >
> > But the new function will return early without doing accounting. Wouldn't this
> > re-introduce 8f6189684eb4 ("sched: Fix migration thread runtime bogosity")?
>
> Hrm. So first, good eye for catching this!
> Looking through the code, much of the accounting logic we end up
> skipping doesn't have much effect when delta_exec = 0, so it seems
> mostly harmless to return early without the accounting.
>
> Though, there is one side-effect that does get skipped, which is that the
> removed update_current_exec_runtime() unconditionally sets:
> curr->se.exec_start = now;
>
> Which basically resets the accounting window.
>
> From the commit, it's unclear how intentional this side-effect is for
> the edge case where the interval is negative.
>
> I can't say I've really wrapped my head around the cases where the
> se.exec_start would get ahead of the rq_clock_task(), so it's not
> clear in which cases we would want to reset the accounting window vs
> wait for the rq_clock_task() to catch up. But as this is getting
> called from put_prev_task_stop(), it seems we're closing the
> accounting window here anyway, and later set_next_task_stop() would be
> called (which sets se.exec_start, resetting the accounting) to start
> the accounting window again.
>
> So you are right that there is a practical change in behavior, but I
> don't think I see it having an effect.
Yes, agreed. I couldn't reproduce any problem, and I can't see a terrible side
effect of returning early compared to continuing to do the accounting.
Cheers
--
Qais Yousef