Without proxy-exec, we normally charge the "current" task for
both its vruntime as well as its sum_exec_runtime.
With proxy, however, we have two "current" contexts: the
scheduler context and the execution context. We want to charge
the execution context rq->curr (ie: proxy/lock holder) execution
time to its sum_exec_runtime (so it's clear to userland the
rq->curr task *is* running), as well as its thread group.
However the rest of the time accounting (such as vruntime and
cgroup accounting), we charge against the scheduler context
(rq->donor) task, because it is from that task that the time
is being "donated".
If the donor and curr tasks are the same, then it's the same as
without proxy.
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: John Stultz <jstultz@google.com>
---
v16:
* Renamed update_curr_se to update_se_times, as suggested by
Steven Rostedt.
* Reworded the commit message as suggested by Steven Rostedt
v17:
* Renamed update_se_times to update_se, as suggested by Peter
* Added comment clarifying cfs_rq->curr being rq->donor.se as
suggested by Peter
* Reworked to fix issue Peter pointed out with thread group
accounting being done on the donor, rather than the running
execution context.
* Slight rewording of the commit message to further clarify
things.
Cc: Joel Fernandes <joelagnelf@nvidia.com>
Cc: Qais Yousef <qyousef@layalina.io>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Zimuzo Ezeozue <zezeozue@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Metin Kaya <Metin.Kaya@arm.com>
Cc: Xuewen Yan <xuewen.yan94@gmail.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: kuyo chang <kuyo.chang@mediatek.com>
Cc: hupu <hupu.gm@gmail.com>
Cc: kernel-team@android.com
---
kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++------------
1 file changed, 28 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index edcc7d59ecc3b..c34e0891193a7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1143,30 +1143,40 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */
-static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
+static s64 update_se(struct rq *rq, struct sched_entity *se)
{
u64 now = rq_clock_task(rq);
s64 delta_exec;
- delta_exec = now - curr->exec_start;
+ delta_exec = now - se->exec_start;
if (unlikely(delta_exec <= 0))
return delta_exec;
- curr->exec_start = now;
- curr->sum_exec_runtime += delta_exec;
+ se->exec_start = now;
+ if (entity_is_task(se)) {
+ struct task_struct *donor = task_of(se);
+ struct task_struct *running = rq->curr;
+ /*
+ * If se is a task, we account the time against the running
+ * task, as w/ proxy-exec they may not be the same.
+ */
+ running->se.exec_start = now;
+ running->se.sum_exec_runtime += delta_exec;
- if (entity_is_task(curr)) {
- struct task_struct *p = task_of(curr);
+ trace_sched_stat_runtime(running, delta_exec);
+ account_group_exec_runtime(running, delta_exec);
- trace_sched_stat_runtime(p, delta_exec);
- account_group_exec_runtime(p, delta_exec);
- cgroup_account_cputime(p, delta_exec);
+ /* cgroup time is always accounted against the donor */
+ cgroup_account_cputime(donor, delta_exec);
+ } else {
+ /* If not task, account the time against donor se */
+ se->sum_exec_runtime += delta_exec;
}
if (schedstat_enabled()) {
struct sched_statistics *stats;
- stats = __schedstats_from_se(curr);
+ stats = __schedstats_from_se(se);
__schedstat_set(stats->exec_max,
max(delta_exec, stats->exec_max));
}
@@ -1213,7 +1223,7 @@ s64 update_curr_common(struct rq *rq)
{
struct task_struct *donor = rq->donor;
- return update_curr_se(rq, &donor->se);
+ return update_se(rq, &donor->se);
}
/*
@@ -1221,6 +1231,12 @@ s64 update_curr_common(struct rq *rq)
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
+ /*
+ * Note: cfs_rq->curr corresponds to the task picked to
+ * run (ie: rq->donor.se) which due to proxy-exec may
+ * not necessarily be the actual task running
+ * (rq->curr.se). This is easy to confuse!
+ */
struct sched_entity *curr = cfs_rq->curr;
struct rq *rq = rq_of(cfs_rq);
s64 delta_exec;
@@ -1229,7 +1245,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
if (unlikely(!curr))
return;
- delta_exec = update_curr_se(rq, curr);
+ delta_exec = update_se(rq, curr);
if (unlikely(delta_exec <= 0))
return;
--
2.50.0.727.gbf7dc18ff4-goog
On Mon, Jul 07, 2025 at 08:43:52PM +0000, John Stultz wrote: > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index edcc7d59ecc3b..c34e0891193a7 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -1143,30 +1143,40 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) > } > #endif /* CONFIG_SMP */ > > +static s64 update_se(struct rq *rq, struct sched_entity *se) > { > u64 now = rq_clock_task(rq); > s64 delta_exec; > > + delta_exec = now - se->exec_start; > if (unlikely(delta_exec <= 0)) > return delta_exec; > > + se->exec_start = now; > + if (entity_is_task(se)) { > + struct task_struct *donor = task_of(se); > + struct task_struct *running = rq->curr; > + /* > + * If se is a task, we account the time against the running > + * task, as w/ proxy-exec they may not be the same. > + */ > + running->se.exec_start = now; > + running->se.sum_exec_runtime += delta_exec; > > + trace_sched_stat_runtime(running, delta_exec); > + account_group_exec_runtime(running, delta_exec); > > + /* cgroup time is always accounted against the donor */ > + cgroup_account_cputime(donor, delta_exec); > + } else { > + /* If not task, account the time against donor se */ > + se->sum_exec_runtime += delta_exec; > } Bah.. this is all terrible :-) But yeah, I suppose this wil do. > > if (schedstat_enabled()) { > struct sched_statistics *stats; > > + stats = __schedstats_from_se(se); > __schedstat_set(stats->exec_max, > max(delta_exec, stats->exec_max)); > } > @@ -1213,7 +1223,7 @@ s64 update_curr_common(struct rq *rq) > { > struct task_struct *donor = rq->donor; > > + return update_se(rq, &donor->se); > } At this point this might as well read: return update_se(rq, &rq->donor->se);
On Thu, Jul 10, 2025 at 2:45 AM Peter Zijlstra <peterz@infradead.org> wrote: > On Mon, Jul 07, 2025 at 08:43:52PM +0000, John Stultz wrote: > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > > index edcc7d59ecc3b..c34e0891193a7 100644 > > --- a/kernel/sched/fair.c > > +++ b/kernel/sched/fair.c > > @@ -1143,30 +1143,40 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) > > } > > #endif /* CONFIG_SMP */ > > > > +static s64 update_se(struct rq *rq, struct sched_entity *se) > > { > > u64 now = rq_clock_task(rq); > > s64 delta_exec; > > > > + delta_exec = now - se->exec_start; > > if (unlikely(delta_exec <= 0)) > > return delta_exec; > > > > + se->exec_start = now; > > + if (entity_is_task(se)) { > > + struct task_struct *donor = task_of(se); > > + struct task_struct *running = rq->curr; > > + /* > > + * If se is a task, we account the time against the running > > + * task, as w/ proxy-exec they may not be the same. > > + */ > > + running->se.exec_start = now; > > + running->se.sum_exec_runtime += delta_exec; > > > > + trace_sched_stat_runtime(running, delta_exec); > > + account_group_exec_runtime(running, delta_exec); > > > > + /* cgroup time is always accounted against the donor */ > > + cgroup_account_cputime(donor, delta_exec); > > + } else { > > + /* If not task, account the time against donor se */ > > + se->sum_exec_runtime += delta_exec; > > } > > Bah.. this is all terrible :-) But yeah, I suppose this wil do. Yeah, :/ I'm happy to rework it further if you have ideas. > > @@ -1213,7 +1223,7 @@ s64 update_curr_common(struct rq *rq) > > { > > struct task_struct *donor = rq->donor; > > > > + return update_se(rq, &donor->se); > > } > > At this point this might as well read: > > return update_se(rq, &rq->donor->se); Good point. Done. Thanks so much for the review! I'll get the next iteration ready to go out soon. -john
On Thu, Jul 10, 2025 at 10:25:46AM -0700, John Stultz wrote: > On Thu, Jul 10, 2025 at 2:45 AM Peter Zijlstra <peterz@infradead.org> wrote: > > On Mon, Jul 07, 2025 at 08:43:52PM +0000, John Stultz wrote: > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > > > index edcc7d59ecc3b..c34e0891193a7 100644 > > > --- a/kernel/sched/fair.c > > > +++ b/kernel/sched/fair.c > > > @@ -1143,30 +1143,40 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) > > > } > > > #endif /* CONFIG_SMP */ > > > > > > +static s64 update_se(struct rq *rq, struct sched_entity *se) > > > { > > > u64 now = rq_clock_task(rq); > > > s64 delta_exec; > > > > > > + delta_exec = now - se->exec_start; > > > if (unlikely(delta_exec <= 0)) > > > return delta_exec; > > > > > > + se->exec_start = now; > > > + if (entity_is_task(se)) { > > > + struct task_struct *donor = task_of(se); > > > + struct task_struct *running = rq->curr; > > > + /* > > > + * If se is a task, we account the time against the running > > > + * task, as w/ proxy-exec they may not be the same. > > > + */ > > > + running->se.exec_start = now; > > > + running->se.sum_exec_runtime += delta_exec; > > > > > > + trace_sched_stat_runtime(running, delta_exec); > > > + account_group_exec_runtime(running, delta_exec); > > > > > > + /* cgroup time is always accounted against the donor */ > > > + cgroup_account_cputime(donor, delta_exec); > > > + } else { > > > + /* If not task, account the time against donor se */ > > > + se->sum_exec_runtime += delta_exec; > > > } > > > > Bah.. this is all terrible :-) But yeah, I suppose this wil do. > > Yeah, :/ I'm happy to rework it further if you have ideas. Not really; I stared at this for a bit yesterday. Its a confusing mess, but I didn't come up with anything better.
© 2016 - 2025 Red Hat, Inc.