tools/perf/Documentation/perf-sched.txt | 8 +++ tools/perf/builtin-sched.c | 83 ++++++++++++++++--------- 2 files changed, 60 insertions(+), 31 deletions(-)
Pre-migration wait time is the time that a task unnecessarily spends
on the runqueue of a CPU but doesn't get switched in there. In terms
of tracepoints, it is the time between sched:sched_wakeup and
sched:sched_migrate_task.
Let's say a task woke up on CPU2, then it got migrated to CPU4, and
then it was switched in on CPU4. So, here the pre-migration wait time is
the time that it was waiting on the runqueue of CPU2 after it was woken up.
The general pattern for pre-migration to occur is:
sched:sched_wakeup
sched:sched_migrate_task
sched:sched_switch
The sched:sched_waking event is used to capture the wakeup time,
as it aligns with the existing code and only introduces a negligible
time difference.
Pre-migrations are generally not useful, and they increase the number of migrations.
This metric would be helpful in testing patches mainly related to the wakeup
and load-balancer code paths, as better wakeup logic would choose an
optimal CPU where the task would be switched in, thereby reducing pre-
migrations.
The sample output(s) when -P or --pre-migrations is used:
=================
time cpu task name wait time sch delay run time pre-mig time
[tid/pid] (msec) (msec) (msec) (msec)
--------------- ------ ------------------------------ --------- --------- --------- ---------
38456.720806 [0001] schbench[28634/28574] 4.917 4.768 1.004 0.000
38456.720810 [0001] rcu_preempt[18] 3.919 0.003 0.004 0.000
38456.721800 [0006] schbench[28779/28574] 23.465 23.465 1.999 0.000
38456.722800 [0002] schbench[28773/28574] 60.371 60.237 3.955 60.197
38456.722806 [0001] schbench[28634/28574] 0.004 0.004 1.996 0.000
38456.722811 [0001] rcu_preempt[18] 1.996 0.005 0.005 0.000
38456.723800 [0000] schbench[28833/28574] 4.000 4.000 3.999 0.000
38456.723800 [0004] schbench[28762/28574] 42.951 42.839 3.999 39.867
38456.723802 [0007] schbench[28812/28574] 43.947 43.817 3.999 40.866
38456.723804 [0001] schbench[28587/28574] 7.935 7.822 0.993 0.000
Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
---
Changes in v3:
- Use the sched:sched_waking event to calculate the wakeup time. (Namhyung Kim)
- Rebase against perf-tools-next commit 80f192724e31 ("perf tests: Add more
topdown events regroup tests")
Changes in v2:
- Use timehist_sched_wakeup_event() to get the sched_wakeup time. (Namhyung Kim)
- Rebase against perf-tools-next commit b38c49d8296b ("perf/test: Speed up test
case perf annotate basic tests")
Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
---
tools/perf/Documentation/perf-sched.txt | 8 +++
tools/perf/builtin-sched.c | 83 ++++++++++++++++---------
2 files changed, 60 insertions(+), 31 deletions(-)
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 3db64954a267..6dbbddb6464d 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist'
priorities are specified with -: 120-129. A combination of both can also be
provided: 0,120-129.
+-P::
+--pre-migrations::
+ Show pre-migration wait time. pre-migration wait time is the time spent
+ by a task waiting on a runqueue but not getting the chance to run there
+ and is migrated to a different runqueue where it is finally run. This
+ time between sched_wakeup and migrate_task is the pre-migration wait
+ time.
+
OPTIONS for 'perf sched replay'
------------------------------
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index fdf979aaf275..e6540803ced6 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -225,6 +225,7 @@ struct perf_sched {
bool show_wakeups;
bool show_next;
bool show_migrations;
+ bool pre_migrations;
bool show_state;
bool show_prio;
u64 skipped_samples;
@@ -244,7 +245,9 @@ struct thread_runtime {
u64 dt_iowait; /* time between CPU access by iowait (off cpu) */
u64 dt_preempt; /* time between CPU access by preempt (off cpu) */
u64 dt_delay; /* time between wakeup and sched-in */
+ u64 dt_pre_mig; /* time between migration and wakeup */
u64 ready_to_run; /* time of wakeup */
+ u64 migrated; /* time when a thread is migrated */
struct stats run_stats;
u64 total_run_time;
@@ -252,6 +255,7 @@ struct thread_runtime {
u64 total_iowait_time;
u64 total_preempt_time;
u64 total_delay_time;
+ u64 total_pre_mig_time;
char last_state;
@@ -2073,14 +2077,15 @@ static void timehist_header(struct perf_sched *sched)
printf(" ");
}
- if (sched->show_prio) {
- printf(" %-*s %-*s %9s %9s %9s",
- comm_width, "task name", MAX_PRIO_STR_LEN, "prio",
- "wait time", "sch delay", "run time");
- } else {
- printf(" %-*s %9s %9s %9s", comm_width,
- "task name", "wait time", "sch delay", "run time");
- }
+ printf(" %-*s", comm_width, "task name");
+
+ if (sched->show_prio)
+ printf(" %-*s", MAX_PRIO_STR_LEN, "prio");
+
+ printf(" %9s %9s %9s", "wait time", "sch delay", "run time");
+
+ if (sched->pre_migrations)
+ printf(" %9s", "pre-mig time");
if (sched->show_state)
printf(" %s", "state");
@@ -2095,17 +2100,15 @@ static void timehist_header(struct perf_sched *sched)
if (sched->show_cpu_visual)
printf(" %*s ", ncpus, "");
- if (sched->show_prio) {
- printf(" %-*s %-*s %9s %9s %9s",
- comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "",
- "(msec)", "(msec)", "(msec)");
- } else {
- printf(" %-*s %9s %9s %9s", comm_width,
- "[tid/pid]", "(msec)", "(msec)", "(msec)");
- }
+ printf(" %-*s", comm_width, "[tid/pid]");
- if (sched->show_state)
- printf(" %5s", "");
+ if (sched->show_prio)
+ printf(" %-*s", MAX_PRIO_STR_LEN, "");
+
+ printf(" %9s %9s %9s", "(msec)", "(msec)", "(msec)");
+
+ if (sched->pre_migrations)
+ printf(" %9s", "(msec)");
printf("\n");
@@ -2117,15 +2120,15 @@ static void timehist_header(struct perf_sched *sched)
if (sched->show_cpu_visual)
printf(" %.*s ", ncpus, graph_dotted_line);
- if (sched->show_prio) {
- printf(" %.*s %.*s %.9s %.9s %.9s",
- comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line,
- graph_dotted_line, graph_dotted_line, graph_dotted_line);
- } else {
- printf(" %.*s %.9s %.9s %.9s", comm_width,
- graph_dotted_line, graph_dotted_line, graph_dotted_line,
- graph_dotted_line);
- }
+ printf(" %.*s", comm_width, graph_dotted_line);
+
+ if (sched->show_prio)
+ printf(" %.*s", MAX_PRIO_STR_LEN, graph_dotted_line);
+
+ printf(" %.9s %.9s %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line);
+
+ if (sched->pre_migrations)
+ printf(" %.9s", graph_dotted_line);
if (sched->show_state)
printf(" %.5s", graph_dotted_line);
@@ -2180,6 +2183,8 @@ static void timehist_print_sample(struct perf_sched *sched,
print_sched_time(tr->dt_delay, 6);
print_sched_time(tr->dt_run, 6);
+ if (sched->pre_migrations)
+ print_sched_time(tr->dt_pre_mig, 6);
if (sched->show_state)
printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
@@ -2239,6 +2244,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
r->dt_iowait = 0;
r->dt_preempt = 0;
r->dt_run = 0;
+ r->dt_pre_mig = 0;
if (tprev) {
r->dt_run = t - tprev;
@@ -2247,6 +2253,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
else
r->dt_delay = tprev - r->ready_to_run;
+
+ if (r->ready_to_run && r->migrated) {
+ if ((r->migrated > r->ready_to_run) && (r->migrated < tprev))
+ r->dt_pre_mig = r->migrated - r->ready_to_run;
+ }
}
if (r->last_time > tprev)
@@ -2270,6 +2281,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
r->total_sleep_time += r->dt_sleep;
r->total_iowait_time += r->dt_iowait;
r->total_preempt_time += r->dt_preempt;
+ r->total_pre_mig_time += r->dt_pre_mig;
}
static bool is_idle_sample(struct perf_sample *sample,
@@ -2684,8 +2696,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool,
tr->migrations++;
+ if (tr->migrated == 0)
+ tr->migrated = sample->time;
+
/* show migrations if requested */
- timehist_print_migration_event(sched, evsel, sample, machine, thread);
+ if (sched->show_migrations) {
+ timehist_print_migration_event(sched, evsel, sample,
+ machine, thread);
+ }
return 0;
}
@@ -2836,11 +2854,13 @@ static int timehist_sched_change_event(const struct perf_tool *tool,
/* last state is used to determine where to account wait time */
tr->last_state = state;
- /* sched out event for task so reset ready to run time */
+ /* sched out event for task so reset ready to run time and migrated time */
if (state == 'R')
tr->ready_to_run = t;
else
tr->ready_to_run = 0;
+
+ tr->migrated = 0;
}
evsel__save_time(evsel, sample->time, sample->cpu);
@@ -3280,8 +3300,8 @@ static int perf_sched__timehist(struct perf_sched *sched)
goto out;
}
- if (sched->show_migrations &&
- perf_session__set_tracepoints_handlers(session, migrate_handlers))
+ if ((sched->show_migrations || sched->pre_migrations) &&
+ perf_session__set_tracepoints_handlers(session, migrate_handlers))
goto out;
/* pre-allocate struct for per-CPU idle stats */
@@ -3823,6 +3843,7 @@ int cmd_sched(int argc, const char **argv)
OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
OPT_STRING(0, "prio", &sched.prio_str, "prio",
"analyze events only for given task priority(ies)"),
+ OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
OPT_PARENT(sched_options)
};
--
2.43.2
On Wed, 2024-10-02 at 22:09 +0530, Madadi Vineeth Reddy wrote: > pre-migration wait time is the time that a task unnecessarily spends > on the runqueue of a CPU but doesn't get switched-in there. In terms > of tracepoints, it is the time between sched:sched_wakeup and > sched:sched_migrate_task. > > Let's say a task woke up on CPU2, then it got migrated to CPU4 and > then it's switched-in to CPU4. So, here pre-migration wait time is > time that it was waiting on runqueue of CPU2 after it is woken up. > > The general pattern for pre-migration to occur is: > sched:sched_wakeup > sched:sched_migrate_task > sched:sched_switch If a task migrate from CPU A to CPU B, but is unlucky that someone took CPU B and it has to wait. Then it is yet again migrated to CPU C. Do you only compute pre-mig time as t_sched_migrate_task_CPU_A - t_sched_wakeup_CPU_A? The task also spend some pre-mig time on CPU_B that probably should be included. And that time is when it migrates to B till it is migrated away from B. Do you take that into account? > > The sched:sched_waking event is used to capture the wakeup time, > as it aligns with the existing code and only introduces a negligible > time difference. > > pre-migrations are generally not useful and it increases migrations. Not sure long pre-migration itself leads to migrations. But rather migrations is a function of the aggressiveness of the idle load balancer. > This metric would be helpful in testing patches mainly related to wakeup > and load-balancer code paths > I think it is most relevant for the idle load balancer code paths, where newly idle CPU pull task from other CPUs to run. Are there cases where you have used this metric to tune the scheduler? Thanks. Tim > as better wakeup logic would choose an > optimal CPU where task would be switched-in and thereby reducing pre- migrations. 
> > The sample output(s) when -P or --pre-migrations is used: > ================= > time cpu task name wait time sch delay run time pre-mig time > [tid/pid] (msec) (msec) (msec) (msec) > --------------- ------ ------------------------------ --------- --------- --------- --------- > 38456.720806 [0001] schbench[28634/28574] 4.917 4.768 1.004 0.000 > 38456.720810 [0001] rcu_preempt[18] 3.919 0.003 0.004 0.000 > 38456.721800 [0006] schbench[28779/28574] 23.465 23.465 1.999 0.000 > 38456.722800 [0002] schbench[28773/28574] 60.371 60.237 3.955 60.197 > 38456.722806 [0001] schbench[28634/28574] 0.004 0.004 1.996 0.000 > 38456.722811 [0001] rcu_preempt[18] 1.996 0.005 0.005 0.000 > 38456.723800 [0000] schbench[28833/28574] 4.000 4.000 3.999 0.000 > 38456.723800 [0004] schbench[28762/28574] 42.951 42.839 3.999 39.867 > 38456.723802 [0007] schbench[28812/28574] 43.947 43.817 3.999 40.866 > 38456.723804 [0001] schbench[28587/28574] 7.935 7.822 0.993 0.000 > > Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com> > > --- > Changes in v3: > - Use the sched:sched_waking event to calculate the wakeup time. (Namhyung Kim) > - Rebase against perf-tools-next commit 80f192724e31 ("perf tests: Add more > topdown events regroup tests") > > Changes in v2: > - Use timehist_sched_wakeup_event() to get the sched_wakeup time. 
(Namhyung Kim) > - Rebase against perf-tools-next commit b38c49d8296b ("perf/test: Speed up test > case perf annotate basic tests") > > Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com> > --- > tools/perf/Documentation/perf-sched.txt | 8 +++ > tools/perf/builtin-sched.c | 83 ++++++++++++++++--------- > 2 files changed, 60 insertions(+), 31 deletions(-) > > diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt > index 3db64954a267..6dbbddb6464d 100644 > --- a/tools/perf/Documentation/perf-sched.txt > +++ b/tools/perf/Documentation/perf-sched.txt > @@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist' > priorities are specified with -: 120-129. A combination of both can also be > provided: 0,120-129. > > +-P:: > +--pre-migrations:: > + Show pre-migration wait time. pre-migration wait time is the time spent > + by a task waiting on a runqueue but not getting the chance to run there > + and is migrated to a different runqueue where it is finally run. This > + time between sched_wakeup and migrate_task is the pre-migration wait > + time. 
> + > OPTIONS for 'perf sched replay' > ------------------------------ > > diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c > index fdf979aaf275..e6540803ced6 100644 > --- a/tools/perf/builtin-sched.c > +++ b/tools/perf/builtin-sched.c > @@ -225,6 +225,7 @@ struct perf_sched { > bool show_wakeups; > bool show_next; > bool show_migrations; > + bool pre_migrations; > bool show_state; > bool show_prio; > u64 skipped_samples; > @@ -244,7 +245,9 @@ struct thread_runtime { > u64 dt_iowait; /* time between CPU access by iowait (off cpu) */ > u64 dt_preempt; /* time between CPU access by preempt (off cpu) */ > u64 dt_delay; /* time between wakeup and sched-in */ > + u64 dt_pre_mig; /* time between migration and wakeup */ > u64 ready_to_run; /* time of wakeup */ > + u64 migrated; /* time when a thread is migrated */ > > struct stats run_stats; > u64 total_run_time; > @@ -252,6 +255,7 @@ struct thread_runtime { > u64 total_iowait_time; > u64 total_preempt_time; > u64 total_delay_time; > + u64 total_pre_mig_time; > > char last_state; > > @@ -2073,14 +2077,15 @@ static void timehist_header(struct perf_sched *sched) > printf(" "); > } > > - if (sched->show_prio) { > - printf(" %-*s %-*s %9s %9s %9s", > - comm_width, "task name", MAX_PRIO_STR_LEN, "prio", > - "wait time", "sch delay", "run time"); > - } else { > - printf(" %-*s %9s %9s %9s", comm_width, > - "task name", "wait time", "sch delay", "run time"); > - } > + printf(" %-*s", comm_width, "task name"); > + > + if (sched->show_prio) > + printf(" %-*s", MAX_PRIO_STR_LEN, "prio"); > + > + printf(" %9s %9s %9s", "wait time", "sch delay", "run time"); > + > + if (sched->pre_migrations) > + printf(" %9s", "pre-mig time"); > > if (sched->show_state) > printf(" %s", "state"); > @@ -2095,17 +2100,15 @@ static void timehist_header(struct perf_sched *sched) > if (sched->show_cpu_visual) > printf(" %*s ", ncpus, ""); > > - if (sched->show_prio) { > - printf(" %-*s %-*s %9s %9s %9s", > - comm_width, "[tid/pid]", 
MAX_PRIO_STR_LEN, "", > - "(msec)", "(msec)", "(msec)"); > - } else { > - printf(" %-*s %9s %9s %9s", comm_width, > - "[tid/pid]", "(msec)", "(msec)", "(msec)"); > - } > + printf(" %-*s", comm_width, "[tid/pid]"); > > - if (sched->show_state) > - printf(" %5s", ""); > + if (sched->show_prio) > + printf(" %-*s", MAX_PRIO_STR_LEN, ""); > + > + printf(" %9s %9s %9s", "(msec)", "(msec)", "(msec)"); > + > + if (sched->pre_migrations) > + printf(" %9s", "(msec)"); > > printf("\n"); > > @@ -2117,15 +2120,15 @@ static void timehist_header(struct perf_sched *sched) > if (sched->show_cpu_visual) > printf(" %.*s ", ncpus, graph_dotted_line); > > - if (sched->show_prio) { > - printf(" %.*s %.*s %.9s %.9s %.9s", > - comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line, > - graph_dotted_line, graph_dotted_line, graph_dotted_line); > - } else { > - printf(" %.*s %.9s %.9s %.9s", comm_width, > - graph_dotted_line, graph_dotted_line, graph_dotted_line, > - graph_dotted_line); > - } > + printf(" %.*s", comm_width, graph_dotted_line); > + > + if (sched->show_prio) > + printf(" %.*s", MAX_PRIO_STR_LEN, graph_dotted_line); > + > + printf(" %.9s %.9s %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line); > + > + if (sched->pre_migrations) > + printf(" %.9s", graph_dotted_line); > > if (sched->show_state) > printf(" %.5s", graph_dotted_line); > @@ -2180,6 +2183,8 @@ static void timehist_print_sample(struct perf_sched *sched, > > print_sched_time(tr->dt_delay, 6); > print_sched_time(tr->dt_run, 6); > + if (sched->pre_migrations) > + print_sched_time(tr->dt_pre_mig, 6); > > if (sched->show_state) > printf(" %5c ", thread__tid(thread) == 0 ? 
'I' : state); > @@ -2239,6 +2244,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, > r->dt_iowait = 0; > r->dt_preempt = 0; > r->dt_run = 0; > + r->dt_pre_mig = 0; > > if (tprev) { > r->dt_run = t - tprev; > @@ -2247,6 +2253,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, > pr_debug("time travel: wakeup time for task > previous sched_switch event\n"); > else > r->dt_delay = tprev - r->ready_to_run; > + > + if (r->ready_to_run && r->migrated) { > + if ((r->migrated > r->ready_to_run) && (r->migrated < tprev)) > + r->dt_pre_mig = r->migrated - r->ready_to_run; > + } > } > > if (r->last_time > tprev) > @@ -2270,6 +2281,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, > r->total_sleep_time += r->dt_sleep; > r->total_iowait_time += r->dt_iowait; > r->total_preempt_time += r->dt_preempt; > + r->total_pre_mig_time += r->dt_pre_mig; > } > > static bool is_idle_sample(struct perf_sample *sample, > @@ -2684,8 +2696,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool, > > tr->migrations++; > > + if (tr->migrated == 0) > + tr->migrated = sample->time; > + > /* show migrations if requested */ > - timehist_print_migration_event(sched, evsel, sample, machine, thread); > + if (sched->show_migrations) { > + timehist_print_migration_event(sched, evsel, sample, > + machine, thread); > + } > > return 0; > } > @@ -2836,11 +2854,13 @@ static int timehist_sched_change_event(const struct perf_tool *tool, > /* last state is used to determine where to account wait time */ > tr->last_state = state; > > - /* sched out event for task so reset ready to run time */ > + /* sched out event for task so reset ready to run time and migrated time */ > if (state == 'R') > tr->ready_to_run = t; > else > tr->ready_to_run = 0; > + > + tr->migrated = 0; > } > > evsel__save_time(evsel, sample->time, sample->cpu); > @@ -3280,8 +3300,8 @@ static int perf_sched__timehist(struct perf_sched *sched) > goto out; > } > > 
- if (sched->show_migrations && > - perf_session__set_tracepoints_handlers(session, migrate_handlers)) > + if ((sched->show_migrations || sched->pre_migrations) && > + perf_session__set_tracepoints_handlers(session, migrate_handlers)) > goto out; > > /* pre-allocate struct for per-CPU idle stats */ > @@ -3823,6 +3843,7 @@ int cmd_sched(int argc, const char **argv) > OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"), > OPT_STRING(0, "prio", &sched.prio_str, "prio", > "analyze events only for given task priority(ies)"), > + OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"), > OPT_PARENT(sched_options) > }; >
Hi Tim Chen, On 04/10/24 03:34, Tim Chen wrote: > On Wed, 2024-10-02 at 22:09 +0530, Madadi Vineeth Reddy wrote: >> pre-migration wait time is the time that a task unnecessarily spends >> on the runqueue of a CPU but doesn't get switched-in there. In terms >> of tracepoints, it is the time between sched:sched_wakeup and >> sched:sched_migrate_task. >> >> Let's say a task woke up on CPU2, then it got migrated to CPU4 and >> then it's switched-in to CPU4. So, here pre-migration wait time is >> time that it was waiting on runqueue of CPU2 after it is woken up. >> >> The general pattern for pre-migration to occur is: >> sched:sched_wakeup >> sched:sched_migrate_task >> sched:sched_switch > > If a task migrate from CPU A to CPU B, but is unlucky that > someone took CPU B and it has to wait. Then it is yet again migrated to CPU C. > Do you only compute pre-mig time as > t_sched_migrate_task_CPU_A - t_sched_migrate_task_CPU_A? > > The task also spend some pre-mig time on CPU_B that probably > should be included. And that time is when it migrates to B > till it is migrated away from B. Do you take that into > account? > You are right. Currently the time spent on CPU B is not accounted. I will consider the latest migrate time and that should fix it. >> >> The sched:sched_waking event is used to capture the wakeup time, >> as it aligns with the existing code and only introduces a negligible >> time difference. >> >> pre-migrations are generally not useful and it increases migrations. > > Not sure long pre-migration itself leads to migrations. But rather migrations > is a function of the aggressiveness of the idle load balancer. You're right. pre-migration wait time itself doesn't directly cause migrations, but it is a useful metric for understanding the overall impact on task scheduling, especially in the context of the idle load balancer as you mentioned. I agree that this is most relevant for scenarios where newly idle CPUs pull tasks from other CPUs. 
By capturing the pre-migration wait time, the intention is to evaluate how efficiently tasks are balanced and whether there are opportunities for improvement in the wakeup and load-balancing paths. Thank you for the feedback. Thanks, Madadi Vineeth Reddy > >> This metric would be helpful in testing patches mainly related to wakeup >> and load-balancer code paths >> > > I think it is most relevant for the idle load balancer code paths, where > newly idle CPU pull task from other CPUs to run. > > Are there cases where you have used this metric to tune the scheduler? > > Thanks. > > Tim > >> as better wakeup logic would choose an >> optimal CPU where task would be switched-in and thereby reducing pre- >> migrations. >> >> The sample output(s) when -P or --pre-migrations is used: >> ================= >> time cpu task name wait time sch delay run time pre-mig time >> [tid/pid] (msec) (msec) (msec) (msec) >> --------------- ------ ------------------------------ --------- --------- --------- --------- >> 38456.720806 [0001] schbench[28634/28574] 4.917 4.768 1.004 0.000 >> 38456.720810 [0001] rcu_preempt[18] 3.919 0.003 0.004 0.000 >> 38456.721800 [0006] schbench[28779/28574] 23.465 23.465 1.999 0.000 >> 38456.722800 [0002] schbench[28773/28574] 60.371 60.237 3.955 60.197 >> 38456.722806 [0001] schbench[28634/28574] 0.004 0.004 1.996 0.000 >> 38456.722811 [0001] rcu_preempt[18] 1.996 0.005 0.005 0.000 >> 38456.723800 [0000] schbench[28833/28574] 4.000 4.000 3.999 0.000 >> 38456.723800 [0004] schbench[28762/28574] 42.951 42.839 3.999 39.867 >> 38456.723802 [0007] schbench[28812/28574] 43.947 43.817 3.999 40.866 >> 38456.723804 [0001] schbench[28587/28574] 7.935 7.822 0.993 0.000 >> >> Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com> >> >> --- >> Changes in v3: >> - Use the sched:sched_waking event to calculate the wakeup time. 
(Namhyung Kim) >> - Rebase against perf-tools-next commit 80f192724e31 ("perf tests: Add more >> topdown events regroup tests") >> >> Changes in v2: >> - Use timehist_sched_wakeup_event() to get the sched_wakeup time. (Namhyung Kim) >> - Rebase against perf-tools-next commit b38c49d8296b ("perf/test: Speed up test >> case perf annotate basic tests") >> >> Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com> >> --- >> tools/perf/Documentation/perf-sched.txt | 8 +++ >> tools/perf/builtin-sched.c | 83 ++++++++++++++++--------- >> 2 files changed, 60 insertions(+), 31 deletions(-) >> >> diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt >> index 3db64954a267..6dbbddb6464d 100644 >> --- a/tools/perf/Documentation/perf-sched.txt >> +++ b/tools/perf/Documentation/perf-sched.txt >> @@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist' >> priorities are specified with -: 120-129. A combination of both can also be >> provided: 0,120-129. >> >> +-P:: >> +--pre-migrations:: >> + Show pre-migration wait time. pre-migration wait time is the time spent >> + by a task waiting on a runqueue but not getting the chance to run there >> + and is migrated to a different runqueue where it is finally run. This >> + time between sched_wakeup and migrate_task is the pre-migration wait >> + time. 
>> + >> OPTIONS for 'perf sched replay' >> ------------------------------ >> >> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c >> index fdf979aaf275..e6540803ced6 100644 >> --- a/tools/perf/builtin-sched.c >> +++ b/tools/perf/builtin-sched.c >> @@ -225,6 +225,7 @@ struct perf_sched { >> bool show_wakeups; >> bool show_next; >> bool show_migrations; >> + bool pre_migrations; >> bool show_state; >> bool show_prio; >> u64 skipped_samples; >> @@ -244,7 +245,9 @@ struct thread_runtime { >> u64 dt_iowait; /* time between CPU access by iowait (off cpu) */ >> u64 dt_preempt; /* time between CPU access by preempt (off cpu) */ >> u64 dt_delay; /* time between wakeup and sched-in */ >> + u64 dt_pre_mig; /* time between migration and wakeup */ >> u64 ready_to_run; /* time of wakeup */ >> + u64 migrated; /* time when a thread is migrated */ >> >> struct stats run_stats; >> u64 total_run_time; >> @@ -252,6 +255,7 @@ struct thread_runtime { >> u64 total_iowait_time; >> u64 total_preempt_time; >> u64 total_delay_time; >> + u64 total_pre_mig_time; >> >> char last_state; >> >> @@ -2073,14 +2077,15 @@ static void timehist_header(struct perf_sched *sched) >> printf(" "); >> } >> >> - if (sched->show_prio) { >> - printf(" %-*s %-*s %9s %9s %9s", >> - comm_width, "task name", MAX_PRIO_STR_LEN, "prio", >> - "wait time", "sch delay", "run time"); >> - } else { >> - printf(" %-*s %9s %9s %9s", comm_width, >> - "task name", "wait time", "sch delay", "run time"); >> - } >> + printf(" %-*s", comm_width, "task name"); >> + >> + if (sched->show_prio) >> + printf(" %-*s", MAX_PRIO_STR_LEN, "prio"); >> + >> + printf(" %9s %9s %9s", "wait time", "sch delay", "run time"); >> + >> + if (sched->pre_migrations) >> + printf(" %9s", "pre-mig time"); >> >> if (sched->show_state) >> printf(" %s", "state"); >> @@ -2095,17 +2100,15 @@ static void timehist_header(struct perf_sched *sched) >> if (sched->show_cpu_visual) >> printf(" %*s ", ncpus, ""); >> >> - if (sched->show_prio) { >> - 
printf(" %-*s %-*s %9s %9s %9s", >> - comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "", >> - "(msec)", "(msec)", "(msec)"); >> - } else { >> - printf(" %-*s %9s %9s %9s", comm_width, >> - "[tid/pid]", "(msec)", "(msec)", "(msec)"); >> - } >> + printf(" %-*s", comm_width, "[tid/pid]"); >> >> - if (sched->show_state) >> - printf(" %5s", ""); >> + if (sched->show_prio) >> + printf(" %-*s", MAX_PRIO_STR_LEN, ""); >> + >> + printf(" %9s %9s %9s", "(msec)", "(msec)", "(msec)"); >> + >> + if (sched->pre_migrations) >> + printf(" %9s", "(msec)"); >> >> printf("\n"); >> >> @@ -2117,15 +2120,15 @@ static void timehist_header(struct perf_sched *sched) >> if (sched->show_cpu_visual) >> printf(" %.*s ", ncpus, graph_dotted_line); >> >> - if (sched->show_prio) { >> - printf(" %.*s %.*s %.9s %.9s %.9s", >> - comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line, >> - graph_dotted_line, graph_dotted_line, graph_dotted_line); >> - } else { >> - printf(" %.*s %.9s %.9s %.9s", comm_width, >> - graph_dotted_line, graph_dotted_line, graph_dotted_line, >> - graph_dotted_line); >> - } >> + printf(" %.*s", comm_width, graph_dotted_line); >> + >> + if (sched->show_prio) >> + printf(" %.*s", MAX_PRIO_STR_LEN, graph_dotted_line); >> + >> + printf(" %.9s %.9s %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line); >> + >> + if (sched->pre_migrations) >> + printf(" %.9s", graph_dotted_line); >> >> if (sched->show_state) >> printf(" %.5s", graph_dotted_line); >> @@ -2180,6 +2183,8 @@ static void timehist_print_sample(struct perf_sched *sched, >> >> print_sched_time(tr->dt_delay, 6); >> print_sched_time(tr->dt_run, 6); >> + if (sched->pre_migrations) >> + print_sched_time(tr->dt_pre_mig, 6); >> >> if (sched->show_state) >> printf(" %5c ", thread__tid(thread) == 0 ? 
'I' : state); >> @@ -2239,6 +2244,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, >> r->dt_iowait = 0; >> r->dt_preempt = 0; >> r->dt_run = 0; >> + r->dt_pre_mig = 0; >> >> if (tprev) { >> r->dt_run = t - tprev; >> @@ -2247,6 +2253,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, >> pr_debug("time travel: wakeup time for task > previous sched_switch event\n"); >> else >> r->dt_delay = tprev - r->ready_to_run; >> + >> + if (r->ready_to_run && r->migrated) { >> + if ((r->migrated > r->ready_to_run) && (r->migrated < tprev)) >> + r->dt_pre_mig = r->migrated - r->ready_to_run; >> + } >> } >> >> if (r->last_time > tprev) >> @@ -2270,6 +2281,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, >> r->total_sleep_time += r->dt_sleep; >> r->total_iowait_time += r->dt_iowait; >> r->total_preempt_time += r->dt_preempt; >> + r->total_pre_mig_time += r->dt_pre_mig; >> } >> >> static bool is_idle_sample(struct perf_sample *sample, >> @@ -2684,8 +2696,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool, >> >> tr->migrations++; >> >> + if (tr->migrated == 0) >> + tr->migrated = sample->time; >> + >> /* show migrations if requested */ >> - timehist_print_migration_event(sched, evsel, sample, machine, thread); >> + if (sched->show_migrations) { >> + timehist_print_migration_event(sched, evsel, sample, >> + machine, thread); >> + } >> >> return 0; >> } >> @@ -2836,11 +2854,13 @@ static int timehist_sched_change_event(const struct perf_tool *tool, >> /* last state is used to determine where to account wait time */ >> tr->last_state = state; >> >> - /* sched out event for task so reset ready to run time */ >> + /* sched out event for task so reset ready to run time and migrated time */ >> if (state == 'R') >> tr->ready_to_run = t; >> else >> tr->ready_to_run = 0; >> + >> + tr->migrated = 0; >> } >> >> evsel__save_time(evsel, sample->time, sample->cpu); >> @@ -3280,8 +3300,8 @@ static int 
perf_sched__timehist(struct perf_sched *sched) >> goto out; >> } >> >> - if (sched->show_migrations && >> - perf_session__set_tracepoints_handlers(session, migrate_handlers)) >> + if ((sched->show_migrations || sched->pre_migrations) && >> + perf_session__set_tracepoints_handlers(session, migrate_handlers)) >> goto out; >> >> /* pre-allocate struct for per-CPU idle stats */ >> @@ -3823,6 +3843,7 @@ int cmd_sched(int argc, const char **argv) >> OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"), >> OPT_STRING(0, "prio", &sched.prio_str, "prio", >> "analyze events only for given task priority(ies)"), >> + OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"), >> OPT_PARENT(sched_options) >> }; >> >
On Thu, Oct 03, 2024 at 03:04:28PM -0700, Tim Chen wrote: > On Wed, 2024-10-02 at 22:09 +0530, Madadi Vineeth Reddy wrote: > > pre-migration wait time is the time that a task unnecessarily spends > > on the runqueue of a CPU but doesn't get switched-in there. In terms > > of tracepoints, it is the time between sched:sched_wakeup and > > sched:sched_migrate_task. > > > > Let's say a task woke up on CPU2, then it got migrated to CPU4 and > > then it's switched-in to CPU4. So, here pre-migration wait time is > > time that it was waiting on runqueue of CPU2 after it is woken up. > > > > The general pattern for pre-migration to occur is: > > sched:sched_wakeup > > sched:sched_migrate_task > > sched:sched_switch > > If a task migrate from CPU A to CPU B, but is unlucky that > someone took CPU B and it has to wait. Then it is yet again migrated to CPU C. > Do you only compute pre-mig time as > t_sched_migrate_task_CPU_A - t_sched_migrate_task_CPU_A? > > The task also spend some pre-mig time on CPU_B that probably > should be included. And that time is when it migrates to B > till it is migrated away from B. Do you take that into > account? Hmm.. right. The current code updates the migrate time only if it's 0. But I think it can just always update the time to get the latest one. Thanks, Namhyung
On 04/10/24 03:49, Namhyung Kim wrote: > On Thu, Oct 03, 2024 at 03:04:28PM -0700, Tim Chen wrote: >> On Wed, 2024-10-02 at 22:09 +0530, Madadi Vineeth Reddy wrote: >>> pre-migration wait time is the time that a task unnecessarily spends >>> on the runqueue of a CPU but doesn't get switched-in there. In terms >>> of tracepoints, it is the time between sched:sched_wakeup and >>> sched:sched_migrate_task. >>> >>> Let's say a task woke up on CPU2, then it got migrated to CPU4 and >>> then it's switched-in to CPU4. So, here pre-migration wait time is >>> time that it was waiting on runqueue of CPU2 after it is woken up. >>> >>> The general pattern for pre-migration to occur is: >>> sched:sched_wakeup >>> sched:sched_migrate_task >>> sched:sched_switch >> >> If a task migrate from CPU A to CPU B, but is unlucky that >> someone took CPU B and it has to wait. Then it is yet again migrated to CPU C. >> Do you only compute pre-mig time as >> t_sched_migrate_task_CPU_A - t_sched_migrate_task_CPU_A? >> >> The task also spend some pre-mig time on CPU_B that probably >> should be included. And that time is when it migrates to B >> till it is migrated away from B. Do you take that into >> account? > > Hmm.. right. The current code updates the migrate time only if it's 0. > But I think it can just always update the time to get the latest one. > Right. I’ll update the code to always pick the latest migrate time, ensuring that scenarios with multiple migrations in between are handled correctly. Thanks, Madadi Vineeth Reddy > Thanks, > Namhyung
On Wed, Oct 02, 2024 at 10:09:17PM +0530, Madadi Vineeth Reddy wrote: > pre-migration wait time is the time that a task unnecessarily spends > on the runqueue of a CPU but doesn't get switched-in there. In terms > of tracepoints, it is the time between sched:sched_wakeup and > sched:sched_migrate_task. > > Let's say a task woke up on CPU2, then it got migrated to CPU4 and > then it's switched-in to CPU4. So, here pre-migration wait time is > time that it was waiting on runqueue of CPU2 after it is woken up. > > The general pattern for pre-migration to occur is: > sched:sched_wakeup > sched:sched_migrate_task > sched:sched_switch > > The sched:sched_waking event is used to capture the wakeup time, > as it aligns with the existing code and only introduces a negligible > time difference. > > pre-migrations are generally not useful and it increases migrations. > This metric would be helpful in testing patches mainly related to wakeup > and load-balancer code paths as better wakeup logic would choose an > optimal CPU where task would be switched-in and thereby reducing pre- > migrations. 
> > The sample output(s) when -P or --pre-migrations is used: > ================= > time cpu task name wait time sch delay run time pre-mig time > [tid/pid] (msec) (msec) (msec) (msec) > --------------- ------ ------------------------------ --------- --------- --------- --------- > 38456.720806 [0001] schbench[28634/28574] 4.917 4.768 1.004 0.000 > 38456.720810 [0001] rcu_preempt[18] 3.919 0.003 0.004 0.000 > 38456.721800 [0006] schbench[28779/28574] 23.465 23.465 1.999 0.000 > 38456.722800 [0002] schbench[28773/28574] 60.371 60.237 3.955 60.197 > 38456.722806 [0001] schbench[28634/28574] 0.004 0.004 1.996 0.000 > 38456.722811 [0001] rcu_preempt[18] 1.996 0.005 0.005 0.000 > 38456.723800 [0000] schbench[28833/28574] 4.000 4.000 3.999 0.000 > 38456.723800 [0004] schbench[28762/28574] 42.951 42.839 3.999 39.867 > 38456.723802 [0007] schbench[28812/28574] 43.947 43.817 3.999 40.866 > 38456.723804 [0001] schbench[28587/28574] 7.935 7.822 0.993 0.000 > > Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com> > > --- > Changes in v3: > - Use the sched:sched_waking event to calculate the wakeup time. (Namhyung Kim) > - Rebase against perf-tools-next commit 80f192724e31 ("perf tests: Add more > topdown events regroup tests") > > Changes in v2: > - Use timehist_sched_wakeup_event() to get the sched_wakeup time. 
(Namhyung Kim) > - Rebase against perf-tools-next commit b38c49d8296b ("perf/test: Speed up test > case perf annotate basic tests") > > Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com> > --- > tools/perf/Documentation/perf-sched.txt | 8 +++ > tools/perf/builtin-sched.c | 83 ++++++++++++++++--------- > 2 files changed, 60 insertions(+), 31 deletions(-) > > diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt > index 3db64954a267..6dbbddb6464d 100644 > --- a/tools/perf/Documentation/perf-sched.txt > +++ b/tools/perf/Documentation/perf-sched.txt > @@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist' > priorities are specified with -: 120-129. A combination of both can also be > provided: 0,120-129. > > +-P:: > +--pre-migrations:: > + Show pre-migration wait time. pre-migration wait time is the time spent > + by a task waiting on a runqueue but not getting the chance to run there > + and is migrated to a different runqueue where it is finally run. This > + time between sched_wakeup and migrate_task is the pre-migration wait > + time. 
> + > OPTIONS for 'perf sched replay' > ------------------------------ > > diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c > index fdf979aaf275..e6540803ced6 100644 > --- a/tools/perf/builtin-sched.c > +++ b/tools/perf/builtin-sched.c > @@ -225,6 +225,7 @@ struct perf_sched { > bool show_wakeups; > bool show_next; > bool show_migrations; > + bool pre_migrations; > bool show_state; > bool show_prio; > u64 skipped_samples; > @@ -244,7 +245,9 @@ struct thread_runtime { > u64 dt_iowait; /* time between CPU access by iowait (off cpu) */ > u64 dt_preempt; /* time between CPU access by preempt (off cpu) */ > u64 dt_delay; /* time between wakeup and sched-in */ > + u64 dt_pre_mig; /* time between migration and wakeup */ > u64 ready_to_run; /* time of wakeup */ > + u64 migrated; /* time when a thread is migrated */ > > struct stats run_stats; > u64 total_run_time; > @@ -252,6 +255,7 @@ struct thread_runtime { > u64 total_iowait_time; > u64 total_preempt_time; > u64 total_delay_time; > + u64 total_pre_mig_time; > > char last_state; > > @@ -2073,14 +2077,15 @@ static void timehist_header(struct perf_sched *sched) > printf(" "); > } > > - if (sched->show_prio) { > - printf(" %-*s %-*s %9s %9s %9s", > - comm_width, "task name", MAX_PRIO_STR_LEN, "prio", > - "wait time", "sch delay", "run time"); > - } else { > - printf(" %-*s %9s %9s %9s", comm_width, > - "task name", "wait time", "sch delay", "run time"); > - } > + printf(" %-*s", comm_width, "task name"); > + > + if (sched->show_prio) > + printf(" %-*s", MAX_PRIO_STR_LEN, "prio"); > + > + printf(" %9s %9s %9s", "wait time", "sch delay", "run time"); > + > + if (sched->pre_migrations) > + printf(" %9s", "pre-mig time"); > > if (sched->show_state) > printf(" %s", "state"); > @@ -2095,17 +2100,15 @@ static void timehist_header(struct perf_sched *sched) > if (sched->show_cpu_visual) > printf(" %*s ", ncpus, ""); > > - if (sched->show_prio) { > - printf(" %-*s %-*s %9s %9s %9s", > - comm_width, "[tid/pid]", 
MAX_PRIO_STR_LEN, "", > - "(msec)", "(msec)", "(msec)"); > - } else { > - printf(" %-*s %9s %9s %9s", comm_width, > - "[tid/pid]", "(msec)", "(msec)", "(msec)"); > - } > + printf(" %-*s", comm_width, "[tid/pid]"); > > - if (sched->show_state) > - printf(" %5s", ""); > + if (sched->show_prio) > + printf(" %-*s", MAX_PRIO_STR_LEN, ""); > + > + printf(" %9s %9s %9s", "(msec)", "(msec)", "(msec)"); > + > + if (sched->pre_migrations) > + printf(" %9s", "(msec)"); > > printf("\n"); > > @@ -2117,15 +2120,15 @@ static void timehist_header(struct perf_sched *sched) > if (sched->show_cpu_visual) > printf(" %.*s ", ncpus, graph_dotted_line); > > - if (sched->show_prio) { > - printf(" %.*s %.*s %.9s %.9s %.9s", > - comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line, > - graph_dotted_line, graph_dotted_line, graph_dotted_line); > - } else { > - printf(" %.*s %.9s %.9s %.9s", comm_width, > - graph_dotted_line, graph_dotted_line, graph_dotted_line, > - graph_dotted_line); > - } > + printf(" %.*s", comm_width, graph_dotted_line); > + > + if (sched->show_prio) > + printf(" %.*s", MAX_PRIO_STR_LEN, graph_dotted_line); > + > + printf(" %.9s %.9s %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line); > + > + if (sched->pre_migrations) > + printf(" %.9s", graph_dotted_line); > > if (sched->show_state) > printf(" %.5s", graph_dotted_line); > @@ -2180,6 +2183,8 @@ static void timehist_print_sample(struct perf_sched *sched, > > print_sched_time(tr->dt_delay, 6); > print_sched_time(tr->dt_run, 6); > + if (sched->pre_migrations) > + print_sched_time(tr->dt_pre_mig, 6); > > if (sched->show_state) > printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state); > @@ -2239,6 +2244,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, It'd be nice if you update the comment on timehist_update_runtime_stats. 
> r->dt_iowait = 0; > r->dt_preempt = 0; > r->dt_run = 0; > + r->dt_pre_mig = 0; > > if (tprev) { > r->dt_run = t - tprev; > @@ -2247,6 +2253,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, > pr_debug("time travel: wakeup time for task > previous sched_switch event\n"); > else > r->dt_delay = tprev - r->ready_to_run; > + > + if (r->ready_to_run && r->migrated) { At this point r->ready_to_run is not zero. And the r->migrated should not be zero when the below condition is met. So I think you can remove this condition. Thanks, Namhyung > + if ((r->migrated > r->ready_to_run) && (r->migrated < tprev)) > + r->dt_pre_mig = r->migrated - r->ready_to_run; > + } > } > > if (r->last_time > tprev) > @@ -2270,6 +2281,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, > r->total_sleep_time += r->dt_sleep; > r->total_iowait_time += r->dt_iowait; > r->total_preempt_time += r->dt_preempt; > + r->total_pre_mig_time += r->dt_pre_mig; > } > > static bool is_idle_sample(struct perf_sample *sample, > @@ -2684,8 +2696,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool, > > tr->migrations++; > > + if (tr->migrated == 0) > + tr->migrated = sample->time; > + > /* show migrations if requested */ > - timehist_print_migration_event(sched, evsel, sample, machine, thread); > + if (sched->show_migrations) { > + timehist_print_migration_event(sched, evsel, sample, > + machine, thread); > + } > > return 0; > } > @@ -2836,11 +2854,13 @@ static int timehist_sched_change_event(const struct perf_tool *tool, > /* last state is used to determine where to account wait time */ > tr->last_state = state; > > - /* sched out event for task so reset ready to run time */ > + /* sched out event for task so reset ready to run time and migrated time */ > if (state == 'R') > tr->ready_to_run = t; > else > tr->ready_to_run = 0; > + > + tr->migrated = 0; > } > > evsel__save_time(evsel, sample->time, sample->cpu); > @@ -3280,8 +3300,8 @@ static int
perf_sched__timehist(struct perf_sched *sched) > goto out; > } > > - if (sched->show_migrations && > - perf_session__set_tracepoints_handlers(session, migrate_handlers)) > + if ((sched->show_migrations || sched->pre_migrations) && > + perf_session__set_tracepoints_handlers(session, migrate_handlers)) > goto out; > > /* pre-allocate struct for per-CPU idle stats */ > @@ -3823,6 +3843,7 @@ int cmd_sched(int argc, const char **argv) > OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"), > OPT_STRING(0, "prio", &sched.prio_str, "prio", > "analyze events only for given task priority(ies)"), > + OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"), > OPT_PARENT(sched_options) > }; > > -- > 2.43.2 >
Hi Namhyung, On 03/10/24 06:05, Namhyung Kim wrote: > On Wed, Oct 02, 2024 at 10:09:17PM +0530, Madadi Vineeth Reddy wrote: >> pre-migration wait time is the time that a task unnecessarily spends >> on the runqueue of a CPU but doesn't get switched-in there. In terms >> of tracepoints, it is the time between sched:sched_wakeup and >> sched:sched_migrate_task. >> [snip] >> @@ -2239,6 +2244,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, > > It'd be nice if you update the comment on timehist_update_runtime_stats. > > >> r->dt_iowait = 0; >> r->dt_preempt = 0; >> r->dt_run = 0; >> + r->dt_pre_mig = 0; >> >> if (tprev) { >> r->dt_run = t - tprev; >> @@ -2247,6 +2253,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, >> pr_debug("time travel: wakeup time for task > previous sched_switch event\n"); >> else >> r->dt_delay = tprev - r->ready_to_run; >> + >> + if (r->ready_to_run && r->migrated) { > > At this point r->read_to_run is not zero. And the r->migrated should > not be zero when the below condition is met. So I think you can remove > this condition. You're right, my mistake. r->ready_to_run is already checked, and if r->migrated is zero, it will naturally fail the condition below. I've removed the redundant check and sent a v4. Thanks for the feedback! 
Thanks, Madadi Vineeth Reddy > > Thanks, > Namhyung > > >> + if ((r->migrated > r->ready_to_run) && (r->migrated < tprev)) >> + r->dt_pre_mig = r->migrated - r->ready_to_run; >> + } >> } >> >> if (r->last_time > tprev) >> @@ -2270,6 +2281,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r, >> r->total_sleep_time += r->dt_sleep; >> r->total_iowait_time += r->dt_iowait; >> r->total_preempt_time += r->dt_preempt; >> + r->total_pre_mig_time += r->dt_pre_mig; >> } >> >> static bool is_idle_sample(struct perf_sample *sample, >> @@ -2684,8 +2696,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool, >> >> tr->migrations++; >> >> + if (tr->migrated == 0) >> + tr->migrated = sample->time; >> + >> /* show migrations if requested */ >> - timehist_print_migration_event(sched, evsel, sample, machine, thread); >> + if (sched->show_migrations) { >> + timehist_print_migration_event(sched, evsel, sample, >> + machine, thread); >> + } >> >> return 0; >> } >> @@ -2836,11 +2854,13 @@ static int timehist_sched_change_event(const struct perf_tool *tool, >> /* last state is used to determine where to account wait time */ >> tr->last_state = state; >> >> - /* sched out event for task so reset ready to run time */ >> + /* sched out event for task so reset ready to run time and migrated time */ >> if (state == 'R') >> tr->ready_to_run = t; >> else >> tr->ready_to_run = 0; >> + >> + tr->migrated = 0; >> } >> >> evsel__save_time(evsel, sample->time, sample->cpu); >> @@ -3280,8 +3300,8 @@ static int perf_sched__timehist(struct perf_sched *sched) >> goto out; >> } >> >> - if (sched->show_migrations && >> - perf_session__set_tracepoints_handlers(session, migrate_handlers)) >> + if ((sched->show_migrations || sched->pre_migrations) && >> + perf_session__set_tracepoints_handlers(session, migrate_handlers)) >> goto out; >> >> /* pre-allocate struct for per-CPU idle stats */ >> @@ -3823,6 +3843,7 @@ int cmd_sched(int argc, const char **argv) >> OPT_BOOLEAN(0, 
"show-prio", &sched.show_prio, "Show task priority"), >> OPT_STRING(0, "prio", &sched.prio_str, "prio", >> "analyze events only for given task priority(ies)"), >> + OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"), >> OPT_PARENT(sched_options) >> }; >> >> -- >> 2.43.2 >>
© 2016 - 2024 Red Hat, Inc.