[PATCH v3] perf sched timehist: Add pre-migration wait time option

Madadi Vineeth Reddy posted 1 patch 1 month, 3 weeks ago
There is a newer version of this series
tools/perf/Documentation/perf-sched.txt |  8 +++
tools/perf/builtin-sched.c              | 83 ++++++++++++++++---------
2 files changed, 60 insertions(+), 31 deletions(-)
[PATCH v3] perf sched timehist: Add pre-migration wait time option
Posted by Madadi Vineeth Reddy 1 month, 3 weeks ago
pre-migration wait time is the time that a task unnecessarily spends
on the runqueue of a CPU but doesn't get switched-in there. In terms
of tracepoints, it is the time between sched:sched_wakeup and
sched:sched_migrate_task.

Let's say a task woke up on CPU2, then it got migrated to CPU4 and
then it's switched-in to CPU4. So, here pre-migration wait time is
time that it was waiting on runqueue of CPU2 after it is woken up.

The general pattern for pre-migration to occur is:
sched:sched_wakeup
sched:sched_migrate_task
sched:sched_switch

The sched:sched_waking event is used to capture the wakeup time,
as it aligns with the existing code and only introduces a negligible
time difference.

pre-migrations are generally not useful and it increases migrations.
This metric would be helpful in testing patches mainly related to wakeup
and load-balancer code paths as better wakeup logic would choose an
optimal CPU where task would be switched-in and thereby reducing pre-
migrations.

The sample output(s) when -P or --pre-migrations is used:
=================
           time    cpu  task name                       wait time  sch delay   run time  pre-mig time
                        [tid/pid]                          (msec)     (msec)     (msec)     (msec)
--------------- ------  ------------------------------  ---------  ---------  ---------  ---------
   38456.720806 [0001]  schbench[28634/28574]               4.917      4.768      1.004      0.000
   38456.720810 [0001]  rcu_preempt[18]                     3.919      0.003      0.004      0.000
   38456.721800 [0006]  schbench[28779/28574]              23.465     23.465      1.999      0.000
   38456.722800 [0002]  schbench[28773/28574]              60.371     60.237      3.955     60.197
   38456.722806 [0001]  schbench[28634/28574]               0.004      0.004      1.996      0.000
   38456.722811 [0001]  rcu_preempt[18]                     1.996      0.005      0.005      0.000
   38456.723800 [0000]  schbench[28833/28574]               4.000      4.000      3.999      0.000
   38456.723800 [0004]  schbench[28762/28574]              42.951     42.839      3.999     39.867
   38456.723802 [0007]  schbench[28812/28574]              43.947     43.817      3.999     40.866
   38456.723804 [0001]  schbench[28587/28574]               7.935      7.822      0.993      0.000

Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>

---
Changes in v3:
- Use the sched:sched_waking event to calculate the wakeup time. (Namhyung Kim)
- Rebase against perf-tools-next commit 80f192724e31 ("perf tests: Add more
  topdown events regroup tests")

Changes in v2:
- Use timehist_sched_wakeup_event() to get the sched_wakeup time. (Namhyung Kim)
- Rebase against perf-tools-next commit b38c49d8296b ("perf/test: Speed up test
  case perf annotate basic tests")

Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
---
 tools/perf/Documentation/perf-sched.txt |  8 +++
 tools/perf/builtin-sched.c              | 83 ++++++++++++++++---------
 2 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 3db64954a267..6dbbddb6464d 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist'
 	priorities are specified with -: 120-129. A combination of both can also be
 	provided: 0,120-129.
 
+-P::
+--pre-migrations::
+	Show pre-migration wait time. pre-migration wait time is the time spent
+	by a task waiting on a runqueue but not getting the chance to run there
+	and is migrated to a different runqueue where it is finally run. This
+	time between sched_wakeup and migrate_task is the pre-migration wait
+	time.
+
 OPTIONS for 'perf sched replay'
 ------------------------------
 
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index fdf979aaf275..e6540803ced6 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -225,6 +225,7 @@ struct perf_sched {
 	bool		show_wakeups;
 	bool		show_next;
 	bool		show_migrations;
+	bool		pre_migrations;
 	bool		show_state;
 	bool		show_prio;
 	u64		skipped_samples;
@@ -244,7 +245,9 @@ struct thread_runtime {
 	u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
 	u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
 	u64 dt_delay;       /* time between wakeup and sched-in */
+	u64 dt_pre_mig;     /* time between migration and wakeup */
 	u64 ready_to_run;   /* time of wakeup */
+	u64 migrated;	    /* time when a thread is migrated */
 
 	struct stats run_stats;
 	u64 total_run_time;
@@ -252,6 +255,7 @@ struct thread_runtime {
 	u64 total_iowait_time;
 	u64 total_preempt_time;
 	u64 total_delay_time;
+	u64 total_pre_mig_time;
 
 	char last_state;
 
@@ -2073,14 +2077,15 @@ static void timehist_header(struct perf_sched *sched)
 		printf(" ");
 	}
 
-	if (sched->show_prio) {
-		printf(" %-*s  %-*s  %9s  %9s  %9s",
-		       comm_width, "task name", MAX_PRIO_STR_LEN, "prio",
-		       "wait time", "sch delay", "run time");
-	} else {
-		printf(" %-*s  %9s  %9s  %9s", comm_width,
-		       "task name", "wait time", "sch delay", "run time");
-	}
+	printf(" %-*s", comm_width, "task name");
+
+	if (sched->show_prio)
+		printf("  %-*s", MAX_PRIO_STR_LEN, "prio");
+
+	printf("  %9s  %9s  %9s", "wait time", "sch delay", "run time");
+
+	if (sched->pre_migrations)
+		printf("  %9s", "pre-mig time");
 
 	if (sched->show_state)
 		printf("  %s", "state");
@@ -2095,17 +2100,15 @@ static void timehist_header(struct perf_sched *sched)
 	if (sched->show_cpu_visual)
 		printf(" %*s ", ncpus, "");
 
-	if (sched->show_prio) {
-		printf(" %-*s  %-*s  %9s  %9s  %9s",
-		       comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "",
-		       "(msec)", "(msec)", "(msec)");
-	} else {
-		printf(" %-*s  %9s  %9s  %9s", comm_width,
-		       "[tid/pid]", "(msec)", "(msec)", "(msec)");
-	}
+	printf(" %-*s", comm_width, "[tid/pid]");
 
-	if (sched->show_state)
-		printf("  %5s", "");
+	if (sched->show_prio)
+		printf("  %-*s", MAX_PRIO_STR_LEN, "");
+
+	printf("  %9s  %9s  %9s", "(msec)", "(msec)", "(msec)");
+
+	if (sched->pre_migrations)
+		printf("  %9s", "(msec)");
 
 	printf("\n");
 
@@ -2117,15 +2120,15 @@ static void timehist_header(struct perf_sched *sched)
 	if (sched->show_cpu_visual)
 		printf(" %.*s ", ncpus, graph_dotted_line);
 
-	if (sched->show_prio) {
-		printf(" %.*s  %.*s  %.9s  %.9s  %.9s",
-		       comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line,
-		       graph_dotted_line, graph_dotted_line, graph_dotted_line);
-	} else {
-		printf(" %.*s  %.9s  %.9s  %.9s", comm_width,
-		       graph_dotted_line, graph_dotted_line, graph_dotted_line,
-		       graph_dotted_line);
-	}
+	printf(" %.*s", comm_width, graph_dotted_line);
+
+	if (sched->show_prio)
+		printf("  %.*s", MAX_PRIO_STR_LEN, graph_dotted_line);
+
+	printf("  %.9s  %.9s  %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line);
+
+	if (sched->pre_migrations)
+		printf("  %.9s", graph_dotted_line);
 
 	if (sched->show_state)
 		printf("  %.5s", graph_dotted_line);
@@ -2180,6 +2183,8 @@ static void timehist_print_sample(struct perf_sched *sched,
 
 	print_sched_time(tr->dt_delay, 6);
 	print_sched_time(tr->dt_run, 6);
+	if (sched->pre_migrations)
+		print_sched_time(tr->dt_pre_mig, 6);
 
 	if (sched->show_state)
 		printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
@@ -2239,6 +2244,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
 	r->dt_iowait  = 0;
 	r->dt_preempt = 0;
 	r->dt_run     = 0;
+	r->dt_pre_mig = 0;
 
 	if (tprev) {
 		r->dt_run = t - tprev;
@@ -2247,6 +2253,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
 				pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
 			else
 				r->dt_delay = tprev - r->ready_to_run;
+
+			if (r->ready_to_run && r->migrated) {
+				if ((r->migrated > r->ready_to_run) && (r->migrated < tprev))
+					r->dt_pre_mig = r->migrated - r->ready_to_run;
+			}
 		}
 
 		if (r->last_time > tprev)
@@ -2270,6 +2281,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
 	r->total_sleep_time   += r->dt_sleep;
 	r->total_iowait_time  += r->dt_iowait;
 	r->total_preempt_time += r->dt_preempt;
+	r->total_pre_mig_time += r->dt_pre_mig;
 }
 
 static bool is_idle_sample(struct perf_sample *sample,
@@ -2684,8 +2696,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool,
 
 	tr->migrations++;
 
+	if (tr->migrated == 0)
+		tr->migrated = sample->time;
+
 	/* show migrations if requested */
-	timehist_print_migration_event(sched, evsel, sample, machine, thread);
+	if (sched->show_migrations) {
+		timehist_print_migration_event(sched, evsel, sample,
+							machine, thread);
+	}
 
 	return 0;
 }
@@ -2836,11 +2854,13 @@ static int timehist_sched_change_event(const struct perf_tool *tool,
 		/* last state is used to determine where to account wait time */
 		tr->last_state = state;
 
-		/* sched out event for task so reset ready to run time */
+		/* sched out event for task so reset ready to run time and migrated time */
 		if (state == 'R')
 			tr->ready_to_run = t;
 		else
 			tr->ready_to_run = 0;
+
+		tr->migrated = 0;
 	}
 
 	evsel__save_time(evsel, sample->time, sample->cpu);
@@ -3280,8 +3300,8 @@ static int perf_sched__timehist(struct perf_sched *sched)
 		goto out;
 	}
 
-	if (sched->show_migrations &&
-	    perf_session__set_tracepoints_handlers(session, migrate_handlers))
+	if ((sched->show_migrations || sched->pre_migrations) &&
+		perf_session__set_tracepoints_handlers(session, migrate_handlers))
 		goto out;
 
 	/* pre-allocate struct for per-CPU idle stats */
@@ -3823,6 +3843,7 @@ int cmd_sched(int argc, const char **argv)
 	OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
 	OPT_STRING(0, "prio", &sched.prio_str, "prio",
 		   "analyze events only for given task priority(ies)"),
+	OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
 	OPT_PARENT(sched_options)
 	};
 
-- 
2.43.2
Re: [PATCH v3] perf sched timehist: Add pre-migration wait time option
Posted by Tim Chen 1 month, 3 weeks ago
On Wed, 2024-10-02 at 22:09 +0530, Madadi Vineeth Reddy wrote:
> pre-migration wait time is the time that a task unnecessarily spends
> on the runqueue of a CPU but doesn't get switched-in there. In terms
> of tracepoints, it is the time between sched:sched_wakeup and
> sched:sched_migrate_task.
> 
> Let's say a task woke up on CPU2, then it got migrated to CPU4 and
> then it's switched-in to CPU4. So, here pre-migration wait time is
> time that it was waiting on runqueue of CPU2 after it is woken up.
> 
> The general pattern for pre-migration to occur is:
> sched:sched_wakeup
> sched:sched_migrate_task
> sched:sched_switch

If a task migrate from CPU A to CPU B, but is unlucky that
someone took CPU B and it has to wait. Then it is yet again migrated to CPU C.
Do you only compute pre-mig time as 
t_sched_migrate_task_CPU_A - t_sched_migrate_task_CPU_A?

The task also spend some pre-mig time on CPU_B that probably
should be included.  And that time is when it migrates to B
till it is migrated away from B.  Do you take that into
account?

> 
> The sched:sched_waking event is used to capture the wakeup time,
> as it aligns with the existing code and only introduces a negligible
> time difference.
> 
> pre-migrations are generally not useful and it increases migrations.

Not sure long pre-migration itself leads to migrations.  But rather migrations
is a function of the aggressiveness of the idle load balancer.

> This metric would be helpful in testing patches mainly related to wakeup
> and load-balancer code paths 
> 

I think it is most relevant for the idle load balancer code paths, where
newly idle CPU pull task from other CPUs to run.

Are there cases where you have used this metric to tune the scheduler?

Thanks.

Tim 

> as better wakeup logic would choose an
> optimal CPU where task would be switched-in and thereby reducing pre-
> migrations.
> 
> The sample output(s) when -P or --pre-migrations is used:
> =================
>            time    cpu  task name                       wait time  sch delay   run time  pre-mig time
>                         [tid/pid]                          (msec)     (msec)     (msec)     (msec)
> --------------- ------  ------------------------------  ---------  ---------  ---------  ---------
>    38456.720806 [0001]  schbench[28634/28574]               4.917      4.768      1.004      0.000
>    38456.720810 [0001]  rcu_preempt[18]                     3.919      0.003      0.004      0.000
>    38456.721800 [0006]  schbench[28779/28574]              23.465     23.465      1.999      0.000
>    38456.722800 [0002]  schbench[28773/28574]              60.371     60.237      3.955     60.197
>    38456.722806 [0001]  schbench[28634/28574]               0.004      0.004      1.996      0.000
>    38456.722811 [0001]  rcu_preempt[18]                     1.996      0.005      0.005      0.000
>    38456.723800 [0000]  schbench[28833/28574]               4.000      4.000      3.999      0.000
>    38456.723800 [0004]  schbench[28762/28574]              42.951     42.839      3.999     39.867
>    38456.723802 [0007]  schbench[28812/28574]              43.947     43.817      3.999     40.866
>    38456.723804 [0001]  schbench[28587/28574]               7.935      7.822      0.993      0.000
> 
> Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
> 
> ---
> Changes in v3:
> - Use the sched:sched_waking event to calculate the wakeup time. (Namhyung Kim)
> - Rebase against perf-tools-next commit 80f192724e31 ("perf tests: Add more
>   topdown events regroup tests")
> 
> Changes in v2:
> - Use timehist_sched_wakeup_event() to get the sched_wakeup time. (Namhyung Kim)
> - Rebase against perf-tools-next commit b38c49d8296b ("perf/test: Speed up test
>   case perf annotate basic tests")
> 
> Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
> ---
>  tools/perf/Documentation/perf-sched.txt |  8 +++
>  tools/perf/builtin-sched.c              | 83 ++++++++++++++++---------
>  2 files changed, 60 insertions(+), 31 deletions(-)
> 
> diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
> index 3db64954a267..6dbbddb6464d 100644
> --- a/tools/perf/Documentation/perf-sched.txt
> +++ b/tools/perf/Documentation/perf-sched.txt
> @@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist'
>  	priorities are specified with -: 120-129. A combination of both can also be
>  	provided: 0,120-129.
>  
> +-P::
> +--pre-migrations::
> +	Show pre-migration wait time. pre-migration wait time is the time spent
> +	by a task waiting on a runqueue but not getting the chance to run there
> +	and is migrated to a different runqueue where it is finally run. This
> +	time between sched_wakeup and migrate_task is the pre-migration wait
> +	time.
> +
>  OPTIONS for 'perf sched replay'
>  ------------------------------
>  
> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
> index fdf979aaf275..e6540803ced6 100644
> --- a/tools/perf/builtin-sched.c
> +++ b/tools/perf/builtin-sched.c
> @@ -225,6 +225,7 @@ struct perf_sched {
>  	bool		show_wakeups;
>  	bool		show_next;
>  	bool		show_migrations;
> +	bool		pre_migrations;
>  	bool		show_state;
>  	bool		show_prio;
>  	u64		skipped_samples;
> @@ -244,7 +245,9 @@ struct thread_runtime {
>  	u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
>  	u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
>  	u64 dt_delay;       /* time between wakeup and sched-in */
> +	u64 dt_pre_mig;     /* time between migration and wakeup */
>  	u64 ready_to_run;   /* time of wakeup */
> +	u64 migrated;	    /* time when a thread is migrated */
>  
>  	struct stats run_stats;
>  	u64 total_run_time;
> @@ -252,6 +255,7 @@ struct thread_runtime {
>  	u64 total_iowait_time;
>  	u64 total_preempt_time;
>  	u64 total_delay_time;
> +	u64 total_pre_mig_time;
>  
>  	char last_state;
>  
> @@ -2073,14 +2077,15 @@ static void timehist_header(struct perf_sched *sched)
>  		printf(" ");
>  	}
>  
> -	if (sched->show_prio) {
> -		printf(" %-*s  %-*s  %9s  %9s  %9s",
> -		       comm_width, "task name", MAX_PRIO_STR_LEN, "prio",
> -		       "wait time", "sch delay", "run time");
> -	} else {
> -		printf(" %-*s  %9s  %9s  %9s", comm_width,
> -		       "task name", "wait time", "sch delay", "run time");
> -	}
> +	printf(" %-*s", comm_width, "task name");
> +
> +	if (sched->show_prio)
> +		printf("  %-*s", MAX_PRIO_STR_LEN, "prio");
> +
> +	printf("  %9s  %9s  %9s", "wait time", "sch delay", "run time");
> +
> +	if (sched->pre_migrations)
> +		printf("  %9s", "pre-mig time");
>  
>  	if (sched->show_state)
>  		printf("  %s", "state");
> @@ -2095,17 +2100,15 @@ static void timehist_header(struct perf_sched *sched)
>  	if (sched->show_cpu_visual)
>  		printf(" %*s ", ncpus, "");
>  
> -	if (sched->show_prio) {
> -		printf(" %-*s  %-*s  %9s  %9s  %9s",
> -		       comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "",
> -		       "(msec)", "(msec)", "(msec)");
> -	} else {
> -		printf(" %-*s  %9s  %9s  %9s", comm_width,
> -		       "[tid/pid]", "(msec)", "(msec)", "(msec)");
> -	}
> +	printf(" %-*s", comm_width, "[tid/pid]");
>  
> -	if (sched->show_state)
> -		printf("  %5s", "");
> +	if (sched->show_prio)
> +		printf("  %-*s", MAX_PRIO_STR_LEN, "");
> +
> +	printf("  %9s  %9s  %9s", "(msec)", "(msec)", "(msec)");
> +
> +	if (sched->pre_migrations)
> +		printf("  %9s", "(msec)");
>  
>  	printf("\n");
>  
> @@ -2117,15 +2120,15 @@ static void timehist_header(struct perf_sched *sched)
>  	if (sched->show_cpu_visual)
>  		printf(" %.*s ", ncpus, graph_dotted_line);
>  
> -	if (sched->show_prio) {
> -		printf(" %.*s  %.*s  %.9s  %.9s  %.9s",
> -		       comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line,
> -		       graph_dotted_line, graph_dotted_line, graph_dotted_line);
> -	} else {
> -		printf(" %.*s  %.9s  %.9s  %.9s", comm_width,
> -		       graph_dotted_line, graph_dotted_line, graph_dotted_line,
> -		       graph_dotted_line);
> -	}
> +	printf(" %.*s", comm_width, graph_dotted_line);
> +
> +	if (sched->show_prio)
> +		printf("  %.*s", MAX_PRIO_STR_LEN, graph_dotted_line);
> +
> +	printf("  %.9s  %.9s  %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line);
> +
> +	if (sched->pre_migrations)
> +		printf("  %.9s", graph_dotted_line);
>  
>  	if (sched->show_state)
>  		printf("  %.5s", graph_dotted_line);
> @@ -2180,6 +2183,8 @@ static void timehist_print_sample(struct perf_sched *sched,
>  
>  	print_sched_time(tr->dt_delay, 6);
>  	print_sched_time(tr->dt_run, 6);
> +	if (sched->pre_migrations)
> +		print_sched_time(tr->dt_pre_mig, 6);
>  
>  	if (sched->show_state)
>  		printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
> @@ -2239,6 +2244,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>  	r->dt_iowait  = 0;
>  	r->dt_preempt = 0;
>  	r->dt_run     = 0;
> +	r->dt_pre_mig = 0;
>  
>  	if (tprev) {
>  		r->dt_run = t - tprev;
> @@ -2247,6 +2253,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>  				pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
>  			else
>  				r->dt_delay = tprev - r->ready_to_run;
> +
> +			if (r->ready_to_run && r->migrated) {
> +				if ((r->migrated > r->ready_to_run) && (r->migrated < tprev))
> +					r->dt_pre_mig = r->migrated - r->ready_to_run;
> +			}
>  		}
>  
>  		if (r->last_time > tprev)
> @@ -2270,6 +2281,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>  	r->total_sleep_time   += r->dt_sleep;
>  	r->total_iowait_time  += r->dt_iowait;
>  	r->total_preempt_time += r->dt_preempt;
> +	r->total_pre_mig_time += r->dt_pre_mig;
>  }
>  
>  static bool is_idle_sample(struct perf_sample *sample,
> @@ -2684,8 +2696,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool,
>  
>  	tr->migrations++;
>  
> +	if (tr->migrated == 0)
> +		tr->migrated = sample->time;
> +
>  	/* show migrations if requested */
> -	timehist_print_migration_event(sched, evsel, sample, machine, thread);
> +	if (sched->show_migrations) {
> +		timehist_print_migration_event(sched, evsel, sample,
> +							machine, thread);
> +	}
>  
>  	return 0;
>  }
> @@ -2836,11 +2854,13 @@ static int timehist_sched_change_event(const struct perf_tool *tool,
>  		/* last state is used to determine where to account wait time */
>  		tr->last_state = state;
>  
> -		/* sched out event for task so reset ready to run time */
> +		/* sched out event for task so reset ready to run time and migrated time */
>  		if (state == 'R')
>  			tr->ready_to_run = t;
>  		else
>  			tr->ready_to_run = 0;
> +
> +		tr->migrated = 0;
>  	}
>  
>  	evsel__save_time(evsel, sample->time, sample->cpu);
> @@ -3280,8 +3300,8 @@ static int perf_sched__timehist(struct perf_sched *sched)
>  		goto out;
>  	}
>  
> -	if (sched->show_migrations &&
> -	    perf_session__set_tracepoints_handlers(session, migrate_handlers))
> +	if ((sched->show_migrations || sched->pre_migrations) &&
> +		perf_session__set_tracepoints_handlers(session, migrate_handlers))
>  		goto out;
>  
>  	/* pre-allocate struct for per-CPU idle stats */
> @@ -3823,6 +3843,7 @@ int cmd_sched(int argc, const char **argv)
>  	OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
>  	OPT_STRING(0, "prio", &sched.prio_str, "prio",
>  		   "analyze events only for given task priority(ies)"),
> +	OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
>  	OPT_PARENT(sched_options)
>  	};
>  
Re: [PATCH v3] perf sched timehist: Add pre-migration wait time option
Posted by Madadi Vineeth Reddy 1 month, 3 weeks ago
Hi Tim Chen,

On 04/10/24 03:34, Tim Chen wrote:
> On Wed, 2024-10-02 at 22:09 +0530, Madadi Vineeth Reddy wrote:
>> pre-migration wait time is the time that a task unnecessarily spends
>> on the runqueue of a CPU but doesn't get switched-in there. In terms
>> of tracepoints, it is the time between sched:sched_wakeup and
>> sched:sched_migrate_task.
>>
>> Let's say a task woke up on CPU2, then it got migrated to CPU4 and
>> then it's switched-in to CPU4. So, here pre-migration wait time is
>> time that it was waiting on runqueue of CPU2 after it is woken up.
>>
>> The general pattern for pre-migration to occur is:
>> sched:sched_wakeup
>> sched:sched_migrate_task
>> sched:sched_switch
> 
> If a task migrate from CPU A to CPU B, but is unlucky that
> someone took CPU B and it has to wait. Then it is yet again migrated to CPU C.
> Do you only compute pre-mig time as 
> t_sched_migrate_task_CPU_A - t_sched_migrate_task_CPU_A?
> 
> The task also spend some pre-mig time on CPU_B that probably
> should be included.  And that time is when it migrates to B
> till it is migrated away from B.  Do you take that into
> account?
> 

You are right. Currently the time spent on CPU B is not accounted. I will
consider the latest migrate time and that should fix it.

>>
>> The sched:sched_waking event is used to capture the wakeup time,
>> as it aligns with the existing code and only introduces a negligible
>> time difference.
>>
>> pre-migrations are generally not useful and it increases migrations.
> 
> Not sure long pre-migration itself leads to migrations.  But rather migrations
> is a function of the aggressiveness of the idle load balancer.

You're right. pre-migration wait time itself doesn't directly cause migrations,
but it is a useful metric for understanding the overall impact on task scheduling,
especially in the context of the idle load balancer as you mentioned. I agree
that this is most relevant for scenarios where newly idle CPUs pull tasks from
other CPUs.

By capturing the pre-migration wait time, the intention is to evaluate how
efficiently tasks are balanced and whether there are opportunities for improvement
in the wakeup and load-balancing paths.

Thank you for the feedback.

Thanks,
Madadi Vineeth Reddy

> 
>> This metric would be helpful in testing patches mainly related to wakeup
>> and load-balancer code paths 
>>
> 
> I think it is most relevant for the idle load balancer code paths, where
> newly idle CPU pull task from other CPUs to run.
> 
> Are there cases where you have used this metric to tune the scheduler?
> 
> Thanks.
> 
> Tim 
> 
>> as better wakeup logic would choose an
>> optimal CPU where task would be switched-in and thereby reducing pre-
>> migrations.
>>
>> The sample output(s) when -P or --pre-migrations is used:
>> =================
>>            time    cpu  task name                       wait time  sch delay   run time  pre-mig time
>>                         [tid/pid]                          (msec)     (msec)     (msec)     (msec)
>> --------------- ------  ------------------------------  ---------  ---------  ---------  ---------
>>    38456.720806 [0001]  schbench[28634/28574]               4.917      4.768      1.004      0.000
>>    38456.720810 [0001]  rcu_preempt[18]                     3.919      0.003      0.004      0.000
>>    38456.721800 [0006]  schbench[28779/28574]              23.465     23.465      1.999      0.000
>>    38456.722800 [0002]  schbench[28773/28574]              60.371     60.237      3.955     60.197
>>    38456.722806 [0001]  schbench[28634/28574]               0.004      0.004      1.996      0.000
>>    38456.722811 [0001]  rcu_preempt[18]                     1.996      0.005      0.005      0.000
>>    38456.723800 [0000]  schbench[28833/28574]               4.000      4.000      3.999      0.000
>>    38456.723800 [0004]  schbench[28762/28574]              42.951     42.839      3.999     39.867
>>    38456.723802 [0007]  schbench[28812/28574]              43.947     43.817      3.999     40.866
>>    38456.723804 [0001]  schbench[28587/28574]               7.935      7.822      0.993      0.000
>>
>> Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
>>
>> ---
>> Changes in v3:
>> - Use the sched:sched_waking event to calculate the wakeup time. (Namhyung Kim)
>> - Rebase against perf-tools-next commit 80f192724e31 ("perf tests: Add more
>>   topdown events regroup tests")
>>
>> Changes in v2:
>> - Use timehist_sched_wakeup_event() to get the sched_wakeup time. (Namhyung Kim)
>> - Rebase against perf-tools-next commit b38c49d8296b ("perf/test: Speed up test
>>   case perf annotate basic tests")
>>
>> Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
>> ---
>>  tools/perf/Documentation/perf-sched.txt |  8 +++
>>  tools/perf/builtin-sched.c              | 83 ++++++++++++++++---------
>>  2 files changed, 60 insertions(+), 31 deletions(-)
>>
>> diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
>> index 3db64954a267..6dbbddb6464d 100644
>> --- a/tools/perf/Documentation/perf-sched.txt
>> +++ b/tools/perf/Documentation/perf-sched.txt
>> @@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist'
>>  	priorities are specified with -: 120-129. A combination of both can also be
>>  	provided: 0,120-129.
>>  
>> +-P::
>> +--pre-migrations::
>> +	Show pre-migration wait time. pre-migration wait time is the time spent
>> +	by a task waiting on a runqueue but not getting the chance to run there
>> +	and is migrated to a different runqueue where it is finally run. This
>> +	time between sched_wakeup and migrate_task is the pre-migration wait
>> +	time.
>> +
>>  OPTIONS for 'perf sched replay'
>>  ------------------------------
>>  
>> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
>> index fdf979aaf275..e6540803ced6 100644
>> --- a/tools/perf/builtin-sched.c
>> +++ b/tools/perf/builtin-sched.c
>> @@ -225,6 +225,7 @@ struct perf_sched {
>>  	bool		show_wakeups;
>>  	bool		show_next;
>>  	bool		show_migrations;
>> +	bool		pre_migrations;
>>  	bool		show_state;
>>  	bool		show_prio;
>>  	u64		skipped_samples;
>> @@ -244,7 +245,9 @@ struct thread_runtime {
>>  	u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
>>  	u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
>>  	u64 dt_delay;       /* time between wakeup and sched-in */
>> +	u64 dt_pre_mig;     /* time between migration and wakeup */
>>  	u64 ready_to_run;   /* time of wakeup */
>> +	u64 migrated;	    /* time when a thread is migrated */
>>  
>>  	struct stats run_stats;
>>  	u64 total_run_time;
>> @@ -252,6 +255,7 @@ struct thread_runtime {
>>  	u64 total_iowait_time;
>>  	u64 total_preempt_time;
>>  	u64 total_delay_time;
>> +	u64 total_pre_mig_time;
>>  
>>  	char last_state;
>>  
>> @@ -2073,14 +2077,15 @@ static void timehist_header(struct perf_sched *sched)
>>  		printf(" ");
>>  	}
>>  
>> -	if (sched->show_prio) {
>> -		printf(" %-*s  %-*s  %9s  %9s  %9s",
>> -		       comm_width, "task name", MAX_PRIO_STR_LEN, "prio",
>> -		       "wait time", "sch delay", "run time");
>> -	} else {
>> -		printf(" %-*s  %9s  %9s  %9s", comm_width,
>> -		       "task name", "wait time", "sch delay", "run time");
>> -	}
>> +	printf(" %-*s", comm_width, "task name");
>> +
>> +	if (sched->show_prio)
>> +		printf("  %-*s", MAX_PRIO_STR_LEN, "prio");
>> +
>> +	printf("  %9s  %9s  %9s", "wait time", "sch delay", "run time");
>> +
>> +	if (sched->pre_migrations)
>> +		printf("  %9s", "pre-mig time");
>>  
>>  	if (sched->show_state)
>>  		printf("  %s", "state");
>> @@ -2095,17 +2100,15 @@ static void timehist_header(struct perf_sched *sched)
>>  	if (sched->show_cpu_visual)
>>  		printf(" %*s ", ncpus, "");
>>  
>> -	if (sched->show_prio) {
>> -		printf(" %-*s  %-*s  %9s  %9s  %9s",
>> -		       comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "",
>> -		       "(msec)", "(msec)", "(msec)");
>> -	} else {
>> -		printf(" %-*s  %9s  %9s  %9s", comm_width,
>> -		       "[tid/pid]", "(msec)", "(msec)", "(msec)");
>> -	}
>> +	printf(" %-*s", comm_width, "[tid/pid]");
>>  
>> -	if (sched->show_state)
>> -		printf("  %5s", "");
>> +	if (sched->show_prio)
>> +		printf("  %-*s", MAX_PRIO_STR_LEN, "");
>> +
>> +	printf("  %9s  %9s  %9s", "(msec)", "(msec)", "(msec)");
>> +
>> +	if (sched->pre_migrations)
>> +		printf("  %9s", "(msec)");
>>  
>>  	printf("\n");
>>  
>> @@ -2117,15 +2120,15 @@ static void timehist_header(struct perf_sched *sched)
>>  	if (sched->show_cpu_visual)
>>  		printf(" %.*s ", ncpus, graph_dotted_line);
>>  
>> -	if (sched->show_prio) {
>> -		printf(" %.*s  %.*s  %.9s  %.9s  %.9s",
>> -		       comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line,
>> -		       graph_dotted_line, graph_dotted_line, graph_dotted_line);
>> -	} else {
>> -		printf(" %.*s  %.9s  %.9s  %.9s", comm_width,
>> -		       graph_dotted_line, graph_dotted_line, graph_dotted_line,
>> -		       graph_dotted_line);
>> -	}
>> +	printf(" %.*s", comm_width, graph_dotted_line);
>> +
>> +	if (sched->show_prio)
>> +		printf("  %.*s", MAX_PRIO_STR_LEN, graph_dotted_line);
>> +
>> +	printf("  %.9s  %.9s  %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line);
>> +
>> +	if (sched->pre_migrations)
>> +		printf("  %.9s", graph_dotted_line);
>>  
>>  	if (sched->show_state)
>>  		printf("  %.5s", graph_dotted_line);
>> @@ -2180,6 +2183,8 @@ static void timehist_print_sample(struct perf_sched *sched,
>>  
>>  	print_sched_time(tr->dt_delay, 6);
>>  	print_sched_time(tr->dt_run, 6);
>> +	if (sched->pre_migrations)
>> +		print_sched_time(tr->dt_pre_mig, 6);
>>  
>>  	if (sched->show_state)
>>  		printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
>> @@ -2239,6 +2244,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>>  	r->dt_iowait  = 0;
>>  	r->dt_preempt = 0;
>>  	r->dt_run     = 0;
>> +	r->dt_pre_mig = 0;
>>  
>>  	if (tprev) {
>>  		r->dt_run = t - tprev;
>> @@ -2247,6 +2253,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>>  				pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
>>  			else
>>  				r->dt_delay = tprev - r->ready_to_run;
>> +
>> +			if (r->ready_to_run && r->migrated) {
>> +				if ((r->migrated > r->ready_to_run) && (r->migrated < tprev))
>> +					r->dt_pre_mig = r->migrated - r->ready_to_run;
>> +			}
>>  		}
>>  
>>  		if (r->last_time > tprev)
>> @@ -2270,6 +2281,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>>  	r->total_sleep_time   += r->dt_sleep;
>>  	r->total_iowait_time  += r->dt_iowait;
>>  	r->total_preempt_time += r->dt_preempt;
>> +	r->total_pre_mig_time += r->dt_pre_mig;
>>  }
>>  
>>  static bool is_idle_sample(struct perf_sample *sample,
>> @@ -2684,8 +2696,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool,
>>  
>>  	tr->migrations++;
>>  
>> +	if (tr->migrated == 0)
>> +		tr->migrated = sample->time;
>> +
>>  	/* show migrations if requested */
>> -	timehist_print_migration_event(sched, evsel, sample, machine, thread);
>> +	if (sched->show_migrations) {
>> +		timehist_print_migration_event(sched, evsel, sample,
>> +							machine, thread);
>> +	}
>>  
>>  	return 0;
>>  }
>> @@ -2836,11 +2854,13 @@ static int timehist_sched_change_event(const struct perf_tool *tool,
>>  		/* last state is used to determine where to account wait time */
>>  		tr->last_state = state;
>>  
>> -		/* sched out event for task so reset ready to run time */
>> +		/* sched out event for task so reset ready to run time and migrated time */
>>  		if (state == 'R')
>>  			tr->ready_to_run = t;
>>  		else
>>  			tr->ready_to_run = 0;
>> +
>> +		tr->migrated = 0;
>>  	}
>>  
>>  	evsel__save_time(evsel, sample->time, sample->cpu);
>> @@ -3280,8 +3300,8 @@ static int perf_sched__timehist(struct perf_sched *sched)
>>  		goto out;
>>  	}
>>  
>> -	if (sched->show_migrations &&
>> -	    perf_session__set_tracepoints_handlers(session, migrate_handlers))
>> +	if ((sched->show_migrations || sched->pre_migrations) &&
>> +		perf_session__set_tracepoints_handlers(session, migrate_handlers))
>>  		goto out;
>>  
>>  	/* pre-allocate struct for per-CPU idle stats */
>> @@ -3823,6 +3843,7 @@ int cmd_sched(int argc, const char **argv)
>>  	OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
>>  	OPT_STRING(0, "prio", &sched.prio_str, "prio",
>>  		   "analyze events only for given task priority(ies)"),
>> +	OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
>>  	OPT_PARENT(sched_options)
>>  	};
>>  
> 

Re: [PATCH v3] perf sched timehist: Add pre-migration wait time option
Posted by Namhyung Kim 1 month, 3 weeks ago
On Thu, Oct 03, 2024 at 03:04:28PM -0700, Tim Chen wrote:
> On Wed, 2024-10-02 at 22:09 +0530, Madadi Vineeth Reddy wrote:
> > pre-migration wait time is the time that a task unnecessarily spends
> > on the runqueue of a CPU but doesn't get switched-in there. In terms
> > of tracepoints, it is the time between sched:sched_wakeup and
> > sched:sched_migrate_task.
> > 
> > Let's say a task woke up on CPU2, then it got migrated to CPU4 and
> > then it's switched-in to CPU4. So, here pre-migration wait time is
> > time that it was waiting on runqueue of CPU2 after it is woken up.
> > 
> > The general pattern for pre-migration to occur is:
> > sched:sched_wakeup
> > sched:sched_migrate_task
> > sched:sched_switch
> 
> If a task migrate from CPU A to CPU B, but is unlucky that
> someone took CPU B and it has to wait. Then it is yet again migrated to CPU C.
> Do you only compute pre-mig time as 
> t_sched_migrate_task_CPU_A - t_sched_migrate_task_CPU_A?
> 
> The task also spend some pre-mig time on CPU_B that probably
> should be included.  And that time is when it migrates to B
> till it is migrated away from B.  Do you take that into
> account?

Hmm.. right.  The current code updates the migrate time only if it's 0.
But I think it can just always update the time to get the latest one.

Thanks,
Namhyung
Re: [PATCH v3] perf sched timehist: Add pre-migration wait time option
Posted by Madadi Vineeth Reddy 1 month, 3 weeks ago
On 04/10/24 03:49, Namhyung Kim wrote:
> On Thu, Oct 03, 2024 at 03:04:28PM -0700, Tim Chen wrote:
>> On Wed, 2024-10-02 at 22:09 +0530, Madadi Vineeth Reddy wrote:
>>> pre-migration wait time is the time that a task unnecessarily spends
>>> on the runqueue of a CPU but doesn't get switched-in there. In terms
>>> of tracepoints, it is the time between sched:sched_wakeup and
>>> sched:sched_migrate_task.
>>>
>>> Let's say a task woke up on CPU2, then it got migrated to CPU4 and
>>> then it's switched-in to CPU4. So, here pre-migration wait time is
>>> time that it was waiting on runqueue of CPU2 after it is woken up.
>>>
>>> The general pattern for pre-migration to occur is:
>>> sched:sched_wakeup
>>> sched:sched_migrate_task
>>> sched:sched_switch
>>
>> If a task migrate from CPU A to CPU B, but is unlucky that
>> someone took CPU B and it has to wait. Then it is yet again migrated to CPU C.
>> Do you only compute pre-mig time as 
>> t_sched_migrate_task_CPU_A - t_sched_migrate_task_CPU_A?
>>
>> The task also spend some pre-mig time on CPU_B that probably
>> should be included.  And that time is when it migrates to B
>> till it is migrated away from B.  Do you take that into
>> account?
> 
> Hmm.. right.  The current code updates the migrate time only if it's 0.
> But I think it can just always update the time to get the latest one.
> 

Right. I’ll update the code to always pick the latest migrate time, ensuring
that scenarios with multiple migrations in between are handled correctly.

Thanks,
Madadi Vineeth Reddy

> Thanks,
> Namhyung

Re: [PATCH v3] perf sched timehist: Add pre-migration wait time option
Posted by Namhyung Kim 1 month, 3 weeks ago
On Wed, Oct 02, 2024 at 10:09:17PM +0530, Madadi Vineeth Reddy wrote:
> pre-migration wait time is the time that a task unnecessarily spends
> on the runqueue of a CPU but doesn't get switched-in there. In terms
> of tracepoints, it is the time between sched:sched_wakeup and
> sched:sched_migrate_task.
> 
> Let's say a task woke up on CPU2, then it got migrated to CPU4 and
> then it's switched-in to CPU4. So, here pre-migration wait time is
> time that it was waiting on runqueue of CPU2 after it is woken up.
> 
> The general pattern for pre-migration to occur is:
> sched:sched_wakeup
> sched:sched_migrate_task
> sched:sched_switch
> 
> The sched:sched_waking event is used to capture the wakeup time,
> as it aligns with the existing code and only introduces a negligible
> time difference.
> 
> pre-migrations are generally not useful and it increases migrations.
> This metric would be helpful in testing patches mainly related to wakeup
> and load-balancer code paths as better wakeup logic would choose an
> optimal CPU where task would be switched-in and thereby reducing pre-
> migrations.
> 
> The sample output(s) when -P or --pre-migrations is used:
> =================
>            time    cpu  task name                       wait time  sch delay   run time  pre-mig time
>                         [tid/pid]                          (msec)     (msec)     (msec)     (msec)
> --------------- ------  ------------------------------  ---------  ---------  ---------  ---------
>    38456.720806 [0001]  schbench[28634/28574]               4.917      4.768      1.004      0.000
>    38456.720810 [0001]  rcu_preempt[18]                     3.919      0.003      0.004      0.000
>    38456.721800 [0006]  schbench[28779/28574]              23.465     23.465      1.999      0.000
>    38456.722800 [0002]  schbench[28773/28574]              60.371     60.237      3.955     60.197
>    38456.722806 [0001]  schbench[28634/28574]               0.004      0.004      1.996      0.000
>    38456.722811 [0001]  rcu_preempt[18]                     1.996      0.005      0.005      0.000
>    38456.723800 [0000]  schbench[28833/28574]               4.000      4.000      3.999      0.000
>    38456.723800 [0004]  schbench[28762/28574]              42.951     42.839      3.999     39.867
>    38456.723802 [0007]  schbench[28812/28574]              43.947     43.817      3.999     40.866
>    38456.723804 [0001]  schbench[28587/28574]               7.935      7.822      0.993      0.000
> 
> Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
> 
> ---
> Changes in v3:
> - Use the sched:sched_waking event to calculate the wakeup time. (Namhyung Kim)
> - Rebase against perf-tools-next commit 80f192724e31 ("perf tests: Add more
>   topdown events regroup tests")
> 
> Changes in v2:
> - Use timehist_sched_wakeup_event() to get the sched_wakeup time. (Namhyung Kim)
> - Rebase against perf-tools-next commit b38c49d8296b ("perf/test: Speed up test
>   case perf annotate basic tests")
> 
> Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
> ---
>  tools/perf/Documentation/perf-sched.txt |  8 +++
>  tools/perf/builtin-sched.c              | 83 ++++++++++++++++---------
>  2 files changed, 60 insertions(+), 31 deletions(-)
> 
> diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
> index 3db64954a267..6dbbddb6464d 100644
> --- a/tools/perf/Documentation/perf-sched.txt
> +++ b/tools/perf/Documentation/perf-sched.txt
> @@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist'
>  	priorities are specified with -: 120-129. A combination of both can also be
>  	provided: 0,120-129.
>  
> +-P::
> +--pre-migrations::
> +	Show pre-migration wait time. pre-migration wait time is the time spent
> +	by a task waiting on a runqueue but not getting the chance to run there
> +	and is migrated to a different runqueue where it is finally run. This
> +	time between sched_wakeup and migrate_task is the pre-migration wait
> +	time.
> +
>  OPTIONS for 'perf sched replay'
>  ------------------------------
>  
> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
> index fdf979aaf275..e6540803ced6 100644
> --- a/tools/perf/builtin-sched.c
> +++ b/tools/perf/builtin-sched.c
> @@ -225,6 +225,7 @@ struct perf_sched {
>  	bool		show_wakeups;
>  	bool		show_next;
>  	bool		show_migrations;
> +	bool		pre_migrations;
>  	bool		show_state;
>  	bool		show_prio;
>  	u64		skipped_samples;
> @@ -244,7 +245,9 @@ struct thread_runtime {
>  	u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
>  	u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
>  	u64 dt_delay;       /* time between wakeup and sched-in */
> +	u64 dt_pre_mig;     /* time between migration and wakeup */
>  	u64 ready_to_run;   /* time of wakeup */
> +	u64 migrated;	    /* time when a thread is migrated */
>  
>  	struct stats run_stats;
>  	u64 total_run_time;
> @@ -252,6 +255,7 @@ struct thread_runtime {
>  	u64 total_iowait_time;
>  	u64 total_preempt_time;
>  	u64 total_delay_time;
> +	u64 total_pre_mig_time;
>  
>  	char last_state;
>  
> @@ -2073,14 +2077,15 @@ static void timehist_header(struct perf_sched *sched)
>  		printf(" ");
>  	}
>  
> -	if (sched->show_prio) {
> -		printf(" %-*s  %-*s  %9s  %9s  %9s",
> -		       comm_width, "task name", MAX_PRIO_STR_LEN, "prio",
> -		       "wait time", "sch delay", "run time");
> -	} else {
> -		printf(" %-*s  %9s  %9s  %9s", comm_width,
> -		       "task name", "wait time", "sch delay", "run time");
> -	}
> +	printf(" %-*s", comm_width, "task name");
> +
> +	if (sched->show_prio)
> +		printf("  %-*s", MAX_PRIO_STR_LEN, "prio");
> +
> +	printf("  %9s  %9s  %9s", "wait time", "sch delay", "run time");
> +
> +	if (sched->pre_migrations)
> +		printf("  %9s", "pre-mig time");
>  
>  	if (sched->show_state)
>  		printf("  %s", "state");
> @@ -2095,17 +2100,15 @@ static void timehist_header(struct perf_sched *sched)
>  	if (sched->show_cpu_visual)
>  		printf(" %*s ", ncpus, "");
>  
> -	if (sched->show_prio) {
> -		printf(" %-*s  %-*s  %9s  %9s  %9s",
> -		       comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "",
> -		       "(msec)", "(msec)", "(msec)");
> -	} else {
> -		printf(" %-*s  %9s  %9s  %9s", comm_width,
> -		       "[tid/pid]", "(msec)", "(msec)", "(msec)");
> -	}
> +	printf(" %-*s", comm_width, "[tid/pid]");
>  
> -	if (sched->show_state)
> -		printf("  %5s", "");
> +	if (sched->show_prio)
> +		printf("  %-*s", MAX_PRIO_STR_LEN, "");
> +
> +	printf("  %9s  %9s  %9s", "(msec)", "(msec)", "(msec)");
> +
> +	if (sched->pre_migrations)
> +		printf("  %9s", "(msec)");
>  
>  	printf("\n");
>  
> @@ -2117,15 +2120,15 @@ static void timehist_header(struct perf_sched *sched)
>  	if (sched->show_cpu_visual)
>  		printf(" %.*s ", ncpus, graph_dotted_line);
>  
> -	if (sched->show_prio) {
> -		printf(" %.*s  %.*s  %.9s  %.9s  %.9s",
> -		       comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line,
> -		       graph_dotted_line, graph_dotted_line, graph_dotted_line);
> -	} else {
> -		printf(" %.*s  %.9s  %.9s  %.9s", comm_width,
> -		       graph_dotted_line, graph_dotted_line, graph_dotted_line,
> -		       graph_dotted_line);
> -	}
> +	printf(" %.*s", comm_width, graph_dotted_line);
> +
> +	if (sched->show_prio)
> +		printf("  %.*s", MAX_PRIO_STR_LEN, graph_dotted_line);
> +
> +	printf("  %.9s  %.9s  %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line);
> +
> +	if (sched->pre_migrations)
> +		printf("  %.9s", graph_dotted_line);
>  
>  	if (sched->show_state)
>  		printf("  %.5s", graph_dotted_line);
> @@ -2180,6 +2183,8 @@ static void timehist_print_sample(struct perf_sched *sched,
>  
>  	print_sched_time(tr->dt_delay, 6);
>  	print_sched_time(tr->dt_run, 6);
> +	if (sched->pre_migrations)
> +		print_sched_time(tr->dt_pre_mig, 6);
>  
>  	if (sched->show_state)
>  		printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
> @@ -2239,6 +2244,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,

It'd be nice if you update the comment on timehist_update_runtime_stats.


>  	r->dt_iowait  = 0;
>  	r->dt_preempt = 0;
>  	r->dt_run     = 0;
> +	r->dt_pre_mig = 0;
>  
>  	if (tprev) {
>  		r->dt_run = t - tprev;
> @@ -2247,6 +2253,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>  				pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
>  			else
>  				r->dt_delay = tprev - r->ready_to_run;
> +
> +			if (r->ready_to_run && r->migrated) {

At this point r->read_to_run is not zero.  And the r->migrated should
not be zero when the below condition is met.  So I think you can remove
this condition.

Thanks,
Namhyung


> +				if ((r->migrated > r->ready_to_run) && (r->migrated < tprev))
> +					r->dt_pre_mig = r->migrated - r->ready_to_run;
> +			}
>  		}
>  
>  		if (r->last_time > tprev)
> @@ -2270,6 +2281,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>  	r->total_sleep_time   += r->dt_sleep;
>  	r->total_iowait_time  += r->dt_iowait;
>  	r->total_preempt_time += r->dt_preempt;
> +	r->total_pre_mig_time += r->dt_pre_mig;
>  }
>  
>  static bool is_idle_sample(struct perf_sample *sample,
> @@ -2684,8 +2696,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool,
>  
>  	tr->migrations++;
>  
> +	if (tr->migrated == 0)
> +		tr->migrated = sample->time;
> +
>  	/* show migrations if requested */
> -	timehist_print_migration_event(sched, evsel, sample, machine, thread);
> +	if (sched->show_migrations) {
> +		timehist_print_migration_event(sched, evsel, sample,
> +							machine, thread);
> +	}
>  
>  	return 0;
>  }
> @@ -2836,11 +2854,13 @@ static int timehist_sched_change_event(const struct perf_tool *tool,
>  		/* last state is used to determine where to account wait time */
>  		tr->last_state = state;
>  
> -		/* sched out event for task so reset ready to run time */
> +		/* sched out event for task so reset ready to run time and migrated time */
>  		if (state == 'R')
>  			tr->ready_to_run = t;
>  		else
>  			tr->ready_to_run = 0;
> +
> +		tr->migrated = 0;
>  	}
>  
>  	evsel__save_time(evsel, sample->time, sample->cpu);
> @@ -3280,8 +3300,8 @@ static int perf_sched__timehist(struct perf_sched *sched)
>  		goto out;
>  	}
>  
> -	if (sched->show_migrations &&
> -	    perf_session__set_tracepoints_handlers(session, migrate_handlers))
> +	if ((sched->show_migrations || sched->pre_migrations) &&
> +		perf_session__set_tracepoints_handlers(session, migrate_handlers))
>  		goto out;
>  
>  	/* pre-allocate struct for per-CPU idle stats */
> @@ -3823,6 +3843,7 @@ int cmd_sched(int argc, const char **argv)
>  	OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
>  	OPT_STRING(0, "prio", &sched.prio_str, "prio",
>  		   "analyze events only for given task priority(ies)"),
> +	OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
>  	OPT_PARENT(sched_options)
>  	};
>  
> -- 
> 2.43.2
>
Re: [PATCH v3] perf sched timehist: Add pre-migration wait time option
Posted by Madadi Vineeth Reddy 1 month, 3 weeks ago
Hi Namhyung,

On 03/10/24 06:05, Namhyung Kim wrote:
> On Wed, Oct 02, 2024 at 10:09:17PM +0530, Madadi Vineeth Reddy wrote:
>> pre-migration wait time is the time that a task unnecessarily spends
>> on the runqueue of a CPU but doesn't get switched-in there. In terms
>> of tracepoints, it is the time between sched:sched_wakeup and
>> sched:sched_migrate_task.
>>

[snip]

>> @@ -2239,6 +2244,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
> 
> It'd be nice if you update the comment on timehist_update_runtime_stats.
> 
> 
>>  	r->dt_iowait  = 0;
>>  	r->dt_preempt = 0;
>>  	r->dt_run     = 0;
>> +	r->dt_pre_mig = 0;
>>  
>>  	if (tprev) {
>>  		r->dt_run = t - tprev;
>> @@ -2247,6 +2253,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>>  				pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
>>  			else
>>  				r->dt_delay = tprev - r->ready_to_run;
>> +
>> +			if (r->ready_to_run && r->migrated) {
> 
> At this point r->read_to_run is not zero.  And the r->migrated should
> not be zero when the below condition is met.  So I think you can remove
> this condition.

You're right, my mistake. r->ready_to_run is already checked, and if
r->migrated is zero, it will naturally fail the condition below.

I've removed the redundant check and sent a v4. Thanks for the feedback!

Thanks,
Madadi Vineeth Reddy

> 
> Thanks,
> Namhyung
> 
> 
>> +				if ((r->migrated > r->ready_to_run) && (r->migrated < tprev))
>> +					r->dt_pre_mig = r->migrated - r->ready_to_run;
>> +			}
>>  		}
>>  
>>  		if (r->last_time > tprev)
>> @@ -2270,6 +2281,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>>  	r->total_sleep_time   += r->dt_sleep;
>>  	r->total_iowait_time  += r->dt_iowait;
>>  	r->total_preempt_time += r->dt_preempt;
>> +	r->total_pre_mig_time += r->dt_pre_mig;
>>  }
>>  
>>  static bool is_idle_sample(struct perf_sample *sample,
>> @@ -2684,8 +2696,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool,
>>  
>>  	tr->migrations++;
>>  
>> +	if (tr->migrated == 0)
>> +		tr->migrated = sample->time;
>> +
>>  	/* show migrations if requested */
>> -	timehist_print_migration_event(sched, evsel, sample, machine, thread);
>> +	if (sched->show_migrations) {
>> +		timehist_print_migration_event(sched, evsel, sample,
>> +							machine, thread);
>> +	}
>>  
>>  	return 0;
>>  }
>> @@ -2836,11 +2854,13 @@ static int timehist_sched_change_event(const struct perf_tool *tool,
>>  		/* last state is used to determine where to account wait time */
>>  		tr->last_state = state;
>>  
>> -		/* sched out event for task so reset ready to run time */
>> +		/* sched out event for task so reset ready to run time and migrated time */
>>  		if (state == 'R')
>>  			tr->ready_to_run = t;
>>  		else
>>  			tr->ready_to_run = 0;
>> +
>> +		tr->migrated = 0;
>>  	}
>>  
>>  	evsel__save_time(evsel, sample->time, sample->cpu);
>> @@ -3280,8 +3300,8 @@ static int perf_sched__timehist(struct perf_sched *sched)
>>  		goto out;
>>  	}
>>  
>> -	if (sched->show_migrations &&
>> -	    perf_session__set_tracepoints_handlers(session, migrate_handlers))
>> +	if ((sched->show_migrations || sched->pre_migrations) &&
>> +		perf_session__set_tracepoints_handlers(session, migrate_handlers))
>>  		goto out;
>>  
>>  	/* pre-allocate struct for per-CPU idle stats */
>> @@ -3823,6 +3843,7 @@ int cmd_sched(int argc, const char **argv)
>>  	OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
>>  	OPT_STRING(0, "prio", &sched.prio_str, "prio",
>>  		   "analyze events only for given task priority(ies)"),
>> +	OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
>>  	OPT_PARENT(sched_options)
>>  	};
>>  
>> -- 
>> 2.43.2
>>