[RFC PATCH v3 4/6] perf stat: Create another thread for sample data processing

weilin.wang@intel.com posted 6 patches 1 year, 11 months ago
There is a newer version of this series
[RFC PATCH v3 4/6] perf stat: Create another thread for sample data processing
Posted by weilin.wang@intel.com 1 year, 11 months ago
From: Weilin Wang <weilin.wang@intel.com>

Another thread is required to synchronize between perf stat and perf record
when we pass data through pipe.

Signed-off-by: Weilin Wang <weilin.wang@intel.com>
---
 tools/perf/builtin-stat.c | 58 +++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 3e5865572266..2372175d3408 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -777,7 +777,6 @@ static int process_sample_event(struct perf_tool *tool,
 	const char *evname;
 	struct tpebs_retire_lat *t;
 
-	pr_debug("entering function %s\n ", __func__);
 	evname = evsel__name(evsel);
 
 	pr_debug("[%03d] ", sample->cpu);
@@ -808,9 +807,9 @@ static int process_feature_event(struct perf_session *session,
 	return 0;
 }
 
-static int __cmd_script(struct child_process *cmd __maybe_unused)
+static void *__cmd_script(void *arg __maybe_unused)
 {
-	int err = 0;
+	struct child_process *cmd = arg;
 	struct perf_session *session;
 	struct perf_data data = {
 		.mode = PERF_DATA_MODE_READ,
@@ -826,29 +825,15 @@ static int __cmd_script(struct child_process *cmd __maybe_unused)
 		.attr		 = perf_event__process_attr,
 		},
 	};
-	struct tpebs_event *e;
-
-	list_for_each_entry(e, &stat_config.tpebs_events, nd) {
-		struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));
-
-		if (!new)
-			return -1;
-		new->name = strdup(e->name);
-		new->tpebs_name = strdup(e->tpebs_name);
-		new->count = 0;
-		new->sum = 0;
-		list_add_tail(&new->nd, &stat_config.tpebs_results);
-	}
 
-	kill(cmd->pid, SIGTERM);
 	session = perf_session__new(&data, &script.tool);
 	if (IS_ERR(session))
-		return PTR_ERR(session);
+		return NULL;
 	script.session = session;
-	err = perf_session__process_events(session);
+	perf_session__process_events(session);
 	perf_session__delete(session);
 
-	return err;
+	return NULL;
 }
 
 static int __run_perf_stat(int argc, const char **argv, int run_idx)
@@ -868,15 +853,37 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	int err;
 	bool second_pass = false;
 	struct child_process cmd;
+	pthread_t thread_script;
 
 	//Prepare perf record for sampling event retire_latency before fork and prepare workload
 	if (stat_config.tpebs_event_size > 0) {
 		int ret;
 
+		struct tpebs_event *e;
 		pr_debug("perf stat pid = %d\n", getpid());
 		ret = prepare_perf_record(&cmd);
 		if (ret)
 			return ret;
+
+		list_for_each_entry(e, &stat_config.tpebs_events, nd) {
+			struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));
+
+			if (!new)
+				return -1;
+			new->name = strdup(e->name);
+			new->tpebs_name = strdup(e->tpebs_name);
+			new->count = 0;
+			new->sum = 0;
+			list_add_tail(&new->nd, &stat_config.tpebs_results);
+		}
+
+		if (pthread_create(&thread_script, NULL, __cmd_script, &cmd)) {
+			kill(cmd.pid, SIGTERM);
+			close(cmd.out);
+			pr_err("Could not create thread to process sample data.\n");
+			return -1;
+		}
+		sleep(2);
 	}
 
 	if (forks) {
@@ -1087,12 +1094,15 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	if (stat_config.tpebs_event_size > 0) {
 		int ret;
 
-		pr_debug("pid = %d\n", getpid());
-		pr_debug("cmd.pid = %d\n", cmd.pid);
+		pr_debug("Workload finished, finishing record\n");
+		pr_debug("Perf stat pid = %d, Perf record pid = %d\n", getpid(), cmd.pid);
 
-		ret = __cmd_script(&cmd);
+		kill(cmd.pid, SIGTERM);
+		pthread_join(thread_script, NULL);
 		close(cmd.out);
-		pr_debug("%d\n", ret);
+		ret = finish_command(&cmd);
+		if (ret != -ERR_RUN_COMMAND_WAITPID_SIGNAL)
+			return ret;
 	}
 
 	if (stat_config.walltime_run_table)
-- 
2.43.0
Re: [RFC PATCH v3 4/6] perf stat: Create another thread for sample data processing
Posted by Namhyung Kim 1 year, 11 months ago
On Fri, Mar 1, 2024 at 4:11 PM <weilin.wang@intel.com> wrote:
>
> From: Weilin Wang <weilin.wang@intel.com>
>
> Another thread is required to synchronize between perf stat and perf record
> when we pass data through pipe.
>
> Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> ---
>  tools/perf/builtin-stat.c | 58 +++++++++++++++++++++++----------------
>  1 file changed, 34 insertions(+), 24 deletions(-)
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 3e5865572266..2372175d3408 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -777,7 +777,6 @@ static int process_sample_event(struct perf_tool *tool,
>         const char *evname;
>         struct tpebs_retire_lat *t;
>
> -       pr_debug("entering function %s\n ", __func__);
>         evname = evsel__name(evsel);
>
>         pr_debug("[%03d] ", sample->cpu);
> @@ -808,9 +807,9 @@ static int process_feature_event(struct perf_session *session,
>         return 0;
>  }
>
> -static int __cmd_script(struct child_process *cmd __maybe_unused)
> +static void *__cmd_script(void *arg __maybe_unused)
>  {
> -       int err = 0;
> +       struct child_process *cmd = arg;
>         struct perf_session *session;
>         struct perf_data data = {
>                 .mode = PERF_DATA_MODE_READ,
> @@ -826,29 +825,15 @@ static int __cmd_script(struct child_process *cmd __maybe_unused)
>                 .attr            = perf_event__process_attr,
>                 },
>         };
> -       struct tpebs_event *e;
> -
> -       list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> -               struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));
> -
> -               if (!new)
> -                       return -1;
> -               new->name = strdup(e->name);
> -               new->tpebs_name = strdup(e->tpebs_name);
> -               new->count = 0;
> -               new->sum = 0;
> -               list_add_tail(&new->nd, &stat_config.tpebs_results);
> -       }
>
> -       kill(cmd->pid, SIGTERM);
>         session = perf_session__new(&data, &script.tool);
>         if (IS_ERR(session))
> -               return PTR_ERR(session);
> +               return NULL;
>         script.session = session;
> -       err = perf_session__process_events(session);
> +       perf_session__process_events(session);
>         perf_session__delete(session);
>
> -       return err;
> +       return NULL;
>  }
>
>  static int __run_perf_stat(int argc, const char **argv, int run_idx)
> @@ -868,15 +853,37 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
>         int err;
>         bool second_pass = false;
>         struct child_process cmd;
> +       pthread_t thread_script;
>
>         //Prepare perf record for sampling event retire_latency before fork and prepare workload
>         if (stat_config.tpebs_event_size > 0) {
>                 int ret;
>
> +               struct tpebs_event *e;
>                 pr_debug("perf stat pid = %d\n", getpid());
>                 ret = prepare_perf_record(&cmd);
>                 if (ret)
>                         return ret;
> +
> +               list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> +                       struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));
> +
> +                       if (!new)
> +                               return -1;

This would leave the record command.  I think you'd better
do this before prepare_perf_record().


> +                       new->name = strdup(e->name);
> +                       new->tpebs_name = strdup(e->tpebs_name);
> +                       new->count = 0;
> +                       new->sum = 0;
> +                       list_add_tail(&new->nd, &stat_config.tpebs_results);
> +               }
> +
> +               if (pthread_create(&thread_script, NULL, __cmd_script, &cmd)) {
> +                       kill(cmd.pid, SIGTERM);
> +                       close(cmd.out);
> +                       pr_err("Could not create thread to process sample data.\n");
> +                       return -1;
> +               }
> +               sleep(2);

Why?

Thanks,
Namhyung


>         }
>
>         if (forks) {
> @@ -1087,12 +1094,15 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
>         if (stat_config.tpebs_event_size > 0) {
>                 int ret;
>
> -               pr_debug("pid = %d\n", getpid());
> -               pr_debug("cmd.pid = %d\n", cmd.pid);
> +               pr_debug("Workload finished, finishing record\n");
> +               pr_debug("Perf stat pid = %d, Perf record pid = %d\n", getpid(), cmd.pid);
>
> -               ret = __cmd_script(&cmd);
> +               kill(cmd.pid, SIGTERM);
> +               pthread_join(thread_script, NULL);
>                 close(cmd.out);
> -               pr_debug("%d\n", ret);
> +               ret = finish_command(&cmd);
> +               if (ret != -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> +                       return ret;
>         }
>
>         if (stat_config.walltime_run_table)
> --
> 2.43.0
>
RE: [RFC PATCH v3 4/6] perf stat: Create another thread for sample data processing
Posted by Wang, Weilin 1 year, 11 months ago

> -----Original Message-----
> From: Namhyung Kim <namhyung@kernel.org>
> Sent: Monday, March 11, 2024 11:48 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Ian Rogers <irogers@google.com>; Peter Zijlstra <peterz@infradead.org>;
> Ingo Molnar <mingo@redhat.com>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v3 4/6] perf stat: Create another thread for sample
> data processing
> 
> On Fri, Mar 1, 2024 at 4:11 PM <weilin.wang@intel.com> wrote:
> >
> > From: Weilin Wang <weilin.wang@intel.com>
> >
> > Another thread is required to synchronize between perf stat and perf record
> > when we pass data through pipe.
> >
> > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > ---
> >  tools/perf/builtin-stat.c | 58 +++++++++++++++++++++++----------------
> >  1 file changed, 34 insertions(+), 24 deletions(-)
> >
> > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > index 3e5865572266..2372175d3408 100644
> > --- a/tools/perf/builtin-stat.c
> > +++ b/tools/perf/builtin-stat.c
> > @@ -777,7 +777,6 @@ static int process_sample_event(struct perf_tool
> *tool,
> >         const char *evname;
> >         struct tpebs_retire_lat *t;
> >
> > -       pr_debug("entering function %s\n ", __func__);
> >         evname = evsel__name(evsel);
> >
> >         pr_debug("[%03d] ", sample->cpu);
> > @@ -808,9 +807,9 @@ static int process_feature_event(struct perf_session
> *session,
> >         return 0;
> >  }
> >
> > -static int __cmd_script(struct child_process *cmd __maybe_unused)
> > +static void *__cmd_script(void *arg __maybe_unused)
> >  {
> > -       int err = 0;
> > +       struct child_process *cmd = arg;
> >         struct perf_session *session;
> >         struct perf_data data = {
> >                 .mode = PERF_DATA_MODE_READ,
> > @@ -826,29 +825,15 @@ static int __cmd_script(struct child_process *cmd
> __maybe_unused)
> >                 .attr            = perf_event__process_attr,
> >                 },
> >         };
> > -       struct tpebs_event *e;
> > -
> > -       list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > -               struct tpebs_retire_lat *new = malloc(sizeof(struct
> tpebs_retire_lat));
> > -
> > -               if (!new)
> > -                       return -1;
> > -               new->name = strdup(e->name);
> > -               new->tpebs_name = strdup(e->tpebs_name);
> > -               new->count = 0;
> > -               new->sum = 0;
> > -               list_add_tail(&new->nd, &stat_config.tpebs_results);
> > -       }
> >
> > -       kill(cmd->pid, SIGTERM);
> >         session = perf_session__new(&data, &script.tool);
> >         if (IS_ERR(session))
> > -               return PTR_ERR(session);
> > +               return NULL;
> >         script.session = session;
> > -       err = perf_session__process_events(session);
> > +       perf_session__process_events(session);
> >         perf_session__delete(session);
> >
> > -       return err;
> > +       return NULL;
> >  }
> >
> >  static int __run_perf_stat(int argc, const char **argv, int run_idx)
> > @@ -868,15 +853,37 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> >         int err;
> >         bool second_pass = false;
> >         struct child_process cmd;
> > +       pthread_t thread_script;
> >
> >         //Prepare perf record for sampling event retire_latency before fork and
> prepare workload
> >         if (stat_config.tpebs_event_size > 0) {
> >                 int ret;
> >
> > +               struct tpebs_event *e;
> >                 pr_debug("perf stat pid = %d\n", getpid());
> >                 ret = prepare_perf_record(&cmd);
> >                 if (ret)
> >                         return ret;
> > +
> > +               list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > +                       struct tpebs_retire_lat *new = malloc(sizeof(struct
> tpebs_retire_lat));
> > +
> > +                       if (!new)
> > +                               return -1;
> 
> This would leave the record command.  I think you'd better
> do this before prepare_perf_record().
> 
> 
> > +                       new->name = strdup(e->name);
> > +                       new->tpebs_name = strdup(e->tpebs_name);
> > +                       new->count = 0;
> > +                       new->sum = 0;
> > +                       list_add_tail(&new->nd, &stat_config.tpebs_results);
> > +               }
> > +
> > +               if (pthread_create(&thread_script, NULL, __cmd_script, &cmd)) {
> > +                       kill(cmd.pid, SIGTERM);
> > +                       close(cmd.out);
> > +                       pr_err("Could not create thread to process sample data.\n");
> > +                       return -1;
> > +               }
> > +               sleep(2);
> 
> Why?

In my experiments, I found that perf record initialization time is usually longer than 
the time needed in perf stat, especially when we start a lot of initialization of perf 
stat before perf record already. Without a waiting time here, perf record may not 
have enough time to launch and collect data if the execution time of the parent 
process (perf stat) is very short (a few seconds). Therefore, I added a sleep here to 
wait for perf record as a quick but ad hoc solution here. The two second time is 
selected based on my experiment data. I mentioned this as a remaining issue in 
the cover letter of patch v1.

I will add a comment here. Please let me know if you have any suggestions on how
To solve this issue. Thanks a lot!

> 
> Thanks,
> Namhyung
> 
> 
> >         }
> >
> >         if (forks) {
> > @@ -1087,12 +1094,15 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> >         if (stat_config.tpebs_event_size > 0) {
> >                 int ret;
> >
> > -               pr_debug("pid = %d\n", getpid());
> > -               pr_debug("cmd.pid = %d\n", cmd.pid);
> > +               pr_debug("Workload finished, finishing record\n");
> > +               pr_debug("Perf stat pid = %d, Perf record pid = %d\n", getpid(),
> cmd.pid);
> >
> > -               ret = __cmd_script(&cmd);
> > +               kill(cmd.pid, SIGTERM);
> > +               pthread_join(thread_script, NULL);
> >                 close(cmd.out);
> > -               pr_debug("%d\n", ret);
> > +               ret = finish_command(&cmd);
> > +               if (ret != -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> > +                       return ret;
> >         }
> >
> >         if (stat_config.walltime_run_table)
> > --
> > 2.43.0
> >