[RFC PATCH v4 2/6] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.

weilin.wang@intel.com posted 6 patches 1 year, 11 months ago
There is a newer version of this series
[RFC PATCH v4 2/6] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by weilin.wang@intel.com 1 year, 11 months ago
From: Weilin Wang <weilin.wang@intel.com>

When retire_latency value is used in a metric formula, perf stat would fork a
perf record process with "-e" and "-W" options. Perf record will collect
required retire_latency values in parallel while perf stat is collecting
counting values.

At the point in time when perf stat stops counting, it sends a SIGTERM signal
to the perf record process and receives the sampling data back from perf record
through a pipe. Perf stat then processes the received data to obtain the retire
latency values and calculate the metric results.

Signed-off-by: Weilin Wang <weilin.wang@intel.com>
---
 tools/perf/builtin-stat.c     | 165 +++++++++++++++++++++++++++++++++-
 tools/perf/util/data.c        |   4 +
 tools/perf/util/data.h        |   1 +
 tools/perf/util/metricgroup.h |  12 +++
 tools/perf/util/stat.h        |   2 +
 5 files changed, 182 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 6291e1e24535..4e92e73cbeaf 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -94,8 +94,13 @@
 #include <perf/evlist.h>
 #include <internal/threadmap.h>
 
+#include "util/sample.h"
+#include <sys/param.h>
+#include <subcmd/run-command.h>
+
 #define DEFAULT_SEPARATOR	" "
 #define FREEZE_ON_SMI_PATH	"devices/cpu/freeze_on_smi"
+#define PERF_DATA		"-"
 
 static void print_counters(struct timespec *ts, int argc, const char **argv);
 
@@ -163,6 +168,8 @@ static struct perf_stat_config stat_config = {
 	.ctl_fd_ack		= -1,
 	.iostat_run		= false,
 	.tpebs_events		= LIST_HEAD_INIT(stat_config.tpebs_events),
+	.tpebs_results		= LIST_HEAD_INIT(stat_config.tpebs_results),
+	.tpebs_pid              = -1,
 };
 
 static bool cpus_map_matched(struct evsel *a, struct evsel *b)
@@ -687,12 +694,155 @@ static enum counter_recovery stat_handle_error(struct evsel *counter)
 	return COUNTER_FATAL;
 }
 
-static int __run_perf_record(void)
+static int __run_perf_record(const char **record_argv)
 {
+	int i = 0;
+	struct tpebs_event *e;
+
 	pr_debug("Prepare perf record for retire_latency\n");
+
+	record_argv[i++] = "perf";
+	record_argv[i++] = "record";
+	record_argv[i++] = "-W";
+	record_argv[i++] = "--synth=no";
+
+	if (stat_config.user_requested_cpu_list) {
+		record_argv[i++] = "-C";
+		record_argv[i++] = stat_config.user_requested_cpu_list;
+	}
+
+	if (stat_config.system_wide)
+		record_argv[i++] = "-a";
+
+	list_for_each_entry(e, &stat_config.tpebs_events, nd) {
+		record_argv[i++] = "-e";
+		record_argv[i++] = e->name;
+	}
+
+	record_argv[i++] = "-o";
+	record_argv[i++] = PERF_DATA;
+
 	return 0;
 }
 
+static void prepare_run_command(struct child_process *cmd,
+			       const char **argv)
+{
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->argv = argv;
+	cmd->out = -1;
+}
+
+static int prepare_perf_record(struct child_process *cmd)
+{
+	const char **record_argv;
+
+	record_argv = calloc(10 + 2 * stat_config.tpebs_event_size, sizeof(char *));
+	if (!record_argv)
+		return -1;
+	__run_perf_record(record_argv);
+
+	prepare_run_command(cmd, record_argv);
+	return start_command(cmd);
+}
+
+struct perf_script {
+	struct perf_tool	tool;
+	struct perf_session	*session;
+};
+
+static void tpebs_data__delete(void)
+{
+	struct tpebs_retire_lat *r, *rtmp;
+	struct tpebs_event *e, *etmp;
+	list_for_each_entry_safe(r, rtmp, &stat_config.tpebs_results, nd) {
+		list_del_init(&r->nd);
+		free(r);
+	}
+	list_for_each_entry_safe(e, etmp, &stat_config.tpebs_events, nd) {
+		list_del_init(&e->nd);
+		free(e);
+	}
+}
+
+static int process_sample_event(struct perf_tool *tool __maybe_unused,
+				union perf_event *event __maybe_unused,
+				struct perf_sample *sample,
+				struct evsel *evsel,
+				struct machine *machine __maybe_unused)
+{
+	int ret = 0;
+	const char *evname;
+	struct tpebs_retire_lat *t;
+
+	evname = evsel__name(evsel);
+
+	/*
+	 * Need to handle per core results? We are assuming average retire
+	 * latency value will be used. Save the number of samples and the sum of
+	 * retire latency value for each event.
+	 */
+	list_for_each_entry(t, &stat_config.tpebs_results, nd) {
+		if (!strcmp(evname, t->name)) {
+			t->count += 1;
+			t->sum += sample->retire_lat;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int process_feature_event(struct perf_session *session,
+				 union perf_event *event)
+{
+	if (event->feat.feat_id < HEADER_LAST_FEATURE)
+		return perf_event__process_feature(session, event);
+	return 0;
+}
+
+static int __cmd_script(struct child_process *cmd __maybe_unused)
+{
+	int err = 0;
+	struct perf_session *session;
+	struct perf_data data = {
+		.mode = PERF_DATA_MODE_READ,
+		.path = PERF_DATA,
+		.fd   = cmd->out,
+	};
+	struct perf_script script = {
+		.tool = {
+		.sample		 = process_sample_event,
+		.ordering_requires_timestamps = true,
+		.feature	 = process_feature_event,
+		.attr		 = perf_event__process_attr,
+		},
+	};
+	struct tpebs_event *e;
+
+	list_for_each_entry(e, &stat_config.tpebs_events, nd) {
+		struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));
+
+		if (!new)
+			return -1;
+		new->name = strdup(e->name);
+		new->tpebs_name = strdup(e->tpebs_name);
+		new->count = 0;
+		new->sum = 0;
+		list_add_tail(&new->nd, &stat_config.tpebs_results);
+	}
+
+	kill(cmd->pid, SIGTERM);
+	session = perf_session__new(&data, &script.tool);
+	if (IS_ERR(session))
+		return PTR_ERR(session);
+	script.session = session;
+	err = perf_session__process_events(session);
+	perf_session__delete(session);
+
+	return err;
+}
+
 static int __run_perf_stat(int argc, const char **argv, int run_idx)
 {
 	int interval = stat_config.interval;
@@ -709,13 +859,15 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	struct affinity saved_affinity, *affinity = NULL;
 	int err;
 	bool second_pass = false;
+	struct child_process cmd;
 
 	/* Prepare perf record for sampling event retire_latency before fork and
 	 * prepare workload */
 	if (stat_config.tpebs_event_size > 0) {
 		int ret;
 
-		ret = __run_perf_record();
+		pr_debug("perf stat pid = %d\n", getpid());
+		ret = prepare_perf_record(&cmd);
 		if (ret)
 			return ret;
 	}
@@ -925,6 +1077,13 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 
 	t1 = rdclock();
 
+	if (stat_config.tpebs_event_size > 0) {
+		int ret;
+
+		ret = __cmd_script(&cmd);
+		close(cmd.out);
+	}
+
 	if (stat_config.walltime_run_table)
 		stat_config.walltime_run[run_idx] = t1 - t0;
 
@@ -2972,5 +3131,7 @@ int cmd_stat(int argc, const char **argv)
 	metricgroup__rblist_exit(&stat_config.metric_events);
 	evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack, &stat_config.ctl_fd_close);
 
+	tpebs_data__delete();
+
 	return status;
 }
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index 08c4bfbd817f..2e2a20fc5c30 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -185,6 +185,10 @@ static bool check_pipe(struct perf_data *data)
 	int fd = perf_data__is_read(data) ?
 		 STDIN_FILENO : STDOUT_FILENO;
 
+	if (data->fd > 0) {
+		fd = data->fd;
+	}
+
 	if (!data->path) {
 		if (!fstat(fd, &st) && S_ISFIFO(st.st_mode))
 			is_pipe = true;
diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h
index 110f3ebde30f..720638116ca0 100644
--- a/tools/perf/util/data.h
+++ b/tools/perf/util/data.h
@@ -28,6 +28,7 @@ struct perf_data_file {
 
 struct perf_data {
 	const char		*path;
+	int			 fd;
 	struct perf_data_file	 file;
 	bool			 is_pipe;
 	bool			 is_dir;
diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
index 7c24ed768ff3..3c37d80c4d34 100644
--- a/tools/perf/util/metricgroup.h
+++ b/tools/perf/util/metricgroup.h
@@ -68,10 +68,22 @@ struct metric_expr {
 
 struct tpebs_event {
 	struct list_head nd;
+	/* Event name */
 	const char *name;
+	/* Event name with TPEBS modifier */
 	const char *tpebs_name;
 };
 
+struct tpebs_retire_lat {
+	struct list_head nd;
+	/* Event name */
+	const char *name;
+	/* Event name with TPEBS modifier */
+	const char *tpebs_name;
+	size_t count;
+	int sum;
+};
+
 struct metric_event *metricgroup__lookup(struct rblist *metric_events,
 					 struct evsel *evsel,
 					 bool create);
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index b987960df3c5..0726bdc06681 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -111,6 +111,8 @@ struct perf_stat_config {
 	struct rblist		 metric_events;
 	struct list_head	 tpebs_events;
 	size_t			 tpebs_event_size;
+	struct list_head	 tpebs_results;
+	pid_t			 tpebs_pid;
 	int			 ctl_fd;
 	int			 ctl_fd_ack;
 	bool			 ctl_fd_close;
-- 
2.43.0
Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Ian Rogers 1 year, 10 months ago
On Tue, Mar 12, 2024 at 4:49 PM <weilin.wang@intel.com> wrote:
>
> From: Weilin Wang <weilin.wang@intel.com>
>
> When retire_latency value is used in a metric formula, perf stat would fork a
> perf record process with "-e" and "-W" options. Perf record will collect
> required retire_latency values in parallel while perf stat is collecting
> counting values.
>
> At the point of time that perf stat stops counting, it would send sigterm signal
> to perf record process and receiving sampling data back from perf record from a
> pipe. Perf stat will then process the received data to get retire latency data
> and calculate metric result.
>
> Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> ---
>  tools/perf/builtin-stat.c     | 165 +++++++++++++++++++++++++++++++++-
>  tools/perf/util/data.c        |   4 +
>  tools/perf/util/data.h        |   1 +
>  tools/perf/util/metricgroup.h |  12 +++
>  tools/perf/util/stat.h        |   2 +
>  5 files changed, 182 insertions(+), 2 deletions(-)
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 6291e1e24535..4e92e73cbeaf 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -94,8 +94,13 @@
>  #include <perf/evlist.h>
>  #include <internal/threadmap.h>
>
> +#include "util/sample.h"
> +#include <sys/param.h>
> +#include <subcmd/run-command.h>
> +
>  #define DEFAULT_SEPARATOR      " "
>  #define FREEZE_ON_SMI_PATH     "devices/cpu/freeze_on_smi"
> +#define PERF_DATA              "-"
>
>  static void print_counters(struct timespec *ts, int argc, const char **argv);
>
> @@ -163,6 +168,8 @@ static struct perf_stat_config stat_config = {
>         .ctl_fd_ack             = -1,
>         .iostat_run             = false,
>         .tpebs_events           = LIST_HEAD_INIT(stat_config.tpebs_events),
> +       .tpebs_results          = LIST_HEAD_INIT(stat_config.tpebs_results),
> +       .tpebs_pid              = -1,
>  };
>
>  static bool cpus_map_matched(struct evsel *a, struct evsel *b)
> @@ -687,12 +694,155 @@ static enum counter_recovery stat_handle_error(struct evsel *counter)
>         return COUNTER_FATAL;
>  }
>
> -static int __run_perf_record(void)
> +static int __run_perf_record(const char **record_argv)
>  {
> +       int i = 0;
> +       struct tpebs_event *e;
> +
>         pr_debug("Prepare perf record for retire_latency\n");
> +
> +       record_argv[i++] = "perf";
> +       record_argv[i++] = "record";
> +       record_argv[i++] = "-W";
> +       record_argv[i++] = "--synth=no";
> +
> +       if (stat_config.user_requested_cpu_list) {
> +               record_argv[i++] = "-C";
> +               record_argv[i++] = stat_config.user_requested_cpu_list;
> +       }
> +
> +       if (stat_config.system_wide)
> +               record_argv[i++] = "-a";
> +
> +       list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> +               record_argv[i++] = "-e";
> +               record_argv[i++] = e->name;
> +       }
> +
> +       record_argv[i++] = "-o";
> +       record_argv[i++] = PERF_DATA;
> +
>         return 0;
>  }
>
> +static void prepare_run_command(struct child_process *cmd,
> +                              const char **argv)
> +{
> +       memset(cmd, 0, sizeof(*cmd));
> +       cmd->argv = argv;
> +       cmd->out = -1;
> +}
> +
> +static int prepare_perf_record(struct child_process *cmd)
> +{
> +       const char **record_argv;
> +
> +       record_argv = calloc(10 + 2 * stat_config.tpebs_event_size, sizeof(char *));
> +       if (!record_argv)
> +               return -1;
> +       __run_perf_record(record_argv);
> +
> +       prepare_run_command(cmd, record_argv);
> +       return start_command(cmd);
> +}
> +
> +struct perf_script {
> +       struct perf_tool        tool;
> +       struct perf_session     *session;
> +};
> +
> +static void tpebs_data__delete(void)
> +{
> +       struct tpebs_retire_lat *r, *rtmp;
> +       struct tpebs_event *e, *etmp;
> +       list_for_each_entry_safe(r, rtmp, &stat_config.tpebs_results, nd) {
> +               list_del_init(&r->nd);
> +               free(r);
> +       }
> +       list_for_each_entry_safe(e, etmp, &stat_config.tpebs_events, nd) {
> +               list_del_init(&e->nd);
> +               free(e);
> +       }
> +}
> +
> +static int process_sample_event(struct perf_tool *tool __maybe_unused,
> +                               union perf_event *event __maybe_unused,
> +                               struct perf_sample *sample,
> +                               struct evsel *evsel,
> +                               struct machine *machine __maybe_unused)
> +{
> +       int ret = 0;
> +       const char *evname;
> +       struct tpebs_retire_lat *t;
> +
> +       evname = evsel__name(evsel);
> +
> +       /*
> +        * Need to handle per core results? We are assuming average retire
> +        * latency value will be used. Save the number of samples and the sum of
> +        * retire latency value for each event.
> +        */
> +       list_for_each_entry(t, &stat_config.tpebs_results, nd) {
> +               if (!strcmp(evname, t->name)) {
> +                       t->count += 1;
> +                       t->sum += sample->retire_lat;
> +                       break;
> +               }
> +       }
> +
> +       return ret;
> +}
> +
> +static int process_feature_event(struct perf_session *session,
> +                                union perf_event *event)
> +{
> +       if (event->feat.feat_id < HEADER_LAST_FEATURE)
> +               return perf_event__process_feature(session, event);
> +       return 0;
> +}
> +
> +static int __cmd_script(struct child_process *cmd __maybe_unused)
> +{
> +       int err = 0;
> +       struct perf_session *session;
> +       struct perf_data data = {
> +               .mode = PERF_DATA_MODE_READ,
> +               .path = PERF_DATA,
> +               .fd   = cmd->out,
> +       };
> +       struct perf_script script = {
> +               .tool = {
> +               .sample          = process_sample_event,
> +               .ordering_requires_timestamps = true,
> +               .feature         = process_feature_event,
> +               .attr            = perf_event__process_attr,
> +               },
> +       };
> +       struct tpebs_event *e;
> +
> +       list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> +               struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));
> +
> +               if (!new)
> +                       return -1;
> +               new->name = strdup(e->name);
> +               new->tpebs_name = strdup(e->tpebs_name);
> +               new->count = 0;
> +               new->sum = 0;
> +               list_add_tail(&new->nd, &stat_config.tpebs_results);
> +       }
> +
> +       kill(cmd->pid, SIGTERM);
> +       session = perf_session__new(&data, &script.tool);
> +       if (IS_ERR(session))
> +               return PTR_ERR(session);
> +       script.session = session;
> +       err = perf_session__process_events(session);
> +       perf_session__delete(session);
> +
> +       return err;
> +}
> +
>  static int __run_perf_stat(int argc, const char **argv, int run_idx)
>  {
>         int interval = stat_config.interval;
> @@ -709,13 +859,15 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
>         struct affinity saved_affinity, *affinity = NULL;
>         int err;
>         bool second_pass = false;
> +       struct child_process cmd;
>
>         /* Prepare perf record for sampling event retire_latency before fork and
>          * prepare workload */
>         if (stat_config.tpebs_event_size > 0) {
>                 int ret;
>
> -               ret = __run_perf_record();
> +               pr_debug("perf stat pid = %d\n", getpid());
> +               ret = prepare_perf_record(&cmd);
>                 if (ret)
>                         return ret;
>         }
> @@ -925,6 +1077,13 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
>
>         t1 = rdclock();
>
> +       if (stat_config.tpebs_event_size > 0) {
> +               int ret;
> +
> +               ret = __cmd_script(&cmd);
> +               close(cmd.out);
> +       }
> +
>         if (stat_config.walltime_run_table)
>                 stat_config.walltime_run[run_idx] = t1 - t0;
>
> @@ -2972,5 +3131,7 @@ int cmd_stat(int argc, const char **argv)
>         metricgroup__rblist_exit(&stat_config.metric_events);
>         evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack, &stat_config.ctl_fd_close);
>
> +       tpebs_data__delete();
> +
>         return status;
>  }
> diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
> index 08c4bfbd817f..2e2a20fc5c30 100644
> --- a/tools/perf/util/data.c
> +++ b/tools/perf/util/data.c
> @@ -185,6 +185,10 @@ static bool check_pipe(struct perf_data *data)
>         int fd = perf_data__is_read(data) ?
>                  STDIN_FILENO : STDOUT_FILENO;
>
> +       if (data->fd > 0) {
> +               fd = data->fd;
> +       }

nit: no curlies needed here

> +
>         if (!data->path) {
>                 if (!fstat(fd, &st) && S_ISFIFO(st.st_mode))
>                         is_pipe = true;
> diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h
> index 110f3ebde30f..720638116ca0 100644
> --- a/tools/perf/util/data.h
> +++ b/tools/perf/util/data.h
> @@ -28,6 +28,7 @@ struct perf_data_file {
>
>  struct perf_data {
>         const char              *path;
> +       int                      fd;
>         struct perf_data_file    file;

Could fd and file be a union, so that its clear they are mutually
exclusive? Perhaps just comment.

>         bool                     is_pipe;
>         bool                     is_dir;
> diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
> index 7c24ed768ff3..3c37d80c4d34 100644
> --- a/tools/perf/util/metricgroup.h
> +++ b/tools/perf/util/metricgroup.h
> @@ -68,10 +68,22 @@ struct metric_expr {
>
>  struct tpebs_event {
>         struct list_head nd;
> +       /* Event name */
>         const char *name;
> +       /* Event name with TPEBS modifier */
>         const char *tpebs_name;
>  };
>
> +struct tpebs_retire_lat {
> +       struct list_head nd;
> +       /* Event name */
> +       const char *name;
> +       /* Event name with TPEBS modifier */
> +       const char *tpebs_name;
> +       size_t count;
> +       int sum;
> +};

nit: Would it make sense to make this:

struct tpebs_retire_lat {
  struct tpebs_event event;
  size_t count;
  int sum;
}

Thanks,
Ian

> +
>  struct metric_event *metricgroup__lookup(struct rblist *metric_events,
>                                          struct evsel *evsel,
>                                          bool create);
> diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> index b987960df3c5..0726bdc06681 100644
> --- a/tools/perf/util/stat.h
> +++ b/tools/perf/util/stat.h
> @@ -111,6 +111,8 @@ struct perf_stat_config {
>         struct rblist            metric_events;
>         struct list_head         tpebs_events;
>         size_t                   tpebs_event_size;
> +       struct list_head         tpebs_results;
> +       pid_t                    tpebs_pid;
>         int                      ctl_fd;
>         int                      ctl_fd_ack;
>         bool                     ctl_fd_close;
> --
> 2.43.0
>
Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Andi Kleen 1 year, 11 months ago
weilin.wang@intel.com writes:

> From: Weilin Wang <weilin.wang@intel.com>
>
> When retire_latency value is used in a metric formula, perf stat would fork a
> perf record process with "-e" and "-W" options. Perf record will collect
> required retire_latency values in parallel while perf stat is collecting
> counting values.

How does that work when the workload is specified on the command line?
The workload would run twice? That is very inefficient and may not
work if it's a large workload.

The perf tool infrastructure is imho not up to the task of such
parallel collection.

Also it won't work for very long collections because you will get a
very large perf.data. Better to use a pipeline.

I think it would be better if you made it a separate operation that can
generate a file that is then consumed by perf stat. This is also more efficient
because often the calibration is only needed once. And it's all under
user control so no nasty surprises.

-Andi
RE: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Wang, Weilin 1 year, 11 months ago

> -----Original Message-----
> From: Andi Kleen <ak@linux.intel.com>
> Sent: Tuesday, March 12, 2024 5:03 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Namhyung Kim <namhyung@kernel.org>; Ian Rogers
> <irogers@google.com>; Arnaldo Carvalho de Melo <acme@kernel.org>; Peter
> Zijlstra <peterz@infradead.org>; Ingo Molnar <mingo@redhat.com>;
> Alexander Shishkin <alexander.shishkin@linux.intel.com>; Jiri Olsa
> <jolsa@kernel.org>; Hunter, Adrian <adrian.hunter@intel.com>; Kan Liang
> <kan.liang@linux.intel.com>; linux-perf-users@vger.kernel.org; linux-
> kernel@vger.kernel.org; Taylor, Perry <perry.taylor@intel.com>; Alt, Samantha
> <samantha.alt@intel.com>; Biggers, Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
> 
> weilin.wang@intel.com writes:
> 
> > From: Weilin Wang <weilin.wang@intel.com>
> >
> > When retire_latency value is used in a metric formula, perf stat would fork a
> > perf record process with "-e" and "-W" options. Perf record will collect
> > required retire_latency values in parallel while perf stat is collecting
> > counting values.
> 
> How does that work when the workload is specified on the command line?
> The workload would run twice? That is very inefficient and may not
> work if it's a large workload.
> 
> The perf tool infrastructure is imho not up to the task of such
> parallel collection.
> 
> Also it won't work for very long collections because you will get a
> very large perf.data. Better to use a pipeline.
> 
> I think it would be better if you made it a separate operation that can
> generate a file that is then consumed by perf stat. This is also more efficient
> because often the calibration is only needed once. And it's all under
> user control so no nasty surprises.
> 

The workload runs only once, with perf stat. Perf record is forked by perf stat and runs
in parallel with perf stat. Perf stat will send perf record a signal to terminate after
perf stat stops collecting count values.

The implementation uses a PIPE to pass the sampled data from perf record instead
of writing the data into a file.

Thanks,
Weilin

> -Andi
Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Andi Kleen 1 year, 11 months ago
"Wang, Weilin" <weilin.wang@intel.com> writes:

>> -----Original Message-----
>> From: Andi Kleen <ak@linux.intel.com>
>> Sent: Tuesday, March 12, 2024 5:03 PM
>> To: Wang, Weilin <weilin.wang@intel.com>
>> Cc: Namhyung Kim <namhyung@kernel.org>; Ian Rogers
>> <irogers@google.com>; Arnaldo Carvalho de Melo <acme@kernel.org>; Peter
>> Zijlstra <peterz@infradead.org>; Ingo Molnar <mingo@redhat.com>;
>> Alexander Shishkin <alexander.shishkin@linux.intel.com>; Jiri Olsa
>> <jolsa@kernel.org>; Hunter, Adrian <adrian.hunter@intel.com>; Kan Liang
>> <kan.liang@linux.intel.com>; linux-perf-users@vger.kernel.org; linux-
>> kernel@vger.kernel.org; Taylor, Perry <perry.taylor@intel.com>; Alt, Samantha
>> <samantha.alt@intel.com>; Biggers, Caleb <caleb.biggers@intel.com>
>> Subject: Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when
>> perf stat needs to get retire latency value for a metric.
>> 
>> weilin.wang@intel.com writes:
>> 
>> > From: Weilin Wang <weilin.wang@intel.com>
>> >
>> > When retire_latency value is used in a metric formula, perf stat would fork a
>> > perf record process with "-e" and "-W" options. Perf record will collect
>> > required retire_latency values in parallel while perf stat is collecting
>> > counting values.
>> 
>> How does that work when the workload is specified on the command line?
>> The workload would run twice? That is very inefficient and may not
>> work if it's a large workload.
>> 
>> The perf tool infrastructure is imho not up to the task of such
>> parallel collection.
>> 
>> Also it won't work for very long collections because you will get a
>> very large perf.data. Better to use a pipeline.
>> 
>> I think it would be better if you made it a separate operation that can
>> generate a file that is then consumed by perf stat. This is also more efficient
>> because often the calibration is only needed once. And it's all under
>> user control so no nasty surprises.
>> 
>
> Workload runs only once with perf stat. Perf record is forked by perf stat and run
> in parallel with perf stat. Perf stat will send perf record a signal to terminate after 
> perf stat stops collecting count value.

I don't understand how the perf record filters on the workload created by
the perf stat. At a minimum you would need -p to connect to the pid
of the parent, but IIRC -p doesnt follow children, so if it forked
it wouldn't work.

I think your approach may only work with -a, but perhaps I'm missing
something (-a is often not usable due to restrictions)

Also if perf stat runs in interval mode and you only get the data
at the end how would that work?

iirc i wrestled with all these questions for toplev (which has a
similar feature) and in the end i concluded doing it automatically
has far too many problems.

-Andi
RE: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Wang, Weilin 1 year, 11 months ago

> -----Original Message-----
> From: Andi Kleen <ak@linux.intel.com>
> Sent: Tuesday, March 12, 2024 5:56 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Namhyung Kim <namhyung@kernel.org>; Ian Rogers
> <irogers@google.com>; Arnaldo Carvalho de Melo <acme@kernel.org>; Peter
> Zijlstra <peterz@infradead.org>; Ingo Molnar <mingo@redhat.com>;
> Alexander Shishkin <alexander.shishkin@linux.intel.com>; Jiri Olsa
> <jolsa@kernel.org>; Hunter, Adrian <adrian.hunter@intel.com>; Kan Liang
> <kan.liang@linux.intel.com>; linux-perf-users@vger.kernel.org; linux-
> kernel@vger.kernel.org; Taylor, Perry <perry.taylor@intel.com>; Alt, Samantha
> <samantha.alt@intel.com>; Biggers, Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
> 
> "Wang, Weilin" <weilin.wang@intel.com> writes:
> 
> >> -----Original Message-----
> >> From: Andi Kleen <ak@linux.intel.com>
> >> Sent: Tuesday, March 12, 2024 5:03 PM
> >> To: Wang, Weilin <weilin.wang@intel.com>
> >> Cc: Namhyung Kim <namhyung@kernel.org>; Ian Rogers
> >> <irogers@google.com>; Arnaldo Carvalho de Melo <acme@kernel.org>;
> Peter
> >> Zijlstra <peterz@infradead.org>; Ingo Molnar <mingo@redhat.com>;
> >> Alexander Shishkin <alexander.shishkin@linux.intel.com>; Jiri Olsa
> >> <jolsa@kernel.org>; Hunter, Adrian <adrian.hunter@intel.com>; Kan Liang
> >> <kan.liang@linux.intel.com>; linux-perf-users@vger.kernel.org; linux-
> >> kernel@vger.kernel.org; Taylor, Perry <perry.taylor@intel.com>; Alt,
> Samantha
> >> <samantha.alt@intel.com>; Biggers, Caleb <caleb.biggers@intel.com>
> >> Subject: Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record
> when
> >> perf stat needs to get retire latency value for a metric.
> >>
> >> weilin.wang@intel.com writes:
> >>
> >> > From: Weilin Wang <weilin.wang@intel.com>
> >> >
> >> > When retire_latency value is used in a metric formula, perf stat would fork
> a
> >> > perf record process with "-e" and "-W" options. Perf record will collect
> >> > required retire_latency values in parallel while perf stat is collecting
> >> > counting values.
> >>
> >> How does that work when the workload is specified on the command line?
> >> The workload would run twice? That is very inefficient and may not
> >> work if it's a large workload.
> >>
> >> The perf tool infrastructure is imho not up to the task of such
> >> parallel collection.
> >>
> >> Also it won't work for very long collections because you will get a
> >> very large perf.data. Better to use a pipeline.
> >>
> >> I think it would be better if you made it a separate operation that can
> >> generate a file that is then consumed by perf stat. This is also more efficient
> >> because often the calibration is only needed once. And it's all under
> >> user control so no nasty surprises.
> >>
> >
> > Workload runs only once with perf stat. Perf record is forked by perf stat and
> run
> > in parallel with perf stat. Perf stat will send perf record a signal to terminate
> after
> > perf stat stops collecting count value.
> 
> I don't understand how the perf record filters on the workload created by
> the perf stat. At a minimum you would need -p to connect to the pid
> of the parent, but IIRC -p doesnt follow children, so if it forked
> it wouldn't work.
> 
> I think your approach may only work with -a, but perhaps I'm missing
> something (-a is often not usable due to restrictions)
> 
> Also if perf stat runs in interval mode and you only get the data
> at the end how would that work?
> 
> iirc i wrestled with all these questions for toplev (which has a
> similar feature) and in the end i concluded doing it automatically
> has far too many problems.
> 

Yes, you are completely right that there are limitations: we can currently only support -a and -C,
and we do not support -I yet. I'm wondering if we could support "-I" in a next step by
processing the sampled data on the fly.

Thanks,
Weilin

> -Andi
Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Andi Kleen 1 year, 11 months ago
On Wed, Mar 13, 2024 at 03:31:14PM +0000, Wang, Weilin wrote:
> 
> 
> > -----Original Message-----
> > From: Andi Kleen <ak@linux.intel.com>
> > Sent: Tuesday, March 12, 2024 5:56 PM
> > To: Wang, Weilin <weilin.wang@intel.com>
> > Cc: Namhyung Kim <namhyung@kernel.org>; Ian Rogers
> > <irogers@google.com>; Arnaldo Carvalho de Melo <acme@kernel.org>; Peter
> > Zijlstra <peterz@infradead.org>; Ingo Molnar <mingo@redhat.com>;
> > Alexander Shishkin <alexander.shishkin@linux.intel.com>; Jiri Olsa
> > <jolsa@kernel.org>; Hunter, Adrian <adrian.hunter@intel.com>; Kan Liang
> > <kan.liang@linux.intel.com>; linux-perf-users@vger.kernel.org; linux-
> > kernel@vger.kernel.org; Taylor, Perry <perry.taylor@intel.com>; Alt, Samantha
> > <samantha.alt@intel.com>; Biggers, Caleb <caleb.biggers@intel.com>
> > Subject: Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when
> > perf stat needs to get retire latency value for a metric.
> > 
> > "Wang, Weilin" <weilin.wang@intel.com> writes:
> > 
> > >> -----Original Message-----
> > >> From: Andi Kleen <ak@linux.intel.com>
> > >> Sent: Tuesday, March 12, 2024 5:03 PM
> > >> To: Wang, Weilin <weilin.wang@intel.com>
> > >> Cc: Namhyung Kim <namhyung@kernel.org>; Ian Rogers
> > >> <irogers@google.com>; Arnaldo Carvalho de Melo <acme@kernel.org>;
> > Peter
> > >> Zijlstra <peterz@infradead.org>; Ingo Molnar <mingo@redhat.com>;
> > >> Alexander Shishkin <alexander.shishkin@linux.intel.com>; Jiri Olsa
> > >> <jolsa@kernel.org>; Hunter, Adrian <adrian.hunter@intel.com>; Kan Liang
> > >> <kan.liang@linux.intel.com>; linux-perf-users@vger.kernel.org; linux-
> > >> kernel@vger.kernel.org; Taylor, Perry <perry.taylor@intel.com>; Alt,
> > Samantha
> > >> <samantha.alt@intel.com>; Biggers, Caleb <caleb.biggers@intel.com>
> > >> Subject: Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record
> > when
> > >> perf stat needs to get retire latency value for a metric.
> > >>
> > >> weilin.wang@intel.com writes:
> > >>
> > >> > From: Weilin Wang <weilin.wang@intel.com>
> > >> >
> > >> > When retire_latency value is used in a metric formula, perf stat would fork
> > a
> > >> > perf record process with "-e" and "-W" options. Perf record will collect
> > >> > required retire_latency values in parallel while perf stat is collecting
> > >> > counting values.
> > >>
> > >> How does that work when the workload is specified on the command line?
> > >> The workload would run twice? That is very inefficient and may not
> > >> work if it's a large workload.
> > >>
> > >> The perf tool infrastructure is imho not up to the task of such
> > >> parallel collection.
> > >>
> > >> Also it won't work for very long collections because you will get a
> > >> very large perf.data. Better to use a pipeline.
> > >>
> > >> I think it would be better if you made it a separate operation that can
> > >> generate a file that is then consumed by perf stat. This is also more efficient
> > >> because often the calibration is only needed once. And it's all under
> > >> user control so no nasty surprises.
> > >>
> > >
> > > Workload runs only once with perf stat. Perf record is forked by perf stat and
> > run
> > > in parallel with perf stat. Perf stat will send perf record a signal to terminate
> > after
> > > perf stat stops collecting count value.
> > 
> > I don't understand how the perf record filters on the workload created by
> > the perf stat. At a minimum you would need -p to connect to the pid
> > of the parent, but IIRC -p doesnt follow children, so if it forked
> > it wouldn't work.
> > 
> > I think your approach may only work with -a, but perhaps I'm missing
> > something (-a is often not usable due to restrictions)
> > 
> > Also if perf stat runs in interval mode and you only get the data
> > at the end how would that work?
> > 
> > iirc i wrestled with all these questions for toplev (which has a
> > similar feature) and in the end i concluded doing it automatically
> > has far too many problems.
> > 
> 
> Yes, you are completely right that there are limitation that we can only support -a, -C 
> and not support on -I now. I'm wondering if we could support "-I" in next step by 
> processing sampled data on the go.

-I is very tricky in a separate process. How do you align the two
intervals on long runs without drift? I don't know of a reliable
way to do it in the general case using only time.

Also, just the non-support for forking workloads without -a is fatal imho. That's
likely one of the most common cases.

Separate is a far better model imho:

- It is under full user control and no surprises
- No uncontrolled multiplexing
- Often it is fine to measure once and cache the data

It cannot deal with -I properly either (short of some form of
phase detection), but at least it doesn't give false promises
to that effect.

The way to do it is to have defaults in a json file
and the user can override them with a calibration step.
There is a JSON format that is used by some other tools.

This is my implementation:
https://github.com/andikleen/pmu-tools/blob/master/genretlat.py
https://github.com/andikleen/pmu-tools/blob/89861055b53e57ba0b7c6348745b2fbe6615c068/toplev.py#L1031


-Andi
RE: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Wang, Weilin 1 year, 11 months ago

> -----Original Message-----
> From: Andi Kleen <ak@linux.intel.com>
> Sent: Wednesday, March 13, 2024 8:55 AM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Namhyung Kim <namhyung@kernel.org>; Ian Rogers
> <irogers@google.com>; Arnaldo Carvalho de Melo <acme@kernel.org>; Peter
> Zijlstra <peterz@infradead.org>; Ingo Molnar <mingo@redhat.com>;
> Alexander Shishkin <alexander.shishkin@linux.intel.com>; Jiri Olsa
> <jolsa@kernel.org>; Hunter, Adrian <adrian.hunter@intel.com>; Kan Liang
> <kan.liang@linux.intel.com>; linux-perf-users@vger.kernel.org; linux-
> kernel@vger.kernel.org; Taylor, Perry <perry.taylor@intel.com>; Alt, Samantha
> <samantha.alt@intel.com>; Biggers, Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
> 
> On Wed, Mar 13, 2024 at 03:31:14PM +0000, Wang, Weilin wrote:
> >
> >
> > > -----Original Message-----
> > > From: Andi Kleen <ak@linux.intel.com>
> > > Sent: Tuesday, March 12, 2024 5:56 PM
> > > To: Wang, Weilin <weilin.wang@intel.com>
> > > Cc: Namhyung Kim <namhyung@kernel.org>; Ian Rogers
> > > <irogers@google.com>; Arnaldo Carvalho de Melo <acme@kernel.org>;
> Peter
> > > Zijlstra <peterz@infradead.org>; Ingo Molnar <mingo@redhat.com>;
> > > Alexander Shishkin <alexander.shishkin@linux.intel.com>; Jiri Olsa
> > > <jolsa@kernel.org>; Hunter, Adrian <adrian.hunter@intel.com>; Kan Liang
> > > <kan.liang@linux.intel.com>; linux-perf-users@vger.kernel.org; linux-
> > > kernel@vger.kernel.org; Taylor, Perry <perry.taylor@intel.com>; Alt,
> Samantha
> > > <samantha.alt@intel.com>; Biggers, Caleb <caleb.biggers@intel.com>
> > > Subject: Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record
> when
> > > perf stat needs to get retire latency value for a metric.
> > >
> > > "Wang, Weilin" <weilin.wang@intel.com> writes:
> > >
> > > >> -----Original Message-----
> > > >> From: Andi Kleen <ak@linux.intel.com>
> > > >> Sent: Tuesday, March 12, 2024 5:03 PM
> > > >> To: Wang, Weilin <weilin.wang@intel.com>
> > > >> Cc: Namhyung Kim <namhyung@kernel.org>; Ian Rogers
> > > >> <irogers@google.com>; Arnaldo Carvalho de Melo <acme@kernel.org>;
> > > Peter
> > > >> Zijlstra <peterz@infradead.org>; Ingo Molnar <mingo@redhat.com>;
> > > >> Alexander Shishkin <alexander.shishkin@linux.intel.com>; Jiri Olsa
> > > >> <jolsa@kernel.org>; Hunter, Adrian <adrian.hunter@intel.com>; Kan
> Liang
> > > >> <kan.liang@linux.intel.com>; linux-perf-users@vger.kernel.org; linux-
> > > >> kernel@vger.kernel.org; Taylor, Perry <perry.taylor@intel.com>; Alt,
> > > Samantha
> > > >> <samantha.alt@intel.com>; Biggers, Caleb <caleb.biggers@intel.com>
> > > >> Subject: Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record
> > > when
> > > >> perf stat needs to get retire latency value for a metric.
> > > >>
> > > >> weilin.wang@intel.com writes:
> > > >>
> > > >> > From: Weilin Wang <weilin.wang@intel.com>
> > > >> >
> > > >> > When retire_latency value is used in a metric formula, perf stat would
> fork
> > > a
> > > >> > perf record process with "-e" and "-W" options. Perf record will collect
> > > >> > required retire_latency values in parallel while perf stat is collecting
> > > >> > counting values.
> > > >>
> > > >> How does that work when the workload is specified on the command
> line?
> > > >> The workload would run twice? That is very inefficient and may not
> > > >> work if it's a large workload.
> > > >>
> > > >> The perf tool infrastructure is imho not up to the task of such
> > > >> parallel collection.
> > > >>
> > > >> Also it won't work for very long collections because you will get a
> > > >> very large perf.data. Better to use a pipeline.
> > > >>
> > > >> I think it would be better if you made it a separate operation that can
> > > >> generate a file that is then consumed by perf stat. This is also more
> efficient
> > > >> because often the calibration is only needed once. And it's all under
> > > >> user control so no nasty surprises.
> > > >>
> > > >
> > > > Workload runs only once with perf stat. Perf record is forked by perf stat
> and
> > > run
> > > > in parallel with perf stat. Perf stat will send perf record a signal to
> terminate
> > > after
> > > > perf stat stops collecting count value.
> > >
> > > I don't understand how the perf record filters on the workload created by
> > > the perf stat. At a minimum you would need -p to connect to the pid
> > > of the parent, but IIRC -p doesnt follow children, so if it forked
> > > it wouldn't work.
> > >
> > > I think your approach may only work with -a, but perhaps I'm missing
> > > something (-a is often not usable due to restrictions)
> > >
> > > Also if perf stat runs in interval mode and you only get the data
> > > at the end how would that work?
> > >
> > > iirc i wrestled with all these questions for toplev (which has a
> > > similar feature) and in the end i concluded doing it automatically
> > > has far too many problems.
> > >
> >
> > Yes, you are completely right that there are limitation that we can only
> support -a, -C
> > and not support on -I now. I'm wondering if we could support "-I" in next
> step by
> > processing sampled data on the go.
> 
> -I is very tricky in a separate process. How do you align the two
> intervals on a long runs without drift. I don't know of a reliable
> way to do it in the general case only using time.
> 
> Also just the non support for forking workloads without -a is fatal imho. That's
> likely one of the most common cases.
> 

We could use -a, -C and cgroup together. I think this could be a useful use case.
There could be other improvements to the implementation in a next step. But I believe
the current implementation could provide users access to our new feature with
accurate results and without adding too much overhead.

Thanks,
Weilin

> Separate is a far better model imho:
> 
> - It is under full user control and no surprises
> - No uncontrolled multiplexing
> - Often it is fine to measure once and cache the data
> 
> It cannot deal with -I properly either (short of some form of
> phase detection), but at least it doesn't give false promises
> to that effect.
> 
> The way to do it is to have defaults in a json file
> and the user can override them with a calibration step.
> There is a JSON format that is used by some other tools.
> 
> This is my implementation:
> https://github.com/andikleen/pmu-tools/blob/master/genretlat.py
> https://github.com/andikleen/pmu-
> tools/blob/89861055b53e57ba0b7c6348745b2fbe6615c068/toplev.py#L10
> 31
> 
> 
> -Andi
Re: [RFC PATCH v4 2/6] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Andi Kleen 1 year, 11 months ago
> We could use -a -C and cgroup together. I think this could be a useful use case.
> There could be other improvement to the implementation in next step. But I believe

I don't know how you would improve it. A lot of the problems are fairly
fundamental.

> current implementation could provide users the access to our new feature with 
> accurate results 
> and without adding too much overhead. 

perf record increases the overhead a lot over a perf stat! Sampling
is much more expensive than counting.

That should be at least an order of magnitude difference.
Another advantage of doing it separately.

That will also make it inaccurate.

Please do a proper implementation. This one is no good.

You can likely reuse a lot of your code:

- Add a perf calibrate to run the profile step separately that generates the JSON
- Add a --metrics option to perf to read the resulting JSON
- Add some mechanism to read a default JSON as fallback. I guess this
one could be compiled in to avoid a dependency on an installed file.

-Andi