[RFC PATCH v9 3/7] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.

weilin.wang@intel.com posted 7 patches 1 year, 8 months ago
There is a newer version of this series
[RFC PATCH v9 3/7] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by weilin.wang@intel.com 1 year, 8 months ago
From: Weilin Wang <weilin.wang@intel.com>

When retire_latency value is used in a metric formula, perf stat would fork a
perf record process with "-e" and "-W" options. Perf record will collect
required retire_latency values in parallel while perf stat is collecting
counting values.

At the point of time that perf stat stops counting, it would send sigterm signal
to perf record process and receiving sampling data back from perf record from a
pipe. Perf stat will then process the received data to get retire latency data
and calculate metric result.

Another thread is required to synchronize between perf stat and perf record
when we pass data through pipe.

Signed-off-by: Weilin Wang <weilin.wang@intel.com>
---
 tools/perf/builtin-stat.c     |  17 ++
 tools/perf/util/Build         |   1 +
 tools/perf/util/intel-tpebs.c | 301 ++++++++++++++++++++++++++++++++++
 tools/perf/util/intel-tpebs.h |  30 ++++
 tools/perf/util/stat.h        |   1 +
 5 files changed, 350 insertions(+)
 create mode 100644 tools/perf/util/intel-tpebs.c
 create mode 100644 tools/perf/util/intel-tpebs.h

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 428e9721b908..c0e9dfa3b3c2 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -70,6 +70,7 @@
 #include "util/bpf_counter.h"
 #include "util/iostat.h"
 #include "util/util.h"
+#include "util/intel-tpebs.h"
 #include "asm/bug.h"
 
 #include <linux/time64.h>
@@ -162,6 +163,7 @@ static struct perf_stat_config stat_config = {
 	.ctl_fd			= -1,
 	.ctl_fd_ack		= -1,
 	.iostat_run		= false,
+	.tpebs_pid              = -1,
 };
 
 static void evlist__check_cpu_maps(struct evlist *evlist)
@@ -653,6 +655,8 @@ static enum counter_recovery stat_handle_error(struct evsel *counter)
 
 	if (child_pid != -1)
 		kill(child_pid, SIGTERM);
+	if (stat_config.tpebs_pid != -1)
+		stop_tpebs();
 	return COUNTER_FATAL;
 }
 
@@ -673,6 +677,10 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	int err;
 	bool second_pass = false;
 
+	err = start_tpebs(&stat_config, evsel_list);
+	if (err < 0)
+		return err;
+
 	if (forks) {
 		if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe, workload_exec_failed_signal) < 0) {
 			perror("failed to prepare workload");
@@ -878,6 +886,10 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 
 	t1 = rdclock();
 
+	err = stop_tpebs();
+	if (err < 0)
+		return err;
+
 	if (stat_config.walltime_run_table)
 		stat_config.walltime_run[run_idx] = t1 - t0;
 
@@ -985,6 +997,9 @@ static void sig_atexit(void)
 	if (child_pid != -1)
 		kill(child_pid, SIGTERM);
 
+	if (stat_config.tpebs_pid != -1)
+		kill(stat_config.tpebs_pid, SIGTERM);
+
 	sigprocmask(SIG_SETMASK, &oset, NULL);
 
 	if (signr == -1)
@@ -2918,5 +2933,7 @@ int cmd_stat(int argc, const char **argv)
 	metricgroup__rblist_exit(&stat_config.metric_events);
 	evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack, &stat_config.ctl_fd_close);
 
+	tpebs_data__delete();
+
 	return status;
 }
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 292170a99ab6..c9f1d0bb6bf8 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -153,6 +153,7 @@ perf-y += clockid.o
 perf-y += list_sort.o
 perf-y += mutex.o
 perf-y += sharded_mutex.o
+perf-y += intel-tpebs.o
 
 perf-$(CONFIG_LIBBPF) += bpf_map.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
new file mode 100644
index 000000000000..13f65e39a845
--- /dev/null
+++ b/tools/perf/util/intel-tpebs.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * intel_tpebs.c: Intel TPEBS support
+ */
+
+
+#include <sys/param.h>
+#include <subcmd/run-command.h>
+#include <thread.h>
+#include "intel-tpebs.h"
+#include <linux/list.h>
+#include <linux/zalloc.h>
+#include <linux/err.h>
+#include "sample.h"
+#include "debug.h"
+#include "evlist.h"
+#include "evsel.h"
+#include "session.h"
+#include "tool.h"
+#include "metricgroup.h"
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <poll.h>
+
+#define PERF_DATA		"-"
+
+struct list_head tpebs_results = LIST_HEAD_INIT(tpebs_results);
+static pthread_t reader_thread;
+static struct child_process *cmd;
+static struct perf_stat_config *stat_config;
+static size_t tpebs_event_size;
+
+static int get_perf_record_args(const char **record_argv, char buf[])
+{
+	struct tpebs_retire_lat *e;
+	int i = 0;
+
+	pr_debug("Prepare perf record for retire_latency\n");
+
+	record_argv[i++] = "perf";
+	record_argv[i++] = "record";
+	record_argv[i++] = "-W";
+	record_argv[i++] = "--synth=no";
+	record_argv[i++] = buf;
+
+	if (stat_config->user_requested_cpu_list) {
+		record_argv[i++] = "-C";
+		record_argv[i++] = stat_config->user_requested_cpu_list;
+	}
+
+	if (stat_config->system_wide)
+		record_argv[i++] = "-a";
+
+	if (!stat_config->system_wide && !stat_config->user_requested_cpu_list) {
+		pr_err("Require -a or -C option to run sampling.\n");
+		return -ECANCELED;
+	}
+
+	list_for_each_entry(e, &tpebs_results, nd) {
+		record_argv[i++] = "-e";
+		record_argv[i++] = e->name;
+	}
+
+	record_argv[i++] = "-o";
+	record_argv[i++] = PERF_DATA;
+
+	return 0;
+}
+
+static int prepare_run_command(const char **argv)
+{
+	cmd = zalloc(sizeof(struct child_process));
+	if (!cmd)
+		return -ENOMEM;
+	cmd->argv = argv;
+	cmd->out = -1;
+	return 0;
+}
+
+static int prepare_perf_record(int control_fd[], int ack_fd[])
+{
+	const char **record_argv;
+	int ret;
+	char buf[32];
+
+	scnprintf(buf, sizeof(buf), "--control=fd:%d,%d", control_fd[0], ack_fd[1]);
+
+	record_argv = calloc(12 + 2 * tpebs_event_size, sizeof(char *));
+	if (!record_argv)
+		return -ENOMEM;
+
+	ret = get_perf_record_args(record_argv, buf);
+	if (ret)
+		goto out;
+
+	ret = prepare_run_command(record_argv);
+	if (ret)
+		goto out;
+	ret = start_command(cmd);
+out:
+	free(record_argv);
+	return ret;
+}
+struct sample_data_reader {
+	struct perf_tool	tool;
+	struct perf_session	*session;
+};
+
+static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
+{
+	zfree(&r->name);
+	zfree(&r->tpebs_name);
+	free(r);
+}
+
+void tpebs_data__delete(void)
+{
+	struct tpebs_retire_lat *r, *rtmp;
+
+	list_for_each_entry_safe(r, rtmp, &tpebs_results, nd) {
+		list_del_init(&r->nd);
+		tpebs_retire_lat__delete(r);
+	}
+	free(cmd);
+}
+
+static int process_sample_event(struct perf_tool *tool __maybe_unused,
+				union perf_event *event __maybe_unused,
+				struct perf_sample *sample,
+				struct evsel *evsel,
+				struct machine *machine __maybe_unused)
+{
+	int ret = 0;
+	const char *evname;
+	struct tpebs_retire_lat *t;
+
+	evname = evsel__name(evsel);
+
+	/*
+	 * Need to handle per core results? We are assuming average retire
+	 * latency value will be used. Save the number of samples and the sum of
+	 * retire latency value for each event.
+	 */
+	list_for_each_entry(t, &tpebs_results, nd) {
+		if (!strcmp(evname, t->name)) {
+			t->count += 1;
+			t->sum += sample->retire_lat;
+			t->val = (double) t->sum / t->count;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int process_feature_event(struct perf_session *session,
+				 union perf_event *event)
+{
+	if (event->feat.feat_id < HEADER_LAST_FEATURE)
+		return perf_event__process_feature(session, event);
+	return 0;
+}
+
+static void *__sample_reader(void *arg)
+{
+	struct child_process *child = arg;
+	struct perf_session *session;
+	struct perf_data data = {
+		.mode = PERF_DATA_MODE_READ,
+		.path = PERF_DATA,
+		.file.fd = child->out,
+	};
+	struct sample_data_reader reader = {
+		.tool = {
+		.sample		 = process_sample_event,
+		.feature	 = process_feature_event,
+		.attr		 = perf_event__process_attr,
+		},
+	};
+
+	session = perf_session__new(&data, &reader.tool);
+	if (IS_ERR(session))
+		return NULL;
+	reader.session = session;
+	perf_session__process_events(session);
+	perf_session__delete(session);
+
+	return NULL;
+}
+
+
+int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist *evsel_list)
+{
+	int ret = 0;
+	struct evsel *evsel;
+
+	stat_config = perf_stat_config;
+	/*
+	 * Prepare perf record for sampling event retire_latency before fork and
+	 * prepare workload
+	 */
+	evlist__for_each_entry(evsel_list, evsel) {
+		if (evsel->retire_lat) {
+			struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));
+			int i;
+			char *name;
+
+			pr_debug("perf stat retire latency %s required\n", evsel->name);
+			if (!new)
+				return -1;
+			for (i = strlen(evsel->name) - 1; i > 0; i--) {
+				if (evsel->name[i] == 'R')
+					break;
+			}
+			if (i <= 0 || evsel->name[i] != 'R')
+				return -1;
+
+			name = strdup(evsel->name);
+			if (!name)
+				return -ENOMEM;
+			name[i] = 'p';
+			new->name = strdup(name);
+			free(name);
+			new->tpebs_name = strdup(evsel->name);
+			if (!new->tpebs_name)
+				return -ENOMEM;
+			new->count = 0;
+			new->sum = 0;
+			list_add_tail(&new->nd, &tpebs_results);
+			tpebs_event_size += 1;
+		}
+	}
+
+	if (tpebs_event_size > 0) {
+		struct pollfd pollfd = { .events = POLLIN, };
+		int control_fd[2], ack_fd[2], len;
+		char ack_buf[8];
+
+		/*Create control and ack fd for --control*/
+		if (pipe(control_fd) < 0) {
+			pr_err("Failed to create control fifo");
+			return -1;
+		}
+		if (pipe(ack_fd) < 0) {
+			pr_err("Failed to create control fifo");
+			return -1;
+		}
+
+		ret = prepare_perf_record(control_fd, ack_fd);
+		if (ret)
+			return ret;
+		if (pthread_create(&reader_thread, NULL, __sample_reader, cmd)) {
+			kill(cmd->pid, SIGTERM);
+			close(cmd->out);
+			pr_err("Could not create thread to process sample data.\n");
+			return -1;
+		}
+		/* Wait for perf record initialization.*/
+		len = strlen("enable");
+		ret = write(control_fd[1], "enable", len);
+		if (ret != len) {
+			pr_err("perf record control write control message failed\n");
+			goto out;
+		}
+
+		pollfd.fd = ack_fd[0];
+		if (!poll(&pollfd, 1, 2000))
+			goto out;
+
+		ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
+		if (ret > 0)
+			ret = strcmp(ack_buf, "ack\n");
+		else {
+			pr_err("perf record control ack failed\n");
+			goto out;
+		}
+		pr_debug("Received ack from perf record\n");
+out:
+		close(control_fd[0]);
+		close(control_fd[1]);
+		close(ack_fd[0]);
+		close(ack_fd[1]);
+	}
+	return ret;
+}
+
+
+int stop_tpebs(void)
+{
+	int ret = 0;
+
+	if (tpebs_event_size > 0) {
+		kill(cmd->pid, SIGTERM);
+		pthread_join(reader_thread, NULL);
+		close(cmd->out);
+		ret = finish_command(cmd);
+		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
+			ret = 0;
+	}
+	return ret;
+}
diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h
new file mode 100644
index 000000000000..25e3e6729146
--- /dev/null
+++ b/tools/perf/util/intel-tpebs.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * intel_tpebs.h: Intel TEPBS support
+ */
+#include "stat.h"
+
+#ifndef INCLUDE__PERF_INTEL_TPEBS_H__
+#define INCLUDE__PERF_INTEL_TPEBS_H__
+
+struct tpebs_retire_lat {
+	struct list_head nd;
+	/* Event name */
+	const char *name;
+	/* Event name with the TPEBS modifier R */
+	const char *tpebs_name;
+	/* Count of retire_latency values found in sample data */
+	size_t count;
+	/* Sum of all the retire_latency values in sample data */
+	int sum;
+	/* Average of retire_latency, val = sum / count */
+	double val;
+};
+
+extern struct list_head tpebs_results;
+
+int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist *evsel_list);
+int stop_tpebs(void);
+void tpebs_data__delete(void);
+
+#endif
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index fd7a187551bd..6e86311bc75e 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -110,6 +110,7 @@ struct perf_stat_config {
 	struct cpu_aggr_map	*cpus_aggr_map;
 	u64			*walltime_run;
 	struct rblist		 metric_events;
+	pid_t			 tpebs_pid;
 	int			 ctl_fd;
 	int			 ctl_fd_ack;
 	bool			 ctl_fd_close;
-- 
2.43.0
Re: [RFC PATCH v9 3/7] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Namhyung Kim 1 year, 8 months ago
Hello,

On Tue, May 21, 2024 at 10:40 AM <weilin.wang@intel.com> wrote:
>
> From: Weilin Wang <weilin.wang@intel.com>
>
> When retire_latency value is used in a metric formula, perf stat would fork a
> perf record process with "-e" and "-W" options. Perf record will collect
> required retire_latency values in parallel while perf stat is collecting
> counting values.
>
> At the point of time that perf stat stops counting, it would send sigterm signal
> to perf record process and receiving sampling data back from perf record from a
> pipe. Perf stat will then process the received data to get retire latency data
> and calculate metric result.
>
> Another thread is required to synchronize between perf stat and perf record
> when we pass data through pipe.
>
> Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> ---
>  tools/perf/builtin-stat.c     |  17 ++
>  tools/perf/util/Build         |   1 +
>  tools/perf/util/intel-tpebs.c | 301 ++++++++++++++++++++++++++++++++++
>  tools/perf/util/intel-tpebs.h |  30 ++++
>  tools/perf/util/stat.h        |   1 +
>  5 files changed, 350 insertions(+)
>  create mode 100644 tools/perf/util/intel-tpebs.c
>  create mode 100644 tools/perf/util/intel-tpebs.h
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 428e9721b908..c0e9dfa3b3c2 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -70,6 +70,7 @@
>  #include "util/bpf_counter.h"
>  #include "util/iostat.h"
>  #include "util/util.h"
> +#include "util/intel-tpebs.h"
>  #include "asm/bug.h"
>
>  #include <linux/time64.h>
> @@ -162,6 +163,7 @@ static struct perf_stat_config stat_config = {
>         .ctl_fd                 = -1,
>         .ctl_fd_ack             = -1,
>         .iostat_run             = false,
> +       .tpebs_pid              = -1,

Where is this set?

>  };
>
>  static void evlist__check_cpu_maps(struct evlist *evlist)
> @@ -653,6 +655,8 @@ static enum counter_recovery stat_handle_error(struct evsel *counter)
>
>         if (child_pid != -1)
>                 kill(child_pid, SIGTERM);
> +       if (stat_config.tpebs_pid != -1)
> +               stop_tpebs();
>         return COUNTER_FATAL;
>  }
>
> @@ -673,6 +677,10 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
>         int err;
>         bool second_pass = false;
>
> +       err = start_tpebs(&stat_config, evsel_list);
> +       if (err < 0)
> +               return err;
> +
>         if (forks) {
>                 if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe, workload_exec_failed_signal) < 0) {
>                         perror("failed to prepare workload");
> @@ -878,6 +886,10 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
>
>         t1 = rdclock();
>
> +       err = stop_tpebs();
> +       if (err < 0)
> +               return err;
> +
>         if (stat_config.walltime_run_table)
>                 stat_config.walltime_run[run_idx] = t1 - t0;
>
> @@ -985,6 +997,9 @@ static void sig_atexit(void)
>         if (child_pid != -1)
>                 kill(child_pid, SIGTERM);
>
> +       if (stat_config.tpebs_pid != -1)
> +               kill(stat_config.tpebs_pid, SIGTERM);
> +
>         sigprocmask(SIG_SETMASK, &oset, NULL);
>
>         if (signr == -1)
> @@ -2918,5 +2933,7 @@ int cmd_stat(int argc, const char **argv)
>         metricgroup__rblist_exit(&stat_config.metric_events);
>         evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack, &stat_config.ctl_fd_close);
>
> +       tpebs_data__delete();
> +
>         return status;
>  }
> diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> index 292170a99ab6..c9f1d0bb6bf8 100644
> --- a/tools/perf/util/Build
> +++ b/tools/perf/util/Build
> @@ -153,6 +153,7 @@ perf-y += clockid.o
>  perf-y += list_sort.o
>  perf-y += mutex.o
>  perf-y += sharded_mutex.o
> +perf-y += intel-tpebs.o

Can we make this Intel (or x86) only?

>
>  perf-$(CONFIG_LIBBPF) += bpf_map.o
>  perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
> diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> new file mode 100644
> index 000000000000..13f65e39a845
> --- /dev/null
> +++ b/tools/perf/util/intel-tpebs.c
> @@ -0,0 +1,301 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * intel_tpebs.c: Intel TPEBS support
> + */
> +
> +
> +#include <sys/param.h>
> +#include <subcmd/run-command.h>
> +#include <thread.h>
> +#include "intel-tpebs.h"
> +#include <linux/list.h>
> +#include <linux/zalloc.h>
> +#include <linux/err.h>
> +#include "sample.h"
> +#include "debug.h"
> +#include "evlist.h"
> +#include "evsel.h"
> +#include "session.h"
> +#include "tool.h"
> +#include "metricgroup.h"
> +#include <sys/stat.h>
> +#include <sys/file.h>
> +#include <poll.h>
> +
> +#define PERF_DATA              "-"
> +
> +struct list_head tpebs_results = LIST_HEAD_INIT(tpebs_results);

static LIST_HEAD(tpebs_results);


> +static pthread_t reader_thread;
> +static struct child_process *cmd;

Maybe better to have the 'tpebs_' prefix.


> +static struct perf_stat_config *stat_config;

Is this really needed? ...


> +static size_t tpebs_event_size;
> +
> +static int get_perf_record_args(const char **record_argv, char buf[])

... I think you can just pass the stat_config argument
from start_tpebs().

> +{
> +       struct tpebs_retire_lat *e;
> +       int i = 0;
> +
> +       pr_debug("Prepare perf record for retire_latency\n");
> +
> +       record_argv[i++] = "perf";
> +       record_argv[i++] = "record";
> +       record_argv[i++] = "-W";
> +       record_argv[i++] = "--synth=no";
> +       record_argv[i++] = buf;
> +
> +       if (stat_config->user_requested_cpu_list) {
> +               record_argv[i++] = "-C";
> +               record_argv[i++] = stat_config->user_requested_cpu_list;
> +       }
> +
> +       if (stat_config->system_wide)
> +               record_argv[i++] = "-a";
> +
> +       if (!stat_config->system_wide && !stat_config->user_requested_cpu_list) {
> +               pr_err("Require -a or -C option to run sampling.\n");
> +               return -ECANCELED;
> +       }
> +
> +       list_for_each_entry(e, &tpebs_results, nd) {
> +               record_argv[i++] = "-e";
> +               record_argv[i++] = e->name;
> +       }
> +
> +       record_argv[i++] = "-o";
> +       record_argv[i++] = PERF_DATA;
> +
> +       return 0;
> +}
> +
> +static int prepare_run_command(const char **argv)
> +{
> +       cmd = zalloc(sizeof(struct child_process));
> +       if (!cmd)
> +               return -ENOMEM;
> +       cmd->argv = argv;
> +       cmd->out = -1;
> +       return 0;
> +}
> +
> +static int prepare_perf_record(int control_fd[], int ack_fd[])
> +{
> +       const char **record_argv;
> +       int ret;
> +       char buf[32];
> +
> +       scnprintf(buf, sizeof(buf), "--control=fd:%d,%d", control_fd[0], ack_fd[1]);
> +
> +       record_argv = calloc(12 + 2 * tpebs_event_size, sizeof(char *));
> +       if (!record_argv)
> +               return -ENOMEM;
> +
> +       ret = get_perf_record_args(record_argv, buf);
> +       if (ret)
> +               goto out;
> +
> +       ret = prepare_run_command(record_argv);
> +       if (ret)
> +               goto out;
> +       ret = start_command(cmd);
> +out:
> +       free(record_argv);
> +       return ret;
> +}

Please add a blank line.


> +struct sample_data_reader {
> +       struct perf_tool        tool;
> +       struct perf_session     *session;

You don't need this, right?  Why not pass the 'tool' directly?

> +};
> +
> +static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
> +{
> +       zfree(&r->name);
> +       zfree(&r->tpebs_name);
> +       free(r);
> +}
> +
> +void tpebs_data__delete(void)
> +{
> +       struct tpebs_retire_lat *r, *rtmp;
> +
> +       list_for_each_entry_safe(r, rtmp, &tpebs_results, nd) {
> +               list_del_init(&r->nd);
> +               tpebs_retire_lat__delete(r);
> +       }
> +       free(cmd);
> +}
> +
> +static int process_sample_event(struct perf_tool *tool __maybe_unused,
> +                               union perf_event *event __maybe_unused,
> +                               struct perf_sample *sample,
> +                               struct evsel *evsel,
> +                               struct machine *machine __maybe_unused)
> +{
> +       int ret = 0;
> +       const char *evname;
> +       struct tpebs_retire_lat *t;
> +
> +       evname = evsel__name(evsel);
> +
> +       /*
> +        * Need to handle per core results? We are assuming average retire

I don't know.  It depends on the use case where you want
different per-core retire-latency for the (per-core) metric.


> +        * latency value will be used. Save the number of samples and the sum of
> +        * retire latency value for each event.
> +        */
> +       list_for_each_entry(t, &tpebs_results, nd) {
> +               if (!strcmp(evname, t->name)) {
> +                       t->count += 1;
> +                       t->sum += sample->retire_lat;
> +                       t->val = (double) t->sum / t->count;
> +                       break;
> +               }
> +       }
> +
> +       return ret;
> +}
> +
> +static int process_feature_event(struct perf_session *session,
> +                                union perf_event *event)
> +{
> +       if (event->feat.feat_id < HEADER_LAST_FEATURE)
> +               return perf_event__process_feature(session, event);
> +       return 0;
> +}
> +
> +static void *__sample_reader(void *arg)
> +{
> +       struct child_process *child = arg;
> +       struct perf_session *session;
> +       struct perf_data data = {
> +               .mode = PERF_DATA_MODE_READ,
> +               .path = PERF_DATA,
> +               .file.fd = child->out,
> +       };
> +       struct sample_data_reader reader = {
> +               .tool = {
> +               .sample          = process_sample_event,
> +               .feature         = process_feature_event,
> +               .attr            = perf_event__process_attr,

Strange indentations.


> +               },
> +       };
> +
> +       session = perf_session__new(&data, &reader.tool);
> +       if (IS_ERR(session))
> +               return NULL;
> +       reader.session = session;
> +       perf_session__process_events(session);
> +       perf_session__delete(session);
> +
> +       return NULL;
> +}
> +
> +
> +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist *evsel_list)
> +{
> +       int ret = 0;
> +       struct evsel *evsel;
> +
> +       stat_config = perf_stat_config;
> +       /*
> +        * Prepare perf record for sampling event retire_latency before fork and
> +        * prepare workload
> +        */
> +       evlist__for_each_entry(evsel_list, evsel) {
> +               if (evsel->retire_lat) {

To reduce the indentation, consider early continue like

    if (!evsel->retire_lat)
        continue;


> +                       struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));

I prefer sizeof(*new) instead but it doesn't matter.  But you might use
zalloc() and forget about the 0 initialization later.

> +                       int i;
> +                       char *name;
> +
> +                       pr_debug("perf stat retire latency %s required\n", evsel->name);
> +                       if (!new)
> +                               return -1;
> +                       for (i = strlen(evsel->name) - 1; i > 0; i--) {
> +                               if (evsel->name[i] == 'R')
> +                                       break;

I remember Ian checked '/' and ':' too.  Otherwise it can detect
R in the event name and treat it as a modifier.

> +                       }
> +                       if (i <= 0 || evsel->name[i] != 'R')
> +                               return -1;

You need to free the memory here and other places.

> +
> +                       name = strdup(evsel->name);
> +                       if (!name)
> +                               return -ENOMEM;
> +                       name[i] = 'p';
> +                       new->name = strdup(name);
> +                       free(name);

Seems like an unnecessary allocation.  Why not use 'name'
directly?

> +                       new->tpebs_name = strdup(evsel->name);
> +                       if (!new->tpebs_name)
> +                               return -ENOMEM;

Maybe orig_name?  But I'm not sure if it's really needed..
Can we just teach perf record to understand 'R' and to act
like 'p'?


> +                       new->count = 0;
> +                       new->sum = 0;
> +                       list_add_tail(&new->nd, &tpebs_results);
> +                       tpebs_event_size += 1;
> +               }
> +       }
> +
> +       if (tpebs_event_size > 0) {
> +               struct pollfd pollfd = { .events = POLLIN, };
> +               int control_fd[2], ack_fd[2], len;
> +               char ack_buf[8];
> +
> +               /*Create control and ack fd for --control*/
> +               if (pipe(control_fd) < 0) {
> +                       pr_err("Failed to create control fifo");
> +                       return -1;
> +               }
> +               if (pipe(ack_fd) < 0) {
> +                       pr_err("Failed to create control fifo");
> +                       return -1;

Need to close the pipes here and other places.

> +               }
> +
> +               ret = prepare_perf_record(control_fd, ack_fd);
> +               if (ret)
> +                       return ret;
> +               if (pthread_create(&reader_thread, NULL, __sample_reader, cmd)) {
> +                       kill(cmd->pid, SIGTERM);
> +                       close(cmd->out);
> +                       pr_err("Could not create thread to process sample data.\n");
> +                       return -1;
> +               }
> +               /* Wait for perf record initialization.*/
> +               len = strlen("enable");
> +               ret = write(control_fd[1], "enable", len);
> +               if (ret != len) {
> +                       pr_err("perf record control write control message failed\n");
> +                       goto out;
> +               }
> +
> +               pollfd.fd = ack_fd[0];
> +               if (!poll(&pollfd, 1, 2000))
> +                       goto out;

Do we need this?  Why not just read?

> +
> +               ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
> +               if (ret > 0)
> +                       ret = strcmp(ack_buf, "ack\n");
> +               else {
> +                       pr_err("perf record control ack failed\n");
> +                       goto out;
> +               }
> +               pr_debug("Received ack from perf record\n");
> +out:
> +               close(control_fd[0]);
> +               close(control_fd[1]);
> +               close(ack_fd[0]);
> +               close(ack_fd[1]);
> +       }
> +       return ret;
> +}
> +
> +
> +int stop_tpebs(void)
> +{
> +       int ret = 0;
> +
> +       if (tpebs_event_size > 0) {
> +               kill(cmd->pid, SIGTERM);
> +               pthread_join(reader_thread, NULL);
> +               close(cmd->out);
> +               ret = finish_command(cmd);
> +               if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> +                       ret = 0;
> +       }
> +       return ret;
> +}
> diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h
> new file mode 100644
> index 000000000000..25e3e6729146
> --- /dev/null
> +++ b/tools/perf/util/intel-tpebs.h
> @@ -0,0 +1,30 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * intel_tpebs.h: Intel TEPBS support
> + */
> +#include "stat.h"
> +
> +#ifndef INCLUDE__PERF_INTEL_TPEBS_H__
> +#define INCLUDE__PERF_INTEL_TPEBS_H__
> +
> +struct tpebs_retire_lat {
> +       struct list_head nd;
> +       /* Event name */
> +       const char *name;
> +       /* Event name with the TPEBS modifier R */
> +       const char *tpebs_name;
> +       /* Count of retire_latency values found in sample data */
> +       size_t count;
> +       /* Sum of all the retire_latency values in sample data */
> +       int sum;
> +       /* Average of retire_latency, val = sum / count */
> +       double val;
> +};
> +
> +extern struct list_head tpebs_results;
> +
> +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist *evsel_list);
> +int stop_tpebs(void);
> +void tpebs_data__delete(void);

I think it's better to have the same prefix.

tpebs_start()
tpebs_stop()
tpebs_delete()

Thanks,
Namhyung

> +
> +#endif
> diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> index fd7a187551bd..6e86311bc75e 100644
> --- a/tools/perf/util/stat.h
> +++ b/tools/perf/util/stat.h
> @@ -110,6 +110,7 @@ struct perf_stat_config {
>         struct cpu_aggr_map     *cpus_aggr_map;
>         u64                     *walltime_run;
>         struct rblist            metric_events;
> +       pid_t                    tpebs_pid;
>         int                      ctl_fd;
>         int                      ctl_fd_ack;
>         bool                     ctl_fd_close;
> --
> 2.43.0
>
RE: [RFC PATCH v9 3/7] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Wang, Weilin 1 year, 8 months ago

> -----Original Message-----
> From: Namhyung Kim <namhyung@kernel.org>
> Sent: Friday, May 24, 2024 4:02 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v9 3/7] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
> 
> Hello,
> 
> On Tue, May 21, 2024 at 10:40 AM <weilin.wang@intel.com> wrote:
> >
> > From: Weilin Wang <weilin.wang@intel.com>
> >
> > When retire_latency value is used in a metric formula, perf stat would fork a
> > perf record process with "-e" and "-W" options. Perf record will collect
> > required retire_latency values in parallel while perf stat is collecting
> > counting values.
> >
> > At the point of time that perf stat stops counting, it would send sigterm
> signal
> > to perf record process and receiving sampling data back from perf record
> from a
> > pipe. Perf stat will then process the received data to get retire latency data
> > and calculate metric result.
> >
> > Another thread is required to synchronize between perf stat and perf record
> > when we pass data through pipe.
> >
> > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > ---
> >  tools/perf/builtin-stat.c     |  17 ++
> >  tools/perf/util/Build         |   1 +
> >  tools/perf/util/intel-tpebs.c | 301
> ++++++++++++++++++++++++++++++++++
> >  tools/perf/util/intel-tpebs.h |  30 ++++
> >  tools/perf/util/stat.h        |   1 +
> >  5 files changed, 350 insertions(+)
> >  create mode 100644 tools/perf/util/intel-tpebs.c
> >  create mode 100644 tools/perf/util/intel-tpebs.h
> >
> > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > index 428e9721b908..c0e9dfa3b3c2 100644
> > --- a/tools/perf/builtin-stat.c
> > +++ b/tools/perf/builtin-stat.c
> > @@ -70,6 +70,7 @@
> >  #include "util/bpf_counter.h"
> >  #include "util/iostat.h"
> >  #include "util/util.h"
> > +#include "util/intel-tpebs.h"
> >  #include "asm/bug.h"
> >
> >  #include <linux/time64.h>
> > @@ -162,6 +163,7 @@ static struct perf_stat_config stat_config = {
> >         .ctl_fd                 = -1,
> >         .ctl_fd_ack             = -1,
> >         .iostat_run             = false,
> > +       .tpebs_pid              = -1,
> 
> Where is this set?
> 
> >  };
> >
> >  static void evlist__check_cpu_maps(struct evlist *evlist)
> > @@ -653,6 +655,8 @@ static enum counter_recovery
> stat_handle_error(struct evsel *counter)
> >
> >         if (child_pid != -1)
> >                 kill(child_pid, SIGTERM);
> > +       if (stat_config.tpebs_pid != -1)
> > +               stop_tpebs();
> >         return COUNTER_FATAL;
> >  }
> >
> > @@ -673,6 +677,10 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> >         int err;
> >         bool second_pass = false;
> >
> > +       err = start_tpebs(&stat_config, evsel_list);
> > +       if (err < 0)
> > +               return err;
> > +
> >         if (forks) {
> >                 if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe,
> workload_exec_failed_signal) < 0) {
> >                         perror("failed to prepare workload");
> > @@ -878,6 +886,10 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> >
> >         t1 = rdclock();
> >
> > +       err = stop_tpebs();
> > +       if (err < 0)
> > +               return err;
> > +
> >         if (stat_config.walltime_run_table)
> >                 stat_config.walltime_run[run_idx] = t1 - t0;
> >
> > @@ -985,6 +997,9 @@ static void sig_atexit(void)
> >         if (child_pid != -1)
> >                 kill(child_pid, SIGTERM);
> >
> > +       if (stat_config.tpebs_pid != -1)
> > +               kill(stat_config.tpebs_pid, SIGTERM);
> > +
> >         sigprocmask(SIG_SETMASK, &oset, NULL);
> >
> >         if (signr == -1)
> > @@ -2918,5 +2933,7 @@ int cmd_stat(int argc, const char **argv)
> >         metricgroup__rblist_exit(&stat_config.metric_events);
> >         evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack,
> &stat_config.ctl_fd_close);
> >
> > +       tpebs_data__delete();
> > +
> >         return status;
> >  }
> > diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> > index 292170a99ab6..c9f1d0bb6bf8 100644
> > --- a/tools/perf/util/Build
> > +++ b/tools/perf/util/Build
> > @@ -153,6 +153,7 @@ perf-y += clockid.o
> >  perf-y += list_sort.o
> >  perf-y += mutex.o
> >  perf-y += sharded_mutex.o
> > +perf-y += intel-tpebs.o
> 
> Can we make this Intel (or x86) only?
> 
> >
> >  perf-$(CONFIG_LIBBPF) += bpf_map.o
> >  perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
> > diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> > new file mode 100644
> > index 000000000000..13f65e39a845
> > --- /dev/null
> > +++ b/tools/perf/util/intel-tpebs.c
> > @@ -0,0 +1,301 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * intel_tpebs.c: Intel TPEBS support
> > + */
> > +
> > +
> > +#include <sys/param.h>
> > +#include <subcmd/run-command.h>
> > +#include <thread.h>
> > +#include "intel-tpebs.h"
> > +#include <linux/list.h>
> > +#include <linux/zalloc.h>
> > +#include <linux/err.h>
> > +#include "sample.h"
> > +#include "debug.h"
> > +#include "evlist.h"
> > +#include "evsel.h"
> > +#include "session.h"
> > +#include "tool.h"
> > +#include "metricgroup.h"
> > +#include <sys/stat.h>
> > +#include <sys/file.h>
> > +#include <poll.h>
> > +
> > +#define PERF_DATA              "-"
> > +
> > +struct list_head tpebs_results = LIST_HEAD_INIT(tpebs_results);
> 
> static LIST_HEAD(tpebs_results);
> 
> 
> > +static pthread_t reader_thread;
> > +static struct child_process *cmd;
> 
> Maybe better to have the 'tpebs_' prefix.
> 
> 
> > +static struct perf_stat_config *stat_config;
> 
> Is this really needed? ...
> 
> 
> > +static size_t tpebs_event_size;
> > +
> > +static int get_perf_record_args(const char **record_argv, char buf[])
> 
> ... I think you can just pass the stat_config argument
> from start_tpebs().
> 
> > +{
> > +       struct tpebs_retire_lat *e;
> > +       int i = 0;
> > +
> > +       pr_debug("Prepare perf record for retire_latency\n");
> > +
> > +       record_argv[i++] = "perf";
> > +       record_argv[i++] = "record";
> > +       record_argv[i++] = "-W";
> > +       record_argv[i++] = "--synth=no";
> > +       record_argv[i++] = buf;
> > +
> > +       if (stat_config->user_requested_cpu_list) {
> > +               record_argv[i++] = "-C";
> > +               record_argv[i++] = stat_config->user_requested_cpu_list;
> > +       }
> > +
> > +       if (stat_config->system_wide)
> > +               record_argv[i++] = "-a";
> > +
> > +       if (!stat_config->system_wide && !stat_config-
> >user_requested_cpu_list) {
> > +               pr_err("Require -a or -C option to run sampling.\n");
> > +               return -ECANCELED;
> > +       }
> > +
> > +       list_for_each_entry(e, &tpebs_results, nd) {
> > +               record_argv[i++] = "-e";
> > +               record_argv[i++] = e->name;
> > +       }
> > +
> > +       record_argv[i++] = "-o";
> > +       record_argv[i++] = PERF_DATA;
> > +
> > +       return 0;
> > +}
> > +
> > +static int prepare_run_command(const char **argv)
> > +{
> > +       cmd = zalloc(sizeof(struct child_process));
> > +       if (!cmd)
> > +               return -ENOMEM;
> > +       cmd->argv = argv;
> > +       cmd->out = -1;
> > +       return 0;
> > +}
> > +
> > +static int prepare_perf_record(int control_fd[], int ack_fd[])
> > +{
> > +       const char **record_argv;
> > +       int ret;
> > +       char buf[32];
> > +
> > +       scnprintf(buf, sizeof(buf), "--control=fd:%d,%d", control_fd[0],
> ack_fd[1]);
> > +
> > +       record_argv = calloc(12 + 2 * tpebs_event_size, sizeof(char *));
> > +       if (!record_argv)
> > +               return -ENOMEM;
> > +
> > +       ret = get_perf_record_args(record_argv, buf);
> > +       if (ret)
> > +               goto out;
> > +
> > +       ret = prepare_run_command(record_argv);
> > +       if (ret)
> > +               goto out;
> > +       ret = start_command(cmd);
> > +out:
> > +       free(record_argv);
> > +       return ret;
> > +}
> 
> Please add a blank line.
> 
> 
> > +struct sample_data_reader {
> > +       struct perf_tool        tool;
> > +       struct perf_session     *session;
> 
> You don't need this, right?  Why not pass the 'tool' directly?
> 
> > +};
> > +
> > +static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
> > +{
> > +       zfree(&r->name);
> > +       zfree(&r->tpebs_name);
> > +       free(r);
> > +}
> > +
> > +void tpebs_data__delete(void)
> > +{
> > +       struct tpebs_retire_lat *r, *rtmp;
> > +
> > +       list_for_each_entry_safe(r, rtmp, &tpebs_results, nd) {
> > +               list_del_init(&r->nd);
> > +               tpebs_retire_lat__delete(r);
> > +       }
> > +       free(cmd);
> > +}
> > +
> > +static int process_sample_event(struct perf_tool *tool __maybe_unused,
> > +                               union perf_event *event __maybe_unused,
> > +                               struct perf_sample *sample,
> > +                               struct evsel *evsel,
> > +                               struct machine *machine __maybe_unused)
> > +{
> > +       int ret = 0;
> > +       const char *evname;
> > +       struct tpebs_retire_lat *t;
> > +
> > +       evname = evsel__name(evsel);
> > +
> > +       /*
> > +        * Need to handle per core results? We are assuming average retire
> 
> I don't know.  It depends on the use case where you want
> different per-core retire-latency for the (per-core) metric.
> 
> 
> > +        * latency value will be used. Save the number of samples and the sum
> of
> > +        * retire latency value for each event.
> > +        */
> > +       list_for_each_entry(t, &tpebs_results, nd) {
> > +               if (!strcmp(evname, t->name)) {
> > +                       t->count += 1;
> > +                       t->sum += sample->retire_lat;
> > +                       t->val = (double) t->sum / t->count;
> > +                       break;
> > +               }
> > +       }
> > +
> > +       return ret;
> > +}
> > +
> > +static int process_feature_event(struct perf_session *session,
> > +                                union perf_event *event)
> > +{
> > +       if (event->feat.feat_id < HEADER_LAST_FEATURE)
> > +               return perf_event__process_feature(session, event);
> > +       return 0;
> > +}
> > +
> > +static void *__sample_reader(void *arg)
> > +{
> > +       struct child_process *child = arg;
> > +       struct perf_session *session;
> > +       struct perf_data data = {
> > +               .mode = PERF_DATA_MODE_READ,
> > +               .path = PERF_DATA,
> > +               .file.fd = child->out,
> > +       };
> > +       struct sample_data_reader reader = {
> > +               .tool = {
> > +               .sample          = process_sample_event,
> > +               .feature         = process_feature_event,
> > +               .attr            = perf_event__process_attr,
> 
> Strange indentations.
> 
> 
> > +               },
> > +       };
> > +
> > +       session = perf_session__new(&data, &reader.tool);
> > +       if (IS_ERR(session))
> > +               return NULL;
> > +       reader.session = session;
> > +       perf_session__process_events(session);
> > +       perf_session__delete(session);
> > +
> > +       return NULL;
> > +}
> > +
> > +
> > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> *evsel_list)
> > +{
> > +       int ret = 0;
> > +       struct evsel *evsel;
> > +
> > +       stat_config = perf_stat_config;
> > +       /*
> > +        * Prepare perf record for sampling event retire_latency before fork and
> > +        * prepare workload
> > +        */
> > +       evlist__for_each_entry(evsel_list, evsel) {
> > +               if (evsel->retire_lat) {
> 
> To reduce the indentation, consider early continue like
> 
>     if (!evsel->retire_lat)
>         continue;
> 
> 
> > +                       struct tpebs_retire_lat *new = malloc(sizeof(struct
> tpebs_retire_lat));
> 
> I prefer sizeof(*new) instead but it doesn't matter.  But you might use
> zalloc() and forget about the 0 initialization later.
> 
> > +                       int i;
> > +                       char *name;
> > +
> > +                       pr_debug("perf stat retire latency %s required\n", evsel-
> >name);
> > +                       if (!new)
> > +                               return -1;
> > +                       for (i = strlen(evsel->name) - 1; i > 0; i--) {
> > +                               if (evsel->name[i] == 'R')
> > +                                       break;
> 
> I remember Ian checked '/' and ':' too.  Otherwise it can detect
> R in the event name and treat it as a modifier.

Sorry, I forgot this one in last email. This patch set depends on Ian's 'R' parser 
code patch. When we reach here, the evsel->name should be in the format 
like event_name:R or cpu@event_name@R. This code is to replace 'R' with 'p' 
for perf record. Therefore, I think we just need to find the position of the last 'R'.

Thanks,
Weilin 

> 
> > +                       }
> > +                       if (i <= 0 || evsel->name[i] != 'R')
> > +                               return -1;
> 
> You need to free the memory here and other places.
> 
> > +
> > +                       name = strdup(evsel->name);
> > +                       if (!name)
> > +                               return -ENOMEM;
> > +                       name[i] = 'p';
> > +                       new->name = strdup(name);
> > +                       free(name);
> 
> Seems like an unnecessary allocation.  Why not use 'name'
> directly?
> 
> > +                       new->tpebs_name = strdup(evsel->name);
> > +                       if (!new->tpebs_name)
> > +                               return -ENOMEM;
> 
> Maybe orig_name?  But I'm not sure if it's really needed..
> Can we just teach perf record to understand 'R' and to act
> like 'p'?
> 
> 
> > +                       new->count = 0;
> > +                       new->sum = 0;
> > +                       list_add_tail(&new->nd, &tpebs_results);
> > +                       tpebs_event_size += 1;
> > +               }
> > +       }
> > +
> > +       if (tpebs_event_size > 0) {
> > +               struct pollfd pollfd = { .events = POLLIN, };
> > +               int control_fd[2], ack_fd[2], len;
> > +               char ack_buf[8];
> > +
> > +               /*Create control and ack fd for --control*/
> > +               if (pipe(control_fd) < 0) {
> > +                       pr_err("Failed to create control fifo");
> > +                       return -1;
> > +               }
> > +               if (pipe(ack_fd) < 0) {
> > +                       pr_err("Failed to create control fifo");
> > +                       return -1;
> 
> Need to close the pipes here and other places.
> 
> > +               }
> > +
> > +               ret = prepare_perf_record(control_fd, ack_fd);
> > +               if (ret)
> > +                       return ret;
> > +               if (pthread_create(&reader_thread, NULL, __sample_reader, cmd)) {
> > +                       kill(cmd->pid, SIGTERM);
> > +                       close(cmd->out);
> > +                       pr_err("Could not create thread to process sample data.\n");
> > +                       return -1;
> > +               }
> > +               /* Wait for perf record initialization.*/
> > +               len = strlen("enable");
> > +               ret = write(control_fd[1], "enable", len);
> > +               if (ret != len) {
> > +                       pr_err("perf record control write control message failed\n");
> > +                       goto out;
> > +               }
> > +
> > +               pollfd.fd = ack_fd[0];
> > +               if (!poll(&pollfd, 1, 2000))
> > +                       goto out;
> 
> Do we need this?  Why not just read?
> 
> > +
> > +               ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
> > +               if (ret > 0)
> > +                       ret = strcmp(ack_buf, "ack\n");
> > +               else {
> > +                       pr_err("perf record control ack failed\n");
> > +                       goto out;
> > +               }
> > +               pr_debug("Received ack from perf record\n");
> > +out:
> > +               close(control_fd[0]);
> > +               close(control_fd[1]);
> > +               close(ack_fd[0]);
> > +               close(ack_fd[1]);
> > +       }
> > +       return ret;
> > +}
> > +
> > +
> > +int stop_tpebs(void)
> > +{
> > +       int ret = 0;
> > +
> > +       if (tpebs_event_size > 0) {
> > +               kill(cmd->pid, SIGTERM);
> > +               pthread_join(reader_thread, NULL);
> > +               close(cmd->out);
> > +               ret = finish_command(cmd);
> > +               if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> > +                       ret = 0;
> > +       }
> > +       return ret;
> > +}
> > diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h
> > new file mode 100644
> > index 000000000000..25e3e6729146
> > --- /dev/null
> > +++ b/tools/perf/util/intel-tpebs.h
> > @@ -0,0 +1,30 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * intel_tpebs.h: Intel TEPBS support
> > + */
> > +#include "stat.h"
> > +
> > +#ifndef INCLUDE__PERF_INTEL_TPEBS_H__
> > +#define INCLUDE__PERF_INTEL_TPEBS_H__
> > +
> > +struct tpebs_retire_lat {
> > +       struct list_head nd;
> > +       /* Event name */
> > +       const char *name;
> > +       /* Event name with the TPEBS modifier R */
> > +       const char *tpebs_name;
> > +       /* Count of retire_latency values found in sample data */
> > +       size_t count;
> > +       /* Sum of all the retire_latency values in sample data */
> > +       int sum;
> > +       /* Average of retire_latency, val = sum / count */
> > +       double val;
> > +};
> > +
> > +extern struct list_head tpebs_results;
> > +
> > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> *evsel_list);
> > +int stop_tpebs(void);
> > +void tpebs_data__delete(void);
> 
> I think it's better to have the same prefix.
> 
> tpebs_start()
> tpebs_stop()
> tpebs_delete()
> 
> Thanks,
> Namhyung
> 
> > +
> > +#endif
> > diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> > index fd7a187551bd..6e86311bc75e 100644
> > --- a/tools/perf/util/stat.h
> > +++ b/tools/perf/util/stat.h
> > @@ -110,6 +110,7 @@ struct perf_stat_config {
> >         struct cpu_aggr_map     *cpus_aggr_map;
> >         u64                     *walltime_run;
> >         struct rblist            metric_events;
> > +       pid_t                    tpebs_pid;
> >         int                      ctl_fd;
> >         int                      ctl_fd_ack;
> >         bool                     ctl_fd_close;
> > --
> > 2.43.0
> >
RE: [RFC PATCH v9 3/7] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Wang, Weilin 1 year, 8 months ago

> -----Original Message-----
> From: Namhyung Kim <namhyung@kernel.org>
> Sent: Friday, May 24, 2024 4:02 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v9 3/7] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
> 
> Hello,
> 
> On Tue, May 21, 2024 at 10:40 AM <weilin.wang@intel.com> wrote:
> >
> > From: Weilin Wang <weilin.wang@intel.com>
> >
> > When retire_latency value is used in a metric formula, perf stat would fork a
> > perf record process with "-e" and "-W" options. Perf record will collect
> > required retire_latency values in parallel while perf stat is collecting
> > counting values.
> >
> > At the point of time that perf stat stops counting, it would send sigterm
> signal
> > to perf record process and receiving sampling data back from perf record
> from a
> > pipe. Perf stat will then process the received data to get retire latency data
> > and calculate metric result.
> >
> > Another thread is required to synchronize between perf stat and perf record
> > when we pass data through pipe.
> >
> > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > ---
> >  tools/perf/builtin-stat.c     |  17 ++
> >  tools/perf/util/Build         |   1 +
> >  tools/perf/util/intel-tpebs.c | 301
> ++++++++++++++++++++++++++++++++++
> >  tools/perf/util/intel-tpebs.h |  30 ++++
> >  tools/perf/util/stat.h        |   1 +
> >  5 files changed, 350 insertions(+)
> >  create mode 100644 tools/perf/util/intel-tpebs.c
> >  create mode 100644 tools/perf/util/intel-tpebs.h
> >
> > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > index 428e9721b908..c0e9dfa3b3c2 100644
> > --- a/tools/perf/builtin-stat.c
> > +++ b/tools/perf/builtin-stat.c
> > @@ -70,6 +70,7 @@
> >  #include "util/bpf_counter.h"
> >  #include "util/iostat.h"
> >  #include "util/util.h"
> > +#include "util/intel-tpebs.h"
> >  #include "asm/bug.h"
> >
> >  #include <linux/time64.h>
> > @@ -162,6 +163,7 @@ static struct perf_stat_config stat_config = {
> >         .ctl_fd                 = -1,
> >         .ctl_fd_ack             = -1,
> >         .iostat_run             = false,
> > +       .tpebs_pid              = -1,
> 
> Where is this set?
> 
> >  };
> >
> >  static void evlist__check_cpu_maps(struct evlist *evlist)
> > @@ -653,6 +655,8 @@ static enum counter_recovery
> stat_handle_error(struct evsel *counter)
> >
> >         if (child_pid != -1)
> >                 kill(child_pid, SIGTERM);
> > +       if (stat_config.tpebs_pid != -1)
> > +               stop_tpebs();
> >         return COUNTER_FATAL;
> >  }
> >
> > @@ -673,6 +677,10 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> >         int err;
> >         bool second_pass = false;
> >
> > +       err = start_tpebs(&stat_config, evsel_list);
> > +       if (err < 0)
> > +               return err;
> > +
> >         if (forks) {
> >                 if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe,
> workload_exec_failed_signal) < 0) {
> >                         perror("failed to prepare workload");
> > @@ -878,6 +886,10 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> >
> >         t1 = rdclock();
> >
> > +       err = stop_tpebs();
> > +       if (err < 0)
> > +               return err;
> > +
> >         if (stat_config.walltime_run_table)
> >                 stat_config.walltime_run[run_idx] = t1 - t0;
> >
> > @@ -985,6 +997,9 @@ static void sig_atexit(void)
> >         if (child_pid != -1)
> >                 kill(child_pid, SIGTERM);
> >
> > +       if (stat_config.tpebs_pid != -1)
> > +               kill(stat_config.tpebs_pid, SIGTERM);
> > +
> >         sigprocmask(SIG_SETMASK, &oset, NULL);
> >
> >         if (signr == -1)
> > @@ -2918,5 +2933,7 @@ int cmd_stat(int argc, const char **argv)
> >         metricgroup__rblist_exit(&stat_config.metric_events);
> >         evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack,
> &stat_config.ctl_fd_close);
> >
> > +       tpebs_data__delete();
> > +
> >         return status;
> >  }
> > diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> > index 292170a99ab6..c9f1d0bb6bf8 100644
> > --- a/tools/perf/util/Build
> > +++ b/tools/perf/util/Build
> > @@ -153,6 +153,7 @@ perf-y += clockid.o
> >  perf-y += list_sort.o
> >  perf-y += mutex.o
> >  perf-y += sharded_mutex.o
> > +perf-y += intel-tpebs.o
> 
> Can we make this Intel (or x86) only?
> 
> >
> >  perf-$(CONFIG_LIBBPF) += bpf_map.o
> >  perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
> > diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> > new file mode 100644
> > index 000000000000..13f65e39a845
> > --- /dev/null
> > +++ b/tools/perf/util/intel-tpebs.c
> > @@ -0,0 +1,301 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * intel_tpebs.c: Intel TPEBS support
> > + */
> > +
> > +
> > +#include <sys/param.h>
> > +#include <subcmd/run-command.h>
> > +#include <thread.h>
> > +#include "intel-tpebs.h"
> > +#include <linux/list.h>
> > +#include <linux/zalloc.h>
> > +#include <linux/err.h>
> > +#include "sample.h"
> > +#include "debug.h"
> > +#include "evlist.h"
> > +#include "evsel.h"
> > +#include "session.h"
> > +#include "tool.h"
> > +#include "metricgroup.h"
> > +#include <sys/stat.h>
> > +#include <sys/file.h>
> > +#include <poll.h>
> > +
> > +#define PERF_DATA              "-"
> > +
> > +struct list_head tpebs_results = LIST_HEAD_INIT(tpebs_results);
> 
> static LIST_HEAD(tpebs_results);
> 
> 
> > +static pthread_t reader_thread;
> > +static struct child_process *cmd;
> 
> Maybe better to have the 'tpebs_' prefix.
> 
> 
> > +static struct perf_stat_config *stat_config;
> 
> Is this really needed? ...
> 
> 
> > +static size_t tpebs_event_size;
> > +
> > +static int get_perf_record_args(const char **record_argv, char buf[])
> 
> ... I think you can just pass the stat_config argument
> from start_tpebs().
> 
> > +{
> > +       struct tpebs_retire_lat *e;
> > +       int i = 0;
> > +
> > +       pr_debug("Prepare perf record for retire_latency\n");
> > +
> > +       record_argv[i++] = "perf";
> > +       record_argv[i++] = "record";
> > +       record_argv[i++] = "-W";
> > +       record_argv[i++] = "--synth=no";
> > +       record_argv[i++] = buf;
> > +
> > +       if (stat_config->user_requested_cpu_list) {
> > +               record_argv[i++] = "-C";
> > +               record_argv[i++] = stat_config->user_requested_cpu_list;
> > +       }
> > +
> > +       if (stat_config->system_wide)
> > +               record_argv[i++] = "-a";
> > +
> > +       if (!stat_config->system_wide && !stat_config-
> >user_requested_cpu_list) {
> > +               pr_err("Require -a or -C option to run sampling.\n");
> > +               return -ECANCELED;
> > +       }
> > +
> > +       list_for_each_entry(e, &tpebs_results, nd) {
> > +               record_argv[i++] = "-e";
> > +               record_argv[i++] = e->name;
> > +       }
> > +
> > +       record_argv[i++] = "-o";
> > +       record_argv[i++] = PERF_DATA;
> > +
> > +       return 0;
> > +}
> > +
> > +static int prepare_run_command(const char **argv)
> > +{
> > +       cmd = zalloc(sizeof(struct child_process));
> > +       if (!cmd)
> > +               return -ENOMEM;
> > +       cmd->argv = argv;
> > +       cmd->out = -1;
> > +       return 0;
> > +}
> > +
> > +static int prepare_perf_record(int control_fd[], int ack_fd[])
> > +{
> > +       const char **record_argv;
> > +       int ret;
> > +       char buf[32];
> > +
> > +       scnprintf(buf, sizeof(buf), "--control=fd:%d,%d", control_fd[0],
> ack_fd[1]);
> > +
> > +       record_argv = calloc(12 + 2 * tpebs_event_size, sizeof(char *));
> > +       if (!record_argv)
> > +               return -ENOMEM;
> > +
> > +       ret = get_perf_record_args(record_argv, buf);
> > +       if (ret)
> > +               goto out;
> > +
> > +       ret = prepare_run_command(record_argv);
> > +       if (ret)
> > +               goto out;
> > +       ret = start_command(cmd);
> > +out:
> > +       free(record_argv);
> > +       return ret;
> > +}
> 
> Please add a blank line.
> 
> 
> > +struct sample_data_reader {
> > +       struct perf_tool        tool;
> > +       struct perf_session     *session;
> 
> You don't need this, right?  Why not pass the 'tool' directly?
> 
> > +};
> > +
> > +static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
> > +{
> > +       zfree(&r->name);
> > +       zfree(&r->tpebs_name);
> > +       free(r);
> > +}
> > +
> > +void tpebs_data__delete(void)
> > +{
> > +       struct tpebs_retire_lat *r, *rtmp;
> > +
> > +       list_for_each_entry_safe(r, rtmp, &tpebs_results, nd) {
> > +               list_del_init(&r->nd);
> > +               tpebs_retire_lat__delete(r);
> > +       }
> > +       free(cmd);
> > +}
> > +
> > +static int process_sample_event(struct perf_tool *tool __maybe_unused,
> > +                               union perf_event *event __maybe_unused,
> > +                               struct perf_sample *sample,
> > +                               struct evsel *evsel,
> > +                               struct machine *machine __maybe_unused)
> > +{
> > +       int ret = 0;
> > +       const char *evname;
> > +       struct tpebs_retire_lat *t;
> > +
> > +       evname = evsel__name(evsel);
> > +
> > +       /*
> > +        * Need to handle per core results? We are assuming average retire
> 
> I don't know.  It depends on the use case where you want
> different per-core retire-latency for the (per-core) metric.
> 
> 
> > +        * latency value will be used. Save the number of samples and the sum
> of
> > +        * retire latency value for each event.
> > +        */
> > +       list_for_each_entry(t, &tpebs_results, nd) {
> > +               if (!strcmp(evname, t->name)) {
> > +                       t->count += 1;
> > +                       t->sum += sample->retire_lat;
> > +                       t->val = (double) t->sum / t->count;
> > +                       break;
> > +               }
> > +       }
> > +
> > +       return ret;
> > +}
> > +
> > +static int process_feature_event(struct perf_session *session,
> > +                                union perf_event *event)
> > +{
> > +       if (event->feat.feat_id < HEADER_LAST_FEATURE)
> > +               return perf_event__process_feature(session, event);
> > +       return 0;
> > +}
> > +
> > +static void *__sample_reader(void *arg)
> > +{
> > +       struct child_process *child = arg;
> > +       struct perf_session *session;
> > +       struct perf_data data = {
> > +               .mode = PERF_DATA_MODE_READ,
> > +               .path = PERF_DATA,
> > +               .file.fd = child->out,
> > +       };
> > +       struct sample_data_reader reader = {
> > +               .tool = {
> > +               .sample          = process_sample_event,
> > +               .feature         = process_feature_event,
> > +               .attr            = perf_event__process_attr,
> 
> Strange indentations.

Hi Namhyung,

The indentations here look correct on the webpage and in my environment. 
I'm not sure why they are different here. 

> 
> 
> > +               },
> > +       };
> > +
> > +       session = perf_session__new(&data, &reader.tool);
> > +       if (IS_ERR(session))
> > +               return NULL;
> > +       reader.session = session;
> > +       perf_session__process_events(session);
> > +       perf_session__delete(session);
> > +
> > +       return NULL;
> > +}
> > +
> > +
> > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> *evsel_list)
> > +{
> > +       int ret = 0;
> > +       struct evsel *evsel;
> > +
> > +       stat_config = perf_stat_config;
> > +       /*
> > +        * Prepare perf record for sampling event retire_latency before fork and
> > +        * prepare workload
> > +        */
> > +       evlist__for_each_entry(evsel_list, evsel) {
> > +               if (evsel->retire_lat) {
> 
> To reduce the indentation, consider early continue like
> 
>     if (!evsel->retire_lat)
>         continue;
> 
> 
> > +                       struct tpebs_retire_lat *new = malloc(sizeof(struct
> tpebs_retire_lat));
> 
> I prefer sizeof(*new) instead but it doesn't matter.  But you might use
> zalloc() and forget about the 0 initialization later.
> 
> > +                       int i;
> > +                       char *name;
> > +
> > +                       pr_debug("perf stat retire latency %s required\n", evsel-
> >name);
> > +                       if (!new)
> > +                               return -1;
> > +                       for (i = strlen(evsel->name) - 1; i > 0; i--) {
> > +                               if (evsel->name[i] == 'R')
> > +                                       break;
> 
> I remember Ian checked '/' and ':' too.  Otherwise it can detect
> R in the event name and treat it as a modifier.
> 
> > +                       }
> > +                       if (i <= 0 || evsel->name[i] != 'R')
> > +                               return -1;
> 
> You need to free the memory here and other places.
> 
> > +
> > +                       name = strdup(evsel->name);
> > +                       if (!name)
> > +                               return -ENOMEM;
> > +                       name[i] = 'p';
> > +                       new->name = strdup(name);
> > +                       free(name);
> 
> Seems like an unnecessary allocation.  Why not use 'name'
> directly?
> 
> > +                       new->tpebs_name = strdup(evsel->name);
> > +                       if (!new->tpebs_name)
> > +                               return -ENOMEM;
> 
> Maybe orig_name?  But I'm not sure if it's really needed..
> Can we just teach perf record to understand 'R' and to act
> like 'p'?
> 
> 
> > +                       new->count = 0;
> > +                       new->sum = 0;
> > +                       list_add_tail(&new->nd, &tpebs_results);
> > +                       tpebs_event_size += 1;
> > +               }
> > +       }
> > +
> > +       if (tpebs_event_size > 0) {
> > +               struct pollfd pollfd = { .events = POLLIN, };
> > +               int control_fd[2], ack_fd[2], len;
> > +               char ack_buf[8];
> > +
> > +               /*Create control and ack fd for --control*/
> > +               if (pipe(control_fd) < 0) {
> > +                       pr_err("Failed to create control fifo");
> > +                       return -1;
> > +               }
> > +               if (pipe(ack_fd) < 0) {
> > +                       pr_err("Failed to create control fifo");
> > +                       return -1;
> 
> Need to close the pipes here and other places.

The pipes are closed at the end of the function. 

> 
> > +               }
> > +
> > +               ret = prepare_perf_record(control_fd, ack_fd);
> > +               if (ret)
> > +                       return ret;
> > +               if (pthread_create(&reader_thread, NULL, __sample_reader, cmd)) {
> > +                       kill(cmd->pid, SIGTERM);
> > +                       close(cmd->out);
> > +                       pr_err("Could not create thread to process sample data.\n");
> > +                       return -1;
> > +               }
> > +               /* Wait for perf record initialization.*/
> > +               len = strlen("enable");
> > +               ret = write(control_fd[1], "enable", len);
> > +               if (ret != len) {
> > +                       pr_err("perf record control write control message failed\n");
> > +                       goto out;
> > +               }
> > +
> > +               pollfd.fd = ack_fd[0];
> > +               if (!poll(&pollfd, 1, 2000))
> > +                       goto out;
> 
> Do we need this?  Why not just read?
I could remove this part if we don't need it.

Thanks,
Weilin

> 
> > +
> > +               ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
> > +               if (ret > 0)
> > +                       ret = strcmp(ack_buf, "ack\n");
> > +               else {
> > +                       pr_err("perf record control ack failed\n");
> > +                       goto out;
> > +               }
> > +               pr_debug("Received ack from perf record\n");
> > +out:
> > +               close(control_fd[0]);
> > +               close(control_fd[1]);
> > +               close(ack_fd[0]);
> > +               close(ack_fd[1]);
> > +       }
> > +       return ret;
> > +}
> > +
> > +
> > +int stop_tpebs(void)
> > +{
> > +       int ret = 0;
> > +
> > +       if (tpebs_event_size > 0) {
> > +               kill(cmd->pid, SIGTERM);
> > +               pthread_join(reader_thread, NULL);
> > +               close(cmd->out);
> > +               ret = finish_command(cmd);
> > +               if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> > +                       ret = 0;
> > +       }
> > +       return ret;
> > +}
> > diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h
> > new file mode 100644
> > index 000000000000..25e3e6729146
> > --- /dev/null
> > +++ b/tools/perf/util/intel-tpebs.h
> > @@ -0,0 +1,30 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * intel_tpebs.h: Intel TEPBS support
> > + */
> > +#include "stat.h"
> > +
> > +#ifndef INCLUDE__PERF_INTEL_TPEBS_H__
> > +#define INCLUDE__PERF_INTEL_TPEBS_H__
> > +
> > +struct tpebs_retire_lat {
> > +       struct list_head nd;
> > +       /* Event name */
> > +       const char *name;
> > +       /* Event name with the TPEBS modifier R */
> > +       const char *tpebs_name;
> > +       /* Count of retire_latency values found in sample data */
> > +       size_t count;
> > +       /* Sum of all the retire_latency values in sample data */
> > +       int sum;
> > +       /* Average of retire_latency, val = sum / count */
> > +       double val;
> > +};
> > +
> > +extern struct list_head tpebs_results;
> > +
> > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> *evsel_list);
> > +int stop_tpebs(void);
> > +void tpebs_data__delete(void);
> 
> I think it's better to have the same prefix.
> 
> tpebs_start()
> tpebs_stop()
> tpebs_delete()
> 
> Thanks,
> Namhyung
> 
> > +
> > +#endif
> > diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> > index fd7a187551bd..6e86311bc75e 100644
> > --- a/tools/perf/util/stat.h
> > +++ b/tools/perf/util/stat.h
> > @@ -110,6 +110,7 @@ struct perf_stat_config {
> >         struct cpu_aggr_map     *cpus_aggr_map;
> >         u64                     *walltime_run;
> >         struct rblist            metric_events;
> > +       pid_t                    tpebs_pid;
> >         int                      ctl_fd;
> >         int                      ctl_fd_ack;
> >         bool                     ctl_fd_close;
> > --
> > 2.43.0
> >
Re: [RFC PATCH v9 3/7] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.
Posted by Namhyung Kim 1 year, 8 months ago
On Fri, May 24, 2024 at 4:45 PM Wang, Weilin <weilin.wang@intel.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Namhyung Kim <namhyung@kernel.org>
> > Sent: Friday, May 24, 2024 4:02 PM
> > To: Wang, Weilin <weilin.wang@intel.com>
> > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > <mingo@redhat.com>; Alexander Shishkin
> > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> > Caleb <caleb.biggers@intel.com>
> > Subject: Re: [RFC PATCH v9 3/7] perf stat: Fork and launch perf record when
> > perf stat needs to get retire latency value for a metric.
> >
> > Hello,
> >
> > On Tue, May 21, 2024 at 10:40 AM <weilin.wang@intel.com> wrote:
> > >
> > > From: Weilin Wang <weilin.wang@intel.com>
> > >
> > > When retire_latency value is used in a metric formula, perf stat would fork a
> > > perf record process with "-e" and "-W" options. Perf record will collect
> > > required retire_latency values in parallel while perf stat is collecting
> > > counting values.
> > >
> > > At the point of time that perf stat stops counting, it would send sigterm
> > signal
> > > to perf record process and receiving sampling data back from perf record
> > from a
> > > pipe. Perf stat will then process the received data to get retire latency data
> > > and calculate metric result.
> > >
> > > Another thread is required to synchronize between perf stat and perf record
> > > when we pass data through pipe.
> > >
> > > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > > ---
> > >  tools/perf/builtin-stat.c     |  17 ++
> > >  tools/perf/util/Build         |   1 +
> > >  tools/perf/util/intel-tpebs.c | 301
> > ++++++++++++++++++++++++++++++++++
> > >  tools/perf/util/intel-tpebs.h |  30 ++++
> > >  tools/perf/util/stat.h        |   1 +
> > >  5 files changed, 350 insertions(+)
> > >  create mode 100644 tools/perf/util/intel-tpebs.c
> > >  create mode 100644 tools/perf/util/intel-tpebs.h
> > >
> > > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > > index 428e9721b908..c0e9dfa3b3c2 100644
> > > --- a/tools/perf/builtin-stat.c
> > > +++ b/tools/perf/builtin-stat.c
> > > @@ -70,6 +70,7 @@
> > >  #include "util/bpf_counter.h"
> > >  #include "util/iostat.h"
> > >  #include "util/util.h"
> > > +#include "util/intel-tpebs.h"
> > >  #include "asm/bug.h"
> > >
> > >  #include <linux/time64.h>
> > > @@ -162,6 +163,7 @@ static struct perf_stat_config stat_config = {
> > >         .ctl_fd                 = -1,
> > >         .ctl_fd_ack             = -1,
> > >         .iostat_run             = false,
> > > +       .tpebs_pid              = -1,
> >
> > Where is this set?
> >
> > >  };
> > >
> > >  static void evlist__check_cpu_maps(struct evlist *evlist)
> > > @@ -653,6 +655,8 @@ static enum counter_recovery
> > stat_handle_error(struct evsel *counter)
> > >
> > >         if (child_pid != -1)
> > >                 kill(child_pid, SIGTERM);
> > > +       if (stat_config.tpebs_pid != -1)
> > > +               stop_tpebs();
> > >         return COUNTER_FATAL;
> > >  }
> > >
> > > @@ -673,6 +677,10 @@ static int __run_perf_stat(int argc, const char
> > **argv, int run_idx)
> > >         int err;
> > >         bool second_pass = false;
> > >
> > > +       err = start_tpebs(&stat_config, evsel_list);
> > > +       if (err < 0)
> > > +               return err;
> > > +
> > >         if (forks) {
> > >                 if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe,
> > workload_exec_failed_signal) < 0) {
> > >                         perror("failed to prepare workload");
> > > @@ -878,6 +886,10 @@ static int __run_perf_stat(int argc, const char
> > **argv, int run_idx)
> > >
> > >         t1 = rdclock();
> > >
> > > +       err = stop_tpebs();
> > > +       if (err < 0)
> > > +               return err;
> > > +
> > >         if (stat_config.walltime_run_table)
> > >                 stat_config.walltime_run[run_idx] = t1 - t0;
> > >
> > > @@ -985,6 +997,9 @@ static void sig_atexit(void)
> > >         if (child_pid != -1)
> > >                 kill(child_pid, SIGTERM);
> > >
> > > +       if (stat_config.tpebs_pid != -1)
> > > +               kill(stat_config.tpebs_pid, SIGTERM);
> > > +
> > >         sigprocmask(SIG_SETMASK, &oset, NULL);
> > >
> > >         if (signr == -1)
> > > @@ -2918,5 +2933,7 @@ int cmd_stat(int argc, const char **argv)
> > >         metricgroup__rblist_exit(&stat_config.metric_events);
> > >         evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack,
> > &stat_config.ctl_fd_close);
> > >
> > > +       tpebs_data__delete();
> > > +
> > >         return status;
> > >  }
> > > diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> > > index 292170a99ab6..c9f1d0bb6bf8 100644
> > > --- a/tools/perf/util/Build
> > > +++ b/tools/perf/util/Build
> > > @@ -153,6 +153,7 @@ perf-y += clockid.o
> > >  perf-y += list_sort.o
> > >  perf-y += mutex.o
> > >  perf-y += sharded_mutex.o
> > > +perf-y += intel-tpebs.o
> >
> > Can we make this Intel (or x86) only?
> >
> > >
> > >  perf-$(CONFIG_LIBBPF) += bpf_map.o
> > >  perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
> > > diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> > > new file mode 100644
> > > index 000000000000..13f65e39a845
> > > --- /dev/null
> > > +++ b/tools/perf/util/intel-tpebs.c
> > > @@ -0,0 +1,301 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * intel_tpebs.c: Intel TPEBS support
> > > + */
> > > +
> > > +
> > > +#include <sys/param.h>
> > > +#include <subcmd/run-command.h>
> > > +#include <thread.h>
> > > +#include "intel-tpebs.h"
> > > +#include <linux/list.h>
> > > +#include <linux/zalloc.h>
> > > +#include <linux/err.h>
> > > +#include "sample.h"
> > > +#include "debug.h"
> > > +#include "evlist.h"
> > > +#include "evsel.h"
> > > +#include "session.h"
> > > +#include "tool.h"
> > > +#include "metricgroup.h"
> > > +#include <sys/stat.h>
> > > +#include <sys/file.h>
> > > +#include <poll.h>
> > > +
> > > +#define PERF_DATA              "-"
> > > +
> > > +struct list_head tpebs_results = LIST_HEAD_INIT(tpebs_results);
> >
> > static LIST_HEAD(tpebs_results);
> >
> >
> > > +static pthread_t reader_thread;
> > > +static struct child_process *cmd;
> >
> > Maybe better to have the 'tpebs_' prefix.
> >
> >
> > > +static struct perf_stat_config *stat_config;
> >
> > Is this really needed? ...
> >
> >
> > > +static size_t tpebs_event_size;
> > > +
> > > +static int get_perf_record_args(const char **record_argv, char buf[])
> >
> > ... I think you can just pass the stat_config argument
> > from start_tpebs().
> >
> > > +{
> > > +       struct tpebs_retire_lat *e;
> > > +       int i = 0;
> > > +
> > > +       pr_debug("Prepare perf record for retire_latency\n");
> > > +
> > > +       record_argv[i++] = "perf";
> > > +       record_argv[i++] = "record";
> > > +       record_argv[i++] = "-W";
> > > +       record_argv[i++] = "--synth=no";
> > > +       record_argv[i++] = buf;
> > > +
> > > +       if (stat_config->user_requested_cpu_list) {
> > > +               record_argv[i++] = "-C";
> > > +               record_argv[i++] = stat_config->user_requested_cpu_list;
> > > +       }
> > > +
> > > +       if (stat_config->system_wide)
> > > +               record_argv[i++] = "-a";
> > > +
> > > +       if (!stat_config->system_wide && !stat_config-
> > >user_requested_cpu_list) {
> > > +               pr_err("Require -a or -C option to run sampling.\n");
> > > +               return -ECANCELED;
> > > +       }
> > > +
> > > +       list_for_each_entry(e, &tpebs_results, nd) {
> > > +               record_argv[i++] = "-e";
> > > +               record_argv[i++] = e->name;
> > > +       }
> > > +
> > > +       record_argv[i++] = "-o";
> > > +       record_argv[i++] = PERF_DATA;
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static int prepare_run_command(const char **argv)
> > > +{
> > > +       cmd = zalloc(sizeof(struct child_process));
> > > +       if (!cmd)
> > > +               return -ENOMEM;
> > > +       cmd->argv = argv;
> > > +       cmd->out = -1;
> > > +       return 0;
> > > +}
> > > +
> > > +static int prepare_perf_record(int control_fd[], int ack_fd[])
> > > +{
> > > +       const char **record_argv;
> > > +       int ret;
> > > +       char buf[32];
> > > +
> > > +       scnprintf(buf, sizeof(buf), "--control=fd:%d,%d", control_fd[0],
> > ack_fd[1]);
> > > +
> > > +       record_argv = calloc(12 + 2 * tpebs_event_size, sizeof(char *));
> > > +       if (!record_argv)
> > > +               return -ENOMEM;
> > > +
> > > +       ret = get_perf_record_args(record_argv, buf);
> > > +       if (ret)
> > > +               goto out;
> > > +
> > > +       ret = prepare_run_command(record_argv);
> > > +       if (ret)
> > > +               goto out;
> > > +       ret = start_command(cmd);
> > > +out:
> > > +       free(record_argv);
> > > +       return ret;
> > > +}
> >
> > Please add a blank line.
> >
> >
> > > +struct sample_data_reader {
> > > +       struct perf_tool        tool;
> > > +       struct perf_session     *session;
> >
> > You don't need this, right?  Why not pass the 'tool' directly?
> >
> > > +};
> > > +
> > > +static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
> > > +{
> > > +       zfree(&r->name);
> > > +       zfree(&r->tpebs_name);
> > > +       free(r);
> > > +}
> > > +
> > > +void tpebs_data__delete(void)
> > > +{
> > > +       struct tpebs_retire_lat *r, *rtmp;
> > > +
> > > +       list_for_each_entry_safe(r, rtmp, &tpebs_results, nd) {
> > > +               list_del_init(&r->nd);
> > > +               tpebs_retire_lat__delete(r);
> > > +       }
> > > +       free(cmd);
> > > +}
> > > +
> > > +static int process_sample_event(struct perf_tool *tool __maybe_unused,
> > > +                               union perf_event *event __maybe_unused,
> > > +                               struct perf_sample *sample,
> > > +                               struct evsel *evsel,
> > > +                               struct machine *machine __maybe_unused)
> > > +{
> > > +       int ret = 0;
> > > +       const char *evname;
> > > +       struct tpebs_retire_lat *t;
> > > +
> > > +       evname = evsel__name(evsel);
> > > +
> > > +       /*
> > > +        * Need to handle per core results? We are assuming average retire
> >
> > I don't know.  It depends on the use case where you want
> > different per-core retire-latency for the (per-core) metric.
> >
> >
> > > +        * latency value will be used. Save the number of samples and the sum
> > of
> > > +        * retire latency value for each event.
> > > +        */
> > > +       list_for_each_entry(t, &tpebs_results, nd) {
> > > +               if (!strcmp(evname, t->name)) {
> > > +                       t->count += 1;
> > > +                       t->sum += sample->retire_lat;
> > > +                       t->val = (double) t->sum / t->count;
> > > +                       break;
> > > +               }
> > > +       }
> > > +
> > > +       return ret;
> > > +}
> > > +
> > > +static int process_feature_event(struct perf_session *session,
> > > +                                union perf_event *event)
> > > +{
> > > +       if (event->feat.feat_id < HEADER_LAST_FEATURE)
> > > +               return perf_event__process_feature(session, event);
> > > +       return 0;
> > > +}
> > > +
> > > +static void *__sample_reader(void *arg)
> > > +{
> > > +       struct child_process *child = arg;
> > > +       struct perf_session *session;
> > > +       struct perf_data data = {
> > > +               .mode = PERF_DATA_MODE_READ,
> > > +               .path = PERF_DATA,
> > > +               .file.fd = child->out,
> > > +       };
> > > +       struct sample_data_reader reader = {
> > > +               .tool = {
> > > +               .sample          = process_sample_event,
> > > +               .feature         = process_feature_event,
> > > +               .attr            = perf_event__process_attr,
> >
> > Strange indentations.
>
> Hi Namhyung,
>
> The indentations here look correct on the webpage and in my environment.
> I'm not sure why they are different here.

Sounds like a tab vs space issue.

>
> >
> >
> > > +               },
> > > +       };
> > > +
> > > +       session = perf_session__new(&data, &reader.tool);
> > > +       if (IS_ERR(session))
> > > +               return NULL;
> > > +       reader.session = session;
> > > +       perf_session__process_events(session);
> > > +       perf_session__delete(session);
> > > +
> > > +       return NULL;
> > > +}
> > > +
> > > +
> > > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> > *evsel_list)
> > > +{
> > > +       int ret = 0;
> > > +       struct evsel *evsel;
> > > +
> > > +       stat_config = perf_stat_config;
> > > +       /*
> > > +        * Prepare perf record for sampling event retire_latency before fork and
> > > +        * prepare workload
> > > +        */
> > > +       evlist__for_each_entry(evsel_list, evsel) {
> > > +               if (evsel->retire_lat) {
> >
> > To reduce the indentation, consider early continue like
> >
> >     if (!evsel->retire_lat)
> >         continue;
> >
> >
> > > +                       struct tpebs_retire_lat *new = malloc(sizeof(struct
> > tpebs_retire_lat));
> >
> > I prefer sizeof(*new) instead but it doesn't matter.  But you might use
> > zalloc() and forget about the 0 initialization later.
> >
> > > +                       int i;
> > > +                       char *name;
> > > +
> > > +                       pr_debug("perf stat retire latency %s required\n", evsel-
> > >name);
> > > +                       if (!new)
> > > +                               return -1;
> > > +                       for (i = strlen(evsel->name) - 1; i > 0; i--) {
> > > +                               if (evsel->name[i] == 'R')
> > > +                                       break;
> >
> > I remember Ian checked '/' and ':' too.  Otherwise it can detect
> > R in the event name and treat it as a modifier.
> >
> > > +                       }
> > > +                       if (i <= 0 || evsel->name[i] != 'R')
> > > +                               return -1;
> >
> > You need to free the memory here and other places.
> >
> > > +
> > > +                       name = strdup(evsel->name);
> > > +                       if (!name)
> > > +                               return -ENOMEM;
> > > +                       name[i] = 'p';
> > > +                       new->name = strdup(name);
> > > +                       free(name);
> >
> > Seems like an unnecessary allocation.  Why not use 'name'
> > directly?
> >
> > > +                       new->tpebs_name = strdup(evsel->name);
> > > +                       if (!new->tpebs_name)
> > > +                               return -ENOMEM;
> >
> > Maybe orig_name?  But I'm not sure if it's really needed..
> > Can we just teach perf record to understand 'R' and to act
> > like 'p'?
> >
> >
> > > +                       new->count = 0;
> > > +                       new->sum = 0;
> > > +                       list_add_tail(&new->nd, &tpebs_results);
> > > +                       tpebs_event_size += 1;
> > > +               }
> > > +       }
> > > +
> > > +       if (tpebs_event_size > 0) {
> > > +               struct pollfd pollfd = { .events = POLLIN, };
> > > +               int control_fd[2], ack_fd[2], len;
> > > +               char ack_buf[8];
> > > +
> > > +               /*Create control and ack fd for --control*/
> > > +               if (pipe(control_fd) < 0) {
> > > +                       pr_err("Failed to create control fifo");
> > > +                       return -1;
> > > +               }
> > > +               if (pipe(ack_fd) < 0) {
> > > +                       pr_err("Failed to create control fifo");
> > > +                       return -1;
> >
> > Need to close the pipes here and other places.
>
> The pipes are closed at the end of the function.

Yep, but you didn't go there. :)

Thanks,
Namhyung

>
> >
> > > +               }
> > > +
> > > +               ret = prepare_perf_record(control_fd, ack_fd);
> > > +               if (ret)
> > > +                       return ret;
> > > +               if (pthread_create(&reader_thread, NULL, __sample_reader, cmd)) {
> > > +                       kill(cmd->pid, SIGTERM);
> > > +                       close(cmd->out);
> > > +                       pr_err("Could not create thread to process sample data.\n");
> > > +                       return -1;
> > > +               }
> > > +               /* Wait for perf record initialization.*/
> > > +               len = strlen("enable");
> > > +               ret = write(control_fd[1], "enable", len);
> > > +               if (ret != len) {
> > > +                       pr_err("perf record control write control message failed\n");
> > > +                       goto out;
> > > +               }
> > > +
> > > +               pollfd.fd = ack_fd[0];
> > > +               if (!poll(&pollfd, 1, 2000))
> > > +                       goto out;
> >
> > Do we need this?  Why not just read?
> I could remove this part if we don't need it.
>
> Thanks,
> Weilin
>
> >
> > > +
> > > +               ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
> > > +               if (ret > 0)
> > > +                       ret = strcmp(ack_buf, "ack\n");
> > > +               else {
> > > +                       pr_err("perf record control ack failed\n");
> > > +                       goto out;
> > > +               }
> > > +               pr_debug("Received ack from perf record\n");
> > > +out:
> > > +               close(control_fd[0]);
> > > +               close(control_fd[1]);
> > > +               close(ack_fd[0]);
> > > +               close(ack_fd[1]);
> > > +       }
> > > +       return ret;
> > > +}
> > > +
> > > +
> > > +int stop_tpebs(void)
> > > +{
> > > +       int ret = 0;
> > > +
> > > +       if (tpebs_event_size > 0) {
> > > +               kill(cmd->pid, SIGTERM);
> > > +               pthread_join(reader_thread, NULL);
> > > +               close(cmd->out);
> > > +               ret = finish_command(cmd);
> > > +               if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> > > +                       ret = 0;
> > > +       }
> > > +       return ret;
> > > +}
> > > diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h
> > > new file mode 100644
> > > index 000000000000..25e3e6729146
> > > --- /dev/null
> > > +++ b/tools/perf/util/intel-tpebs.h
> > > @@ -0,0 +1,30 @@
> > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > +/*
> > > + * intel_tpebs.h: Intel TEPBS support
> > > + */
> > > +#include "stat.h"
> > > +
> > > +#ifndef INCLUDE__PERF_INTEL_TPEBS_H__
> > > +#define INCLUDE__PERF_INTEL_TPEBS_H__
> > > +
> > > +struct tpebs_retire_lat {
> > > +       struct list_head nd;
> > > +       /* Event name */
> > > +       const char *name;
> > > +       /* Event name with the TPEBS modifier R */
> > > +       const char *tpebs_name;
> > > +       /* Count of retire_latency values found in sample data */
> > > +       size_t count;
> > > +       /* Sum of all the retire_latency values in sample data */
> > > +       int sum;
> > > +       /* Average of retire_latency, val = sum / count */
> > > +       double val;
> > > +};
> > > +
> > > +extern struct list_head tpebs_results;
> > > +
> > > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> > *evsel_list);
> > > +int stop_tpebs(void);
> > > +void tpebs_data__delete(void);
> >
> > I think it's better to have the same prefix.
> >
> > tpebs_start()
> > tpebs_stop()
> > tpebs_delete()
> >
> > Thanks,
> > Namhyung
> >
> > > +
> > > +#endif
> > > diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> > > index fd7a187551bd..6e86311bc75e 100644
> > > --- a/tools/perf/util/stat.h
> > > +++ b/tools/perf/util/stat.h
> > > @@ -110,6 +110,7 @@ struct perf_stat_config {
> > >         struct cpu_aggr_map     *cpus_aggr_map;
> > >         u64                     *walltime_run;
> > >         struct rblist            metric_events;
> > > +       pid_t                    tpebs_pid;
> > >         int                      ctl_fd;
> > >         int                      ctl_fd_ack;
> > >         bool                     ctl_fd_close;
> > > --
> > > 2.43.0
> > >