From: Weilin Wang <weilin.wang@intel.com>
When retire_latency value is used in a metric formula, perf stat would fork a
perf record process with "-e" and "-W" options. Perf record will collect
required retire_latency values in parallel while perf stat is collecting
counting values.
At the point of time that perf stat stops counting, it would send sigterm signal
to perf record process and receiving sampling data back from perf record from a
pipe. Perf stat will then process the received data to get retire latency data
and calculate metric result.
Another thread is required to synchronize between perf stat and perf record
when we pass data through pipe.
Signed-off-by: Weilin Wang <weilin.wang@intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
---
tools/perf/builtin-stat.c | 19 +++
tools/perf/util/Build | 1 +
tools/perf/util/intel-tpebs.c | 285 ++++++++++++++++++++++++++++++++++
tools/perf/util/intel-tpebs.h | 29 ++++
tools/perf/util/stat.h | 3 +
5 files changed, 337 insertions(+)
create mode 100644 tools/perf/util/intel-tpebs.c
create mode 100644 tools/perf/util/intel-tpebs.h
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 428e9721b908..85927cf45adb 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -70,6 +70,7 @@
#include "util/bpf_counter.h"
#include "util/iostat.h"
#include "util/util.h"
+#include "util/intel-tpebs.h"
#include "asm/bug.h"
#include <linux/time64.h>
@@ -94,6 +95,7 @@
#include <perf/evlist.h>
#include <internal/threadmap.h>
+
#define DEFAULT_SEPARATOR " "
#define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
@@ -162,6 +164,8 @@ static struct perf_stat_config stat_config = {
.ctl_fd = -1,
.ctl_fd_ack = -1,
.iostat_run = false,
+ .tpebs_results = LIST_HEAD_INIT(stat_config.tpebs_results),
+ .tpebs_pid = -1,
};
static void evlist__check_cpu_maps(struct evlist *evlist)
@@ -653,6 +657,8 @@ static enum counter_recovery stat_handle_error(struct evsel *counter)
if (child_pid != -1)
kill(child_pid, SIGTERM);
+ if (stat_config.tpebs_pid != -1)
+ stop_tpebs();
return COUNTER_FATAL;
}
@@ -673,6 +679,10 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
int err;
bool second_pass = false;
+ err = start_tpebs(&stat_config, evsel_list);
+ if (err < 0)
+ return err;
+
if (forks) {
if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe, workload_exec_failed_signal) < 0) {
perror("failed to prepare workload");
@@ -878,6 +888,10 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
t1 = rdclock();
+ err = stop_tpebs();
+ if (err < 0)
+ return err;
+
if (stat_config.walltime_run_table)
stat_config.walltime_run[run_idx] = t1 - t0;
@@ -985,6 +999,9 @@ static void sig_atexit(void)
if (child_pid != -1)
kill(child_pid, SIGTERM);
+ if (stat_config.tpebs_pid != -1)
+ kill(stat_config.tpebs_pid, SIGTERM);
+
sigprocmask(SIG_SETMASK, &oset, NULL);
if (signr == -1)
@@ -2918,5 +2935,7 @@ int cmd_stat(int argc, const char **argv)
metricgroup__rblist_exit(&stat_config.metric_events);
evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack, &stat_config.ctl_fd_close);
+ tpebs_data__delete();
+
return status;
}
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 292170a99ab6..c9f1d0bb6bf8 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -153,6 +153,7 @@ perf-y += clockid.o
perf-y += list_sort.o
perf-y += mutex.o
perf-y += sharded_mutex.o
+perf-y += intel-tpebs.o
perf-$(CONFIG_LIBBPF) += bpf_map.o
perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
new file mode 100644
index 000000000000..4b7a98794fae
--- /dev/null
+++ b/tools/perf/util/intel-tpebs.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * intel_pt.c: Intel Processor Trace support
+ * Copyright (c) 2013-2015, Intel Corporation.
+ */
+
+
+#include <sys/param.h>
+#include <subcmd/run-command.h>
+#include <thread.h>
+#include "intel-tpebs.h"
+#include <linux/list.h>
+#include <linux/zalloc.h>
+#include <linux/err.h>
+#include "sample.h"
+#include "debug.h"
+#include "evlist.h"
+#include "evsel.h"
+#include "session.h"
+#include "tool.h"
+#include "metricgroup.h"
+#include <sys/stat.h>
+#include <sys/file.h>
+
+
+
+#define PERF_DATA "-"
+#define CONTROL "/tmp/control"
+#define ACK "/tmp/ack"
+pthread_t reader_thread;
+struct child_process *cmd;
+struct perf_stat_config *stat_config;
+
+static int get_perf_record_args(const char **record_argv)
+{
+ int i = 0;
+ struct tpebs_retire_lat *e;
+
+ pr_debug("Prepare perf record for retire_latency\n");
+
+ record_argv[i++] = "perf";
+ record_argv[i++] = "record";
+ record_argv[i++] = "-W";
+ record_argv[i++] = "--synth=no";
+ record_argv[i++] = "--control=fifo:/tmp/control,/tmp/ack";
+
+ if (stat_config->user_requested_cpu_list) {
+ record_argv[i++] = "-C";
+ record_argv[i++] = stat_config->user_requested_cpu_list;
+ }
+
+ if (stat_config->system_wide)
+ record_argv[i++] = "-a";
+
+ if (!stat_config->system_wide && !stat_config->user_requested_cpu_list) {
+ pr_err("Require -a or -C option to run sampling.\n");
+ return -ECANCELED;
+ }
+
+ list_for_each_entry(e, &stat_config->tpebs_results, nd) {
+ record_argv[i++] = "-e";
+ record_argv[i++] = e->name;
+ }
+
+ record_argv[i++] = "-o";
+ record_argv[i++] = PERF_DATA;
+
+ return 0;
+}
+
+static int prepare_run_command(const char **argv)
+{
+ cmd = zalloc(sizeof(struct child_process));
+ if (!cmd)
+ return -ENOMEM;
+ cmd->argv = argv;
+ cmd->out = -1;
+ return 0;
+}
+
+static int prepare_perf_record(int tpebs_event_size)
+{
+ const char **record_argv;
+ int ret;
+
+ /*Create control and ack fd for --control*/
+ if (mkfifo(CONTROL, 0600)) {
+ pr_err("Failed to create control fifo");
+ return -1;
+ }
+ if (mkfifo(ACK, 0600)) {
+ pr_err("Failed to create control fifo");
+ return -1;
+ }
+
+ record_argv = calloc(10 + 2 * tpebs_event_size, sizeof(char *));
+ if (!record_argv)
+ return -ENOMEM;
+
+
+ ret = get_perf_record_args(record_argv);
+ if (ret)
+ goto out;
+
+ ret = prepare_run_command(record_argv);
+ if (ret)
+ goto out;
+ ret = start_command(cmd);
+out:
+ free(record_argv);
+ return ret;
+}
+struct sample_data_reader {
+ struct perf_tool tool;
+ struct perf_session *session;
+};
+
+static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
+{
+ zfree(&r->name);
+ zfree(&r->tpebs_name);
+ free(r);
+}
+
+void tpebs_data__delete(void)
+{
+ struct tpebs_retire_lat *r, *rtmp;
+
+ list_for_each_entry_safe(r, rtmp, &stat_config->tpebs_results, nd) {
+ list_del_init(&r->nd);
+ tpebs_retire_lat__delete(r);
+ }
+ free(cmd);
+}
+
+static int process_sample_event(struct perf_tool *tool __maybe_unused,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample,
+ struct evsel *evsel,
+ struct machine *machine __maybe_unused)
+{
+ int ret = 0;
+ const char *evname;
+ struct tpebs_retire_lat *t;
+
+ evname = evsel__name(evsel);
+
+ /*
+ * Need to handle per core results? We are assuming average retire
+ * latency value will be used. Save the number of samples and the sum of
+ * retire latency value for each event.
+ */
+ list_for_each_entry(t, &stat_config->tpebs_results, nd) {
+ if (!strcmp(evname, t->name)) {
+ t->count += 1;
+ t->sum += sample->retire_lat;
+ t->val = (double) t->sum / t->count;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int process_feature_event(struct perf_session *session,
+ union perf_event *event)
+{
+ if (event->feat.feat_id < HEADER_LAST_FEATURE)
+ return perf_event__process_feature(session, event);
+ return 0;
+}
+
+static void *__sample_reader(void *arg)
+{
+ struct child_process *child = arg;
+ struct perf_session *session;
+ struct perf_data data = {
+ .mode = PERF_DATA_MODE_READ,
+ .path = PERF_DATA,
+ .file.fd = child->out,
+ };
+ struct sample_data_reader reader = {
+ .tool = {
+ .sample = process_sample_event,
+ .feature = process_feature_event,
+ .attr = perf_event__process_attr,
+ },
+ };
+
+ session = perf_session__new(&data, &reader.tool);
+ if (IS_ERR(session))
+ return NULL;
+ reader.session = session;
+ perf_session__process_events(session);
+ perf_session__delete(session);
+
+ return NULL;
+}
+
+
+int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist *evsel_list)
+{
+ int ret = 0;
+ struct evsel *evsel;
+ int control = -1, ack = -1;
+
+ stat_config = perf_stat_config;
+ /*
+ * Prepare perf record for sampling event retire_latency before fork and
+ * prepare workload
+ */
+ evlist__for_each_entry(evsel_list, evsel) {
+ if (evsel->retire_lat) {
+ struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));
+ int i;
+ char *name;
+
+ pr_debug("perf stat retire latency %s required\n", evsel->name);
+ if (!new)
+ return -1;
+ for (i = strlen(evsel->name) - 1; i > 0; i--) {
+ if (evsel->name[i] == 'R')
+ break;
+ }
+ if (i <= 0 || evsel->name[i] != 'R')
+ return -1;
+
+ name = strdup(evsel->name);
+ if (!name)
+ return -ENOMEM;
+ name[i] = 'p';
+ new->name = strdup(name);
+ free(name);
+ new->tpebs_name = strdup(evsel->name);
+ if (!new->tpebs_name)
+ return -ENOMEM;
+ new->count = 0;
+ new->sum = 0;
+ list_add_tail(&new->nd, &stat_config->tpebs_results);
+ stat_config->tpebs_event_size += 1;
+ }
+ }
+
+ if (stat_config->tpebs_event_size > 0) {
+ ret = prepare_perf_record(stat_config->tpebs_event_size);
+ if (ret)
+ return ret;
+ if (pthread_create(&reader_thread, NULL, __sample_reader, cmd)) {
+ kill(cmd->pid, SIGTERM);
+ close(cmd->out);
+ pr_err("Could not create thread to process sample data.\n");
+ return -1;
+ }
+ /* Wait for perf record initialization a little bit.*/
+ control = open(CONTROL, O_RDONLY, O_NONBLOCK);
+ if (!control)
+ return -1;
+ close(control);
+ ack = open(ACK, O_RDONLY, O_NONBLOCK);
+ if (!ack)
+ return -1;
+ close(ack);
+ pr_debug("Received ack from perf record\n");
+ }
+
+ return ret;
+}
+
+
+int stop_tpebs(void)
+{
+ int ret = 0;
+
+ if (stat_config->tpebs_event_size > 0) {
+ kill(cmd->pid, SIGTERM);
+ pthread_join(reader_thread, NULL);
+ close(cmd->out);
+ ret = finish_command(cmd);
+ if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
+ ret = 0;
+ remove(CONTROL);
+ remove(ACK);
+ }
+ return ret;
+}
diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h
new file mode 100644
index 000000000000..e8e2bb2f479b
--- /dev/null
+++ b/tools/perf/util/intel-tpebs.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * intel_pt.h: Intel Processor Trace support
+ * Copyright (c) 2013-2015, Intel Corporation.
+ */
+#include "stat.h"
+
+#ifndef INCLUDE__PERF_INTEL_TPEBS_H__
+#define INCLUDE__PERF_INTEL_TPEBS_H__
+
+struct tpebs_retire_lat {
+ struct list_head nd;
+ /* Event name */
+ const char *name;
+ /* Event name with the TPEBS modifier R */
+ const char *tpebs_name;
+ /* Count of retire_latency values found in sample data */
+ size_t count;
+ /* Sum of all the retire_latency values in sample data */
+ int sum;
+ /* Average of retire_latency, val = sum / count */
+ double val;
+};
+
+int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist *evsel_list);
+int stop_tpebs(void);
+void tpebs_data__delete(void);
+
+#endif
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index fd7a187551bd..c6c2aa43030f 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -110,6 +110,9 @@ struct perf_stat_config {
struct cpu_aggr_map *cpus_aggr_map;
u64 *walltime_run;
struct rblist metric_events;
+ size_t tpebs_event_size;
+ struct list_head tpebs_results;
+ pid_t tpebs_pid;
int ctl_fd;
int ctl_fd_ack;
bool ctl_fd_close;
--
2.43.0
On Tue, May 14, 2024 at 10:44 PM <weilin.wang@intel.com> wrote:
>
> From: Weilin Wang <weilin.wang@intel.com>
>
> When retire_latency value is used in a metric formula, perf stat would fork a
> perf record process with "-e" and "-W" options. Perf record will collect
> required retire_latency values in parallel while perf stat is collecting
> counting values.
>
> At the point of time that perf stat stops counting, it would send sigterm signal
> to perf record process and receiving sampling data back from perf record from a
> pipe. Perf stat will then process the received data to get retire latency data
> and calculate metric result.
>
> Another thread is required to synchronize between perf stat and perf record
> when we pass data through pipe.
>
> Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> Reviewed-by: Ian Rogers <irogers@google.com>
> ---
[SNIP]
> diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> new file mode 100644
> index 000000000000..4b7a98794fae
> --- /dev/null
> +++ b/tools/perf/util/intel-tpebs.c
> @@ -0,0 +1,285 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * intel_pt.c: Intel Processor Trace support
> + * Copyright (c) 2013-2015, Intel Corporation.
This needs some updates. :)
> + */
> +
> +
> +#include <sys/param.h>
> +#include <subcmd/run-command.h>
> +#include <thread.h>
> +#include "intel-tpebs.h"
> +#include <linux/list.h>
> +#include <linux/zalloc.h>
> +#include <linux/err.h>
> +#include "sample.h"
> +#include "debug.h"
> +#include "evlist.h"
> +#include "evsel.h"
> +#include "session.h"
> +#include "tool.h"
> +#include "metricgroup.h"
> +#include <sys/stat.h>
> +#include <sys/file.h>
> +
> +
> +
> +#define PERF_DATA "-"
> +#define CONTROL "/tmp/control"
> +#define ACK "/tmp/ack"
> +pthread_t reader_thread;
> +struct child_process *cmd;
> +struct perf_stat_config *stat_config;
static ?
> +
> +static int get_perf_record_args(const char **record_argv)
> +{
> + int i = 0;
> + struct tpebs_retire_lat *e;
> +
> + pr_debug("Prepare perf record for retire_latency\n");
> +
> + record_argv[i++] = "perf";
> + record_argv[i++] = "record";
> + record_argv[i++] = "-W";
> + record_argv[i++] = "--synth=no";
Unfortunately this still synthesizes MMAP records for the kernel
and modules. As we don't care about them and just want to
minimize the overhead at the beginning, we can add
"--tail-synthesize" too.
> + record_argv[i++] = "--control=fifo:/tmp/control,/tmp/ack";
This hard-coded path won't work well when more than one users
want to run the perf command at the same time.
Thanks,
Namhyung
> +
> + if (stat_config->user_requested_cpu_list) {
> + record_argv[i++] = "-C";
> + record_argv[i++] = stat_config->user_requested_cpu_list;
> + }
> +
> + if (stat_config->system_wide)
> + record_argv[i++] = "-a";
> +
> + if (!stat_config->system_wide && !stat_config->user_requested_cpu_list) {
> + pr_err("Require -a or -C option to run sampling.\n");
> + return -ECANCELED;
> + }
> +
> + list_for_each_entry(e, &stat_config->tpebs_results, nd) {
> + record_argv[i++] = "-e";
> + record_argv[i++] = e->name;
> + }
> +
> + record_argv[i++] = "-o";
> + record_argv[i++] = PERF_DATA;
> +
> + return 0;
> +}
> +
>
> -----Original Message-----
> From: Namhyung Kim <namhyung@kernel.org>
> Sent: Friday, May 17, 2024 2:43 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v8 3/7] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> On Tue, May 14, 2024 at 10:44 PM <weilin.wang@intel.com> wrote:
> >
> > From: Weilin Wang <weilin.wang@intel.com>
> >
> > When retire_latency value is used in a metric formula, perf stat would fork a
> > perf record process with "-e" and "-W" options. Perf record will collect
> > required retire_latency values in parallel while perf stat is collecting
> > counting values.
> >
> > At the point of time that perf stat stops counting, it would send sigterm
> signal
> > to perf record process and receiving sampling data back from perf record
> from a
> > pipe. Perf stat will then process the received data to get retire latency data
> > and calculate metric result.
> >
> > Another thread is required to synchronize between perf stat and perf record
> > when we pass data through pipe.
> >
> > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > Reviewed-by: Ian Rogers <irogers@google.com>
> > ---
> [SNIP]
> > diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> > new file mode 100644
> > index 000000000000..4b7a98794fae
> > --- /dev/null
> > +++ b/tools/perf/util/intel-tpebs.c
> > @@ -0,0 +1,285 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * intel_pt.c: Intel Processor Trace support
> > + * Copyright (c) 2013-2015, Intel Corporation.
>
> This needs some updates. :)
>
>
> > + */
> > +
> > +
> > +#include <sys/param.h>
> > +#include <subcmd/run-command.h>
> > +#include <thread.h>
> > +#include "intel-tpebs.h"
> > +#include <linux/list.h>
> > +#include <linux/zalloc.h>
> > +#include <linux/err.h>
> > +#include "sample.h"
> > +#include "debug.h"
> > +#include "evlist.h"
> > +#include "evsel.h"
> > +#include "session.h"
> > +#include "tool.h"
> > +#include "metricgroup.h"
> > +#include <sys/stat.h>
> > +#include <sys/file.h>
> > +
> > +
> > +
> > +#define PERF_DATA "-"
> > +#define CONTROL "/tmp/control"
> > +#define ACK "/tmp/ack"
> > +pthread_t reader_thread;
> > +struct child_process *cmd;
> > +struct perf_stat_config *stat_config;
>
> static ?
>
> > +
> > +static int get_perf_record_args(const char **record_argv)
> > +{
> > + int i = 0;
> > + struct tpebs_retire_lat *e;
> > +
> > + pr_debug("Prepare perf record for retire_latency\n");
> > +
> > + record_argv[i++] = "perf";
> > + record_argv[i++] = "record";
> > + record_argv[i++] = "-W";
> > + record_argv[i++] = "--synth=no";
>
> Unfortunately this still synthesizes MMAP records for the kernel
> and modules. As we don't care about them and just want to
> minimize the overhead at the beginning, we can add
> "--tail-synthesize" too.
Hi Namhyung,
I'm trying out the "--tail-synthesize" option but failed to get it work
correctly. Could you please take a look at this command and let me
know what's the problem?
"perf record -e cycles:p --synth=no --tail-synthesize -W -a -o - sleep 1 | perf script -F retire_lat -i -".
I got an error "0x40 [0x40]: failed to process type: 9" when run this command.
If I run the command in two steps without pipe, they work fine.
Thanks,
Weilin
>
>
> > + record_argv[i++] = "--control=fifo:/tmp/control,/tmp/ack";
>
> This hard-coded path won't work well when more than one users
> want to run the perf command at the same time.
>
> Thanks,
> Namhyung
>
> > +
> > + if (stat_config->user_requested_cpu_list) {
> > + record_argv[i++] = "-C";
> > + record_argv[i++] = stat_config->user_requested_cpu_list;
> > + }
> > +
> > + if (stat_config->system_wide)
> > + record_argv[i++] = "-a";
> > +
> > + if (!stat_config->system_wide && !stat_config-
> >user_requested_cpu_list) {
> > + pr_err("Require -a or -C option to run sampling.\n");
> > + return -ECANCELED;
> > + }
> > +
> > + list_for_each_entry(e, &stat_config->tpebs_results, nd) {
> > + record_argv[i++] = "-e";
> > + record_argv[i++] = e->name;
> > + }
> > +
> > + record_argv[i++] = "-o";
> > + record_argv[i++] = PERF_DATA;
> > +
> > + return 0;
> > +}
> > +
> >
Hello,
On Mon, May 20, 2024 at 5:10 PM Wang, Weilin <weilin.wang@intel.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Namhyung Kim <namhyung@kernel.org>
> > Sent: Friday, May 17, 2024 2:43 PM
> > To: Wang, Weilin <weilin.wang@intel.com>
> > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > <mingo@redhat.com>; Alexander Shishkin
> > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> > Caleb <caleb.biggers@intel.com>
> > Subject: Re: [RFC PATCH v8 3/7] perf stat: Fork and launch perf record when
> > perf stat needs to get retire latency value for a metric.
> >
> > On Tue, May 14, 2024 at 10:44 PM <weilin.wang@intel.com> wrote:
> > >
> > > From: Weilin Wang <weilin.wang@intel.com>
> > >
> > > When retire_latency value is used in a metric formula, perf stat would fork a
> > > perf record process with "-e" and "-W" options. Perf record will collect
> > > required retire_latency values in parallel while perf stat is collecting
> > > counting values.
> > >
> > > At the point of time that perf stat stops counting, it would send sigterm
> > signal
> > > to perf record process and receiving sampling data back from perf record
> > from a
> > > pipe. Perf stat will then process the received data to get retire latency data
> > > and calculate metric result.
> > >
> > > Another thread is required to synchronize between perf stat and perf record
> > > when we pass data through pipe.
> > >
> > > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > > Reviewed-by: Ian Rogers <irogers@google.com>
> > > ---
> > [SNIP]
> > > diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> > > new file mode 100644
> > > index 000000000000..4b7a98794fae
> > > --- /dev/null
> > > +++ b/tools/perf/util/intel-tpebs.c
> > > @@ -0,0 +1,285 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * intel_pt.c: Intel Processor Trace support
> > > + * Copyright (c) 2013-2015, Intel Corporation.
> >
> > This needs some updates. :)
> >
> >
> > > + */
> > > +
> > > +
> > > +#include <sys/param.h>
> > > +#include <subcmd/run-command.h>
> > > +#include <thread.h>
> > > +#include "intel-tpebs.h"
> > > +#include <linux/list.h>
> > > +#include <linux/zalloc.h>
> > > +#include <linux/err.h>
> > > +#include "sample.h"
> > > +#include "debug.h"
> > > +#include "evlist.h"
> > > +#include "evsel.h"
> > > +#include "session.h"
> > > +#include "tool.h"
> > > +#include "metricgroup.h"
> > > +#include <sys/stat.h>
> > > +#include <sys/file.h>
> > > +
> > > +
> > > +
> > > +#define PERF_DATA "-"
> > > +#define CONTROL "/tmp/control"
> > > +#define ACK "/tmp/ack"
> > > +pthread_t reader_thread;
> > > +struct child_process *cmd;
> > > +struct perf_stat_config *stat_config;
> >
> > static ?
> >
> > > +
> > > +static int get_perf_record_args(const char **record_argv)
> > > +{
> > > + int i = 0;
> > > + struct tpebs_retire_lat *e;
> > > +
> > > + pr_debug("Prepare perf record for retire_latency\n");
> > > +
> > > + record_argv[i++] = "perf";
> > > + record_argv[i++] = "record";
> > > + record_argv[i++] = "-W";
> > > + record_argv[i++] = "--synth=no";
> >
> > Unfortunately this still synthesizes MMAP records for the kernel
> > and modules. As we don't care about them and just want to
> > minimize the overhead at the beginning, we can add
> > "--tail-synthesize" too.
>
> Hi Namhyung,
>
> I'm trying out the "--tail-synthesize" option but failed to get it work
> correctly. Could you please take a look at this command and let me
> know what's the problem?
>
> "perf record -e cycles:p --synth=no --tail-synthesize -W -a -o - sleep 1 | perf script -F retire_lat -i -".
>
> I got an error "0x40 [0x40]: failed to process type: 9" when run this command.
>
> If I run the command in two steps without pipe, they work fine.
Hmm... it may not work with pipes. Let's go without the
tail-synthesize then. Sorry for the noise.
Thanks,
Namhyung
> -----Original Message-----
> From: Namhyung Kim <namhyung@kernel.org>
> Sent: Monday, May 20, 2024 10:43 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v8 3/7] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> Hello,
>
> On Mon, May 20, 2024 at 5:10 PM Wang, Weilin <weilin.wang@intel.com>
> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Namhyung Kim <namhyung@kernel.org>
> > > Sent: Friday, May 17, 2024 2:43 PM
> > > To: Wang, Weilin <weilin.wang@intel.com>
> > > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > > <mingo@redhat.com>; Alexander Shishkin
> > > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor,
> Perry
> > > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>;
> Biggers,
> > > Caleb <caleb.biggers@intel.com>
> > > Subject: Re: [RFC PATCH v8 3/7] perf stat: Fork and launch perf record
> when
> > > perf stat needs to get retire latency value for a metric.
> > >
> > > On Tue, May 14, 2024 at 10:44 PM <weilin.wang@intel.com> wrote:
> > > >
> > > > From: Weilin Wang <weilin.wang@intel.com>
> > > >
> > > > When retire_latency value is used in a metric formula, perf stat would
> fork a
> > > > perf record process with "-e" and "-W" options. Perf record will collect
> > > > required retire_latency values in parallel while perf stat is collecting
> > > > counting values.
> > > >
> > > > At the point of time that perf stat stops counting, it would send sigterm
> > > signal
> > > > to perf record process and receiving sampling data back from perf record
> > > from a
> > > > pipe. Perf stat will then process the received data to get retire latency
> data
> > > > and calculate metric result.
> > > >
> > > > Another thread is required to synchronize between perf stat and perf
> record
> > > > when we pass data through pipe.
> > > >
> > > > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > > > Reviewed-by: Ian Rogers <irogers@google.com>
> > > > ---
> > > [SNIP]
> > > > diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> > > > new file mode 100644
> > > > index 000000000000..4b7a98794fae
> > > > --- /dev/null
> > > > +++ b/tools/perf/util/intel-tpebs.c
> > > > @@ -0,0 +1,285 @@
> > > > +// SPDX-License-Identifier: GPL-2.0-only
> > > > +/*
> > > > + * intel_pt.c: Intel Processor Trace support
> > > > + * Copyright (c) 2013-2015, Intel Corporation.
> > >
> > > This needs some updates. :)
> > >
> > >
> > > > + */
> > > > +
> > > > +
> > > > +#include <sys/param.h>
> > > > +#include <subcmd/run-command.h>
> > > > +#include <thread.h>
> > > > +#include "intel-tpebs.h"
> > > > +#include <linux/list.h>
> > > > +#include <linux/zalloc.h>
> > > > +#include <linux/err.h>
> > > > +#include "sample.h"
> > > > +#include "debug.h"
> > > > +#include "evlist.h"
> > > > +#include "evsel.h"
> > > > +#include "session.h"
> > > > +#include "tool.h"
> > > > +#include "metricgroup.h"
> > > > +#include <sys/stat.h>
> > > > +#include <sys/file.h>
> > > > +
> > > > +
> > > > +
> > > > +#define PERF_DATA "-"
> > > > +#define CONTROL "/tmp/control"
> > > > +#define ACK "/tmp/ack"
> > > > +pthread_t reader_thread;
> > > > +struct child_process *cmd;
> > > > +struct perf_stat_config *stat_config;
> > >
> > > static ?
> > >
> > > > +
> > > > +static int get_perf_record_args(const char **record_argv)
> > > > +{
> > > > + int i = 0;
> > > > + struct tpebs_retire_lat *e;
> > > > +
> > > > + pr_debug("Prepare perf record for retire_latency\n");
> > > > +
> > > > + record_argv[i++] = "perf";
> > > > + record_argv[i++] = "record";
> > > > + record_argv[i++] = "-W";
> > > > + record_argv[i++] = "--synth=no";
> > >
> > > Unfortunately this still synthesizes MMAP records for the kernel
> > > and modules. As we don't care about them and just want to
> > > minimize the overhead at the beginning, we can add
> > > "--tail-synthesize" too.
> >
> > Hi Namhyung,
> >
> > I'm trying out the "--tail-synthesize" option but failed to get it work
> > correctly. Could you please take a look at this command and let me
> > know what's the problem?
> >
> > "perf record -e cycles:p --synth=no --tail-synthesize -W -a -o - sleep 1 | perf
> script -F retire_lat -i -".
> >
> > I got an error "0x40 [0x40]: failed to process type: 9" when run this
> command.
> >
> > If I run the command in two steps without pipe, they work fine.
>
> Hmm... it may not work with pipes. Let's go without the
> tail-synthesize then. Sorry for the noise.
No problem.
Thanks,
Weilin
>
> Thanks,
> Namhyung
> -----Original Message-----
> From: Namhyung Kim <namhyung@kernel.org>
> Sent: Friday, May 17, 2024 2:43 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v8 3/7] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> On Tue, May 14, 2024 at 10:44 PM <weilin.wang@intel.com> wrote:
> >
> > From: Weilin Wang <weilin.wang@intel.com>
> >
> > When retire_latency value is used in a metric formula, perf stat would fork a
> > perf record process with "-e" and "-W" options. Perf record will collect
> > required retire_latency values in parallel while perf stat is collecting
> > counting values.
> >
> > At the point of time that perf stat stops counting, it would send sigterm
> signal
> > to perf record process and receiving sampling data back from perf record
> from a
> > pipe. Perf stat will then process the received data to get retire latency data
> > and calculate metric result.
> >
> > Another thread is required to synchronize between perf stat and perf record
> > when we pass data through pipe.
> >
> > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > Reviewed-by: Ian Rogers <irogers@google.com>
> > ---
> [SNIP]
> > diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> > new file mode 100644
> > index 000000000000..4b7a98794fae
> > --- /dev/null
> > +++ b/tools/perf/util/intel-tpebs.c
> > @@ -0,0 +1,285 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * intel_pt.c: Intel Processor Trace support
> > + * Copyright (c) 2013-2015, Intel Corporation.
>
> This needs some updates. :)
>
>
> > + */
> > +
> > +
> > +#include <sys/param.h>
> > +#include <subcmd/run-command.h>
> > +#include <thread.h>
> > +#include "intel-tpebs.h"
> > +#include <linux/list.h>
> > +#include <linux/zalloc.h>
> > +#include <linux/err.h>
> > +#include "sample.h"
> > +#include "debug.h"
> > +#include "evlist.h"
> > +#include "evsel.h"
> > +#include "session.h"
> > +#include "tool.h"
> > +#include "metricgroup.h"
> > +#include <sys/stat.h>
> > +#include <sys/file.h>
> > +
> > +
> > +
> > +#define PERF_DATA "-"
> > +#define CONTROL "/tmp/control"
> > +#define ACK "/tmp/ack"
> > +pthread_t reader_thread;
> > +struct child_process *cmd;
> > +struct perf_stat_config *stat_config;
>
> static ?
>
> > +
> > +static int get_perf_record_args(const char **record_argv)
> > +{
> > + int i = 0;
> > + struct tpebs_retire_lat *e;
> > +
> > + pr_debug("Prepare perf record for retire_latency\n");
> > +
> > + record_argv[i++] = "perf";
> > + record_argv[i++] = "record";
> > + record_argv[i++] = "-W";
> > + record_argv[i++] = "--synth=no";
>
> Unfortunately this still synthesizes MMAP records for the kernel
> and modules. As we don't care about them and just want to
> minimize the overhead at the beginning, we can add
> "--tail-synthesize" too.
>
>
> > + record_argv[i++] = "--control=fifo:/tmp/control,/tmp/ack";
>
> This hard-coded path won't work well when more than one users
> want to run the perf command at the same time.
>
Hi Namhyung,
Yes, this is not a good implementation. I'm going to update this named
pipe and all the related code.
Thanks,
Weilin
> Thanks,
> Namhyung
>
> > +
> > + if (stat_config->user_requested_cpu_list) {
> > + record_argv[i++] = "-C";
> > + record_argv[i++] = stat_config->user_requested_cpu_list;
> > + }
> > +
> > + if (stat_config->system_wide)
> > + record_argv[i++] = "-a";
> > +
> > + if (!stat_config->system_wide && !stat_config-
> >user_requested_cpu_list) {
> > + pr_err("Require -a or -C option to run sampling.\n");
> > + return -ECANCELED;
> > + }
> > +
> > + list_for_each_entry(e, &stat_config->tpebs_results, nd) {
> > + record_argv[i++] = "-e";
> > + record_argv[i++] = e->name;
> > + }
> > +
> > + record_argv[i++] = "-o";
> > + record_argv[i++] = PERF_DATA;
> > +
> > + return 0;
> > +}
> > +
> >
On Tue, May 14, 2024 at 10:44 PM <weilin.wang@intel.com> wrote:
>
> From: Weilin Wang <weilin.wang@intel.com>
>
> When retire_latency value is used in a metric formula, perf stat would fork a
> perf record process with "-e" and "-W" options. Perf record will collect
> required retire_latency values in parallel while perf stat is collecting
> counting values.
>
> At the point of time that perf stat stops counting, it would send sigterm signal
> to perf record process and receiving sampling data back from perf record from a
> pipe. Perf stat will then process the received data to get retire latency data
> and calculate metric result.
>
> Another thread is required to synchronize between perf stat and perf record
> when we pass data through pipe.
>
> Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> Reviewed-by: Ian Rogers <irogers@google.com>
So I don't think this is the correct way to add this. My reviewed-by
was based on the fact that I was going to refactor this after landing.
It didn't land and I already sent out:
"Retirement latency perf stat support"
https://lore.kernel.org/lkml/20240428053616.1125891-1-irogers@google.com/
v3 to just land the tool portion:
https://lore.kernel.org/lkml/20240503232849.17752-1-irogers@google.com/
So my reviewed-by no longer stands. The changes I've sent out haven't
been reviewed. Given you're trying to land this, can we work on
reviewing those changes? The v3 was specifically done just so that we
can have the cleaner basis for adding tpebs to the evsel.
Let me point to specific issues below:
> ---
> tools/perf/builtin-stat.c | 19 +++
> tools/perf/util/Build | 1 +
> tools/perf/util/intel-tpebs.c | 285 ++++++++++++++++++++++++++++++++++
> tools/perf/util/intel-tpebs.h | 29 ++++
> tools/perf/util/stat.h | 3 +
> 5 files changed, 337 insertions(+)
> create mode 100644 tools/perf/util/intel-tpebs.c
> create mode 100644 tools/perf/util/intel-tpebs.h
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 428e9721b908..85927cf45adb 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -70,6 +70,7 @@
> #include "util/bpf_counter.h"
> #include "util/iostat.h"
> #include "util/util.h"
> +#include "util/intel-tpebs.h"
> #include "asm/bug.h"
>
> #include <linux/time64.h>
> @@ -94,6 +95,7 @@
> #include <perf/evlist.h>
> #include <internal/threadmap.h>
>
> +
> #define DEFAULT_SEPARATOR " "
> #define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
>
> @@ -162,6 +164,8 @@ static struct perf_stat_config stat_config = {
> .ctl_fd = -1,
> .ctl_fd_ack = -1,
> .iostat_run = false,
> + .tpebs_results = LIST_HEAD_INIT(stat_config.tpebs_results),
> + .tpebs_pid = -1,
Here we're adding state to the stat_config for the sake of tpebs
events. Were the state in the evsel, as in my changes, we'd not need
to carry new global state for tpebs.
> };
>
> static void evlist__check_cpu_maps(struct evlist *evlist)
> @@ -653,6 +657,8 @@ static enum counter_recovery stat_handle_error(struct evsel *counter)
>
> if (child_pid != -1)
> kill(child_pid, SIGTERM);
> + if (stat_config.tpebs_pid != -1)
> + stop_tpebs();
This logic is builtin-stat but what about other commands that could be
using TPEBS for counters?
> return COUNTER_FATAL;
> }
>
> @@ -673,6 +679,10 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
> int err;
> bool second_pass = false;
>
> + err = start_tpebs(&stat_config, evsel_list);
> + if (err < 0)
> + return err;
> +
> if (forks) {
> if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe, workload_exec_failed_signal) < 0) {
> perror("failed to prepare workload");
> @@ -878,6 +888,10 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
>
> t1 = rdclock();
>
> + err = stop_tpebs();
> + if (err < 0)
> + return err;
> +
> if (stat_config.walltime_run_table)
> stat_config.walltime_run[run_idx] = t1 - t0;
>
> @@ -985,6 +999,9 @@ static void sig_atexit(void)
> if (child_pid != -1)
> kill(child_pid, SIGTERM);
>
> + if (stat_config.tpebs_pid != -1)
> + kill(stat_config.tpebs_pid, SIGTERM);
> +
> sigprocmask(SIG_SETMASK, &oset, NULL);
>
> if (signr == -1)
> @@ -2918,5 +2935,7 @@ int cmd_stat(int argc, const char **argv)
> metricgroup__rblist_exit(&stat_config.metric_events);
> evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack, &stat_config.ctl_fd_close);
>
> + tpebs_data__delete();
> +
> return status;
> }
> diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> index 292170a99ab6..c9f1d0bb6bf8 100644
> --- a/tools/perf/util/Build
> +++ b/tools/perf/util/Build
> @@ -153,6 +153,7 @@ perf-y += clockid.o
> perf-y += list_sort.o
> perf-y += mutex.o
> perf-y += sharded_mutex.o
> +perf-y += intel-tpebs.o
>
> perf-$(CONFIG_LIBBPF) += bpf_map.o
> perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
> diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> new file mode 100644
> index 000000000000..4b7a98794fae
> --- /dev/null
> +++ b/tools/perf/util/intel-tpebs.c
> @@ -0,0 +1,285 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * intel_pt.c: Intel Processor Trace support
> + * Copyright (c) 2013-2015, Intel Corporation.
> + */
> +
> +
> +#include <sys/param.h>
> +#include <subcmd/run-command.h>
> +#include <thread.h>
> +#include "intel-tpebs.h"
> +#include <linux/list.h>
> +#include <linux/zalloc.h>
> +#include <linux/err.h>
> +#include "sample.h"
> +#include "debug.h"
> +#include "evlist.h"
> +#include "evsel.h"
> +#include "session.h"
> +#include "tool.h"
> +#include "metricgroup.h"
> +#include <sys/stat.h>
> +#include <sys/file.h>
> +
> +
> +
> +#define PERF_DATA "-"
> +#define CONTROL "/tmp/control"
> +#define ACK "/tmp/ack"
> +pthread_t reader_thread;
> +struct child_process *cmd;
> +struct perf_stat_config *stat_config;
> +
> +static int get_perf_record_args(const char **record_argv)
> +{
> + int i = 0;
> + struct tpebs_retire_lat *e;
> +
> + pr_debug("Prepare perf record for retire_latency\n");
> +
> + record_argv[i++] = "perf";
> + record_argv[i++] = "record";
> + record_argv[i++] = "-W";
> + record_argv[i++] = "--synth=no";
There should be more things to disable, like BPF events, we don't need
the dummy, etc.
> + record_argv[i++] = "--control=fifo:/tmp/control,/tmp/ack";
> +
> + if (stat_config->user_requested_cpu_list) {
> + record_argv[i++] = "-C";
> + record_argv[i++] = stat_config->user_requested_cpu_list;
> + }
> +
> + if (stat_config->system_wide)
> + record_argv[i++] = "-a";
> +
> + if (!stat_config->system_wide && !stat_config->user_requested_cpu_list) {
> + pr_err("Require -a or -C option to run sampling.\n");
> + return -ECANCELED;
> + }
> +
> + list_for_each_entry(e, &stat_config->tpebs_results, nd) {
> + record_argv[i++] = "-e";
> + record_argv[i++] = e->name;
> + }
> +
> + record_argv[i++] = "-o";
> + record_argv[i++] = PERF_DATA;
> +
> + return 0;
> +}
> +
> +static int prepare_run_command(const char **argv)
> +{
> + cmd = zalloc(sizeof(struct child_process));
> + if (!cmd)
> + return -ENOMEM;
> + cmd->argv = argv;
> + cmd->out = -1;
> + return 0;
> +}
> +
> +static int prepare_perf_record(int tpebs_event_size)
> +{
> + const char **record_argv;
> + int ret;
> +
> + /*Create control and ack fd for --control*/
> + if (mkfifo(CONTROL, 0600)) {
> + pr_err("Failed to create control fifo");
> + return -1;
> + }
> + if (mkfifo(ACK, 0600)) {
> + pr_err("Failed to create control fifo");
> + return -1;
> + }
> +
> + record_argv = calloc(10 + 2 * tpebs_event_size, sizeof(char *));
> + if (!record_argv)
> + return -ENOMEM;
> +
> +
> + ret = get_perf_record_args(record_argv);
> + if (ret)
> + goto out;
> +
> + ret = prepare_run_command(record_argv);
> + if (ret)
> + goto out;
> + ret = start_command(cmd);
> +out:
> + free(record_argv);
> + return ret;
> +}
> +struct sample_data_reader {
> + struct perf_tool tool;
> + struct perf_session *session;
> +};
> +
> +static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
> +{
> + zfree(&r->name);
> + zfree(&r->tpebs_name);
> + free(r);
> +}
> +
> +void tpebs_data__delete(void)
> +{
> + struct tpebs_retire_lat *r, *rtmp;
> +
> + list_for_each_entry_safe(r, rtmp, &stat_config->tpebs_results, nd) {
> + list_del_init(&r->nd);
> + tpebs_retire_lat__delete(r);
> + }
> + free(cmd);
> +}
> +
> +static int process_sample_event(struct perf_tool *tool __maybe_unused,
> + union perf_event *event __maybe_unused,
> + struct perf_sample *sample,
> + struct evsel *evsel,
> + struct machine *machine __maybe_unused)
> +{
> + int ret = 0;
> + const char *evname;
> + struct tpebs_retire_lat *t;
> +
> + evname = evsel__name(evsel);
> +
> + /*
> + * Need to handle per core results? We are assuming average retire
> + * latency value will be used. Save the number of samples and the sum of
> + * retire latency value for each event.
> + */
> + list_for_each_entry(t, &stat_config->tpebs_results, nd) {
> + if (!strcmp(evname, t->name)) {
> + t->count += 1;
> + t->sum += sample->retire_lat;
> + t->val = (double) t->sum / t->count;
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
> +static int process_feature_event(struct perf_session *session,
> + union perf_event *event)
> +{
> + if (event->feat.feat_id < HEADER_LAST_FEATURE)
> + return perf_event__process_feature(session, event);
> + return 0;
> +}
> +
> +static void *__sample_reader(void *arg)
> +{
> + struct child_process *child = arg;
> + struct perf_session *session;
> + struct perf_data data = {
> + .mode = PERF_DATA_MODE_READ,
> + .path = PERF_DATA,
> + .file.fd = child->out,
> + };
> + struct sample_data_reader reader = {
> + .tool = {
> + .sample = process_sample_event,
> + .feature = process_feature_event,
> + .attr = perf_event__process_attr,
> + },
> + };
I prefer the reading approach here over what I did here:
https://lore.kernel.org/lkml/CAM9d7cgzTsfk3C+dTN80f5FhB1rmfturjuUUwvSTeUvny5eWKw@mail.gmail.com/
I'd done that as Namhyung had commented on using perf report:
https://lore.kernel.org/lkml/CAM9d7cgdQQn5GYB7t++xuoMdeqPXiEkkcop69+rD06RAnu9-EQ@mail.gmail.com/
It was also less work. I don't see why we can't move the reading logic
into a function like evsel__read_retire_latency that's in my change. I
also think that in the first instance all reading logic should be
implemented to return 0 and we only do the forking, etc. when a
command line flag is passed.
> +
> + session = perf_session__new(&data, &reader.tool);
> + if (IS_ERR(session))
> + return NULL;
> + reader.session = session;
> + perf_session__process_events(session);
> + perf_session__delete(session);
> +
> + return NULL;
> +}
> +
> +
> +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist *evsel_list)
> +{
> + int ret = 0;
> + struct evsel *evsel;
> + int control = -1, ack = -1;
> +
> + stat_config = perf_stat_config;
> + /*
> + * Prepare perf record for sampling event retire_latency before fork and
> + * prepare workload
> + */
> + evlist__for_each_entry(evsel_list, evsel) {
> + if (evsel->retire_lat) {
> + struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));
> + int i;
> + char *name;
> +
> + pr_debug("perf stat retire latency %s required\n", evsel->name);
> + if (!new)
> + return -1;
> + for (i = strlen(evsel->name) - 1; i > 0; i--) {
> + if (evsel->name[i] == 'R')
> + break;
> + }
> + if (i <= 0 || evsel->name[i] != 'R')
> + return -1;
> +
> + name = strdup(evsel->name);
> + if (!name)
> + return -ENOMEM;
> + name[i] = 'p';
> + new->name = strdup(name);
> + free(name);
> + new->tpebs_name = strdup(evsel->name);
> + if (!new->tpebs_name)
> + return -ENOMEM;
> + new->count = 0;
> + new->sum = 0;
> + list_add_tail(&new->nd, &stat_config->tpebs_results);
> + stat_config->tpebs_event_size += 1;
> + }
> + }
> +
> + if (stat_config->tpebs_event_size > 0) {
> + ret = prepare_perf_record(stat_config->tpebs_event_size);
> + if (ret)
> + return ret;
> + if (pthread_create(&reader_thread, NULL, __sample_reader, cmd)) {
> + kill(cmd->pid, SIGTERM);
> + close(cmd->out);
> + pr_err("Could not create thread to process sample data.\n");
> + return -1;
> + }
> + /* Wait for perf record initialization a little bit.*/
> + control = open(CONTROL, O_RDONLY, O_NONBLOCK);
> + if (!control)
> + return -1;
> + close(control);
> + ack = open(ACK, O_RDONLY, O_NONBLOCK);
> + if (!ack)
> + return -1;
> + close(ack);
> + pr_debug("Received ack from perf record\n");
> + }
> +
> + return ret;
> +}
> +
> +
> +int stop_tpebs(void)
> +{
> + int ret = 0;
> +
> + if (stat_config->tpebs_event_size > 0) {
> + kill(cmd->pid, SIGTERM);
> + pthread_join(reader_thread, NULL);
> + close(cmd->out);
> + ret = finish_command(cmd);
> + if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> + ret = 0;
> + remove(CONTROL);
> + remove(ACK);
> + }
> + return ret;
> +}
> diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h
> new file mode 100644
> index 000000000000..e8e2bb2f479b
> --- /dev/null
> +++ b/tools/perf/util/intel-tpebs.h
> @@ -0,0 +1,29 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * intel_pt.h: Intel Processor Trace support
> + * Copyright (c) 2013-2015, Intel Corporation.
> + */
> +#include "stat.h"
> +
> +#ifndef INCLUDE__PERF_INTEL_TPEBS_H__
> +#define INCLUDE__PERF_INTEL_TPEBS_H__
> +
> +struct tpebs_retire_lat {
> + struct list_head nd;
> + /* Event name */
> + const char *name;
> + /* Event name with the TPEBS modifier R */
> + const char *tpebs_name;
> + /* Count of retire_latency values found in sample data */
> + size_t count;
> + /* Sum of all the retire_latency values in sample data */
> + int sum;
> + /* Average of retire_latency, val = sum / count */
> + double val;
> +};
An evsel has a pretty much all of this and so we're duplicating in
particular the counting logic which then needs later work to integrate
and is why I'd prefer we went the evsel route.
Thanks,
Ian
> +
> +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist *evsel_list);
> +int stop_tpebs(void);
> +void tpebs_data__delete(void);
> +
> +#endif
> diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> index fd7a187551bd..c6c2aa43030f 100644
> --- a/tools/perf/util/stat.h
> +++ b/tools/perf/util/stat.h
> @@ -110,6 +110,9 @@ struct perf_stat_config {
> struct cpu_aggr_map *cpus_aggr_map;
> u64 *walltime_run;
> struct rblist metric_events;
> + size_t tpebs_event_size;
> + struct list_head tpebs_results;
> + pid_t tpebs_pid;
> int ctl_fd;
> int ctl_fd_ack;
> bool ctl_fd_close;
> --
> 2.43.0
>
> -----Original Message-----
> From: Ian Rogers <irogers@google.com>
> Sent: Thursday, May 16, 2024 9:43 AM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Namhyung Kim <namhyung@kernel.org>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v8 3/7] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> On Tue, May 14, 2024 at 10:44 PM <weilin.wang@intel.com> wrote:
> >
> > From: Weilin Wang <weilin.wang@intel.com>
> >
> > When retire_latency value is used in a metric formula, perf stat would fork a
> > perf record process with "-e" and "-W" options. Perf record will collect
> > required retire_latency values in parallel while perf stat is collecting
> > counting values.
> >
> > At the point of time that perf stat stops counting, it would send sigterm
> signal
> > to perf record process and receiving sampling data back from perf record
> from a
> > pipe. Perf stat will then process the received data to get retire latency data
> > and calculate metric result.
> >
> > Another thread is required to synchronize between perf stat and perf record
> > when we pass data through pipe.
> >
> > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > Reviewed-by: Ian Rogers <irogers@google.com>
>
> So I don't think this is the correct way to add this. My reviewed-by
> was based on the fact that I was going to refactor this after landing.
> It didn't land and I already sent out:
> "Retirement latency perf stat support"
> https://lore.kernel.org/lkml/20240428053616.1125891-1-
> irogers@google.com/
> v3 to just land the tool portion:
> https://lore.kernel.org/lkml/20240503232849.17752-1-
> irogers@google.com/
> So my reviewed-by no longer stands. The changes I've sent out haven't
> been reviewed. Given you're trying to land this, can we work on
> reviewing those changes? The v3 was specifically done just so that we
> can have the cleaner basis for adding tpebs to the evsel.
>
Sorry about carrying the reviewed-by in wrong places. I will remove them.
> Let me point to specific issues below:
>
> > ---
> > tools/perf/builtin-stat.c | 19 +++
> > tools/perf/util/Build | 1 +
> > tools/perf/util/intel-tpebs.c | 285
> ++++++++++++++++++++++++++++++++++
> > tools/perf/util/intel-tpebs.h | 29 ++++
> > tools/perf/util/stat.h | 3 +
> > 5 files changed, 337 insertions(+)
> > create mode 100644 tools/perf/util/intel-tpebs.c
> > create mode 100644 tools/perf/util/intel-tpebs.h
> >
> > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > index 428e9721b908..85927cf45adb 100644
> > --- a/tools/perf/builtin-stat.c
> > +++ b/tools/perf/builtin-stat.c
> > @@ -70,6 +70,7 @@
> > #include "util/bpf_counter.h"
> > #include "util/iostat.h"
> > #include "util/util.h"
> > +#include "util/intel-tpebs.h"
> > #include "asm/bug.h"
> >
> > #include <linux/time64.h>
> > @@ -94,6 +95,7 @@
> > #include <perf/evlist.h>
> > #include <internal/threadmap.h>
> >
> > +
> > #define DEFAULT_SEPARATOR " "
> > #define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
> >
> > @@ -162,6 +164,8 @@ static struct perf_stat_config stat_config = {
> > .ctl_fd = -1,
> > .ctl_fd_ack = -1,
> > .iostat_run = false,
> > + .tpebs_results = LIST_HEAD_INIT(stat_config.tpebs_results),
> > + .tpebs_pid = -1,
>
> Here we're adding state to the stat_config for the sake of tpebs
> events. Were the state in the evsel, as in my changes, we'd not need
> to carry new global state for tpebs.
Yes, I should remove the .tpebs_results out. How about .tpebs_pid?
It is used in builtin-stat later.
>
> > };
> >
> > static void evlist__check_cpu_maps(struct evlist *evlist)
> > @@ -653,6 +657,8 @@ static enum counter_recovery
> stat_handle_error(struct evsel *counter)
> >
> > if (child_pid != -1)
> > kill(child_pid, SIGTERM);
> > + if (stat_config.tpebs_pid != -1)
> > + stop_tpebs();
>
> This logic is builtin-stat but what about other commands that could be
> using TPEBS for counters?
Could you please let me know the commands that could use it?
I was thinking we could do this logic in evsel once the per core per event fork
get updated. I think that would work for all the commands.
>
> > return COUNTER_FATAL;
> > }
> >
> > @@ -673,6 +679,10 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> > int err;
> > bool second_pass = false;
> >
> > + err = start_tpebs(&stat_config, evsel_list);
> > + if (err < 0)
> > + return err;
> > +
> > if (forks) {
> > if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe,
> workload_exec_failed_signal) < 0) {
> > perror("failed to prepare workload");
> > @@ -878,6 +888,10 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> >
> > t1 = rdclock();
> >
> > + err = stop_tpebs();
> > + if (err < 0)
> > + return err;
> > +
> > if (stat_config.walltime_run_table)
> > stat_config.walltime_run[run_idx] = t1 - t0;
> >
> > @@ -985,6 +999,9 @@ static void sig_atexit(void)
> > if (child_pid != -1)
> > kill(child_pid, SIGTERM);
> >
> > + if (stat_config.tpebs_pid != -1)
> > + kill(stat_config.tpebs_pid, SIGTERM);
> > +
> > sigprocmask(SIG_SETMASK, &oset, NULL);
> >
> > if (signr == -1)
> > @@ -2918,5 +2935,7 @@ int cmd_stat(int argc, const char **argv)
> > metricgroup__rblist_exit(&stat_config.metric_events);
> > evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack,
> &stat_config.ctl_fd_close);
> >
> > + tpebs_data__delete();
> > +
> > return status;
> > }
> > diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> > index 292170a99ab6..c9f1d0bb6bf8 100644
> > --- a/tools/perf/util/Build
> > +++ b/tools/perf/util/Build
> > @@ -153,6 +153,7 @@ perf-y += clockid.o
> > perf-y += list_sort.o
> > perf-y += mutex.o
> > perf-y += sharded_mutex.o
> > +perf-y += intel-tpebs.o
> >
> > perf-$(CONFIG_LIBBPF) += bpf_map.o
> > perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
> > diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> > new file mode 100644
> > index 000000000000..4b7a98794fae
> > --- /dev/null
> > +++ b/tools/perf/util/intel-tpebs.c
> > @@ -0,0 +1,285 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * intel_pt.c: Intel Processor Trace support
> > + * Copyright (c) 2013-2015, Intel Corporation.
> > + */
> > +
> > +
> > +#include <sys/param.h>
> > +#include <subcmd/run-command.h>
> > +#include <thread.h>
> > +#include "intel-tpebs.h"
> > +#include <linux/list.h>
> > +#include <linux/zalloc.h>
> > +#include <linux/err.h>
> > +#include "sample.h"
> > +#include "debug.h"
> > +#include "evlist.h"
> > +#include "evsel.h"
> > +#include "session.h"
> > +#include "tool.h"
> > +#include "metricgroup.h"
> > +#include <sys/stat.h>
> > +#include <sys/file.h>
> > +
> > +
> > +
> > +#define PERF_DATA "-"
> > +#define CONTROL "/tmp/control"
> > +#define ACK "/tmp/ack"
> > +pthread_t reader_thread;
> > +struct child_process *cmd;
> > +struct perf_stat_config *stat_config;
> > +
> > +static int get_perf_record_args(const char **record_argv)
> > +{
> > + int i = 0;
> > + struct tpebs_retire_lat *e;
> > +
> > + pr_debug("Prepare perf record for retire_latency\n");
> > +
> > + record_argv[i++] = "perf";
> > + record_argv[i++] = "record";
> > + record_argv[i++] = "-W";
> > + record_argv[i++] = "--synth=no";
>
> There should be more things to disable, like BPF events, we don't need
> the dummy, etc.
Ok, I will add that.
>
> > + record_argv[i++] = "--control=fifo:/tmp/control,/tmp/ack";
> > +
> > + if (stat_config->user_requested_cpu_list) {
> > + record_argv[i++] = "-C";
> > + record_argv[i++] = stat_config->user_requested_cpu_list;
> > + }
> > +
> > + if (stat_config->system_wide)
> > + record_argv[i++] = "-a";
> > +
> > + if (!stat_config->system_wide && !stat_config-
> >user_requested_cpu_list) {
> > + pr_err("Require -a or -C option to run sampling.\n");
> > + return -ECANCELED;
> > + }
> > +
> > + list_for_each_entry(e, &stat_config->tpebs_results, nd) {
> > + record_argv[i++] = "-e";
> > + record_argv[i++] = e->name;
> > + }
> > +
> > + record_argv[i++] = "-o";
> > + record_argv[i++] = PERF_DATA;
> > +
> > + return 0;
> > +}
> > +
> > +static int prepare_run_command(const char **argv)
> > +{
> > + cmd = zalloc(sizeof(struct child_process));
> > + if (!cmd)
> > + return -ENOMEM;
> > + cmd->argv = argv;
> > + cmd->out = -1;
> > + return 0;
> > +}
> > +
> > +static int prepare_perf_record(int tpebs_event_size)
> > +{
> > + const char **record_argv;
> > + int ret;
> > +
> > + /*Create control and ack fd for --control*/
> > + if (mkfifo(CONTROL, 0600)) {
> > + pr_err("Failed to create control fifo");
> > + return -1;
> > + }
> > + if (mkfifo(ACK, 0600)) {
> > + pr_err("Failed to create control fifo");
> > + return -1;
> > + }
> > +
> > + record_argv = calloc(10 + 2 * tpebs_event_size, sizeof(char *));
> > + if (!record_argv)
> > + return -ENOMEM;
> > +
> > +
> > + ret = get_perf_record_args(record_argv);
> > + if (ret)
> > + goto out;
> > +
> > + ret = prepare_run_command(record_argv);
> > + if (ret)
> > + goto out;
> > + ret = start_command(cmd);
> > +out:
> > + free(record_argv);
> > + return ret;
> > +}
> > +struct sample_data_reader {
> > + struct perf_tool tool;
> > + struct perf_session *session;
> > +};
> > +
> > +static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
> > +{
> > + zfree(&r->name);
> > + zfree(&r->tpebs_name);
> > + free(r);
> > +}
> > +
> > +void tpebs_data__delete(void)
> > +{
> > + struct tpebs_retire_lat *r, *rtmp;
> > +
> > + list_for_each_entry_safe(r, rtmp, &stat_config->tpebs_results, nd) {
> > + list_del_init(&r->nd);
> > + tpebs_retire_lat__delete(r);
> > + }
> > + free(cmd);
> > +}
> > +
> > +static int process_sample_event(struct perf_tool *tool __maybe_unused,
> > + union perf_event *event __maybe_unused,
> > + struct perf_sample *sample,
> > + struct evsel *evsel,
> > + struct machine *machine __maybe_unused)
> > +{
> > + int ret = 0;
> > + const char *evname;
> > + struct tpebs_retire_lat *t;
> > +
> > + evname = evsel__name(evsel);
> > +
> > + /*
> > + * Need to handle per core results? We are assuming average retire
> > + * latency value will be used. Save the number of samples and the sum
> of
> > + * retire latency value for each event.
> > + */
> > + list_for_each_entry(t, &stat_config->tpebs_results, nd) {
> > + if (!strcmp(evname, t->name)) {
> > + t->count += 1;
> > + t->sum += sample->retire_lat;
> > + t->val = (double) t->sum / t->count;
> > + break;
> > + }
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +static int process_feature_event(struct perf_session *session,
> > + union perf_event *event)
> > +{
> > + if (event->feat.feat_id < HEADER_LAST_FEATURE)
> > + return perf_event__process_feature(session, event);
> > + return 0;
> > +}
> > +
> > +static void *__sample_reader(void *arg)
> > +{
> > + struct child_process *child = arg;
> > + struct perf_session *session;
> > + struct perf_data data = {
> > + .mode = PERF_DATA_MODE_READ,
> > + .path = PERF_DATA,
> > + .file.fd = child->out,
> > + };
> > + struct sample_data_reader reader = {
> > + .tool = {
> > + .sample = process_sample_event,
> > + .feature = process_feature_event,
> > + .attr = perf_event__process_attr,
> > + },
> > + };
>
> I prefer the reading approach here over what I did here:
> https://lore.kernel.org/lkml/CAM9d7cgzTsfk3C+dTN80f5FhB1rmfturjuUUwv
> STeUvny5eWKw@mail.gmail.com/
> I'd done that as Namhyung had commented on using perf report:
> https://lore.kernel.org/lkml/CAM9d7cgdQQn5GYB7t++xuoMdeqPXiEkkcop69
> +rD06RAnu9-EQ@mail.gmail.com/
> It was also less work. I don't see why we can't move the reading logic
> into a function like evsel__read_retire_latency that's in my change. I
> also think that in the first instance all reading logic should be
> implemented to return 0 and we only do the forking, etc. when a
> command line flag is passed.
Yes, a command line flag to turn it on is a good idea. Andi also suggested this.
I want to add this option in next version.
I tried to refactor the code so that you could easily reuse it later. You could either
put this code directly into evsel__read_retire_latency or call this function,
whichever way you feel better.
>
> > +
> > + session = perf_session__new(&data, &reader.tool);
> > + if (IS_ERR(session))
> > + return NULL;
> > + reader.session = session;
> > + perf_session__process_events(session);
> > + perf_session__delete(session);
> > +
> > + return NULL;
> > +}
> > +
> > +
> > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> *evsel_list)
> > +{
> > + int ret = 0;
> > + struct evsel *evsel;
> > + int control = -1, ack = -1;
> > +
> > + stat_config = perf_stat_config;
> > + /*
> > + * Prepare perf record for sampling event retire_latency before fork and
> > + * prepare workload
> > + */
> > + evlist__for_each_entry(evsel_list, evsel) {
> > + if (evsel->retire_lat) {
> > + struct tpebs_retire_lat *new = malloc(sizeof(struct
> tpebs_retire_lat));
> > + int i;
> > + char *name;
> > +
> > + pr_debug("perf stat retire latency %s required\n", evsel-
> >name);
> > + if (!new)
> > + return -1;
> > + for (i = strlen(evsel->name) - 1; i > 0; i--) {
> > + if (evsel->name[i] == 'R')
> > + break;
> > + }
> > + if (i <= 0 || evsel->name[i] != 'R')
> > + return -1;
> > +
> > + name = strdup(evsel->name);
> > + if (!name)
> > + return -ENOMEM;
> > + name[i] = 'p';
> > + new->name = strdup(name);
> > + free(name);
> > + new->tpebs_name = strdup(evsel->name);
> > + if (!new->tpebs_name)
> > + return -ENOMEM;
> > + new->count = 0;
> > + new->sum = 0;
> > + list_add_tail(&new->nd, &stat_config->tpebs_results);
> > + stat_config->tpebs_event_size += 1;
> > + }
> > + }
> > +
> > + if (stat_config->tpebs_event_size > 0) {
> > + ret = prepare_perf_record(stat_config->tpebs_event_size);
> > + if (ret)
> > + return ret;
> > + if (pthread_create(&reader_thread, NULL, __sample_reader, cmd)) {
> > + kill(cmd->pid, SIGTERM);
> > + close(cmd->out);
> > + pr_err("Could not create thread to process sample data.\n");
> > + return -1;
> > + }
> > + /* Wait for perf record initialization a little bit.*/
> > + control = open(CONTROL, O_RDONLY, O_NONBLOCK);
> > + if (!control)
> > + return -1;
> > + close(control);
> > + ack = open(ACK, O_RDONLY, O_NONBLOCK);
> > + if (!ack)
> > + return -1;
> > + close(ack);
> > + pr_debug("Received ack from perf record\n");
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +
> > +int stop_tpebs(void)
> > +{
> > + int ret = 0;
> > +
> > + if (stat_config->tpebs_event_size > 0) {
> > + kill(cmd->pid, SIGTERM);
> > + pthread_join(reader_thread, NULL);
> > + close(cmd->out);
> > + ret = finish_command(cmd);
> > + if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> > + ret = 0;
> > + remove(CONTROL);
> > + remove(ACK);
> > + }
> > + return ret;
> > +}
> > diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h
> > new file mode 100644
> > index 000000000000..e8e2bb2f479b
> > --- /dev/null
> > +++ b/tools/perf/util/intel-tpebs.h
> > @@ -0,0 +1,29 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * intel_pt.h: Intel Processor Trace support
> > + * Copyright (c) 2013-2015, Intel Corporation.
> > + */
> > +#include "stat.h"
> > +
> > +#ifndef INCLUDE__PERF_INTEL_TPEBS_H__
> > +#define INCLUDE__PERF_INTEL_TPEBS_H__
> > +
> > +struct tpebs_retire_lat {
> > + struct list_head nd;
> > + /* Event name */
> > + const char *name;
> > + /* Event name with the TPEBS modifier R */
> > + const char *tpebs_name;
> > + /* Count of retire_latency values found in sample data */
> > + size_t count;
> > + /* Sum of all the retire_latency values in sample data */
> > + int sum;
> > + /* Average of retire_latency, val = sum / count */
> > + double val;
> > +};
>
> An evsel has a pretty much all of this and so we're duplicating in
> particular the counting logic which then needs later work to integrate
> and is why I'd prefer we went the evsel route.
I wanted to reuse evsel as much as I could. But as you know, I had some issue
to use it and had to remove the code. I'm going to explore more and see if I could
get it work with evsel better.
Comparing with last version, do you think this change is toward the direct that you
Want? I believe we could get this code to use evsel more.
Thanks,
Weilin
>
> Thanks,
> Ian
>
> > +
> > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> *evsel_list);
> > +int stop_tpebs(void);
> > +void tpebs_data__delete(void);
> > +
> > +#endif
> > diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> > index fd7a187551bd..c6c2aa43030f 100644
> > --- a/tools/perf/util/stat.h
> > +++ b/tools/perf/util/stat.h
> > @@ -110,6 +110,9 @@ struct perf_stat_config {
> > struct cpu_aggr_map *cpus_aggr_map;
> > u64 *walltime_run;
> > struct rblist metric_events;
> > + size_t tpebs_event_size;
> > + struct list_head tpebs_results;
> > + pid_t tpebs_pid;
> > int ctl_fd;
> > int ctl_fd_ack;
> > bool ctl_fd_close;
> > --
> > 2.43.0
> >
On Thu, May 16, 2024 at 10:38 AM Wang, Weilin <weilin.wang@intel.com> wrote:
>
> > -----Original Message-----
> > From: Ian Rogers <irogers@google.com>
> > Sent: Thursday, May 16, 2024 9:43 AM
> > To: Wang, Weilin <weilin.wang@intel.com>
> > Cc: Namhyung Kim <namhyung@kernel.org>; Arnaldo Carvalho de Melo
> > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > <mingo@redhat.com>; Alexander Shishkin
> > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> > Caleb <caleb.biggers@intel.com>
> > Subject: Re: [RFC PATCH v8 3/7] perf stat: Fork and launch perf record when
> > perf stat needs to get retire latency value for a metric.
> >
> > On Tue, May 14, 2024 at 10:44 PM <weilin.wang@intel.com> wrote:
> > >
> > > From: Weilin Wang <weilin.wang@intel.com>
> > >
> > > When retire_latency value is used in a metric formula, perf stat would fork a
> > > perf record process with "-e" and "-W" options. Perf record will collect
> > > required retire_latency values in parallel while perf stat is collecting
> > > counting values.
> > >
> > > At the point of time that perf stat stops counting, it would send sigterm
> > signal
> > > to perf record process and receiving sampling data back from perf record
> > from a
> > > pipe. Perf stat will then process the received data to get retire latency data
> > > and calculate metric result.
> > >
> > > Another thread is required to synchronize between perf stat and perf record
> > > when we pass data through pipe.
> > >
> > > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > > Reviewed-by: Ian Rogers <irogers@google.com>
> >
> > So I don't think this is the correct way to add this. My reviewed-by
> > was based on the fact that I was going to refactor this after landing.
> > It didn't land and I already sent out:
> > "Retirement latency perf stat support"
> > https://lore.kernel.org/lkml/20240428053616.1125891-1-
> > irogers@google.com/
> > v3 to just land the tool portion:
> > https://lore.kernel.org/lkml/20240503232849.17752-1-
> > irogers@google.com/
> > So my reviewed-by no longer stands. The changes I've sent out haven't
> > been reviewed. Given you're trying to land this, can we work on
> > reviewing those changes? The v3 was specifically done just so that we
> > can have the cleaner basis for adding tpebs to the evsel.
> >
>
> Sorry about carrying the reviewed-by in wrong places. I will remove them.
>
> > Let me point to specific issues below:
> >
> > > ---
> > > tools/perf/builtin-stat.c | 19 +++
> > > tools/perf/util/Build | 1 +
> > > tools/perf/util/intel-tpebs.c | 285
> > ++++++++++++++++++++++++++++++++++
> > > tools/perf/util/intel-tpebs.h | 29 ++++
> > > tools/perf/util/stat.h | 3 +
> > > 5 files changed, 337 insertions(+)
> > > create mode 100644 tools/perf/util/intel-tpebs.c
> > > create mode 100644 tools/perf/util/intel-tpebs.h
> > >
> > > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > > index 428e9721b908..85927cf45adb 100644
> > > --- a/tools/perf/builtin-stat.c
> > > +++ b/tools/perf/builtin-stat.c
> > > @@ -70,6 +70,7 @@
> > > #include "util/bpf_counter.h"
> > > #include "util/iostat.h"
> > > #include "util/util.h"
> > > +#include "util/intel-tpebs.h"
> > > #include "asm/bug.h"
> > >
> > > #include <linux/time64.h>
> > > @@ -94,6 +95,7 @@
> > > #include <perf/evlist.h>
> > > #include <internal/threadmap.h>
> > >
> > > +
> > > #define DEFAULT_SEPARATOR " "
> > > #define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
> > >
> > > @@ -162,6 +164,8 @@ static struct perf_stat_config stat_config = {
> > > .ctl_fd = -1,
> > > .ctl_fd_ack = -1,
> > > .iostat_run = false,
> > > + .tpebs_results = LIST_HEAD_INIT(stat_config.tpebs_results),
> > > + .tpebs_pid = -1,
> >
> > Here we're adding state to the stat_config for the sake of tpebs
> > events. Were the state in the evsel, as in my changes, we'd not need
> > to carry new global state for tpebs.
>
> Yes, I should remove the .tpebs_results out. How about .tpebs_pid?
> It is used in builtin-stat later.
>
> >
> > > };
> > >
> > > static void evlist__check_cpu_maps(struct evlist *evlist)
> > > @@ -653,6 +657,8 @@ static enum counter_recovery
> > stat_handle_error(struct evsel *counter)
> > >
> > > if (child_pid != -1)
> > > kill(child_pid, SIGTERM);
> > > + if (stat_config.tpebs_pid != -1)
> > > + stop_tpebs();
> >
> > This logic is builtin-stat but what about other commands that could be
> > using TPEBS for counters?
>
> Could you please let me know the commands that could use it?
> I was thinking we could do this logic in evsel once the per core per event fork
> get updated. I think that would work for all the commands.
Using this code refactored into evsels is what I'd like to see, so I
think we're agreed. I'm not thinking of a main command reading
counters. The evlist and evsel are exposed through python so
potentially any script could use the values, which is why it'd be best
to abstract them behind an evsel.
> >
> > > return COUNTER_FATAL;
> > > }
> > >
> > > @@ -673,6 +679,10 @@ static int __run_perf_stat(int argc, const char
> > **argv, int run_idx)
> > > int err;
> > > bool second_pass = false;
> > >
> > > + err = start_tpebs(&stat_config, evsel_list);
> > > + if (err < 0)
> > > + return err;
> > > +
> > > if (forks) {
> > > if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe,
> > workload_exec_failed_signal) < 0) {
> > > perror("failed to prepare workload");
> > > @@ -878,6 +888,10 @@ static int __run_perf_stat(int argc, const char
> > **argv, int run_idx)
> > >
> > > t1 = rdclock();
> > >
> > > + err = stop_tpebs();
> > > + if (err < 0)
> > > + return err;
> > > +
> > > if (stat_config.walltime_run_table)
> > > stat_config.walltime_run[run_idx] = t1 - t0;
> > >
> > > @@ -985,6 +999,9 @@ static void sig_atexit(void)
> > > if (child_pid != -1)
> > > kill(child_pid, SIGTERM);
> > >
> > > + if (stat_config.tpebs_pid != -1)
> > > + kill(stat_config.tpebs_pid, SIGTERM);
> > > +
> > > sigprocmask(SIG_SETMASK, &oset, NULL);
> > >
> > > if (signr == -1)
> > > @@ -2918,5 +2935,7 @@ int cmd_stat(int argc, const char **argv)
> > > metricgroup__rblist_exit(&stat_config.metric_events);
> > > evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack,
> > &stat_config.ctl_fd_close);
> > >
> > > + tpebs_data__delete();
> > > +
> > > return status;
> > > }
> > > diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> > > index 292170a99ab6..c9f1d0bb6bf8 100644
> > > --- a/tools/perf/util/Build
> > > +++ b/tools/perf/util/Build
> > > @@ -153,6 +153,7 @@ perf-y += clockid.o
> > > perf-y += list_sort.o
> > > perf-y += mutex.o
> > > perf-y += sharded_mutex.o
> > > +perf-y += intel-tpebs.o
> > >
> > > perf-$(CONFIG_LIBBPF) += bpf_map.o
> > > perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
> > > diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> > > new file mode 100644
> > > index 000000000000..4b7a98794fae
> > > --- /dev/null
> > > +++ b/tools/perf/util/intel-tpebs.c
> > > @@ -0,0 +1,285 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * intel_pt.c: Intel Processor Trace support
> > > + * Copyright (c) 2013-2015, Intel Corporation.
> > > + */
> > > +
> > > +
> > > +#include <sys/param.h>
> > > +#include <subcmd/run-command.h>
> > > +#include <thread.h>
> > > +#include "intel-tpebs.h"
> > > +#include <linux/list.h>
> > > +#include <linux/zalloc.h>
> > > +#include <linux/err.h>
> > > +#include "sample.h"
> > > +#include "debug.h"
> > > +#include "evlist.h"
> > > +#include "evsel.h"
> > > +#include "session.h"
> > > +#include "tool.h"
> > > +#include "metricgroup.h"
> > > +#include <sys/stat.h>
> > > +#include <sys/file.h>
> > > +
> > > +
> > > +
> > > +#define PERF_DATA "-"
> > > +#define CONTROL "/tmp/control"
> > > +#define ACK "/tmp/ack"
> > > +pthread_t reader_thread;
> > > +struct child_process *cmd;
> > > +struct perf_stat_config *stat_config;
> > > +
> > > +static int get_perf_record_args(const char **record_argv)
> > > +{
> > > + int i = 0;
> > > + struct tpebs_retire_lat *e;
> > > +
> > > + pr_debug("Prepare perf record for retire_latency\n");
> > > +
> > > + record_argv[i++] = "perf";
> > > + record_argv[i++] = "record";
> > > + record_argv[i++] = "-W";
> > > + record_argv[i++] = "--synth=no";
> >
> > There should be more things to disable, like BPF events, we don't need
> > the dummy, etc.
>
> Ok, I will add that.
Well this is my comment that we shouldn't be beefing up "perf record"
for retirement latencies, it'd be better to avoid the fork altogether.
Adding "--no-bpf-event" is simple.
> >
> > > + record_argv[i++] = "--control=fifo:/tmp/control,/tmp/ack";
> > > +
> > > + if (stat_config->user_requested_cpu_list) {
> > > + record_argv[i++] = "-C";
> > > + record_argv[i++] = stat_config->user_requested_cpu_list;
> > > + }
> > > +
> > > + if (stat_config->system_wide)
> > > + record_argv[i++] = "-a";
> > > +
> > > + if (!stat_config->system_wide && !stat_config-
> > >user_requested_cpu_list) {
> > > + pr_err("Require -a or -C option to run sampling.\n");
> > > + return -ECANCELED;
> > > + }
> > > +
> > > + list_for_each_entry(e, &stat_config->tpebs_results, nd) {
> > > + record_argv[i++] = "-e";
> > > + record_argv[i++] = e->name;
> > > + }
> > > +
> > > + record_argv[i++] = "-o";
> > > + record_argv[i++] = PERF_DATA;
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static int prepare_run_command(const char **argv)
> > > +{
> > > + cmd = zalloc(sizeof(struct child_process));
> > > + if (!cmd)
> > > + return -ENOMEM;
> > > + cmd->argv = argv;
> > > + cmd->out = -1;
> > > + return 0;
> > > +}
> > > +
> > > +static int prepare_perf_record(int tpebs_event_size)
> > > +{
> > > + const char **record_argv;
> > > + int ret;
> > > +
> > > + /*Create control and ack fd for --control*/
> > > + if (mkfifo(CONTROL, 0600)) {
> > > + pr_err("Failed to create control fifo");
> > > + return -1;
> > > + }
> > > + if (mkfifo(ACK, 0600)) {
> > > + pr_err("Failed to create control fifo");
> > > + return -1;
> > > + }
> > > +
> > > + record_argv = calloc(10 + 2 * tpebs_event_size, sizeof(char *));
> > > + if (!record_argv)
> > > + return -ENOMEM;
> > > +
> > > +
> > > + ret = get_perf_record_args(record_argv);
> > > + if (ret)
> > > + goto out;
> > > +
> > > + ret = prepare_run_command(record_argv);
> > > + if (ret)
> > > + goto out;
> > > + ret = start_command(cmd);
> > > +out:
> > > + free(record_argv);
> > > + return ret;
> > > +}
> > > +struct sample_data_reader {
> > > + struct perf_tool tool;
> > > + struct perf_session *session;
> > > +};
> > > +
> > > +static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
> > > +{
> > > + zfree(&r->name);
> > > + zfree(&r->tpebs_name);
> > > + free(r);
> > > +}
> > > +
> > > +void tpebs_data__delete(void)
> > > +{
> > > + struct tpebs_retire_lat *r, *rtmp;
> > > +
> > > + list_for_each_entry_safe(r, rtmp, &stat_config->tpebs_results, nd) {
> > > + list_del_init(&r->nd);
> > > + tpebs_retire_lat__delete(r);
> > > + }
> > > + free(cmd);
> > > +}
> > > +
> > > +static int process_sample_event(struct perf_tool *tool __maybe_unused,
> > > + union perf_event *event __maybe_unused,
> > > + struct perf_sample *sample,
> > > + struct evsel *evsel,
> > > + struct machine *machine __maybe_unused)
> > > +{
> > > + int ret = 0;
> > > + const char *evname;
> > > + struct tpebs_retire_lat *t;
> > > +
> > > + evname = evsel__name(evsel);
> > > +
> > > + /*
> > > + * Need to handle per core results? We are assuming average retire
> > > + * latency value will be used. Save the number of samples and the sum
> > of
> > > + * retire latency value for each event.
> > > + */
> > > + list_for_each_entry(t, &stat_config->tpebs_results, nd) {
> > > + if (!strcmp(evname, t->name)) {
> > > + t->count += 1;
> > > + t->sum += sample->retire_lat;
> > > + t->val = (double) t->sum / t->count;
> > > + break;
> > > + }
> > > + }
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +static int process_feature_event(struct perf_session *session,
> > > + union perf_event *event)
> > > +{
> > > + if (event->feat.feat_id < HEADER_LAST_FEATURE)
> > > + return perf_event__process_feature(session, event);
> > > + return 0;
> > > +}
> > > +
> > > +static void *__sample_reader(void *arg)
> > > +{
> > > + struct child_process *child = arg;
> > > + struct perf_session *session;
> > > + struct perf_data data = {
> > > + .mode = PERF_DATA_MODE_READ,
> > > + .path = PERF_DATA,
> > > + .file.fd = child->out,
> > > + };
> > > + struct sample_data_reader reader = {
> > > + .tool = {
> > > + .sample = process_sample_event,
> > > + .feature = process_feature_event,
> > > + .attr = perf_event__process_attr,
> > > + },
> > > + };
> >
> > I prefer the reading approach here over what I did here:
> > https://lore.kernel.org/lkml/CAM9d7cgzTsfk3C+dTN80f5FhB1rmfturjuUUwv
> > STeUvny5eWKw@mail.gmail.com/
> > I'd done that as Namhyung had commented on using perf report:
> > https://lore.kernel.org/lkml/CAM9d7cgdQQn5GYB7t++xuoMdeqPXiEkkcop69
> > +rD06RAnu9-EQ@mail.gmail.com/
> > It was also less work. I don't see why we can't move the reading logic
> > into a function like evsel__read_retire_latency that's in my change. I
> > also think that in the first instance all reading logic should be
> > implemented to return 0 and we only do the forking, etc. when a
> > command line flag is passed.
>
> Yes, a command line flag to turn it on is a good idea. Andi also suggested this.
> I want to add this option in next version.
>
> I tried to refactor the code so that you could easily reuse it later. You could either
> put this code directly into evsel__read_retire_latency or call this function,
> whichever way you feel better.
Could you add that into the series rather than leaving it as work for me?
> >
> > > +
> > > + session = perf_session__new(&data, &reader.tool);
> > > + if (IS_ERR(session))
> > > + return NULL;
> > > + reader.session = session;
> > > + perf_session__process_events(session);
> > > + perf_session__delete(session);
> > > +
> > > + return NULL;
> > > +}
> > > +
> > > +
> > > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> > *evsel_list)
> > > +{
> > > + int ret = 0;
> > > + struct evsel *evsel;
> > > + int control = -1, ack = -1;
> > > +
> > > + stat_config = perf_stat_config;
> > > + /*
> > > + * Prepare perf record for sampling event retire_latency before fork and
> > > + * prepare workload
> > > + */
> > > + evlist__for_each_entry(evsel_list, evsel) {
> > > + if (evsel->retire_lat) {
> > > + struct tpebs_retire_lat *new = malloc(sizeof(struct
> > tpebs_retire_lat));
> > > + int i;
> > > + char *name;
> > > +
> > > + pr_debug("perf stat retire latency %s required\n", evsel-
> > >name);
> > > + if (!new)
> > > + return -1;
> > > + for (i = strlen(evsel->name) - 1; i > 0; i--) {
> > > + if (evsel->name[i] == 'R')
> > > + break;
> > > + }
> > > + if (i <= 0 || evsel->name[i] != 'R')
> > > + return -1;
> > > +
> > > + name = strdup(evsel->name);
> > > + if (!name)
> > > + return -ENOMEM;
> > > + name[i] = 'p';
> > > + new->name = strdup(name);
> > > + free(name);
> > > + new->tpebs_name = strdup(evsel->name);
> > > + if (!new->tpebs_name)
> > > + return -ENOMEM;
> > > + new->count = 0;
> > > + new->sum = 0;
> > > + list_add_tail(&new->nd, &stat_config->tpebs_results);
> > > + stat_config->tpebs_event_size += 1;
> > > + }
> > > + }
> > > +
> > > + if (stat_config->tpebs_event_size > 0) {
> > > + ret = prepare_perf_record(stat_config->tpebs_event_size);
> > > + if (ret)
> > > + return ret;
> > > + if (pthread_create(&reader_thread, NULL, __sample_reader, cmd)) {
> > > + kill(cmd->pid, SIGTERM);
> > > + close(cmd->out);
> > > + pr_err("Could not create thread to process sample data.\n");
> > > + return -1;
> > > + }
> > > + /* Wait for perf record initialization a little bit.*/
> > > + control = open(CONTROL, O_RDONLY, O_NONBLOCK);
> > > + if (!control)
> > > + return -1;
> > > + close(control);
> > > + ack = open(ACK, O_RDONLY, O_NONBLOCK);
> > > + if (!ack)
> > > + return -1;
> > > + close(ack);
> > > + pr_debug("Received ack from perf record\n");
> > > + }
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +
> > > +int stop_tpebs(void)
> > > +{
> > > + int ret = 0;
> > > +
> > > + if (stat_config->tpebs_event_size > 0) {
> > > + kill(cmd->pid, SIGTERM);
> > > + pthread_join(reader_thread, NULL);
> > > + close(cmd->out);
> > > + ret = finish_command(cmd);
> > > + if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> > > + ret = 0;
> > > + remove(CONTROL);
> > > + remove(ACK);
> > > + }
> > > + return ret;
> > > +}
> > > diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h
> > > new file mode 100644
> > > index 000000000000..e8e2bb2f479b
> > > --- /dev/null
> > > +++ b/tools/perf/util/intel-tpebs.h
> > > @@ -0,0 +1,29 @@
> > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > +/*
> > > + * intel_pt.h: Intel Processor Trace support
> > > + * Copyright (c) 2013-2015, Intel Corporation.
> > > + */
> > > +#include "stat.h"
> > > +
> > > +#ifndef INCLUDE__PERF_INTEL_TPEBS_H__
> > > +#define INCLUDE__PERF_INTEL_TPEBS_H__
> > > +
> > > +struct tpebs_retire_lat {
> > > + struct list_head nd;
> > > + /* Event name */
> > > + const char *name;
> > > + /* Event name with the TPEBS modifier R */
> > > + const char *tpebs_name;
> > > + /* Count of retire_latency values found in sample data */
> > > + size_t count;
> > > + /* Sum of all the retire_latency values in sample data */
> > > + int sum;
> > > + /* Average of retire_latency, val = sum / count */
> > > + double val;
> > > +};
> >
> > An evsel has a pretty much all of this and so we're duplicating in
> > particular the counting logic which then needs later work to integrate
> > and is why I'd prefer we went the evsel route.
>
> I wanted to reuse evsel as much as I could. But as you know, I had some issue
> to use it and had to remove the code. I'm going to explore more and see if I could
> get it work with evsel better.
Ok, it was never clear to me what the issue was.
> Comparing with last version, do you think this change is toward the direct that you
> Want? I believe we could get this code to use evsel more.
I think with the evsels and the record/reading logic here things will
be in a good place. There will be more work to do to have the
retirement latency respect options like the cgroups, perhaps the BPF
filters are an option there.
https://lore.kernel.org/lkml/20240516041948.3546553-1-irogers@google.com/
Thanks,
Ian
> Thanks,
> Weilin
>
> >
> > Thanks,
> > Ian
> >
> > > +
> > > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> > *evsel_list);
> > > +int stop_tpebs(void);
> > > +void tpebs_data__delete(void);
> > > +
> > > +#endif
> > > diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> > > index fd7a187551bd..c6c2aa43030f 100644
> > > --- a/tools/perf/util/stat.h
> > > +++ b/tools/perf/util/stat.h
> > > @@ -110,6 +110,9 @@ struct perf_stat_config {
> > > struct cpu_aggr_map *cpus_aggr_map;
> > > u64 *walltime_run;
> > > struct rblist metric_events;
> > > + size_t tpebs_event_size;
> > > + struct list_head tpebs_results;
> > > + pid_t tpebs_pid;
> > > int ctl_fd;
> > > int ctl_fd_ack;
> > > bool ctl_fd_close;
> > > --
> > > 2.43.0
> > >
> -----Original Message-----
> From: Ian Rogers <irogers@google.com>
> Sent: Thursday, May 16, 2024 11:07 AM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Namhyung Kim <namhyung@kernel.org>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v8 3/7] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> On Thu, May 16, 2024 at 10:38 AM Wang, Weilin <weilin.wang@intel.com>
> wrote:
> >
> > > -----Original Message-----
> > > From: Ian Rogers <irogers@google.com>
> > > Sent: Thursday, May 16, 2024 9:43 AM
> > > To: Wang, Weilin <weilin.wang@intel.com>
> > > Cc: Namhyung Kim <namhyung@kernel.org>; Arnaldo Carvalho de Melo
> > > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > > <mingo@redhat.com>; Alexander Shishkin
> > > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor,
> Perry
> > > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>;
> Biggers,
> > > Caleb <caleb.biggers@intel.com>
> > > Subject: Re: [RFC PATCH v8 3/7] perf stat: Fork and launch perf record
> when
> > > perf stat needs to get retire latency value for a metric.
> > >
> > > On Tue, May 14, 2024 at 10:44 PM <weilin.wang@intel.com> wrote:
> > > >
> > > > From: Weilin Wang <weilin.wang@intel.com>
> > > >
> > > > When retire_latency value is used in a metric formula, perf stat would
> fork a
> > > > perf record process with "-e" and "-W" options. Perf record will collect
> > > > required retire_latency values in parallel while perf stat is collecting
> > > > counting values.
> > > >
> > > > At the point of time that perf stat stops counting, it would send sigterm
> > > signal
> > > > to perf record process and receiving sampling data back from perf record
> > > from a
> > > > pipe. Perf stat will then process the received data to get retire latency
> data
> > > > and calculate metric result.
> > > >
> > > > Another thread is required to synchronize between perf stat and perf
> record
> > > > when we pass data through pipe.
> > > >
> > > > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > > > Reviewed-by: Ian Rogers <irogers@google.com>
> > >
> > > So I don't think this is the correct way to add this. My reviewed-by
> > > was based on the fact that I was going to refactor this after landing.
> > > It didn't land and I already sent out:
> > > "Retirement latency perf stat support"
> > > https://lore.kernel.org/lkml/20240428053616.1125891-1-
> > > irogers@google.com/
> > > v3 to just land the tool portion:
> > > https://lore.kernel.org/lkml/20240503232849.17752-1-
> > > irogers@google.com/
> > > So my reviewed-by no longer stands. The changes I've sent out haven't
> > > been reviewed. Given you're trying to land this, can we work on
> > > reviewing those changes? The v3 was specifically done just so that we
> > > can have the cleaner basis for adding tpebs to the evsel.
> > >
> >
> > Sorry about carrying the reviewed-by in wrong places. I will remove them.
> >
> > > Let me point to specific issues below:
> > >
> > > > ---
> > > > tools/perf/builtin-stat.c | 19 +++
> > > > tools/perf/util/Build | 1 +
> > > > tools/perf/util/intel-tpebs.c | 285
> > > ++++++++++++++++++++++++++++++++++
> > > > tools/perf/util/intel-tpebs.h | 29 ++++
> > > > tools/perf/util/stat.h | 3 +
> > > > 5 files changed, 337 insertions(+)
> > > > create mode 100644 tools/perf/util/intel-tpebs.c
> > > > create mode 100644 tools/perf/util/intel-tpebs.h
> > > >
> > > > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > > > index 428e9721b908..85927cf45adb 100644
> > > > --- a/tools/perf/builtin-stat.c
> > > > +++ b/tools/perf/builtin-stat.c
> > > > @@ -70,6 +70,7 @@
> > > > #include "util/bpf_counter.h"
> > > > #include "util/iostat.h"
> > > > #include "util/util.h"
> > > > +#include "util/intel-tpebs.h"
> > > > #include "asm/bug.h"
> > > >
> > > > #include <linux/time64.h>
> > > > @@ -94,6 +95,7 @@
> > > > #include <perf/evlist.h>
> > > > #include <internal/threadmap.h>
> > > >
> > > > +
> > > > #define DEFAULT_SEPARATOR " "
> > > > #define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
> > > >
> > > > @@ -162,6 +164,8 @@ static struct perf_stat_config stat_config = {
> > > > .ctl_fd = -1,
> > > > .ctl_fd_ack = -1,
> > > > .iostat_run = false,
> > > > + .tpebs_results = LIST_HEAD_INIT(stat_config.tpebs_results),
> > > > + .tpebs_pid = -1,
> > >
> > > Here we're adding state to the stat_config for the sake of tpebs
> > > events. Were the state in the evsel, as in my changes, we'd not need
> > > to carry new global state for tpebs.
> >
> > Yes, I should remove the .tpebs_results out. How about .tpebs_pid?
> > It is used in builtin-stat later.
> >
> > >
> > > > };
> > > >
> > > > static void evlist__check_cpu_maps(struct evlist *evlist)
> > > > @@ -653,6 +657,8 @@ static enum counter_recovery
> > > stat_handle_error(struct evsel *counter)
> > > >
> > > > if (child_pid != -1)
> > > > kill(child_pid, SIGTERM);
> > > > + if (stat_config.tpebs_pid != -1)
> > > > + stop_tpebs();
> > >
> > > This logic is builtin-stat but what about other commands that could be
> > > using TPEBS for counters?
> >
> > Could you please let me know the commands that could use it?
> > I was thinking we could do this logic in evsel once the per core per event fork
> > get updated. I think that would work for all the commands.
>
> Using this code refactored into evsels is what I'd like to see, so I
> think we're agreed. I'm not thinking of a main command reading
> counters. The evlist and evsel are exposed through python so
> potentially any script could use the values, which is why it'd be best
> to abstract them behind an evsel.
>
> > >
> > > > return COUNTER_FATAL;
> > > > }
> > > >
> > > > @@ -673,6 +679,10 @@ static int __run_perf_stat(int argc, const char
> > > **argv, int run_idx)
> > > > int err;
> > > > bool second_pass = false;
> > > >
> > > > + err = start_tpebs(&stat_config, evsel_list);
> > > > + if (err < 0)
> > > > + return err;
> > > > +
> > > > if (forks) {
> > > > if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe,
> > > workload_exec_failed_signal) < 0) {
> > > > perror("failed to prepare workload");
> > > > @@ -878,6 +888,10 @@ static int __run_perf_stat(int argc, const char
> > > **argv, int run_idx)
> > > >
> > > > t1 = rdclock();
> > > >
> > > > + err = stop_tpebs();
> > > > + if (err < 0)
> > > > + return err;
> > > > +
> > > > if (stat_config.walltime_run_table)
> > > > stat_config.walltime_run[run_idx] = t1 - t0;
> > > >
> > > > @@ -985,6 +999,9 @@ static void sig_atexit(void)
> > > > if (child_pid != -1)
> > > > kill(child_pid, SIGTERM);
> > > >
> > > > + if (stat_config.tpebs_pid != -1)
> > > > + kill(stat_config.tpebs_pid, SIGTERM);
> > > > +
> > > > sigprocmask(SIG_SETMASK, &oset, NULL);
> > > >
> > > > if (signr == -1)
> > > > @@ -2918,5 +2935,7 @@ int cmd_stat(int argc, const char **argv)
> > > > metricgroup__rblist_exit(&stat_config.metric_events);
> > > > evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack,
> > > &stat_config.ctl_fd_close);
> > > >
> > > > + tpebs_data__delete();
> > > > +
> > > > return status;
> > > > }
> > > > diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> > > > index 292170a99ab6..c9f1d0bb6bf8 100644
> > > > --- a/tools/perf/util/Build
> > > > +++ b/tools/perf/util/Build
> > > > @@ -153,6 +153,7 @@ perf-y += clockid.o
> > > > perf-y += list_sort.o
> > > > perf-y += mutex.o
> > > > perf-y += sharded_mutex.o
> > > > +perf-y += intel-tpebs.o
> > > >
> > > > perf-$(CONFIG_LIBBPF) += bpf_map.o
> > > > perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
> > > > diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> > > > new file mode 100644
> > > > index 000000000000..4b7a98794fae
> > > > --- /dev/null
> > > > +++ b/tools/perf/util/intel-tpebs.c
> > > > @@ -0,0 +1,285 @@
> > > > +// SPDX-License-Identifier: GPL-2.0-only
> > > > +/*
> > > > + * intel_pt.c: Intel Processor Trace support
> > > > + * Copyright (c) 2013-2015, Intel Corporation.
> > > > + */
> > > > +
> > > > +
> > > > +#include <sys/param.h>
> > > > +#include <subcmd/run-command.h>
> > > > +#include <thread.h>
> > > > +#include "intel-tpebs.h"
> > > > +#include <linux/list.h>
> > > > +#include <linux/zalloc.h>
> > > > +#include <linux/err.h>
> > > > +#include "sample.h"
> > > > +#include "debug.h"
> > > > +#include "evlist.h"
> > > > +#include "evsel.h"
> > > > +#include "session.h"
> > > > +#include "tool.h"
> > > > +#include "metricgroup.h"
> > > > +#include <sys/stat.h>
> > > > +#include <sys/file.h>
> > > > +
> > > > +
> > > > +
> > > > +#define PERF_DATA "-"
> > > > +#define CONTROL "/tmp/control"
> > > > +#define ACK "/tmp/ack"
> > > > +pthread_t reader_thread;
> > > > +struct child_process *cmd;
> > > > +struct perf_stat_config *stat_config;
> > > > +
> > > > +static int get_perf_record_args(const char **record_argv)
> > > > +{
> > > > + int i = 0;
> > > > + struct tpebs_retire_lat *e;
> > > > +
> > > > + pr_debug("Prepare perf record for retire_latency\n");
> > > > +
> > > > + record_argv[i++] = "perf";
> > > > + record_argv[i++] = "record";
> > > > + record_argv[i++] = "-W";
> > > > + record_argv[i++] = "--synth=no";
> > >
> > > There should be more things to disable, like BPF events, we don't need
> > > the dummy, etc.
> >
> > Ok, I will add that.
>
> Well this is my comment that we shouldn't be beefing up "perf record"
> for retirement latencies, it'd be better to avoid the fork altogether.
> Adding "--no-bpf-event" is simple.
>
> > >
> > > > + record_argv[i++] = "--control=fifo:/tmp/control,/tmp/ack";
> > > > +
> > > > + if (stat_config->user_requested_cpu_list) {
> > > > + record_argv[i++] = "-C";
> > > > + record_argv[i++] = stat_config->user_requested_cpu_list;
> > > > + }
> > > > +
> > > > + if (stat_config->system_wide)
> > > > + record_argv[i++] = "-a";
> > > > +
> > > > + if (!stat_config->system_wide && !stat_config-
> > > >user_requested_cpu_list) {
> > > > + pr_err("Require -a or -C option to run sampling.\n");
> > > > + return -ECANCELED;
> > > > + }
> > > > +
> > > > + list_for_each_entry(e, &stat_config->tpebs_results, nd) {
> > > > + record_argv[i++] = "-e";
> > > > + record_argv[i++] = e->name;
> > > > + }
> > > > +
> > > > + record_argv[i++] = "-o";
> > > > + record_argv[i++] = PERF_DATA;
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static int prepare_run_command(const char **argv)
> > > > +{
> > > > + cmd = zalloc(sizeof(struct child_process));
> > > > + if (!cmd)
> > > > + return -ENOMEM;
> > > > + cmd->argv = argv;
> > > > + cmd->out = -1;
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static int prepare_perf_record(int tpebs_event_size)
> > > > +{
> > > > + const char **record_argv;
> > > > + int ret;
> > > > +
> > > > + /*Create control and ack fd for --control*/
> > > > + if (mkfifo(CONTROL, 0600)) {
> > > > + pr_err("Failed to create control fifo");
> > > > + return -1;
> > > > + }
> > > > + if (mkfifo(ACK, 0600)) {
> > > > + pr_err("Failed to create control fifo");
> > > > + return -1;
> > > > + }
> > > > +
> > > > + record_argv = calloc(10 + 2 * tpebs_event_size, sizeof(char *));
> > > > + if (!record_argv)
> > > > + return -ENOMEM;
> > > > +
> > > > +
> > > > + ret = get_perf_record_args(record_argv);
> > > > + if (ret)
> > > > + goto out;
> > > > +
> > > > + ret = prepare_run_command(record_argv);
> > > > + if (ret)
> > > > + goto out;
> > > > + ret = start_command(cmd);
> > > > +out:
> > > > + free(record_argv);
> > > > + return ret;
> > > > +}
> > > > +struct sample_data_reader {
> > > > + struct perf_tool tool;
> > > > + struct perf_session *session;
> > > > +};
> > > > +
> > > > +static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
> > > > +{
> > > > + zfree(&r->name);
> > > > + zfree(&r->tpebs_name);
> > > > + free(r);
> > > > +}
> > > > +
> > > > +void tpebs_data__delete(void)
> > > > +{
> > > > + struct tpebs_retire_lat *r, *rtmp;
> > > > +
> > > > + list_for_each_entry_safe(r, rtmp, &stat_config->tpebs_results, nd) {
> > > > + list_del_init(&r->nd);
> > > > + tpebs_retire_lat__delete(r);
> > > > + }
> > > > + free(cmd);
> > > > +}
> > > > +
> > > > +static int process_sample_event(struct perf_tool *tool
> __maybe_unused,
> > > > + union perf_event *event __maybe_unused,
> > > > + struct perf_sample *sample,
> > > > + struct evsel *evsel,
> > > > + struct machine *machine __maybe_unused)
> > > > +{
> > > > + int ret = 0;
> > > > + const char *evname;
> > > > + struct tpebs_retire_lat *t;
> > > > +
> > > > + evname = evsel__name(evsel);
> > > > +
> > > > + /*
> > > > + * Need to handle per core results? We are assuming average retire
> > > > + * latency value will be used. Save the number of samples and the
> sum
> > > of
> > > > + * retire latency value for each event.
> > > > + */
> > > > + list_for_each_entry(t, &stat_config->tpebs_results, nd) {
> > > > + if (!strcmp(evname, t->name)) {
> > > > + t->count += 1;
> > > > + t->sum += sample->retire_lat;
> > > > + t->val = (double) t->sum / t->count;
> > > > + break;
> > > > + }
> > > > + }
> > > > +
> > > > + return ret;
> > > > +}
> > > > +
> > > > +static int process_feature_event(struct perf_session *session,
> > > > + union perf_event *event)
> > > > +{
> > > > + if (event->feat.feat_id < HEADER_LAST_FEATURE)
> > > > + return perf_event__process_feature(session, event);
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static void *__sample_reader(void *arg)
> > > > +{
> > > > + struct child_process *child = arg;
> > > > + struct perf_session *session;
> > > > + struct perf_data data = {
> > > > + .mode = PERF_DATA_MODE_READ,
> > > > + .path = PERF_DATA,
> > > > + .file.fd = child->out,
> > > > + };
> > > > + struct sample_data_reader reader = {
> > > > + .tool = {
> > > > + .sample = process_sample_event,
> > > > + .feature = process_feature_event,
> > > > + .attr = perf_event__process_attr,
> > > > + },
> > > > + };
> > >
> > > I prefer the reading approach here over what I did here:
> > >
> https://lore.kernel.org/lkml/CAM9d7cgzTsfk3C+dTN80f5FhB1rmfturjuUUwv
> > > STeUvny5eWKw@mail.gmail.com/
> > > I'd done that as Namhyung had commented on using perf report:
> > >
> https://lore.kernel.org/lkml/CAM9d7cgdQQn5GYB7t++xuoMdeqPXiEkkcop69
> > > +rD06RAnu9-EQ@mail.gmail.com/
> > > It was also less work. I don't see why we can't move the reading logic
> > > into a function like evsel__read_retire_latency that's in my change. I
> > > also think that in the first instance all reading logic should be
> > > implemented to return 0 and we only do the forking, etc. when a
> > > command line flag is passed.
> >
> > Yes, a command line flag to turn it on is a good idea. Andi also suggested this.
> > I want to add this option in next version.
> >
> > I tried to refactor the code so that you could easily reuse it later. You could
> either
> > put this code directly into evsel__read_retire_latency or call this function,
> > whichever way you feel better.
>
> Could you add that into the series rather than leaving it as work for me?
Sorry about miscommunicating. I didn’t mean to left the work for you. I was just trying
to say if you don't agree with the code at any point of time, you could move it
around or do whatever way you'd like.
>
> > >
> > > > +
> > > > + session = perf_session__new(&data, &reader.tool);
> > > > + if (IS_ERR(session))
> > > > + return NULL;
> > > > + reader.session = session;
> > > > + perf_session__process_events(session);
> > > > + perf_session__delete(session);
> > > > +
> > > > + return NULL;
> > > > +}
> > > > +
> > > > +
> > > > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> > > *evsel_list)
> > > > +{
> > > > + int ret = 0;
> > > > + struct evsel *evsel;
> > > > + int control = -1, ack = -1;
> > > > +
> > > > + stat_config = perf_stat_config;
> > > > + /*
> > > > + * Prepare perf record for sampling event retire_latency before fork
> and
> > > > + * prepare workload
> > > > + */
> > > > + evlist__for_each_entry(evsel_list, evsel) {
> > > > + if (evsel->retire_lat) {
> > > > + struct tpebs_retire_lat *new = malloc(sizeof(struct
> > > tpebs_retire_lat));
> > > > + int i;
> > > > + char *name;
> > > > +
> > > > + pr_debug("perf stat retire latency %s required\n", evsel-
> > > >name);
> > > > + if (!new)
> > > > + return -1;
> > > > + for (i = strlen(evsel->name) - 1; i > 0; i--) {
> > > > + if (evsel->name[i] == 'R')
> > > > + break;
> > > > + }
> > > > + if (i <= 0 || evsel->name[i] != 'R')
> > > > + return -1;
> > > > +
> > > > + name = strdup(evsel->name);
> > > > + if (!name)
> > > > + return -ENOMEM;
> > > > + name[i] = 'p';
> > > > + new->name = strdup(name);
> > > > + free(name);
> > > > + new->tpebs_name = strdup(evsel->name);
> > > > + if (!new->tpebs_name)
> > > > + return -ENOMEM;
> > > > + new->count = 0;
> > > > + new->sum = 0;
> > > > + list_add_tail(&new->nd, &stat_config->tpebs_results);
> > > > + stat_config->tpebs_event_size += 1;
> > > > + }
> > > > + }
> > > > +
> > > > + if (stat_config->tpebs_event_size > 0) {
> > > > + ret = prepare_perf_record(stat_config->tpebs_event_size);
> > > > + if (ret)
> > > > + return ret;
> > > > + if (pthread_create(&reader_thread, NULL, __sample_reader,
> cmd)) {
> > > > + kill(cmd->pid, SIGTERM);
> > > > + close(cmd->out);
> > > > + pr_err("Could not create thread to process sample data.\n");
> > > > + return -1;
> > > > + }
> > > > + /* Wait for perf record initialization a little bit.*/
> > > > + control = open(CONTROL, O_RDONLY, O_NONBLOCK);
> > > > + if (!control)
> > > > + return -1;
> > > > + close(control);
> > > > + ack = open(ACK, O_RDONLY, O_NONBLOCK);
> > > > + if (!ack)
> > > > + return -1;
> > > > + close(ack);
> > > > + pr_debug("Received ack from perf record\n");
> > > > + }
> > > > +
> > > > + return ret;
> > > > +}
> > > > +
> > > > +
> > > > +int stop_tpebs(void)
> > > > +{
> > > > + int ret = 0;
> > > > +
> > > > + if (stat_config->tpebs_event_size > 0) {
> > > > + kill(cmd->pid, SIGTERM);
> > > > + pthread_join(reader_thread, NULL);
> > > > + close(cmd->out);
> > > > + ret = finish_command(cmd);
> > > > + if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> > > > + ret = 0;
> > > > + remove(CONTROL);
> > > > + remove(ACK);
> > > > + }
> > > > + return ret;
> > > > +}
> > > > diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h
> > > > new file mode 100644
> > > > index 000000000000..e8e2bb2f479b
> > > > --- /dev/null
> > > > +++ b/tools/perf/util/intel-tpebs.h
> > > > @@ -0,0 +1,29 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > > +/*
> > > > + * intel_pt.h: Intel Processor Trace support
> > > > + * Copyright (c) 2013-2015, Intel Corporation.
> > > > + */
> > > > +#include "stat.h"
> > > > +
> > > > +#ifndef INCLUDE__PERF_INTEL_TPEBS_H__
> > > > +#define INCLUDE__PERF_INTEL_TPEBS_H__
> > > > +
> > > > +struct tpebs_retire_lat {
> > > > + struct list_head nd;
> > > > + /* Event name */
> > > > + const char *name;
> > > > + /* Event name with the TPEBS modifier R */
> > > > + const char *tpebs_name;
> > > > + /* Count of retire_latency values found in sample data */
> > > > + size_t count;
> > > > + /* Sum of all the retire_latency values in sample data */
> > > > + int sum;
> > > > + /* Average of retire_latency, val = sum / count */
> > > > + double val;
> > > > +};
> > >
> > > An evsel has a pretty much all of this and so we're duplicating in
> > > particular the counting logic which then needs later work to integrate
> > > and is why I'd prefer we went the evsel route.
> >
> > I wanted to reuse evsel as much as I could. But as you know, I had some
> issue
> > to use it and had to remove the code. I'm going to explore more and see if I
> could
> > get it work with evsel better.
>
> Ok, it was never clear to me what the issue was.
Yes, I was swamped by some work and haven't been able to work on this
specific issue and send you the code. I will do this as quick as possible.
Thanks,
Weilin
>
> > Comparing with last version, do you think this change is toward the direct
> that you
> > Want? I believe we could get this code to use evsel more.
>
> I think with the evsels and the record/reading logic here things will
> be in a good place. There will be more work to do to have the
> retirement latency respect options like the cgroups, perhaps the BPF
> filters are an option there.
> https://lore.kernel.org/lkml/20240516041948.3546553-1-
> irogers@google.com/
>
> Thanks,
> Ian
>
> > Thanks,
> > Weilin
> >
> > >
> > > Thanks,
> > > Ian
> > >
> > > > +
> > > > +int start_tpebs(struct perf_stat_config *perf_stat_config, struct evlist
> > > *evsel_list);
> > > > +int stop_tpebs(void);
> > > > +void tpebs_data__delete(void);
> > > > +
> > > > +#endif
> > > > diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> > > > index fd7a187551bd..c6c2aa43030f 100644
> > > > --- a/tools/perf/util/stat.h
> > > > +++ b/tools/perf/util/stat.h
> > > > @@ -110,6 +110,9 @@ struct perf_stat_config {
> > > > struct cpu_aggr_map *cpus_aggr_map;
> > > > u64 *walltime_run;
> > > > struct rblist metric_events;
> > > > + size_t tpebs_event_size;
> > > > + struct list_head tpebs_results;
> > > > + pid_t tpebs_pid;
> > > > int ctl_fd;
> > > > int ctl_fd_ack;
> > > > bool ctl_fd_close;
> > > > --
> > > > 2.43.0
> > > >
© 2016 - 2026 Red Hat, Inc.