From: Weilin Wang <weilin.wang@intel.com>
When retire_latency value is used in a metric formula, perf stat would fork a
perf record process with "-e" and "-W" options. Perf record will collect
required retire_latency values in parallel while perf stat is collecting
counting values.
At the point of time that perf stat stops counting, it would send sigterm signal
to perf record process and receiving sampling data back from perf record from a
pipe. Perf stat will then process the received data to get retire latency data
and calculate metric result.
Another thread is required to synchronize between perf stat and perf record
when we pass data through pipe.
Signed-off-by: Weilin Wang <weilin.wang@intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
---
tools/perf/builtin-stat.c | 190 +++++++++++++++++++++++++++++++++-
tools/perf/util/data.c | 6 +-
tools/perf/util/metricgroup.h | 8 ++
tools/perf/util/stat.h | 2 +
4 files changed, 203 insertions(+), 3 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 6291e1e24535..7fbe47b0c44c 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -94,8 +94,13 @@
#include <perf/evlist.h>
#include <internal/threadmap.h>
+#include "util/sample.h"
+#include <sys/param.h>
+#include <subcmd/run-command.h>
+
#define DEFAULT_SEPARATOR " "
#define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
+#define PERF_DATA "-"
static void print_counters(struct timespec *ts, int argc, const char **argv);
@@ -163,6 +168,8 @@ static struct perf_stat_config stat_config = {
.ctl_fd_ack = -1,
.iostat_run = false,
.tpebs_events = LIST_HEAD_INIT(stat_config.tpebs_events),
+ .tpebs_results = LIST_HEAD_INIT(stat_config.tpebs_results),
+ .tpebs_pid = -1,
};
static bool cpus_map_matched(struct evsel *a, struct evsel *b)
@@ -684,15 +691,155 @@ static enum counter_recovery stat_handle_error(struct evsel *counter)
if (child_pid != -1)
kill(child_pid, SIGTERM);
+ if (stat_config.tpebs_pid != -1)
+ kill(stat_config.tpebs_pid, SIGTERM);
return COUNTER_FATAL;
}
-static int __run_perf_record(void)
+static int __run_perf_record(const char **record_argv)
{
+ int i = 0;
+ struct tpebs_event *e;
+
pr_debug("Prepare perf record for retire_latency\n");
+
+ record_argv[i++] = "perf";
+ record_argv[i++] = "record";
+ record_argv[i++] = "-W";
+ record_argv[i++] = "--synth=no";
+
+ if (stat_config.user_requested_cpu_list) {
+ record_argv[i++] = "-C";
+ record_argv[i++] = stat_config.user_requested_cpu_list;
+ }
+
+ if (stat_config.system_wide)
+ record_argv[i++] = "-a";
+
+ if (!stat_config.system_wide && !stat_config.user_requested_cpu_list) {
+ pr_err("Require -a or -C option to run sampling.\n");
+ return -ECANCELED;
+ }
+
+ list_for_each_entry(e, &stat_config.tpebs_events, nd) {
+ record_argv[i++] = "-e";
+ record_argv[i++] = e->name;
+ }
+
+ record_argv[i++] = "-o";
+ record_argv[i++] = PERF_DATA;
+
return 0;
}
+static void prepare_run_command(struct child_process *cmd,
+ const char **argv)
+{
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->argv = argv;
+ cmd->out = -1;
+}
+
+static int prepare_perf_record(struct child_process *cmd)
+{
+ const char **record_argv;
+ int ret;
+
+ record_argv = calloc(10 + 2 * stat_config.tpebs_event_size, sizeof(char *));
+ if (!record_argv)
+ return -1;
+
+ ret = __run_perf_record(record_argv);
+ if (ret)
+ return ret;
+
+ prepare_run_command(cmd, record_argv);
+ return start_command(cmd);
+}
+
+struct perf_script {
+ struct perf_tool tool;
+ struct perf_session *session;
+};
+
+static void tpebs_data__delete(void)
+{
+ struct tpebs_retire_lat *r, *rtmp;
+ struct tpebs_event *e, *etmp;
+
+ list_for_each_entry_safe(r, rtmp, &stat_config.tpebs_results, event.nd) {
+ list_del_init(&r->event.nd);
+ free(r);
+ }
+ list_for_each_entry_safe(e, etmp, &stat_config.tpebs_events, nd) {
+ list_del_init(&e->nd);
+ free(e);
+ }
+}
+
+static int process_sample_event(struct perf_tool *tool __maybe_unused,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample,
+ struct evsel *evsel,
+ struct machine *machine __maybe_unused)
+{
+ int ret = 0;
+ const char *evname;
+ struct tpebs_retire_lat *t;
+
+ evname = evsel__name(evsel);
+
+ /*
+ * Need to handle per core results? We are assuming average retire
+ * latency value will be used. Save the number of samples and the sum of
+ * retire latency value for each event.
+ */
+ list_for_each_entry(t, &stat_config.tpebs_results, event.nd) {
+ if (!strcmp(evname, t->event.name)) {
+ t->count += 1;
+ t->sum += sample->retire_lat;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int process_feature_event(struct perf_session *session,
+ union perf_event *event)
+{
+ if (event->feat.feat_id < HEADER_LAST_FEATURE)
+ return perf_event__process_feature(session, event);
+ return 0;
+}
+
+static void *__cmd_script(void *arg __maybe_unused)
+{
+ struct child_process *cmd = arg;
+ struct perf_session *session;
+ struct perf_data data = {
+ .mode = PERF_DATA_MODE_READ,
+ .path = PERF_DATA,
+ .file.fd = cmd->out,
+ };
+ struct perf_script script = {
+ .tool = {
+ .sample = process_sample_event,
+ .feature = process_feature_event,
+ .attr = perf_event__process_attr,
+ },
+ };
+
+ session = perf_session__new(&data, &script.tool);
+ if (IS_ERR(session))
+ return NULL;
+ script.session = session;
+ perf_session__process_events(session);
+ perf_session__delete(session);
+
+ return NULL;
+}
+
static int __run_perf_stat(int argc, const char **argv, int run_idx)
{
int interval = stat_config.interval;
@@ -709,15 +856,38 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
struct affinity saved_affinity, *affinity = NULL;
int err;
bool second_pass = false;
+ struct child_process cmd;
+ pthread_t thread_script;
/* Prepare perf record for sampling event retire_latency before fork and
* prepare workload */
if (stat_config.tpebs_event_size > 0) {
int ret;
+ struct tpebs_event *e;
- ret = __run_perf_record();
+ pr_debug("perf stat pid = %d\n", getpid());
+ list_for_each_entry(e, &stat_config.tpebs_events, nd) {
+ struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));
+
+ if (!new)
+ return -1;
+ new->event.name = strdup(e->name);
+ new->event.tpebs_name = strdup(e->tpebs_name);
+ new->count = 0;
+ new->sum = 0;
+ list_add_tail(&new->event.nd, &stat_config.tpebs_results);
+ }
+ ret = prepare_perf_record(&cmd);
if (ret)
return ret;
+ if (pthread_create(&thread_script, NULL, __cmd_script, &cmd)) {
+ kill(cmd.pid, SIGTERM);
+ close(cmd.out);
+ pr_err("Could not create thread to process sample data.\n");
+ return -1;
+ }
+ /* Wait for perf record initialization a little bit.*/
+ sleep(2);
}
if (forks) {
@@ -925,6 +1095,17 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
t1 = rdclock();
+ if (stat_config.tpebs_event_size > 0) {
+ int ret;
+
+ kill(cmd.pid, SIGTERM);
+ pthread_join(thread_script, NULL);
+ close(cmd.out);
+ ret = finish_command(&cmd);
+ if (ret != -ERR_RUN_COMMAND_WAITPID_SIGNAL)
+ return ret;
+ }
+
if (stat_config.walltime_run_table)
stat_config.walltime_run[run_idx] = t1 - t0;
@@ -1032,6 +1213,9 @@ static void sig_atexit(void)
if (child_pid != -1)
kill(child_pid, SIGTERM);
+ if (stat_config.tpebs_pid != -1)
+ kill(stat_config.tpebs_pid, SIGTERM);
+
sigprocmask(SIG_SETMASK, &oset, NULL);
if (signr == -1)
@@ -2972,5 +3156,7 @@ int cmd_stat(int argc, const char **argv)
metricgroup__rblist_exit(&stat_config.metric_events);
evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack, &stat_config.ctl_fd_close);
+ tpebs_data__delete();
+
return status;
}
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index 08c4bfbd817f..98e3014c0aef 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -204,7 +204,11 @@ static bool check_pipe(struct perf_data *data)
data->file.fd = fd;
data->use_stdio = false;
}
- } else {
+ /*
+ * When is_pipe and data->file.fd is given, use given fd
+ * instead of STDIN_FILENO or STDOUT_FILENO
+ */
+ } else if (data->file.fd <= 0) {
data->file.fd = fd;
}
}
diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
index 7c24ed768ff3..ae788edef30f 100644
--- a/tools/perf/util/metricgroup.h
+++ b/tools/perf/util/metricgroup.h
@@ -68,10 +68,18 @@ struct metric_expr {
struct tpebs_event {
struct list_head nd;
+ /* Event name */
const char *name;
+ /* Event name with the TPEBS modifier R */
const char *tpebs_name;
};
+struct tpebs_retire_lat {
+ struct tpebs_event event;
+ size_t count;
+ int sum;
+};
+
struct metric_event *metricgroup__lookup(struct rblist *metric_events,
struct evsel *evsel,
bool create);
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index b987960df3c5..0726bdc06681 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -111,6 +111,8 @@ struct perf_stat_config {
struct rblist metric_events;
struct list_head tpebs_events;
size_t tpebs_event_size;
+ struct list_head tpebs_results;
+ pid_t tpebs_pid;
int ctl_fd;
int ctl_fd_ack;
bool ctl_fd_close;
--
2.43.0
On Fri, Mar 29, 2024 at 12:12 PM <weilin.wang@intel.com> wrote:
>
> From: Weilin Wang <weilin.wang@intel.com>
>
> When retire_latency value is used in a metric formula, perf stat would fork a
> perf record process with "-e" and "-W" options. Perf record will collect
> required retire_latency values in parallel while perf stat is collecting
> counting values.
>
> At the point of time that perf stat stops counting, it would send sigterm signal
> to perf record process and receiving sampling data back from perf record from a
> pipe. Perf stat will then process the received data to get retire latency data
> and calculate metric result.
>
> Another thread is required to synchronize between perf stat and perf record
> when we pass data through pipe.
>
> Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> Reviewed-by: Ian Rogers <irogers@google.com>
> ---
> tools/perf/builtin-stat.c | 190 +++++++++++++++++++++++++++++++++-
> tools/perf/util/data.c | 6 +-
> tools/perf/util/metricgroup.h | 8 ++
> tools/perf/util/stat.h | 2 +
> 4 files changed, 203 insertions(+), 3 deletions(-)
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 6291e1e24535..7fbe47b0c44c 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -94,8 +94,13 @@
> #include <perf/evlist.h>
> #include <internal/threadmap.h>
>
> +#include "util/sample.h"
> +#include <sys/param.h>
> +#include <subcmd/run-command.h>
> +
> #define DEFAULT_SEPARATOR " "
> #define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
> +#define PERF_DATA "-"
>
> static void print_counters(struct timespec *ts, int argc, const char **argv);
>
> @@ -163,6 +168,8 @@ static struct perf_stat_config stat_config = {
> .ctl_fd_ack = -1,
> .iostat_run = false,
> .tpebs_events = LIST_HEAD_INIT(stat_config.tpebs_events),
> + .tpebs_results = LIST_HEAD_INIT(stat_config.tpebs_results),
> + .tpebs_pid = -1,
> };
>
> static bool cpus_map_matched(struct evsel *a, struct evsel *b)
> @@ -684,15 +691,155 @@ static enum counter_recovery stat_handle_error(struct evsel *counter)
>
> if (child_pid != -1)
> kill(child_pid, SIGTERM);
> + if (stat_config.tpebs_pid != -1)
> + kill(stat_config.tpebs_pid, SIGTERM);
> return COUNTER_FATAL;
> }
>
> -static int __run_perf_record(void)
> +static int __run_perf_record(const char **record_argv)
> {
> + int i = 0;
> + struct tpebs_event *e;
> +
> pr_debug("Prepare perf record for retire_latency\n");
> +
> + record_argv[i++] = "perf";
> + record_argv[i++] = "record";
> + record_argv[i++] = "-W";
> + record_argv[i++] = "--synth=no";
> +
> + if (stat_config.user_requested_cpu_list) {
> + record_argv[i++] = "-C";
> + record_argv[i++] = stat_config.user_requested_cpu_list;
> + }
> +
> + if (stat_config.system_wide)
> + record_argv[i++] = "-a";
> +
> + if (!stat_config.system_wide && !stat_config.user_requested_cpu_list) {
> + pr_err("Require -a or -C option to run sampling.\n");
> + return -ECANCELED;
> + }
> +
> + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> + record_argv[i++] = "-e";
> + record_argv[i++] = e->name;
> + }
> +
> + record_argv[i++] = "-o";
> + record_argv[i++] = PERF_DATA;
> +
> return 0;
> }
Still I think it's weird it has 'perf record' in perf stat (despite the
'perf stat record'). If it's only Intel thing and we don't have a plan
to do the same on other arches, we can move it to the arch
directory and keep the perf stat code simple.
>
> +static void prepare_run_command(struct child_process *cmd,
> + const char **argv)
> +{
> + memset(cmd, 0, sizeof(*cmd));
> + cmd->argv = argv;
> + cmd->out = -1;
> +}
> +
> +static int prepare_perf_record(struct child_process *cmd)
> +{
> + const char **record_argv;
> + int ret;
> +
> + record_argv = calloc(10 + 2 * stat_config.tpebs_event_size, sizeof(char *));
> + if (!record_argv)
> + return -1;
> +
> + ret = __run_perf_record(record_argv);
> + if (ret)
> + return ret;
> +
> + prepare_run_command(cmd, record_argv);
> + return start_command(cmd);
> +}
> +
> +struct perf_script {
> + struct perf_tool tool;
> + struct perf_session *session;
> +};
> +
> +static void tpebs_data__delete(void)
> +{
> + struct tpebs_retire_lat *r, *rtmp;
> + struct tpebs_event *e, *etmp;
> +
> + list_for_each_entry_safe(r, rtmp, &stat_config.tpebs_results, event.nd) {
> + list_del_init(&r->event.nd);
> + free(r);
> + }
> + list_for_each_entry_safe(e, etmp, &stat_config.tpebs_events, nd) {
> + list_del_init(&e->nd);
> + free(e);
Shouldn't it free the names?
> + }
> +}
> +
> +static int process_sample_event(struct perf_tool *tool __maybe_unused,
> + union perf_event *event __maybe_unused,
> + struct perf_sample *sample,
> + struct evsel *evsel,
> + struct machine *machine __maybe_unused)
> +{
> + int ret = 0;
> + const char *evname;
> + struct tpebs_retire_lat *t;
> +
> + evname = evsel__name(evsel);
> +
> + /*
> + * Need to handle per core results? We are assuming average retire
> + * latency value will be used. Save the number of samples and the sum of
> + * retire latency value for each event.
> + */
> + list_for_each_entry(t, &stat_config.tpebs_results, event.nd) {
> + if (!strcmp(evname, t->event.name)) {
> + t->count += 1;
> + t->sum += sample->retire_lat;
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
> +static int process_feature_event(struct perf_session *session,
> + union perf_event *event)
> +{
> + if (event->feat.feat_id < HEADER_LAST_FEATURE)
> + return perf_event__process_feature(session, event);
> + return 0;
> +}
> +
> +static void *__cmd_script(void *arg __maybe_unused)
The arg is used.
Also I don't like the name 'script' as it has nothing to do with
scripting. Maybe 'sample_reader', 'tpebs_reader' or
'reader_thread'?
> +{
> + struct child_process *cmd = arg;
> + struct perf_session *session;
> + struct perf_data data = {
> + .mode = PERF_DATA_MODE_READ,
> + .path = PERF_DATA,
> + .file.fd = cmd->out,
> + };
> + struct perf_script script = {
> + .tool = {
> + .sample = process_sample_event,
> + .feature = process_feature_event,
> + .attr = perf_event__process_attr,
Broken indentation. And if you just use the tool, you can
pass it directly.
> + },
> + };
> +
> + session = perf_session__new(&data, &script.tool);
> + if (IS_ERR(session))
> + return NULL;
> + script.session = session;
> + perf_session__process_events(session);
> + perf_session__delete(session);
> +
> + return NULL;
> +}
> +
> static int __run_perf_stat(int argc, const char **argv, int run_idx)
> {
> int interval = stat_config.interval;
> @@ -709,15 +856,38 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
> struct affinity saved_affinity, *affinity = NULL;
> int err;
> bool second_pass = false;
> + struct child_process cmd;
> + pthread_t thread_script;
>
> /* Prepare perf record for sampling event retire_latency before fork and
> * prepare workload */
> if (stat_config.tpebs_event_size > 0) {
> int ret;
> + struct tpebs_event *e;
>
> - ret = __run_perf_record();
> + pr_debug("perf stat pid = %d\n", getpid());
> + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> + struct tpebs_retire_lat *new = malloc(sizeof(struct tpebs_retire_lat));
> +
> + if (!new)
> + return -1;
> + new->event.name = strdup(e->name);
> + new->event.tpebs_name = strdup(e->tpebs_name);
These can fail too.
> + new->count = 0;
> + new->sum = 0;
> + list_add_tail(&new->event.nd, &stat_config.tpebs_results);
> + }
> + ret = prepare_perf_record(&cmd);
> if (ret)
> return ret;
> + if (pthread_create(&thread_script, NULL, __cmd_script, &cmd)) {
> + kill(cmd.pid, SIGTERM);
> + close(cmd.out);
> + pr_err("Could not create thread to process sample data.\n");
> + return -1;
> + }
> + /* Wait for perf record initialization a little bit.*/
> + sleep(2);
This won't guarantee anything. If you want to make sure the
'thread_script' to run before the 'perf record' process, you can
use a pipe to signal that like in evlist__prepare_workload() and
evlist__start_workload().
> }
>
> if (forks) {
> @@ -925,6 +1095,17 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
>
> t1 = rdclock();
>
> + if (stat_config.tpebs_event_size > 0) {
> + int ret;
> +
> + kill(cmd.pid, SIGTERM);
> + pthread_join(thread_script, NULL);
> + close(cmd.out);
> + ret = finish_command(&cmd);
> + if (ret != -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> + return ret;
> + }
> +
> if (stat_config.walltime_run_table)
> stat_config.walltime_run[run_idx] = t1 - t0;
>
> @@ -1032,6 +1213,9 @@ static void sig_atexit(void)
> if (child_pid != -1)
> kill(child_pid, SIGTERM);
>
> + if (stat_config.tpebs_pid != -1)
> + kill(stat_config.tpebs_pid, SIGTERM);
> +
> sigprocmask(SIG_SETMASK, &oset, NULL);
>
> if (signr == -1)
> @@ -2972,5 +3156,7 @@ int cmd_stat(int argc, const char **argv)
> metricgroup__rblist_exit(&stat_config.metric_events);
> evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack, &stat_config.ctl_fd_close);
>
> + tpebs_data__delete();
> +
> return status;
> }
> diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
> index 08c4bfbd817f..98e3014c0aef 100644
> --- a/tools/perf/util/data.c
> +++ b/tools/perf/util/data.c
> @@ -204,7 +204,11 @@ static bool check_pipe(struct perf_data *data)
> data->file.fd = fd;
> data->use_stdio = false;
> }
> - } else {
> + /*
> + * When is_pipe and data->file.fd is given, use given fd
> + * instead of STDIN_FILENO or STDOUT_FILENO
> + */
> + } else if (data->file.fd <= 0) {
> data->file.fd = fd;
> }
> }
I think this can be in a separate commit.
> diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
> index 7c24ed768ff3..ae788edef30f 100644
> --- a/tools/perf/util/metricgroup.h
> +++ b/tools/perf/util/metricgroup.h
> @@ -68,10 +68,18 @@ struct metric_expr {
>
> struct tpebs_event {
> struct list_head nd;
> + /* Event name */
> const char *name;
> + /* Event name with the TPEBS modifier R */
> const char *tpebs_name;
> };
>
> +struct tpebs_retire_lat {
> + struct tpebs_event event;
> + size_t count;
> + int sum;
> +};
Actually I don't know why you need this separate structure.
Can we just use tpebs_event?
Thanks,
Namhyung
> +
> struct metric_event *metricgroup__lookup(struct rblist *metric_events,
> struct evsel *evsel,
> bool create);
> diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> index b987960df3c5..0726bdc06681 100644
> --- a/tools/perf/util/stat.h
> +++ b/tools/perf/util/stat.h
> @@ -111,6 +111,8 @@ struct perf_stat_config {
> struct rblist metric_events;
> struct list_head tpebs_events;
> size_t tpebs_event_size;
> + struct list_head tpebs_results;
> + pid_t tpebs_pid;
> int ctl_fd;
> int ctl_fd_ack;
> bool ctl_fd_close;
> --
> 2.43.0
>
> -----Original Message-----
> From: Namhyung Kim <namhyung@kernel.org>
> Sent: Monday, April 1, 2024 1:58 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> On Fri, Mar 29, 2024 at 12:12 PM <weilin.wang@intel.com> wrote:
> >
> > From: Weilin Wang <weilin.wang@intel.com>
> >
> > When retire_latency value is used in a metric formula, perf stat would fork a
> > perf record process with "-e" and "-W" options. Perf record will collect
> > required retire_latency values in parallel while perf stat is collecting
> > counting values.
> >
> > At the point of time that perf stat stops counting, it would send sigterm
> signal
> > to perf record process and receiving sampling data back from perf record
> from a
> > pipe. Perf stat will then process the received data to get retire latency data
> > and calculate metric result.
> >
> > Another thread is required to synchronize between perf stat and perf record
> > when we pass data through pipe.
> >
> > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > Reviewed-by: Ian Rogers <irogers@google.com>
> > ---
> > tools/perf/builtin-stat.c | 190
> +++++++++++++++++++++++++++++++++-
> > tools/perf/util/data.c | 6 +-
> > tools/perf/util/metricgroup.h | 8 ++
> > tools/perf/util/stat.h | 2 +
> > 4 files changed, 203 insertions(+), 3 deletions(-)
> >
> > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > index 6291e1e24535..7fbe47b0c44c 100644
> > --- a/tools/perf/builtin-stat.c
> > +++ b/tools/perf/builtin-stat.c
> > @@ -94,8 +94,13 @@
> > #include <perf/evlist.h>
> > #include <internal/threadmap.h>
> >
> > +#include "util/sample.h"
> > +#include <sys/param.h>
> > +#include <subcmd/run-command.h>
> > +
> > #define DEFAULT_SEPARATOR " "
> > #define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
> > +#define PERF_DATA "-"
> >
> > static void print_counters(struct timespec *ts, int argc, const char **argv);
> >
> > @@ -163,6 +168,8 @@ static struct perf_stat_config stat_config = {
> > .ctl_fd_ack = -1,
> > .iostat_run = false,
> > .tpebs_events = LIST_HEAD_INIT(stat_config.tpebs_events),
> > + .tpebs_results = LIST_HEAD_INIT(stat_config.tpebs_results),
> > + .tpebs_pid = -1,
> > };
> >
> > static bool cpus_map_matched(struct evsel *a, struct evsel *b)
> > @@ -684,15 +691,155 @@ static enum counter_recovery
> stat_handle_error(struct evsel *counter)
> >
> > if (child_pid != -1)
> > kill(child_pid, SIGTERM);
> > + if (stat_config.tpebs_pid != -1)
> > + kill(stat_config.tpebs_pid, SIGTERM);
> > return COUNTER_FATAL;
> > }
> >
> > -static int __run_perf_record(void)
> > +static int __run_perf_record(const char **record_argv)
> > {
> > + int i = 0;
> > + struct tpebs_event *e;
> > +
> > pr_debug("Prepare perf record for retire_latency\n");
> > +
> > + record_argv[i++] = "perf";
> > + record_argv[i++] = "record";
> > + record_argv[i++] = "-W";
> > + record_argv[i++] = "--synth=no";
> > +
> > + if (stat_config.user_requested_cpu_list) {
> > + record_argv[i++] = "-C";
> > + record_argv[i++] = stat_config.user_requested_cpu_list;
> > + }
> > +
> > + if (stat_config.system_wide)
> > + record_argv[i++] = "-a";
> > +
> > + if (!stat_config.system_wide && !stat_config.user_requested_cpu_list)
> {
> > + pr_err("Require -a or -C option to run sampling.\n");
> > + return -ECANCELED;
> > + }
> > +
> > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > + record_argv[i++] = "-e";
> > + record_argv[i++] = e->name;
> > + }
> > +
> > + record_argv[i++] = "-o";
> > + record_argv[i++] = PERF_DATA;
> > +
> > return 0;
> > }
>
> Still I think it's weird it has 'perf record' in perf stat (despite the
> 'perf stat record'). If it's only Intel thing and we don't have a plan
> to do the same on other arches, we can move it to the arch
> directory and keep the perf stat code simple.
I'm not sure what is the proper way to solve this. And Ian mentioned
that put code in arch directory could potentially cause other bugs.
So I'm wondering if we could keep this code here for now. I could work
on it later if we found it's better to be in arch directory.
>
>
> >
> > +static void prepare_run_command(struct child_process *cmd,
> > + const char **argv)
> > +{
> > + memset(cmd, 0, sizeof(*cmd));
> > + cmd->argv = argv;
> > + cmd->out = -1;
> > +}
> > +
> > +static int prepare_perf_record(struct child_process *cmd)
> > +{
> > + const char **record_argv;
> > + int ret;
> > +
> > + record_argv = calloc(10 + 2 * stat_config.tpebs_event_size, sizeof(char
> *));
> > + if (!record_argv)
> > + return -1;
> > +
> > + ret = __run_perf_record(record_argv);
> > + if (ret)
> > + return ret;
> > +
> > + prepare_run_command(cmd, record_argv);
> > + return start_command(cmd);
> > +}
> > +
> > +struct perf_script {
> > + struct perf_tool tool;
> > + struct perf_session *session;
> > +};
> > +
> > +static void tpebs_data__delete(void)
> > +{
> > + struct tpebs_retire_lat *r, *rtmp;
> > + struct tpebs_event *e, *etmp;
> > +
> > + list_for_each_entry_safe(r, rtmp, &stat_config.tpebs_results, event.nd)
> {
> > + list_del_init(&r->event.nd);
> > + free(r);
> > + }
> > + list_for_each_entry_safe(e, etmp, &stat_config.tpebs_events, nd) {
> > + list_del_init(&e->nd);
> > + free(e);
>
> Shouldn't it free the names?
>
>
> > + }
> > +}
> > +
> > +static int process_sample_event(struct perf_tool *tool __maybe_unused,
> > + union perf_event *event __maybe_unused,
> > + struct perf_sample *sample,
> > + struct evsel *evsel,
> > + struct machine *machine __maybe_unused)
> > +{
> > + int ret = 0;
> > + const char *evname;
> > + struct tpebs_retire_lat *t;
> > +
> > + evname = evsel__name(evsel);
> > +
> > + /*
> > + * Need to handle per core results? We are assuming average retire
> > + * latency value will be used. Save the number of samples and the sum
> of
> > + * retire latency value for each event.
> > + */
> > + list_for_each_entry(t, &stat_config.tpebs_results, event.nd) {
> > + if (!strcmp(evname, t->event.name)) {
> > + t->count += 1;
> > + t->sum += sample->retire_lat;
> > + break;
> > + }
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +static int process_feature_event(struct perf_session *session,
> > + union perf_event *event)
> > +{
> > + if (event->feat.feat_id < HEADER_LAST_FEATURE)
> > + return perf_event__process_feature(session, event);
> > + return 0;
> > +}
> > +
> > +static void *__cmd_script(void *arg __maybe_unused)
>
> The arg is used.
>
> Also I don't like the name 'script' as it has nothing to do with
> scripting. Maybe 'sample_reader', 'tpebs_reader' or
> 'reader_thread'?
>
>
> > +{
> > + struct child_process *cmd = arg;
> > + struct perf_session *session;
> > + struct perf_data data = {
> > + .mode = PERF_DATA_MODE_READ,
> > + .path = PERF_DATA,
> > + .file.fd = cmd->out,
> > + };
> > + struct perf_script script = {
> > + .tool = {
> > + .sample = process_sample_event,
> > + .feature = process_feature_event,
> > + .attr = perf_event__process_attr,
>
> Broken indentation. And if you just use the tool, you can
> pass it directly.
>
>
> > + },
> > + };
> > +
> > + session = perf_session__new(&data, &script.tool);
> > + if (IS_ERR(session))
> > + return NULL;
> > + script.session = session;
> > + perf_session__process_events(session);
> > + perf_session__delete(session);
> > +
> > + return NULL;
> > +}
> > +
> > static int __run_perf_stat(int argc, const char **argv, int run_idx)
> > {
> > int interval = stat_config.interval;
> > @@ -709,15 +856,38 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> > struct affinity saved_affinity, *affinity = NULL;
> > int err;
> > bool second_pass = false;
> > + struct child_process cmd;
> > + pthread_t thread_script;
> >
> > /* Prepare perf record for sampling event retire_latency before fork and
> > * prepare workload */
> > if (stat_config.tpebs_event_size > 0) {
> > int ret;
> > + struct tpebs_event *e;
> >
> > - ret = __run_perf_record();
> > + pr_debug("perf stat pid = %d\n", getpid());
> > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > + struct tpebs_retire_lat *new = malloc(sizeof(struct
> tpebs_retire_lat));
> > +
> > + if (!new)
> > + return -1;
> > + new->event.name = strdup(e->name);
> > + new->event.tpebs_name = strdup(e->tpebs_name);
>
> These can fail too.
>
>
> > + new->count = 0;
> > + new->sum = 0;
> > + list_add_tail(&new->event.nd, &stat_config.tpebs_results);
> > + }
> > + ret = prepare_perf_record(&cmd);
> > if (ret)
> > return ret;
> > + if (pthread_create(&thread_script, NULL, __cmd_script, &cmd)) {
> > + kill(cmd.pid, SIGTERM);
> > + close(cmd.out);
> > + pr_err("Could not create thread to process sample data.\n");
> > + return -1;
> > + }
> > + /* Wait for perf record initialization a little bit.*/
> > + sleep(2);
>
> This won't guarantee anything. If you want to make sure the
> 'thread_script' to run before the 'perf record' process, you can
> use a pipe to signal that like in evlist__prepare_workload() and
> evlist__start_workload().
This sleep is added to make perf stat wait for record initialization because in the
case that the workload runs very small a mount of time, we'd like to ensure perf
record has enough time to launch and start collecting sample data.
Because the code uses the common API in run-command.h to do the fork, I think
it cannot use PIPE like in evlist__prepare_workload to sync and start perf record
and perf stat data collection together. Please correct me if I'm wrong here.
>
>
> > }
> >
> > if (forks) {
> > @@ -925,6 +1095,17 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> >
> > t1 = rdclock();
> >
> > + if (stat_config.tpebs_event_size > 0) {
> > + int ret;
> > +
> > + kill(cmd.pid, SIGTERM);
> > + pthread_join(thread_script, NULL);
> > + close(cmd.out);
> > + ret = finish_command(&cmd);
> > + if (ret != -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> > + return ret;
> > + }
> > +
> > if (stat_config.walltime_run_table)
> > stat_config.walltime_run[run_idx] = t1 - t0;
> >
> > @@ -1032,6 +1213,9 @@ static void sig_atexit(void)
> > if (child_pid != -1)
> > kill(child_pid, SIGTERM);
> >
> > + if (stat_config.tpebs_pid != -1)
> > + kill(stat_config.tpebs_pid, SIGTERM);
> > +
> > sigprocmask(SIG_SETMASK, &oset, NULL);
> >
> > if (signr == -1)
> > @@ -2972,5 +3156,7 @@ int cmd_stat(int argc, const char **argv)
> > metricgroup__rblist_exit(&stat_config.metric_events);
> > evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack,
> &stat_config.ctl_fd_close);
> >
> > + tpebs_data__delete();
> > +
> > return status;
> > }
> > diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
> > index 08c4bfbd817f..98e3014c0aef 100644
> > --- a/tools/perf/util/data.c
> > +++ b/tools/perf/util/data.c
> > @@ -204,7 +204,11 @@ static bool check_pipe(struct perf_data *data)
> > data->file.fd = fd;
> > data->use_stdio = false;
> > }
> > - } else {
> > + /*
> > + * When is_pipe and data->file.fd is given, use given fd
> > + * instead of STDIN_FILENO or STDOUT_FILENO
> > + */
> > + } else if (data->file.fd <= 0) {
> > data->file.fd = fd;
> > }
> > }
>
> I think this can be in a separate commit.
>
>
> > diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
> > index 7c24ed768ff3..ae788edef30f 100644
> > --- a/tools/perf/util/metricgroup.h
> > +++ b/tools/perf/util/metricgroup.h
> > @@ -68,10 +68,18 @@ struct metric_expr {
> >
> > struct tpebs_event {
> > struct list_head nd;
> > + /* Event name */
> > const char *name;
> > + /* Event name with the TPEBS modifier R */
> > const char *tpebs_name;
> > };
> >
> > +struct tpebs_retire_lat {
> > + struct tpebs_event event;
> > + size_t count;
> > + int sum;
> > +};
>
> Actually I don't know why you need this separate structure.
> Can we just use tpebs_event?
Currently, we use average value as the retire latency value in metrics. But we
might update it to use other value, for example the minimum or maximum. So, I thought
it would be better to have a dedicated data structure to handle this data. I could update
the code to use tpebs_event if you still feel that's better.
Thanks,
Weilin
>
> Thanks,
> Namhyung
>
>
> > +
> > struct metric_event *metricgroup__lookup(struct rblist *metric_events,
> > struct evsel *evsel,
> > bool create);
> > diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> > index b987960df3c5..0726bdc06681 100644
> > --- a/tools/perf/util/stat.h
> > +++ b/tools/perf/util/stat.h
> > @@ -111,6 +111,8 @@ struct perf_stat_config {
> > struct rblist metric_events;
> > struct list_head tpebs_events;
> > size_t tpebs_event_size;
> > + struct list_head tpebs_results;
> > + pid_t tpebs_pid;
> > int ctl_fd;
> > int ctl_fd_ack;
> > bool ctl_fd_close;
> > --
> > 2.43.0
> >
On Mon, Apr 1, 2024 at 2:23 PM Wang, Weilin <weilin.wang@intel.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Namhyung Kim <namhyung@kernel.org>
> > Sent: Monday, April 1, 2024 1:58 PM
> > To: Wang, Weilin <weilin.wang@intel.com>
> > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > <mingo@redhat.com>; Alexander Shishkin
> > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> > Caleb <caleb.biggers@intel.com>
> > Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record when
> > perf stat needs to get retire latency value for a metric.
> >
> > On Fri, Mar 29, 2024 at 12:12 PM <weilin.wang@intel.com> wrote:
> > >
> > > From: Weilin Wang <weilin.wang@intel.com>
> > >
> > > When retire_latency value is used in a metric formula, perf stat would fork a
> > > perf record process with "-e" and "-W" options. Perf record will collect
> > > required retire_latency values in parallel while perf stat is collecting
> > > counting values.
> > >
> > > At the point of time that perf stat stops counting, it would send sigterm
> > signal
> > > to perf record process and receiving sampling data back from perf record
> > from a
> > > pipe. Perf stat will then process the received data to get retire latency data
> > > and calculate metric result.
> > >
> > > Another thread is required to synchronize between perf stat and perf record
> > > when we pass data through pipe.
> > >
> > > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > > Reviewed-by: Ian Rogers <irogers@google.com>
> > > ---
> > > tools/perf/builtin-stat.c | 190
> > +++++++++++++++++++++++++++++++++-
> > > tools/perf/util/data.c | 6 +-
> > > tools/perf/util/metricgroup.h | 8 ++
> > > tools/perf/util/stat.h | 2 +
> > > 4 files changed, 203 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > > index 6291e1e24535..7fbe47b0c44c 100644
> > > --- a/tools/perf/builtin-stat.c
> > > +++ b/tools/perf/builtin-stat.c
> > > @@ -94,8 +94,13 @@
> > > #include <perf/evlist.h>
> > > #include <internal/threadmap.h>
> > >
> > > +#include "util/sample.h"
> > > +#include <sys/param.h>
> > > +#include <subcmd/run-command.h>
> > > +
> > > #define DEFAULT_SEPARATOR " "
> > > #define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
> > > +#define PERF_DATA "-"
> > >
> > > static void print_counters(struct timespec *ts, int argc, const char **argv);
> > >
> > > @@ -163,6 +168,8 @@ static struct perf_stat_config stat_config = {
> > > .ctl_fd_ack = -1,
> > > .iostat_run = false,
> > > .tpebs_events = LIST_HEAD_INIT(stat_config.tpebs_events),
> > > + .tpebs_results = LIST_HEAD_INIT(stat_config.tpebs_results),
> > > + .tpebs_pid = -1,
> > > };
> > >
> > > static bool cpus_map_matched(struct evsel *a, struct evsel *b)
> > > @@ -684,15 +691,155 @@ static enum counter_recovery
> > stat_handle_error(struct evsel *counter)
> > >
> > > if (child_pid != -1)
> > > kill(child_pid, SIGTERM);
> > > + if (stat_config.tpebs_pid != -1)
> > > + kill(stat_config.tpebs_pid, SIGTERM);
> > > return COUNTER_FATAL;
> > > }
> > >
> > > -static int __run_perf_record(void)
> > > +static int __run_perf_record(const char **record_argv)
> > > {
> > > + int i = 0;
> > > + struct tpebs_event *e;
> > > +
> > > pr_debug("Prepare perf record for retire_latency\n");
> > > +
> > > + record_argv[i++] = "perf";
> > > + record_argv[i++] = "record";
> > > + record_argv[i++] = "-W";
> > > + record_argv[i++] = "--synth=no";
> > > +
> > > + if (stat_config.user_requested_cpu_list) {
> > > + record_argv[i++] = "-C";
> > > + record_argv[i++] = stat_config.user_requested_cpu_list;
> > > + }
> > > +
> > > + if (stat_config.system_wide)
> > > + record_argv[i++] = "-a";
> > > +
> > > + if (!stat_config.system_wide && !stat_config.user_requested_cpu_list)
> > {
> > > + pr_err("Require -a or -C option to run sampling.\n");
> > > + return -ECANCELED;
> > > + }
> > > +
> > > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > > + record_argv[i++] = "-e";
> > > + record_argv[i++] = e->name;
> > > + }
> > > +
> > > + record_argv[i++] = "-o";
> > > + record_argv[i++] = PERF_DATA;
> > > +
> > > return 0;
> > > }
> >
> > Still I think it's weird it has 'perf record' in perf stat (despite the
> > 'perf stat record'). If it's only Intel thing and we don't have a plan
> > to do the same on other arches, we can move it to the arch
> > directory and keep the perf stat code simple.
>
> I'm not sure what is the proper way to solve this. And Ian mentioned
> that put code in arch directory could potentially cause other bugs.
> So I'm wondering if we could keep this code here for now. I could work
> on it later if we found it's better to be in arch directory.
Maybe somewhere in the util/ and keep the main code minimal.
IIUC it's only for very recent (or upcoming?) Intel CPUs and we
don't have tests (hopefully can run on other arch/CPUs).
So I don't think having it here would help fixing potential bugs.
> > >
> > > +static void prepare_run_command(struct child_process *cmd,
> > > + const char **argv)
> > > +{
> > > + memset(cmd, 0, sizeof(*cmd));
> > > + cmd->argv = argv;
> > > + cmd->out = -1;
> > > +}
> > > +
> > > +static int prepare_perf_record(struct child_process *cmd)
> > > +{
> > > + const char **record_argv;
> > > + int ret;
> > > +
> > > + record_argv = calloc(10 + 2 * stat_config.tpebs_event_size, sizeof(char
> > *));
> > > + if (!record_argv)
> > > + return -1;
> > > +
> > > + ret = __run_perf_record(record_argv);
> > > + if (ret)
> > > + return ret;
> > > +
> > > + prepare_run_command(cmd, record_argv);
> > > + return start_command(cmd);
> > > +}
> > > +
> > > +struct perf_script {
> > > + struct perf_tool tool;
> > > + struct perf_session *session;
> > > +};
> > > +
> > > +static void tpebs_data__delete(void)
> > > +{
> > > + struct tpebs_retire_lat *r, *rtmp;
> > > + struct tpebs_event *e, *etmp;
> > > +
> > > + list_for_each_entry_safe(r, rtmp, &stat_config.tpebs_results, event.nd)
> > {
> > > + list_del_init(&r->event.nd);
> > > + free(r);
> > > + }
> > > + list_for_each_entry_safe(e, etmp, &stat_config.tpebs_events, nd) {
> > > + list_del_init(&e->nd);
> > > + free(e);
> >
> > Shouldn't it free the names?
> >
> >
> > > + }
> > > +}
> > > +
> > > +static int process_sample_event(struct perf_tool *tool __maybe_unused,
> > > + union perf_event *event __maybe_unused,
> > > + struct perf_sample *sample,
> > > + struct evsel *evsel,
> > > + struct machine *machine __maybe_unused)
> > > +{
> > > + int ret = 0;
> > > + const char *evname;
> > > + struct tpebs_retire_lat *t;
> > > +
> > > + evname = evsel__name(evsel);
> > > +
> > > + /*
> > > + * Need to handle per core results? We are assuming average retire
> > > + * latency value will be used. Save the number of samples and the sum
> > of
> > > + * retire latency value for each event.
> > > + */
> > > + list_for_each_entry(t, &stat_config.tpebs_results, event.nd) {
> > > + if (!strcmp(evname, t->event.name)) {
> > > + t->count += 1;
> > > + t->sum += sample->retire_lat;
> > > + break;
> > > + }
> > > + }
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +static int process_feature_event(struct perf_session *session,
> > > + union perf_event *event)
> > > +{
> > > + if (event->feat.feat_id < HEADER_LAST_FEATURE)
> > > + return perf_event__process_feature(session, event);
> > > + return 0;
> > > +}
> > > +
> > > +static void *__cmd_script(void *arg __maybe_unused)
> >
> > The arg is used.
> >
> > Also I don't like the name 'script' as it has nothing to do with
> > scripting. Maybe 'sample_reader', 'tpebs_reader' or
> > 'reader_thread'?
> >
> >
> > > +{
> > > + struct child_process *cmd = arg;
> > > + struct perf_session *session;
> > > + struct perf_data data = {
> > > + .mode = PERF_DATA_MODE_READ,
> > > + .path = PERF_DATA,
> > > + .file.fd = cmd->out,
> > > + };
> > > + struct perf_script script = {
> > > + .tool = {
> > > + .sample = process_sample_event,
> > > + .feature = process_feature_event,
> > > + .attr = perf_event__process_attr,
> >
> > Broken indentation. And if you just use the tool, you can
> > pass it directly.
> >
> >
> > > + },
> > > + };
> > > +
> > > + session = perf_session__new(&data, &script.tool);
> > > + if (IS_ERR(session))
> > > + return NULL;
> > > + script.session = session;
> > > + perf_session__process_events(session);
> > > + perf_session__delete(session);
> > > +
> > > + return NULL;
> > > +}
> > > +
> > > static int __run_perf_stat(int argc, const char **argv, int run_idx)
> > > {
> > > int interval = stat_config.interval;
> > > @@ -709,15 +856,38 @@ static int __run_perf_stat(int argc, const char
> > **argv, int run_idx)
> > > struct affinity saved_affinity, *affinity = NULL;
> > > int err;
> > > bool second_pass = false;
> > > + struct child_process cmd;
> > > + pthread_t thread_script;
> > >
> > > /* Prepare perf record for sampling event retire_latency before fork and
> > > * prepare workload */
> > > if (stat_config.tpebs_event_size > 0) {
> > > int ret;
> > > + struct tpebs_event *e;
> > >
> > > - ret = __run_perf_record();
> > > + pr_debug("perf stat pid = %d\n", getpid());
> > > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > > + struct tpebs_retire_lat *new = malloc(sizeof(struct
> > tpebs_retire_lat));
> > > +
> > > + if (!new)
> > > + return -1;
> > > + new->event.name = strdup(e->name);
> > > + new->event.tpebs_name = strdup(e->tpebs_name);
> >
> > These can fail too.
> >
> >
> > > + new->count = 0;
> > > + new->sum = 0;
> > > + list_add_tail(&new->event.nd, &stat_config.tpebs_results);
> > > + }
> > > + ret = prepare_perf_record(&cmd);
> > > if (ret)
> > > return ret;
> > > + if (pthread_create(&thread_script, NULL, __cmd_script, &cmd)) {
> > > + kill(cmd.pid, SIGTERM);
> > > + close(cmd.out);
> > > + pr_err("Could not create thread to process sample data.\n");
> > > + return -1;
> > > + }
> > > + /* Wait for perf record initialization a little bit.*/
> > > + sleep(2);
> >
> > This won't guarantee anything. If you want to make sure the
> > 'thread_script' to run before the 'perf record' process, you can
> > use a pipe to signal that like in evlist__prepare_workload() and
> > evlist__start_workload().
>
> This sleep is added to make perf stat wait for record initialization because in the
> case that the workload runs very small a mount of time, we'd like to ensure perf
> record has enough time to launch and start collecting sample data.
But waiting for 2 seconds won't solve the problem.
>
> Because the code uses the common API in run-command.h to do the fork, I think
> it cannot use PIPE like in evlist__prepare_workload to sync and start perf record
> and perf stat data collection together. Please correct me if I'm wrong here.
Ok, it'd be hard to sync both perf record and perf stat with a single workload.
I think you can try --control option in perf record to enable/disable
with timing
you want. Also --synth=no should reduce a lot of overhead during
initialization.
Maybe you can also add --synth=no-kernel to completely skip the synthesis.
Thanks,
Namhyung
> > > }
> > >
> > > if (forks) {
> > > @@ -925,6 +1095,17 @@ static int __run_perf_stat(int argc, const char
> > **argv, int run_idx)
> > >
> > > t1 = rdclock();
> > >
> > > + if (stat_config.tpebs_event_size > 0) {
> > > + int ret;
> > > +
> > > + kill(cmd.pid, SIGTERM);
> > > + pthread_join(thread_script, NULL);
> > > + close(cmd.out);
> > > + ret = finish_command(&cmd);
> > > + if (ret != -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> > > + return ret;
> > > + }
> > > +
> > > if (stat_config.walltime_run_table)
> > > stat_config.walltime_run[run_idx] = t1 - t0;
> > >
> > > @@ -1032,6 +1213,9 @@ static void sig_atexit(void)
> > > if (child_pid != -1)
> > > kill(child_pid, SIGTERM);
> > >
> > > + if (stat_config.tpebs_pid != -1)
> > > + kill(stat_config.tpebs_pid, SIGTERM);
> > > +
> > > sigprocmask(SIG_SETMASK, &oset, NULL);
> > >
> > > if (signr == -1)
> > > @@ -2972,5 +3156,7 @@ int cmd_stat(int argc, const char **argv)
> > > metricgroup__rblist_exit(&stat_config.metric_events);
> > > evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack,
> > &stat_config.ctl_fd_close);
> > >
> > > + tpebs_data__delete();
> > > +
> > > return status;
> > > }
> > > diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
> > > index 08c4bfbd817f..98e3014c0aef 100644
> > > --- a/tools/perf/util/data.c
> > > +++ b/tools/perf/util/data.c
> > > @@ -204,7 +204,11 @@ static bool check_pipe(struct perf_data *data)
> > > data->file.fd = fd;
> > > data->use_stdio = false;
> > > }
> > > - } else {
> > > + /*
> > > + * When is_pipe and data->file.fd is given, use given fd
> > > + * instead of STDIN_FILENO or STDOUT_FILENO
> > > + */
> > > + } else if (data->file.fd <= 0) {
> > > data->file.fd = fd;
> > > }
> > > }
> >
> > I think this can be in a separate commit.
> >
> >
> > > diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
> > > index 7c24ed768ff3..ae788edef30f 100644
> > > --- a/tools/perf/util/metricgroup.h
> > > +++ b/tools/perf/util/metricgroup.h
> > > @@ -68,10 +68,18 @@ struct metric_expr {
> > >
> > > struct tpebs_event {
> > > struct list_head nd;
> > > + /* Event name */
> > > const char *name;
> > > + /* Event name with the TPEBS modifier R */
> > > const char *tpebs_name;
> > > };
> > >
> > > +struct tpebs_retire_lat {
> > > + struct tpebs_event event;
> > > + size_t count;
> > > + int sum;
> > > +};
> >
> > Actually I don't know why you need this separate structure.
> > Can we just use tpebs_event?
>
> Currently, we use average value as the retire latency value in metrics. But we
> might update it to use other value, for example the minimum or maximum. So, I thought
> it would be better to have a dedicated data structure to handle this data. I could update
> the code to use tpebs_event if you still feel that's better.
>
> Thanks,
> Weilin
>
> >
> > Thanks,
> > Namhyung
> >
> >
> > > +
> > > struct metric_event *metricgroup__lookup(struct rblist *metric_events,
> > > struct evsel *evsel,
> > > bool create);
> > > diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> > > index b987960df3c5..0726bdc06681 100644
> > > --- a/tools/perf/util/stat.h
> > > +++ b/tools/perf/util/stat.h
> > > @@ -111,6 +111,8 @@ struct perf_stat_config {
> > > struct rblist metric_events;
> > > struct list_head tpebs_events;
> > > size_t tpebs_event_size;
> > > + struct list_head tpebs_results;
> > > + pid_t tpebs_pid;
> > > int ctl_fd;
> > > int ctl_fd_ack;
> > > bool ctl_fd_close;
> > > --
> > > 2.43.0
> > >
> -----Original Message-----
> From: Namhyung Kim <namhyung@kernel.org>
> Sent: Tuesday, April 23, 2024 1:59 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> On Mon, Apr 1, 2024 at 2:23 PM Wang, Weilin <weilin.wang@intel.com>
> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Namhyung Kim <namhyung@kernel.org>
> > > Sent: Monday, April 1, 2024 1:58 PM
> > > To: Wang, Weilin <weilin.wang@intel.com>
> > > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > > <mingo@redhat.com>; Alexander Shishkin
> > > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor,
> Perry
> > > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>;
> Biggers,
> > > Caleb <caleb.biggers@intel.com>
> > > Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record
> when
> > > perf stat needs to get retire latency value for a metric.
> > >
> > > On Fri, Mar 29, 2024 at 12:12 PM <weilin.wang@intel.com> wrote:
> > > >
> > > > From: Weilin Wang <weilin.wang@intel.com>
> > > >
> > > > When retire_latency value is used in a metric formula, perf stat would
> fork a
> > > > perf record process with "-e" and "-W" options. Perf record will collect
> > > > required retire_latency values in parallel while perf stat is collecting
> > > > counting values.
> > > >
> > > > At the point of time that perf stat stops counting, it would send sigterm
> > > signal
> > > > to perf record process and receiving sampling data back from perf record
> > > from a
> > > > pipe. Perf stat will then process the received data to get retire latency
> data
> > > > and calculate metric result.
> > > >
> > > > Another thread is required to synchronize between perf stat and perf
> record
> > > > when we pass data through pipe.
> > > >
> > > > Signed-off-by: Weilin Wang <weilin.wang@intel.com>
> > > > Reviewed-by: Ian Rogers <irogers@google.com>
> > > > ---
> > > > tools/perf/builtin-stat.c | 190
> > > +++++++++++++++++++++++++++++++++-
> > > > tools/perf/util/data.c | 6 +-
> > > > tools/perf/util/metricgroup.h | 8 ++
> > > > tools/perf/util/stat.h | 2 +
> > > > 4 files changed, 203 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > > > index 6291e1e24535..7fbe47b0c44c 100644
> > > > --- a/tools/perf/builtin-stat.c
> > > > +++ b/tools/perf/builtin-stat.c
> > > > @@ -94,8 +94,13 @@
> > > > #include <perf/evlist.h>
> > > > #include <internal/threadmap.h>
> > > >
> > > > +#include "util/sample.h"
> > > > +#include <sys/param.h>
> > > > +#include <subcmd/run-command.h>
> > > > +
> > > > #define DEFAULT_SEPARATOR " "
> > > > #define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
> > > > +#define PERF_DATA "-"
> > > >
> > > > static void print_counters(struct timespec *ts, int argc, const char
> **argv);
> > > >
> > > > @@ -163,6 +168,8 @@ static struct perf_stat_config stat_config = {
> > > > .ctl_fd_ack = -1,
> > > > .iostat_run = false,
> > > > .tpebs_events = LIST_HEAD_INIT(stat_config.tpebs_events),
> > > > + .tpebs_results = LIST_HEAD_INIT(stat_config.tpebs_results),
> > > > + .tpebs_pid = -1,
> > > > };
> > > >
> > > > static bool cpus_map_matched(struct evsel *a, struct evsel *b)
> > > > @@ -684,15 +691,155 @@ static enum counter_recovery
> > > stat_handle_error(struct evsel *counter)
> > > >
> > > > if (child_pid != -1)
> > > > kill(child_pid, SIGTERM);
> > > > + if (stat_config.tpebs_pid != -1)
> > > > + kill(stat_config.tpebs_pid, SIGTERM);
> > > > return COUNTER_FATAL;
> > > > }
> > > >
> > > > -static int __run_perf_record(void)
> > > > +static int __run_perf_record(const char **record_argv)
> > > > {
> > > > + int i = 0;
> > > > + struct tpebs_event *e;
> > > > +
> > > > pr_debug("Prepare perf record for retire_latency\n");
> > > > +
> > > > + record_argv[i++] = "perf";
> > > > + record_argv[i++] = "record";
> > > > + record_argv[i++] = "-W";
> > > > + record_argv[i++] = "--synth=no";
> > > > +
> > > > + if (stat_config.user_requested_cpu_list) {
> > > > + record_argv[i++] = "-C";
> > > > + record_argv[i++] = stat_config.user_requested_cpu_list;
> > > > + }
> > > > +
> > > > + if (stat_config.system_wide)
> > > > + record_argv[i++] = "-a";
> > > > +
> > > > + if (!stat_config.system_wide
> && !stat_config.user_requested_cpu_list)
> > > {
> > > > + pr_err("Require -a or -C option to run sampling.\n");
> > > > + return -ECANCELED;
> > > > + }
> > > > +
> > > > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > > > + record_argv[i++] = "-e";
> > > > + record_argv[i++] = e->name;
> > > > + }
> > > > +
> > > > + record_argv[i++] = "-o";
> > > > + record_argv[i++] = PERF_DATA;
> > > > +
> > > > return 0;
> > > > }
> > >
> > > Still I think it's weird it has 'perf record' in perf stat (despite the
> > > 'perf stat record'). If it's only Intel thing and we don't have a plan
> > > to do the same on other arches, we can move it to the arch
> > > directory and keep the perf stat code simple.
> >
> > I'm not sure what is the proper way to solve this. And Ian mentioned
> > that put code in arch directory could potentially cause other bugs.
> > So I'm wondering if we could keep this code here for now. I could work
> > on it later if we found it's better to be in arch directory.
>
> Maybe somewhere in the util/ and keep the main code minimal.
> IIUC it's only for very recent (or upcoming?) Intel CPUs and we
> don't have tests (hopefully can run on other arch/CPUs).
>
> So I don't think having it here would help fixing potential bugs.
Do you mean by creating a new file in util/ to hold this code?
Yes, this feature is for very recent Intel CPUs. It should only be triggered if
a metric uses event(s) that has the R modifier in the formula.
Thanks,
Weilin
>
> > > >
> > > > +static void prepare_run_command(struct child_process *cmd,
> > > > + const char **argv)
> > > > +{
> > > > + memset(cmd, 0, sizeof(*cmd));
> > > > + cmd->argv = argv;
> > > > + cmd->out = -1;
> > > > +}
> > > > +
> > > > +static int prepare_perf_record(struct child_process *cmd)
> > > > +{
> > > > + const char **record_argv;
> > > > + int ret;
> > > > +
> > > > + record_argv = calloc(10 + 2 * stat_config.tpebs_event_size,
> sizeof(char
> > > *));
> > > > + if (!record_argv)
> > > > + return -1;
> > > > +
> > > > + ret = __run_perf_record(record_argv);
> > > > + if (ret)
> > > > + return ret;
> > > > +
> > > > + prepare_run_command(cmd, record_argv);
> > > > + return start_command(cmd);
> > > > +}
> > > > +
> > > > +struct perf_script {
> > > > + struct perf_tool tool;
> > > > + struct perf_session *session;
> > > > +};
> > > > +
> > > > +static void tpebs_data__delete(void)
> > > > +{
> > > > + struct tpebs_retire_lat *r, *rtmp;
> > > > + struct tpebs_event *e, *etmp;
> > > > +
> > > > + list_for_each_entry_safe(r, rtmp, &stat_config.tpebs_results,
> event.nd)
> > > {
> > > > + list_del_init(&r->event.nd);
> > > > + free(r);
> > > > + }
> > > > + list_for_each_entry_safe(e, etmp, &stat_config.tpebs_events, nd) {
> > > > + list_del_init(&e->nd);
> > > > + free(e);
> > >
> > > Shouldn't it free the names?
> > >
> > >
> > > > + }
> > > > +}
> > > > +
> > > > +static int process_sample_event(struct perf_tool *tool
> __maybe_unused,
> > > > + union perf_event *event __maybe_unused,
> > > > + struct perf_sample *sample,
> > > > + struct evsel *evsel,
> > > > + struct machine *machine __maybe_unused)
> > > > +{
> > > > + int ret = 0;
> > > > + const char *evname;
> > > > + struct tpebs_retire_lat *t;
> > > > +
> > > > + evname = evsel__name(evsel);
> > > > +
> > > > + /*
> > > > + * Need to handle per core results? We are assuming average retire
> > > > + * latency value will be used. Save the number of samples and the
> sum
> > > of
> > > > + * retire latency value for each event.
> > > > + */
> > > > + list_for_each_entry(t, &stat_config.tpebs_results, event.nd) {
> > > > + if (!strcmp(evname, t->event.name)) {
> > > > + t->count += 1;
> > > > + t->sum += sample->retire_lat;
> > > > + break;
> > > > + }
> > > > + }
> > > > +
> > > > + return ret;
> > > > +}
> > > > +
> > > > +static int process_feature_event(struct perf_session *session,
> > > > + union perf_event *event)
> > > > +{
> > > > + if (event->feat.feat_id < HEADER_LAST_FEATURE)
> > > > + return perf_event__process_feature(session, event);
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static void *__cmd_script(void *arg __maybe_unused)
> > >
> > > The arg is used.
> > >
> > > Also I don't like the name 'script' as it has nothing to do with
> > > scripting. Maybe 'sample_reader', 'tpebs_reader' or
> > > 'reader_thread'?
> > >
> > >
> > > > +{
> > > > + struct child_process *cmd = arg;
> > > > + struct perf_session *session;
> > > > + struct perf_data data = {
> > > > + .mode = PERF_DATA_MODE_READ,
> > > > + .path = PERF_DATA,
> > > > + .file.fd = cmd->out,
> > > > + };
> > > > + struct perf_script script = {
> > > > + .tool = {
> > > > + .sample = process_sample_event,
> > > > + .feature = process_feature_event,
> > > > + .attr = perf_event__process_attr,
> > >
> > > Broken indentation. And if you just use the tool, you can
> > > pass it directly.
> > >
> > >
> > > > + },
> > > > + };
> > > > +
> > > > + session = perf_session__new(&data, &script.tool);
> > > > + if (IS_ERR(session))
> > > > + return NULL;
> > > > + script.session = session;
> > > > + perf_session__process_events(session);
> > > > + perf_session__delete(session);
> > > > +
> > > > + return NULL;
> > > > +}
> > > > +
> > > > static int __run_perf_stat(int argc, const char **argv, int run_idx)
> > > > {
> > > > int interval = stat_config.interval;
> > > > @@ -709,15 +856,38 @@ static int __run_perf_stat(int argc, const char
> > > **argv, int run_idx)
> > > > struct affinity saved_affinity, *affinity = NULL;
> > > > int err;
> > > > bool second_pass = false;
> > > > + struct child_process cmd;
> > > > + pthread_t thread_script;
> > > >
> > > > /* Prepare perf record for sampling event retire_latency before fork
> and
> > > > * prepare workload */
> > > > if (stat_config.tpebs_event_size > 0) {
> > > > int ret;
> > > > + struct tpebs_event *e;
> > > >
> > > > - ret = __run_perf_record();
> > > > + pr_debug("perf stat pid = %d\n", getpid());
> > > > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > > > + struct tpebs_retire_lat *new = malloc(sizeof(struct
> > > tpebs_retire_lat));
> > > > +
> > > > + if (!new)
> > > > + return -1;
> > > > + new->event.name = strdup(e->name);
> > > > + new->event.tpebs_name = strdup(e->tpebs_name);
> > >
> > > These can fail too.
> > >
> > >
> > > > + new->count = 0;
> > > > + new->sum = 0;
> > > > + list_add_tail(&new->event.nd, &stat_config.tpebs_results);
> > > > + }
> > > > + ret = prepare_perf_record(&cmd);
> > > > if (ret)
> > > > return ret;
> > > > + if (pthread_create(&thread_script, NULL, __cmd_script, &cmd))
> {
> > > > + kill(cmd.pid, SIGTERM);
> > > > + close(cmd.out);
> > > > + pr_err("Could not create thread to process sample data.\n");
> > > > + return -1;
> > > > + }
> > > > + /* Wait for perf record initialization a little bit.*/
> > > > + sleep(2);
> > >
> > > This won't guarantee anything. If you want to make sure the
> > > 'thread_script' to run before the 'perf record' process, you can
> > > use a pipe to signal that like in evlist__prepare_workload() and
> > > evlist__start_workload().
> >
> > This sleep is added to make perf stat wait for record initialization because in
> the
> > case that the workload runs very small a mount of time, we'd like to ensure
> perf
> > record has enough time to launch and start collecting sample data.
>
> But waiting for 2 seconds won't solve the problem.
>
> >
> > Because the code uses the common API in run-command.h to do the fork, I
> think
> > it cannot use PIPE like in evlist__prepare_workload to sync and start perf
> record
> > and perf stat data collection together. Please correct me if I'm wrong here.
>
> Ok, it'd be hard to sync both perf record and perf stat with a single workload.
> I think you can try --control option in perf record to enable/disable
> with timing
> you want. Also --synth=no should reduce a lot of overhead during
> initialization.
> Maybe you can also add --synth=no-kernel to completely skip the synthesis.
>
> Thanks,
> Namhyung
>
>
> > > > }
> > > >
> > > > if (forks) {
> > > > @@ -925,6 +1095,17 @@ static int __run_perf_stat(int argc, const char
> > > **argv, int run_idx)
> > > >
> > > > t1 = rdclock();
> > > >
> > > > + if (stat_config.tpebs_event_size > 0) {
> > > > + int ret;
> > > > +
> > > > + kill(cmd.pid, SIGTERM);
> > > > + pthread_join(thread_script, NULL);
> > > > + close(cmd.out);
> > > > + ret = finish_command(&cmd);
> > > > + if (ret != -ERR_RUN_COMMAND_WAITPID_SIGNAL)
> > > > + return ret;
> > > > + }
> > > > +
> > > > if (stat_config.walltime_run_table)
> > > > stat_config.walltime_run[run_idx] = t1 - t0;
> > > >
> > > > @@ -1032,6 +1213,9 @@ static void sig_atexit(void)
> > > > if (child_pid != -1)
> > > > kill(child_pid, SIGTERM);
> > > >
> > > > + if (stat_config.tpebs_pid != -1)
> > > > + kill(stat_config.tpebs_pid, SIGTERM);
> > > > +
> > > > sigprocmask(SIG_SETMASK, &oset, NULL);
> > > >
> > > > if (signr == -1)
> > > > @@ -2972,5 +3156,7 @@ int cmd_stat(int argc, const char **argv)
> > > > metricgroup__rblist_exit(&stat_config.metric_events);
> > > > evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack,
> > > &stat_config.ctl_fd_close);
> > > >
> > > > + tpebs_data__delete();
> > > > +
> > > > return status;
> > > > }
> > > > diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
> > > > index 08c4bfbd817f..98e3014c0aef 100644
> > > > --- a/tools/perf/util/data.c
> > > > +++ b/tools/perf/util/data.c
> > > > @@ -204,7 +204,11 @@ static bool check_pipe(struct perf_data *data)
> > > > data->file.fd = fd;
> > > > data->use_stdio = false;
> > > > }
> > > > - } else {
> > > > + /*
> > > > + * When is_pipe and data->file.fd is given, use given fd
> > > > + * instead of STDIN_FILENO or STDOUT_FILENO
> > > > + */
> > > > + } else if (data->file.fd <= 0) {
> > > > data->file.fd = fd;
> > > > }
> > > > }
> > >
> > > I think this can be in a separate commit.
> > >
> > >
> > > > diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
> > > > index 7c24ed768ff3..ae788edef30f 100644
> > > > --- a/tools/perf/util/metricgroup.h
> > > > +++ b/tools/perf/util/metricgroup.h
> > > > @@ -68,10 +68,18 @@ struct metric_expr {
> > > >
> > > > struct tpebs_event {
> > > > struct list_head nd;
> > > > + /* Event name */
> > > > const char *name;
> > > > + /* Event name with the TPEBS modifier R */
> > > > const char *tpebs_name;
> > > > };
> > > >
> > > > +struct tpebs_retire_lat {
> > > > + struct tpebs_event event;
> > > > + size_t count;
> > > > + int sum;
> > > > +};
> > >
> > > Actually I don't know why you need this separate structure.
> > > Can we just use tpebs_event?
> >
> > Currently, we use average value as the retire latency value in metrics. But we
> > might update it to use other value, for example the minimum or maximum.
> So, I thought
> > it would be better to have a dedicated data structure to handle this data. I
> could update
> > the code to use tpebs_event if you still feel that's better.
> >
> > Thanks,
> > Weilin
> >
> > >
> > > Thanks,
> > > Namhyung
> > >
> > >
> > > > +
> > > > struct metric_event *metricgroup__lookup(struct rblist *metric_events,
> > > > struct evsel *evsel,
> > > > bool create);
> > > > diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> > > > index b987960df3c5..0726bdc06681 100644
> > > > --- a/tools/perf/util/stat.h
> > > > +++ b/tools/perf/util/stat.h
> > > > @@ -111,6 +111,8 @@ struct perf_stat_config {
> > > > struct rblist metric_events;
> > > > struct list_head tpebs_events;
> > > > size_t tpebs_event_size;
> > > > + struct list_head tpebs_results;
> > > > + pid_t tpebs_pid;
> > > > int ctl_fd;
> > > > int ctl_fd_ack;
> > > > bool ctl_fd_close;
> > > > --
> > > > 2.43.0
> > > >
On Tue, Apr 23, 2024 at 3:16 PM Wang, Weilin <weilin.wang@intel.com> wrote:
> > > > > -static int __run_perf_record(void)
> > > > > +static int __run_perf_record(const char **record_argv)
> > > > > {
> > > > > + int i = 0;
> > > > > + struct tpebs_event *e;
> > > > > +
> > > > > pr_debug("Prepare perf record for retire_latency\n");
> > > > > +
> > > > > + record_argv[i++] = "perf";
> > > > > + record_argv[i++] = "record";
> > > > > + record_argv[i++] = "-W";
> > > > > + record_argv[i++] = "--synth=no";
> > > > > +
> > > > > + if (stat_config.user_requested_cpu_list) {
> > > > > + record_argv[i++] = "-C";
> > > > > + record_argv[i++] = stat_config.user_requested_cpu_list;
> > > > > + }
> > > > > +
> > > > > + if (stat_config.system_wide)
> > > > > + record_argv[i++] = "-a";
> > > > > +
> > > > > + if (!stat_config.system_wide
> > && !stat_config.user_requested_cpu_list)
> > > > {
> > > > > + pr_err("Require -a or -C option to run sampling.\n");
> > > > > + return -ECANCELED;
> > > > > + }
> > > > > +
> > > > > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > > > > + record_argv[i++] = "-e";
> > > > > + record_argv[i++] = e->name;
> > > > > + }
> > > > > +
> > > > > + record_argv[i++] = "-o";
> > > > > + record_argv[i++] = PERF_DATA;
> > > > > +
> > > > > return 0;
> > > > > }
> > > >
> > > > Still I think it's weird it has 'perf record' in perf stat (despite the
> > > > 'perf stat record'). If it's only Intel thing and we don't have a plan
> > > > to do the same on other arches, we can move it to the arch
> > > > directory and keep the perf stat code simple.
> > >
> > > I'm not sure what is the proper way to solve this. And Ian mentioned
> > > that put code in arch directory could potentially cause other bugs.
> > > So I'm wondering if we could keep this code here for now. I could work
> > > on it later if we found it's better to be in arch directory.
> >
> > Maybe somewhere in the util/ and keep the main code minimal.
> > IIUC it's only for very recent (or upcoming?) Intel CPUs and we
> > don't have tests (hopefully can run on other arch/CPUs).
> >
> > So I don't think having it here would help fixing potential bugs.
>
> Do you mean by creating a new file in util/ to hold this code?
Yeah, maybe util/intel-tpebs.c (if it's better than arch/x86/...) ?
>
> Yes, this feature is for very recent Intel CPUs. It should only be triggered if
> a metric uses event(s) that has the R modifier in the formula.
Can we have a test with a fake metric so that we can test
the code on non-(or old-)Intel machines?
Thanks,
Namhyung
> -----Original Message-----
> From: Namhyung Kim <namhyung@kernel.org>
> Sent: Tuesday, April 23, 2024 4:06 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> On Tue, Apr 23, 2024 at 3:16 PM Wang, Weilin <weilin.wang@intel.com>
> wrote:
> > > > > > -static int __run_perf_record(void)
> > > > > > +static int __run_perf_record(const char **record_argv)
> > > > > > {
> > > > > > + int i = 0;
> > > > > > + struct tpebs_event *e;
> > > > > > +
> > > > > > pr_debug("Prepare perf record for retire_latency\n");
> > > > > > +
> > > > > > + record_argv[i++] = "perf";
> > > > > > + record_argv[i++] = "record";
> > > > > > + record_argv[i++] = "-W";
> > > > > > + record_argv[i++] = "--synth=no";
> > > > > > +
> > > > > > + if (stat_config.user_requested_cpu_list) {
> > > > > > + record_argv[i++] = "-C";
> > > > > > + record_argv[i++] = stat_config.user_requested_cpu_list;
> > > > > > + }
> > > > > > +
> > > > > > + if (stat_config.system_wide)
> > > > > > + record_argv[i++] = "-a";
> > > > > > +
> > > > > > + if (!stat_config.system_wide
> > > && !stat_config.user_requested_cpu_list)
> > > > > {
> > > > > > + pr_err("Require -a or -C option to run sampling.\n");
> > > > > > + return -ECANCELED;
> > > > > > + }
> > > > > > +
> > > > > > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > > > > > + record_argv[i++] = "-e";
> > > > > > + record_argv[i++] = e->name;
> > > > > > + }
> > > > > > +
> > > > > > + record_argv[i++] = "-o";
> > > > > > + record_argv[i++] = PERF_DATA;
> > > > > > +
> > > > > > return 0;
> > > > > > }
> > > > >
> > > > > Still I think it's weird it has 'perf record' in perf stat (despite the
> > > > > 'perf stat record'). If it's only Intel thing and we don't have a plan
> > > > > to do the same on other arches, we can move it to the arch
> > > > > directory and keep the perf stat code simple.
> > > >
> > > > I'm not sure what is the proper way to solve this. And Ian mentioned
> > > > that put code in arch directory could potentially cause other bugs.
> > > > So I'm wondering if we could keep this code here for now. I could work
> > > > on it later if we found it's better to be in arch directory.
> > >
> > > Maybe somewhere in the util/ and keep the main code minimal.
> > > IIUC it's only for very recent (or upcoming?) Intel CPUs and we
> > > don't have tests (hopefully can run on other arch/CPUs).
> > >
> > > So I don't think having it here would help fixing potential bugs.
> >
> > Do you mean by creating a new file in util/ to hold this code?
>
> Yeah, maybe util/intel-tpebs.c (if it's better than arch/x86/...) ?
>
> >
> > Yes, this feature is for very recent Intel CPUs. It should only be triggered if
> > a metric uses event(s) that has the R modifier in the formula.
>
> Can we have a test with a fake metric so that we can test
> the code on non-(or old-)Intel machines?
All the existing metrics in non-(or old-)Intel CPUs should work as usual. So I think
existing metric tests should work for it.
If we want to add a fake metric uses the :R modifier for those platforms, the metric
should either fail (if the fake metric uses an event not exist on the test platform) or
return all 0 retire latency data.
So, I'm not quite sure what we want the fake metric to test for. Maybe we could
continue using existing metric tests to ensure all the defined metrics work correctly
in each machine under test?
Thanks,
Weilin
>
> Thanks,
> Namhyung
On Wed, Apr 24, 2024 at 10:08 AM Wang, Weilin <weilin.wang@intel.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Namhyung Kim <namhyung@kernel.org>
> > Sent: Tuesday, April 23, 2024 4:06 PM
> > To: Wang, Weilin <weilin.wang@intel.com>
> > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > <mingo@redhat.com>; Alexander Shishkin
> > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> > Caleb <caleb.biggers@intel.com>
> > Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record when
> > perf stat needs to get retire latency value for a metric.
> >
> > On Tue, Apr 23, 2024 at 3:16 PM Wang, Weilin <weilin.wang@intel.com>
> > wrote:
> > > > > > > -static int __run_perf_record(void)
> > > > > > > +static int __run_perf_record(const char **record_argv)
> > > > > > > {
> > > > > > > + int i = 0;
> > > > > > > + struct tpebs_event *e;
> > > > > > > +
> > > > > > > pr_debug("Prepare perf record for retire_latency\n");
> > > > > > > +
> > > > > > > + record_argv[i++] = "perf";
> > > > > > > + record_argv[i++] = "record";
> > > > > > > + record_argv[i++] = "-W";
> > > > > > > + record_argv[i++] = "--synth=no";
> > > > > > > +
> > > > > > > + if (stat_config.user_requested_cpu_list) {
> > > > > > > + record_argv[i++] = "-C";
> > > > > > > + record_argv[i++] = stat_config.user_requested_cpu_list;
> > > > > > > + }
> > > > > > > +
> > > > > > > + if (stat_config.system_wide)
> > > > > > > + record_argv[i++] = "-a";
> > > > > > > +
> > > > > > > + if (!stat_config.system_wide
> > > > && !stat_config.user_requested_cpu_list)
> > > > > > {
> > > > > > > + pr_err("Require -a or -C option to run sampling.\n");
> > > > > > > + return -ECANCELED;
> > > > > > > + }
> > > > > > > +
> > > > > > > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > > > > > > + record_argv[i++] = "-e";
> > > > > > > + record_argv[i++] = e->name;
> > > > > > > + }
> > > > > > > +
> > > > > > > + record_argv[i++] = "-o";
> > > > > > > + record_argv[i++] = PERF_DATA;
> > > > > > > +
> > > > > > > return 0;
> > > > > > > }
> > > > > >
> > > > > > Still I think it's weird it has 'perf record' in perf stat (despite the
> > > > > > 'perf stat record'). If it's only Intel thing and we don't have a plan
> > > > > > to do the same on other arches, we can move it to the arch
> > > > > > directory and keep the perf stat code simple.
> > > > >
> > > > > I'm not sure what is the proper way to solve this. And Ian mentioned
> > > > > that put code in arch directory could potentially cause other bugs.
> > > > > So I'm wondering if we could keep this code here for now. I could work
> > > > > on it later if we found it's better to be in arch directory.
> > > >
> > > > Maybe somewhere in the util/ and keep the main code minimal.
> > > > IIUC it's only for very recent (or upcoming?) Intel CPUs and we
> > > > don't have tests (hopefully can run on other arch/CPUs).
> > > >
> > > > So I don't think having it here would help fixing potential bugs.
> > >
> > > Do you mean by creating a new file in util/ to hold this code?
> >
> > Yeah, maybe util/intel-tpebs.c (if it's better than arch/x86/...) ?
> >
> > >
> > > Yes, this feature is for very recent Intel CPUs. It should only be triggered if
> > > a metric uses event(s) that has the R modifier in the formula.
> >
> > Can we have a test with a fake metric so that we can test
> > the code on non-(or old-)Intel machines?
>
> All the existing metrics in non-(or old-)Intel CPUs should work as usual. So I think
> existing metric tests should work for it.
>
> If we want to add a fake metric uses the :R modifier for those platforms, the metric
> should either fail (if the fake metric uses an event not exist on the test platform) or
> return all 0 retire latency data.
>
> So, I'm not quite sure what we want the fake metric to test for. Maybe we could
> continue using existing metric tests to ensure all the defined metrics work correctly
> in each machine under test?
I think it's ok to return all 0 retire latency for fake tPEBS metrics.
It's just to verify the background record + report logic works ok.
Thanks,
Namhyung
> -----Original Message-----
> From: Namhyung Kim <namhyung@kernel.org>
> Sent: Wednesday, April 24, 2024 11:50 AM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> On Wed, Apr 24, 2024 at 10:08 AM Wang, Weilin <weilin.wang@intel.com>
> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Namhyung Kim <namhyung@kernel.org>
> > > Sent: Tuesday, April 23, 2024 4:06 PM
> > > To: Wang, Weilin <weilin.wang@intel.com>
> > > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > > <mingo@redhat.com>; Alexander Shishkin
> > > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor,
> Perry
> > > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>;
> Biggers,
> > > Caleb <caleb.biggers@intel.com>
> > > Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record
> when
> > > perf stat needs to get retire latency value for a metric.
> > >
> > > On Tue, Apr 23, 2024 at 3:16 PM Wang, Weilin <weilin.wang@intel.com>
> > > wrote:
> > > > > > > > -static int __run_perf_record(void)
> > > > > > > > +static int __run_perf_record(const char **record_argv)
> > > > > > > > {
> > > > > > > > + int i = 0;
> > > > > > > > + struct tpebs_event *e;
> > > > > > > > +
> > > > > > > > pr_debug("Prepare perf record for retire_latency\n");
> > > > > > > > +
> > > > > > > > + record_argv[i++] = "perf";
> > > > > > > > + record_argv[i++] = "record";
> > > > > > > > + record_argv[i++] = "-W";
> > > > > > > > + record_argv[i++] = "--synth=no";
> > > > > > > > +
> > > > > > > > + if (stat_config.user_requested_cpu_list) {
> > > > > > > > + record_argv[i++] = "-C";
> > > > > > > > + record_argv[i++] = stat_config.user_requested_cpu_list;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > + if (stat_config.system_wide)
> > > > > > > > + record_argv[i++] = "-a";
> > > > > > > > +
> > > > > > > > + if (!stat_config.system_wide
> > > > > && !stat_config.user_requested_cpu_list)
> > > > > > > {
> > > > > > > > + pr_err("Require -a or -C option to run sampling.\n");
> > > > > > > > + return -ECANCELED;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > > > > > > > + record_argv[i++] = "-e";
> > > > > > > > + record_argv[i++] = e->name;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > + record_argv[i++] = "-o";
> > > > > > > > + record_argv[i++] = PERF_DATA;
> > > > > > > > +
> > > > > > > > return 0;
> > > > > > > > }
> > > > > > >
> > > > > > > Still I think it's weird it has 'perf record' in perf stat (despite the
> > > > > > > 'perf stat record'). If it's only Intel thing and we don't have a plan
> > > > > > > to do the same on other arches, we can move it to the arch
> > > > > > > directory and keep the perf stat code simple.
> > > > > >
> > > > > > I'm not sure what is the proper way to solve this. And Ian mentioned
> > > > > > that put code in arch directory could potentially cause other bugs.
> > > > > > So I'm wondering if we could keep this code here for now. I could
> work
> > > > > > on it later if we found it's better to be in arch directory.
> > > > >
> > > > > Maybe somewhere in the util/ and keep the main code minimal.
> > > > > IIUC it's only for very recent (or upcoming?) Intel CPUs and we
> > > > > don't have tests (hopefully can run on other arch/CPUs).
> > > > >
> > > > > So I don't think having it here would help fixing potential bugs.
> > > >
> > > > Do you mean by creating a new file in util/ to hold this code?
> > >
> > > Yeah, maybe util/intel-tpebs.c (if it's better than arch/x86/...) ?
> > >
> > > >
> > > > Yes, this feature is for very recent Intel CPUs. It should only be triggered if
> > > > a metric uses event(s) that has the R modifier in the formula.
> > >
> > > Can we have a test with a fake metric so that we can test
> > > the code on non-(or old-)Intel machines?
> >
> > All the existing metrics in non-(or old-)Intel CPUs should work as usual. So I
> think
> > existing metric tests should work for it.
> >
> > If we want to add a fake metric uses the :R modifier for those platforms, the
> metric
> > should either fail (if the fake metric uses an event not exist on the test
> platform) or
> > return all 0 retire latency data.
> >
> > So, I'm not quite sure what we want the fake metric to test for. Maybe we
> could
> > continue using existing metric tests to ensure all the defined metrics work
> correctly
> > in each machine under test?
>
> I think it's ok to return all 0 retire latency for fake tPEBS metrics.
> It's just to verify the background record + report logic works ok.
Hi Namhyung,
After think more about how TPEBS and metrics work, I feel should discuss more
about defining a fake TPEBS metric in unsupported platforms.
If we'd like a add fake metrics, where should we define and store these metrics?
Should we add this type of metrics for every platform? All the official metrics
we publish are defined by architect and stored in JSON files under separate
directories for each platform. I think it is not a good idea to place fake metrics
together with real metrics. Could you please let me know if there is any other
method to define fake metrics for testing?
Thanks,
Weilin
>
> Thanks,
> Namhyung
On Tue, May 14, 2024 at 10:57 PM Wang, Weilin <weilin.wang@intel.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Namhyung Kim <namhyung@kernel.org>
> > Sent: Wednesday, April 24, 2024 11:50 AM
> > To: Wang, Weilin <weilin.wang@intel.com>
> > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > <mingo@redhat.com>; Alexander Shishkin
> > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> > Caleb <caleb.biggers@intel.com>
> > Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record when
> > perf stat needs to get retire latency value for a metric.
> >
> > On Wed, Apr 24, 2024 at 10:08 AM Wang, Weilin <weilin.wang@intel.com>
> > wrote:
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Namhyung Kim <namhyung@kernel.org>
> > > > Sent: Tuesday, April 23, 2024 4:06 PM
> > > > To: Wang, Weilin <weilin.wang@intel.com>
> > > > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > > > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > > > <mingo@redhat.com>; Alexander Shishkin
> > > > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > > > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > > > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor,
> > Perry
> > > > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>;
> > Biggers,
> > > > Caleb <caleb.biggers@intel.com>
> > > > Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record
> > when
> > > > perf stat needs to get retire latency value for a metric.
> > > >
> > > > On Tue, Apr 23, 2024 at 3:16 PM Wang, Weilin <weilin.wang@intel.com>
> > > > wrote:
> > > > > > > > > -static int __run_perf_record(void)
> > > > > > > > > +static int __run_perf_record(const char **record_argv)
> > > > > > > > > {
> > > > > > > > > + int i = 0;
> > > > > > > > > + struct tpebs_event *e;
> > > > > > > > > +
> > > > > > > > > pr_debug("Prepare perf record for retire_latency\n");
> > > > > > > > > +
> > > > > > > > > + record_argv[i++] = "perf";
> > > > > > > > > + record_argv[i++] = "record";
> > > > > > > > > + record_argv[i++] = "-W";
> > > > > > > > > + record_argv[i++] = "--synth=no";
> > > > > > > > > +
> > > > > > > > > + if (stat_config.user_requested_cpu_list) {
> > > > > > > > > + record_argv[i++] = "-C";
> > > > > > > > > + record_argv[i++] = stat_config.user_requested_cpu_list;
> > > > > > > > > + }
> > > > > > > > > +
> > > > > > > > > + if (stat_config.system_wide)
> > > > > > > > > + record_argv[i++] = "-a";
> > > > > > > > > +
> > > > > > > > > + if (!stat_config.system_wide
> > > > > > && !stat_config.user_requested_cpu_list)
> > > > > > > > {
> > > > > > > > > + pr_err("Require -a or -C option to run sampling.\n");
> > > > > > > > > + return -ECANCELED;
> > > > > > > > > + }
> > > > > > > > > +
> > > > > > > > > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > > > > > > > > + record_argv[i++] = "-e";
> > > > > > > > > + record_argv[i++] = e->name;
> > > > > > > > > + }
> > > > > > > > > +
> > > > > > > > > + record_argv[i++] = "-o";
> > > > > > > > > + record_argv[i++] = PERF_DATA;
> > > > > > > > > +
> > > > > > > > > return 0;
> > > > > > > > > }
> > > > > > > >
> > > > > > > > Still I think it's weird it has 'perf record' in perf stat (despite the
> > > > > > > > 'perf stat record'). If it's only Intel thing and we don't have a plan
> > > > > > > > to do the same on other arches, we can move it to the arch
> > > > > > > > directory and keep the perf stat code simple.
> > > > > > >
> > > > > > > I'm not sure what is the proper way to solve this. And Ian mentioned
> > > > > > > that put code in arch directory could potentially cause other bugs.
> > > > > > > So I'm wondering if we could keep this code here for now. I could
> > work
> > > > > > > on it later if we found it's better to be in arch directory.
> > > > > >
> > > > > > Maybe somewhere in the util/ and keep the main code minimal.
> > > > > > IIUC it's only for very recent (or upcoming?) Intel CPUs and we
> > > > > > don't have tests (hopefully can run on other arch/CPUs).
> > > > > >
> > > > > > So I don't think having it here would help fixing potential bugs.
> > > > >
> > > > > Do you mean by creating a new file in util/ to hold this code?
> > > >
> > > > Yeah, maybe util/intel-tpebs.c (if it's better than arch/x86/...) ?
> > > >
> > > > >
> > > > > Yes, this feature is for very recent Intel CPUs. It should only be triggered if
> > > > > a metric uses event(s) that has the R modifier in the formula.
> > > >
> > > > Can we have a test with a fake metric so that we can test
> > > > the code on non-(or old-)Intel machines?
> > >
> > > All the existing metrics in non-(or old-)Intel CPUs should work as usual. So I
> > think
> > > existing metric tests should work for it.
> > >
> > > If we want to add a fake metric uses the :R modifier for those platforms, the
> > metric
> > > should either fail (if the fake metric uses an event not exist on the test
> > platform) or
> > > return all 0 retire latency data.
> > >
> > > So, I'm not quite sure what we want the fake metric to test for. Maybe we
> > could
> > > continue using existing metric tests to ensure all the defined metrics work
> > correctly
> > > in each machine under test?
> >
> > I think it's ok to return all 0 retire latency for fake tPEBS metrics.
> > It's just to verify the background record + report logic works ok.
>
> Hi Namhyung,
>
> After think more about how TPEBS and metrics work, I feel should discuss more
> about defining a fake TPEBS metric in unsupported platforms.
> If we'd like a add fake metrics, where should we define and store these metrics?
> Should we add this type of metrics for every platform? All the official metrics
> we publish are defined by architect and stored in JSON files under separate
> directories for each platform. I think it is not a good idea to place fake metrics
> together with real metrics. Could you please let me know if there is any other
> method to define fake metrics for testing?
We do put fake events/metrics in the "test" arch:
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json
It is something of a pain bringing things here over to the
NO_JEVENTS=1 (ie no python) empty-pmu-events.c file. I agree on not
wanting to pollute real metrics with test metrics, we currently just
use the test metrics to fake up some expression parsing tests:
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/tests/pmu-events.c#n811
Thanks,
Ian
> Thanks,
> Weilin
>
> >
> > Thanks,
> > Namhyung
> -----Original Message-----
> From: Ian Rogers <irogers@google.com>
> Sent: Tuesday, May 14, 2024 11:07 PM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Namhyung Kim <namhyung@kernel.org>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> On Tue, May 14, 2024 at 10:57 PM Wang, Weilin <weilin.wang@intel.com>
> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Namhyung Kim <namhyung@kernel.org>
> > > Sent: Wednesday, April 24, 2024 11:50 AM
> > > To: Wang, Weilin <weilin.wang@intel.com>
> > > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > > <mingo@redhat.com>; Alexander Shishkin
> > > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor,
> Perry
> > > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>;
> Biggers,
> > > Caleb <caleb.biggers@intel.com>
> > > Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record
> when
> > > perf stat needs to get retire latency value for a metric.
> > >
> > > On Wed, Apr 24, 2024 at 10:08 AM Wang, Weilin
> <weilin.wang@intel.com>
> > > wrote:
> > > >
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Namhyung Kim <namhyung@kernel.org>
> > > > > Sent: Tuesday, April 23, 2024 4:06 PM
> > > > > To: Wang, Weilin <weilin.wang@intel.com>
> > > > > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > > > > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo
> Molnar
> > > > > <mingo@redhat.com>; Alexander Shishkin
> > > > > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>;
> Hunter,
> > > > > Adrian <adrian.hunter@intel.com>; Kan Liang
> <kan.liang@linux.intel.com>;
> > > > > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor,
> > > Perry
> > > > > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>;
> > > Biggers,
> > > > > Caleb <caleb.biggers@intel.com>
> > > > > Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record
> > > when
> > > > > perf stat needs to get retire latency value for a metric.
> > > > >
> > > > > On Tue, Apr 23, 2024 at 3:16 PM Wang, Weilin
> <weilin.wang@intel.com>
> > > > > wrote:
> > > > > > > > > > -static int __run_perf_record(void)
> > > > > > > > > > +static int __run_perf_record(const char **record_argv)
> > > > > > > > > > {
> > > > > > > > > > + int i = 0;
> > > > > > > > > > + struct tpebs_event *e;
> > > > > > > > > > +
> > > > > > > > > > pr_debug("Prepare perf record for retire_latency\n");
> > > > > > > > > > +
> > > > > > > > > > + record_argv[i++] = "perf";
> > > > > > > > > > + record_argv[i++] = "record";
> > > > > > > > > > + record_argv[i++] = "-W";
> > > > > > > > > > + record_argv[i++] = "--synth=no";
> > > > > > > > > > +
> > > > > > > > > > + if (stat_config.user_requested_cpu_list) {
> > > > > > > > > > + record_argv[i++] = "-C";
> > > > > > > > > > + record_argv[i++] =
> stat_config.user_requested_cpu_list;
> > > > > > > > > > + }
> > > > > > > > > > +
> > > > > > > > > > + if (stat_config.system_wide)
> > > > > > > > > > + record_argv[i++] = "-a";
> > > > > > > > > > +
> > > > > > > > > > + if (!stat_config.system_wide
> > > > > > > && !stat_config.user_requested_cpu_list)
> > > > > > > > > {
> > > > > > > > > > + pr_err("Require -a or -C option to run sampling.\n");
> > > > > > > > > > + return -ECANCELED;
> > > > > > > > > > + }
> > > > > > > > > > +
> > > > > > > > > > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > > > > > > > > > + record_argv[i++] = "-e";
> > > > > > > > > > + record_argv[i++] = e->name;
> > > > > > > > > > + }
> > > > > > > > > > +
> > > > > > > > > > + record_argv[i++] = "-o";
> > > > > > > > > > + record_argv[i++] = PERF_DATA;
> > > > > > > > > > +
> > > > > > > > > > return 0;
> > > > > > > > > > }
> > > > > > > > >
> > > > > > > > > Still I think it's weird it has 'perf record' in perf stat (despite the
> > > > > > > > > 'perf stat record'). If it's only Intel thing and we don't have a plan
> > > > > > > > > to do the same on other arches, we can move it to the arch
> > > > > > > > > directory and keep the perf stat code simple.
> > > > > > > >
> > > > > > > > I'm not sure what is the proper way to solve this. And Ian
> mentioned
> > > > > > > > that put code in arch directory could potentially cause other bugs.
> > > > > > > > So I'm wondering if we could keep this code here for now. I could
> > > work
> > > > > > > > on it later if we found it's better to be in arch directory.
> > > > > > >
> > > > > > > Maybe somewhere in the util/ and keep the main code minimal.
> > > > > > > IIUC it's only for very recent (or upcoming?) Intel CPUs and we
> > > > > > > don't have tests (hopefully can run on other arch/CPUs).
> > > > > > >
> > > > > > > So I don't think having it here would help fixing potential bugs.
> > > > > >
> > > > > > Do you mean by creating a new file in util/ to hold this code?
> > > > >
> > > > > Yeah, maybe util/intel-tpebs.c (if it's better than arch/x86/...) ?
> > > > >
> > > > > >
> > > > > > Yes, this feature is for very recent Intel CPUs. It should only be
> triggered if
> > > > > > a metric uses event(s) that has the R modifier in the formula.
> > > > >
> > > > > Can we have a test with a fake metric so that we can test
> > > > > the code on non-(or old-)Intel machines?
> > > >
> > > > All the existing metrics in non-(or old-)Intel CPUs should work as usual.
> So I
> > > think
> > > > existing metric tests should work for it.
> > > >
> > > > If we want to add a fake metric uses the :R modifier for those platforms,
> the
> > > metric
> > > > should either fail (if the fake metric uses an event not exist on the test
> > > platform) or
> > > > return all 0 retire latency data.
> > > >
> > > > So, I'm not quite sure what we want the fake metric to test for. Maybe we
> > > could
> > > > continue using existing metric tests to ensure all the defined metrics work
> > > correctly
> > > > in each machine under test?
> > >
> > > I think it's ok to return all 0 retire latency for fake tPEBS metrics.
> > > It's just to verify the background record + report logic works ok.
> >
> > Hi Namhyung,
> >
> > After think more about how TPEBS and metrics work, I feel should discuss
> more
> > about defining a fake TPEBS metric in unsupported platforms.
> > If we'd like a add fake metrics, where should we define and store these
> metrics?
> > Should we add this type of metrics for every platform? All the official metrics
> > we publish are defined by architect and stored in JSON files under separate
> > directories for each platform. I think it is not a good idea to place fake metrics
> > together with real metrics. Could you please let me know if there is any other
> > method to define fake metrics for testing?
>
> We do put fake events/metrics in the "test" arch:
> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-
> next.git/tree/tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json
>
> It is something of a pain bringing things here over to the
> NO_JEVENTS=1 (ie no python) empty-pmu-events.c file. I agree on not
> wanting to pollute real metrics with test metrics, we currently just
> use the test metrics to fake up some expression parsing tests:
> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-
> next.git/tree/tools/perf/tests/pmu-events.c#n811
>
Thanks Ian! I will work on this.
Best,
Weilin
> Thanks,
> Ian
>
> > Thanks,
> > Weilin
> >
> > >
> > > Thanks,
> > > Namhyung
> -----Original Message-----
> From: Namhyung Kim <namhyung@kernel.org>
> Sent: Wednesday, April 24, 2024 11:50 AM
> To: Wang, Weilin <weilin.wang@intel.com>
> Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> <mingo@redhat.com>; Alexander Shishkin
> <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor, Perry
> <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>; Biggers,
> Caleb <caleb.biggers@intel.com>
> Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> On Wed, Apr 24, 2024 at 10:08 AM Wang, Weilin <weilin.wang@intel.com>
> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Namhyung Kim <namhyung@kernel.org>
> > > Sent: Tuesday, April 23, 2024 4:06 PM
> > > To: Wang, Weilin <weilin.wang@intel.com>
> > > Cc: Ian Rogers <irogers@google.com>; Arnaldo Carvalho de Melo
> > > <acme@kernel.org>; Peter Zijlstra <peterz@infradead.org>; Ingo Molnar
> > > <mingo@redhat.com>; Alexander Shishkin
> > > <alexander.shishkin@linux.intel.com>; Jiri Olsa <jolsa@kernel.org>; Hunter,
> > > Adrian <adrian.hunter@intel.com>; Kan Liang <kan.liang@linux.intel.com>;
> > > linux-perf-users@vger.kernel.org; linux-kernel@vger.kernel.org; Taylor,
> Perry
> > > <perry.taylor@intel.com>; Alt, Samantha <samantha.alt@intel.com>;
> Biggers,
> > > Caleb <caleb.biggers@intel.com>
> > > Subject: Re: [RFC PATCH v6 2/5] perf stat: Fork and launch perf record
> when
> > > perf stat needs to get retire latency value for a metric.
> > >
> > > On Tue, Apr 23, 2024 at 3:16 PM Wang, Weilin <weilin.wang@intel.com>
> > > wrote:
> > > > > > > > -static int __run_perf_record(void)
> > > > > > > > +static int __run_perf_record(const char **record_argv)
> > > > > > > > {
> > > > > > > > + int i = 0;
> > > > > > > > + struct tpebs_event *e;
> > > > > > > > +
> > > > > > > > pr_debug("Prepare perf record for retire_latency\n");
> > > > > > > > +
> > > > > > > > + record_argv[i++] = "perf";
> > > > > > > > + record_argv[i++] = "record";
> > > > > > > > + record_argv[i++] = "-W";
> > > > > > > > + record_argv[i++] = "--synth=no";
> > > > > > > > +
> > > > > > > > + if (stat_config.user_requested_cpu_list) {
> > > > > > > > + record_argv[i++] = "-C";
> > > > > > > > + record_argv[i++] = stat_config.user_requested_cpu_list;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > + if (stat_config.system_wide)
> > > > > > > > + record_argv[i++] = "-a";
> > > > > > > > +
> > > > > > > > + if (!stat_config.system_wide
> > > > > && !stat_config.user_requested_cpu_list)
> > > > > > > {
> > > > > > > > + pr_err("Require -a or -C option to run sampling.\n");
> > > > > > > > + return -ECANCELED;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > > > > > > > + record_argv[i++] = "-e";
> > > > > > > > + record_argv[i++] = e->name;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > + record_argv[i++] = "-o";
> > > > > > > > + record_argv[i++] = PERF_DATA;
> > > > > > > > +
> > > > > > > > return 0;
> > > > > > > > }
> > > > > > >
> > > > > > > Still I think it's weird it has 'perf record' in perf stat (despite the
> > > > > > > 'perf stat record'). If it's only Intel thing and we don't have a plan
> > > > > > > to do the same on other arches, we can move it to the arch
> > > > > > > directory and keep the perf stat code simple.
> > > > > >
> > > > > > I'm not sure what is the proper way to solve this. And Ian mentioned
> > > > > > that put code in arch directory could potentially cause other bugs.
> > > > > > So I'm wondering if we could keep this code here for now. I could
> work
> > > > > > on it later if we found it's better to be in arch directory.
> > > > >
> > > > > Maybe somewhere in the util/ and keep the main code minimal.
> > > > > IIUC it's only for very recent (or upcoming?) Intel CPUs and we
> > > > > don't have tests (hopefully can run on other arch/CPUs).
> > > > >
> > > > > So I don't think having it here would help fixing potential bugs.
> > > >
> > > > Do you mean by creating a new file in util/ to hold this code?
> > >
> > > Yeah, maybe util/intel-tpebs.c (if it's better than arch/x86/...) ?
> > >
> > > >
> > > > Yes, this feature is for very recent Intel CPUs. It should only be triggered if
> > > > a metric uses event(s) that has the R modifier in the formula.
> > >
> > > Can we have a test with a fake metric so that we can test
> > > the code on non-(or old-)Intel machines?
> >
> > All the existing metrics in non-(or old-)Intel CPUs should work as usual. So I
> think
> > existing metric tests should work for it.
> >
> > If we want to add a fake metric uses the :R modifier for those platforms, the
> metric
> > should either fail (if the fake metric uses an event not exist on the test
> platform) or
> > return all 0 retire latency data.
> >
> > So, I'm not quite sure what we want the fake metric to test for. Maybe we
> could
> > continue using existing metric tests to ensure all the defined metrics work
> correctly
> > in each machine under test?
>
> I think it's ok to return all 0 retire latency for fake tPEBS metrics.
> It's just to verify the background record + report logic works ok.
Ok, I will work on this. We need a fake metric that uses events exist on all the
platforms.
Thanks,
Weilin
>
> Thanks,
> Namhyung
© 2016 - 2026 Red Hat, Inc.