If an event file exists in sysfs, check whether an <event>.cpus file
also exists and, if it does, read a perf_cpu_map from it. This allows
particular events to have a different set of CPUs from the PMU as a whole.
One scenario where this could be useful is when a PMU is set up with a
cpumask/events per SMT thread but some events record for all SMT
threads. Programming an event on each SMT thread will cause
unnecessary counters to be programmed and the aggregate value to be
too large.
Another scenario where this could be useful is when a PMU has
historically had a cpumask at the package level, but newer per-die,
per-core or per-CPU information is now available.
Additional context for the motivation is in these patches and
conversation:
https://lore.kernel.org/lkml/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/
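
As a rough sketch of the intended tool-side behaviour (perf_cpu_map__read()
is the existing libperf helper used by the patch below; the event_cpus()
wrapper and its fallback logic are illustrative only, not part of this change):

#include <stdio.h>
#include <perf/cpumap.h>

/*
 * Illustrative only: prefer an <event>.cpus file, otherwise fall back to
 * the PMU-wide cpumask.
 */
static struct perf_cpu_map *event_cpus(const char *pmu, const char *event,
				       struct perf_cpu_map *pmu_cpus)
{
	char path[4096];
	struct perf_cpu_map *cpus;
	FILE *file;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/%s/events/%s.cpus", pmu, event);
	file = fopen(path, "r");
	if (!file)
		return perf_cpu_map__get(pmu_cpus);	/* Common case: no override. */

	cpus = perf_cpu_map__read(file);		/* Parses "0,2,4,6" or "0-7". */
	fclose(file);
	return cpus ?: perf_cpu_map__get(pmu_cpus);
}
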
Signed-off-by: Ian Rogers <irogers@google.com>
---
.../sysfs-bus-event_source-devices-events | 14 ++++++
tools/perf/util/parse-events.c | 45 ++++++++++---------
tools/perf/util/pmu.c | 44 +++++++++++++++++-
tools/perf/util/pmu.h | 1 +
4 files changed, 82 insertions(+), 22 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
index e7efeab2ee83..d8e3a4dd3ba7 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
@@ -70,6 +70,20 @@ Description: Per-pmu performance monitoring events specific to the running syste
This is referred to as "event parameterization". Event
parameters have the format 'param=?'.
+What: /sys/bus/event_source/devices/<pmu>/events/<event>.cpus
+Date: 2024/07/17
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description: Perf event CPUs
+
+		A list of CPUs on which the perf event should be
+		opened by default. This is an event-specific variant
+		of /sys/bus/event_source/devices/<pmu>/cpumask.
+
+ Examples (each of these lines would be in a separate file):
+
+ 0,2,4,6
+ 0-7
+
What: /sys/bus/event_source/devices/<pmu>/events/<event>.unit
Date: 2014/02/24
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 43501eb56336..b181f83c9678 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1425,12 +1425,13 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
bool auto_merge_stats)
{
struct perf_event_attr attr;
- struct perf_pmu_info info;
+ struct perf_pmu_info info = {};
struct evsel *evsel;
struct parse_events_error *err = parse_state->error;
LIST_HEAD(config_terms);
struct parse_events_terms parsed_terms;
bool alias_rewrote_terms = false;
+ int ret = 0;
if (verbose > 1) {
struct strbuf sb;
@@ -1465,8 +1466,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
parse_events_terms__init(&parsed_terms);
if (const_parsed_terms) {
- int ret = parse_events_terms__copy(const_parsed_terms, &parsed_terms);
-
+ ret = parse_events_terms__copy(const_parsed_terms, &parsed_terms);
if (ret)
return ret;
}
@@ -1474,15 +1474,15 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
/* Configure attr/terms with a known PMU, this will set hardcoded terms. */
if (config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) {
- parse_events_terms__exit(&parsed_terms);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_err;
}
/* Look for event names in the terms and rewrite into format based terms. */
if (!parse_state->fake_pmu && perf_pmu__check_alias(pmu, &parsed_terms,
&info, &alias_rewrote_terms, err)) {
- parse_events_terms__exit(&parsed_terms);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_err;
}
if (verbose > 1) {
@@ -1497,13 +1497,13 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
/* Configure attr/terms again if an alias was expanded. */
if (alias_rewrote_terms &&
config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) {
- parse_events_terms__exit(&parsed_terms);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_err;
}
if (get_config_terms(&parsed_terms, &config_terms)) {
- parse_events_terms__exit(&parsed_terms);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_err;
}
/*
@@ -1512,24 +1512,23 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
*/
if (pmu->perf_event_attr_init_default &&
get_config_chgs(pmu, &parsed_terms, &config_terms)) {
- parse_events_terms__exit(&parsed_terms);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_err;
}
if (!parse_state->fake_pmu &&
perf_pmu__config(pmu, &attr, &parsed_terms, parse_state->error)) {
- free_config_terms(&config_terms);
- parse_events_terms__exit(&parsed_terms);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_err;
}
evsel = __add_event(list, &parse_state->idx, &attr, /*init_attr=*/true,
get_config_name(&parsed_terms),
get_config_metric_id(&parsed_terms), pmu,
- &config_terms, auto_merge_stats, /*cpu_list=*/NULL);
+ &config_terms, auto_merge_stats, info.cpus);
if (!evsel) {
- parse_events_terms__exit(&parsed_terms);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_err;
}
if (evsel->name)
@@ -1542,13 +1541,17 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
return 0;
}
- parse_events_terms__exit(&parsed_terms);
free((char *)evsel->unit);
evsel->unit = strdup(info.unit);
evsel->scale = info.scale;
evsel->per_pkg = info.per_pkg;
evsel->snapshot = info.snapshot;
- return 0;
+out_err:
+ parse_events_terms__exit(&parsed_terms);
+ if (ret)
+ free_config_terms(&config_terms);
+ perf_cpu_map__put(info.cpus);
+ return ret;
}
int parse_events_multi_pmu_add(struct parse_events_state *parse_state,
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 5148b6639dd3..280b2499c861 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -73,6 +73,11 @@ struct perf_pmu_alias {
* differ from the PMU name as it won't have suffixes.
*/
char *pmu_name;
+ /**
+ * @cpus: A possible per-event cpumap that overrides that given for the
+ * PMU.
+ */
+ struct perf_cpu_map *cpus;
/** @unit: Units for the event, such as bytes or cache lines. */
char unit[UNIT_MAX_LEN+1];
/** @scale: Value to scale read counter values by. */
@@ -332,6 +337,32 @@ static int perf_pmu__parse_scale(struct perf_pmu *pmu, struct perf_pmu_alias *al
return ret;
}
+static void perf_pmu__parse_cpus(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
+{
+ char path[PATH_MAX];
+ size_t len;
+ FILE *file;
+ int fd;
+
+ len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path));
+ if (!len)
+ return;
+ scnprintf(path + len, sizeof(path) - len, "%s/events/%s.cpus", pmu->name, alias->name);
+
+ fd = open(path, O_RDONLY);
+ if (fd == -1)
+ return; /* Expected common case. */
+
+ file = fdopen(fd, "r");
+ if (!file) {
+ close(fd);
+ return;
+ }
+
+ alias->cpus = perf_cpu_map__read(file);
+ fclose(file);
+}
+
static int perf_pmu__parse_unit(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
{
char path[PATH_MAX];
@@ -493,6 +524,7 @@ static void read_alias_info(struct perf_pmu *pmu, struct perf_pmu_alias *alias)
/*
* load unit name and scale if available
*/
+ perf_pmu__parse_cpus(pmu, alias);
perf_pmu__parse_unit(pmu, alias);
perf_pmu__parse_scale(pmu, alias);
perf_pmu__parse_per_pkg(pmu, alias);
@@ -618,7 +650,7 @@ static inline bool pmu_alias_info_file(const char *name)
size_t len;
len = strlen(name);
- if (len > 5 && !strcmp(name + len - 5, ".unit"))
+ if (len > 5 && (!strcmp(name + len - 5, ".cpus") || !strcmp(name + len - 5, ".unit")))
return true;
if (len > 6 && !strcmp(name + len - 6, ".scale"))
return true;
@@ -1560,6 +1592,12 @@ static int check_info_data(struct perf_pmu *pmu,
* define unit, scale and snapshot, fail
* if there's more than one.
*/
+ if (!perf_cpu_map__is_empty(info->cpus) && !perf_cpu_map__is_empty(alias->cpus)) {
+ parse_events_error__handle(err, column,
+ strdup("Attempt to set event's cpus twice"),
+ NULL);
+ return -EINVAL;
+ }
if (info->unit && alias->unit[0]) {
parse_events_error__handle(err, column,
strdup("Attempt to set event's unit twice"),
@@ -1579,6 +1617,9 @@ static int check_info_data(struct perf_pmu *pmu,
return -EINVAL;
}
+ if (!perf_cpu_map__is_empty(alias->cpus))
+ info->cpus = perf_cpu_map__get(alias->cpus);
+
if (alias->unit[0])
info->unit = alias->unit;
@@ -1610,6 +1651,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_
* Mark unit and scale as not set
* (different from default values, see below)
*/
+ info->cpus = NULL;
info->unit = NULL;
info->scale = 0.0;
info->snapshot = false;
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index b2d3fd291f02..b1ccfe8d3df4 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -177,6 +177,7 @@ struct perf_pmu {
extern struct perf_pmu perf_pmu__fake;
struct perf_pmu_info {
+ struct perf_cpu_map *cpus;
const char *unit;
double scale;
bool per_pkg;
--
2.45.2.1089.g2a221341d9-goog
On 2024-07-17 8:30 p.m., Ian Rogers wrote:
> If an event file exists in sysfs, check if a event.cpus file exists
> and read a perf_cpu_map from it if it does. This allows particular
> events to have a different set of CPUs compared to the PMU.
>
> One scenario where this could be useful is when a PMU is set up with a
> cpumask/events per SMT thread but some events record for all SMT
> threads. Programming an event on each SMT thread will cause
> unnecessary counters to be programmed and the aggregate value to be
> too large.
If I understand the scenario correctly, I think the issue should have
been addressed since ICX, with the introduction of the "SMT-aware
events". Is there a specific event which still has the issue on newer
platforms?
>
> Another scenario where this could be useful if when a PMU has
> historically had a cpumask at the package level, but now newer per
> die, core or CPU information is available.
The traditional way to support new counters with a different scope is to
add a new PMU.
>
> Additional context for the motivation is in these patches and
> conversation:
> https://lore.kernel.org/lkml/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/
I don't see the advantages of the new event.cpus way.
The aggregation should be the same.
The RAPL counters are free-running counters, so there is no difference
whether or not they are grouped.
But it makes the kernel driver more complex, since it has to maintain at
least two different cpumasks.
The tool becomes more complex too, since it has to take care of the
per-event CPU override case. The json files must also be updated to add a
new cpumask field.
Thanks,
Kan
On Thu, Jul 18, 2024 at 7:33 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> If I understand the scenario correctly, I think the issue should have
> been addressed since ICX, with the introduction of the "SMT-aware
> events". Is there a specific event which still has the issue on newer
> platforms?

Nothing comes to mind, but it is there on popular machines like Skylake.

> The traditional way to support new counters with a different scope is to
> add a new PMU.
>
> I don't see the advantages of the new event.cpus way.
> The aggregation should be the same.

Agreed. My concern is that we may end up with a pattern of
<pmu>_per_pkg, <pmu>_per_core, <pmu>_per_cpu, <pmu>_per_l3, etc. just
for the sake of the cpumask.

> The RAPL counters are free-running counters, so there is no difference
> whether or not they are grouped.

Should the RAPL counters return true for perf_pmu__is_software? In the
tool we currently return false and won't allow grouping these events
with other hardware events. The intent in perf_pmu__is_software was to
emulate the way the kernel allows/disallows groups - software context
events can be in a hardware context but otherwise I don't believe the
grouping is allowed.

> But it makes the kernel driver more complex, since it has to maintain at
> least two different cpumasks.

Two drivers each maintaining a cpumask or one driver maintaining two
cpumasks - it seems like there is a chance for code reuse in both
cases. I'm not seeing how it adds to complexity particularly.

> The tool becomes more complex too, since it has to take care of the
> per-event CPU override case.

I'm not sure I agree with this. What we need for perf_event_open is a
perf_event_attr; we dress these up as evsels, which also have the
ability to be for >1 CPU or thread. Longer term I think evsels can
also have >1 PMU, for the wildcard cases like uncore memory
controllers - this would remove the need for re-sorting evsels except
for topdown events, which have thrown us a giant bundle of challenges.
Anyway, an evsel is perf_event_attr information paired with CPU
information, so it makes sense to me that the parser should do this
pairing. What's harder in the evsel/evlist setup is trying to fix CPU
maps up outside of parsing, like with __perf_evlist__propagate_maps,
where the parsing has to leave crumbs around (like system_wide,
has_user_cpus, is_pmu_core) so the map propagation works properly.

> The json files must also be updated to add a new cpumask field.

Yeah, I don't like this as it means we end up putting CPU information
into the json that isn't the same for every CPU variant of the same
architecture model. Maybe we can have some kind of "scope" enum value
in the json, and then when the scope differs from the PMU's - core
scope vs the PMU's hyperthread scope - the tool can figure out the
cpumask from the topology in sysfs. Maybe we should just always use
the topology and get rid of cpumask files in sysfs, replacing them
with "scope" files. Will Deacon pushed back on having ARM PMUs
supporting hot plugging
(https://lore.kernel.org/lkml/20240701142222.GA2691@willie-the-truck/)
where the main thing the hot plugging handler needs to maintain is the
cpumask.

Thanks,
Ian
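
To make the topology-derived cpumask idea above concrete, here is a minimal
sketch that derives a "one CPU per core" mask from the existing
/sys/devices/system/cpu/cpuN/topology/core_cpus_list files; the per-PMU
"scope" file itself remains hypothetical:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int first_cpu_of_core(long cpu)
{
	char path[256], buf[256];
	FILE *f;
	int first = (int)cpu;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%ld/topology/core_cpus_list", cpu);
	f = fopen(path, "r");
	if (!f)
		return (int)cpu;	/* No topology info: treat the CPU as its own core. */
	if (fgets(buf, sizeof(buf), f))
		first = atoi(buf);	/* sysfs cpulists start with the lowest CPU of the core. */
	fclose(f);
	return first;
}

int main(void)
{
	long ncpus = sysconf(_SC_NPROCESSORS_CONF);

	/* Print one representative CPU per core, i.e. a core-scope cpumask. */
	for (long cpu = 0; cpu < ncpus; cpu++) {
		if (first_cpu_of_core(cpu) == (int)cpu)
			printf("%ld ", cpu);
	}
	printf("\n");
	return 0;
}
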
On 2024-07-18 11:39 a.m., Ian Rogers wrote:
> Agreed. My concern is that we may end up with a pattern of
> <pmu>_per_pkg, <pmu>_per_core, <pmu>_per_cpu, <pmu>_per_l3, etc. just
> for the sake of the cpumask.

The cstate PMUs already do so, e.g., cstate_core, cstate_pkg.

From another perspective, it discloses the scope information in the PMU
name. If a user only cares about the information of a package, they just
need to focus on everything under <pmu>_pkg. Otherwise, they have to go
through all the events.

> Should the RAPL counters return true for perf_pmu__is_software? In the
> tool we currently return false and won't allow grouping these events
> with other hardware events. The intent in perf_pmu__is_software was to
> emulate the way the kernel allows/disallows groups - software context
> events can be in a hardware context but otherwise I don't believe the
> grouping is allowed.

No, it's not allowed for the RAPL counters.

If the motivation is to find another way to group counters with
different scope, it may not work.

We once tried to mix the perf_invalid_context PMUs in a group. But it
was denied:
https://yhbt.net/lore/all/20150415172856.GA5029@twins.programming.kicks-ass.net/

The event.cpus approach still faces the same issues.

> Two drivers each maintaining a cpumask or one driver maintaining two
> cpumasks - it seems like there is a chance for code reuse in both
> cases. I'm not seeing how it adds to complexity particularly.

Yes, there are some cleanup opportunities for the cpumask/hotplug code.
We may even abstract some generic interfaces for pkg- or core-level PMUs.
Eventually, the complexity/duplication should be able to be reduced.

> Will Deacon pushed back on having ARM PMUs supporting hot plugging
> (https://lore.kernel.org/lkml/20240701142222.GA2691@willie-the-truck/)
> where the main thing the hot plugging handler needs to maintain is the
> cpumask.

Not just the cpumask but also migrating the context for some PMUs, see
perf_pmu_migrate_context().

It seems we really need a shared cpumask in the generic code, so the
drivers don't need to handle the hotplug everywhere. I will think about it.

Thanks,
Kan
On Thu, Jul 18, 2024 at 10:48 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> We once tried to mix the perf_invalid_context PMUs in a group. But it
> was denied:
> https://yhbt.net/lore/all/20150415172856.GA5029@twins.programming.kicks-ass.net/
>
> The event.cpus approach still faces the same issues.

Why so? The events would share the same perf_event_context, so I thought
the perf_event_open would succeed.

> Not just the cpumask but also migrating the context for some PMUs, see
> perf_pmu_migrate_context().

Will do, thanks.

> It seems we really need a shared cpumask in the generic code, so the
> drivers don't need to handle the hotplug everywhere. I will think about it.

Thanks. There are other problems on ARM PMUs, like having no or empty
cpumasks, which the tool takes to mean "open the event on every online
CPU" (there is no cpus or cpumask file on core PMUs historically, so we
adopt this behavior when the cpumask is empty or not present). I've
been thinking to expand tools/perf/tests/pmu.c with basic PMU sanity
tests. Some ideas:

1) some kind of cpumask sanity check - we could open an event with the
   cpumask and see if it yields multiplexing, which would highlight the
   ARM no-cpumask bug
2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
   files parse correctly and have a corresponding event
3) keep opening events on the PMU in a group to make sure that
   when counters are exhausted the perf_event_open fails (I've seen this
   bug on AMD)
4) are the values in the type file unique

Thanks,
Ian
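
A rough, standalone sketch of test idea 4 above (not written against the
perf test framework; the scan logic below is illustrative only):

#include <dirent.h>
#include <limits.h>
#include <stdio.h>

#define MAX_PMUS 1024

int main(void)
{
	const char *base = "/sys/bus/event_source/devices";
	unsigned int types[MAX_PMUS];
	size_t n = 0;
	struct dirent *ent;
	DIR *dir = opendir(base);

	if (!dir)
		return 1;
	while ((ent = readdir(dir)) != NULL && n < MAX_PMUS) {
		char path[PATH_MAX];
		unsigned int type;
		FILE *f;

		if (ent->d_name[0] == '.')
			continue;
		snprintf(path, sizeof(path), "%s/%s/type", base, ent->d_name);
		f = fopen(path, "r");
		if (!f)
			continue;
		if (fscanf(f, "%u", &type) == 1) {
			/* Compare against every previously seen PMU type value. */
			for (size_t i = 0; i < n; i++) {
				if (types[i] == type) {
					fprintf(stderr, "duplicate type %u (%s)\n",
						type, ent->d_name);
					fclose(f);
					closedir(dir);
					return 1;
				}
			}
			types[n++] = type;
		}
		fclose(f);
	}
	closedir(dir);
	printf("all PMU type values are unique\n");
	return 0;
}
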
On 2024-07-18 4:50 p.m., Ian Rogers wrote:
> Why so? The events would share the same perf_event_context, so I thought
> the perf_event_open would succeed.

I think it breaks what groups are as well. At least, you cannot
guarantee that two events can be co-scheduled on the same CPU. Even
worse, there could be no events on some CPU.

The first thing that pops up is the sample read feature. On CPU_A,
event_A is the leader event, but on CPU_B, event_B could be the leader
event if event_A's cpumask doesn't include CPU_B. There could be more
similar features we would have to handle specially.

> Thanks. There are other problems on ARM PMUs, like having no or empty
> cpumasks, which the tool takes to mean "open the event on every online
> CPU" (there is no cpus or cpumask file on core PMUs historically, so we
> adopt this behavior when the cpumask is empty or not present).

No cpus/cpumask file and an empty cpumask should be different.

No cpus/cpumask file means that the counters/events are available on
all CPUs. But an empty cpus/cpumask file means that there are currently
no suitable CPUs. That may happen on a hybrid machine when all e-core
CPUs are hot-removed. Since it's possible that the CPUs can be
hot-added later, the kernel driver doesn't unregister the cpu_atom PMU.

> I've been thinking to expand tools/perf/tests/pmu.c with basic PMU
> sanity tests. Some ideas:

Thanks.

> 1) some kind of cpumask sanity check - we could open an event with the
>    cpumask and see if it yields multiplexing, which would highlight the
>    ARM no-cpumask bug

Multiplexing is triggered when there are not enough counters. It
should not be related to the cpumask.

For a PMU without a cpumask, I think the test case should try to open an
event on all CPUs and check that the open succeeds.
For a PMU with a cpumask, the test case should try to open an event on
the masked CPUs and check that the open succeeds.

The behavior of opening an event on unmasked CPUs seems undefined. For
uncore, it's OK - the kernel treats the CPUs from the same socket
exactly the same. But I'm not sure about the other PMUs.

> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
>    files parse correctly and have a corresponding event
> 3) keep opening events on the PMU in a group to make sure that
>    when counters are exhausted the perf_event_open fails (I've seen this
>    bug on AMD)
> 4) are the values in the type file unique

The rest sounds good to me.

Thanks,
Kan
On Fri, Jul 19, 2024 at 6:56 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> I think it breaks what groups are as well. At least, you cannot
> guarantee that two events can be co-scheduled on the same CPU. Even
> worse, there could be no events on some CPU.
>
> The first thing that pops up is the sample read feature. On CPU_A,
> event_A is the leader event, but on CPU_B, event_B could be the leader
> event if event_A's cpumask doesn't include CPU_B. There could be more
> similar features we would have to handle specially.

Thanks Kan. I'm not wondering about a case of 2 CPUs. Say on CPU0, and
solely its perf event context, I want to know its core power and
package power as a group, so I never record one without the other. That
grouping wouldn't be possible with 2 PMUs.

> Multiplexing is triggered when there are not enough counters. It
> should not be related to the cpumask.

Agreed. Here I'm thinking about the bugs I see in PMUs. One of them is
to always succeed in opening siblings within a group and for the
unavailable counters to just report "not counted". This breaks weak
groups used by metrics.

> For a PMU without a cpumask, I think the test case should try to open an
> event on all CPUs and check that the open succeeds.
> For a PMU with a cpumask, the test case should try to open an event on
> the masked CPUs and check that the open succeeds.

I agree that no cpumask means all CPUs. The bug I see on ARM PMUs is
that they have uncore PMUs with no or empty cpumasks, leading to the
uncore events being programmed on every CPU and over counting,
multiplexing counters and so on. I'm trying to devise tests so that
they can see they are broken.

> The behavior of opening an event on unmasked CPUs seems undefined. For
> uncore, it's OK - the kernel treats the CPUs from the same socket
> exactly the same. But I'm not sure about the other PMUs.

My understanding had been that for core PMUs a "perf stat -C" option
would choose the particular CPU to count the event on, and for an
uncore PMU the -C option would override the cpumask's "default" value.
We have code to validate this:
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522
But it seems now that overriding an uncore PMU's default CPU is
ignored. If you did:

  $ perf stat -C 1 -e data_read -a sleep 0.1

then the tool thinks data_read is on CPU1 and will set its thread
affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
throw away the -C option.

> The rest sounds good to me.

Cool. Let me know if you can think of more.

Thanks,
Ian
On 2024-07-19 10:59 a.m., Ian Rogers wrote:
> Thanks Kan. I'm not wondering about a case of 2 CPUs. Say on CPU0, and
> solely its perf event context, I want to know its core power and
> package power as a group, so I never record one without the other. That
> grouping wouldn't be possible with 2 PMUs.

For power, to be honest, I don't think it improves anything. It gives
users a false image that perf can group these counters. But the truth
is that perf cannot.

The power counters are all free-running counters. It's impossible to
co-schedule them (which requires a global mechanism to disable/enable
all counters, e.g., GLOBAL_CTRL for the core PMU). The kernel still has
to read the counters one by one while the counters keep running. There
is no difference with or without a group for the power events.

> My understanding had been that for core PMUs a "perf stat -C" option
> would choose the particular CPU to count the event on, and for an
> uncore PMU the -C option would override the cpumask's "default" value.
> But it seems now that overriding an uncore PMU's default CPU is
> ignored.

For the uncore driver, no matter what -C sets, it writes the default
CPU back.
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760

> If you did:
> $ perf stat -C 1 -e data_read -a sleep 0.1
> Then the tool thinks data_read is on CPU1 and will set its thread
> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just
> throw away the -C option.

The perf tool can still read the counter from CPU1 with no IPIs
because of the PMU_EV_CAP_READ_ACTIVE_PKG().

Not quite sure, but it seems only the open and close may be impacted and
silently changed to CPU0.

Thanks,
Kan

>
>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot)
>>> files parse correctly and have a corresponding event.
>>> 3) keep adding opening events on the PMU to a group to make sure that
>>> when counters are exhausted the perf_event_open fails (I've seen this
>>> bug on AMD)
>>> 4) are the values in the type file unique
>>>
>>
>> The rest sounds good to me.
>
> Cool. Let me know if you can think of more.
>
> Thanks,
> Ian
>
>> Thanks,
>> Kan
>
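The distinction Kan draws earlier in this message (no cpus/cpumask file versus a file that is present but empty) is easy to collapse if a reader simply falls back to "all online CPUs". Below is a minimal sketch of keeping the two cases apart when reading sysfs directly; the classify_pmu_cpus() and open_pmu_cpus_file() helpers, the fallback policy and the "cpu_atom" example are illustrative and are not the perf tool's actual code.

/*
 * Sketch: distinguish "no cpus/cpumask file" (event is valid on every
 * online CPU) from "file exists but is empty" (no usable CPUs right now,
 * e.g. all e-cores hot-removed).  Helper names are illustrative.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

enum pmu_cpus { PMU_CPUS_ALL_ONLINE, PMU_CPUS_NONE, PMU_CPUS_LIST };

static FILE *open_pmu_cpus_file(const char *pmu)
{
        static const char *names[] = { "cpus", "cpumask" };
        char path[256];
        FILE *f;

        for (size_t i = 0; i < 2; i++) {
                snprintf(path, sizeof(path),
                         "/sys/bus/event_source/devices/%s/%s", pmu, names[i]);
                f = fopen(path, "r");
                if (f || errno != ENOENT)
                        return f;
        }
        return NULL;    /* errno is ENOENT: neither file exists */
}

static enum pmu_cpus classify_pmu_cpus(const char *pmu, char *list, size_t len)
{
        FILE *f = open_pmu_cpus_file(pmu);

        if (!f && errno == ENOENT)
                return PMU_CPUS_ALL_ONLINE;     /* no file: all online CPUs */
        if (!f)
                return PMU_CPUS_NONE;           /* unreadable: treat as none */
        if (!fgets(list, len, f) || list[strspn(list, " \t\n")] == '\0') {
                fclose(f);
                return PMU_CPUS_NONE;           /* empty file: no usable CPUs */
        }
        fclose(f);
        return PMU_CPUS_LIST;                   /* caller parses, e.g. "0,48" */
}

int main(void)
{
        char list[256];

        /* "cpu_atom" is just an example PMU name from the discussion above. */
        switch (classify_pmu_cpus("cpu_atom", list, sizeof(list))) {
        case PMU_CPUS_ALL_ONLINE:
                puts("no cpus/cpumask file: event is valid on all online CPUs");
                break;
        case PMU_CPUS_NONE:
                puts("empty (or unreadable) cpus/cpumask: no usable CPUs now");
                break;
        case PMU_CPUS_LIST:
                printf("cpus/cpumask: %s", list);
                break;
        }
        return 0;
}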
On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote: > On 2024-07-19 10:59 a.m., Ian Rogers wrote: > > Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and > > solely its perf event context, I want to know its core power and > > package power as a group so I never record one without the other. That > > grouping wouldn't be possible with 2 PMUs. > > For power, to be honest, I don't think it improves anything. It gives > users a false image that perf can group these counters. > But the truth is that perf cannot. The power counters are all > free-running counters. It's impossible to co-schedule them (which > requires a global mechanism to disable/enable all counters, e.g., > GLOBAL_CTRL for core PMU). The kernel still has to read the counters one > by one while the counters keep running. There are no differences with or > without a group for the power events. Ok, so power should copy cstate with _core, _pkg, etc. I agree the difference is small and I like the idea of being consistent. Do we want to add "event.cpus" support to the tool anyway for potential future uses? This would at least avoid problems with newer kernels and older perf tools were we to find a good use for it. > > My understanding had been that for core PMUs a "perf stat -C" option > > would choose the particular CPU to count the event on, for an uncore > > PMU the -C option would override the cpumask's "default" value. We > > have code to validate this: > > https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522 > > But it seems now that overriding an uncore PMU's default CPU is > > ignored. > > For the uncore driver, no matter what -C set, it writes the default CPU > back. > https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760 > > > If you did: > > $ perf stat -C 1 -e data_read -a sleep 0.1 > > Then the tool thinks data_read is on CPU1 and will set its thread > > affinity to CPU1 to avoid IPIs. It seems to fix this we need to just > > throw away the -C option. > The perf tool can still read the the counter from CPU1 and no IPIs > because of the PMU_EV_CAP_READ_ACTIVE_PKG(). > > Not quite sure, but it seems only the open and close may be impacted and > silently changed to CPU0. There's also enable/disable. Andi did the work and there were some notable gains but likely more on core events. Ultimately it'd be nice to be opening, closing.. everything in parallel given the calls are slow and the work is embarrassingly parallel. It feels like the cpumasks for uncore could still do with some cleanup wrt -C I'm just unsure at the moment what this should be. Tbh, I'm tempted to rewrite evlist propagate maps as someone may look at it and think I believe in what it is doing. The parallel stuff we should grab Riccardo's past work. Thanks, Ian > Thanks, > Kan > > > >>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot) > >>> files parse correctly and have a corresponding event. > >>> 3) keep adding opening events on the PMU to a group to make sure that > >>> when counters are exhausted the perf_event_open fails (I've seen this > >>> bug on AMD) > >>> 4) are the values in the type file unique > >>> > >> > >> The rest sounds good to me. > > > > Cool. Let me know if you can think of more. > > > > Thanks, > > Ian > > > >> Thanks, > >> Kan > >
Hello, Ian, Kan, On 7/20/2024 3:32 AM, Ian Rogers wrote: > On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >> On 2024-07-19 10:59 a.m., Ian Rogers wrote: >>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and >>> solely its perf event context, I want to know its core power and >>> package power as a group so I never record one without the other. That >>> grouping wouldn't be possible with 2 PMUs. >> >> For power, to be honest, I don't think it improves anything. It gives >> users a false image that perf can group these counters. >> But the truth is that perf cannot. The power counters are all >> free-running counters. It's impossible to co-schedule them (which >> requires a global mechanism to disable/enable all counters, e.g., >> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one >> by one while the counters keep running. There are no differences with or >> without a group for the power events. > > Ok, so power should copy cstate with _core, _pkg, etc. I agree the > difference is small and I like the idea of being consistent. So, it seems we want to follow the new PMU addition approach for RAPL being consistent with Intel cstate driver, should I revive my "power_per_core" PMU thread now? Thanks, Dhananjay Do we > want to add "event.cpus" support to the tool anyway for potential > future uses? This would at least avoid problems with newer kernels and > older perf tools were we to find a good use for it. > >>> My understanding had been that for core PMUs a "perf stat -C" option >>> would choose the particular CPU to count the event on, for an uncore >>> PMU the -C option would override the cpumask's "default" value. We >>> have code to validate this: >>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522 >>> But it seems now that overriding an uncore PMU's default CPU is >>> ignored. >> >> For the uncore driver, no matter what -C set, it writes the default CPU >> back. >> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760 >> >>> If you did: >>> $ perf stat -C 1 -e data_read -a sleep 0.1 >>> Then the tool thinks data_read is on CPU1 and will set its thread >>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just >>> throw away the -C option. >> The perf tool can still read the the counter from CPU1 and no IPIs >> because of the PMU_EV_CAP_READ_ACTIVE_PKG(). >> >> Not quite sure, but it seems only the open and close may be impacted and >> silently changed to CPU0. > > There's also enable/disable. Andi did the work and there were some > notable gains but likely more on core events. Ultimately it'd be nice > to be opening, closing.. everything in parallel given the calls are > slow and the work is embarrassingly parallel. > It feels like the cpumasks for uncore could still do with some cleanup > wrt -C I'm just unsure at the moment what this should be. Tbh, I'm > tempted to rewrite evlist propagate maps as someone may look at it and > think I believe in what it is doing. The parallel stuff we should grab > Riccardo's past work. > > Thanks, > Ian > > >> Thanks, >> Kan >>> >>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot) >>>>> files parse correctly and have a corresponding event. 
>>>>> 3) keep adding opening events on the PMU to a group to make sure that >>>>> when counters are exhausted the perf_event_open fails (I've seen this >>>>> bug on AMD) >>>>> 4) are the values in the type file unique >>>>> >>>> >>>> The rest sounds good to me. >>> >>> Cool. Let me know if you can think of more. >>> >>> Thanks, >>> Ian >>> >>>> Thanks, >>>> Kan >>>
On 7/26/2024 12:36 PM, Dhananjay Ugwekar wrote: > Hello, Ian, Kan, > > On 7/20/2024 3:32 AM, Ian Rogers wrote: >> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >>> On 2024-07-19 10:59 a.m., Ian Rogers wrote: >>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and >>>> solely its perf event context, I want to know its core power and >>>> package power as a group so I never record one without the other. That >>>> grouping wouldn't be possible with 2 PMUs. >>> >>> For power, to be honest, I don't think it improves anything. It gives >>> users a false image that perf can group these counters. >>> But the truth is that perf cannot. The power counters are all >>> free-running counters. It's impossible to co-schedule them (which >>> requires a global mechanism to disable/enable all counters, e.g., >>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one >>> by one while the counters keep running. There are no differences with or >>> without a group for the power events. >> >> Ok, so power should copy cstate with _core, _pkg, etc. I agree the >> difference is small and I like the idea of being consistent. > > So, it seems we want to follow the new PMU addition approach for RAPL > being consistent with Intel cstate driver, should I revive my "power_per_core" > PMU thread now? The power_per_core PMU thread link for reference, https://lore.kernel.org/all/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/ > > Thanks, > Dhananjay > > Do we >> want to add "event.cpus" support to the tool anyway for potential >> future uses? This would at least avoid problems with newer kernels and >> older perf tools were we to find a good use for it. >> >>>> My understanding had been that for core PMUs a "perf stat -C" option >>>> would choose the particular CPU to count the event on, for an uncore >>>> PMU the -C option would override the cpumask's "default" value. We >>>> have code to validate this: >>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522 >>>> But it seems now that overriding an uncore PMU's default CPU is >>>> ignored. >>> >>> For the uncore driver, no matter what -C set, it writes the default CPU >>> back. >>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760 >>> >>>> If you did: >>>> $ perf stat -C 1 -e data_read -a sleep 0.1 >>>> Then the tool thinks data_read is on CPU1 and will set its thread >>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just >>>> throw away the -C option. >>> The perf tool can still read the the counter from CPU1 and no IPIs >>> because of the PMU_EV_CAP_READ_ACTIVE_PKG(). >>> >>> Not quite sure, but it seems only the open and close may be impacted and >>> silently changed to CPU0. >> >> There's also enable/disable. Andi did the work and there were some >> notable gains but likely more on core events. Ultimately it'd be nice >> to be opening, closing.. everything in parallel given the calls are >> slow and the work is embarrassingly parallel. >> It feels like the cpumasks for uncore could still do with some cleanup >> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm >> tempted to rewrite evlist propagate maps as someone may look at it and >> think I believe in what it is doing. The parallel stuff we should grab >> Riccardo's past work. 
>> >> Thanks, >> Ian >> >> >>> Thanks, >>> Kan >>>> >>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot) >>>>>> files parse correctly and have a corresponding event. >>>>>> 3) keep adding opening events on the PMU to a group to make sure that >>>>>> when counters are exhausted the perf_event_open fails (I've seen this >>>>>> bug on AMD) >>>>>> 4) are the values in the type file unique >>>>>> >>>>> >>>>> The rest sounds good to me. >>>> >>>> Cool. Let me know if you can think of more. >>>> >>>> Thanks, >>>> Ian >>>> >>>>> Thanks, >>>>> Kan >>>>
On Fri, Jul 26, 2024 at 12:10 AM Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com> wrote: > > > > On 7/26/2024 12:36 PM, Dhananjay Ugwekar wrote: > > Hello, Ian, Kan, > > > > On 7/20/2024 3:32 AM, Ian Rogers wrote: > >> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote: > >>> On 2024-07-19 10:59 a.m., Ian Rogers wrote: > >>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and > >>>> solely its perf event context, I want to know its core power and > >>>> package power as a group so I never record one without the other. That > >>>> grouping wouldn't be possible with 2 PMUs. > >>> > >>> For power, to be honest, I don't think it improves anything. It gives > >>> users a false image that perf can group these counters. > >>> But the truth is that perf cannot. The power counters are all > >>> free-running counters. It's impossible to co-schedule them (which > >>> requires a global mechanism to disable/enable all counters, e.g., > >>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one > >>> by one while the counters keep running. There are no differences with or > >>> without a group for the power events. > >> > >> Ok, so power should copy cstate with _core, _pkg, etc. I agree the > >> difference is small and I like the idea of being consistent. > > > > So, it seems we want to follow the new PMU addition approach for RAPL > > being consistent with Intel cstate driver, should I revive my "power_per_core" > > PMU thread now? > > The power_per_core PMU thread link for reference, > > https://lore.kernel.org/all/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/ I think so. Would it be possible to follow the same naming convention as cstate, where there is cstate_pkg and cstate_core? (ie no "_per" in the name) Thanks, Ian > > > > Thanks, > > Dhananjay > > > > Do we > >> want to add "event.cpus" support to the tool anyway for potential > >> future uses? This would at least avoid problems with newer kernels and > >> older perf tools were we to find a good use for it. > >> > >>>> My understanding had been that for core PMUs a "perf stat -C" option > >>>> would choose the particular CPU to count the event on, for an uncore > >>>> PMU the -C option would override the cpumask's "default" value. We > >>>> have code to validate this: > >>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522 > >>>> But it seems now that overriding an uncore PMU's default CPU is > >>>> ignored. > >>> > >>> For the uncore driver, no matter what -C set, it writes the default CPU > >>> back. > >>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760 > >>> > >>>> If you did: > >>>> $ perf stat -C 1 -e data_read -a sleep 0.1 > >>>> Then the tool thinks data_read is on CPU1 and will set its thread > >>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just > >>>> throw away the -C option. > >>> The perf tool can still read the the counter from CPU1 and no IPIs > >>> because of the PMU_EV_CAP_READ_ACTIVE_PKG(). > >>> > >>> Not quite sure, but it seems only the open and close may be impacted and > >>> silently changed to CPU0. > >> > >> There's also enable/disable. Andi did the work and there were some > >> notable gains but likely more on core events. Ultimately it'd be nice > >> to be opening, closing.. everything in parallel given the calls are > >> slow and the work is embarrassingly parallel. 
> >> It feels like the cpumasks for uncore could still do with some cleanup > >> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm > >> tempted to rewrite evlist propagate maps as someone may look at it and > >> think I believe in what it is doing. The parallel stuff we should grab > >> Riccardo's past work. > >> > >> Thanks, > >> Ian > >> > >> > >>> Thanks, > >>> Kan > >>>> > >>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot) > >>>>>> files parse correctly and have a corresponding event. > >>>>>> 3) keep adding opening events on the PMU to a group to make sure that > >>>>>> when counters are exhausted the perf_event_open fails (I've seen this > >>>>>> bug on AMD) > >>>>>> 4) are the values in the type file unique > >>>>>> > >>>>> > >>>>> The rest sounds good to me. > >>>> > >>>> Cool. Let me know if you can think of more. > >>>> > >>>> Thanks, > >>>> Ian > >>>> > >>>>> Thanks, > >>>>> Kan > >>>> >
On 7/26/2024 1:22 PM, Ian Rogers wrote: > On Fri, Jul 26, 2024 at 12:10 AM Dhananjay Ugwekar > <Dhananjay.Ugwekar@amd.com> wrote: >> >> >> >> On 7/26/2024 12:36 PM, Dhananjay Ugwekar wrote: >>> Hello, Ian, Kan, >>> >>> On 7/20/2024 3:32 AM, Ian Rogers wrote: >>>> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >>>>> On 2024-07-19 10:59 a.m., Ian Rogers wrote: >>>>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and >>>>>> solely its perf event context, I want to know its core power and >>>>>> package power as a group so I never record one without the other. That >>>>>> grouping wouldn't be possible with 2 PMUs. >>>>> >>>>> For power, to be honest, I don't think it improves anything. It gives >>>>> users a false image that perf can group these counters. >>>>> But the truth is that perf cannot. The power counters are all >>>>> free-running counters. It's impossible to co-schedule them (which >>>>> requires a global mechanism to disable/enable all counters, e.g., >>>>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one >>>>> by one while the counters keep running. There are no differences with or >>>>> without a group for the power events. >>>> >>>> Ok, so power should copy cstate with _core, _pkg, etc. I agree the >>>> difference is small and I like the idea of being consistent. >>> >>> So, it seems we want to follow the new PMU addition approach for RAPL >>> being consistent with Intel cstate driver, should I revive my "power_per_core" >>> PMU thread now? >> >> The power_per_core PMU thread link for reference, >> >> https://lore.kernel.org/all/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/ > > I think so. Would it be possible to follow the same naming convention > as cstate, where there is cstate_pkg and cstate_core? (ie no "_per" in > the name) Makes sense, we should probably rename the original "power" PMU to "power_pkg" as well. Thanks, Dhananjay > > Thanks, > Ian > >>> >>> Thanks, >>> Dhananjay >>> >>> Do we >>>> want to add "event.cpus" support to the tool anyway for potential >>>> future uses? This would at least avoid problems with newer kernels and >>>> older perf tools were we to find a good use for it. >>>> >>>>>> My understanding had been that for core PMUs a "perf stat -C" option >>>>>> would choose the particular CPU to count the event on, for an uncore >>>>>> PMU the -C option would override the cpumask's "default" value. We >>>>>> have code to validate this: >>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522 >>>>>> But it seems now that overriding an uncore PMU's default CPU is >>>>>> ignored. >>>>> >>>>> For the uncore driver, no matter what -C set, it writes the default CPU >>>>> back. >>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760 >>>>> >>>>>> If you did: >>>>>> $ perf stat -C 1 -e data_read -a sleep 0.1 >>>>>> Then the tool thinks data_read is on CPU1 and will set its thread >>>>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just >>>>>> throw away the -C option. >>>>> The perf tool can still read the the counter from CPU1 and no IPIs >>>>> because of the PMU_EV_CAP_READ_ACTIVE_PKG(). >>>>> >>>>> Not quite sure, but it seems only the open and close may be impacted and >>>>> silently changed to CPU0. >>>> >>>> There's also enable/disable. Andi did the work and there were some >>>> notable gains but likely more on core events. 
Ultimately it'd be nice >>>> to be opening, closing.. everything in parallel given the calls are >>>> slow and the work is embarrassingly parallel. >>>> It feels like the cpumasks for uncore could still do with some cleanup >>>> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm >>>> tempted to rewrite evlist propagate maps as someone may look at it and >>>> think I believe in what it is doing. The parallel stuff we should grab >>>> Riccardo's past work. >>>> >>>> Thanks, >>>> Ian >>>> >>>> >>>>> Thanks, >>>>> Kan >>>>>> >>>>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot) >>>>>>>> files parse correctly and have a corresponding event. >>>>>>>> 3) keep adding opening events on the PMU to a group to make sure that >>>>>>>> when counters are exhausted the perf_event_open fails (I've seen this >>>>>>>> bug on AMD) >>>>>>>> 4) are the values in the type file unique >>>>>>>> >>>>>>> >>>>>>> The rest sounds good to me. >>>>>> >>>>>> Cool. Let me know if you can think of more. >>>>>> >>>>>> Thanks, >>>>>> Ian >>>>>> >>>>>>> Thanks, >>>>>>> Kan >>>>>> >>
On 2024-07-26 4:17 a.m., Dhananjay Ugwekar wrote: > > > On 7/26/2024 1:22 PM, Ian Rogers wrote: >> On Fri, Jul 26, 2024 at 12:10 AM Dhananjay Ugwekar >> <Dhananjay.Ugwekar@amd.com> wrote: >>> >>> >>> >>> On 7/26/2024 12:36 PM, Dhananjay Ugwekar wrote: >>>> Hello, Ian, Kan, >>>> >>>> On 7/20/2024 3:32 AM, Ian Rogers wrote: >>>>> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >>>>>> On 2024-07-19 10:59 a.m., Ian Rogers wrote: >>>>>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and >>>>>>> solely its perf event context, I want to know its core power and >>>>>>> package power as a group so I never record one without the other. That >>>>>>> grouping wouldn't be possible with 2 PMUs. >>>>>> >>>>>> For power, to be honest, I don't think it improves anything. It gives >>>>>> users a false image that perf can group these counters. >>>>>> But the truth is that perf cannot. The power counters are all >>>>>> free-running counters. It's impossible to co-schedule them (which >>>>>> requires a global mechanism to disable/enable all counters, e.g., >>>>>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one >>>>>> by one while the counters keep running. There are no differences with or >>>>>> without a group for the power events. >>>>> >>>>> Ok, so power should copy cstate with _core, _pkg, etc. I agree the >>>>> difference is small and I like the idea of being consistent. >>>> >>>> So, it seems we want to follow the new PMU addition approach for RAPL >>>> being consistent with Intel cstate driver, should I revive my "power_per_core" >>>> PMU thread now? >>> >>> The power_per_core PMU thread link for reference, >>> >>> https://lore.kernel.org/all/20240711102436.4432-1-Dhananjay.Ugwekar@amd.com/ >> >> I think so. Would it be possible to follow the same naming convention >> as cstate, where there is cstate_pkg and cstate_core? (ie no "_per" in >> the name) > > Makes sense, we should probably rename the original "power" PMU to "power_pkg" > as well. It may brings some compatible issue for the old platforms. There may be two ways to address it. - Add a symlink or something to link the "power" and "power_pkg". - Only when there are two or more different scopes of counters in a system, the "power_<scope>" are used. If there is only one scope of power counter, "power" is still used. The latter method is used for the Intel uncore and hybrid core drivers now. Thanks, Kan > > Thanks, > Dhananjay > >> >> Thanks, >> Ian >> >>>> >>>> Thanks, >>>> Dhananjay >>>> >>>> Do we >>>>> want to add "event.cpus" support to the tool anyway for potential >>>>> future uses? This would at least avoid problems with newer kernels and >>>>> older perf tools were we to find a good use for it. >>>>> >>>>>>> My understanding had been that for core PMUs a "perf stat -C" option >>>>>>> would choose the particular CPU to count the event on, for an uncore >>>>>>> PMU the -C option would override the cpumask's "default" value. We >>>>>>> have code to validate this: >>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522 >>>>>>> But it seems now that overriding an uncore PMU's default CPU is >>>>>>> ignored. >>>>>> >>>>>> For the uncore driver, no matter what -C set, it writes the default CPU >>>>>> back. 
>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760 >>>>>> >>>>>>> If you did: >>>>>>> $ perf stat -C 1 -e data_read -a sleep 0.1 >>>>>>> Then the tool thinks data_read is on CPU1 and will set its thread >>>>>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just >>>>>>> throw away the -C option. >>>>>> The perf tool can still read the the counter from CPU1 and no IPIs >>>>>> because of the PMU_EV_CAP_READ_ACTIVE_PKG(). >>>>>> >>>>>> Not quite sure, but it seems only the open and close may be impacted and >>>>>> silently changed to CPU0. >>>>> >>>>> There's also enable/disable. Andi did the work and there were some >>>>> notable gains but likely more on core events. Ultimately it'd be nice >>>>> to be opening, closing.. everything in parallel given the calls are >>>>> slow and the work is embarrassingly parallel. >>>>> It feels like the cpumasks for uncore could still do with some cleanup >>>>> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm >>>>> tempted to rewrite evlist propagate maps as someone may look at it and >>>>> think I believe in what it is doing. The parallel stuff we should grab >>>>> Riccardo's past work. >>>>> >>>>> Thanks, >>>>> Ian >>>>> >>>>> >>>>>> Thanks, >>>>>> Kan >>>>>>> >>>>>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot) >>>>>>>>> files parse correctly and have a corresponding event. >>>>>>>>> 3) keep adding opening events on the PMU to a group to make sure that >>>>>>>>> when counters are exhausted the perf_event_open fails (I've seen this >>>>>>>>> bug on AMD) >>>>>>>>> 4) are the values in the type file unique >>>>>>>>> >>>>>>>> >>>>>>>> The rest sounds good to me. >>>>>>> >>>>>>> Cool. Let me know if you can think of more. >>>>>>> >>>>>>> Thanks, >>>>>>> Ian >>>>>>> >>>>>>>> Thanks, >>>>>>>> Kan >>>>>>> >>> >
On 2024-07-19 6:02 p.m., Ian Rogers wrote: > On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >> On 2024-07-19 10:59 a.m., Ian Rogers wrote: >>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and >>> solely its perf event context, I want to know its core power and >>> package power as a group so I never record one without the other. That >>> grouping wouldn't be possible with 2 PMUs. >> >> For power, to be honest, I don't think it improves anything. It gives >> users a false image that perf can group these counters. >> But the truth is that perf cannot. The power counters are all >> free-running counters. It's impossible to co-schedule them (which >> requires a global mechanism to disable/enable all counters, e.g., >> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one >> by one while the counters keep running. There are no differences with or >> without a group for the power events. > > Ok, so power should copy cstate with _core, _pkg, etc. I agree the > difference is small and I like the idea of being consistent. Do we > want to add "event.cpus" support to the tool anyway for potential > future uses? The only thing I can imagine is that it may be used to disclose the event constraint information, Or even more to update/override the event constraint information (which requires kernel update.). But what I'm worried about is that it may be abused. It's very easy to confuse an event and a counter in a PMU. > This would at least avoid problems with newer kernels and > older perf tools were we to find a good use for it. > >>> My understanding had been that for core PMUs a "perf stat -C" option >>> would choose the particular CPU to count the event on, for an uncore >>> PMU the -C option would override the cpumask's "default" value. We >>> have code to validate this: >>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522 >>> But it seems now that overriding an uncore PMU's default CPU is >>> ignored. >> >> For the uncore driver, no matter what -C set, it writes the default CPU >> back. >> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760 >> >>> If you did: >>> $ perf stat -C 1 -e data_read -a sleep 0.1 >>> Then the tool thinks data_read is on CPU1 and will set its thread >>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just >>> throw away the -C option. >> The perf tool can still read the the counter from CPU1 and no IPIs >> because of the PMU_EV_CAP_READ_ACTIVE_PKG(). >> >> Not quite sure, but it seems only the open and close may be impacted and >> silently changed to CPU0. > > There's also enable/disable. Andi did the work and there were some > notable gains but likely more on core events. Ultimately it'd be nice > to be opening, closing.. everything in parallel given the calls are > slow and the work is embarrassingly parallel. > It feels like the cpumasks for uncore could still do with some cleanup > wrt -C I'm just unsure at the moment what this should be. Tbh, I'm > tempted to rewrite evlist propagate maps as someone may look at it and > think I believe in what it is doing. The parallel stuff we should grab > Riccardo's past work. For core PMU, the parallel may not work, because the core PMU is usually MSR based. Perf has to access the MSRs on the specific CPU. IPIs may be triggered if the users try to operate them from the other CPUs. 
But the parallel is good for the counters in the MMIO space. The counters can be accessed from any CPU. There are more and more counters which are moved to the MMIO space, e.g., new uncore PMUs, IOMMU PMU, TMPI (for power), etc. Thanks, Kan > > Thanks, > Ian > > >> Thanks, >> Kan >>> >>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot) >>>>> files parse correctly and have a corresponding event. >>>>> 3) keep adding opening events on the PMU to a group to make sure that >>>>> when counters are exhausted the perf_event_open fails (I've seen this >>>>> bug on AMD) >>>>> 4) are the values in the type file unique >>>>> >>>> >>>> The rest sounds good to me. >>> >>> Cool. Let me know if you can think of more. >>> >>> Thanks, >>> Ian >>> >>>> Thanks, >>>> Kan >>> >
On Mon, Jul 22, 2024 at 6:57 AM Liang, Kan <kan.liang@linux.intel.com> wrote: > > > > On 2024-07-19 6:02 p.m., Ian Rogers wrote: > > On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote: > >> On 2024-07-19 10:59 a.m., Ian Rogers wrote: > >>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and > >>> solely its perf event context, I want to know its core power and > >>> package power as a group so I never record one without the other. That > >>> grouping wouldn't be possible with 2 PMUs. > >> > >> For power, to be honest, I don't think it improves anything. It gives > >> users a false image that perf can group these counters. > >> But the truth is that perf cannot. The power counters are all > >> free-running counters. It's impossible to co-schedule them (which > >> requires a global mechanism to disable/enable all counters, e.g., > >> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one > >> by one while the counters keep running. There are no differences with or > >> without a group for the power events. > > > > Ok, so power should copy cstate with _core, _pkg, etc. I agree the > > difference is small and I like the idea of being consistent. Do we > > want to add "event.cpus" support to the tool anyway for potential > > future uses? > > The only thing I can imagine is that it may be used to disclose the > event constraint information, Or even more to update/override the event > constraint information (which requires kernel update.). But what I'm > worried about is that it may be abused. It's very easy to confuse an > event and a counter in a PMU. So you mean if you have a dual socket machine and an uncore PMU with a cpumask of "0,48" you worry that someone setting an event on CPU 47 may think they are getting a CPU on the second socket? Perhaps if the user can express an intent to the tool, say "perf stat -randomly-select-uncore-cpus ...", then this can be avoided. I'm not sure I'm worried about this as specifying the CPU for an event to use is already something of a more niche/advanced thing to be doing. > > This would at least avoid problems with newer kernels and > > older perf tools were we to find a good use for it. > > > >>> My understanding had been that for core PMUs a "perf stat -C" option > >>> would choose the particular CPU to count the event on, for an uncore > >>> PMU the -C option would override the cpumask's "default" value. We > >>> have code to validate this: > >>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522 > >>> But it seems now that overriding an uncore PMU's default CPU is > >>> ignored. > >> > >> For the uncore driver, no matter what -C set, it writes the default CPU > >> back. > >> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760 > >> > >>> If you did: > >>> $ perf stat -C 1 -e data_read -a sleep 0.1 > >>> Then the tool thinks data_read is on CPU1 and will set its thread > >>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just > >>> throw away the -C option. > >> The perf tool can still read the the counter from CPU1 and no IPIs > >> because of the PMU_EV_CAP_READ_ACTIVE_PKG(). > >> > >> Not quite sure, but it seems only the open and close may be impacted and > >> silently changed to CPU0. > > > > There's also enable/disable. Andi did the work and there were some > > notable gains but likely more on core events. 
Ultimately it'd be nice > > to be opening, closing.. everything in parallel given the calls are > > slow and the work is embarrassingly parallel. > > It feels like the cpumasks for uncore could still do with some cleanup > > wrt -C I'm just unsure at the moment what this should be. Tbh, I'm > > tempted to rewrite evlist propagate maps as someone may look at it and > > think I believe in what it is doing. The parallel stuff we should grab > > Riccardo's past work. > > For core PMU, the parallel may not work, because the core PMU is usually > MSR based. Perf has to access the MSRs on the specific CPU. IPIs may be > triggered if the users try to operate them from the other CPUs. Right, I think the idea would be to have as many threads as you have CPUs then give each thread affinity to a different CPU. The work done on the thread would match the CPU they have affinity with to avoid the IPIs. Because of the use of RCU in the kernel perf code it is possible to hit RCU synchronize where IPIs are sent after 200ms IIRC. If you get an RCU synchronize needing an IPI then "200ms x num CPUs" can mean seconds of delay in a serial implementation (500 CPUs would be 100 seconds). With parallel code a worst case slow down shouldn't increase with the number of CPUs. On laptops, .. this doesn't matter much. > But the parallel is good for the counters in the MMIO space. The > counters can be accessed from any CPU. There are more and more counters > which are moved to the MMIO space, e.g., new uncore PMUs, IOMMU PMU, > TMPI (for power), etc. Sounds good but I'm wondering how we can get the tool in the right place for doing affinity games. For the MMIO case life's good and we don't care. How can the tool know one case from another? Should the tool always be just following the cpumask? What about on ARM where the cpumasks are broken? Thanks, Ian > Thanks, > Kan > > > > Thanks, > > Ian > > > > > >> Thanks, > >> Kan > >>> > >>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot) > >>>>> files parse correctly and have a corresponding event. > >>>>> 3) keep adding opening events on the PMU to a group to make sure that > >>>>> when counters are exhausted the perf_event_open fails (I've seen this > >>>>> bug on AMD) > >>>>> 4) are the values in the type file unique > >>>>> > >>>> > >>>> The rest sounds good to me. > >>> > >>> Cool. Let me know if you can think of more. > >>> > >>> Thanks, > >>> Ian > >>> > >>>> Thanks, > >>>> Kan > >>> > >
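The thread-per-CPU affinity scheme described in the message above can be illustrated with plain pthreads. This is a sketch, not the perf tool's implementation: each worker pins itself to one CPU with pthread_setaffinity_np() and then opens an event for that same CPU, so the perf_event_open() call runs locally rather than triggering cross-CPU IPIs. A cycles event is used as a stand-in and permissions (perf_event_paranoid or CAP_PERFMON) are not handled.

/*
 * Sketch of the thread-per-CPU idea: each worker pins itself to one CPU
 * and opens an event for that same CPU, so perf_event_open() runs locally
 * instead of triggering cross-CPU IPIs.  Not the perf tool's code.
 */
#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *open_on_cpu(void *arg)
{
        int cpu = (int)(long)arg;
        cpu_set_t set;
        struct perf_event_attr attr = {
                .type = PERF_TYPE_HARDWARE,
                .size = sizeof(attr),
                .config = PERF_COUNT_HW_CPU_CYCLES,     /* stand-in event */
        };
        long fd;

        /* Pin this worker to the CPU whose event it will open. */
        CPU_ZERO(&set);
        CPU_SET(cpu, &set);
        pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

        fd = syscall(SYS_perf_event_open, &attr, -1 /* any pid */, cpu, -1, 0UL);
        printf("cpu %d: fd %ld\n", cpu, fd);
        return (void *)fd;      /* fds are left open until process exit */
}

int main(void)
{
        int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
        pthread_t *tids = calloc(ncpus, sizeof(*tids));

        for (int cpu = 0; cpu < ncpus; cpu++)
                pthread_create(&tids[cpu], NULL, open_on_cpu, (void *)(long)cpu);
        for (int cpu = 0; cpu < ncpus; cpu++)
                pthread_join(tids[cpu], NULL);
        free(tids);
        return 0;
}

Built with -pthread; real tool code would also need to honor each PMU's cpumask and skip offline CPUs rather than iterating every online CPU.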
On 2024-07-22 11:43 a.m., Ian Rogers wrote: > On Mon, Jul 22, 2024 at 6:57 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >> >> >> >> On 2024-07-19 6:02 p.m., Ian Rogers wrote: >>> On Fri, Jul 19, 2024 at 9:35 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >>>> On 2024-07-19 10:59 a.m., Ian Rogers wrote: >>>>> Thanks Kan. I'm not wondering about a case of 2 CPUs, say on CPU0 and >>>>> solely its perf event context, I want to know its core power and >>>>> package power as a group so I never record one without the other. That >>>>> grouping wouldn't be possible with 2 PMUs. >>>> >>>> For power, to be honest, I don't think it improves anything. It gives >>>> users a false image that perf can group these counters. >>>> But the truth is that perf cannot. The power counters are all >>>> free-running counters. It's impossible to co-schedule them (which >>>> requires a global mechanism to disable/enable all counters, e.g., >>>> GLOBAL_CTRL for core PMU). The kernel still has to read the counters one >>>> by one while the counters keep running. There are no differences with or >>>> without a group for the power events. >>> >>> Ok, so power should copy cstate with _core, _pkg, etc. I agree the >>> difference is small and I like the idea of being consistent. Do we >>> want to add "event.cpus" support to the tool anyway for potential >>> future uses? >> >> The only thing I can imagine is that it may be used to disclose the >> event constraint information, Or even more to update/override the event >> constraint information (which requires kernel update.). But what I'm >> worried about is that it may be abused. It's very easy to confuse an >> event and a counter in a PMU. > > So you mean if you have a dual socket machine and an uncore PMU with a > cpumask of "0,48" you worry that someone setting an event on CPU 47 > may think they are getting a CPU on the second socket? Perhaps if the > user can express an intent to the tool, say "perf stat > -randomly-select-uncore-cpus ...", then this can be avoided. I'm not > sure I'm worried about this as specifying the CPU for an event to use > is already something of a more niche/advanced thing to be doing. > The perf tool can specify any CPU the users want, as long as the kernel can respond properly. I'm not worried about it. The "event.cpus" is exposed by the kernel. The main concern is also from the kernel side. Some drivers may use it to distinguish different scopes of counters. So they can combine various types of counters into a single PMU, which may break some rules they don't realize. An example is the above 'group' rule. >>> This would at least avoid problems with newer kernels and >>> older perf tools were we to find a good use for it. >>> >>>>> My understanding had been that for core PMUs a "perf stat -C" option >>>>> would choose the particular CPU to count the event on, for an uncore >>>>> PMU the -C option would override the cpumask's "default" value. We >>>>> have code to validate this: >>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/evlist.c?h=perf-tools-next#n2522 >>>>> But it seems now that overriding an uncore PMU's default CPU is >>>>> ignored. >>>> >>>> For the uncore driver, no matter what -C set, it writes the default CPU >>>> back. 
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/intel/uncore.c#n760 >>>> >>>>> If you did: >>>>> $ perf stat -C 1 -e data_read -a sleep 0.1 >>>>> Then the tool thinks data_read is on CPU1 and will set its thread >>>>> affinity to CPU1 to avoid IPIs. It seems to fix this we need to just >>>>> throw away the -C option. >>>> The perf tool can still read the the counter from CPU1 and no IPIs >>>> because of the PMU_EV_CAP_READ_ACTIVE_PKG(). >>>> >>>> Not quite sure, but it seems only the open and close may be impacted and >>>> silently changed to CPU0. >>> >>> There's also enable/disable. Andi did the work and there were some >>> notable gains but likely more on core events. Ultimately it'd be nice >>> to be opening, closing.. everything in parallel given the calls are >>> slow and the work is embarrassingly parallel. >>> It feels like the cpumasks for uncore could still do with some cleanup >>> wrt -C I'm just unsure at the moment what this should be. Tbh, I'm >>> tempted to rewrite evlist propagate maps as someone may look at it and >>> think I believe in what it is doing. The parallel stuff we should grab >>> Riccardo's past work. >> >> For core PMU, the parallel may not work, because the core PMU is usually >> MSR based. Perf has to access the MSRs on the specific CPU. IPIs may be >> triggered if the users try to operate them from the other CPUs. > > Right, I think the idea would be to have as many threads as you have > CPUs then give each thread affinity to a different CPU. The work done > on the thread would match the CPU they have affinity with to avoid the > IPIs. Because of the use of RCU in the kernel perf code it is possible > to hit RCU synchronize where IPIs are sent after 200ms IIRC. If you > get an RCU synchronize needing an IPI then "200ms x num CPUs" can mean > seconds of delay in a serial implementation (500 CPUs would be 100 > seconds). With parallel code a worst case slow down shouldn't increase > with the number of CPUs. On laptops, .. this doesn't matter much. > >> But the parallel is good for the counters in the MMIO space. The >> counters can be accessed from any CPU. There are more and more counters >> which are moved to the MMIO space, e.g., new uncore PMUs, IOMMU PMU, >> TMPI (for power), etc. > > Sounds good but I'm wondering how we can get the tool in the right > place for doing affinity games. For the MMIO case life's good and we > don't care. How can the tool know one case from another? Should the > tool always be just following the cpumask? What about on ARM where the > cpumasks are broken? The cpumask cannot tell such information. For example, the counters of the uncore PMUs can be located in three different places, MSRs, the PCI config space, and the MMIO space. But the scope of the uncore counters is the same. So the cpumask is the same for various uncore PMUs. To get the information, the kernel has to be updated, e.g., add a caps/counter_type in sysfs. Thanks, Kan > > Thanks, > Ian > >> Thanks, >> Kan >>> >>> Thanks, >>> Ian >>> >>> >>>> Thanks, >>>> Kan >>>>> >>>>>>> 2) do the /sys/devices/<pmu>/events/event.(unit|scale|per-pkg|snapshot) >>>>>>> files parse correctly and have a corresponding event. >>>>>>> 3) keep adding opening events on the PMU to a group to make sure that >>>>>>> when counters are exhausted the perf_event_open fails (I've seen this >>>>>>> bug on AMD) >>>>>>> 4) are the values in the type file unique >>>>>>> >>>>>> >>>>>> The rest sounds good to me. >>>>> >>>>> Cool. 
Let me know if you can think of more. >>>>> >>>>> Thanks, >>>>> Ian >>>>> >>>>>> Thanks, >>>>>> Kan >>>>> >>> >