[RFC PATCH] perf: Add --uncorepmu option for filtering uncore PMUs

Qinxin Xia posted 1 patch 2 weeks, 4 days ago
tools/perf/Documentation/perf-stat.txt |  4 ++++
tools/perf/builtin-stat.c              | 26 ++++++++++++++++++++++++++
tools/perf/util/metricgroup.c          | 18 +++++++++++++-----
3 files changed, 43 insertions(+), 5 deletions(-)
[RFC PATCH] perf: Add --uncorepmu option for filtering uncore PMUs
Posted by Qinxin Xia 2 weeks, 4 days ago
This patch adds a new --uncorepmu option to perf-stat command to allow
filtering events on specific uncore PMUs. This is useful when there are
multiple uncore PMUs with the same type (e.g. hisi_sicl2_cpa0 and
hisi_sicl0_cpa0).

eg:
[root@localhost tmp]# ./perf stat -M cpa_p0_avg_bw
 Performance counter stats for 'system wide':

    19,417,779,115      hisi_sicl0_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw             
                 0      hisi_sicl0_cpa0/cpa_p0_wr_dat/                                        
                 0      hisi_sicl0_cpa0/cpa_p0_rd_dat_64b/                                      
                 0      hisi_sicl0_cpa0/cpa_p0_rd_dat_32b/                                      
    19,417,751,103      hisi_sicl10_cpa0/cpa_cycles/     #     0.00 cpa_p0_avg_bw             
                 0      hisi_sicl10_cpa0/cpa_p0_wr_dat/                                       
                 0      hisi_sicl10_cpa0/cpa_p0_rd_dat_64b/                                      
                 0      hisi_sicl10_cpa0/cpa_p0_rd_dat_32b/                                      
    19,417,730,679      hisi_sicl2_cpa0/cpa_cycles/      #     0.31 cpa_p0_avg_bw             
        75,635,749      hisi_sicl2_cpa0/cpa_p0_wr_dat/                                        
        18,520,640      hisi_sicl2_cpa0/cpa_p0_rd_dat_64b/                                      
                 0      hisi_sicl2_cpa0/cpa_p0_rd_dat_32b/                                      
    19,417,674,227      hisi_sicl8_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw             
                 0      hisi_sicl8_cpa0/cpa_p0_wr_dat/                                        
                 0      hisi_sicl8_cpa0/cpa_p0_rd_dat_64b/                                      
                 0      hisi_sicl8_cpa0/cpa_p0_rd_dat_32b/                                      

      19.417734480 seconds time elapsed

[root@localhost tmp]# ./perf stat --uncorepmu hisi_sicl2_cpa0 -M cpa_p0_avg_bw
 Performance counter stats for 'system wide':

     6,234,093,559      cpa_cycles                       #     0.60 cpa_p0_avg_bw             
        50,548,465      cpa_p0_wr_dat                                                         
         7,552,182      cpa_p0_rd_dat_64b                                                     
                 0      cpa_p0_rd_dat_32b                                                     

       6.234139320 seconds time elapsed

Signed-off-by: Qinxin Xia <xiaqinxin@huawei.com>
---
 tools/perf/Documentation/perf-stat.txt |  4 ++++
 tools/perf/builtin-stat.c              | 26 ++++++++++++++++++++++++++
 tools/perf/util/metricgroup.c          | 18 +++++++++++++-----
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 1a766d4a2233..9c0952b43a57 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -573,6 +573,10 @@ $ perf config stat.no-csv-summary=true
 Only enable events on applying cpu with this type for hybrid platform
 (e.g. core or atom)"
 
+--uncorepmu::
+Only enable events on applying uncore pmu with specified for multiple
+uncore pmus with same type (e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0)
+
 EXAMPLES
 --------
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index ab40d85fb125..3f34bfc79179 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1235,6 +1235,28 @@ static int parse_cputype(const struct option *opt,
 	return 0;
 }
 
+static int parse_uncorepmu(const struct option *opt,
+			   const char *str,
+			   int unset __maybe_unused)
+{
+	const struct perf_pmu *pmu;
+	struct evlist *evlist = *(struct evlist **)opt->value;
+
+	if (!list_empty(&evlist->core.entries)) {
+		fprintf(stderr, "Must define uncorepmu before events/metrics\n");
+		return -1;
+	}
+
+	pmu = perf_pmus__pmu_for_pmu_filter(str);
+	if (!pmu || pmu->is_core) {
+		fprintf(stderr, "--uncorepmu %s is not supported!\n", str);
+		return -1;
+	}
+	parse_events_option_args.pmu_filter = pmu->name;
+
+	return 0;
+}
+
 static int parse_cache_level(const struct option *opt,
 			     const char *str,
 			     int unset __maybe_unused)
@@ -2579,6 +2601,10 @@ int cmd_stat(int argc, const char **argv)
 			"Only enable events on applying cpu with this type "
 			"for hybrid platform (e.g. core or atom)",
 			parse_cputype),
+		OPT_CALLBACK(0, "uncorepmu", &evsel_list, "uncore pmu",
+			"Only enable events on applying uncore pmu with specified "
+			"for multiple uncore pmus with same type(e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0)",
+			parse_uncorepmu),
 #ifdef HAVE_LIBPFM
 		OPT_CALLBACK(0, "pfm-events", &evsel_list, "event",
 			"libpfm4 event selector. use 'perf list' to list available events",
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 25c75fdbfc52..59ef71df20bd 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -387,8 +387,13 @@ static bool match_pm_metric_or_groups(const struct pmu_metric *pm, const char *p
 				      const char *metric_or_groups)
 {
 	const char *pm_pmu = pm->pmu ?: "cpu";
+	struct perf_pmu *perf_pmu = NULL;
 
-	if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu))
+	if (pm->pmu)
+		perf_pmu = perf_pmus__find(pm->pmu);
+
+	if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu) &&
+	   (perf_pmu && !perf_pmu__name_wildcard_match(perf_pmu, pmu)))
 		return false;
 
 	return match_metric_or_groups(pm->metric_group, metric_or_groups) ||
@@ -1260,7 +1265,8 @@ static int build_combined_expr_ctx(const struct list_head *metric_list,
 static int parse_ids(bool metric_no_merge, bool fake_pmu,
 		     struct expr_parse_ctx *ids, const char *modifier,
 		     bool group_events, const bool tool_events[TOOL_PMU__EVENT_MAX],
-		     struct evlist **out_evlist)
+		     struct evlist **out_evlist,
+		     const char *filter_pmu)
 {
 	struct parse_events_error parse_error;
 	struct evlist *parsed_evlist;
@@ -1314,7 +1320,7 @@ static int parse_ids(bool metric_no_merge, bool fake_pmu,
 	}
 	pr_debug("Parsing metric events '%s'\n", events.buf);
 	parse_events_error__init(&parse_error);
-	ret = __parse_events(parsed_evlist, events.buf, /*pmu_filter=*/NULL,
+	ret = __parse_events(parsed_evlist, events.buf, filter_pmu,
 			     &parse_error, fake_pmu, /*warn_if_reordered=*/false,
 			     /*fake_tp=*/false);
 	if (ret) {
@@ -1417,7 +1423,8 @@ static int parse_groups(struct evlist *perf_evlist,
 					/*modifier=*/NULL,
 					/*group_events=*/false,
 					tool_events,
-					&combined_evlist);
+					&combined_evlist,
+					(pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
 		}
 		if (combined)
 			expr__ctx_free(combined);
@@ -1472,7 +1479,8 @@ static int parse_groups(struct evlist *perf_evlist,
 		}
 		if (!metric_evlist) {
 			ret = parse_ids(metric_no_merge, fake_pmu, m->pctx, m->modifier,
-					m->group_events, tool_events, &m->evlist);
+					m->group_events, tool_events, &m->evlist,
+					(pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
 			if (ret)
 				goto out;
 
-- 
2.33.0
Re: [RFC PATCH] perf: Add --uncorepmu option for filtering uncore PMUs
Posted by Ian Rogers 2 weeks, 3 days ago
On Tue, Jan 20, 2026 at 1:51 AM Qinxin Xia <xiaqinxin@huawei.com> wrote:
>
> This patch adds a new --uncorepmu option to perf-stat command to allow
> filtering events on specific uncore PMUs. This is useful when there are
> multiple uncore PMUs with the same type (e.g. hisi_sicl2_cpa0 and
> hisi_sicl0_cpa0).
>
> eg:
> [root@localhost tmp]# ./perf stat -M cpa_p0_avg_bw
>  Performance counter stats for 'system wide':
>
>     19,417,779,115      hisi_sicl0_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
>                  0      hisi_sicl0_cpa0/cpa_p0_wr_dat/
>                  0      hisi_sicl0_cpa0/cpa_p0_rd_dat_64b/
>                  0      hisi_sicl0_cpa0/cpa_p0_rd_dat_32b/
>     19,417,751,103      hisi_sicl10_cpa0/cpa_cycles/     #     0.00 cpa_p0_avg_bw
>                  0      hisi_sicl10_cpa0/cpa_p0_wr_dat/
>                  0      hisi_sicl10_cpa0/cpa_p0_rd_dat_64b/
>                  0      hisi_sicl10_cpa0/cpa_p0_rd_dat_32b/
>     19,417,730,679      hisi_sicl2_cpa0/cpa_cycles/      #     0.31 cpa_p0_avg_bw
>         75,635,749      hisi_sicl2_cpa0/cpa_p0_wr_dat/
>         18,520,640      hisi_sicl2_cpa0/cpa_p0_rd_dat_64b/
>                  0      hisi_sicl2_cpa0/cpa_p0_rd_dat_32b/
>     19,417,674,227      hisi_sicl8_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
>                  0      hisi_sicl8_cpa0/cpa_p0_wr_dat/
>                  0      hisi_sicl8_cpa0/cpa_p0_rd_dat_64b/
>                  0      hisi_sicl8_cpa0/cpa_p0_rd_dat_32b/
>
>       19.417734480 seconds time elapsed
>
> [root@localhost tmp]# ./perf stat --uncorepmu hisi_sicl2_cpa0 -M cpa_p0_avg_bw
>  Performance counter stats for 'system wide':
>
>      6,234,093,559      cpa_cycles                       #     0.60 cpa_p0_avg_bw
>         50,548,465      cpa_p0_wr_dat
>          7,552,182      cpa_p0_rd_dat_64b
>                  0      cpa_p0_rd_dat_32b
>
>        6.234139320 seconds time elapsed

So normally for this case the -A/--no-merge will give you a per-PMU
breakdown. It isn't restricted to a single PMU like the output here,
but we similarly don't have options to restrict per-socket, per-dir,
per-cluster and per-core. There is an option with --per-cache but that
looks like a way to avoid having many --per-cache options rather than
taking a value for a particular cache to only dump.

I did deliberate make the --cputype code generic so it could be used
in ways like this, but it'd be nice to glob match the names for
consistency with event parsing:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/pmu.c#n2647
I needed to have --cputype as Intel's early hybrid patches had added
it before I cleaned things up to be more PMU oriented.

So I can see the purpose of this patch in being low overhead compared
to -A, I'm just not sure whether this is a PMU or an aggregation
option. I also think the term "uncore" is kind of confusing and so
have tried to keep it away from command line options. Why isn't is it
just a pmu filter?

Thanks,
Ian

> Signed-off-by: Qinxin Xia <xiaqinxin@huawei.com>
> ---
>  tools/perf/Documentation/perf-stat.txt |  4 ++++
>  tools/perf/builtin-stat.c              | 26 ++++++++++++++++++++++++++
>  tools/perf/util/metricgroup.c          | 18 +++++++++++++-----
>  3 files changed, 43 insertions(+), 5 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
> index 1a766d4a2233..9c0952b43a57 100644
> --- a/tools/perf/Documentation/perf-stat.txt
> +++ b/tools/perf/Documentation/perf-stat.txt
> @@ -573,6 +573,10 @@ $ perf config stat.no-csv-summary=true
>  Only enable events on applying cpu with this type for hybrid platform
>  (e.g. core or atom)"
>
> +--uncorepmu::
> +Only enable events on applying uncore pmu with specified for multiple
> +uncore pmus with same type (e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0)
> +
>  EXAMPLES
>  --------
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index ab40d85fb125..3f34bfc79179 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -1235,6 +1235,28 @@ static int parse_cputype(const struct option *opt,
>         return 0;
>  }
>
> +static int parse_uncorepmu(const struct option *opt,
> +                          const char *str,
> +                          int unset __maybe_unused)
> +{
> +       const struct perf_pmu *pmu;
> +       struct evlist *evlist = *(struct evlist **)opt->value;
> +
> +       if (!list_empty(&evlist->core.entries)) {
> +               fprintf(stderr, "Must define uncorepmu before events/metrics\n");
> +               return -1;
> +       }
> +
> +       pmu = perf_pmus__pmu_for_pmu_filter(str);
> +       if (!pmu || pmu->is_core) {
> +               fprintf(stderr, "--uncorepmu %s is not supported!\n", str);
> +               return -1;
> +       }
> +       parse_events_option_args.pmu_filter = pmu->name;
> +
> +       return 0;
> +}
> +
>  static int parse_cache_level(const struct option *opt,
>                              const char *str,
>                              int unset __maybe_unused)
> @@ -2579,6 +2601,10 @@ int cmd_stat(int argc, const char **argv)
>                         "Only enable events on applying cpu with this type "
>                         "for hybrid platform (e.g. core or atom)",
>                         parse_cputype),
> +               OPT_CALLBACK(0, "uncorepmu", &evsel_list, "uncore pmu",
> +                       "Only enable events on applying uncore pmu with specified "
> +                       "for multiple uncore pmus with same type(e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0)",
> +                       parse_uncorepmu),
>  #ifdef HAVE_LIBPFM
>                 OPT_CALLBACK(0, "pfm-events", &evsel_list, "event",
>                         "libpfm4 event selector. use 'perf list' to list available events",
> diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
> index 25c75fdbfc52..59ef71df20bd 100644
> --- a/tools/perf/util/metricgroup.c
> +++ b/tools/perf/util/metricgroup.c
> @@ -387,8 +387,13 @@ static bool match_pm_metric_or_groups(const struct pmu_metric *pm, const char *p
>                                       const char *metric_or_groups)
>  {
>         const char *pm_pmu = pm->pmu ?: "cpu";
> +       struct perf_pmu *perf_pmu = NULL;
>
> -       if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu))
> +       if (pm->pmu)
> +               perf_pmu = perf_pmus__find(pm->pmu);
> +
> +       if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu) &&
> +          (perf_pmu && !perf_pmu__name_wildcard_match(perf_pmu, pmu)))
>                 return false;
>
>         return match_metric_or_groups(pm->metric_group, metric_or_groups) ||
> @@ -1260,7 +1265,8 @@ static int build_combined_expr_ctx(const struct list_head *metric_list,
>  static int parse_ids(bool metric_no_merge, bool fake_pmu,
>                      struct expr_parse_ctx *ids, const char *modifier,
>                      bool group_events, const bool tool_events[TOOL_PMU__EVENT_MAX],
> -                    struct evlist **out_evlist)
> +                    struct evlist **out_evlist,
> +                    const char *filter_pmu)
>  {
>         struct parse_events_error parse_error;
>         struct evlist *parsed_evlist;
> @@ -1314,7 +1320,7 @@ static int parse_ids(bool metric_no_merge, bool fake_pmu,
>         }
>         pr_debug("Parsing metric events '%s'\n", events.buf);
>         parse_events_error__init(&parse_error);
> -       ret = __parse_events(parsed_evlist, events.buf, /*pmu_filter=*/NULL,
> +       ret = __parse_events(parsed_evlist, events.buf, filter_pmu,
>                              &parse_error, fake_pmu, /*warn_if_reordered=*/false,
>                              /*fake_tp=*/false);
>         if (ret) {
> @@ -1417,7 +1423,8 @@ static int parse_groups(struct evlist *perf_evlist,
>                                         /*modifier=*/NULL,
>                                         /*group_events=*/false,
>                                         tool_events,
> -                                       &combined_evlist);
> +                                       &combined_evlist,
> +                                       (pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
>                 }
>                 if (combined)
>                         expr__ctx_free(combined);
> @@ -1472,7 +1479,8 @@ static int parse_groups(struct evlist *perf_evlist,
>                 }
>                 if (!metric_evlist) {
>                         ret = parse_ids(metric_no_merge, fake_pmu, m->pctx, m->modifier,
> -                                       m->group_events, tool_events, &m->evlist);
> +                                       m->group_events, tool_events, &m->evlist,
> +                                       (pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
>                         if (ret)
>                                 goto out;
>
> --
> 2.33.0
>
Re: [RFC PATCH] perf: Add --uncorepmu option for filtering uncore PMUs
Posted by Qinxin Xia 2 weeks, 3 days ago

On 2026/1/21 04:55:30, Ian Rogers <irogers@google.com> wrote:
> On Tue, Jan 20, 2026 at 1:51 AM Qinxin Xia <xiaqinxin@huawei.com> wrote:
>>
>> This patch adds a new --uncorepmu option to perf-stat command to allow
>> filtering events on specific uncore PMUs. This is useful when there are
>> multiple uncore PMUs with the same type (e.g. hisi_sicl2_cpa0 and
>> hisi_sicl0_cpa0).
>>
>> eg:
>> [root@localhost tmp]# ./perf stat -M cpa_p0_avg_bw
>>   Performance counter stats for 'system wide':
>>
>>      19,417,779,115      hisi_sicl0_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
>>                   0      hisi_sicl0_cpa0/cpa_p0_wr_dat/
>>                   0      hisi_sicl0_cpa0/cpa_p0_rd_dat_64b/
>>                   0      hisi_sicl0_cpa0/cpa_p0_rd_dat_32b/
>>      19,417,751,103      hisi_sicl10_cpa0/cpa_cycles/     #     0.00 cpa_p0_avg_bw
>>                   0      hisi_sicl10_cpa0/cpa_p0_wr_dat/
>>                   0      hisi_sicl10_cpa0/cpa_p0_rd_dat_64b/
>>                   0      hisi_sicl10_cpa0/cpa_p0_rd_dat_32b/
>>      19,417,730,679      hisi_sicl2_cpa0/cpa_cycles/      #     0.31 cpa_p0_avg_bw
>>          75,635,749      hisi_sicl2_cpa0/cpa_p0_wr_dat/
>>          18,520,640      hisi_sicl2_cpa0/cpa_p0_rd_dat_64b/
>>                   0      hisi_sicl2_cpa0/cpa_p0_rd_dat_32b/
>>      19,417,674,227      hisi_sicl8_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
>>                   0      hisi_sicl8_cpa0/cpa_p0_wr_dat/
>>                   0      hisi_sicl8_cpa0/cpa_p0_rd_dat_64b/
>>                   0      hisi_sicl8_cpa0/cpa_p0_rd_dat_32b/
>>
>>        19.417734480 seconds time elapsed
>>
>> [root@localhost tmp]# ./perf stat --uncorepmu hisi_sicl2_cpa0 -M cpa_p0_avg_bw
>>   Performance counter stats for 'system wide':
>>
>>       6,234,093,559      cpa_cycles                       #     0.60 cpa_p0_avg_bw
>>          50,548,465      cpa_p0_wr_dat
>>           7,552,182      cpa_p0_rd_dat_64b
>>                   0      cpa_p0_rd_dat_32b
>>
>>         6.234139320 seconds time elapsed
> 
> So normally for this case the -A/--no-merge will give you a per-PMU
> breakdown. It isn't restricted to a single PMU like the output here,
> but we similarly don't have options to restrict per-socket, per-dir,
> per-cluster and per-core. There is an option with --per-cache but that
> looks like a way to avoid having many --per-cache options rather than
> taking a value for a particular cache to only dump.
> 
> I did deliberate make the --cputype code generic so it could be used
> in ways like this, but it'd be nice to glob match the names for
> consistency with event parsing:
> https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/pmu.c#n2647
> I needed to have --cputype as Intel's early hybrid patches had added
> it before I cleaned things up to be more PMU oriented.
> 
Hello, Ian!

For core-related events and metric, we usually care about which core it
happens on, so we can filter it with -C.

Such as:

[root@localhost ~]# perf stat -C 0 -e L1-dcache-loads sleep 3

  Performance counter stats for 'CPU(s) 0':

            136,578      L1-dcache-loads 


        3.099147550 seconds time elapsed

[root@localhost tool]# ./perf stat -C 0 -M l1d_miss_rate sleep 3

  Performance counter stats for 'CPU(s) 0':

             18,708      L1-dcache-load-misses            #      3.6 % 
l1d_miss_rate
            513,038      L1-dcache-loads 


        3.004649800 seconds time elapsed

For core-independent events and metrics such as ddrc, smmu, and cpa, we
care about which hardware module they occur. (The example provided in
commit info is sicl.) so we want to filter it by pmu, for events I can
filter it with'cputype' (This is a little strange because sicl does not
belong to any cputype.). However, metric does not support this mode.

[root@localhost ~]# perf stat --cputype hisi_sicl0_cpa0 -e cpa_cycles
sleep 3

  Performance counter stats for 'system wide':

      3,003,492,731      cpa_cycles 


        3.003484270 seconds time elapsed


[root@localhost ~]# perf stat --cputype hisi_sicl0_cpa0 -M
cpa_p0_wr_dat sleep 3

Cannot find metric or group `cpa_p0_wr_dat'

This patch supports the metric scenario through '--uncorepmu'.

> So I can see the purpose of this patch in being low overhead compared
> to -A, I'm just not sure whether this is a PMU or an aggregation
> option. I also think the term "uncore" is kind of confusing and so
> have tried to keep it away from command line options. Why isn't is it
> just a pmu filter?
> 
> Thanks,
> Ian
> 
Perhaps we could make cputype a little more generic so that it could
also include pmu-filter functionality.

>> Signed-off-by: Qinxin Xia <xiaqinxin@huawei.com>
>> ---
>>   tools/perf/Documentation/perf-stat.txt |  4 ++++
>>   tools/perf/builtin-stat.c              | 26 ++++++++++++++++++++++++++
>>   tools/perf/util/metricgroup.c          | 18 +++++++++++++-----
>>   3 files changed, 43 insertions(+), 5 deletions(-)
>>
>> diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
>> index 1a766d4a2233..9c0952b43a57 100644
>> --- a/tools/perf/Documentation/perf-stat.txt
>> +++ b/tools/perf/Documentation/perf-stat.txt
>> @@ -573,6 +573,10 @@ $ perf config stat.no-csv-summary=true
>>   Only enable events on applying cpu with this type for hybrid platform
>>   (e.g. core or atom)"
>>
>> +--uncorepmu::
>> +Only enable events on applying uncore pmu with specified for multiple
>> +uncore pmus with same type (e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0)
>> +
>>   EXAMPLES
>>   --------
>>
>> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
>> index ab40d85fb125..3f34bfc79179 100644
>> --- a/tools/perf/builtin-stat.c
>> +++ b/tools/perf/builtin-stat.c
>> @@ -1235,6 +1235,28 @@ static int parse_cputype(const struct option *opt,
>>          return 0;
>>   }
>>
>> +static int parse_uncorepmu(const struct option *opt,
>> +                          const char *str,
>> +                          int unset __maybe_unused)
>> +{
>> +       const struct perf_pmu *pmu;
>> +       struct evlist *evlist = *(struct evlist **)opt->value;
>> +
>> +       if (!list_empty(&evlist->core.entries)) {
>> +               fprintf(stderr, "Must define uncorepmu before events/metrics\n");
>> +               return -1;
>> +       }
>> +
>> +       pmu = perf_pmus__pmu_for_pmu_filter(str);
>> +       if (!pmu || pmu->is_core) {
>> +               fprintf(stderr, "--uncorepmu %s is not supported!\n", str);
>> +               return -1;
>> +       }
>> +       parse_events_option_args.pmu_filter = pmu->name;
>> +
>> +       return 0;
>> +}
>> +
>>   static int parse_cache_level(const struct option *opt,
>>                               const char *str,
>>                               int unset __maybe_unused)
>> @@ -2579,6 +2601,10 @@ int cmd_stat(int argc, const char **argv)
>>                          "Only enable events on applying cpu with this type "
>>                          "for hybrid platform (e.g. core or atom)",
>>                          parse_cputype),
>> +               OPT_CALLBACK(0, "uncorepmu", &evsel_list, "uncore pmu",
>> +                       "Only enable events on applying uncore pmu with specified "
>> +                       "for multiple uncore pmus with same type(e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0)",
>> +                       parse_uncorepmu),
>>   #ifdef HAVE_LIBPFM
>>                  OPT_CALLBACK(0, "pfm-events", &evsel_list, "event",
>>                          "libpfm4 event selector. use 'perf list' to list available events",
>> diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
>> index 25c75fdbfc52..59ef71df20bd 100644
>> --- a/tools/perf/util/metricgroup.c
>> +++ b/tools/perf/util/metricgroup.c
>> @@ -387,8 +387,13 @@ static bool match_pm_metric_or_groups(const struct pmu_metric *pm, const char *p
>>                                        const char *metric_or_groups)
>>   {
>>          const char *pm_pmu = pm->pmu ?: "cpu";
>> +       struct perf_pmu *perf_pmu = NULL;
>>
>> -       if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu))
>> +       if (pm->pmu)
>> +               perf_pmu = perf_pmus__find(pm->pmu);
>> +
>> +       if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu) &&
>> +          (perf_pmu && !perf_pmu__name_wildcard_match(perf_pmu, pmu)))
>>                  return false;
>>
>>          return match_metric_or_groups(pm->metric_group, metric_or_groups) ||
>> @@ -1260,7 +1265,8 @@ static int build_combined_expr_ctx(const struct list_head *metric_list,
>>   static int parse_ids(bool metric_no_merge, bool fake_pmu,
>>                       struct expr_parse_ctx *ids, const char *modifier,
>>                       bool group_events, const bool tool_events[TOOL_PMU__EVENT_MAX],
>> -                    struct evlist **out_evlist)
>> +                    struct evlist **out_evlist,
>> +                    const char *filter_pmu)
>>   {
>>          struct parse_events_error parse_error;
>>          struct evlist *parsed_evlist;
>> @@ -1314,7 +1320,7 @@ static int parse_ids(bool metric_no_merge, bool fake_pmu,
>>          }
>>          pr_debug("Parsing metric events '%s'\n", events.buf);
>>          parse_events_error__init(&parse_error);
>> -       ret = __parse_events(parsed_evlist, events.buf, /*pmu_filter=*/NULL,
>> +       ret = __parse_events(parsed_evlist, events.buf, filter_pmu,
>>                               &parse_error, fake_pmu, /*warn_if_reordered=*/false,
>>                               /*fake_tp=*/false);
>>          if (ret) {
>> @@ -1417,7 +1423,8 @@ static int parse_groups(struct evlist *perf_evlist,
>>                                          /*modifier=*/NULL,
>>                                          /*group_events=*/false,
>>                                          tool_events,
>> -                                       &combined_evlist);
>> +                                       &combined_evlist,
>> +                                       (pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
>>                  }
>>                  if (combined)
>>                          expr__ctx_free(combined);
>> @@ -1472,7 +1479,8 @@ static int parse_groups(struct evlist *perf_evlist,
>>                  }
>>                  if (!metric_evlist) {
>>                          ret = parse_ids(metric_no_merge, fake_pmu, m->pctx, m->modifier,
>> -                                       m->group_events, tool_events, &m->evlist);
>> +                                       m->group_events, tool_events, &m->evlist,
>> +                                       (pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
>>                          if (ret)
>>                                  goto out;
>>
>> --
>> 2.33.0
>>

-- 
Thanks,
Qinxin

Re: [RFC PATCH] perf: Add --uncorepmu option for filtering uncore PMUs
Posted by Ian Rogers 2 weeks, 3 days ago
On Tue, Jan 20, 2026 at 7:15 PM Qinxin Xia <xiaqinxin@huawei.com> wrote:
>
>
>
> On 2026/1/21 04:55:30, Ian Rogers <irogers@google.com> wrote:
> > On Tue, Jan 20, 2026 at 1:51 AM Qinxin Xia <xiaqinxin@huawei.com> wrote:
> >>
> >> This patch adds a new --uncorepmu option to perf-stat command to allow
> >> filtering events on specific uncore PMUs. This is useful when there are
> >> multiple uncore PMUs with the same type (e.g. hisi_sicl2_cpa0 and
> >> hisi_sicl0_cpa0).
> >>
> >> eg:
> >> [root@localhost tmp]# ./perf stat -M cpa_p0_avg_bw
> >>   Performance counter stats for 'system wide':
> >>
> >>      19,417,779,115      hisi_sicl0_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
> >>                   0      hisi_sicl0_cpa0/cpa_p0_wr_dat/
> >>                   0      hisi_sicl0_cpa0/cpa_p0_rd_dat_64b/
> >>                   0      hisi_sicl0_cpa0/cpa_p0_rd_dat_32b/
> >>      19,417,751,103      hisi_sicl10_cpa0/cpa_cycles/     #     0.00 cpa_p0_avg_bw
> >>                   0      hisi_sicl10_cpa0/cpa_p0_wr_dat/
> >>                   0      hisi_sicl10_cpa0/cpa_p0_rd_dat_64b/
> >>                   0      hisi_sicl10_cpa0/cpa_p0_rd_dat_32b/
> >>      19,417,730,679      hisi_sicl2_cpa0/cpa_cycles/      #     0.31 cpa_p0_avg_bw
> >>          75,635,749      hisi_sicl2_cpa0/cpa_p0_wr_dat/
> >>          18,520,640      hisi_sicl2_cpa0/cpa_p0_rd_dat_64b/
> >>                   0      hisi_sicl2_cpa0/cpa_p0_rd_dat_32b/
> >>      19,417,674,227      hisi_sicl8_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
> >>                   0      hisi_sicl8_cpa0/cpa_p0_wr_dat/
> >>                   0      hisi_sicl8_cpa0/cpa_p0_rd_dat_64b/
> >>                   0      hisi_sicl8_cpa0/cpa_p0_rd_dat_32b/
> >>
> >>        19.417734480 seconds time elapsed
> >>
> >> [root@localhost tmp]# ./perf stat --uncorepmu hisi_sicl2_cpa0 -M cpa_p0_avg_bw
> >>   Performance counter stats for 'system wide':
> >>
> >>       6,234,093,559      cpa_cycles                       #     0.60 cpa_p0_avg_bw
> >>          50,548,465      cpa_p0_wr_dat
> >>           7,552,182      cpa_p0_rd_dat_64b
> >>                   0      cpa_p0_rd_dat_32b
> >>
> >>         6.234139320 seconds time elapsed
> >
> > So normally for this case the -A/--no-merge will give you a per-PMU
> > breakdown. It isn't restricted to a single PMU like the output here,
> > but we similarly don't have options to restrict per-socket, per-dir,
> > per-cluster and per-core. There is an option with --per-cache but that
> > looks like a way to avoid having many --per-cache options rather than
> > taking a value for a particular cache to only dump.
> >
> > I did deliberate make the --cputype code generic so it could be used
> > in ways like this, but it'd be nice to glob match the names for
> > consistency with event parsing:
> > https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/pmu.c#n2647
> > I needed to have --cputype as Intel's early hybrid patches had added
> > it before I cleaned things up to be more PMU oriented.
> >
> Hello, Ian!
>
> For core-related events and metric, we usually care about which core it
> happens on, so we can filter it with -C.
>
> Such as:
>
> [root@localhost ~]# perf stat -C 0 -e L1-dcache-loads sleep 3
>
>   Performance counter stats for 'CPU(s) 0':
>
>             136,578      L1-dcache-loads
>
>
>         3.099147550 seconds time elapsed
>
> [root@localhost tool]# ./perf stat -C 0 -M l1d_miss_rate sleep 3
>
>   Performance counter stats for 'CPU(s) 0':
>
>              18,708      L1-dcache-load-misses            #      3.6 %
> l1d_miss_rate
>             513,038      L1-dcache-loads
>
>
>         3.004649800 seconds time elapsed
>
> For core-independent events and metrics such as ddrc, smmu, and cpa, we
> care about which hardware module they occur. (The example provided in
> commit info is sicl.) so we want to filter it by pmu, for events I can
> filter it with'cputype' (This is a little strange because sicl does not
> belong to any cputype.). However, metric does not support this mode.
>
> [root@localhost ~]# perf stat --cputype hisi_sicl0_cpa0 -e cpa_cycles
> sleep 3
>
>   Performance counter stats for 'system wide':
>
>       3,003,492,731      cpa_cycles
>
>
>         3.003484270 seconds time elapsed
>
>
> [root@localhost ~]# perf stat --cputype hisi_sicl0_cpa0 -M
> cpa_p0_wr_dat sleep 3
>
> Cannot find metric or group `cpa_p0_wr_dat'
>
> This patch supports the metric scenario through '--uncorepmu'.
>
> > So I can see the purpose of this patch in being low overhead compared
> > to -A, I'm just not sure whether this is a PMU or an aggregation
> > option. I also think the term "uncore" is kind of confusing and so
> > have tried to keep it away from command line options. Why isn't is it
> > just a pmu filter?
> >
> > Thanks,
> > Ian
> >
> Perhaps we could make cputype a little more generic so that it could
> also include pmu-filter functionality.

Yeah, that'd been my intention quite a while ago. I wanted to hide the
cputype flag and replace it with something like --pmu-filter. Looks
like I never got around to writing the patch though :-) I agree the -C
filtering is similar to this and similarly different to how the
aggregation code is working. Could we rename --uncorepmu to
--pmu-filter then?

Thanks,
Ian

> >> Signed-off-by: Qinxin Xia <xiaqinxin@huawei.com>
> >> ---
> >>   tools/perf/Documentation/perf-stat.txt |  4 ++++
> >>   tools/perf/builtin-stat.c              | 26 ++++++++++++++++++++++++++
> >>   tools/perf/util/metricgroup.c          | 18 +++++++++++++-----
> >>   3 files changed, 43 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
> >> index 1a766d4a2233..9c0952b43a57 100644
> >> --- a/tools/perf/Documentation/perf-stat.txt
> >> +++ b/tools/perf/Documentation/perf-stat.txt
> >> @@ -573,6 +573,10 @@ $ perf config stat.no-csv-summary=true
> >>   Only enable events on applying cpu with this type for hybrid platform
> >>   (e.g. core or atom)"
> >>
> >> +--uncorepmu::
> >> +Only enable events on applying uncore pmu with specified for multiple
> >> +uncore pmus with same type (e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0)
> >> +
> >>   EXAMPLES
> >>   --------
> >>
> >> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> >> index ab40d85fb125..3f34bfc79179 100644
> >> --- a/tools/perf/builtin-stat.c
> >> +++ b/tools/perf/builtin-stat.c
> >> @@ -1235,6 +1235,28 @@ static int parse_cputype(const struct option *opt,
> >>          return 0;
> >>   }
> >>
> >> +static int parse_uncorepmu(const struct option *opt,
> >> +                          const char *str,
> >> +                          int unset __maybe_unused)
> >> +{
> >> +       const struct perf_pmu *pmu;
> >> +       struct evlist *evlist = *(struct evlist **)opt->value;
> >> +
> >> +       if (!list_empty(&evlist->core.entries)) {
> >> +               fprintf(stderr, "Must define uncorepmu before events/metrics\n");
> >> +               return -1;
> >> +       }
> >> +
> >> +       pmu = perf_pmus__pmu_for_pmu_filter(str);
> >> +       if (!pmu || pmu->is_core) {
> >> +               fprintf(stderr, "--uncorepmu %s is not supported!\n", str);
> >> +               return -1;
> >> +       }
> >> +       parse_events_option_args.pmu_filter = pmu->name;
> >> +
> >> +       return 0;
> >> +}
> >> +
> >>   static int parse_cache_level(const struct option *opt,
> >>                               const char *str,
> >>                               int unset __maybe_unused)
> >> @@ -2579,6 +2601,10 @@ int cmd_stat(int argc, const char **argv)
> >>                          "Only enable events on applying cpu with this type "
> >>                          "for hybrid platform (e.g. core or atom)",
> >>                          parse_cputype),
> >> +               OPT_CALLBACK(0, "uncorepmu", &evsel_list, "uncore pmu",
> >> +                       "Only enable events on applying uncore pmu with specified "
> >> +                       "for multiple uncore pmus with same type(e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0)",
> >> +                       parse_uncorepmu),
> >>   #ifdef HAVE_LIBPFM
> >>                  OPT_CALLBACK(0, "pfm-events", &evsel_list, "event",
> >>                          "libpfm4 event selector. use 'perf list' to list available events",
> >> diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
> >> index 25c75fdbfc52..59ef71df20bd 100644
> >> --- a/tools/perf/util/metricgroup.c
> >> +++ b/tools/perf/util/metricgroup.c
> >> @@ -387,8 +387,13 @@ static bool match_pm_metric_or_groups(const struct pmu_metric *pm, const char *p
> >>                                        const char *metric_or_groups)
> >>   {
> >>          const char *pm_pmu = pm->pmu ?: "cpu";
> >> +       struct perf_pmu *perf_pmu = NULL;
> >>
> >> -       if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu))
> >> +       if (pm->pmu)
> >> +               perf_pmu = perf_pmus__find(pm->pmu);
> >> +
> >> +       if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu) &&
> >> +          (perf_pmu && !perf_pmu__name_wildcard_match(perf_pmu, pmu)))
> >>                  return false;
> >>
> >>          return match_metric_or_groups(pm->metric_group, metric_or_groups) ||
> >> @@ -1260,7 +1265,8 @@ static int build_combined_expr_ctx(const struct list_head *metric_list,
> >>   static int parse_ids(bool metric_no_merge, bool fake_pmu,
> >>                       struct expr_parse_ctx *ids, const char *modifier,
> >>                       bool group_events, const bool tool_events[TOOL_PMU__EVENT_MAX],
> >> -                    struct evlist **out_evlist)
> >> +                    struct evlist **out_evlist,
> >> +                    const char *filter_pmu)
> >>   {
> >>          struct parse_events_error parse_error;
> >>          struct evlist *parsed_evlist;
> >> @@ -1314,7 +1320,7 @@ static int parse_ids(bool metric_no_merge, bool fake_pmu,
> >>          }
> >>          pr_debug("Parsing metric events '%s'\n", events.buf);
> >>          parse_events_error__init(&parse_error);
> >> -       ret = __parse_events(parsed_evlist, events.buf, /*pmu_filter=*/NULL,
> >> +       ret = __parse_events(parsed_evlist, events.buf, filter_pmu,
> >>                               &parse_error, fake_pmu, /*warn_if_reordered=*/false,
> >>                               /*fake_tp=*/false);
> >>          if (ret) {
> >> @@ -1417,7 +1423,8 @@ static int parse_groups(struct evlist *perf_evlist,
> >>                                          /*modifier=*/NULL,
> >>                                          /*group_events=*/false,
> >>                                          tool_events,
> >> -                                       &combined_evlist);
> >> +                                       &combined_evlist,
> >> +                                       (pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
> >>                  }
> >>                  if (combined)
> >>                          expr__ctx_free(combined);
> >> @@ -1472,7 +1479,8 @@ static int parse_groups(struct evlist *perf_evlist,
> >>                  }
> >>                  if (!metric_evlist) {
> >>                          ret = parse_ids(metric_no_merge, fake_pmu, m->pctx, m->modifier,
> >> -                                       m->group_events, tool_events, &m->evlist);
> >> +                                       m->group_events, tool_events, &m->evlist,
> >> +                                       (pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
> >>                          if (ret)
> >>                                  goto out;
> >>
> >> --
> >> 2.33.0
> >>
>
> --
> Thanks,
> Qinxin
>
Re: [RFC PATCH] perf: Add --uncorepmu option for filtering uncore PMUs
Posted by Qinxin Xia 2 weeks, 3 days ago

On 2026/1/21 11:31:01, Ian Rogers <irogers@google.com> wrote:
> On Tue, Jan 20, 2026 at 7:15 PM Qinxin Xia <xiaqinxin@huawei.com> wrote:
>>
>>
>>
>> On 2026/1/21 04:55:30, Ian Rogers <irogers@google.com> wrote:
>>> On Tue, Jan 20, 2026 at 1:51 AM Qinxin Xia <xiaqinxin@huawei.com> wrote:
>>>>
>>>> This patch adds a new --uncorepmu option to perf-stat command to allow
>>>> filtering events on specific uncore PMUs. This is useful when there are
>>>> multiple uncore PMUs with the same type (e.g. hisi_sicl2_cpa0 and
>>>> hisi_sicl0_cpa0).
>>>>
>>>> eg:
>>>> [root@localhost tmp]# ./perf stat -M cpa_p0_avg_bw
>>>>    Performance counter stats for 'system wide':
>>>>
>>>>       19,417,779,115      hisi_sicl0_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
>>>>                    0      hisi_sicl0_cpa0/cpa_p0_wr_dat/
>>>>                    0      hisi_sicl0_cpa0/cpa_p0_rd_dat_64b/
>>>>                    0      hisi_sicl0_cpa0/cpa_p0_rd_dat_32b/
>>>>       19,417,751,103      hisi_sicl10_cpa0/cpa_cycles/     #     0.00 cpa_p0_avg_bw
>>>>                    0      hisi_sicl10_cpa0/cpa_p0_wr_dat/
>>>>                    0      hisi_sicl10_cpa0/cpa_p0_rd_dat_64b/
>>>>                    0      hisi_sicl10_cpa0/cpa_p0_rd_dat_32b/
>>>>       19,417,730,679      hisi_sicl2_cpa0/cpa_cycles/      #     0.31 cpa_p0_avg_bw
>>>>           75,635,749      hisi_sicl2_cpa0/cpa_p0_wr_dat/
>>>>           18,520,640      hisi_sicl2_cpa0/cpa_p0_rd_dat_64b/
>>>>                    0      hisi_sicl2_cpa0/cpa_p0_rd_dat_32b/
>>>>       19,417,674,227      hisi_sicl8_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
>>>>                    0      hisi_sicl8_cpa0/cpa_p0_wr_dat/
>>>>                    0      hisi_sicl8_cpa0/cpa_p0_rd_dat_64b/
>>>>                    0      hisi_sicl8_cpa0/cpa_p0_rd_dat_32b/
>>>>
>>>>         19.417734480 seconds time elapsed
>>>>
>>>> [root@localhost tmp]# ./perf stat --uncorepmu hisi_sicl2_cpa0 -M cpa_p0_avg_bw
>>>>    Performance counter stats for 'system wide':
>>>>
>>>>        6,234,093,559      cpa_cycles                       #     0.60 cpa_p0_avg_bw
>>>>           50,548,465      cpa_p0_wr_dat
>>>>            7,552,182      cpa_p0_rd_dat_64b
>>>>                    0      cpa_p0_rd_dat_32b
>>>>
>>>>          6.234139320 seconds time elapsed
>>>
>>> So normally for this case the -A/--no-merge will give you a per-PMU
>>> breakdown. It isn't restricted to a single PMU like the output here,
>>> but we similarly don't have options to restrict per-socket, per-dir,
>>> per-cluster and per-core. There is an option with --per-cache but that
>>> looks like a way to avoid having many --per-cache options rather than
>>> taking a value for a particular cache to only dump.
>>>
>>> I did deliberate make the --cputype code generic so it could be used
>>> in ways like this, but it'd be nice to glob match the names for
>>> consistency with event parsing:
>>> https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/pmu.c#n2647
>>> I needed to have --cputype as Intel's early hybrid patches had added
>>> it before I cleaned things up to be more PMU oriented.
>>>
>> Hello, Ian!
>>
>> For core-related events and metric, we usually care about which core it
>> happens on, so we can filter it with -C.
>>
>> Such as:
>>
>> [root@localhost ~]# perf stat -C 0 -e L1-dcache-loads sleep 3
>>
>>    Performance counter stats for 'CPU(s) 0':
>>
>>              136,578      L1-dcache-loads
>>
>>
>>          3.099147550 seconds time elapsed
>>
>> [root@localhost tool]# ./perf stat -C 0 -M l1d_miss_rate sleep 3
>>
>>    Performance counter stats for 'CPU(s) 0':
>>
>>               18,708      L1-dcache-load-misses            #      3.6 %
>> l1d_miss_rate
>>              513,038      L1-dcache-loads
>>
>>
>>          3.004649800 seconds time elapsed
>>
>> For core-independent events and metrics such as ddrc, smmu, and cpa, we
>> care about which hardware module they occur. (The example provided in
>> commit info is sicl.) so we want to filter it by pmu, for events I can
>> filter it with'cputype' (This is a little strange because sicl does not
>> belong to any cputype.). However, metric does not support this mode.
>>
>> [root@localhost ~]# perf stat --cputype hisi_sicl0_cpa0 -e cpa_cycles
>> sleep 3
>>
>>    Performance counter stats for 'system wide':
>>
>>        3,003,492,731      cpa_cycles
>>
>>
>>          3.003484270 seconds time elapsed
>>
>>
>> [root@localhost ~]# perf stat --cputype hisi_sicl0_cpa0 -M
>> cpa_p0_wr_dat sleep 3
>>
>> Cannot find metric or group `cpa_p0_wr_dat'
>>
>> This patch supports the metric scenario through '--uncorepmu'.
>>
>>> So I can see the purpose of this patch in being low overhead compared
>>> to -A, I'm just not sure whether this is a PMU or an aggregation
>>> option. I also think the term "uncore" is kind of confusing and so
>>> have tried to keep it away from command line options. Why isn't is it
>>> just a pmu filter?
>>>
>>> Thanks,
>>> Ian
>>>
>> Perhaps we could make cputype a little more generic so that it could
>> also include pmu-filter functionality.
> 
> Yeah, that'd been my intention quite a while ago. I wanted to hide the
> cputype flag and replace it with something like --pmu-filter. Looks
> like I never got around to writing the patch though :-) I agree the -C
> filtering is similar to this and similarly different to how the
> aggregation code is working. Could we rename --uncorepmu to
> --pmu-filter then?
> 
> Thanks,
> Ian
> 
Thank you for the suggestions, I will rename --uncorepmu to --pmu-filter
in the next version of this patch.:-)

Regarding --cputype: would you like me to also rename it to --pmu-filter
the same patch series next version?

>>>> Signed-off-by: Qinxin Xia <xiaqinxin@huawei.com>
>>>> ---
>>>>    tools/perf/Documentation/perf-stat.txt |  4 ++++
>>>>    tools/perf/builtin-stat.c              | 26 ++++++++++++++++++++++++++
>>>>    tools/perf/util/metricgroup.c          | 18 +++++++++++++-----
>>>>    3 files changed, 43 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
>>>> index 1a766d4a2233..9c0952b43a57 100644
>>>> --- a/tools/perf/Documentation/perf-stat.txt
>>>> +++ b/tools/perf/Documentation/perf-stat.txt
>>>> @@ -573,6 +573,10 @@ $ perf config stat.no-csv-summary=true
>>>>    Only enable events on applying cpu with this type for hybrid platform
>>>>    (e.g. core or atom)"
>>>>
>>>> +--uncorepmu::
>>>> +Only enable events on applying uncore pmu with specified for multiple
>>>> +uncore pmus with same type (e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0)
>>>> +
>>>>    EXAMPLES
>>>>    --------
>>>>
>>>> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
>>>> index ab40d85fb125..3f34bfc79179 100644
>>>> --- a/tools/perf/builtin-stat.c
>>>> +++ b/tools/perf/builtin-stat.c
>>>> @@ -1235,6 +1235,28 @@ static int parse_cputype(const struct option *opt,
>>>>           return 0;
>>>>    }
>>>>
>>>> +static int parse_uncorepmu(const struct option *opt,
>>>> +                          const char *str,
>>>> +                          int unset __maybe_unused)
>>>> +{
>>>> +       const struct perf_pmu *pmu;
>>>> +       struct evlist *evlist = *(struct evlist **)opt->value;
>>>> +
>>>> +       if (!list_empty(&evlist->core.entries)) {
>>>> +               fprintf(stderr, "Must define uncorepmu before events/metrics\n");
>>>> +               return -1;
>>>> +       }
>>>> +
>>>> +       pmu = perf_pmus__pmu_for_pmu_filter(str);
>>>> +       if (!pmu || pmu->is_core) {
>>>> +               fprintf(stderr, "--uncorepmu %s is not supported!\n", str);
>>>> +               return -1;
>>>> +       }
>>>> +       parse_events_option_args.pmu_filter = pmu->name;
>>>> +
>>>> +       return 0;
>>>> +}
>>>> +
>>>>    static int parse_cache_level(const struct option *opt,
>>>>                                const char *str,
>>>>                                int unset __maybe_unused)
>>>> @@ -2579,6 +2601,10 @@ int cmd_stat(int argc, const char **argv)
>>>>                           "Only enable events on applying cpu with this type "
>>>>                           "for hybrid platform (e.g. core or atom)",
>>>>                           parse_cputype),
>>>> +               OPT_CALLBACK(0, "uncorepmu", &evsel_list, "uncore pmu",
>>>> +                       "Only enable events on applying uncore pmu with specified "
>>>> +                       "for multiple uncore pmus with same type(e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0)",
>>>> +                       parse_uncorepmu),
>>>>    #ifdef HAVE_LIBPFM
>>>>                   OPT_CALLBACK(0, "pfm-events", &evsel_list, "event",
>>>>                           "libpfm4 event selector. use 'perf list' to list available events",
>>>> diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
>>>> index 25c75fdbfc52..59ef71df20bd 100644
>>>> --- a/tools/perf/util/metricgroup.c
>>>> +++ b/tools/perf/util/metricgroup.c
>>>> @@ -387,8 +387,13 @@ static bool match_pm_metric_or_groups(const struct pmu_metric *pm, const char *p
>>>>                                         const char *metric_or_groups)
>>>>    {
>>>>           const char *pm_pmu = pm->pmu ?: "cpu";
>>>> +       struct perf_pmu *perf_pmu = NULL;
>>>>
>>>> -       if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu))
>>>> +       if (pm->pmu)
>>>> +               perf_pmu = perf_pmus__find(pm->pmu);
>>>> +
>>>> +       if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu) &&
>>>> +          (perf_pmu && !perf_pmu__name_wildcard_match(perf_pmu, pmu)))
>>>>                   return false;
>>>>
>>>>           return match_metric_or_groups(pm->metric_group, metric_or_groups) ||
>>>> @@ -1260,7 +1265,8 @@ static int build_combined_expr_ctx(const struct list_head *metric_list,
>>>>    static int parse_ids(bool metric_no_merge, bool fake_pmu,
>>>>                        struct expr_parse_ctx *ids, const char *modifier,
>>>>                        bool group_events, const bool tool_events[TOOL_PMU__EVENT_MAX],
>>>> -                    struct evlist **out_evlist)
>>>> +                    struct evlist **out_evlist,
>>>> +                    const char *filter_pmu)
>>>>    {
>>>>           struct parse_events_error parse_error;
>>>>           struct evlist *parsed_evlist;
>>>> @@ -1314,7 +1320,7 @@ static int parse_ids(bool metric_no_merge, bool fake_pmu,
>>>>           }
>>>>           pr_debug("Parsing metric events '%s'\n", events.buf);
>>>>           parse_events_error__init(&parse_error);
>>>> -       ret = __parse_events(parsed_evlist, events.buf, /*pmu_filter=*/NULL,
>>>> +       ret = __parse_events(parsed_evlist, events.buf, filter_pmu,
>>>>                                &parse_error, fake_pmu, /*warn_if_reordered=*/false,
>>>>                                /*fake_tp=*/false);
>>>>           if (ret) {
>>>> @@ -1417,7 +1423,8 @@ static int parse_groups(struct evlist *perf_evlist,
>>>>                                           /*modifier=*/NULL,
>>>>                                           /*group_events=*/false,
>>>>                                           tool_events,
>>>> -                                       &combined_evlist);
>>>> +                                       &combined_evlist,
>>>> +                                       (pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
>>>>                   }
>>>>                   if (combined)
>>>>                           expr__ctx_free(combined);
>>>> @@ -1472,7 +1479,8 @@ static int parse_groups(struct evlist *perf_evlist,
>>>>                   }
>>>>                   if (!metric_evlist) {
>>>>                           ret = parse_ids(metric_no_merge, fake_pmu, m->pctx, m->modifier,
>>>> -                                       m->group_events, tool_events, &m->evlist);
>>>> +                                       m->group_events, tool_events, &m->evlist,
>>>> +                                       (pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
>>>>                           if (ret)
>>>>                                   goto out;
>>>>
>>>> --
>>>> 2.33.0
>>>>
>>
>> --
>> Thanks,
>> Qinxin
>>

-- 
Thanks,
Qinxin