[PATCH v3 2/7] perf report: Add parallelism sort key

Dmitry Vyukov posted 7 patches 1 year ago
There is a newer version of this series
[PATCH v3 2/7] perf report: Add parallelism sort key
Posted by Dmitry Vyukov 1 year ago
Show parallelism level in profiles if requested by user.

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: linux-perf-users@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 tools/perf/builtin-report.c | 11 +++++++++++
 tools/perf/util/hist.c      |  2 ++
 tools/perf/util/hist.h      |  3 +++
 tools/perf/util/session.c   | 12 ++++++++++++
 tools/perf/util/session.h   |  1 +
 tools/perf/util/sort.c      | 23 +++++++++++++++++++++++
 tools/perf/util/sort.h      |  1 +
 7 files changed, 53 insertions(+)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 0d9bd090eda71..14d49f0625881 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1720,6 +1720,17 @@ int cmd_report(int argc, const char **argv)
 		symbol_conf.annotate_data_sample = true;
 	}
 
+	if (report.disable_order || !perf_session__has_switch_events(session)) {
+		if ((sort_order && strstr(sort_order, "parallelism")) ||
+				(field_order && strstr(field_order, "parallelism"))) {
+			if (report.disable_order)
+				ui__error("Use of parallelism is incompatible with --disable-order.\n");
+			else
+				ui__error("Use of parallelism requires --switch-events during record.\n");
+			return -1;
+		}
+	}
+
 	if (sort_order && strstr(sort_order, "ipc")) {
 		parse_options_usage(report_usage, options, "s", 1);
 		goto error;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 0f30f843c566d..cafd693568189 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -207,6 +207,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 
 	hists__new_col_len(hists, HISTC_CGROUP, 6);
 	hists__new_col_len(hists, HISTC_CGROUP_ID, 20);
+	hists__new_col_len(hists, HISTC_PARALLELISM, 11);
 	hists__new_col_len(hists, HISTC_CPU, 3);
 	hists__new_col_len(hists, HISTC_SOCKET, 6);
 	hists__new_col_len(hists, HISTC_MEM_LOCKED, 6);
@@ -741,6 +742,7 @@ __hists__add_entry(struct hists *hists,
 		.ip	 = al->addr,
 		.level	 = al->level,
 		.code_page_size = sample->code_page_size,
+		.parallelism	= al->parallelism,
 		.stat = {
 			.nr_events = 1,
 			.period	= sample->period,
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 46c8373e31465..a6e662d77dc24 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -42,6 +42,7 @@ enum hist_column {
 	HISTC_CGROUP_ID,
 	HISTC_CGROUP,
 	HISTC_PARENT,
+	HISTC_PARALLELISM,
 	HISTC_CPU,
 	HISTC_SOCKET,
 	HISTC_SRCLINE,
@@ -228,6 +229,7 @@ struct hist_entry {
 	u64			transaction;
 	s32			socket;
 	s32			cpu;
+	int			parallelism;
 	u64			code_page_size;
 	u64			weight;
 	u64			ins_lat;
@@ -580,6 +582,7 @@ bool perf_hpp__is_thread_entry(struct perf_hpp_fmt *fmt);
 bool perf_hpp__is_comm_entry(struct perf_hpp_fmt *fmt);
 bool perf_hpp__is_dso_entry(struct perf_hpp_fmt *fmt);
 bool perf_hpp__is_sym_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_parallelism_entry(struct perf_hpp_fmt *fmt);
 
 struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt);
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index c06e3020a9769..00fcf8d8ac255 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2403,6 +2403,18 @@ bool perf_session__has_traces(struct perf_session *session, const char *msg)
 	return false;
 }
 
+bool perf_session__has_switch_events(struct perf_session *session)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(session->evlist, evsel) {
+		if (evsel->core.attr.context_switch)
+			return true;
+	}
+
+	return false;
+}
+
 int map__set_kallsyms_ref_reloc_sym(struct map *map, const char *symbol_name, u64 addr)
 {
 	char *bracket;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index bcf1bcf06959b..db1c120a9e67f 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -141,6 +141,7 @@ int perf_session__resolve_callchain(struct perf_session *session,
 				    struct symbol **parent);
 
 bool perf_session__has_traces(struct perf_session *session, const char *msg);
+bool perf_session__has_switch_events(struct perf_session *session);
 
 void perf_event__attr_swap(struct perf_event_attr *attr);
 
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 3dd33721823f3..7eef43f5be360 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -892,6 +892,27 @@ struct sort_entry sort_cpu = {
 	.se_width_idx	= HISTC_CPU,
 };
 
+/* --sort parallelism */
+
+static int64_t
+sort__parallelism_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->parallelism - left->parallelism;
+}
+
+static int hist_entry__parallelism_snprintf(struct hist_entry *he, char *bf,
+				    size_t size, unsigned int width)
+{
+	return repsep_snprintf(bf, size, "%*d", width, he->parallelism);
+}
+
+struct sort_entry sort_parallelism = {
+	.se_header      = "Parallelism",
+	.se_cmp	        = sort__parallelism_cmp,
+	.se_snprintf    = hist_entry__parallelism_snprintf,
+	.se_width_idx	= HISTC_PARALLELISM,
+};
+
 /* --sort cgroup_id */
 
 static int64_t _sort__cgroup_dev_cmp(u64 left_dev, u64 right_dev)
@@ -2534,6 +2555,7 @@ static struct sort_dimension common_sort_dimensions[] = {
 	DIM(SORT_ANNOTATE_DATA_TYPE_OFFSET, "typeoff", sort_type_offset),
 	DIM(SORT_SYM_OFFSET, "symoff", sort_sym_offset),
 	DIM(SORT_ANNOTATE_DATA_TYPE_CACHELINE, "typecln", sort_type_cacheline),
+	DIM(SORT_PARALLELISM, "parallelism", sort_parallelism),
 };
 
 #undef DIM
@@ -2735,6 +2757,7 @@ MK_SORT_ENTRY_CHK(thread)
 MK_SORT_ENTRY_CHK(comm)
 MK_SORT_ENTRY_CHK(dso)
 MK_SORT_ENTRY_CHK(sym)
+MK_SORT_ENTRY_CHK(parallelism)
 
 
 static bool __sort__hpp_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index a8572574e1686..11fb15f914093 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -72,6 +72,7 @@ enum sort_type {
 	SORT_ANNOTATE_DATA_TYPE_OFFSET,
 	SORT_SYM_OFFSET,
 	SORT_ANNOTATE_DATA_TYPE_CACHELINE,
+	SORT_PARALLELISM,
 
 	/* branch stack specific sort keys */
 	__SORT_BRANCH_STACK,
-- 
2.48.1.262.g85cc9f2d1e-goog
Re: [PATCH v3 2/7] perf report: Add parallelism sort key
Posted by Namhyung Kim 1 year ago
On Mon, Jan 27, 2025 at 10:58:49AM +0100, Dmitry Vyukov wrote:
> Show parallelism level in profiles if requested by user.
> 
> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> Cc: Namhyung Kim <namhyung@kernel.org>
> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> Cc: Ian Rogers <irogers@google.com>
> Cc: linux-perf-users@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  tools/perf/builtin-report.c | 11 +++++++++++
>  tools/perf/util/hist.c      |  2 ++
>  tools/perf/util/hist.h      |  3 +++
>  tools/perf/util/session.c   | 12 ++++++++++++
>  tools/perf/util/session.h   |  1 +
>  tools/perf/util/sort.c      | 23 +++++++++++++++++++++++
>  tools/perf/util/sort.h      |  1 +
>  7 files changed, 53 insertions(+)
> 
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index 0d9bd090eda71..14d49f0625881 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -1720,6 +1720,17 @@ int cmd_report(int argc, const char **argv)
>  		symbol_conf.annotate_data_sample = true;
>  	}
>  
> +	if (report.disable_order || !perf_session__has_switch_events(session)) {
> +		if ((sort_order && strstr(sort_order, "parallelism")) ||
> +				(field_order && strstr(field_order, "parallelism"))) {
> +			if (report.disable_order)
> +				ui__error("Use of parallelism is incompatible with --disable-order.\n");
> +			else
> +				ui__error("Use of parallelism requires --switch-events during record.\n");
> +			return -1;
> +		}
> +	}
> +
>  	if (sort_order && strstr(sort_order, "ipc")) {
>  		parse_options_usage(report_usage, options, "s", 1);
>  		goto error;
> diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> index 0f30f843c566d..cafd693568189 100644
> --- a/tools/perf/util/hist.c
> +++ b/tools/perf/util/hist.c
> @@ -207,6 +207,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
>  
>  	hists__new_col_len(hists, HISTC_CGROUP, 6);
>  	hists__new_col_len(hists, HISTC_CGROUP_ID, 20);
> +	hists__new_col_len(hists, HISTC_PARALLELISM, 11);
>  	hists__new_col_len(hists, HISTC_CPU, 3);
>  	hists__new_col_len(hists, HISTC_SOCKET, 6);
>  	hists__new_col_len(hists, HISTC_MEM_LOCKED, 6);
> @@ -741,6 +742,7 @@ __hists__add_entry(struct hists *hists,
>  		.ip	 = al->addr,
>  		.level	 = al->level,
>  		.code_page_size = sample->code_page_size,
> +		.parallelism	= al->parallelism,
>  		.stat = {
>  			.nr_events = 1,
>  			.period	= sample->period,
> diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
> index 46c8373e31465..a6e662d77dc24 100644
> --- a/tools/perf/util/hist.h
> +++ b/tools/perf/util/hist.h
> @@ -42,6 +42,7 @@ enum hist_column {
>  	HISTC_CGROUP_ID,
>  	HISTC_CGROUP,
>  	HISTC_PARENT,
> +	HISTC_PARALLELISM,
>  	HISTC_CPU,
>  	HISTC_SOCKET,
>  	HISTC_SRCLINE,
> @@ -228,6 +229,7 @@ struct hist_entry {
>  	u64			transaction;
>  	s32			socket;
>  	s32			cpu;
> +	int			parallelism;

Can you make it u16 and move to around cpumode to remove paddings?

Thanks,
Namhyung


>  	u64			code_page_size;
>  	u64			weight;
>  	u64			ins_lat;
> @@ -580,6 +582,7 @@ bool perf_hpp__is_thread_entry(struct perf_hpp_fmt *fmt);
>  bool perf_hpp__is_comm_entry(struct perf_hpp_fmt *fmt);
>  bool perf_hpp__is_dso_entry(struct perf_hpp_fmt *fmt);
>  bool perf_hpp__is_sym_entry(struct perf_hpp_fmt *fmt);
> +bool perf_hpp__is_parallelism_entry(struct perf_hpp_fmt *fmt);
>  
>  struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt);
Re: [PATCH v3 2/7] perf report: Add parallelism sort key
Posted by Dmitry Vyukov 1 year ago
On Wed, 29 Jan 2025 at 05:42, Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Mon, Jan 27, 2025 at 10:58:49AM +0100, Dmitry Vyukov wrote:
> > Show parallelism level in profiles if requested by user.
> >
> > Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> > Cc: Namhyung Kim <namhyung@kernel.org>
> > Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> > Cc: Ian Rogers <irogers@google.com>
> > Cc: linux-perf-users@vger.kernel.org
> > Cc: linux-kernel@vger.kernel.org
> > ---
> >  tools/perf/builtin-report.c | 11 +++++++++++
> >  tools/perf/util/hist.c      |  2 ++
> >  tools/perf/util/hist.h      |  3 +++
> >  tools/perf/util/session.c   | 12 ++++++++++++
> >  tools/perf/util/session.h   |  1 +
> >  tools/perf/util/sort.c      | 23 +++++++++++++++++++++++
> >  tools/perf/util/sort.h      |  1 +
> >  7 files changed, 53 insertions(+)
> >
> > diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> > index 0d9bd090eda71..14d49f0625881 100644
> > --- a/tools/perf/builtin-report.c
> > +++ b/tools/perf/builtin-report.c
> > @@ -1720,6 +1720,17 @@ int cmd_report(int argc, const char **argv)
> >               symbol_conf.annotate_data_sample = true;
> >       }
> >
> > +     if (report.disable_order || !perf_session__has_switch_events(session)) {
> > +             if ((sort_order && strstr(sort_order, "parallelism")) ||
> > +                             (field_order && strstr(field_order, "parallelism"))) {
> > +                     if (report.disable_order)
> > +                             ui__error("Use of parallelism is incompatible with --disable-order.\n");
> > +                     else
> > +                             ui__error("Use of parallelism requires --switch-events during record.\n");
> > +                     return -1;
> > +             }
> > +     }
> > +
> >       if (sort_order && strstr(sort_order, "ipc")) {
> >               parse_options_usage(report_usage, options, "s", 1);
> >               goto error;
> > diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> > index 0f30f843c566d..cafd693568189 100644
> > --- a/tools/perf/util/hist.c
> > +++ b/tools/perf/util/hist.c
> > @@ -207,6 +207,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
> >
> >       hists__new_col_len(hists, HISTC_CGROUP, 6);
> >       hists__new_col_len(hists, HISTC_CGROUP_ID, 20);
> > +     hists__new_col_len(hists, HISTC_PARALLELISM, 11);
> >       hists__new_col_len(hists, HISTC_CPU, 3);
> >       hists__new_col_len(hists, HISTC_SOCKET, 6);
> >       hists__new_col_len(hists, HISTC_MEM_LOCKED, 6);
> > @@ -741,6 +742,7 @@ __hists__add_entry(struct hists *hists,
> >               .ip      = al->addr,
> >               .level   = al->level,
> >               .code_page_size = sample->code_page_size,
> > +             .parallelism    = al->parallelism,
> >               .stat = {
> >                       .nr_events = 1,
> >                       .period = sample->period,
> > diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
> > index 46c8373e31465..a6e662d77dc24 100644
> > --- a/tools/perf/util/hist.h
> > +++ b/tools/perf/util/hist.h
> > @@ -42,6 +42,7 @@ enum hist_column {
> >       HISTC_CGROUP_ID,
> >       HISTC_CGROUP,
> >       HISTC_PARENT,
> > +     HISTC_PARALLELISM,
> >       HISTC_CPU,
> >       HISTC_SOCKET,
> >       HISTC_SRCLINE,
> > @@ -228,6 +229,7 @@ struct hist_entry {
> >       u64                     transaction;
> >       s32                     socket;
> >       s32                     cpu;
> > +     int                     parallelism;
>
> Can you make it u16 and move to around cpumode to remove paddings?

It generally should be the same size as cpu/socket/etc. And these are
32-bits throughout the codebase (the previous fields). Are there any
checks that MAX_NR_CPUS<64K?


> Thanks,
> Namhyung
>
>
> >       u64                     code_page_size;
> >       u64                     weight;
> >       u64                     ins_lat;
> > @@ -580,6 +582,7 @@ bool perf_hpp__is_thread_entry(struct perf_hpp_fmt *fmt);
> >  bool perf_hpp__is_comm_entry(struct perf_hpp_fmt *fmt);
> >  bool perf_hpp__is_dso_entry(struct perf_hpp_fmt *fmt);
> >  bool perf_hpp__is_sym_entry(struct perf_hpp_fmt *fmt);
> > +bool perf_hpp__is_parallelism_entry(struct perf_hpp_fmt *fmt);
> >
> >  struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt);
Re: [PATCH v3 2/7] perf report: Add parallelism sort key
Posted by Namhyung Kim 1 year ago
On Wed, Jan 29, 2025 at 08:18:34AM +0100, Dmitry Vyukov wrote:
> On Wed, 29 Jan 2025 at 05:42, Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Mon, Jan 27, 2025 at 10:58:49AM +0100, Dmitry Vyukov wrote:
> > > Show parallelism level in profiles if requested by user.
> > >
> > > Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> > > Cc: Namhyung Kim <namhyung@kernel.org>
> > > Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> > > Cc: Ian Rogers <irogers@google.com>
> > > Cc: linux-perf-users@vger.kernel.org
> > > Cc: linux-kernel@vger.kernel.org
> > > ---
> > >  tools/perf/builtin-report.c | 11 +++++++++++
> > >  tools/perf/util/hist.c      |  2 ++
> > >  tools/perf/util/hist.h      |  3 +++
> > >  tools/perf/util/session.c   | 12 ++++++++++++
> > >  tools/perf/util/session.h   |  1 +
> > >  tools/perf/util/sort.c      | 23 +++++++++++++++++++++++
> > >  tools/perf/util/sort.h      |  1 +
> > >  7 files changed, 53 insertions(+)
> > >
> > > diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> > > index 0d9bd090eda71..14d49f0625881 100644
> > > --- a/tools/perf/builtin-report.c
> > > +++ b/tools/perf/builtin-report.c
> > > @@ -1720,6 +1720,17 @@ int cmd_report(int argc, const char **argv)
> > >               symbol_conf.annotate_data_sample = true;
> > >       }
> > >
> > > +     if (report.disable_order || !perf_session__has_switch_events(session)) {
> > > +             if ((sort_order && strstr(sort_order, "parallelism")) ||
> > > +                             (field_order && strstr(field_order, "parallelism"))) {
> > > +                     if (report.disable_order)
> > > +                             ui__error("Use of parallelism is incompatible with --disable-order.\n");
> > > +                     else
> > > +                             ui__error("Use of parallelism requires --switch-events during record.\n");
> > > +                     return -1;
> > > +             }
> > > +     }
> > > +
> > >       if (sort_order && strstr(sort_order, "ipc")) {
> > >               parse_options_usage(report_usage, options, "s", 1);
> > >               goto error;
> > > diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> > > index 0f30f843c566d..cafd693568189 100644
> > > --- a/tools/perf/util/hist.c
> > > +++ b/tools/perf/util/hist.c
> > > @@ -207,6 +207,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
> > >
> > >       hists__new_col_len(hists, HISTC_CGROUP, 6);
> > >       hists__new_col_len(hists, HISTC_CGROUP_ID, 20);
> > > +     hists__new_col_len(hists, HISTC_PARALLELISM, 11);
> > >       hists__new_col_len(hists, HISTC_CPU, 3);
> > >       hists__new_col_len(hists, HISTC_SOCKET, 6);
> > >       hists__new_col_len(hists, HISTC_MEM_LOCKED, 6);
> > > @@ -741,6 +742,7 @@ __hists__add_entry(struct hists *hists,
> > >               .ip      = al->addr,
> > >               .level   = al->level,
> > >               .code_page_size = sample->code_page_size,
> > > +             .parallelism    = al->parallelism,
> > >               .stat = {
> > >                       .nr_events = 1,
> > >                       .period = sample->period,
> > > diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
> > > index 46c8373e31465..a6e662d77dc24 100644
> > > --- a/tools/perf/util/hist.h
> > > +++ b/tools/perf/util/hist.h
> > > @@ -42,6 +42,7 @@ enum hist_column {
> > >       HISTC_CGROUP_ID,
> > >       HISTC_CGROUP,
> > >       HISTC_PARENT,
> > > +     HISTC_PARALLELISM,
> > >       HISTC_CPU,
> > >       HISTC_SOCKET,
> > >       HISTC_SRCLINE,
> > > @@ -228,6 +229,7 @@ struct hist_entry {
> > >       u64                     transaction;
> > >       s32                     socket;
> > >       s32                     cpu;
> > > +     int                     parallelism;
> >
> > Can you make it u16 and move to around cpumode to remove paddings?
> 
> It generally should be the same size as cpu/socket/etc. And these are
> 32-bits throughout the codebase (the previous fields). Are there any
> checks that MAX_NR_CPUS<64K?

I don't think there's such a check.  But practically it used be 4K and
you can add the check. :)

Anyway hist_entry is getting bigger and we may need to shrink it later.
I don't stringly insist on u16 though.  It's up to you.

Thanks,
Namhyung

> >
> >
> > >       u64                     code_page_size;
> > >       u64                     weight;
> > >       u64                     ins_lat;
> > > @@ -580,6 +582,7 @@ bool perf_hpp__is_thread_entry(struct perf_hpp_fmt *fmt);
> > >  bool perf_hpp__is_comm_entry(struct perf_hpp_fmt *fmt);
> > >  bool perf_hpp__is_dso_entry(struct perf_hpp_fmt *fmt);
> > >  bool perf_hpp__is_sym_entry(struct perf_hpp_fmt *fmt);
> > > +bool perf_hpp__is_parallelism_entry(struct perf_hpp_fmt *fmt);
> > >
> > >  struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt);
Re: [PATCH v3 2/7] perf report: Add parallelism sort key
Posted by Dmitry Vyukov 1 year ago
On Thu, 30 Jan 2025 at 06:28, Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Wed, Jan 29, 2025 at 08:18:34AM +0100, Dmitry Vyukov wrote:
> > On Wed, 29 Jan 2025 at 05:42, Namhyung Kim <namhyung@kernel.org> wrote:
> > >
> > > On Mon, Jan 27, 2025 at 10:58:49AM +0100, Dmitry Vyukov wrote:
> > > > Show parallelism level in profiles if requested by user.
> > > >
> > > > Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> > > > Cc: Namhyung Kim <namhyung@kernel.org>
> > > > Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> > > > Cc: Ian Rogers <irogers@google.com>
> > > > Cc: linux-perf-users@vger.kernel.org
> > > > Cc: linux-kernel@vger.kernel.org
> > > > ---
> > > >  tools/perf/builtin-report.c | 11 +++++++++++
> > > >  tools/perf/util/hist.c      |  2 ++
> > > >  tools/perf/util/hist.h      |  3 +++
> > > >  tools/perf/util/session.c   | 12 ++++++++++++
> > > >  tools/perf/util/session.h   |  1 +
> > > >  tools/perf/util/sort.c      | 23 +++++++++++++++++++++++
> > > >  tools/perf/util/sort.h      |  1 +
> > > >  7 files changed, 53 insertions(+)
> > > >
> > > > diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> > > > index 0d9bd090eda71..14d49f0625881 100644
> > > > --- a/tools/perf/builtin-report.c
> > > > +++ b/tools/perf/builtin-report.c
> > > > @@ -1720,6 +1720,17 @@ int cmd_report(int argc, const char **argv)
> > > >               symbol_conf.annotate_data_sample = true;
> > > >       }
> > > >
> > > > +     if (report.disable_order || !perf_session__has_switch_events(session)) {
> > > > +             if ((sort_order && strstr(sort_order, "parallelism")) ||
> > > > +                             (field_order && strstr(field_order, "parallelism"))) {
> > > > +                     if (report.disable_order)
> > > > +                             ui__error("Use of parallelism is incompatible with --disable-order.\n");
> > > > +                     else
> > > > +                             ui__error("Use of parallelism requires --switch-events during record.\n");
> > > > +                     return -1;
> > > > +             }
> > > > +     }
> > > > +
> > > >       if (sort_order && strstr(sort_order, "ipc")) {
> > > >               parse_options_usage(report_usage, options, "s", 1);
> > > >               goto error;
> > > > diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> > > > index 0f30f843c566d..cafd693568189 100644
> > > > --- a/tools/perf/util/hist.c
> > > > +++ b/tools/perf/util/hist.c
> > > > @@ -207,6 +207,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
> > > >
> > > >       hists__new_col_len(hists, HISTC_CGROUP, 6);
> > > >       hists__new_col_len(hists, HISTC_CGROUP_ID, 20);
> > > > +     hists__new_col_len(hists, HISTC_PARALLELISM, 11);
> > > >       hists__new_col_len(hists, HISTC_CPU, 3);
> > > >       hists__new_col_len(hists, HISTC_SOCKET, 6);
> > > >       hists__new_col_len(hists, HISTC_MEM_LOCKED, 6);
> > > > @@ -741,6 +742,7 @@ __hists__add_entry(struct hists *hists,
> > > >               .ip      = al->addr,
> > > >               .level   = al->level,
> > > >               .code_page_size = sample->code_page_size,
> > > > +             .parallelism    = al->parallelism,
> > > >               .stat = {
> > > >                       .nr_events = 1,
> > > >                       .period = sample->period,
> > > > diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
> > > > index 46c8373e31465..a6e662d77dc24 100644
> > > > --- a/tools/perf/util/hist.h
> > > > +++ b/tools/perf/util/hist.h
> > > > @@ -42,6 +42,7 @@ enum hist_column {
> > > >       HISTC_CGROUP_ID,
> > > >       HISTC_CGROUP,
> > > >       HISTC_PARENT,
> > > > +     HISTC_PARALLELISM,
> > > >       HISTC_CPU,
> > > >       HISTC_SOCKET,
> > > >       HISTC_SRCLINE,
> > > > @@ -228,6 +229,7 @@ struct hist_entry {
> > > >       u64                     transaction;
> > > >       s32                     socket;
> > > >       s32                     cpu;
> > > > +     int                     parallelism;
> > >
> > > Can you make it u16 and move to around cpumode to remove paddings?
> >
> > It generally should be the same size as cpu/socket/etc. And these are
> > 32-bits throughout the codebase (the previous fields). Are there any
> > checks that MAX_NR_CPUS<64K?
>
> I don't think there's such a check.  But practically it used be 4K and
> you can add the check. :)
>
> Anyway hist_entry is getting bigger and we may need to shrink it later.
> I don't stringly insist on u16 though.  It's up to you.

Added this commit to v4:

perf hist: Shrink struct hist_entry size
https://lore.kernel.org/linux-perf-users/75dd56a2d467515dc354d8fc8d5acd841d63b565.1738592865.git.dvyukov@google.com/T/#u

It cuts 8 bytes, which I hope is enough to compensate for my 4 added bytes :)

However, it's 584 bytes, so it saves like 1%.

If hist_entry size is a problem I think the following approaches will
have more impact (not saying I want to do it as part of this series
:)):

1. Make hist_entry layout dynamic (like the kernel perf event data) to
completely exclude unused parts when they are unused.

2. Add slab allocator for hist_entry which won't round up allocation size.
E.g. if you take default TCMalloc size classes, that's 576 and then
640. So this messing with padding actually doesn't save anything. We
shave off 1%, but the actual heap block is still 10% wasted.