[PATCH v3 1/3] perf sample: Remove arch notion of sample parsing

Ian Rogers posted 3 patches 6 months, 3 weeks ago
[PATCH v3 1/3] perf sample: Remove arch notion of sample parsing
Posted by Ian Rogers 6 months, 3 weeks ago
By definition arch sample parsing and synthesis will inhibit certain
kinds of cross-platform record then analysis (report, script,
etc.). Remove arch_perf_parse_sample_weight and
arch_perf_synthesize_sample_weight replacing with a common
implementation. Combine perf_sample p_stage_cyc and retire_lat to
capture the differing uses regardless of compiled for architecture.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/perf/arch/powerpc/util/event.c       | 26 ---------------------
 tools/perf/arch/x86/tests/sample-parsing.c |  4 ++--
 tools/perf/arch/x86/util/event.c           | 27 ----------------------
 tools/perf/builtin-script.c                |  2 +-
 tools/perf/util/dlfilter.c                 |  2 +-
 tools/perf/util/event.h                    |  2 --
 tools/perf/util/evsel.c                    | 17 ++++++++++----
 tools/perf/util/hist.c                     |  4 ++--
 tools/perf/util/hist.h                     |  2 +-
 tools/perf/util/intel-tpebs.c              |  4 ++--
 tools/perf/util/sample.h                   |  5 +---
 tools/perf/util/session.c                  |  2 +-
 tools/perf/util/sort.c                     |  6 ++---
 tools/perf/util/synthetic-events.c         | 10 ++++++--
 14 files changed, 34 insertions(+), 79 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
index 77d8cc2b5691..024ac8b54c33 100644
--- a/tools/perf/arch/powerpc/util/event.c
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -11,32 +11,6 @@
 #include "../../../util/debug.h"
 #include "../../../util/sample.h"
 
-void arch_perf_parse_sample_weight(struct perf_sample *data,
-				   const __u64 *array, u64 type)
-{
-	union perf_sample_weight weight;
-
-	weight.full = *array;
-	if (type & PERF_SAMPLE_WEIGHT)
-		data->weight = weight.full;
-	else {
-		data->weight = weight.var1_dw;
-		data->ins_lat = weight.var2_w;
-		data->p_stage_cyc = weight.var3_w;
-	}
-}
-
-void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
-					__u64 *array, u64 type)
-{
-	*array = data->weight;
-
-	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
-		*array &= 0xffffffff;
-		*array |= ((u64)data->ins_lat << 32);
-	}
-}
-
 const char *arch_perf_header_entry(const char *se_header)
 {
 	if (!strcmp(se_header, "Local INSTR Latency"))
diff --git a/tools/perf/arch/x86/tests/sample-parsing.c b/tools/perf/arch/x86/tests/sample-parsing.c
index a061e8619267..95d8f7f1d2fb 100644
--- a/tools/perf/arch/x86/tests/sample-parsing.c
+++ b/tools/perf/arch/x86/tests/sample-parsing.c
@@ -29,7 +29,7 @@ static bool samples_same(const struct perf_sample *s1,
 {
 	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
 		COMP(ins_lat);
-		COMP(retire_lat);
+		COMP(p_stage_cyc_or_retire_lat);
 	}
 
 	return true;
@@ -50,7 +50,7 @@ static int do_test(u64 sample_type)
 	struct perf_sample sample = {
 		.weight		= 101,
 		.ins_lat        = 102,
-		.retire_lat     = 103,
+		.p_stage_cyc_or_retire_lat = 103,
 	};
 	struct perf_sample sample_out;
 	size_t i, sz, bufsz;
diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c
index a0400707180c..576c1c36046c 100644
--- a/tools/perf/arch/x86/util/event.c
+++ b/tools/perf/arch/x86/util/event.c
@@ -92,33 +92,6 @@ int perf_event__synthesize_extra_kmaps(const struct perf_tool *tool,
 
 #endif
 
-void arch_perf_parse_sample_weight(struct perf_sample *data,
-				   const __u64 *array, u64 type)
-{
-	union perf_sample_weight weight;
-
-	weight.full = *array;
-	if (type & PERF_SAMPLE_WEIGHT)
-		data->weight = weight.full;
-	else {
-		data->weight = weight.var1_dw;
-		data->ins_lat = weight.var2_w;
-		data->retire_lat = weight.var3_w;
-	}
-}
-
-void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
-					__u64 *array, u64 type)
-{
-	*array = data->weight;
-
-	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
-		*array &= 0xffffffff;
-		*array |= ((u64)data->ins_lat << 32);
-		*array |= ((u64)data->retire_lat << 48);
-	}
-}
-
 const char *arch_perf_header_entry(const char *se_header)
 {
 	if (!strcmp(se_header, "Local Pipeline Stage Cycle"))
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 6c3bf74dd78c..c02c435e0f0b 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2251,7 +2251,7 @@ static void process_event(struct perf_script *script,
 		fprintf(fp, "%16" PRIu16, sample->ins_lat);
 
 	if (PRINT_FIELD(RETIRE_LAT))
-		fprintf(fp, "%16" PRIu16, sample->retire_lat);
+		fprintf(fp, "%16" PRIu16, sample->p_stage_cyc_or_retire_lat);
 
 	if (PRINT_FIELD(CGROUP)) {
 		const char *cgrp_name;
diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c
index ddacef881af2..d5fd6d34a17c 100644
--- a/tools/perf/util/dlfilter.c
+++ b/tools/perf/util/dlfilter.c
@@ -513,6 +513,7 @@ int dlfilter__do_filter_event(struct dlfilter *d,
 	d->d_addr_al   = &d_addr_al;
 
 	d_sample.size  = sizeof(d_sample);
+	d_sample.p_stage_cyc = sample->p_stage_cyc_or_retire_lat;
 	d_ip_al.size   = 0; /* To indicate d_ip_al is not initialized */
 	d_addr_al.size = 0; /* To indicate d_addr_al is not initialized */
 
@@ -526,7 +527,6 @@ int dlfilter__do_filter_event(struct dlfilter *d,
 	ASSIGN(period);
 	ASSIGN(weight);
 	ASSIGN(ins_lat);
-	ASSIGN(p_stage_cyc);
 	ASSIGN(transaction);
 	ASSIGN(insn_cnt);
 	ASSIGN(cyc_cnt);
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 664bf39567ce..119bce37f4fd 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -390,8 +390,6 @@ extern unsigned int proc_map_timeout;
 #define PAGE_SIZE_NAME_LEN	32
 char *get_page_size_name(u64 size, char *str);
 
-void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type);
-void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type);
 const char *arch_perf_header_entry(const char *se_header);
 int arch_support_sort_key(const char *sort_key);
 
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index d55482f094bf..27de167855ee 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2846,11 +2846,18 @@ perf_event__check_size(union perf_event *event, unsigned int sample_size)
 	return 0;
 }
 
-void __weak arch_perf_parse_sample_weight(struct perf_sample *data,
-					  const __u64 *array,
-					  u64 type __maybe_unused)
+static void perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type)
 {
-	data->weight = *array;
+	union perf_sample_weight weight;
+
+	weight.full = *array;
+	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
+		data->weight = weight.var1_dw;
+		data->ins_lat = weight.var2_w;
+		data->p_stage_cyc_or_retire_lat = weight.var3_w;
+	} else {
+		data->weight = weight.full;
+	}
 }
 
 u64 evsel__bitfield_swap_branch_flags(u64 value)
@@ -3236,7 +3243,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 
 	if (type & PERF_SAMPLE_WEIGHT_TYPE) {
 		OVERFLOW_CHECK_u64(array);
-		arch_perf_parse_sample_weight(data, array, type);
+		perf_parse_sample_weight(data, array, type);
 		array++;
 	}
 
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index afc6855327ab..ae9803dca0b1 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -829,7 +829,7 @@ __hists__add_entry(struct hists *hists,
 			.period	= sample->period,
 			.weight1 = sample->weight,
 			.weight2 = sample->ins_lat,
-			.weight3 = sample->p_stage_cyc,
+			.weight3 = sample->p_stage_cyc_or_retire_lat,
 			.latency = al->latency,
 		},
 		.parent = sym_parent,
@@ -846,7 +846,7 @@ __hists__add_entry(struct hists *hists,
 		.time = hist_time(sample->time),
 		.weight = sample->weight,
 		.ins_lat = sample->ins_lat,
-		.p_stage_cyc = sample->p_stage_cyc,
+		.p_stage_cyc_or_retire_lat = sample->p_stage_cyc_or_retire_lat,
 		.simd_flags = sample->simd_flags,
 	}, *he = hists__findnew_entry(hists, &entry, al, sample_self);
 
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index c64254088fc7..67033bdabcf4 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -255,7 +255,7 @@ struct hist_entry {
 	u64			code_page_size;
 	u64			weight;
 	u64			ins_lat;
-	u64			p_stage_cyc;
+	u64			p_stage_cyc_or_retire_lat;
 	s32			socket;
 	s32			cpu;
 	int			parallelism;
diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
index 4ad4bc118ea5..ec2f3ecf1e1c 100644
--- a/tools/perf/util/intel-tpebs.c
+++ b/tools/perf/util/intel-tpebs.c
@@ -202,8 +202,8 @@ static int process_sample_event(const struct perf_tool *tool __maybe_unused,
 	 * latency value will be used. Save the number of samples and the sum of
 	 * retire latency value for each event.
 	 */
-	t->last = sample->retire_lat;
-	update_stats(&t->stats, sample->retire_lat);
+	t->last = sample->p_stage_cyc_or_retire_lat;
+	update_stats(&t->stats, sample->p_stage_cyc_or_retire_lat);
 	mutex_unlock(tpebs_mtx_get());
 	return 0;
 }
diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index 0e96240052e9..3330d18fb5fd 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -104,10 +104,7 @@ struct perf_sample {
 	u8  cpumode;
 	u16 misc;
 	u16 ins_lat;
-	union {
-		u16 p_stage_cyc;
-		u16 retire_lat;
-	};
+	u16 p_stage_cyc_or_retire_lat;
 	bool no_hw_idx;		/* No hw_idx collected in branch_stack */
 	char insn[MAX_INSN];
 	void *raw_data;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index a320672c264e..451bc24ccfba 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1094,7 +1094,7 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
 		printf("... weight: %" PRIu64 "", sample->weight);
 			if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
 				printf(",0x%"PRIx16"", sample->ins_lat);
-				printf(",0x%"PRIx16"", sample->p_stage_cyc);
+				printf(",0x%"PRIx16"", sample->p_stage_cyc_or_retire_lat);
 			}
 		printf("\n");
 	}
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 45e654653960..dda4ef0b5a73 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -1879,21 +1879,21 @@ struct sort_entry sort_global_ins_lat = {
 static int64_t
 sort__p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	return left->p_stage_cyc - right->p_stage_cyc;
+	return left->p_stage_cyc_or_retire_lat - right->p_stage_cyc_or_retire_lat;
 }
 
 static int hist_entry__global_p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
 					size_t size, unsigned int width)
 {
 	return repsep_snprintf(bf, size, "%-*u", width,
-			he->p_stage_cyc * he->stat.nr_events);
+			he->p_stage_cyc_or_retire_lat * he->stat.nr_events);
 }
 
 
 static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
 					size_t size, unsigned int width)
 {
-	return repsep_snprintf(bf, size, "%-*u", width, he->p_stage_cyc);
+	return repsep_snprintf(bf, size, "%-*u", width, he->p_stage_cyc_or_retire_lat);
 }
 
 struct sort_entry sort_local_p_stage_cyc = {
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index 2fc4d0537840..449a41900fc4 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -1567,10 +1567,16 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
 	return result;
 }
 
-void __weak arch_perf_synthesize_sample_weight(const struct perf_sample *data,
+static void perf_synthesize_sample_weight(const struct perf_sample *data,
 					       __u64 *array, u64 type __maybe_unused)
 {
 	*array = data->weight;
+
+	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
+		*array &= 0xffffffff;
+		*array |= ((u64)data->ins_lat << 32);
+		*array |= ((u64)data->p_stage_cyc_or_retire_lat << 48);
+	}
 }
 
 static __u64 *copy_read_group_values(__u64 *array, __u64 read_format,
@@ -1730,7 +1736,7 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo
 	}
 
 	if (type & PERF_SAMPLE_WEIGHT_TYPE) {
-		arch_perf_synthesize_sample_weight(sample, array, type);
+		perf_synthesize_sample_weight(sample, array, type);
 		array++;
 	}
 
-- 
2.49.0.1143.g0be31eac6b-goog
Re: [PATCH v3 1/3] perf sample: Remove arch notion of sample parsing
Posted by Namhyung Kim 6 months, 3 weeks ago
On Wed, May 21, 2025 at 09:53:15AM -0700, Ian Rogers wrote:
> By definition arch sample parsing and synthesis will inhibit certain
> kinds of cross-platform record then analysis (report, script,
> etc.). Remove arch_perf_parse_sample_weight and
> arch_perf_synthesize_sample_weight replacing with a common
> implementation. Combine perf_sample p_stage_cyc and retire_lat to
> capture the differing uses regardless of compiled for architecture.

Can you please do this without renaming?  It can be a separate patch but
I think we can just leave it.

Thanks,
Namhyung

> 
> Signed-off-by: Ian Rogers <irogers@google.com>
> ---
>  tools/perf/arch/powerpc/util/event.c       | 26 ---------------------
>  tools/perf/arch/x86/tests/sample-parsing.c |  4 ++--
>  tools/perf/arch/x86/util/event.c           | 27 ----------------------
>  tools/perf/builtin-script.c                |  2 +-
>  tools/perf/util/dlfilter.c                 |  2 +-
>  tools/perf/util/event.h                    |  2 --
>  tools/perf/util/evsel.c                    | 17 ++++++++++----
>  tools/perf/util/hist.c                     |  4 ++--
>  tools/perf/util/hist.h                     |  2 +-
>  tools/perf/util/intel-tpebs.c              |  4 ++--
>  tools/perf/util/sample.h                   |  5 +---
>  tools/perf/util/session.c                  |  2 +-
>  tools/perf/util/sort.c                     |  6 ++---
>  tools/perf/util/synthetic-events.c         | 10 ++++++--
>  14 files changed, 34 insertions(+), 79 deletions(-)
> 
> diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
> index 77d8cc2b5691..024ac8b54c33 100644
> --- a/tools/perf/arch/powerpc/util/event.c
> +++ b/tools/perf/arch/powerpc/util/event.c
> @@ -11,32 +11,6 @@
>  #include "../../../util/debug.h"
>  #include "../../../util/sample.h"
>  
> -void arch_perf_parse_sample_weight(struct perf_sample *data,
> -				   const __u64 *array, u64 type)
> -{
> -	union perf_sample_weight weight;
> -
> -	weight.full = *array;
> -	if (type & PERF_SAMPLE_WEIGHT)
> -		data->weight = weight.full;
> -	else {
> -		data->weight = weight.var1_dw;
> -		data->ins_lat = weight.var2_w;
> -		data->p_stage_cyc = weight.var3_w;
> -	}
> -}
> -
> -void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
> -					__u64 *array, u64 type)
> -{
> -	*array = data->weight;
> -
> -	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
> -		*array &= 0xffffffff;
> -		*array |= ((u64)data->ins_lat << 32);
> -	}
> -}
> -
>  const char *arch_perf_header_entry(const char *se_header)
>  {
>  	if (!strcmp(se_header, "Local INSTR Latency"))
> diff --git a/tools/perf/arch/x86/tests/sample-parsing.c b/tools/perf/arch/x86/tests/sample-parsing.c
> index a061e8619267..95d8f7f1d2fb 100644
> --- a/tools/perf/arch/x86/tests/sample-parsing.c
> +++ b/tools/perf/arch/x86/tests/sample-parsing.c
> @@ -29,7 +29,7 @@ static bool samples_same(const struct perf_sample *s1,
>  {
>  	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
>  		COMP(ins_lat);
> -		COMP(retire_lat);
> +		COMP(p_stage_cyc_or_retire_lat);
>  	}
>  
>  	return true;
> @@ -50,7 +50,7 @@ static int do_test(u64 sample_type)
>  	struct perf_sample sample = {
>  		.weight		= 101,
>  		.ins_lat        = 102,
> -		.retire_lat     = 103,
> +		.p_stage_cyc_or_retire_lat = 103,
>  	};
>  	struct perf_sample sample_out;
>  	size_t i, sz, bufsz;
> diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c
> index a0400707180c..576c1c36046c 100644
> --- a/tools/perf/arch/x86/util/event.c
> +++ b/tools/perf/arch/x86/util/event.c
> @@ -92,33 +92,6 @@ int perf_event__synthesize_extra_kmaps(const struct perf_tool *tool,
>  
>  #endif
>  
> -void arch_perf_parse_sample_weight(struct perf_sample *data,
> -				   const __u64 *array, u64 type)
> -{
> -	union perf_sample_weight weight;
> -
> -	weight.full = *array;
> -	if (type & PERF_SAMPLE_WEIGHT)
> -		data->weight = weight.full;
> -	else {
> -		data->weight = weight.var1_dw;
> -		data->ins_lat = weight.var2_w;
> -		data->retire_lat = weight.var3_w;
> -	}
> -}
> -
> -void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
> -					__u64 *array, u64 type)
> -{
> -	*array = data->weight;
> -
> -	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
> -		*array &= 0xffffffff;
> -		*array |= ((u64)data->ins_lat << 32);
> -		*array |= ((u64)data->retire_lat << 48);
> -	}
> -}
> -
>  const char *arch_perf_header_entry(const char *se_header)
>  {
>  	if (!strcmp(se_header, "Local Pipeline Stage Cycle"))
> diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
> index 6c3bf74dd78c..c02c435e0f0b 100644
> --- a/tools/perf/builtin-script.c
> +++ b/tools/perf/builtin-script.c
> @@ -2251,7 +2251,7 @@ static void process_event(struct perf_script *script,
>  		fprintf(fp, "%16" PRIu16, sample->ins_lat);
>  
>  	if (PRINT_FIELD(RETIRE_LAT))
> -		fprintf(fp, "%16" PRIu16, sample->retire_lat);
> +		fprintf(fp, "%16" PRIu16, sample->p_stage_cyc_or_retire_lat);
>  
>  	if (PRINT_FIELD(CGROUP)) {
>  		const char *cgrp_name;
> diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c
> index ddacef881af2..d5fd6d34a17c 100644
> --- a/tools/perf/util/dlfilter.c
> +++ b/tools/perf/util/dlfilter.c
> @@ -513,6 +513,7 @@ int dlfilter__do_filter_event(struct dlfilter *d,
>  	d->d_addr_al   = &d_addr_al;
>  
>  	d_sample.size  = sizeof(d_sample);
> +	d_sample.p_stage_cyc = sample->p_stage_cyc_or_retire_lat;
>  	d_ip_al.size   = 0; /* To indicate d_ip_al is not initialized */
>  	d_addr_al.size = 0; /* To indicate d_addr_al is not initialized */
>  
> @@ -526,7 +527,6 @@ int dlfilter__do_filter_event(struct dlfilter *d,
>  	ASSIGN(period);
>  	ASSIGN(weight);
>  	ASSIGN(ins_lat);
> -	ASSIGN(p_stage_cyc);
>  	ASSIGN(transaction);
>  	ASSIGN(insn_cnt);
>  	ASSIGN(cyc_cnt);
> diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
> index 664bf39567ce..119bce37f4fd 100644
> --- a/tools/perf/util/event.h
> +++ b/tools/perf/util/event.h
> @@ -390,8 +390,6 @@ extern unsigned int proc_map_timeout;
>  #define PAGE_SIZE_NAME_LEN	32
>  char *get_page_size_name(u64 size, char *str);
>  
> -void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type);
> -void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type);
>  const char *arch_perf_header_entry(const char *se_header);
>  int arch_support_sort_key(const char *sort_key);
>  
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index d55482f094bf..27de167855ee 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -2846,11 +2846,18 @@ perf_event__check_size(union perf_event *event, unsigned int sample_size)
>  	return 0;
>  }
>  
> -void __weak arch_perf_parse_sample_weight(struct perf_sample *data,
> -					  const __u64 *array,
> -					  u64 type __maybe_unused)
> +static void perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type)
>  {
> -	data->weight = *array;
> +	union perf_sample_weight weight;
> +
> +	weight.full = *array;
> +	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
> +		data->weight = weight.var1_dw;
> +		data->ins_lat = weight.var2_w;
> +		data->p_stage_cyc_or_retire_lat = weight.var3_w;
> +	} else {
> +		data->weight = weight.full;
> +	}
>  }
>  
>  u64 evsel__bitfield_swap_branch_flags(u64 value)
> @@ -3236,7 +3243,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
>  
>  	if (type & PERF_SAMPLE_WEIGHT_TYPE) {
>  		OVERFLOW_CHECK_u64(array);
> -		arch_perf_parse_sample_weight(data, array, type);
> +		perf_parse_sample_weight(data, array, type);
>  		array++;
>  	}
>  
> diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> index afc6855327ab..ae9803dca0b1 100644
> --- a/tools/perf/util/hist.c
> +++ b/tools/perf/util/hist.c
> @@ -829,7 +829,7 @@ __hists__add_entry(struct hists *hists,
>  			.period	= sample->period,
>  			.weight1 = sample->weight,
>  			.weight2 = sample->ins_lat,
> -			.weight3 = sample->p_stage_cyc,
> +			.weight3 = sample->p_stage_cyc_or_retire_lat,
>  			.latency = al->latency,
>  		},
>  		.parent = sym_parent,
> @@ -846,7 +846,7 @@ __hists__add_entry(struct hists *hists,
>  		.time = hist_time(sample->time),
>  		.weight = sample->weight,
>  		.ins_lat = sample->ins_lat,
> -		.p_stage_cyc = sample->p_stage_cyc,
> +		.p_stage_cyc_or_retire_lat = sample->p_stage_cyc_or_retire_lat,
>  		.simd_flags = sample->simd_flags,
>  	}, *he = hists__findnew_entry(hists, &entry, al, sample_self);
>  
> diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
> index c64254088fc7..67033bdabcf4 100644
> --- a/tools/perf/util/hist.h
> +++ b/tools/perf/util/hist.h
> @@ -255,7 +255,7 @@ struct hist_entry {
>  	u64			code_page_size;
>  	u64			weight;
>  	u64			ins_lat;
> -	u64			p_stage_cyc;
> +	u64			p_stage_cyc_or_retire_lat;
>  	s32			socket;
>  	s32			cpu;
>  	int			parallelism;
> diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> index 4ad4bc118ea5..ec2f3ecf1e1c 100644
> --- a/tools/perf/util/intel-tpebs.c
> +++ b/tools/perf/util/intel-tpebs.c
> @@ -202,8 +202,8 @@ static int process_sample_event(const struct perf_tool *tool __maybe_unused,
>  	 * latency value will be used. Save the number of samples and the sum of
>  	 * retire latency value for each event.
>  	 */
> -	t->last = sample->retire_lat;
> -	update_stats(&t->stats, sample->retire_lat);
> +	t->last = sample->p_stage_cyc_or_retire_lat;
> +	update_stats(&t->stats, sample->p_stage_cyc_or_retire_lat);
>  	mutex_unlock(tpebs_mtx_get());
>  	return 0;
>  }
> diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
> index 0e96240052e9..3330d18fb5fd 100644
> --- a/tools/perf/util/sample.h
> +++ b/tools/perf/util/sample.h
> @@ -104,10 +104,7 @@ struct perf_sample {
>  	u8  cpumode;
>  	u16 misc;
>  	u16 ins_lat;
> -	union {
> -		u16 p_stage_cyc;
> -		u16 retire_lat;
> -	};
> +	u16 p_stage_cyc_or_retire_lat;
>  	bool no_hw_idx;		/* No hw_idx collected in branch_stack */
>  	char insn[MAX_INSN];
>  	void *raw_data;
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index a320672c264e..451bc24ccfba 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -1094,7 +1094,7 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
>  		printf("... weight: %" PRIu64 "", sample->weight);
>  			if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
>  				printf(",0x%"PRIx16"", sample->ins_lat);
> -				printf(",0x%"PRIx16"", sample->p_stage_cyc);
> +				printf(",0x%"PRIx16"", sample->p_stage_cyc_or_retire_lat);
>  			}
>  		printf("\n");
>  	}
> diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
> index 45e654653960..dda4ef0b5a73 100644
> --- a/tools/perf/util/sort.c
> +++ b/tools/perf/util/sort.c
> @@ -1879,21 +1879,21 @@ struct sort_entry sort_global_ins_lat = {
>  static int64_t
>  sort__p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right)
>  {
> -	return left->p_stage_cyc - right->p_stage_cyc;
> +	return left->p_stage_cyc_or_retire_lat - right->p_stage_cyc_or_retire_lat;
>  }
>  
>  static int hist_entry__global_p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
>  					size_t size, unsigned int width)
>  {
>  	return repsep_snprintf(bf, size, "%-*u", width,
> -			he->p_stage_cyc * he->stat.nr_events);
> +			he->p_stage_cyc_or_retire_lat * he->stat.nr_events);
>  }
>  
>  
>  static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
>  					size_t size, unsigned int width)
>  {
> -	return repsep_snprintf(bf, size, "%-*u", width, he->p_stage_cyc);
> +	return repsep_snprintf(bf, size, "%-*u", width, he->p_stage_cyc_or_retire_lat);
>  }
>  
>  struct sort_entry sort_local_p_stage_cyc = {
> diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
> index 2fc4d0537840..449a41900fc4 100644
> --- a/tools/perf/util/synthetic-events.c
> +++ b/tools/perf/util/synthetic-events.c
> @@ -1567,10 +1567,16 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
>  	return result;
>  }
>  
> -void __weak arch_perf_synthesize_sample_weight(const struct perf_sample *data,
> +static void perf_synthesize_sample_weight(const struct perf_sample *data,
>  					       __u64 *array, u64 type __maybe_unused)
>  {
>  	*array = data->weight;
> +
> +	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
> +		*array &= 0xffffffff;
> +		*array |= ((u64)data->ins_lat << 32);
> +		*array |= ((u64)data->p_stage_cyc_or_retire_lat << 48);
> +	}
>  }
>  
>  static __u64 *copy_read_group_values(__u64 *array, __u64 read_format,
> @@ -1730,7 +1736,7 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo
>  	}
>  
>  	if (type & PERF_SAMPLE_WEIGHT_TYPE) {
> -		arch_perf_synthesize_sample_weight(sample, array, type);
> +		perf_synthesize_sample_weight(sample, array, type);
>  		array++;
>  	}
>  
> -- 
> 2.49.0.1143.g0be31eac6b-goog
>
Re: [PATCH v3 1/3] perf sample: Remove arch notion of sample parsing
Posted by Ian Rogers 6 months, 3 weeks ago
On Wed, May 21, 2025 at 1:27 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Wed, May 21, 2025 at 09:53:15AM -0700, Ian Rogers wrote:
> > By definition arch sample parsing and synthesis will inhibit certain
> > kinds of cross-platform record then analysis (report, script,
> > etc.). Remove arch_perf_parse_sample_weight and
> > arch_perf_synthesize_sample_weight replacing with a common
> > implementation. Combine perf_sample p_stage_cyc and retire_lat to
> > capture the differing uses regardless of compiled for architecture.
>
> Can you please do this without renaming?  It can be a separate patch but
> I think we can just leave it.

It is not clear what the use of the union is. Presumably it is a
tagged union but there is no tag as the union value to use is implied
by either being built on x86_64 or powerpc. The change removes the
notion of this code being built for x86_64 or powerpc and so the union
value to use isn't clear (e.g. should arm use p_stage_cyc or
retire_lat from the union), hence combining to show that it could be
one or the other. The code could be:
```
#ifdef __x86_64__
       u16 p_stage_cyc;
#elif defined(powerpc)
       u16 retire_lat;
#endif
```
but this isn't cross-platform. The change in hist.h of
```
@@ -255,7 +255,7 @@ struct hist_entry {
        u64                     code_page_size;
        u64                     weight;
        u64                     ins_lat;
-       u64                     p_stage_cyc;
+       u64                     p_stage_cyc_or_retire_lat;
```
could be a follow up CL, but then we lose something of what the field
is holding given the value is just a copy of that same named value in
perf_sample. The code only inserts 34 lines and so the churn of doing
that seemed worse than having the change in a single patch for
clarity.

Thanks,
Ian

> Thanks,
> Namhyung
>
> >
> > Signed-off-by: Ian Rogers <irogers@google.com>
> > ---
> >  tools/perf/arch/powerpc/util/event.c       | 26 ---------------------
> >  tools/perf/arch/x86/tests/sample-parsing.c |  4 ++--
> >  tools/perf/arch/x86/util/event.c           | 27 ----------------------
> >  tools/perf/builtin-script.c                |  2 +-
> >  tools/perf/util/dlfilter.c                 |  2 +-
> >  tools/perf/util/event.h                    |  2 --
> >  tools/perf/util/evsel.c                    | 17 ++++++++++----
> >  tools/perf/util/hist.c                     |  4 ++--
> >  tools/perf/util/hist.h                     |  2 +-
> >  tools/perf/util/intel-tpebs.c              |  4 ++--
> >  tools/perf/util/sample.h                   |  5 +---
> >  tools/perf/util/session.c                  |  2 +-
> >  tools/perf/util/sort.c                     |  6 ++---
> >  tools/perf/util/synthetic-events.c         | 10 ++++++--
> >  14 files changed, 34 insertions(+), 79 deletions(-)
> >
> > diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
> > index 77d8cc2b5691..024ac8b54c33 100644
> > --- a/tools/perf/arch/powerpc/util/event.c
> > +++ b/tools/perf/arch/powerpc/util/event.c
> > @@ -11,32 +11,6 @@
> >  #include "../../../util/debug.h"
> >  #include "../../../util/sample.h"
> >
> > -void arch_perf_parse_sample_weight(struct perf_sample *data,
> > -                                const __u64 *array, u64 type)
> > -{
> > -     union perf_sample_weight weight;
> > -
> > -     weight.full = *array;
> > -     if (type & PERF_SAMPLE_WEIGHT)
> > -             data->weight = weight.full;
> > -     else {
> > -             data->weight = weight.var1_dw;
> > -             data->ins_lat = weight.var2_w;
> > -             data->p_stage_cyc = weight.var3_w;
> > -     }
> > -}
> > -
> > -void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
> > -                                     __u64 *array, u64 type)
> > -{
> > -     *array = data->weight;
> > -
> > -     if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
> > -             *array &= 0xffffffff;
> > -             *array |= ((u64)data->ins_lat << 32);
> > -     }
> > -}
> > -
> >  const char *arch_perf_header_entry(const char *se_header)
> >  {
> >       if (!strcmp(se_header, "Local INSTR Latency"))
> > diff --git a/tools/perf/arch/x86/tests/sample-parsing.c b/tools/perf/arch/x86/tests/sample-parsing.c
> > index a061e8619267..95d8f7f1d2fb 100644
> > --- a/tools/perf/arch/x86/tests/sample-parsing.c
> > +++ b/tools/perf/arch/x86/tests/sample-parsing.c
> > @@ -29,7 +29,7 @@ static bool samples_same(const struct perf_sample *s1,
> >  {
> >       if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
> >               COMP(ins_lat);
> > -             COMP(retire_lat);
> > +             COMP(p_stage_cyc_or_retire_lat);
> >       }
> >
> >       return true;
> > @@ -50,7 +50,7 @@ static int do_test(u64 sample_type)
> >       struct perf_sample sample = {
> >               .weight         = 101,
> >               .ins_lat        = 102,
> > -             .retire_lat     = 103,
> > +             .p_stage_cyc_or_retire_lat = 103,
> >       };
> >       struct perf_sample sample_out;
> >       size_t i, sz, bufsz;
> > diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c
> > index a0400707180c..576c1c36046c 100644
> > --- a/tools/perf/arch/x86/util/event.c
> > +++ b/tools/perf/arch/x86/util/event.c
> > @@ -92,33 +92,6 @@ int perf_event__synthesize_extra_kmaps(const struct perf_tool *tool,
> >
> >  #endif
> >
> > -void arch_perf_parse_sample_weight(struct perf_sample *data,
> > -                                const __u64 *array, u64 type)
> > -{
> > -     union perf_sample_weight weight;
> > -
> > -     weight.full = *array;
> > -     if (type & PERF_SAMPLE_WEIGHT)
> > -             data->weight = weight.full;
> > -     else {
> > -             data->weight = weight.var1_dw;
> > -             data->ins_lat = weight.var2_w;
> > -             data->retire_lat = weight.var3_w;
> > -     }
> > -}
> > -
> > -void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
> > -                                     __u64 *array, u64 type)
> > -{
> > -     *array = data->weight;
> > -
> > -     if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
> > -             *array &= 0xffffffff;
> > -             *array |= ((u64)data->ins_lat << 32);
> > -             *array |= ((u64)data->retire_lat << 48);
> > -     }
> > -}
> > -
> >  const char *arch_perf_header_entry(const char *se_header)
> >  {
> >       if (!strcmp(se_header, "Local Pipeline Stage Cycle"))
> > diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
> > index 6c3bf74dd78c..c02c435e0f0b 100644
> > --- a/tools/perf/builtin-script.c
> > +++ b/tools/perf/builtin-script.c
> > @@ -2251,7 +2251,7 @@ static void process_event(struct perf_script *script,
> >               fprintf(fp, "%16" PRIu16, sample->ins_lat);
> >
> >       if (PRINT_FIELD(RETIRE_LAT))
> > -             fprintf(fp, "%16" PRIu16, sample->retire_lat);
> > +             fprintf(fp, "%16" PRIu16, sample->p_stage_cyc_or_retire_lat);
> >
> >       if (PRINT_FIELD(CGROUP)) {
> >               const char *cgrp_name;
> > diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c
> > index ddacef881af2..d5fd6d34a17c 100644
> > --- a/tools/perf/util/dlfilter.c
> > +++ b/tools/perf/util/dlfilter.c
> > @@ -513,6 +513,7 @@ int dlfilter__do_filter_event(struct dlfilter *d,
> >       d->d_addr_al   = &d_addr_al;
> >
> >       d_sample.size  = sizeof(d_sample);
> > +     d_sample.p_stage_cyc = sample->p_stage_cyc_or_retire_lat;
> >       d_ip_al.size   = 0; /* To indicate d_ip_al is not initialized */
> >       d_addr_al.size = 0; /* To indicate d_addr_al is not initialized */
> >
> > @@ -526,7 +527,6 @@ int dlfilter__do_filter_event(struct dlfilter *d,
> >       ASSIGN(period);
> >       ASSIGN(weight);
> >       ASSIGN(ins_lat);
> > -     ASSIGN(p_stage_cyc);
> >       ASSIGN(transaction);
> >       ASSIGN(insn_cnt);
> >       ASSIGN(cyc_cnt);
> > diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
> > index 664bf39567ce..119bce37f4fd 100644
> > --- a/tools/perf/util/event.h
> > +++ b/tools/perf/util/event.h
> > @@ -390,8 +390,6 @@ extern unsigned int proc_map_timeout;
> >  #define PAGE_SIZE_NAME_LEN   32
> >  char *get_page_size_name(u64 size, char *str);
> >
> > -void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type);
> > -void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type);
> >  const char *arch_perf_header_entry(const char *se_header);
> >  int arch_support_sort_key(const char *sort_key);
> >
> > diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> > index d55482f094bf..27de167855ee 100644
> > --- a/tools/perf/util/evsel.c
> > +++ b/tools/perf/util/evsel.c
> > @@ -2846,11 +2846,18 @@ perf_event__check_size(union perf_event *event, unsigned int sample_size)
> >       return 0;
> >  }
> >
> > -void __weak arch_perf_parse_sample_weight(struct perf_sample *data,
> > -                                       const __u64 *array,
> > -                                       u64 type __maybe_unused)
> > +static void perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type)
> >  {
> > -     data->weight = *array;
> > +     union perf_sample_weight weight;
> > +
> > +     weight.full = *array;
> > +     if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
> > +             data->weight = weight.var1_dw;
> > +             data->ins_lat = weight.var2_w;
> > +             data->p_stage_cyc_or_retire_lat = weight.var3_w;
> > +     } else {
> > +             data->weight = weight.full;
> > +     }
> >  }
> >
> >  u64 evsel__bitfield_swap_branch_flags(u64 value)
> > @@ -3236,7 +3243,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
> >
> >       if (type & PERF_SAMPLE_WEIGHT_TYPE) {
> >               OVERFLOW_CHECK_u64(array);
> > -             arch_perf_parse_sample_weight(data, array, type);
> > +             perf_parse_sample_weight(data, array, type);
> >               array++;
> >       }
> >
> > diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> > index afc6855327ab..ae9803dca0b1 100644
> > --- a/tools/perf/util/hist.c
> > +++ b/tools/perf/util/hist.c
> > @@ -829,7 +829,7 @@ __hists__add_entry(struct hists *hists,
> >                       .period = sample->period,
> >                       .weight1 = sample->weight,
> >                       .weight2 = sample->ins_lat,
> > -                     .weight3 = sample->p_stage_cyc,
> > +                     .weight3 = sample->p_stage_cyc_or_retire_lat,
> >                       .latency = al->latency,
> >               },
> >               .parent = sym_parent,
> > @@ -846,7 +846,7 @@ __hists__add_entry(struct hists *hists,
> >               .time = hist_time(sample->time),
> >               .weight = sample->weight,
> >               .ins_lat = sample->ins_lat,
> > -             .p_stage_cyc = sample->p_stage_cyc,
> > +             .p_stage_cyc_or_retire_lat = sample->p_stage_cyc_or_retire_lat,
> >               .simd_flags = sample->simd_flags,
> >       }, *he = hists__findnew_entry(hists, &entry, al, sample_self);
> >
> > diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
> > index c64254088fc7..67033bdabcf4 100644
> > --- a/tools/perf/util/hist.h
> > +++ b/tools/perf/util/hist.h
> > @@ -255,7 +255,7 @@ struct hist_entry {
> >       u64                     code_page_size;
> >       u64                     weight;
> >       u64                     ins_lat;
> > -     u64                     p_stage_cyc;
> > +     u64                     p_stage_cyc_or_retire_lat;
> >       s32                     socket;
> >       s32                     cpu;
> >       int                     parallelism;
> > diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
> > index 4ad4bc118ea5..ec2f3ecf1e1c 100644
> > --- a/tools/perf/util/intel-tpebs.c
> > +++ b/tools/perf/util/intel-tpebs.c
> > @@ -202,8 +202,8 @@ static int process_sample_event(const struct perf_tool *tool __maybe_unused,
> >        * latency value will be used. Save the number of samples and the sum of
> >        * retire latency value for each event.
> >        */
> > -     t->last = sample->retire_lat;
> > -     update_stats(&t->stats, sample->retire_lat);
> > +     t->last = sample->p_stage_cyc_or_retire_lat;
> > +     update_stats(&t->stats, sample->p_stage_cyc_or_retire_lat);
> >       mutex_unlock(tpebs_mtx_get());
> >       return 0;
> >  }
> > diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
> > index 0e96240052e9..3330d18fb5fd 100644
> > --- a/tools/perf/util/sample.h
> > +++ b/tools/perf/util/sample.h
> > @@ -104,10 +104,7 @@ struct perf_sample {
> >       u8  cpumode;
> >       u16 misc;
> >       u16 ins_lat;
> > -     union {
> > -             u16 p_stage_cyc;
> > -             u16 retire_lat;
> > -     };
> > +     u16 p_stage_cyc_or_retire_lat;
> >       bool no_hw_idx;         /* No hw_idx collected in branch_stack */
> >       char insn[MAX_INSN];
> >       void *raw_data;
> > diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> > index a320672c264e..451bc24ccfba 100644
> > --- a/tools/perf/util/session.c
> > +++ b/tools/perf/util/session.c
> > @@ -1094,7 +1094,7 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
> >               printf("... weight: %" PRIu64 "", sample->weight);
> >                       if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
> >                               printf(",0x%"PRIx16"", sample->ins_lat);
> > -                             printf(",0x%"PRIx16"", sample->p_stage_cyc);
> > +                             printf(",0x%"PRIx16"", sample->p_stage_cyc_or_retire_lat);
> >                       }
> >               printf("\n");
> >       }
> > diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
> > index 45e654653960..dda4ef0b5a73 100644
> > --- a/tools/perf/util/sort.c
> > +++ b/tools/perf/util/sort.c
> > @@ -1879,21 +1879,21 @@ struct sort_entry sort_global_ins_lat = {
> >  static int64_t
> >  sort__p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right)
> >  {
> > -     return left->p_stage_cyc - right->p_stage_cyc;
> > +     return left->p_stage_cyc_or_retire_lat - right->p_stage_cyc_or_retire_lat;
> >  }
> >
> >  static int hist_entry__global_p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
> >                                       size_t size, unsigned int width)
> >  {
> >       return repsep_snprintf(bf, size, "%-*u", width,
> > -                     he->p_stage_cyc * he->stat.nr_events);
> > +                     he->p_stage_cyc_or_retire_lat * he->stat.nr_events);
> >  }
> >
> >
> >  static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
> >                                       size_t size, unsigned int width)
> >  {
> > -     return repsep_snprintf(bf, size, "%-*u", width, he->p_stage_cyc);
> > +     return repsep_snprintf(bf, size, "%-*u", width, he->p_stage_cyc_or_retire_lat);
> >  }
> >
> >  struct sort_entry sort_local_p_stage_cyc = {
> > diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
> > index 2fc4d0537840..449a41900fc4 100644
> > --- a/tools/perf/util/synthetic-events.c
> > +++ b/tools/perf/util/synthetic-events.c
> > @@ -1567,10 +1567,16 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
> >       return result;
> >  }
> >
> > -void __weak arch_perf_synthesize_sample_weight(const struct perf_sample *data,
> > +static void perf_synthesize_sample_weight(const struct perf_sample *data,
> >                                              __u64 *array, u64 type __maybe_unused)
> >  {
> >       *array = data->weight;
> > +
> > +     if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
> > +             *array &= 0xffffffff;
> > +             *array |= ((u64)data->ins_lat << 32);
> > +             *array |= ((u64)data->p_stage_cyc_or_retire_lat << 48);
> > +     }
> >  }
> >
> >  static __u64 *copy_read_group_values(__u64 *array, __u64 read_format,
> > @@ -1730,7 +1736,7 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo
> >       }
> >
> >       if (type & PERF_SAMPLE_WEIGHT_TYPE) {
> > -             arch_perf_synthesize_sample_weight(sample, array, type);
> > +             perf_synthesize_sample_weight(sample, array, type);
> >               array++;
> >       }
> >
> > --
> > 2.49.0.1143.g0be31eac6b-goog
> >
Re: [PATCH v3 1/3] perf sample: Remove arch notion of sample parsing
Posted by Namhyung Kim 6 months, 2 weeks ago
On Wed, May 21, 2025 at 02:15:24PM -0700, Ian Rogers wrote:
> On Wed, May 21, 2025 at 1:27 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Wed, May 21, 2025 at 09:53:15AM -0700, Ian Rogers wrote:
> > > By definition arch sample parsing and synthesis will inhibit certain
> > > kinds of cross-platform record then analysis (report, script,
> > > etc.). Remove arch_perf_parse_sample_weight and
> > > arch_perf_synthesize_sample_weight replacing with a common
> > > implementation. Combine perf_sample p_stage_cyc and retire_lat to
> > > capture the differing uses regardless of compiled for architecture.
> >
> > Can you please do this without renaming?  It can be a separate patch but
> > I think we can just leave it.
> 
> It is not clear what the use of the union is. Presumably it is a
> tagged union but there is no tag as the union value to use is implied
> by either being built on x86_64 or powerpc. The change removes the
> notion of this code being built for x86_64 or powerpc and so the union
> value to use isn't clear (e.g. should arm use p_stage_cyc or
> retire_lat from the union), hence combining to show that it could be
> one or the other. The code could be:
> ```
> #ifdef __x86_64__
>        u16 p_stage_cyc;
> #elif defined(powerpc)
>        u16 retire_lat;
> #endif
> ```
> but this isn't cross-platform.

Right, we probably don't want it.


> The change in hist.h of
> ```
> @@ -255,7 +255,7 @@ struct hist_entry {
>         u64                     code_page_size;
>         u64                     weight;
>         u64                     ins_lat;
> -       u64                     p_stage_cyc;
> +       u64                     p_stage_cyc_or_retire_lat;
> ```
> could be a follow up CL, but then we lose something of what the field
> is holding given the value is just a copy of that same named value in
> perf_sample. The code only inserts 34 lines and so the churn of doing
> that seemed worse than having the change in a single patch for
> clarity.

Assuming other archs can add something later, we won't rename the field
again.  So I can live with the ugly union fields.  If we really want to
rename it, I prefer calling it just 'weight3' and let the archs handle
the display name only.

Thanks,
Namhyung

Re: [PATCH v3 1/3] perf sample: Remove arch notion of sample parsing
Posted by Ian Rogers 6 months, 2 weeks ago
On Wed, May 28, 2025 at 11:11 AM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Wed, May 21, 2025 at 02:15:24PM -0700, Ian Rogers wrote:
> > On Wed, May 21, 2025 at 1:27 PM Namhyung Kim <namhyung@kernel.org> wrote:
> > >
> > > On Wed, May 21, 2025 at 09:53:15AM -0700, Ian Rogers wrote:
> > > > By definition arch sample parsing and synthesis will inhibit certain
> > > > kinds of cross-platform record then analysis (report, script,
> > > > etc.). Remove arch_perf_parse_sample_weight and
> > > > arch_perf_synthesize_sample_weight replacing with a common
> > > > implementation. Combine perf_sample p_stage_cyc and retire_lat to
> > > > capture the differing uses regardless of compiled for architecture.
> > >
> > > Can you please do this without renaming?  It can be a separate patch but
> > > I think we can just leave it.
> >
> > It is not clear what the use of the union is. Presumably it is a
> > tagged union but there is no tag as the union value to use is implied
> > by either being built on x86_64 or powerpc. The change removes the
> > notion of this code being built for x86_64 or powerpc and so the union
> > value to use isn't clear (e.g. should arm use p_stage_cyc or
> > retire_lat from the union), hence combining to show that it could be
> > one or the other. The code could be:
> > ```
> > #ifdef __x86_64__
> >        u16 p_stage_cyc;
> > #elif defined(powerpc)
> >        u16 retire_lat;
> > #endif
> > ```
> > but this isn't cross-platform.
>
> Right, we probably don't want it.
>
>
> > The change in hist.h of
> > ```
> > @@ -255,7 +255,7 @@ struct hist_entry {
> >         u64                     code_page_size;
> >         u64                     weight;
> >         u64                     ins_lat;
> > -       u64                     p_stage_cyc;
> > +       u64                     p_stage_cyc_or_retire_lat;
> > ```
> > could be a follow up CL, but then we lose something of what the field
> > is holding given the value is just a copy of that same named value in
> > perf_sample. The code only inserts 34 lines and so the churn of doing
> > that seemed worse than having the change in a single patch for
> > clarity.
>
> Assuming other archs can add something later, we won't rename the field
> again.  So I can live with the ugly union fields.  If we really want to
> rename it, I prefer calling it just 'weight3' and let the archs handle
> the display name only.

But that's my point (or in other words maybe you've missed my point) .
Regardless of arch we should display p_stage_cyc if processing a
perf.data file from a PowerPC as determined from the perf_env in the
perf.data file, or retire_lat if processing a perf.data file from x86.
The arch of the perf build is entirely irrelevant and calling the
variable an opaque weight3 will require something that will need to be
disambiguate it elsewhere in the code. The goal in variable names
should be to be intention revealing, which I think
p_stage_cyc_or_retire_lat is doing better than weight3, which is
something of a regression from the existing but inaccurate
p_stage_cyc.

Thanks,
Ian

> Thanks,
> Namhyung
>
Re: [PATCH v3 1/3] perf sample: Remove arch notion of sample parsing
Posted by Namhyung Kim 6 months, 2 weeks ago
On Wed, May 28, 2025 at 11:27:06AM -0700, Ian Rogers wrote:
> On Wed, May 28, 2025 at 11:11 AM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Wed, May 21, 2025 at 02:15:24PM -0700, Ian Rogers wrote:
> > > On Wed, May 21, 2025 at 1:27 PM Namhyung Kim <namhyung@kernel.org> wrote:
> > > >
> > > > On Wed, May 21, 2025 at 09:53:15AM -0700, Ian Rogers wrote:
> > > > > By definition arch sample parsing and synthesis will inhibit certain
> > > > > kinds of cross-platform record then analysis (report, script,
> > > > > etc.). Remove arch_perf_parse_sample_weight and
> > > > > arch_perf_synthesize_sample_weight replacing with a common
> > > > > implementation. Combine perf_sample p_stage_cyc and retire_lat to
> > > > > capture the differing uses regardless of compiled for architecture.
> > > >
> > > > Can you please do this without renaming?  It can be a separate patch but
> > > > I think we can just leave it.
> > >
> > > It is not clear what the use of the union is. Presumably it is a
> > > tagged union but there is no tag as the union value to use is implied
> > > by either being built on x86_64 or powerpc. The change removes the
> > > notion of this code being built for x86_64 or powerpc and so the union
> > > value to use isn't clear (e.g. should arm use p_stage_cyc or
> > > retire_lat from the union), hence combining to show that it could be
> > > one or the other. The code could be:
> > > ```
> > > #ifdef __x86_64__
> > >        u16 p_stage_cyc;
> > > #elif defined(powerpc)
> > >        u16 retire_lat;
> > > #endif
> > > ```
> > > but this isn't cross-platform.
> >
> > Right, we probably don't want it.
> >
> >
> > > The change in hist.h of
> > > ```
> > > @@ -255,7 +255,7 @@ struct hist_entry {
> > >         u64                     code_page_size;
> > >         u64                     weight;
> > >         u64                     ins_lat;
> > > -       u64                     p_stage_cyc;
> > > +       u64                     p_stage_cyc_or_retire_lat;
> > > ```
> > > could be a follow up CL, but then we lose something of what the field
> > > is holding given the value is just a copy of that same named value in
> > > perf_sample. The code only inserts 34 lines and so the churn of doing
> > > that seemed worse than having the change in a single patch for
> > > clarity.
> >
> > Assuming other archs can add something later, we won't rename the field
> > again.  So I can live with the ugly union fields.  If we really want to
> > rename it, I prefer calling it just 'weight3' and let the archs handle
> > the display name only.
> 
> But that's my point (or in other words maybe you've missed my point) .
> Regardless of arch we should display p_stage_cyc if processing a
> perf.data file from a PowerPC as determined from the perf_env in the
> perf.data file, or retire_lat if processing a perf.data file from x86.

Agreed.


> The arch of the perf build is entirely irrelevant and calling the
> variable an opaque weight3 will require something that will need to be
> disambiguate it elsewhere in the code. The goal in variable names
> should be to be intention revealing, which I think
> p_stage_cyc_or_retire_lat is doing better than weight3, which is
> something of a regression from the existing but inaccurate
> p_stage_cyc.

Yeah, but I worried if it would end up with
'p_stage_cyc_or_retire_lat_or_something_else' later.

Thanks,
Namhyung

Re: [PATCH v3 1/3] perf sample: Remove arch notion of sample parsing
Posted by Ian Rogers 6 months, 2 weeks ago
On Wed, May 28, 2025 at 1:09 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Wed, May 28, 2025 at 11:27:06AM -0700, Ian Rogers wrote:
> > On Wed, May 28, 2025 at 11:11 AM Namhyung Kim <namhyung@kernel.org> wrote:
> > >
> > > On Wed, May 21, 2025 at 02:15:24PM -0700, Ian Rogers wrote:
> > > > On Wed, May 21, 2025 at 1:27 PM Namhyung Kim <namhyung@kernel.org> wrote:
> > > > >
> > > > > On Wed, May 21, 2025 at 09:53:15AM -0700, Ian Rogers wrote:
> > > > > > By definition arch sample parsing and synthesis will inhibit certain
> > > > > > kinds of cross-platform record then analysis (report, script,
> > > > > > etc.). Remove arch_perf_parse_sample_weight and
> > > > > > arch_perf_synthesize_sample_weight replacing with a common
> > > > > > implementation. Combine perf_sample p_stage_cyc and retire_lat to
> > > > > > capture the differing uses regardless of compiled for architecture.
> > > > >
> > > > > Can you please do this without renaming?  It can be a separate patch but
> > > > > I think we can just leave it.
> > > >
> > > > It is not clear what the use of the union is. Presumably it is a
> > > > tagged union but there is no tag as the union value to use is implied
> > > > by either being built on x86_64 or powerpc. The change removes the
> > > > notion of this code being built for x86_64 or powerpc and so the union
> > > > value to use isn't clear (e.g. should arm use p_stage_cyc or
> > > > retire_lat from the union), hence combining to show that it could be
> > > > one or the other. The code could be:
> > > > ```
> > > > #ifdef __x86_64__
> > > >        u16 p_stage_cyc;
> > > > #elif defined(powerpc)
> > > >        u16 retire_lat;
> > > > #endif
> > > > ```
> > > > but this isn't cross-platform.
> > >
> > > Right, we probably don't want it.
> > >
> > >
> > > > The change in hist.h of
> > > > ```
> > > > @@ -255,7 +255,7 @@ struct hist_entry {
> > > >         u64                     code_page_size;
> > > >         u64                     weight;
> > > >         u64                     ins_lat;
> > > > -       u64                     p_stage_cyc;
> > > > +       u64                     p_stage_cyc_or_retire_lat;
> > > > ```
> > > > could be a follow up CL, but then we lose something of what the field
> > > > is holding given the value is just a copy of that same named value in
> > > > perf_sample. The code only inserts 34 lines and so the churn of doing
> > > > that seemed worse than having the change in a single patch for
> > > > clarity.
> > >
> > > Assuming other archs can add something later, we won't rename the field
> > > again.  So I can live with the ugly union fields.  If we really want to
> > > rename it, I prefer calling it just 'weight3' and let the archs handle
> > > the display name only.
> >
> > But that's my point (or in other words maybe you've missed my point) .
> > Regardless of arch we should display p_stage_cyc if processing a
> > perf.data file from a PowerPC as determined from the perf_env in the
> > perf.data file, or retire_lat if processing a perf.data file from x86.
>
> Agreed.
>
>
> > The arch of the perf build is entirely irrelevant and calling the
> > variable an opaque weight3 will require something that will need to be
> > disambiguate it elsewhere in the code. The goal in variable names
> > should be to be intention revealing, which I think
> > p_stage_cyc_or_retire_lat is doing better than weight3, which is
> > something of a regression from the existing but inaccurate
> > p_stage_cyc.
>
> Yeah, but I worried if it would end up with
> 'p_stage_cyc_or_retire_lat_or_something_else' later.

Perhaps it should be:
```
union {
  u16 raw;
  u16 p_stage_cyc;
  u16 retire_lat;
} weight3;
```
to try to best capture this. `xyz.weight3.raw` when the PowerPC or x86
use isn't known, etc. In the histogram code the u16 is a u64.

Thanks,
Ian

> Thanks,
> Namhyung
>
Re: [PATCH v3 1/3] perf sample: Remove arch notion of sample parsing
Posted by Namhyung Kim 6 months, 2 weeks ago
On Wed, May 28, 2025 at 01:38:03PM -0700, Ian Rogers wrote:
> On Wed, May 28, 2025 at 1:09 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Wed, May 28, 2025 at 11:27:06AM -0700, Ian Rogers wrote:
> > > On Wed, May 28, 2025 at 11:11 AM Namhyung Kim <namhyung@kernel.org> wrote:
> > > >
> > > > On Wed, May 21, 2025 at 02:15:24PM -0700, Ian Rogers wrote:
> > > > > On Wed, May 21, 2025 at 1:27 PM Namhyung Kim <namhyung@kernel.org> wrote:
> > > > > >
> > > > > > On Wed, May 21, 2025 at 09:53:15AM -0700, Ian Rogers wrote:
> > > > > > > By definition arch sample parsing and synthesis will inhibit certain
> > > > > > > kinds of cross-platform record then analysis (report, script,
> > > > > > > etc.). Remove arch_perf_parse_sample_weight and
> > > > > > > arch_perf_synthesize_sample_weight replacing with a common
> > > > > > > implementation. Combine perf_sample p_stage_cyc and retire_lat to
> > > > > > > capture the differing uses regardless of compiled for architecture.
> > > > > >
> > > > > > Can you please do this without renaming?  It can be a separate patch but
> > > > > > I think we can just leave it.
> > > > >
> > > > > It is not clear what the use of the union is. Presumably it is a
> > > > > tagged union but there is no tag as the union value to use is implied
> > > > > by either being built on x86_64 or powerpc. The change removes the
> > > > > notion of this code being built for x86_64 or powerpc and so the union
> > > > > value to use isn't clear (e.g. should arm use p_stage_cyc or
> > > > > retire_lat from the union), hence combining to show that it could be
> > > > > one or the other. The code could be:
> > > > > ```
> > > > > #ifdef __x86_64__
> > > > >        u16 p_stage_cyc;
> > > > > #elif defined(powerpc)
> > > > >        u16 retire_lat;
> > > > > #endif
> > > > > ```
> > > > > but this isn't cross-platform.
> > > >
> > > > Right, we probably don't want it.
> > > >
> > > >
> > > > > The change in hist.h of
> > > > > ```
> > > > > @@ -255,7 +255,7 @@ struct hist_entry {
> > > > >         u64                     code_page_size;
> > > > >         u64                     weight;
> > > > >         u64                     ins_lat;
> > > > > -       u64                     p_stage_cyc;
> > > > > +       u64                     p_stage_cyc_or_retire_lat;
> > > > > ```
> > > > > could be a follow up CL, but then we lose something of what the field
> > > > > is holding given the value is just a copy of that same named value in
> > > > > perf_sample. The code only inserts 34 lines and so the churn of doing
> > > > > that seemed worse than having the change in a single patch for
> > > > > clarity.
> > > >
> > > > Assuming other archs can add something later, we won't rename the field
> > > > again.  So I can live with the ugly union fields.  If we really want to
> > > > rename it, I prefer calling it just 'weight3' and let the archs handle
> > > > the display name only.
> > >
> > > But that's my point (or in other words maybe you've missed my point) .
> > > Regardless of arch we should display p_stage_cyc if processing a
> > > perf.data file from a PowerPC as determined from the perf_env in the
> > > perf.data file, or retire_lat if processing a perf.data file from x86.
> >
> > Agreed.
> >
> >
> > > The arch of the perf build is entirely irrelevant and calling the
> > > variable an opaque weight3 will require something that will need to be
> > > disambiguate it elsewhere in the code. The goal in variable names
> > > should be to be intention revealing, which I think
> > > p_stage_cyc_or_retire_lat is doing better than weight3, which is
> > > something of a regression from the existing but inaccurate
> > > p_stage_cyc.
> >
> > Yeah, but I worried if it would end up with
> > 'p_stage_cyc_or_retire_lat_or_something_else' later.
> 
> Perhaps it should be:
> ```
> union {
>   u16 raw;
>   u16 p_stage_cyc;
>   u16 retire_lat;
> } weight3;
> ```
> to try to best capture this. `xyz.weight3.raw` when the PowerPC or x86
> use isn't known, etc.

Looks better.  But based on the offline discussion, it'd be better to
just use 'u16 weight3'.


> In the histogram code the u16 is a u64.

Yep, because histogram entry aggregates sample weights.

Thanks,
Namhyung