[PATCH 18/19] perf: Introduce positive capability for raw events

Robin Murphy posted 19 patches 1 month, 3 weeks ago
[PATCH 18/19] perf: Introduce positive capability for raw events
Posted by Robin Murphy 1 month, 3 weeks ago
Only a handful of CPU PMUs accept PERF_TYPE_{RAW,HARDWARE,HW_CACHE}
events without registering themselves as PERF_TYPE_RAW in the first
place. Add an explicit opt-in for these special cases, so that we can
make life easier for every other driver (and probably also speed up the
slow-path search) by having perf_try_init_event() do the basic type
checking to cover the majority of cases.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

A further possibility is to automatically add the cap to PERF_TYPE_RAW
PMUs in perf_pmu_register() to have a single point-of-use condition; I'm
undecided...
---
 arch/s390/kernel/perf_cpum_cf.c    |  1 +
 arch/s390/kernel/perf_pai_crypto.c |  2 +-
 arch/s390/kernel/perf_pai_ext.c    |  2 +-
 arch/x86/events/core.c             |  2 +-
 drivers/perf/arm_pmu.c             |  1 +
 include/linux/perf_event.h         |  1 +
 kernel/events/core.c               | 15 +++++++++++++++
 7 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 1a94e0944bc5..782ab755ddd4 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -1054,6 +1054,7 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
 /* Performance monitoring unit for s390x */
 static struct pmu cpumf_pmu = {
 	.task_ctx_nr  = perf_sw_context,
+	.capabilities = PERF_PMU_CAP_RAW_EVENTS,
 	.pmu_enable   = cpumf_pmu_enable,
 	.pmu_disable  = cpumf_pmu_disable,
 	.event_init   = cpumf_pmu_event_init,
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
index a64b6b056a21..b5b6d8b5d943 100644
--- a/arch/s390/kernel/perf_pai_crypto.c
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -569,7 +569,7 @@ static const struct attribute_group *paicrypt_attr_groups[] = {
 /* Performance monitoring unit for mapped counters */
 static struct pmu paicrypt = {
 	.task_ctx_nr  = perf_hw_context,
-	.capabilities = PERF_PMU_CAP_SAMPLING,
+	.capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
 	.event_init   = paicrypt_event_init,
 	.add	      = paicrypt_add,
 	.del	      = paicrypt_del,
diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
index 1261f80c6d52..bcd28c38da70 100644
--- a/arch/s390/kernel/perf_pai_ext.c
+++ b/arch/s390/kernel/perf_pai_ext.c
@@ -595,7 +595,7 @@ static const struct attribute_group *paiext_attr_groups[] = {
 /* Performance monitoring unit for mapped counters */
 static struct pmu paiext = {
 	.task_ctx_nr  = perf_hw_context,
-	.capabilities = PERF_PMU_CAP_SAMPLING,
+	.capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
 	.event_init   = paiext_event_init,
 	.add	      = paiext_add,
 	.del	      = paiext_del,
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 789dfca2fa67..764728bb80ae 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2697,7 +2697,7 @@ static bool x86_pmu_filter(struct pmu *pmu, int cpu)
 }
 
 static struct pmu pmu = {
-	.capabilities		= PERF_PMU_CAP_SAMPLING,
+	.capabilities		= PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
 
 	.pmu_enable		= x86_pmu_enable,
 	.pmu_disable		= x86_pmu_disable,
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 72d8f38d0aa5..bc772a3bf411 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -877,6 +877,7 @@ struct arm_pmu *armpmu_alloc(void)
 		 * specific PMU.
 		 */
 		.capabilities	= PERF_PMU_CAP_SAMPLING |
+				  PERF_PMU_CAP_RAW_EVENTS |
 				  PERF_PMU_CAP_EXTENDED_REGS |
 				  PERF_PMU_CAP_EXTENDED_HW_TYPE,
 	};
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 183b7c48b329..c6ad036c0037 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -305,6 +305,7 @@ struct perf_event_pmu_context;
 #define PERF_PMU_CAP_EXTENDED_HW_TYPE	0x0100
 #define PERF_PMU_CAP_AUX_PAUSE		0x0200
 #define PERF_PMU_CAP_AUX_PREFER_LARGE	0x0400
+#define PERF_PMU_CAP_RAW_EVENTS		0x0800
 
 /**
  * pmu::scope
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 71b2a6730705..2ecee76d2ae2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12556,11 +12556,26 @@ static inline bool has_extended_regs(struct perf_event *event)
 	       (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
 }
 
+static bool is_raw_pmu(const struct pmu *pmu)
+{
+	return pmu->type == PERF_TYPE_RAW ||
+	       pmu->capabilities & PERF_PMU_CAP_RAW_EVENTS;
+}
+
 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 {
 	struct perf_event_context *ctx = NULL;
 	int ret;
 
+	/*
+	 * Before touching anything, we can safely skip:
+	 * - any event for a specific PMU which is not this one
+	 * - any common event if this PMU doesn't support them
+	 */
+	if (event->attr.type != pmu->type &&
+	    (event->attr.type >= PERF_TYPE_MAX || is_raw_pmu(pmu)))
+		return -ENOENT;
+
 	if (!try_module_get(pmu->module))
 		return -ENODEV;
 
-- 
2.39.2.101.g768bb238c484.dirty
Re: [PATCH 18/19] perf: Introduce positive capability for raw events
Posted by Mark Rutland 1 month, 1 week ago
On Wed, Aug 13, 2025 at 06:01:10PM +0100, Robin Murphy wrote:
> Only a handful of CPU PMUs accept PERF_TYPE_{RAW,HARDWARE,HW_CACHE}
> events without registering themselves as PERF_TYPE_RAW in the first
> place. Add an explicit opt-in for these special cases, so that we can
> make life easier for every other driver (and probably also speed up the
> slow-path search) by having perf_try_init_event() do the basic type
> checking to cover the majority of cases.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>


To bikeshed a little here, I'm not keen on the PERF_PMU_CAP_RAW_EVENTS
name, because it's not clear what "RAW" really means, and people will
definitely read that to mean something else.

Could we go with something like PERF_PMU_CAP_COMMON_CPU_EVENTS, to make
it clear that this is about opting into CPU-PMU specific event types (of
which PERF_TYPE_RAW is one of)?

Likewise, s/is_raw_pmu()/pmu_supports_common_cpu_events()/.

> ---
> 
> A further possibility is to automatically add the cap to PERF_TYPE_RAW
> PMUs in perf_pmu_register() to have a single point-of-use condition; I'm
> undecided...

I reckon we don't need to automagically do that, but I reckon that
is_raw_pmu()/pmu_supports_common_cpu_events() should only check the cap,
and we don't read anything special into any of
PERF_TYPE_{RAW,HARDWARE,HW_CACHE}.

> ---
>  arch/s390/kernel/perf_cpum_cf.c    |  1 +
>  arch/s390/kernel/perf_pai_crypto.c |  2 +-
>  arch/s390/kernel/perf_pai_ext.c    |  2 +-
>  arch/x86/events/core.c             |  2 +-
>  drivers/perf/arm_pmu.c             |  1 +
>  include/linux/perf_event.h         |  1 +
>  kernel/events/core.c               | 15 +++++++++++++++
>  7 files changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
> index 1a94e0944bc5..782ab755ddd4 100644
> --- a/arch/s390/kernel/perf_cpum_cf.c
> +++ b/arch/s390/kernel/perf_cpum_cf.c
> @@ -1054,6 +1054,7 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
>  /* Performance monitoring unit for s390x */
>  static struct pmu cpumf_pmu = {
>  	.task_ctx_nr  = perf_sw_context,
> +	.capabilities = PERF_PMU_CAP_RAW_EVENTS,
>  	.pmu_enable   = cpumf_pmu_enable,
>  	.pmu_disable  = cpumf_pmu_disable,
>  	.event_init   = cpumf_pmu_event_init,

Tangential, but use of perf_sw_context here looks bogus.

> diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
> index a64b6b056a21..b5b6d8b5d943 100644
> --- a/arch/s390/kernel/perf_pai_crypto.c
> +++ b/arch/s390/kernel/perf_pai_crypto.c
> @@ -569,7 +569,7 @@ static const struct attribute_group *paicrypt_attr_groups[] = {
>  /* Performance monitoring unit for mapped counters */
>  static struct pmu paicrypt = {
>  	.task_ctx_nr  = perf_hw_context,
> -	.capabilities = PERF_PMU_CAP_SAMPLING,
> +	.capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>  	.event_init   = paicrypt_event_init,
>  	.add	      = paicrypt_add,
>  	.del	      = paicrypt_del,
> diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
> index 1261f80c6d52..bcd28c38da70 100644
> --- a/arch/s390/kernel/perf_pai_ext.c
> +++ b/arch/s390/kernel/perf_pai_ext.c
> @@ -595,7 +595,7 @@ static const struct attribute_group *paiext_attr_groups[] = {
>  /* Performance monitoring unit for mapped counters */
>  static struct pmu paiext = {
>  	.task_ctx_nr  = perf_hw_context,
> -	.capabilities = PERF_PMU_CAP_SAMPLING,
> +	.capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>  	.event_init   = paiext_event_init,
>  	.add	      = paiext_add,
>  	.del	      = paiext_del,
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 789dfca2fa67..764728bb80ae 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2697,7 +2697,7 @@ static bool x86_pmu_filter(struct pmu *pmu, int cpu)
>  }
>  
>  static struct pmu pmu = {
> -	.capabilities		= PERF_PMU_CAP_SAMPLING,
> +	.capabilities		= PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>  
>  	.pmu_enable		= x86_pmu_enable,
>  	.pmu_disable		= x86_pmu_disable,
> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
> index 72d8f38d0aa5..bc772a3bf411 100644
> --- a/drivers/perf/arm_pmu.c
> +++ b/drivers/perf/arm_pmu.c
> @@ -877,6 +877,7 @@ struct arm_pmu *armpmu_alloc(void)
>  		 * specific PMU.
>  		 */
>  		.capabilities	= PERF_PMU_CAP_SAMPLING |
> +				  PERF_PMU_CAP_RAW_EVENTS |
>  				  PERF_PMU_CAP_EXTENDED_REGS |
>  				  PERF_PMU_CAP_EXTENDED_HW_TYPE,
>  	};
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 183b7c48b329..c6ad036c0037 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -305,6 +305,7 @@ struct perf_event_pmu_context;
>  #define PERF_PMU_CAP_EXTENDED_HW_TYPE	0x0100
>  #define PERF_PMU_CAP_AUX_PAUSE		0x0200
>  #define PERF_PMU_CAP_AUX_PREFER_LARGE	0x0400
> +#define PERF_PMU_CAP_RAW_EVENTS		0x0800
>  
>  /**
>   * pmu::scope
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 71b2a6730705..2ecee76d2ae2 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -12556,11 +12556,26 @@ static inline bool has_extended_regs(struct perf_event *event)
>  	       (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
>  }
>  
> +static bool is_raw_pmu(const struct pmu *pmu)
> +{
> +	return pmu->type == PERF_TYPE_RAW ||
> +	       pmu->capabilities & PERF_PMU_CAP_RAW_EVENTS;
> +}

As above, I reckon we should make this:

static bool pmu_supports_common_cpu_events(const struct pmu *pmu)
{
	return pmu->capabilities & PERF_PMU_CAP_RAW_EVENTS;
}

Other than the above, this looks good to me.

Mark.

> +
>  static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
>  {
>  	struct perf_event_context *ctx = NULL;
>  	int ret;
>  
> +	/*
> +	 * Before touching anything, we can safely skip:
> +	 * - any event for a specific PMU which is not this one
> +	 * - any common event if this PMU doesn't support them
> +	 */
> +	if (event->attr.type != pmu->type &&
> +	    (event->attr.type >= PERF_TYPE_MAX || is_raw_pmu(pmu)))
> +		return -ENOENT;
> +
>  	if (!try_module_get(pmu->module))
>  		return -ENODEV;
>  
> -- 
> 2.39.2.101.g768bb238c484.dirty
>
Re: [PATCH 18/19] perf: Introduce positive capability for raw events
Posted by Thomas Richter 1 month, 1 week ago
On 8/26/25 15:43, Mark Rutland wrote:
> On Wed, Aug 13, 2025 at 06:01:10PM +0100, Robin Murphy wrote:
>> Only a handful of CPU PMUs accept PERF_TYPE_{RAW,HARDWARE,HW_CACHE}
>> events without registering themselves as PERF_TYPE_RAW in the first
>> place. Add an explicit opt-in for these special cases, so that we can
>> make life easier for every other driver (and probably also speed up the
>> slow-path search) by having perf_try_init_event() do the basic type
>> checking to cover the majority of cases.
>>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> 
> 
> To bikeshed a little here, I'm not keen on the PERF_PMU_CAP_RAW_EVENTS
> name, because it's not clear what "RAW" really means, and people will
> definitely read that to mean something else.
> 
> Could we go with something like PERF_PMU_CAP_COMMON_CPU_EVENTS, to make
> it clear that this is about opting into CPU-PMU specific event types (of
> which PERF_TYPE_RAW is one of)?
> 
> Likewise, s/is_raw_pmu()/pmu_supports_common_cpu_events()/.
> 
>> ---
>>
>> A further possibility is to automatically add the cap to PERF_TYPE_RAW
>> PMUs in perf_pmu_register() to have a single point-of-use condition; I'm
>> undecided...
> 
> I reckon we don't need to automagically do that, but I reckon that
> is_raw_pmu()/pmu_supports_common_cpu_events() should only check the cap,
> and we don't read anything special into any of
> PERF_TYPE_{RAW,HARDWARE,HW_CACHE}.
> 
>> ---
>>  arch/s390/kernel/perf_cpum_cf.c    |  1 +
>>  arch/s390/kernel/perf_pai_crypto.c |  2 +-
>>  arch/s390/kernel/perf_pai_ext.c    |  2 +-
>>  arch/x86/events/core.c             |  2 +-
>>  drivers/perf/arm_pmu.c             |  1 +
>>  include/linux/perf_event.h         |  1 +
>>  kernel/events/core.c               | 15 +++++++++++++++
>>  7 files changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
>> index 1a94e0944bc5..782ab755ddd4 100644
>> --- a/arch/s390/kernel/perf_cpum_cf.c
>> +++ b/arch/s390/kernel/perf_cpum_cf.c
>> @@ -1054,6 +1054,7 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
>>  /* Performance monitoring unit for s390x */
>>  static struct pmu cpumf_pmu = {
>>  	.task_ctx_nr  = perf_sw_context,
>> +	.capabilities = PERF_PMU_CAP_RAW_EVENTS,
>>  	.pmu_enable   = cpumf_pmu_enable,
>>  	.pmu_disable  = cpumf_pmu_disable,
>>  	.event_init   = cpumf_pmu_event_init,
> 
> Tangential, but use of perf_sw_context here looks bogus.
> 

It might look strange, but it was done on purpose. For details see
commit 9254e70c4ef1 ("s390/cpum_cf: use perf software context for hardware counters")

Background was a WARN_ON() statement which fired, because several PMU device drivers
existed in parallel on s390x platform.
Not sure if this condition is still true after all these years...

-- 
Thomas Richter, Dept 3303, IBM s390 Linux Development, Boeblingen, Germany
--
IBM Deutschland Research & Development GmbH

Vorsitzender des Aufsichtsrats: Wolfgang Wendt

Geschäftsführung: David Faller

Sitz der Gesellschaft: Böblingen / Registergericht: Amtsgericht Stuttgart, HRB 243294
Re: [PATCH 18/19] perf: Introduce positive capability for raw events
Posted by Robin Murphy 1 month, 1 week ago
On 2025-08-26 2:43 pm, Mark Rutland wrote:
> On Wed, Aug 13, 2025 at 06:01:10PM +0100, Robin Murphy wrote:
>> Only a handful of CPU PMUs accept PERF_TYPE_{RAW,HARDWARE,HW_CACHE}
>> events without registering themselves as PERF_TYPE_RAW in the first
>> place. Add an explicit opt-in for these special cases, so that we can
>> make life easier for every other driver (and probably also speed up the
>> slow-path search) by having perf_try_init_event() do the basic type
>> checking to cover the majority of cases.
>>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> 
> 
> To bikeshed a little here, I'm not keen on the PERF_PMU_CAP_RAW_EVENTS
> name, because it's not clear what "RAW" really means, and people will
> definitely read that to mean something else.
> 
> Could we go with something like PERF_PMU_CAP_COMMON_CPU_EVENTS, to make
> it clear that this is about opting into CPU-PMU specific event types (of
> which PERF_TYPE_RAW is one of)?

Indeed I started with that very intention after our previous discussion, 
but soon realised that in fact nowhere in the code is there any 
definition or even established notion of what "common" means in this 
context, so it's hardly immune to misinterpretation either. Furthermore 
the semantics of the cap as it ended up are specifically that the PMU 
wants the same behaviour as if it had registered as PERF_TYPE_RAW, so 
having "raw" in the name started to look like the more intuitive option 
after all (plus being nice and short helps.)

If anything, it's "events" that carries the implication that's proving 
hard to capture precisely and concisely here, so maybe the answer to 
avoid ambiguity is to lean further away from a "what it represents" to a 
"what it actually does" naming - PERF_PMU_CAP_TYPE_RAW, anyone?

> Likewise, s/is_raw_pmu()/pmu_supports_common_cpu_events()/.

Case in point: is it any more logical and expected that supporting 
common CPU events implies a PMU should be offered software or breakpoint 
events as well? Because that's what such a mere rename would currently 
mean :/

>> ---
>>
>> A further possibility is to automatically add the cap to PERF_TYPE_RAW
>> PMUs in perf_pmu_register() to have a single point-of-use condition; I'm
>> undecided...
> 
> I reckon we don't need to automagically do that, but I reckon that
> is_raw_pmu()/pmu_supports_common_cpu_events() should only check the cap,
> and we don't read anything special into any of
> PERF_TYPE_{RAW,HARDWARE,HW_CACHE}.

OK, but that would then necessitate having to explicitly add the cap to 
all 15-odd other drivers which register as PERF_TYPE_RAW as well, at 
which point it starts to look like a more general "I am a CPU PMU in 
terms of most typical assumptions you might want to make about that" flag...

To clarify (and perhaps something for a v2 commit message), we currently 
have 3 categories of PMU driver:

1: (Older/simpler CPUs) Registers as PERF_TYPE_RAW, wants 
PERF_TYPE_RAW/HARDWARE/HW_CACHE events
2: (Heterogeneous CPUs) Registers as dynamic type, wants 
PERF_TYPE_RAW/HARDWARE/HW_CACHE events plus events of its own type
3: (Mostly uncore) Registers as dynamic type, only wants events of its 
own type

My vested interest is in making category 3 the default behaviour, given 
that the growing majority of new drivers are uncore (and I keep having 
to write them...) However unclear the type overlaps in category 1 might 
be, it's been like that for 15 years, so I didn't feel compelled to 
churn fossils like Alpha more than reasonably necessary. Category 2 is 
only these 5 drivers, so a relatively small tweak to distinguish them 
from category 3 and let them retain the effective category 1 behaviour 
(which remains the current one of potentially still being offered 
software etc. events too) seemed like the neatest way to make progress.

I'm not saying I'm necessarily against a general overhaul of CPU PMUs 
being attempted too, just that it seems more like a whole other 
side-quest, and I'd really like to slay the uncore-boilerplate dragon first.

>> ---
>>   arch/s390/kernel/perf_cpum_cf.c    |  1 +
>>   arch/s390/kernel/perf_pai_crypto.c |  2 +-
>>   arch/s390/kernel/perf_pai_ext.c    |  2 +-
>>   arch/x86/events/core.c             |  2 +-
>>   drivers/perf/arm_pmu.c             |  1 +
>>   include/linux/perf_event.h         |  1 +
>>   kernel/events/core.c               | 15 +++++++++++++++
>>   7 files changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
>> index 1a94e0944bc5..782ab755ddd4 100644
>> --- a/arch/s390/kernel/perf_cpum_cf.c
>> +++ b/arch/s390/kernel/perf_cpum_cf.c
>> @@ -1054,6 +1054,7 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
>>   /* Performance monitoring unit for s390x */
>>   static struct pmu cpumf_pmu = {
>>   	.task_ctx_nr  = perf_sw_context,
>> +	.capabilities = PERF_PMU_CAP_RAW_EVENTS,
>>   	.pmu_enable   = cpumf_pmu_enable,
>>   	.pmu_disable  = cpumf_pmu_disable,
>>   	.event_init   = cpumf_pmu_event_init,
> 
> Tangential, but use of perf_sw_context here looks bogus.

Indeed, according to the history it was intentional, but perhaps that no 
longer applies since the big context redesign? FWIW there seem to be a 
fair few instances of this, including Arm SPE.

Thanks,
Robin.

>> diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
>> index a64b6b056a21..b5b6d8b5d943 100644
>> --- a/arch/s390/kernel/perf_pai_crypto.c
>> +++ b/arch/s390/kernel/perf_pai_crypto.c
>> @@ -569,7 +569,7 @@ static const struct attribute_group *paicrypt_attr_groups[] = {
>>   /* Performance monitoring unit for mapped counters */
>>   static struct pmu paicrypt = {
>>   	.task_ctx_nr  = perf_hw_context,
>> -	.capabilities = PERF_PMU_CAP_SAMPLING,
>> +	.capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>>   	.event_init   = paicrypt_event_init,
>>   	.add	      = paicrypt_add,
>>   	.del	      = paicrypt_del,
>> diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
>> index 1261f80c6d52..bcd28c38da70 100644
>> --- a/arch/s390/kernel/perf_pai_ext.c
>> +++ b/arch/s390/kernel/perf_pai_ext.c
>> @@ -595,7 +595,7 @@ static const struct attribute_group *paiext_attr_groups[] = {
>>   /* Performance monitoring unit for mapped counters */
>>   static struct pmu paiext = {
>>   	.task_ctx_nr  = perf_hw_context,
>> -	.capabilities = PERF_PMU_CAP_SAMPLING,
>> +	.capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>>   	.event_init   = paiext_event_init,
>>   	.add	      = paiext_add,
>>   	.del	      = paiext_del,
>> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
>> index 789dfca2fa67..764728bb80ae 100644
>> --- a/arch/x86/events/core.c
>> +++ b/arch/x86/events/core.c
>> @@ -2697,7 +2697,7 @@ static bool x86_pmu_filter(struct pmu *pmu, int cpu)
>>   }
>>   
>>   static struct pmu pmu = {
>> -	.capabilities		= PERF_PMU_CAP_SAMPLING,
>> +	.capabilities		= PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>>   
>>   	.pmu_enable		= x86_pmu_enable,
>>   	.pmu_disable		= x86_pmu_disable,
>> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
>> index 72d8f38d0aa5..bc772a3bf411 100644
>> --- a/drivers/perf/arm_pmu.c
>> +++ b/drivers/perf/arm_pmu.c
>> @@ -877,6 +877,7 @@ struct arm_pmu *armpmu_alloc(void)
>>   		 * specific PMU.
>>   		 */
>>   		.capabilities	= PERF_PMU_CAP_SAMPLING |
>> +				  PERF_PMU_CAP_RAW_EVENTS |
>>   				  PERF_PMU_CAP_EXTENDED_REGS |
>>   				  PERF_PMU_CAP_EXTENDED_HW_TYPE,
>>   	};
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 183b7c48b329..c6ad036c0037 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -305,6 +305,7 @@ struct perf_event_pmu_context;
>>   #define PERF_PMU_CAP_EXTENDED_HW_TYPE	0x0100
>>   #define PERF_PMU_CAP_AUX_PAUSE		0x0200
>>   #define PERF_PMU_CAP_AUX_PREFER_LARGE	0x0400
>> +#define PERF_PMU_CAP_RAW_EVENTS		0x0800
>>   
>>   /**
>>    * pmu::scope
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 71b2a6730705..2ecee76d2ae2 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -12556,11 +12556,26 @@ static inline bool has_extended_regs(struct perf_event *event)
>>   	       (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
>>   }
>>   
>> +static bool is_raw_pmu(const struct pmu *pmu)
>> +{
>> +	return pmu->type == PERF_TYPE_RAW ||
>> +	       pmu->capabilities & PERF_PMU_CAP_RAW_EVENTS;
>> +}
> 
> As above, I reckon we should make this:
> 
> static bool pmu_supports_common_cpu_events(const struct pmu *pmu)
> {
> 	return pmu->capabilities & PERF_PMU_CAP_RAW_EVENTS;
> }
> 
> Other than the above, this looks good to me.
> 
> Mark.
> 
>> +
>>   static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
>>   {
>>   	struct perf_event_context *ctx = NULL;
>>   	int ret;
>>   
>> +	/*
>> +	 * Before touching anything, we can safely skip:
>> +	 * - any event for a specific PMU which is not this one
>> +	 * - any common event if this PMU doesn't support them
>> +	 */
>> +	if (event->attr.type != pmu->type &&
>> +	    (event->attr.type >= PERF_TYPE_MAX || is_raw_pmu(pmu)))
>> +		return -ENOENT;
>> +
>>   	if (!try_module_get(pmu->module))
>>   		return -ENODEV;
>>   
>> -- 
>> 2.39.2.101.g768bb238c484.dirty
>>
Re: [PATCH 18/19] perf: Introduce positive capability for raw events
Posted by Mark Rutland 1 month, 1 week ago
On Tue, Aug 26, 2025 at 11:46:02PM +0100, Robin Murphy wrote:
> On 2025-08-26 2:43 pm, Mark Rutland wrote:
> > On Wed, Aug 13, 2025 at 06:01:10PM +0100, Robin Murphy wrote:
> > To bikeshed a little here, I'm not keen on the PERF_PMU_CAP_RAW_EVENTS
> > name, because it's not clear what "RAW" really means, and people will
> > definitely read that to mean something else.
> > 
> > Could we go with something like PERF_PMU_CAP_COMMON_CPU_EVENTS, to make
> > it clear that this is about opting into CPU-PMU specific event types (of
> > which PERF_TYPE_RAW is one of)?
> 
> Indeed I started with that very intention after our previous discussion, but
> soon realised that in fact nowhere in the code is there any definition or
> even established notion of what "common" means in this context, so it's
> hardly immune to misinterpretation either.

We can document that; it's everything less than PERF_TYPE_MAX:

	enum perf_type_id {
		PERF_TYPE_HARDWARE                      = 0, 
		PERF_TYPE_SOFTWARE                      = 1, 
		PERF_TYPE_TRACEPOINT                    = 2, 
		PERF_TYPE_HW_CACHE                      = 3, 
		PERF_TYPE_RAW                           = 4, 
		PERF_TYPE_BREAKPOINT                    = 5, 

		PERF_TYPE_MAX,                          /* non-ABI */
	};

... and maybe you could use "PERF_PMU_CAP_ABI_TYPES" to align with that
comment?

> Furthermore the semantics of the cap as it ended up are specifically
> that the PMU wants the same behaviour as if it had registered as
> PERF_TYPE_RAW, so having "raw" in the name started to look like the
> more intuitive option after all (plus being nice and short helps.)

I appreciate the shortness, but I think it's misleading to tie this to
"RAW" specifically, when really this is a capabiltiy to say "please let
me try to init any events for non-dynamic types, in addition to whatever
specific type I am registered with".

> If anything, it's "events" that carries the implication that's proving hard
> to capture precisely and concisely here, so maybe the answer to avoid
> ambiguity is to lean further away from a "what it represents" to a "what it
> actually does" naming - PERF_PMU_CAP_TYPE_RAW, anyone?

I'm happy with TYPE in the name; it's just RAW specifically that I'm
objecting to.

> > Likewise, s/is_raw_pmu()/pmu_supports_common_cpu_events()/.
> 
> Case in point: is it any more logical and expected that supporting common
> CPU events implies a PMU should be offered software or breakpoint events as
> well? Because that's what such a mere rename would currently mean :/

Yes, I think it is.

> > > ---
> > > 
> > > A further possibility is to automatically add the cap to PERF_TYPE_RAW
> > > PMUs in perf_pmu_register() to have a single point-of-use condition; I'm
> > > undecided...
> > 
> > I reckon we don't need to automagically do that, but I reckon that
> > is_raw_pmu()/pmu_supports_common_cpu_events() should only check the cap,
> > and we don't read anything special into any of
> > PERF_TYPE_{RAW,HARDWARE,HW_CACHE}.
> 
> OK, but that would then necessitate having to explicitly add the cap to all
> 15-odd other drivers which register as PERF_TYPE_RAW as well, at which point
> it starts to look like a more general "I am a CPU PMU in terms of most
> typical assumptions you might want to make about that" flag...
> 
> To clarify (and perhaps something for a v2 commit message), we currently
> have 3 categories of PMU driver:
> 
> 1: (Older/simpler CPUs) Registers as PERF_TYPE_RAW, wants
> PERF_TYPE_RAW/HARDWARE/HW_CACHE events
> 2: (Heterogeneous CPUs) Registers as dynamic type, wants
> PERF_TYPE_RAW/HARDWARE/HW_CACHE events plus events of its own type
> 3: (Mostly uncore) Registers as dynamic type, only wants events of its own
> type

Sure, but I think that separating 1 and 2 is an artificial distinction,
and what we really have is:

(a) Wants to handle (some of) the non-dynamic/common/ABI types (in
    addition to whatever specific type it was registered with). Needs to
    have a switch statement somewhere in pmu::event_init().

(b) Only wants to handle the specific type the PMU was registered with.

> My vested interest is in making category 3 the default behaviour, given that
> the growing majority of new drivers are uncore (and I keep having to write
> them...) 

Yes, we're aligned on that.

> However unclear the type overlaps in category 1 might be, it's been
> like that for 15 years, so I didn't feel compelled to churn fossils like
> Alpha more than reasonably necessary. Category 2 is only these 5 drivers, so
> a relatively small tweak to distinguish them from category 3 and let them
> retain the effective category 1 behaviour (which remains the current one of
> potentially still being offered software etc. events too) seemed like the
> neatest way to make progress.

I just think we should combine 1 and 2 (into categroy a as above), since
that removes the need to treat RAW specially going forwards.

> I'm not saying I'm necessarily against a general overhaul of CPU PMUs being
> attempted too, just that it seems more like a whole other side-quest, and
> I'd really like to slay the uncore-boilerplate dragon first.

I think that adding the cap to those 15 PMUs would take less time than
it has taken me to write this email, so I do not understand the
objection.

Mark.
Re: [PATCH 18/19] perf: Introduce positive capability for raw events
Posted by kernel test robot 1 month, 2 weeks ago

Hello,

kernel test robot noticed "perf-sanity-tests.Event_groups.fail" on:

commit: a704f7a13544a408baee6fa78f0f24fa05bfa406 ("[PATCH 18/19] perf: Introduce positive capability for raw events")
url: https://github.com/intel-lab-lkp/linux/commits/Robin-Murphy/perf-arm-cmn-Fix-event-validation/20250814-010626
base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git 91325f31afc1026de28665cf1a7b6e157fa4d39d
patch link: https://lore.kernel.org/all/542787fd188ea15ef41c53d557989c962ed44771.1755096883.git.robin.murphy@arm.com/
patch subject: [PATCH 18/19] perf: Introduce positive capability for raw events

in testcase: perf-sanity-tests
version: 
with following parameters:

	perf_compiler: gcc
	group: group-01



config: x86_64-rhel-9.4-bpf
compiler: gcc-12
test machine: 224 threads 4 sockets Intel(R) Xeon(R) Platinum 8380H CPU @ 2.90GHz (Cooper Lake) with 192G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202508211037.3f897218-lkp@intel.com


besides Event_groups, we also noticed perf_stat_JSON_output_linter and
perf_stat_STD_output_linter become failed upon this commit but pass on parent.

0129bbf0ee6f109a a704f7a13544a408baee6fa78f0
---------------- ---------------------------
       fail:runs  %reproduction    fail:runs
           |             |             |
           :38          16%           6:6     perf-sanity-tests.Event_groups.fail
           :38          16%           6:6     perf-sanity-tests.perf_stat_JSON_output_linter.fail
           :38          16%           6:6     perf-sanity-tests.perf_stat_STD_output_linter.fail



2025-08-18 13:20:21 sudo /usr/src/linux-perf-x86_64-rhel-9.4-bpf-a704f7a13544a408baee6fa78f0f24fa05bfa406/tools/perf/perf test 66 -v
 66: Event groups                                                    : Running (1 active)
--- start ---
test child forked, pid 9619
Using CPUID GenuineIntel-6-55-B
Using uncore_imc_0 for uncore pmu event
0x0 0x0, 0x0 0x0, 0x0 0x1: Fail
0x0 0x0, 0x0 0x0, 0x1 0x3: Fail
0x0 0x0, 0x0 0x0, 0xf 0x1: Fail
0x0 0x0, 0x1 0x3, 0x0 0x0: Fail
0x0 0x0, 0x1 0x3, 0x1 0x3: Fail
0x0 0x0, 0x1 0x3, 0xf 0x1: Fail
0x0 0x0, 0xf 0x1, 0x0 0x0: Fail
0x0 0x0, 0xf 0x1, 0x1 0x3: Fail
0x0 0x0, 0xf 0x1, 0xf 0x1: Fail
0x1 0x3, 0x0 0x0, 0x0 0x0: Fail
0x1 0x3, 0x0 0x0, 0x1 0x3: Fail
0x1 0x3, 0x0 0x0, 0xf 0x1: Pass
0x1 0x3, 0x1 0x3, 0x0 0x0: Fail
0x1 0x3, 0x1 0x3, 0x1 0x3: Pass
0x1 0x3, 0x1 0x3, 0xf 0x1: Pass
0x1 0x3, 0xf 0x1, 0x0 0x0: Pass
0x1 0x3, 0xf 0x1, 0x1 0x3: Pass
0x1 0x3, 0xf 0x1, 0xf 0x1: Pass
0xf 0x1, 0x0 0x0, 0x0 0x0: Pass
0xf 0x1, 0x0 0x0, 0x1 0x3: Pass
0xf 0x1, 0x0 0x0, 0xf 0x1: Pass
0xf 0x1, 0x1 0x3, 0x0 0x0: Pass
0xf 0x1, 0x1 0x3, 0x1 0x3: Pass
0xf 0x1, 0x1 0x3, 0xf 0x1: Pass
0xf 0x1, 0xf 0x1, 0x0 0x0: Pass
0xf 0x1, 0xf 0x1, 0x1 0x3: Pass
0xf 0x1, 0xf 0x1, 0xf 0x1: Pass
---- end(-1) ----
 66: Event groups                                                    : FAILED!

...

2025-08-18 13:29:36 sudo /usr/src/linux-perf-x86_64-rhel-9.4-bpf-a704f7a13544a408baee6fa78f0f24fa05bfa406/tools/perf/perf test 97 -v
 97: perf stat JSON output linter                                    : Running (1 active)
--- start ---
test child forked, pid 20715
Checking json output: no args [Success]
Checking json output: system wide [Success]
Checking json output: interval [Success]
Checking json output: event [Success]
Checking json output: per thread [Success]
Checking json output: per node [Success]
Checking json output: metric only Test failed for input:
{"metric-value" : "none"}

Traceback (most recent call last):
  File "/usr/src/perf_selftests-x86_64-rhel-9.4-bpf-a704f7a13544a408baee6fa78f0f24fa05bfa406/tools/perf/tests/shell/lib/perf_json_output_lint.py", line 108, in <module>
    check_json_output(expected_items)
  File "/usr/src/perf_selftests-x86_64-rhel-9.4-bpf-a704f7a13544a408baee6fa78f0f24fa05bfa406/tools/perf/tests/shell/lib/perf_json_output_lint.py", line 93, in check_json_output
    raise RuntimeError(f'Check failed for: key={key} value={value}')
RuntimeError: Check failed for: key=metric-value value=none
---- end(-1) ----
 97: perf stat JSON output linter                                    : FAILED!

...

2025-08-18 13:29:46 sudo /usr/src/linux-perf-x86_64-rhel-9.4-bpf-a704f7a13544a408baee6fa78f0f24fa05bfa406/tools/perf/perf test 99 -v
 99: perf stat STD output linter                                     : Running (1 active)
--- start ---
test child forked, pid 20818
Checking STD output: no args [Success]
Checking STD output: system wide [Success]
Checking STD output: interval [Success]
Checking STD output: per thread [Success]
Checking STD output: per node [Success]
Checking STD output: metric only ---- end(-1) ----
 99: perf stat STD output linter                                     : FAILED!



The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20250821/202508211037.3f897218-lkp@intel.com



-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH 18/19] perf: Introduce positive capability for raw events
Posted by Robin Murphy 1 month, 2 weeks ago
On 13/08/2025 6:01 pm, Robin Murphy wrote:
> Only a handful of CPU PMUs accept PERF_TYPE_{RAW,HARDWARE,HW_CACHE}
> events without registering themselves as PERF_TYPE_RAW in the first
> place. Add an explicit opt-in for these special cases, so that we can
> make life easier for every other driver (and probably also speed up the
> slow-path search) by having perf_try_init_event() do the basic type
> checking to cover the majority of cases.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
> 
> A further possibility is to automatically add the cap to PERF_TYPE_RAW
> PMUs in perf_pmu_register() to have a single point-of-use condition; I'm
> undecided...
> ---
>   arch/s390/kernel/perf_cpum_cf.c    |  1 +
>   arch/s390/kernel/perf_pai_crypto.c |  2 +-
>   arch/s390/kernel/perf_pai_ext.c    |  2 +-
>   arch/x86/events/core.c             |  2 +-
>   drivers/perf/arm_pmu.c             |  1 +
>   include/linux/perf_event.h         |  1 +
>   kernel/events/core.c               | 15 +++++++++++++++
>   7 files changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
> index 1a94e0944bc5..782ab755ddd4 100644
> --- a/arch/s390/kernel/perf_cpum_cf.c
> +++ b/arch/s390/kernel/perf_cpum_cf.c
> @@ -1054,6 +1054,7 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
>   /* Performance monitoring unit for s390x */
>   static struct pmu cpumf_pmu = {
>   	.task_ctx_nr  = perf_sw_context,
> +	.capabilities = PERF_PMU_CAP_RAW_EVENTS,
>   	.pmu_enable   = cpumf_pmu_enable,
>   	.pmu_disable  = cpumf_pmu_disable,
>   	.event_init   = cpumf_pmu_event_init,
> diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
> index a64b6b056a21..b5b6d8b5d943 100644
> --- a/arch/s390/kernel/perf_pai_crypto.c
> +++ b/arch/s390/kernel/perf_pai_crypto.c
> @@ -569,7 +569,7 @@ static const struct attribute_group *paicrypt_attr_groups[] = {
>   /* Performance monitoring unit for mapped counters */
>   static struct pmu paicrypt = {
>   	.task_ctx_nr  = perf_hw_context,
> -	.capabilities = PERF_PMU_CAP_SAMPLING,
> +	.capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>   	.event_init   = paicrypt_event_init,
>   	.add	      = paicrypt_add,
>   	.del	      = paicrypt_del,
> diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
> index 1261f80c6d52..bcd28c38da70 100644
> --- a/arch/s390/kernel/perf_pai_ext.c
> +++ b/arch/s390/kernel/perf_pai_ext.c
> @@ -595,7 +595,7 @@ static const struct attribute_group *paiext_attr_groups[] = {
>   /* Performance monitoring unit for mapped counters */
>   static struct pmu paiext = {
>   	.task_ctx_nr  = perf_hw_context,
> -	.capabilities = PERF_PMU_CAP_SAMPLING,
> +	.capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>   	.event_init   = paiext_event_init,
>   	.add	      = paiext_add,
>   	.del	      = paiext_del,
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 789dfca2fa67..764728bb80ae 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2697,7 +2697,7 @@ static bool x86_pmu_filter(struct pmu *pmu, int cpu)
>   }
>   
>   static struct pmu pmu = {
> -	.capabilities		= PERF_PMU_CAP_SAMPLING,
> +	.capabilities		= PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>   
>   	.pmu_enable		= x86_pmu_enable,
>   	.pmu_disable		= x86_pmu_disable,
> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
> index 72d8f38d0aa5..bc772a3bf411 100644
> --- a/drivers/perf/arm_pmu.c
> +++ b/drivers/perf/arm_pmu.c
> @@ -877,6 +877,7 @@ struct arm_pmu *armpmu_alloc(void)
>   		 * specific PMU.
>   		 */
>   		.capabilities	= PERF_PMU_CAP_SAMPLING |
> +				  PERF_PMU_CAP_RAW_EVENTS |
>   				  PERF_PMU_CAP_EXTENDED_REGS |
>   				  PERF_PMU_CAP_EXTENDED_HW_TYPE,
>   	};
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 183b7c48b329..c6ad036c0037 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -305,6 +305,7 @@ struct perf_event_pmu_context;
>   #define PERF_PMU_CAP_EXTENDED_HW_TYPE	0x0100
>   #define PERF_PMU_CAP_AUX_PAUSE		0x0200
>   #define PERF_PMU_CAP_AUX_PREFER_LARGE	0x0400
> +#define PERF_PMU_CAP_RAW_EVENTS		0x0800
>   
>   /**
>    * pmu::scope
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 71b2a6730705..2ecee76d2ae2 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -12556,11 +12556,26 @@ static inline bool has_extended_regs(struct perf_event *event)
>   	       (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
>   }
>   
> +static bool is_raw_pmu(const struct pmu *pmu)
> +{
> +	return pmu->type == PERF_TYPE_RAW ||
> +	       pmu->capabilities & PERF_PMU_CAP_RAW_EVENTS;
> +}
> +
>   static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
>   {
>   	struct perf_event_context *ctx = NULL;
>   	int ret;
>   
> +	/*
> +	 * Before touching anything, we can safely skip:
> +	 * - any event for a specific PMU which is not this one
> +	 * - any common event if this PMU doesn't support them
> +	 */
> +	if (event->attr.type != pmu->type &&
> +	    (event->attr.type >= PERF_TYPE_MAX || is_raw_pmu(pmu)))

Ah, that should be "!is_raw_pmu(pmu)" there (although it's not entirely 
the cause of the LKP report on the final patch.)

Thanks,
Robin.

> +		return -ENOENT;
> +
>   	if (!try_module_get(pmu->module))
>   		return -ENODEV;
>
Re: [PATCH 18/19] perf: Introduce positive capability for raw events
Posted by Thomas Richter 1 month, 2 weeks ago
On 8/19/25 15:15, Robin Murphy wrote:
> On 13/08/2025 6:01 pm, Robin Murphy wrote:
>> Only a handful of CPU PMUs accept PERF_TYPE_{RAW,HARDWARE,HW_CACHE}
>> events without registering themselves as PERF_TYPE_RAW in the first
>> place. Add an explicit opt-in for these special cases, so that we can
>> make life easier for every other driver (and probably also speed up the
>> slow-path search) by having perf_try_init_event() do the basic type
>> checking to cover the majority of cases.
>>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>> ---
>>
>> A further possibility is to automatically add the cap to PERF_TYPE_RAW
>> PMUs in perf_pmu_register() to have a single point-of-use condition; I'm
>> undecided...
>> ---
>>   arch/s390/kernel/perf_cpum_cf.c    |  1 +
>>   arch/s390/kernel/perf_pai_crypto.c |  2 +-
>>   arch/s390/kernel/perf_pai_ext.c    |  2 +-
>>   arch/x86/events/core.c             |  2 +-
>>   drivers/perf/arm_pmu.c             |  1 +
>>   include/linux/perf_event.h         |  1 +
>>   kernel/events/core.c               | 15 +++++++++++++++
>>   7 files changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
>> index 1a94e0944bc5..782ab755ddd4 100644
>> --- a/arch/s390/kernel/perf_cpum_cf.c
>> +++ b/arch/s390/kernel/perf_cpum_cf.c
>> @@ -1054,6 +1054,7 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
>>   /* Performance monitoring unit for s390x */
>>   static struct pmu cpumf_pmu = {
>>       .task_ctx_nr  = perf_sw_context,
>> +    .capabilities = PERF_PMU_CAP_RAW_EVENTS,
>>       .pmu_enable   = cpumf_pmu_enable,
>>       .pmu_disable  = cpumf_pmu_disable,
>>       .event_init   = cpumf_pmu_event_init,
>> diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
>> index a64b6b056a21..b5b6d8b5d943 100644
>> --- a/arch/s390/kernel/perf_pai_crypto.c
>> +++ b/arch/s390/kernel/perf_pai_crypto.c
>> @@ -569,7 +569,7 @@ static const struct attribute_group *paicrypt_attr_groups[] = {
>>   /* Performance monitoring unit for mapped counters */
>>   static struct pmu paicrypt = {
>>       .task_ctx_nr  = perf_hw_context,
>> -    .capabilities = PERF_PMU_CAP_SAMPLING,
>> +    .capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>>       .event_init   = paicrypt_event_init,
>>       .add          = paicrypt_add,
>>       .del          = paicrypt_del,
>> diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
>> index 1261f80c6d52..bcd28c38da70 100644
>> --- a/arch/s390/kernel/perf_pai_ext.c
>> +++ b/arch/s390/kernel/perf_pai_ext.c
>> @@ -595,7 +595,7 @@ static const struct attribute_group *paiext_attr_groups[] = {
>>   /* Performance monitoring unit for mapped counters */
>>   static struct pmu paiext = {
>>       .task_ctx_nr  = perf_hw_context,
>> -    .capabilities = PERF_PMU_CAP_SAMPLING,
>> +    .capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>>       .event_init   = paiext_event_init,
>>       .add          = paiext_add,
>>       .del          = paiext_del,
>> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
>> index 789dfca2fa67..764728bb80ae 100644
>> --- a/arch/x86/events/core.c
>> +++ b/arch/x86/events/core.c
>> @@ -2697,7 +2697,7 @@ static bool x86_pmu_filter(struct pmu *pmu, int cpu)
>>   }
>>     static struct pmu pmu = {
>> -    .capabilities        = PERF_PMU_CAP_SAMPLING,
>> +    .capabilities        = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>>         .pmu_enable        = x86_pmu_enable,
>>       .pmu_disable        = x86_pmu_disable,
>> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
>> index 72d8f38d0aa5..bc772a3bf411 100644
>> --- a/drivers/perf/arm_pmu.c
>> +++ b/drivers/perf/arm_pmu.c
>> @@ -877,6 +877,7 @@ struct arm_pmu *armpmu_alloc(void)
>>            * specific PMU.
>>            */
>>           .capabilities    = PERF_PMU_CAP_SAMPLING |
>> +                  PERF_PMU_CAP_RAW_EVENTS |
>>                     PERF_PMU_CAP_EXTENDED_REGS |
>>                     PERF_PMU_CAP_EXTENDED_HW_TYPE,
>>       };
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 183b7c48b329..c6ad036c0037 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -305,6 +305,7 @@ struct perf_event_pmu_context;
>>   #define PERF_PMU_CAP_EXTENDED_HW_TYPE    0x0100
>>   #define PERF_PMU_CAP_AUX_PAUSE        0x0200
>>   #define PERF_PMU_CAP_AUX_PREFER_LARGE    0x0400
>> +#define PERF_PMU_CAP_RAW_EVENTS        0x0800
>>     /**
>>    * pmu::scope
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 71b2a6730705..2ecee76d2ae2 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -12556,11 +12556,26 @@ static inline bool has_extended_regs(struct perf_event *event)
>>              (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
>>   }
>>   +static bool is_raw_pmu(const struct pmu *pmu)
>> +{
>> +    return pmu->type == PERF_TYPE_RAW ||
>> +           pmu->capabilities & PERF_PMU_CAP_RAW_EVENTS;
>> +}
>> +
>>   static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
>>   {
>>       struct perf_event_context *ctx = NULL;
>>       int ret;
>>   +    /*
>> +     * Before touching anything, we can safely skip:
>> +     * - any event for a specific PMU which is not this one
>> +     * - any common event if this PMU doesn't support them
>> +     */
>> +    if (event->attr.type != pmu->type &&
>> +        (event->attr.type >= PERF_TYPE_MAX || is_raw_pmu(pmu)))
> 
> Ah, that should be "!is_raw_pmu(pmu)" there (although it's not entirely the cause of the LKP report on the final patch.)
> 
> Thanks,
> Robin.
> 
>> +        return -ENOENT;
>> +
>>       if (!try_module_get(pmu->module))
>>           return -ENODEV;
>>   
> 
> 

Hi Robin,

what is the intention of that patch?
Can you explain that a bit more.

Thanks.

-- 
Thomas Richter, Dept 3303, IBM s390 Linux Development, Boeblingen, Germany
--
IBM Deutschland Research & Development GmbH

Vorsitzender des Aufsichtsrats: Wolfgang Wendt

Geschäftsführung: David Faller

Sitz der Gesellschaft: Böblingen / Registergericht: Amtsgericht Stuttgart, HRB 243294
Re: [PATCH 18/19] perf: Introduce positive capability for raw events
Posted by Robin Murphy 1 month, 2 weeks ago
Hi Thomas,

On 2025-08-20 9:09 am, Thomas Richter wrote:
> On 8/19/25 15:15, Robin Murphy wrote:
>> On 13/08/2025 6:01 pm, Robin Murphy wrote:
>>> Only a handful of CPU PMUs accept PERF_TYPE_{RAW,HARDWARE,HW_CACHE}
>>> events without registering themselves as PERF_TYPE_RAW in the first
>>> place. Add an explicit opt-in for these special cases, so that we can
>>> make life easier for every other driver (and probably also speed up the
>>> slow-path search) by having perf_try_init_event() do the basic type
>>> checking to cover the majority of cases.
>>>
>>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>>> ---
>>>
>>> A further possibility is to automatically add the cap to PERF_TYPE_RAW
>>> PMUs in perf_pmu_register() to have a single point-of-use condition; I'm
>>> undecided...
>>> ---
>>>    arch/s390/kernel/perf_cpum_cf.c    |  1 +
>>>    arch/s390/kernel/perf_pai_crypto.c |  2 +-
>>>    arch/s390/kernel/perf_pai_ext.c    |  2 +-
>>>    arch/x86/events/core.c             |  2 +-
>>>    drivers/perf/arm_pmu.c             |  1 +
>>>    include/linux/perf_event.h         |  1 +
>>>    kernel/events/core.c               | 15 +++++++++++++++
>>>    7 files changed, 21 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
>>> index 1a94e0944bc5..782ab755ddd4 100644
>>> --- a/arch/s390/kernel/perf_cpum_cf.c
>>> +++ b/arch/s390/kernel/perf_cpum_cf.c
>>> @@ -1054,6 +1054,7 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
>>>    /* Performance monitoring unit for s390x */
>>>    static struct pmu cpumf_pmu = {
>>>        .task_ctx_nr  = perf_sw_context,
>>> +    .capabilities = PERF_PMU_CAP_RAW_EVENTS,
>>>        .pmu_enable   = cpumf_pmu_enable,
>>>        .pmu_disable  = cpumf_pmu_disable,
>>>        .event_init   = cpumf_pmu_event_init,
>>> diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
>>> index a64b6b056a21..b5b6d8b5d943 100644
>>> --- a/arch/s390/kernel/perf_pai_crypto.c
>>> +++ b/arch/s390/kernel/perf_pai_crypto.c
>>> @@ -569,7 +569,7 @@ static const struct attribute_group *paicrypt_attr_groups[] = {
>>>    /* Performance monitoring unit for mapped counters */
>>>    static struct pmu paicrypt = {
>>>        .task_ctx_nr  = perf_hw_context,
>>> -    .capabilities = PERF_PMU_CAP_SAMPLING,
>>> +    .capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>>>        .event_init   = paicrypt_event_init,
>>>        .add          = paicrypt_add,
>>>        .del          = paicrypt_del,
>>> diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
>>> index 1261f80c6d52..bcd28c38da70 100644
>>> --- a/arch/s390/kernel/perf_pai_ext.c
>>> +++ b/arch/s390/kernel/perf_pai_ext.c
>>> @@ -595,7 +595,7 @@ static const struct attribute_group *paiext_attr_groups[] = {
>>>    /* Performance monitoring unit for mapped counters */
>>>    static struct pmu paiext = {
>>>        .task_ctx_nr  = perf_hw_context,
>>> -    .capabilities = PERF_PMU_CAP_SAMPLING,
>>> +    .capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>>>        .event_init   = paiext_event_init,
>>>        .add          = paiext_add,
>>>        .del          = paiext_del,
>>> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
>>> index 789dfca2fa67..764728bb80ae 100644
>>> --- a/arch/x86/events/core.c
>>> +++ b/arch/x86/events/core.c
>>> @@ -2697,7 +2697,7 @@ static bool x86_pmu_filter(struct pmu *pmu, int cpu)
>>>    }
>>>      static struct pmu pmu = {
>>> -    .capabilities        = PERF_PMU_CAP_SAMPLING,
>>> +    .capabilities        = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_RAW_EVENTS,
>>>          .pmu_enable        = x86_pmu_enable,
>>>        .pmu_disable        = x86_pmu_disable,
>>> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
>>> index 72d8f38d0aa5..bc772a3bf411 100644
>>> --- a/drivers/perf/arm_pmu.c
>>> +++ b/drivers/perf/arm_pmu.c
>>> @@ -877,6 +877,7 @@ struct arm_pmu *armpmu_alloc(void)
>>>             * specific PMU.
>>>             */
>>>            .capabilities    = PERF_PMU_CAP_SAMPLING |
>>> +                  PERF_PMU_CAP_RAW_EVENTS |
>>>                      PERF_PMU_CAP_EXTENDED_REGS |
>>>                      PERF_PMU_CAP_EXTENDED_HW_TYPE,
>>>        };
>>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>>> index 183b7c48b329..c6ad036c0037 100644
>>> --- a/include/linux/perf_event.h
>>> +++ b/include/linux/perf_event.h
>>> @@ -305,6 +305,7 @@ struct perf_event_pmu_context;
>>>    #define PERF_PMU_CAP_EXTENDED_HW_TYPE    0x0100
>>>    #define PERF_PMU_CAP_AUX_PAUSE        0x0200
>>>    #define PERF_PMU_CAP_AUX_PREFER_LARGE    0x0400
>>> +#define PERF_PMU_CAP_RAW_EVENTS        0x0800
>>>      /**
>>>     * pmu::scope
>>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>>> index 71b2a6730705..2ecee76d2ae2 100644
>>> --- a/kernel/events/core.c
>>> +++ b/kernel/events/core.c
>>> @@ -12556,11 +12556,26 @@ static inline bool has_extended_regs(struct perf_event *event)
>>>               (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
>>>    }
>>>    +static bool is_raw_pmu(const struct pmu *pmu)
>>> +{
>>> +    return pmu->type == PERF_TYPE_RAW ||
>>> +           pmu->capabilities & PERF_PMU_CAP_RAW_EVENTS;
>>> +}
>>> +
>>>    static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
>>>    {
>>>        struct perf_event_context *ctx = NULL;
>>>        int ret;
>>>    +    /*
>>> +     * Before touching anything, we can safely skip:
>>> +     * - any event for a specific PMU which is not this one
>>> +     * - any common event if this PMU doesn't support them
>>> +     */
>>> +    if (event->attr.type != pmu->type &&
>>> +        (event->attr.type >= PERF_TYPE_MAX || is_raw_pmu(pmu)))
>>
>> Ah, that should be "!is_raw_pmu(pmu)" there (although it's not entirely the cause of the LKP report on the final patch.)
>>
>> Thanks,
>> Robin.
>>
>>> +        return -ENOENT;
>>> +
>>>        if (!try_module_get(pmu->module))
>>>            return -ENODEV;
>>>    
>>
>>
> 
> Hi Robin,
> 
> what is the intention of that patch?
> Can you explain that a bit more.

The background here is that, in this context, we essentially have 3 
distinct categories of PMU driver:

- Older/simpler CPU PMUs which register as PERF_TYPE_RAW and accept 
raw/hardware events
- Newer/heterogeneous CPU PMUs which register as a dynamic type, and 
accept both raw/hardware events and events of their own type
- Other (mostly uncore) PMUs which only accept events of their own type

These days that third one is by far the majority, so it seems 
increasingly unreasonable and inefficient to always offer every kind of 
event to every driver, and so force nearly all of them to have the same 
boilerplate code to refuse events they don't want. The core code is 
already in a position to be able to assume that a PERF_TYPE_RAW PMU 
wants "raw" events and a typed PMU wants its own events, so the only 
actual new thing we need is a way to discern the 5 drivers in the middle 
category - where s390 dominates :) - from the rest in the third.

The way the check itself ends up structured is that the only time we'll 
now offer an event to a driver of a different type is if it's a "raw" 
event and the driver has asked to be offered them (either by registering 
as PERF_TYPE_RAW or with the new cap). Otherwise we can safely assume 
that this PMU won't want this event, and so skip straight to trying the 
next one. We can get away with the single PERF_TYPE_MAX check for all 
"raw" events, since the drivers which do handle them already have to 
consider the exact type to discern between RAW/HARDWARE/HW_CACHE, and 
thus must reject SOFTWARE/TRACEPOINT/BREAKPOINT events anyway, but I 
could of course make that more specific if people prefer. Conversely, 
since the actual software/tracepoint/breakpoint PMUs won't pass the 
is_raw_pmu() check either, and thus will only be given their own events, 
I could remove the type checking from their event_init routines as well, 
but I thought that might be perhaps a little too subtle as-is.

BTW if the s390 drivers are intended to coexist then I'm not sure they 
actually handle sharing PERF_TYPE_RAW events very well - what happens to 
any particular event seems ultimately largely dependent on the order in 
which the drivers happen to register - but that's a pre-existing issue 
and this series shouldn't change anything in that respect. (As it 
similarly shouldn't affect the trick of the first matching driver 
rewriting the event type to "forward" it to another driver later in the 
list.)

Thanks,
Robin.