perf: Rework event_init checks

[PATCH 12/19] perf: Ignore event state for group validation

Posted by Robin Murphy 1 month, 3 weeks ago

It may have been different long ago, but today it seems wrong for these
drivers to skip counting disabled sibling events in group validation,
given that perf_event_enable() could make them schedulable again, and
thus increase the effective size of the group later. Conversely, if a
sibling event is truly dead then it stands to reason that the whole
group is dead, so it's not worth going to any special effort to try to
squeeze in a new event that's never going to run anyway. Thus, we can
simply remove all these checks.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 arch/alpha/kernel/perf_event.c   | 2 +-
 arch/powerpc/perf/core-book3s.c  | 3 +--
 arch/powerpc/perf/core-fsl-emb.c | 3 +--
 arch/sparc/kernel/perf_event.c   | 3 +--
 arch/x86/events/core.c           | 2 +-
 arch/x86/events/intel/uncore.c   | 3 +--
 drivers/dma/idxd/perfmon.c       | 3 +--
 drivers/perf/arm_pmu.c           | 6 ------
 8 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index a3eaab094ece..8557165e64c0 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -352,7 +352,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		current_idx[n++] = PMC_NO_INDEX;
 	}
 	for_each_sibling_event(pe, group) {
-		if (!is_software_event(pe) && pe->state != PERF_EVENT_STATE_OFF) {
+		if (!is_software_event(pe)) {
 			if (n >= max_count)
 				return -1;
 			event[n] = pe;
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 8b0081441f85..d67f7d511f13 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1602,8 +1602,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		events[n++] = group->hw.config;
 	}
 	for_each_sibling_event(event, group) {
-		if (event->pmu->task_ctx_nr == perf_hw_context &&
-		    event->state != PERF_EVENT_STATE_OFF) {
+		if (event->pmu->task_ctx_nr == perf_hw_context) {
 			if (n >= max_count)
 				return -1;
 			ctrs[n] = event;
diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c
index 7120ab20cbfe..509932b91b75 100644
--- a/arch/powerpc/perf/core-fsl-emb.c
+++ b/arch/powerpc/perf/core-fsl-emb.c
@@ -261,8 +261,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		n++;
 	}
 	for_each_sibling_event(event, group) {
-		if (!is_software_event(event) &&
-		    event->state != PERF_EVENT_STATE_OFF) {
+		if (!is_software_event(event)) {
 			if (n >= max_count)
 				return -1;
 			ctrs[n] = event;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index cae4d33002a5..706127749c66 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1357,8 +1357,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		current_idx[n++] = PIC_NO_INDEX;
 	}
 	for_each_sibling_event(event, group) {
-		if (!is_software_event(event) &&
-		    event->state != PERF_EVENT_STATE_OFF) {
+		if (!is_software_event(event)) {
 			if (n >= max_count)
 				return -1;
 			evts[n] = event;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7610f26dfbd9..eca5bb49aa85 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1211,7 +1211,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
 		return n;
 
 	for_each_sibling_event(event, leader) {
-		if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
+		if (!is_x86_event(event))
 			continue;
 
 		if (collect_event(cpuc, event, max_count, n))
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index a762f7f5b161..297ff5adb667 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -406,8 +406,7 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader,
 		return n;
 
 	for_each_sibling_event(event, leader) {
-		if (!is_box_event(box, event) ||
-		    event->state <= PERF_EVENT_STATE_OFF)
+		if (!is_box_event(box, event))
 			continue;
 
 		if (n >= max_count)
diff --git a/drivers/dma/idxd/perfmon.c b/drivers/dma/idxd/perfmon.c
index 4b6af2f15d8a..8c539e1f11da 100644
--- a/drivers/dma/idxd/perfmon.c
+++ b/drivers/dma/idxd/perfmon.c
@@ -75,8 +75,7 @@ static int perfmon_collect_events(struct idxd_pmu *idxd_pmu,
 		return n;
 
 	for_each_sibling_event(event, leader) {
-		if (!is_idxd_event(idxd_pmu, event) ||
-		    event->state <= PERF_EVENT_STATE_OFF)
+		if (!is_idxd_event(idxd_pmu, event))
 			continue;
 
 		if (n >= max_count)
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 5c310e803dd7..e8a3c8e99da0 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -386,12 +386,6 @@ validate_event(struct pmu *pmu, struct pmu_hw_events *hw_events,
 	if (event->pmu != pmu)
 		return 0;
 
-	if (event->state < PERF_EVENT_STATE_OFF)
-		return 1;
-
-	if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec)
-		return 1;
-
 	armpmu = to_arm_pmu(event->pmu);
 	return armpmu->get_event_idx(hw_events, event) >= 0;
 }
-- 
2.39.2.101.g768bb238c484.dirty

Re: [PATCH 12/19] perf: Ignore event state for group validation

Posted by Peter Zijlstra 1 month, 1 week ago

On Wed, Aug 13, 2025 at 06:01:04PM +0100, Robin Murphy wrote:
> It may have been different long ago, but today it seems wrong for these
> drivers to skip counting disabled sibling events in group validation,
> given that perf_event_enable() could make them schedulable again, and
> thus increase the effective size of the group later. Conversely, if a
> sibling event is truly dead then it stands to reason that the whole
> group is dead, so it's not worth going to any special effort to try to
> squeeze in a new event that's never going to run anyway. Thus, we can
> simply remove all these checks.

So currently you can do sort of a manual event rotation inside an
over-sized group and have it work.

I'm not sure if anybody actually does this, but it's possible.

Eg. on a PMU that supports only 4 counters, create a group of 5 and
periodically cycle which of the 5 events is off.

So I'm not against changing this, but changing stuff like this always
makes me a little fearful -- it wouldn't be the first time that when it
finally trickles down to some 'enterprise' user in 5 years someone comes
and finally says, oh hey, you broke my shit :-(

Re: [PATCH 12/19] perf: Ignore event state for group validation

Posted by Robin Murphy 1 month, 1 week ago

On 2025-08-26 2:03 pm, Peter Zijlstra wrote:
> On Wed, Aug 13, 2025 at 06:01:04PM +0100, Robin Murphy wrote:
>> It may have been different long ago, but today it seems wrong for these
>> drivers to skip counting disabled sibling events in group validation,
>> given that perf_event_enable() could make them schedulable again, and
>> thus increase the effective size of the group later. Conversely, if a
>> sibling event is truly dead then it stands to reason that the whole
>> group is dead, so it's not worth going to any special effort to try to
>> squeeze in a new event that's never going to run anyway. Thus, we can
>> simply remove all these checks.
> 
> So currently you can do sort of a manual event rotation inside an
> over-sized group and have it work.
> 
> I'm not sure if anybody actually does this, but its possible.
> 
> Eg. on a PMU that supports only 4 counters, create a group of 5 and
> periodically cycle which of the 5 events is off.
> 
> So I'm not against changing this, but changing stuff like this always
> makes me a little fearful -- it wouldn't be the first time that when it
> finally trickles down to some 'enterprise' user in 5 years someone comes
> and finally says, oh hey, you broke my shit :-(

Eww, I see what you mean... and I guess that's probably lower-overhead 
than actually deleting and recreating the sibling event(s) each time, 
and potentially less bother than wrangling multiple groups for different 
combinations of subsets when one simply must still approximate a complex 
metric that requires more counters than the hardware offers.

I'm also not keen to break anything that wasn't already somewhat broken, 
especially since this patch is only intended as cleanup, so either we 
could just drop it altogether, or perhaps I can wrap the existing 
behaviour in a helper that can at least document this assumption and 
discourage new drivers from copying it. Am I right that only 
PERF_EVENT_STATE_{OFF,ERROR} would matter for this, though, and my 
reasoning for state <= PERF_EVENT_STATE_EXIT should still stand? As for 
the fiddly discrepancy with enable_on_exec between arm_pmu and others 
I'm not really sure what to think...

Thanks,
Robin.

Re: [PATCH 12/19] perf: Ignore event state for group validation

Posted by Ian Rogers 1 month, 1 week ago

On Tue, Aug 26, 2025 at 8:32 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 2025-08-26 2:03 pm, Peter Zijlstra wrote:
> > On Wed, Aug 13, 2025 at 06:01:04PM +0100, Robin Murphy wrote:
> >> It may have been different long ago, but today it seems wrong for these
> >> drivers to skip counting disabled sibling events in group validation,
> >> given that perf_event_enable() could make them schedulable again, and
> >> thus increase the effective size of the group later. Conversely, if a
> >> sibling event is truly dead then it stands to reason that the whole
> >> group is dead, so it's not worth going to any special effort to try to
> >> squeeze in a new event that's never going to run anyway. Thus, we can
> >> simply remove all these checks.
> >
> > So currently you can do sort of a manual event rotation inside an
> > over-sized group and have it work.
> >
> > I'm not sure if anybody actually does this, but its possible.
> >
> > Eg. on a PMU that supports only 4 counters, create a group of 5 and
> > periodically cycle which of the 5 events is off.

I'm not sure this is true, I thought this would fail in the
perf_event_open when adding the 5th event and there being insufficient
counters for the group. Not all PMUs validate a group will fit on the
counters, but I thought at least Intel's core PMU would validate and
not allow this. Fwiw, the metric code is reliant on this behavior as
by default all events are placed into a weak group:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/metricgroup.c?h=perf-tools-next#n631
Weak groups are really just groups that when the perf_event_open fails
retry with the grouping removed. PMUs that don't fail the
perf_event_open are problematic as the reads just report "not counted"
and the metric doesn't work. Sometimes the PMU can't help it due to
errata. There are a bunch of workarounds for those cases carried in
the perf tool, but in general weak groups working is relied upon:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/pmu-events/pmu-events.h?h=perf-tools-next#n16

Thanks,
Ian

> > So I'm not against changing this, but changing stuff like this always
> > makes me a little fearful -- it wouldn't be the first time that when it
> > finally trickles down to some 'enterprise' user in 5 years someone comes
> > and finally says, oh hey, you broke my shit :-(
>
> Eww, I see what you mean... and I guess that's probably lower-overhead
> than actually deleting and recreating the sibling event(s) each time,
> and potentially less bother then wrangling multiple groups for different
> combinations of subsets when one simply must still approximate a complex
> metric that requires more counters than the hardware offers.
>
> I'm also not keen to break anything that wasn't already somewhat broken,
> especially since this patch is only intended as cleanup, so either we
> could just drop it altogether, or perhaps I can wrap the existing
> behaviour in a helper that can at least document this assumption and
> discourage new drivers from copying it. Am I right that only
> PERF_EVENT_STATE_{OFF,ERROR} would matter for this, though, and my
> reasoning for state <= PERF_EVENT_STATE_EXIT should still stand? As for
> the fiddly discrepancy with enable_on_exec between arm_pmu and others
> I'm not really sure what to think...
>
> Thanks,
> Robin.

Re: [PATCH 12/19] perf: Ignore event state for group validation

Posted by Mark Rutland 1 month, 1 week ago

On Tue, Aug 26, 2025 at 11:48:48AM -0700, Ian Rogers wrote:
> On Tue, Aug 26, 2025 at 8:32 AM Robin Murphy <robin.murphy@arm.com> wrote:
> >
> > On 2025-08-26 2:03 pm, Peter Zijlstra wrote:
> > > On Wed, Aug 13, 2025 at 06:01:04PM +0100, Robin Murphy wrote:
> > >> It may have been different long ago, but today it seems wrong for these
> > >> drivers to skip counting disabled sibling events in group validation,
> > >> given that perf_event_enable() could make them schedulable again, and
> > >> thus increase the effective size of the group later. Conversely, if a
> > >> sibling event is truly dead then it stands to reason that the whole
> > >> group is dead, so it's not worth going to any special effort to try to
> > >> squeeze in a new event that's never going to run anyway. Thus, we can
> > >> simply remove all these checks.
> > >
> > > So currently you can do sort of a manual event rotation inside an
> > > over-sized group and have it work.
> > >
> > > I'm not sure if anybody actually does this, but its possible.
> > >
> > > Eg. on a PMU that supports only 4 counters, create a group of 5 and
> > > periodically cycle which of the 5 events is off.
> 
> I'm not sure this is true, I thought this would fail in the
> perf_event_open when adding the 5th event and there being insufficient
> counters for the group.

We're talking specifically about cases where the logic in a pmu's
pmu::event_init() callback doesn't count events in specific states, and
hence the 5th event doesn't get rejected when it is initialised.

For example, in arch/x86/events/core.c, validate_group() uses
collect_events(), which has:

	for_each_sibling_event(event, leader) {
		if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (collect_event(cpuc, event, max_count, n))
			return -EINVAL;

		n++;
	}

... and so where an event's state is <= PERF_EVENT_STATE_OFF at init
time, that event is not counted to see if it fits into HW counters.

Mark.

Re: [PATCH 12/19] perf: Ignore event state for group validation

Posted by Ian Rogers 1 month, 1 week ago

On Wed, Aug 27, 2025 at 1:18 AM Mark Rutland <mark.rutland@arm.com> wrote:
>
> On Tue, Aug 26, 2025 at 11:48:48AM -0700, Ian Rogers wrote:
> > On Tue, Aug 26, 2025 at 8:32 AM Robin Murphy <robin.murphy@arm.com> wrote:
> > >
> > > On 2025-08-26 2:03 pm, Peter Zijlstra wrote:
> > > > On Wed, Aug 13, 2025 at 06:01:04PM +0100, Robin Murphy wrote:
> > > >> It may have been different long ago, but today it seems wrong for these
> > > >> drivers to skip counting disabled sibling events in group validation,
> > > >> given that perf_event_enable() could make them schedulable again, and
> > > >> thus increase the effective size of the group later. Conversely, if a
> > > >> sibling event is truly dead then it stands to reason that the whole
> > > >> group is dead, so it's not worth going to any special effort to try to
> > > >> squeeze in a new event that's never going to run anyway. Thus, we can
> > > >> simply remove all these checks.
> > > >
> > > > So currently you can do sort of a manual event rotation inside an
> > > > over-sized group and have it work.
> > > >
> > > > I'm not sure if anybody actually does this, but its possible.
> > > >
> > > > Eg. on a PMU that supports only 4 counters, create a group of 5 and
> > > > periodically cycle which of the 5 events is off.
> >
> > I'm not sure this is true, I thought this would fail in the
> > perf_event_open when adding the 5th event and there being insufficient
> > counters for the group.
>
> We're talking specifically about cases where the logic in a pmu's
> pmu::event_init() callback doesn't count events in specific states, and
> hence the 5th even doesn't get rejected when it is initialised.
>
> For example, in arch/x86/events/core.c, validate_group() uses
> collect_events(), which has:
>
>         for_each_sibling_event(event, leader) {
>                 if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
>                         continue;
>
>                 if (collect_event(cpuc, event, max_count, n))
>                         return -EINVAL;
>
>                 n++;
>         }
>
> ... and so where an event's state is <= PERF_EVENT_STATE_OFF at init
> time, that event is not counted to see if it fits into HW counters.

Hmm.. Thinking out loud. So it looked like perf with weak groups could
be broken then:
```
$ sudo perf stat -vv -e '{instructions,cycles}:W' true
...
perf_event_attr:
 type                             0 (PERF_TYPE_HARDWARE)
 size                             136
 config                           0x400000001
(cpu_core/PERF_COUNT_HW_INSTRUCTIONS/)
 sample_type                      IDENTIFIER
 read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING|ID|GROUP
 disabled                         1
 inherit                          1
 enable_on_exec                   1
------------------------------------------------------------
sys_perf_event_open: pid 3337764  cpu -1  group_fd -1  flags 0x8 = 5
------------------------------------------------------------
perf_event_attr:
 type                             0 (PERF_TYPE_HARDWARE)
 size                             136
 config                           0x400000000
(cpu_core/PERF_COUNT_HW_CPU_CYCLES/)
 sample_type                      IDENTIFIER
 read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING|ID|GROUP
 inherit                          1
------------------------------------------------------------
sys_perf_event_open: pid 3337764  cpu -1  group_fd 5  flags 0x8 = 7
...
```
Note, the group leader (instructions) is disabled because of:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/stat.c?h=perf-tools-next#n761
```
/*
* Disabling all counters initially, they will be enabled
* either manually by us or by kernel via enable_on_exec
* set later.
*/
if (evsel__is_group_leader(evsel)) {
        attr->disabled = 1;
```
but the checking of being disabled (PERF_EVENT_STATE_OFF) is only done
on siblings in the code you show above. So yes, you can disable the
group events to allow the perf_event_open to succeed but not on the
leader which is always checked (no PERF_EVENT_STATE_OFF check):
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/arch/x86/events/core.c?h=perf-tools-next#n1204
```
if (is_x86_event(leader)) {
        if (collect_event(cpuc, leader, max_count, n))
                return -EINVAL;
```

Thanks,
Ian