[PATCH V2 01/15] perf: Fix the throttle logic for a group

kan.liang@linux.intel.com posted 15 patches 8 months, 4 weeks ago
There is a newer version of this series
[PATCH V2 01/15] perf: Fix the throttle logic for a group
Posted by kan.liang@linux.intel.com 8 months, 4 weeks ago
From: Kan Liang <kan.liang@linux.intel.com>

The current throttle logic doesn't work well with a group, e.g., the
following sampling-read case.

$ perf record -e "{cycles,cycles}:S" ...

$ perf report -D | grep THROTTLE | tail -2
            THROTTLE events:        426  ( 9.0%)
          UNTHROTTLE events:        425  ( 9.0%)

$ perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
0 1020120874009167 0x74970 [0x68]: PERF_RECORD_SAMPLE(IP, 0x1):
... sample_read:
.... group nr 2
..... id 0000000000000327, value 000000000cbb993a, lost 0
..... id 0000000000000328, value 00000002211c26df, lost 0

The second cycles event has a much larger value than the first cycles
event in the same group.

The current throttle logic in the generic code only logs the THROTTLE
event. It relies on the specific driver implementation to disable
events. For all ARCHs, the implementation is similar. Only the event is
disabled, rather than the group.

The logic to disable the group should be generic for all ARCHs. Add the
logic in the generic code. The following patch will remove the buggy
driver-specific implementation.

The throttle only happens when an event is overflowed. Stop the entire
group when any event in the group triggers the throttle.
The MAX_INTERRUPTS is set for all throttled events.

The unthrottle could happen in 3 places.
- event/group sched. All events in the group are scheduled one by one.
  All of them will be unthrottled eventually. Nothing needs to be
  changed.
- The perf_adjust_freq_unthr_events for each tick. Needs to restart the
  group altogether.
- The __perf_event_period(). The whole group needs to be restarted
  altogether as well.

With the fix,
$ sudo perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
0 3573470770332 0x12f5f8 [0x70]: PERF_RECORD_SAMPLE(IP, 0x2):
... sample_read:
.... group nr 2
..... id 0000000000000a28, value 00000004fd3dfd8f, lost 0
..... id 0000000000000a29, value 00000004fd3dfd8f, lost 0

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V1:
- Apply the suggested throttle/unthrottle functions from Peter.
  The MAX_INTERRUPTS and throttle logs are applied to all events.
- Update the description and comments accordingly

 kernel/events/core.c | 58 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index a84abc2b7f20..a270fcda766d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2734,6 +2734,39 @@ void perf_event_disable_inatomic(struct perf_event *event)
 static void perf_log_throttle(struct perf_event *event, int enable);
 static void perf_log_itrace_start(struct perf_event *event);
 
+static void perf_event_unthrottle(struct perf_event *event, bool start)
+{
+	event->hw.interrupts = 0;
+	if (start)
+		event->pmu->start(event, 0);
+	perf_log_throttle(event, 1);
+}
+
+static void perf_event_throttle(struct perf_event *event)
+{
+	event->pmu->stop(event, 0);
+	event->hw.interrupts = MAX_INTERRUPTS;
+	perf_log_throttle(event, 0);
+}
+
+static void perf_event_unthrottle_group(struct perf_event *event, bool start)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+
+	perf_event_unthrottle(leader, leader != event || start);
+	for_each_sibling_event(sibling, leader)
+		perf_event_unthrottle(sibling, sibling != event || start);
+}
+
+static void perf_event_throttle_group(struct perf_event *event)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+
+	perf_event_throttle(leader);
+	for_each_sibling_event(sibling, leader)
+		perf_event_throttle(sibling);
+}
+
 static int
 event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
 {
@@ -4389,10 +4422,8 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
 		hwc = &event->hw;
 
 		if (hwc->interrupts == MAX_INTERRUPTS) {
-			hwc->interrupts = 0;
-			perf_log_throttle(event, 1);
-			if (!event->attr.freq || !event->attr.sample_freq)
-				event->pmu->start(event, 0);
+			perf_event_unthrottle_group(event,
+				!event->attr.freq || !event->attr.sample_freq);
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
@@ -6421,14 +6452,6 @@ static void __perf_event_period(struct perf_event *event,
 	active = (event->state == PERF_EVENT_STATE_ACTIVE);
 	if (active) {
 		perf_pmu_disable(event->pmu);
-		/*
-		 * We could be throttled; unthrottle now to avoid the tick
-		 * trying to unthrottle while we already re-started the event.
-		 */
-		if (event->hw.interrupts == MAX_INTERRUPTS) {
-			event->hw.interrupts = 0;
-			perf_log_throttle(event, 1);
-		}
 		event->pmu->stop(event, PERF_EF_UPDATE);
 	}
 
@@ -6436,6 +6459,14 @@ static void __perf_event_period(struct perf_event *event,
 
 	if (active) {
 		event->pmu->start(event, PERF_EF_RELOAD);
+		/*
+		 * Once the period is force-reset, the event starts immediately.
+		 * But the event/group could be throttled. Unthrottle the
+		 * event/group now to avoid the next tick trying to unthrottle
+		 * while we already re-started the event/group.
+		 */
+		if (event->hw.interrupts == MAX_INTERRUPTS)
+			perf_event_unthrottle_group(event, false);
 		perf_pmu_enable(event->pmu);
 	}
 }
@@ -10326,8 +10357,7 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
 	if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
 		__this_cpu_inc(perf_throttled_count);
 		tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
-		hwc->interrupts = MAX_INTERRUPTS;
-		perf_log_throttle(event, 0);
+		perf_event_throttle_group(event);
 		ret = 1;
 	}
 
-- 
2.38.1
Re: [PATCH V2 01/15] perf: Fix the throttle logic for a group
Posted by Leo Yan 8 months, 4 weeks ago
On Wed, May 14, 2025 at 08:13:47AM -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
> 
> The current throttle logic doesn't work well with a group, e.g., the
> following sampling-read case.
> 
> $ perf record -e "{cycles,cycles}:S" ...
> 
> $ perf report -D | grep THROTTLE | tail -2
>             THROTTLE events:        426  ( 9.0%)
>           UNTHROTTLE events:        425  ( 9.0%)
> 
> $ perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
> 0 1020120874009167 0x74970 [0x68]: PERF_RECORD_SAMPLE(IP, 0x1):
> ... sample_read:
> .... group nr 2
> ..... id 0000000000000327, value 000000000cbb993a, lost 0
> ..... id 0000000000000328, value 00000002211c26df, lost 0
> 
> The second cycles event has a much larger value than the first cycles
> event in the same group.
> 
> The current throttle logic in the generic code only logs the THROTTLE
> event. It relies on the specific driver implementation to disable
> events. For all ARCHs, the implementation is similar. Only the event is
> disabled, rather than the group.
> 
> The logic to disable the group should be generic for all ARCHs. Add the
> logic in the generic code. The following patch will remove the buggy
> driver-specific implementation.
> 
> The throttle only happens when an event is overflowed. Stop the entire
> group when any event in the group triggers the throttle.
> The MAX_INTERRUPTS is set to all throttle events.
> 
> The unthrottled could happen in 3 places.
> - event/group sched. All events in the group are scheduled one by one.
>   All of them will be unthrottled eventually. Nothing needs to be
>   changed.
> - The perf_adjust_freq_unthr_events for each tick. Needs to restart the
>   group altogether.
> - The __perf_event_period(). The whole group needs to be restarted
>   altogether as well.
> 
> With the fix,
> $ sudo perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
> 0 3573470770332 0x12f5f8 [0x70]: PERF_RECORD_SAMPLE(IP, 0x2):
> ... sample_read:
> .... group nr 2
> ..... id 0000000000000a28, value 00000004fd3dfd8f, lost 0
> ..... id 0000000000000a29, value 00000004fd3dfd8f, lost 0
> 
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> ---
> 
> Changes since V1:
> - Apply the suggested throttle/unthrottle functions from Peter.
>   The MAX_INTERRUPTS and throttle logs are applied to all events.
> - Update the description and comments accordingly
> 
>  kernel/events/core.c | 58 +++++++++++++++++++++++++++++++++-----------
>  1 file changed, 44 insertions(+), 14 deletions(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index a84abc2b7f20..a270fcda766d 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2734,6 +2734,39 @@ void perf_event_disable_inatomic(struct perf_event *event)
>  static void perf_log_throttle(struct perf_event *event, int enable);
>  static void perf_log_itrace_start(struct perf_event *event);
>  
> +static void perf_event_unthrottle(struct perf_event *event, bool start)
> +{
> +	event->hw.interrupts = 0;
> +	if (start)
> +		event->pmu->start(event, 0);
> +	perf_log_throttle(event, 1);
> +}
> +
> +static void perf_event_throttle(struct perf_event *event)
> +{
> +	event->pmu->stop(event, 0);
> +	event->hw.interrupts = MAX_INTERRUPTS;
> +	perf_log_throttle(event, 0);
> +}
> +
> +static void perf_event_unthrottle_group(struct perf_event *event, bool start)
> +{
> +	struct perf_event *sibling, *leader = event->group_leader;
> +
> +	perf_event_unthrottle(leader, leader != event || start);
> +	for_each_sibling_event(sibling, leader)
> +		perf_event_unthrottle(sibling, sibling != event || start);

Seems to me that the condition "leader != event || start" is bit tricky
(similarly for the check "sibling != event || start").

If a session sets the frequency (with option -F in perf tool), the
following flow is triggered:

  perf_adjust_freq_unthr_events()
    `> perf_event_unthrottle_group(event, false);

The argument "start" is false, so all sibling events will be enabled,
but the event pointed to by the "event" argument remains disabled.  Though
the __perf_event_period() function will enable all events with an adjusted
period, it is still risky for a counting discrepancy caused by the
flow described above.

Thanks,
Leo

> +}
> +
> +static void perf_event_throttle_group(struct perf_event *event)
> +{
> +	struct perf_event *sibling, *leader = event->group_leader;
> +
> +	perf_event_throttle(leader);
> +	for_each_sibling_event(sibling, leader)
> +		perf_event_throttle(sibling);
> +}
> +
>  static int
>  event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
>  {
> @@ -4389,10 +4422,8 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
>  		hwc = &event->hw;
>  
>  		if (hwc->interrupts == MAX_INTERRUPTS) {
> -			hwc->interrupts = 0;
> -			perf_log_throttle(event, 1);
> -			if (!event->attr.freq || !event->attr.sample_freq)
> -				event->pmu->start(event, 0);
> +			perf_event_unthrottle_group(event,
> +				!event->attr.freq || !event->attr.sample_freq);
>  		}
>  
>  		if (!event->attr.freq || !event->attr.sample_freq)
> @@ -6421,14 +6452,6 @@ static void __perf_event_period(struct perf_event *event,
>  	active = (event->state == PERF_EVENT_STATE_ACTIVE);
>  	if (active) {
>  		perf_pmu_disable(event->pmu);
> -		/*
> -		 * We could be throttled; unthrottle now to avoid the tick
> -		 * trying to unthrottle while we already re-started the event.
> -		 */
> -		if (event->hw.interrupts == MAX_INTERRUPTS) {
> -			event->hw.interrupts = 0;
> -			perf_log_throttle(event, 1);
> -		}
>  		event->pmu->stop(event, PERF_EF_UPDATE);
>  	}
>  
> @@ -6436,6 +6459,14 @@ static void __perf_event_period(struct perf_event *event,
>  
>  	if (active) {
>  		event->pmu->start(event, PERF_EF_RELOAD);
> +		/*
> +		 * Once the period is force-reset, the event starts immediately.
> +		 * But the event/group could be throttled. Unthrottle the
> +		 * event/group now to avoid the next tick trying to unthrottle
> +		 * while we already re-started the event/group.
> +		 */
> +		if (event->hw.interrupts == MAX_INTERRUPTS)
> +			perf_event_unthrottle_group(event, false);
>  		perf_pmu_enable(event->pmu);
>  	}
>  }
> @@ -10326,8 +10357,7 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
>  	if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
>  		__this_cpu_inc(perf_throttled_count);
>  		tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
> -		hwc->interrupts = MAX_INTERRUPTS;
> -		perf_log_throttle(event, 0);
> +		perf_event_throttle_group(event);
>  		ret = 1;
>  	}
>  
> -- 
> 2.38.1
> 
>
Re: [PATCH V2 01/15] perf: Fix the throttle logic for a group
Posted by Liang, Kan 8 months, 4 weeks ago

On 2025-05-15 5:43 a.m., Leo Yan wrote:
> On Wed, May 14, 2025 at 08:13:47AM -0700, kan.liang@linux.intel.com wrote:
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> The current throttle logic doesn't work well with a group, e.g., the
>> following sampling-read case.
>>
>> $ perf record -e "{cycles,cycles}:S" ...
>>
>> $ perf report -D | grep THROTTLE | tail -2
>>             THROTTLE events:        426  ( 9.0%)
>>           UNTHROTTLE events:        425  ( 9.0%)
>>
>> $ perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
>> 0 1020120874009167 0x74970 [0x68]: PERF_RECORD_SAMPLE(IP, 0x1):
>> ... sample_read:
>> .... group nr 2
>> ..... id 0000000000000327, value 000000000cbb993a, lost 0
>> ..... id 0000000000000328, value 00000002211c26df, lost 0
>>
>> The second cycles event has a much larger value than the first cycles
>> event in the same group.
>>
>> The current throttle logic in the generic code only logs the THROTTLE
>> event. It relies on the specific driver implementation to disable
>> events. For all ARCHs, the implementation is similar. Only the event is
>> disabled, rather than the group.
>>
>> The logic to disable the group should be generic for all ARCHs. Add the
>> logic in the generic code. The following patch will remove the buggy
>> driver-specific implementation.
>>
>> The throttle only happens when an event is overflowed. Stop the entire
>> group when any event in the group triggers the throttle.
>> The MAX_INTERRUPTS is set to all throttle events.
>>
>> The unthrottled could happen in 3 places.
>> - event/group sched. All events in the group are scheduled one by one.
>>   All of them will be unthrottled eventually. Nothing needs to be
>>   changed.
>> - The perf_adjust_freq_unthr_events for each tick. Needs to restart the
>>   group altogether.
>> - The __perf_event_period(). The whole group needs to be restarted
>>   altogether as well.
>>
>> With the fix,
>> $ sudo perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
>> 0 3573470770332 0x12f5f8 [0x70]: PERF_RECORD_SAMPLE(IP, 0x2):
>> ... sample_read:
>> .... group nr 2
>> ..... id 0000000000000a28, value 00000004fd3dfd8f, lost 0
>> ..... id 0000000000000a29, value 00000004fd3dfd8f, lost 0
>>
>> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>> ---
>>
>> Changes since V1:
>> - Apply the suggested throttle/unthrottle functions from Peter.
>>   The MAX_INTERRUPTS and throttle logs are applied to all events.
>> - Update the description and comments accordingly
>>
>>  kernel/events/core.c | 58 +++++++++++++++++++++++++++++++++-----------
>>  1 file changed, 44 insertions(+), 14 deletions(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index a84abc2b7f20..a270fcda766d 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -2734,6 +2734,39 @@ void perf_event_disable_inatomic(struct perf_event *event)
>>  static void perf_log_throttle(struct perf_event *event, int enable);
>>  static void perf_log_itrace_start(struct perf_event *event);
>>  
>> +static void perf_event_unthrottle(struct perf_event *event, bool start)
>> +{
>> +	event->hw.interrupts = 0;
>> +	if (start)
>> +		event->pmu->start(event, 0);
>> +	perf_log_throttle(event, 1);
>> +}
>> +
>> +static void perf_event_throttle(struct perf_event *event)
>> +{
>> +	event->pmu->stop(event, 0);
>> +	event->hw.interrupts = MAX_INTERRUPTS;
>> +	perf_log_throttle(event, 0);
>> +}
>> +
>> +static void perf_event_unthrottle_group(struct perf_event *event, bool start)
>> +{
>> +	struct perf_event *sibling, *leader = event->group_leader;
>> +
>> +	perf_event_unthrottle(leader, leader != event || start);
>> +	for_each_sibling_event(sibling, leader)
>> +		perf_event_unthrottle(sibling, sibling != event || start);
> 
> Seems to me that the condition "leader != event || start" is bit tricky
> (similarly for the check "sibling != event || start").
> 
> If a session sets the frequency (with option -F in perf tool), the
> following flow is triggered:
> 
>   perf_adjust_freq_unthr_events()
>     `> perf_event_unthrottle_group(event, false);
> 
> The argument "start" is false, so all sibling events will be enabled,
> but the event pointed by the "event" argument remains disabled.  

Right. Because the following code will adjust the period of the event
and start it.
The PMU is disabled at the moment. There is no difference in starting
the leader first or the member first.

> Though
> the __perf_event_period() function will enables all events with adjusted
> period, but it is still risky for counting discrepancy caused by the
> flow described above.

The __perf_event_period() is similar. The event in both cases has to
adjust the period before re-starting the event, which has to be done
outside of the perf_event_unthrottle_group().

Thanks,
Kan>
> Thanks,
> Leo
> 
>> +}
>> +
>> +static void perf_event_throttle_group(struct perf_event *event)
>> +{
>> +	struct perf_event *sibling, *leader = event->group_leader;
>> +
>> +	perf_event_throttle(leader);
>> +	for_each_sibling_event(sibling, leader)
>> +		perf_event_throttle(sibling);
>> +}
>> +
>>  static int
>>  event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
>>  {
>> @@ -4389,10 +4422,8 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
>>  		hwc = &event->hw;
>>  
>>  		if (hwc->interrupts == MAX_INTERRUPTS) {
>> -			hwc->interrupts = 0;
>> -			perf_log_throttle(event, 1);
>> -			if (!event->attr.freq || !event->attr.sample_freq)
>> -				event->pmu->start(event, 0);
>> +			perf_event_unthrottle_group(event,
>> +				!event->attr.freq || !event->attr.sample_freq);
>>  		}
>>  
>>  		if (!event->attr.freq || !event->attr.sample_freq)
>> @@ -6421,14 +6452,6 @@ static void __perf_event_period(struct perf_event *event,
>>  	active = (event->state == PERF_EVENT_STATE_ACTIVE);
>>  	if (active) {
>>  		perf_pmu_disable(event->pmu);
>> -		/*
>> -		 * We could be throttled; unthrottle now to avoid the tick
>> -		 * trying to unthrottle while we already re-started the event.
>> -		 */
>> -		if (event->hw.interrupts == MAX_INTERRUPTS) {
>> -			event->hw.interrupts = 0;
>> -			perf_log_throttle(event, 1);
>> -		}
>>  		event->pmu->stop(event, PERF_EF_UPDATE);
>>  	}
>>  
>> @@ -6436,6 +6459,14 @@ static void __perf_event_period(struct perf_event *event,
>>  
>>  	if (active) {
>>  		event->pmu->start(event, PERF_EF_RELOAD);
>> +		/*
>> +		 * Once the period is force-reset, the event starts immediately.
>> +		 * But the event/group could be throttled. Unthrottle the
>> +		 * event/group now to avoid the next tick trying to unthrottle
>> +		 * while we already re-started the event/group.
>> +		 */
>> +		if (event->hw.interrupts == MAX_INTERRUPTS)
>> +			perf_event_unthrottle_group(event, false);
>>  		perf_pmu_enable(event->pmu);
>>  	}
>>  }
>> @@ -10326,8 +10357,7 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
>>  	if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
>>  		__this_cpu_inc(perf_throttled_count);
>>  		tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
>> -		hwc->interrupts = MAX_INTERRUPTS;
>> -		perf_log_throttle(event, 0);
>> +		perf_event_throttle_group(event);
>>  		ret = 1;
>>  	}
>>  
>> -- 
>> 2.38.1
>>
>>
>
Re: [PATCH V2 01/15] perf: Fix the throttle logic for a group
Posted by Leo Yan 8 months, 3 weeks ago
On Thu, May 15, 2025 at 08:55:05AM -0400, Liang, Kan wrote:

[...]

> >> +static void perf_event_unthrottle_group(struct perf_event *event, bool start)
> >> +{
> >> +	struct perf_event *sibling, *leader = event->group_leader;
> >> +
> >> +	perf_event_unthrottle(leader, leader != event || start);
> >> +	for_each_sibling_event(sibling, leader)
> >> +		perf_event_unthrottle(sibling, sibling != event || start);
> > 
> > Seems to me that the condition "leader != event || start" is bit tricky
> > (similarly for the check "sibling != event || start").
> > 
> > If a session sets the frequency (with option -F in perf tool), the
> > following flow is triggered:
> > 
> >   perf_adjust_freq_unthr_events()
> >     `> perf_event_unthrottle_group(event, false);
> > 
> > The argument "start" is false, so all sibling events will be enabled,
> > but the event pointed by the "event" argument remains disabled.  
> 
> Right. Because the following code will adjust the period of the event
> and start it.
> The PMU is disabled at the moment. There is no difference in starting
> the leader first or the member first.

Thanks for the explanation. In the case above, as you said, all events will
be enabled either in perf_event_unthrottle_group() or in
perf_adjust_freq_unthr_events() with a recalculated period.

Just a minor suggestion. Seems to me, the parameter "start" actually
means "only_enable_sibling". For better readability, the function can be
refined as:

static void perf_event_unthrottle_group(struct perf_event *event,
                                        bool only_enable_sibling)
{
	struct perf_event *sibling, *leader = event->group_leader;

	perf_event_unthrottle(leader,
                only_enable_sibling ? leader != event : true);
        ...
}

Thanks,
Leo
Re: [PATCH V2 01/15] perf: Fix the throttle logic for a group
Posted by Liang, Kan 8 months, 3 weeks ago

On 2025-05-16 8:51 a.m., Leo Yan wrote:
> On Thu, May 15, 2025 at 08:55:05AM -0400, Liang, Kan wrote:
> 
> [...]
> 
>>>> +static void perf_event_unthrottle_group(struct perf_event *event, bool start)
>>>> +{
>>>> +	struct perf_event *sibling, *leader = event->group_leader;
>>>> +
>>>> +	perf_event_unthrottle(leader, leader != event || start);
>>>> +	for_each_sibling_event(sibling, leader)
>>>> +		perf_event_unthrottle(sibling, sibling != event || start);
>>>
>>> Seems to me that the condition "leader != event || start" is bit tricky
>>> (similarly for the check "sibling != event || start").
>>>
>>> If a session sets the frequency (with option -F in perf tool), the
>>> following flow is triggered:
>>>
>>>   perf_adjust_freq_unthr_events()
>>>     `> perf_event_unthrottle_group(event, false);
>>>
>>> The argument "start" is false, so all sibling events will be enabled,
>>> but the event pointed by the "event" argument remains disabled.  
>>
>> Right. Because the following code will adjust the period of the event
>> and start it.
>> The PMU is disabled at the moment. There is no difference in starting
>> the leader first or the member first.
> 
> Thanks for explaination. In the case above, as you said, all events will
> be enabled either in perf_event_unthrottle_group() or in
> perf_adjust_freq_unthr_events() with a recalculated period.
> 
> Just a minor suggestion. Seems to me, the parameter "start" actually
> means "only_enable_sibling". For more readable, the function can be
> refine as:
> 
> static void perf_event_unthrottle_group(struct perf_event *event,
>                                         bool only_enable_sibling)
> {
> 	struct perf_event *sibling, *leader = event->group_leader;
> 
> 	perf_event_unthrottle(leader,
>                 only_enable_sibling ? leader != event : true);
>         ...
> }
> 

It should work for the perf_adjust_freq_unthr_events(), which only start
the leader. But it's possible that the __perf_event_period() update a
sibling, not leader.

I think I can change the name to bool event_has_start.
Is the name OK?

diff --git a/kernel/events/core.c b/kernel/events/core.c
index a270fcda766d..b1cb07fa9c18 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2749,13 +2749,13 @@ static void perf_event_throttle(struct
perf_event *event)
 	perf_log_throttle(event, 0);
 }

-static void perf_event_unthrottle_group(struct perf_event *event, bool
start)
+static void perf_event_unthrottle_group(struct perf_event *event, bool
event_has_start)
 {
 	struct perf_event *sibling, *leader = event->group_leader;

-	perf_event_unthrottle(leader, leader != event || start);
+	perf_event_unthrottle(leader, event_has_start ? leader != event : true);
 	for_each_sibling_event(sibling, leader)
-		perf_event_unthrottle(sibling, sibling != event || start);
+		perf_event_unthrottle(sibling, event_has_start ? sibling != event :
true);
 }

 static void perf_event_throttle_group(struct perf_event *event)
@@ -4423,7 +4423,7 @@ static void perf_adjust_freq_unthr_events(struct
list_head *event_list)

 		if (hwc->interrupts == MAX_INTERRUPTS) {
 			perf_event_unthrottle_group(event,
-				!event->attr.freq || !event->attr.sample_freq);
+				(event->attr.freq && event->attr.sample_freq));
 		}

 		if (!event->attr.freq || !event->attr.sample_freq)
@@ -6466,7 +6466,7 @@ static void __perf_event_period(struct perf_event
*event,
 		 * while we already re-started the event/group.
 		 */
 		if (event->hw.interrupts == MAX_INTERRUPTS)
-			perf_event_unthrottle_group(event, false);
+			perf_event_unthrottle_group(event, true);
 		perf_pmu_enable(event->pmu);
 	}
 }

Thanks,
Kan
Re: [PATCH V2 01/15] perf: Fix the throttle logic for a group
Posted by Leo Yan 8 months, 3 weeks ago
On Fri, May 16, 2025 at 09:28:07AM -0400, Liang, Kan wrote:

[...]

> > Just a minor suggestion. Seems to me, the parameter "start" actually
> > means "only_enable_sibling". For more readable, the function can be
> > refine as:
> > 
> > static void perf_event_unthrottle_group(struct perf_event *event,
> >                                         bool only_enable_sibling)
> > {
> > 	struct perf_event *sibling, *leader = event->group_leader;
> > 
> > 	perf_event_unthrottle(leader,
> >                 only_enable_sibling ? leader != event : true);
> >         ...
> > }
> > 
> 
> It should work for the perf_adjust_freq_unthr_events(), which only start
> the leader.

> But it's possible that the __perf_event_period() update a
> sibling, not leader.

Should not perf_event_unthrottle_group() always enable sibling events?

The only difference is how the leader event to be enabled.  It can be
enabled in perf_event_unthrottle_group() in period mode, or in
frequency mode due to a new period value is generated, the leader
event is enabled in perf_adjust_freq_unthr_events() or in
__perf_event_period().

This is why I suggested renaming the flag to only_enable_sibling:

  true: only enable sibling events
  false: enable all events (leader event and sibling events)

Or, we can rename the flag as "skip_start_event", means to skip
enabling the event specified in the argument.

> I think I can check the name to bool event_has_start.
> Is the name OK?

I am still confused by the naming "event_has_start" :)

What exactly does it mean?

> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index a270fcda766d..b1cb07fa9c18 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2749,13 +2749,13 @@ static void perf_event_throttle(struct
> perf_event *event)
>  	perf_log_throttle(event, 0);
>  }
> 
> -static void perf_event_unthrottle_group(struct perf_event *event, bool
> start)
> +static void perf_event_unthrottle_group(struct perf_event *event, bool
> event_has_start)
>  {
>  	struct perf_event *sibling, *leader = event->group_leader;
> 
> -	perf_event_unthrottle(leader, leader != event || start);
> +	perf_event_unthrottle(leader, event_has_start ? leader != event : true);
>  	for_each_sibling_event(sibling, leader)
> -		perf_event_unthrottle(sibling, sibling != event || start);
> +		perf_event_unthrottle(sibling, event_has_start ? sibling != event :
> true);
>  }
> 
>  static void perf_event_throttle_group(struct perf_event *event)
> @@ -4423,7 +4423,7 @@ static void perf_adjust_freq_unthr_events(struct
> list_head *event_list)
> 
>  		if (hwc->interrupts == MAX_INTERRUPTS) {
>  			perf_event_unthrottle_group(event,
> -				!event->attr.freq || !event->attr.sample_freq);
> +				(event->attr.freq && event->attr.sample_freq));
>  		}
> 
>  		if (!event->attr.freq || !event->attr.sample_freq)
> @@ -6466,7 +6466,7 @@ static void __perf_event_period(struct perf_event
> *event,
>  		 * while we already re-started the event/group.
>  		 */
>  		if (event->hw.interrupts == MAX_INTERRUPTS)
> -			perf_event_unthrottle_group(event, false);
> +			perf_event_unthrottle_group(event, true);
>  		perf_pmu_enable(event->pmu);

The logic in the updated code is correct for me.

Thanks,
Leo
Re: [PATCH V2 01/15] perf: Fix the throttle logic for a group
Posted by Liang, Kan 8 months, 3 weeks ago

On 2025-05-16 10:17 a.m., Leo Yan wrote:
> On Fri, May 16, 2025 at 09:28:07AM -0400, Liang, Kan wrote:
> 
> [...]
> 
>>> Just a minor suggestion. Seems to me, the parameter "start" actually
>>> means "only_enable_sibling". For more readable, the function can be
>>> refine as:
>>>
>>> static void perf_event_unthrottle_group(struct perf_event *event,
>>>                                         bool only_enable_sibling)
>>> {
>>> 	struct perf_event *sibling, *leader = event->group_leader;
>>>
>>> 	perf_event_unthrottle(leader,
>>>                 only_enable_sibling ? leader != event : true);
>>>         ...
>>> }
>>>
>>
>> It should work for the perf_adjust_freq_unthr_events(), which only start
>> the leader.
> 
>> But it's possible that the __perf_event_period() update a
>> sibling, not leader.
> 
> Should not perf_event_unthrottle_group() always enable sibling events?
>

No. __perf_event_period() can reset the period of a sibling event. I
know it sounds weird, but it's doable.


> The only difference is how the leader event to be enabled.  It can be
> enabled in perf_event_unthrottle_group() in period mode, or in
> frequency mode due to a new period value is generated, the leader
> event is enabled in perf_adjust_freq_unthr_events() or in
> __perf_event_period().
> 
> This is why I suggested to rename the flag to only_enable_sibling:
> 
>   true: only enable sibling events
>   false: enable all events (leader event and sibling events)
> 
> Or, we can rename the flag as "skip_start_event", means to skip
> enabling the event specified in the argument.

The name "skip_start_event" sounds good to me. I will use it in V3.

Thanks,
Kan>
>> I think I can check the name to bool event_has_start.
>> Is the name OK?
> 
> I am still confused for the naming "event_has_start" :)
> 
> What exactly does it mean?
> 
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index a270fcda766d..b1cb07fa9c18 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -2749,13 +2749,13 @@ static void perf_event_throttle(struct
>> perf_event *event)
>>  	perf_log_throttle(event, 0);
>>  }
>>
>> -static void perf_event_unthrottle_group(struct perf_event *event, bool
>> start)
>> +static void perf_event_unthrottle_group(struct perf_event *event, bool
>> event_has_start)
>>  {
>>  	struct perf_event *sibling, *leader = event->group_leader;
>>
>> -	perf_event_unthrottle(leader, leader != event || start);
>> +	perf_event_unthrottle(leader, event_has_start ? leader != event : true);
>>  	for_each_sibling_event(sibling, leader)
>> -		perf_event_unthrottle(sibling, sibling != event || start);
>> +		perf_event_unthrottle(sibling, event_has_start ? sibling != event :
>> true);
>>  }
>>
>>  static void perf_event_throttle_group(struct perf_event *event)
>> @@ -4423,7 +4423,7 @@ static void perf_adjust_freq_unthr_events(struct
>> list_head *event_list)
>>
>>  		if (hwc->interrupts == MAX_INTERRUPTS) {
>>  			perf_event_unthrottle_group(event,
>> -				!event->attr.freq || !event->attr.sample_freq);
>> +				(event->attr.freq && event->attr.sample_freq));
>>  		}
>>
>>  		if (!event->attr.freq || !event->attr.sample_freq)
>> @@ -6466,7 +6466,7 @@ static void __perf_event_period(struct perf_event
>> *event,
>>  		 * while we already re-started the event/group.
>>  		 */
>>  		if (event->hw.interrupts == MAX_INTERRUPTS)
>> -			perf_event_unthrottle_group(event, false);
>> +			perf_event_unthrottle_group(event, true);
>>  		perf_pmu_enable(event->pmu);
> 
> The logic in the updated code is correct for me.
> 
> Thanks,
> Leo
>