From: Kan Liang <kan.liang@linux.intel.com>
The current throttle logic doesn't work well with a group, e.g., the
following sampling-read case.
$ perf record -e "{cycles,cycles}:S" ...
$ perf report -D | grep THROTTLE | tail -2
THROTTLE events: 426 ( 9.0%)
UNTHROTTLE events: 425 ( 9.0%)
$ perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
0 1020120874009167 0x74970 [0x68]: PERF_RECORD_SAMPLE(IP, 0x1):
... sample_read:
.... group nr 2
..... id 0000000000000327, value 000000000cbb993a, lost 0
..... id 0000000000000328, value 00000002211c26df, lost 0
The second cycles event has a much larger value than the first cycles
event in the same group.
The current throttle logic in the generic code only logs the THROTTLE
event. It relies on the specific driver implementation to disable
events. For all ARCHs, the implementation is similar. Only the event is
disabled, rather than the group.
The logic to disable the group should be generic for all ARCHs. Add the
logic in the generic code. The following patch will remove the buggy
driver-specific implementation.
The throttle only happens when an event is overflowed. Stop the entire
group when any event in the group triggers the throttle.
The MAX_INTERRUPTS is set to all throttle events.
The unthrottled could happen in 3 places.
- event/group sched. All events in the group are scheduled one by one.
All of them will be unthrottled eventually. Nothing needs to be
changed.
- The perf_adjust_freq_unthr_events for each tick. Needs to restart the
group altogether.
- The __perf_event_period(). The whole group needs to be restarted
altogether as well.
With the fix,
$ sudo perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
0 3573470770332 0x12f5f8 [0x70]: PERF_RECORD_SAMPLE(IP, 0x2):
... sample_read:
.... group nr 2
..... id 0000000000000a28, value 00000004fd3dfd8f, lost 0
..... id 0000000000000a29, value 00000004fd3dfd8f, lost 0
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
kernel/events/core.c | 66 ++++++++++++++++++++++++++++++--------------
1 file changed, 46 insertions(+), 20 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index af78ec118e8f..915698f47682 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2739,6 +2739,39 @@ void perf_event_disable_inatomic(struct perf_event *event)
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
+static void perf_event_unthrottle(struct perf_event *event, bool start)
+{
+ event->hw.interrupts = 0;
+ if (start)
+ event->pmu->start(event, 0);
+ perf_log_throttle(event, 1);
+}
+
+static void perf_event_throttle(struct perf_event *event)
+{
+ event->pmu->stop(event, 0);
+ event->hw.interrupts = MAX_INTERRUPTS;
+ perf_log_throttle(event, 0);
+}
+
+static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_unthrottle(leader, skip_start_event ? leader != event : true);
+ for_each_sibling_event(sibling, leader)
+ perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true);
+}
+
+static void perf_event_throttle_group(struct perf_event *event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_throttle(leader);
+ for_each_sibling_event(sibling, leader)
+ perf_event_throttle(sibling);
+}
+
static int
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -2767,10 +2800,8 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
* ticks already, also for a heavily scheduling task there is little
* guarantee it'll get a tick in a timely manner.
*/
- if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
- perf_log_throttle(event, 1);
- event->hw.interrupts = 0;
- }
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS))
+ perf_event_unthrottle(event, false);
perf_pmu_disable(event->pmu);
@@ -4393,12 +4424,8 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
hwc = &event->hw;
- if (hwc->interrupts == MAX_INTERRUPTS) {
- hwc->interrupts = 0;
- perf_log_throttle(event, 1);
- if (!is_event_in_freq_mode(event))
- event->pmu->start(event, 0);
- }
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, is_event_in_freq_mode(event));
if (!is_event_in_freq_mode(event))
continue;
@@ -6426,14 +6453,6 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
perf_pmu_disable(event->pmu);
- /*
- * We could be throttled; unthrottle now to avoid the tick
- * trying to unthrottle while we already re-started the event.
- */
- if (event->hw.interrupts == MAX_INTERRUPTS) {
- event->hw.interrupts = 0;
- perf_log_throttle(event, 1);
- }
event->pmu->stop(event, PERF_EF_UPDATE);
}
@@ -6441,6 +6460,14 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
+ /*
+ * Once the period is force-reset, the event starts immediately.
+ * But the event/group could be throttled. Unthrottle the
+ * event/group now to avoid the next tick trying to unthrottle
+ * while we already re-started the event/group.
+ */
+ if (event->hw.interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, true);
perf_pmu_enable(event->pmu);
}
}
@@ -10331,8 +10358,7 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- hwc->interrupts = MAX_INTERRUPTS;
- perf_log_throttle(event, 0);
+ perf_event_throttle_group(event);
ret = 1;
}
--
2.38.1
Hi Kan,
[ + Aishwarya ]
On Tue, May 20, 2025 at 11:16:29AM -0700, kan.liang@linux.intel.com wrote:
[...]
> @@ -10331,8 +10358,7 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
> if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
> __this_cpu_inc(perf_throttled_count);
> tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
> - hwc->interrupts = MAX_INTERRUPTS;
> - perf_log_throttle(event, 0);
> + perf_event_throttle_group(event);
> ret = 1;
> }
Our (Arm) CI reports an RCU stall caused by this patch. I can use a
simple command to trigger system stuck with cpu-clock:
perf record -a -e cpu-clock -- sleep 2
I confirmed that if the throttling code for the cpu-clock event is
removed, the issue is dismissed. Based on reading the code, the flow is below:
hrtimer interrupt:
`> __perf_event_account_interrupt()
`> perf_event_throttle_group()
`> perf_event_throttle()
`> cpu_clock_event_stop()
`> perf_swevent_cancel_hrtimer()
`> hrtimer_cancel() -> Infinite loop.
In the hrtimer interrupt handler, it tries to cancel itself and causes
an infinite loop. Please consider fixing the issue.
Thanks,
Leo
On 2025-05-27 12:16 p.m., Leo Yan wrote:
> Hi Kan,
>
> [ + Aishwarya ]
>
> On Tue, May 20, 2025 at 11:16:29AM -0700, kan.liang@linux.intel.com wrote:
>
> [...]
>
>> @@ -10331,8 +10358,7 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
>> if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
>> __this_cpu_inc(perf_throttled_count);
>> tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
>> - hwc->interrupts = MAX_INTERRUPTS;
>> - perf_log_throttle(event, 0);
>> + perf_event_throttle_group(event);
>> ret = 1;
>> }
>
> Our (Arm) CI reports RCU stall that caused by this patch. I can use a
> simple command to trigger system stuck with cpu-clock:
>
> perf record -a -e cpu-clock -- sleep 2
>
> I confirmed that if removing throttling code for cpu-clock event, then
> the issue can be dimissed. Based on reading code, the flow below:
>
> hrtimer interrupt:
> `> __perf_event_account_interrupt()
> `> perf_event_throttle_group()
> `> perf_event_throttle()
> `> cpu_clock_event_stop()
> `> perf_swevent_cancel_hrtimer()
> `> hrtimer_cancel() -> Inifite loop.
>
> In the hrtimer interrupt handler, it tries to cancel itself and causes
> inifite loop. Please consider to fix the issue.
>
The cpu-clock and task_clock are two special SW events, which rely on
the hrtimer. I missed them when checking the SW events. :(
For the two events, instead of invoking the stop(), the
HRTIMER_NORESTART is returned to stop the timer. Invoking the stop()
causes the issue.
There may be two ways to fix it.
- Add a check of MAX_INTERRUPTS in the event_stop. Return immediately if
the stop is invoked by the throttle.
- Introduce a PMU flag to track the case. Avoid the event_stop in
perf_event_throttle() if the flag is detected.
The latter looks more generic. It may be used if there are other cases
that want to avoid the stop. So the latter is implemented as below.
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 947ad12dfdbe..66f02f46595c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -303,6 +303,7 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_AUX_OUTPUT 0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
#define PERF_PMU_CAP_AUX_PAUSE 0x0200
+#define PERF_PMU_CAP_NO_THROTTLE_STOP 0x0400
/**
* pmu::scope
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8327ab0ee641..596597886d96 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2655,7 +2655,8 @@ static void perf_event_unthrottle(struct
perf_event *event, bool start)
static void perf_event_throttle(struct perf_event *event)
{
- event->pmu->stop(event, 0);
+ if (!(event->pmu->capabilities & PERF_PMU_CAP_NO_THROTTLE_STOP))
+ event->pmu->stop(event, 0);
event->hw.interrupts = MAX_INTERRUPTS;
perf_log_throttle(event, 0);
}
@@ -11846,7 +11847,8 @@ static int cpu_clock_event_init(struct
perf_event *event)
static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
- .capabilities = PERF_PMU_CAP_NO_NMI,
+ .capabilities = PERF_PMU_CAP_NO_NMI |
+ PERF_PMU_CAP_NO_THROTTLE_STOP,
.dev = PMU_NULL_DEV,
.event_init = cpu_clock_event_init,
@@ -11928,7 +11930,8 @@ static int task_clock_event_init(struct
perf_event *event)
static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
- .capabilities = PERF_PMU_CAP_NO_NMI,
+ .capabilities = PERF_PMU_CAP_NO_NMI |
+ PERF_PMU_CAP_NO_THROTTLE_STOP,
.dev = PMU_NULL_DEV,
.event_init = task_clock_event_init,
Thanks,
Kan
On Tue, May 27, 2025 at 03:30:06PM -0400, Liang, Kan wrote:
[...]
> There may be two ways to fix it.
> - Add a check of MAX_INTERRUPTS in the event_stop. Return immediately if
> the stop is invoked by the throttle.
> - Introduce a PMU flag to track the case. Avoid the event_stop in
> perf_event_throttle() if the flag is detected.
>
> The latter looks more generic. It may be used if there are other cases
> that want to avoid the stop. So the latter is implemented as below.
Yes. I agree the fix below is more general, and I have confirmed it
fixes the observed issue.
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 947ad12dfdbe..66f02f46595c 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -303,6 +303,7 @@ struct perf_event_pmu_context;
> #define PERF_PMU_CAP_AUX_OUTPUT 0x0080
> #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
> #define PERF_PMU_CAP_AUX_PAUSE 0x0200
> +#define PERF_PMU_CAP_NO_THROTTLE_STOP 0x0400
>
> /**
> * pmu::scope
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 8327ab0ee641..596597886d96 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2655,7 +2655,8 @@ static void perf_event_unthrottle(struct
> perf_event *event, bool start)
>
> static void perf_event_throttle(struct perf_event *event)
> {
> - event->pmu->stop(event, 0);
> + if (!(event->pmu->capabilities & PERF_PMU_CAP_NO_THROTTLE_STOP))
> + event->pmu->stop(event, 0);
Some background info: even if a PMU event is not stopped when
throttling, we still need to re-enable it. This is why we don't do
any particular handling for PERF_PMU_CAP_NO_THROTTLE_STOP in
perf_event_unthrottle().
Maybe it deserves a comment for easier understanding.
Thanks,
Leo
> event->hw.interrupts = MAX_INTERRUPTS;
> perf_log_throttle(event, 0);
> }
> @@ -11846,7 +11847,8 @@ static int cpu_clock_event_init(struct
> perf_event *event)
> static struct pmu perf_cpu_clock = {
> .task_ctx_nr = perf_sw_context,
>
> - .capabilities = PERF_PMU_CAP_NO_NMI,
> + .capabilities = PERF_PMU_CAP_NO_NMI |
> + PERF_PMU_CAP_NO_THROTTLE_STOP,
> .dev = PMU_NULL_DEV,
>
> .event_init = cpu_clock_event_init,
> @@ -11928,7 +11930,8 @@ static int task_clock_event_init(struct
> perf_event *event)
> static struct pmu perf_task_clock = {
> .task_ctx_nr = perf_sw_context,
>
> - .capabilities = PERF_PMU_CAP_NO_NMI,
> + .capabilities = PERF_PMU_CAP_NO_NMI |
> + PERF_PMU_CAP_NO_THROTTLE_STOP,
> .dev = PMU_NULL_DEV,
>
> .event_init = task_clock_event_init,
>
>
> Thanks,
> Kan
>
>
On 2025-05-28 6:28 a.m., Leo Yan wrote:
> On Tue, May 27, 2025 at 03:30:06PM -0400, Liang, Kan wrote:
>
> [...]
>
>> There may be two ways to fix it.
>> - Add a check of MAX_INTERRUPTS in the event_stop. Return immediately if
>> the stop is invoked by the throttle.
>> - Introduce a PMU flag to track the case. Avoid the event_stop in
>> perf_event_throttle() if the flag is detected.
>>
>> The latter looks more generic. It may be used if there are other cases
>> that want to avoid the stop. So the latter is implemented as below.
>
> Yes. I agreed the fix below is more general and confirmed it can fix
> the observed issue.
>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 947ad12dfdbe..66f02f46595c 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -303,6 +303,7 @@ struct perf_event_pmu_context;
>> #define PERF_PMU_CAP_AUX_OUTPUT 0x0080
>> #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
>> #define PERF_PMU_CAP_AUX_PAUSE 0x0200
>> +#define PERF_PMU_CAP_NO_THROTTLE_STOP 0x0400
>>
>> /**
>> * pmu::scope
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 8327ab0ee641..596597886d96 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -2655,7 +2655,8 @@ static void perf_event_unthrottle(struct
>> perf_event *event, bool start)
>>
>> static void perf_event_throttle(struct perf_event *event)
>> {
>> - event->pmu->stop(event, 0);
>> + if (!(event->pmu->capabilities & PERF_PMU_CAP_NO_THROTTLE_STOP))
>> + event->pmu->stop(event, 0);
>
> A background info is that even a PMU event is not stopped when
> throttling, we still need to re-enable it. This is why we don't do
> particualy handling for PERF_PMU_CAP_NO_THROTTLE_STOP in
> perf_event_unthrottle().
>
> Maybe it is deserved add a comment for easier understanding.
Sure. A formal patch has been sent. Please take a look.
https://lore.kernel.org/lkml/20250528144823.2996185-1-kan.liang@linux.intel.com/
Thanks,
Kan>
> Thanks,
> Leo
>
>> event->hw.interrupts = MAX_INTERRUPTS;
>> perf_log_throttle(event, 0);
>> }
>> @@ -11846,7 +11847,8 @@ static int cpu_clock_event_init(struct
>> perf_event *event)
>> static struct pmu perf_cpu_clock = {
>> .task_ctx_nr = perf_sw_context,
>>
>> - .capabilities = PERF_PMU_CAP_NO_NMI,
>> + .capabilities = PERF_PMU_CAP_NO_NMI |
>> + PERF_PMU_CAP_NO_THROTTLE_STOP,
>> .dev = PMU_NULL_DEV,
>>
>> .event_init = cpu_clock_event_init,
>> @@ -11928,7 +11930,8 @@ static int task_clock_event_init(struct
>> perf_event *event)
>> static struct pmu perf_task_clock = {
>> .task_ctx_nr = perf_sw_context,
>>
>> - .capabilities = PERF_PMU_CAP_NO_NMI,
>> + .capabilities = PERF_PMU_CAP_NO_NMI |
>> + PERF_PMU_CAP_NO_THROTTLE_STOP,
>> .dev = PMU_NULL_DEV,
>>
>> .event_init = task_clock_event_init,
>>
>>
>> Thanks,
>> Kan
>>
>>
>
On Tue, May 20, 2025 at 11:16:29AM -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> The current throttle logic doesn't work well with a group, e.g., the
> following sampling-read case.
>
> $ perf record -e "{cycles,cycles}:S" ...
>
> $ perf report -D | grep THROTTLE | tail -2
> THROTTLE events: 426 ( 9.0%)
> UNTHROTTLE events: 425 ( 9.0%)
>
> $ perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
> 0 1020120874009167 0x74970 [0x68]: PERF_RECORD_SAMPLE(IP, 0x1):
> ... sample_read:
> .... group nr 2
> ..... id 0000000000000327, value 000000000cbb993a, lost 0
> ..... id 0000000000000328, value 00000002211c26df, lost 0
>
> The second cycles event has a much larger value than the first cycles
> event in the same group.
>
> The current throttle logic in the generic code only logs the THROTTLE
> event. It relies on the specific driver implementation to disable
> events. For all ARCHs, the implementation is similar. Only the event is
> disabled, rather than the group.
>
> The logic to disable the group should be generic for all ARCHs. Add the
> logic in the generic code. The following patch will remove the buggy
> driver-specific implementation.
>
> The throttle only happens when an event is overflowed. Stop the entire
> group when any event in the group triggers the throttle.
> The MAX_INTERRUPTS is set to all throttle events.
>
> The unthrottled could happen in 3 places.
> - event/group sched. All events in the group are scheduled one by one.
> All of them will be unthrottled eventually. Nothing needs to be
> changed.
> - The perf_adjust_freq_unthr_events for each tick. Needs to restart the
> group altogether.
> - The __perf_event_period(). The whole group needs to be restarted
> altogether as well.
>
> With the fix,
> $ sudo perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
> 0 3573470770332 0x12f5f8 [0x70]: PERF_RECORD_SAMPLE(IP, 0x2):
> ... sample_read:
> .... group nr 2
> ..... id 0000000000000a28, value 00000004fd3dfd8f, lost 0
> ..... id 0000000000000a29, value 00000004fd3dfd8f, lost 0
>
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Thanks,
Namhyung
> ---
> kernel/events/core.c | 66 ++++++++++++++++++++++++++++++--------------
> 1 file changed, 46 insertions(+), 20 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index af78ec118e8f..915698f47682 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2739,6 +2739,39 @@ void perf_event_disable_inatomic(struct perf_event *event)
> static void perf_log_throttle(struct perf_event *event, int enable);
> static void perf_log_itrace_start(struct perf_event *event);
>
> +static void perf_event_unthrottle(struct perf_event *event, bool start)
> +{
> + event->hw.interrupts = 0;
> + if (start)
> + event->pmu->start(event, 0);
> + perf_log_throttle(event, 1);
> +}
> +
> +static void perf_event_throttle(struct perf_event *event)
> +{
> + event->pmu->stop(event, 0);
> + event->hw.interrupts = MAX_INTERRUPTS;
> + perf_log_throttle(event, 0);
> +}
> +
> +static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event)
> +{
> + struct perf_event *sibling, *leader = event->group_leader;
> +
> + perf_event_unthrottle(leader, skip_start_event ? leader != event : true);
> + for_each_sibling_event(sibling, leader)
> + perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true);
> +}
> +
> +static void perf_event_throttle_group(struct perf_event *event)
> +{
> + struct perf_event *sibling, *leader = event->group_leader;
> +
> + perf_event_throttle(leader);
> + for_each_sibling_event(sibling, leader)
> + perf_event_throttle(sibling);
> +}
> +
> static int
> event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
> {
> @@ -2767,10 +2800,8 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
> * ticks already, also for a heavily scheduling task there is little
> * guarantee it'll get a tick in a timely manner.
> */
> - if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
> - perf_log_throttle(event, 1);
> - event->hw.interrupts = 0;
> - }
> + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS))
> + perf_event_unthrottle(event, false);
>
> perf_pmu_disable(event->pmu);
>
> @@ -4393,12 +4424,8 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
>
> hwc = &event->hw;
>
> - if (hwc->interrupts == MAX_INTERRUPTS) {
> - hwc->interrupts = 0;
> - perf_log_throttle(event, 1);
> - if (!is_event_in_freq_mode(event))
> - event->pmu->start(event, 0);
> - }
> + if (hwc->interrupts == MAX_INTERRUPTS)
> + perf_event_unthrottle_group(event, is_event_in_freq_mode(event));
>
> if (!is_event_in_freq_mode(event))
> continue;
> @@ -6426,14 +6453,6 @@ static void __perf_event_period(struct perf_event *event,
> active = (event->state == PERF_EVENT_STATE_ACTIVE);
> if (active) {
> perf_pmu_disable(event->pmu);
> - /*
> - * We could be throttled; unthrottle now to avoid the tick
> - * trying to unthrottle while we already re-started the event.
> - */
> - if (event->hw.interrupts == MAX_INTERRUPTS) {
> - event->hw.interrupts = 0;
> - perf_log_throttle(event, 1);
> - }
> event->pmu->stop(event, PERF_EF_UPDATE);
> }
>
> @@ -6441,6 +6460,14 @@ static void __perf_event_period(struct perf_event *event,
>
> if (active) {
> event->pmu->start(event, PERF_EF_RELOAD);
> + /*
> + * Once the period is force-reset, the event starts immediately.
> + * But the event/group could be throttled. Unthrottle the
> + * event/group now to avoid the next tick trying to unthrottle
> + * while we already re-started the event/group.
> + */
> + if (event->hw.interrupts == MAX_INTERRUPTS)
> + perf_event_unthrottle_group(event, true);
> perf_pmu_enable(event->pmu);
> }
> }
> @@ -10331,8 +10358,7 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
> if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
> __this_cpu_inc(perf_throttled_count);
> tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
> - hwc->interrupts = MAX_INTERRUPTS;
> - perf_log_throttle(event, 0);
> + perf_event_throttle_group(event);
> ret = 1;
> }
>
> --
> 2.38.1
>
On Tue, May 20, 2025 at 11:16:29AM -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> The current throttle logic doesn't work well with a group, e.g., the
> following sampling-read case.
>
> $ perf record -e "{cycles,cycles}:S" ...
>
> $ perf report -D | grep THROTTLE | tail -2
> THROTTLE events: 426 ( 9.0%)
> UNTHROTTLE events: 425 ( 9.0%)
>
> $ perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
> 0 1020120874009167 0x74970 [0x68]: PERF_RECORD_SAMPLE(IP, 0x1):
> ... sample_read:
> .... group nr 2
> ..... id 0000000000000327, value 000000000cbb993a, lost 0
> ..... id 0000000000000328, value 00000002211c26df, lost 0
>
> The second cycles event has a much larger value than the first cycles
> event in the same group.
>
> The current throttle logic in the generic code only logs the THROTTLE
> event. It relies on the specific driver implementation to disable
> events. For all ARCHs, the implementation is similar. Only the event is
> disabled, rather than the group.
>
> The logic to disable the group should be generic for all ARCHs. Add the
> logic in the generic code. The following patch will remove the buggy
> driver-specific implementation.
>
> The throttle only happens when an event is overflowed. Stop the entire
> group when any event in the group triggers the throttle.
> The MAX_INTERRUPTS is set to all throttle events.
>
> The unthrottled could happen in 3 places.
> - event/group sched. All events in the group are scheduled one by one.
> All of them will be unthrottled eventually. Nothing needs to be
> changed.
> - The perf_adjust_freq_unthr_events for each tick. Needs to restart the
> group altogether.
> - The __perf_event_period(). The whole group needs to be restarted
> altogether as well.
>
> With the fix,
> $ sudo perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
> 0 3573470770332 0x12f5f8 [0x70]: PERF_RECORD_SAMPLE(IP, 0x2):
> ... sample_read:
> .... group nr 2
> ..... id 0000000000000a28, value 00000004fd3dfd8f, lost 0
> ..... id 0000000000000a29, value 00000004fd3dfd8f, lost 0
>
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> ---
> kernel/events/core.c | 66 ++++++++++++++++++++++++++++++--------------
> 1 file changed, 46 insertions(+), 20 deletions(-)
This patch breaks perf hw events somehow.
After merging this into bpf trees we see random "watchdog: BUG: soft lockup"
with various stack traces followed up:
[ 78.620749] Sending NMI from CPU 8 to CPUs 0:
[ 76.387722] NMI backtrace for cpu 0
[ 76.387722] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O L 6.15.0-10818-ge0f0ee1c31de #1163 PREEMPT
[ 76.387722] Tainted: [O]=OOT_MODULE, [L]=SOFTLOCKUP
[ 76.387722] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[ 76.387722] RIP: 0010:_raw_spin_lock_irqsave+0xc/0x40
[ 76.387722] Call Trace:
[ 76.387722] <IRQ>
[ 76.387722] hrtimer_try_to_cancel.part.0+0x24/0xe0
[ 76.387722] hrtimer_cancel+0x21/0x40
[ 76.387722] cpu_clock_event_stop+0x64/0x70
[ 76.387722] __perf_event_account_interrupt+0xcf/0x140
[ 76.387722] __perf_event_overflow+0x36/0x340
[ 76.387722] ? hrtimer_start_range_ns+0x2c1/0x420
[ 76.387722] ? kvm_sched_clock_read+0x11/0x20
[ 76.387722] perf_swevent_hrtimer+0xaf/0x100
[ 76.387722] ? cpu_clock_event_add+0x6e/0x90
[ 76.387722] ? event_sched_in+0xc3/0x190
[ 76.387722] ? update_load_avg+0x87/0x3d0
[ 76.387722] ? _raw_spin_unlock+0xe/0x20
[ 76.387722] ? sched_balance_update_blocked_averages+0x59b/0x6a0
[ 76.387722] ? ctx_sched_in+0x184/0x210
[ 76.387722] ? kvm_sched_clock_read+0x11/0x20
[ 76.387722] ? sched_clock_cpu+0x55/0x190
[ 76.387722] ? perf_exclude_event+0x50/0x50
[ 76.387722] __hrtimer_run_queues+0x111/0x290
[ 76.387722] hrtimer_interrupt+0xff/0x240
[ 76.387722] __sysvec_apic_timer_interrupt+0x4f/0x110
[ 76.387722] sysvec_apic_timer_interrupt+0x6c/0x90
After reverting:
commit e800ac51202f ("perf: Only dump the throttle log for the leader")
commit 9734e25fbf5a ("perf: Fix the throttle logic for a group")
everything is back to normal.
There are many ways to reproduce.
Any test that sets up perf hw event followed up by tests that IPIs all cpus.
One way:
selftests/bpf/test_progs -t stacktrace_build_id_nmi
selftests/bpf/test_progs -t unpriv_bpf_disabled
Please take a look.
Hi Alexei,
On 2025-06-01 8:30 p.m., Alexei Starovoitov wrote:
> On Tue, May 20, 2025 at 11:16:29AM -0700, kan.liang@linux.intel.com wrote:
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> The current throttle logic doesn't work well with a group, e.g., the
>> following sampling-read case.
>>
>> $ perf record -e "{cycles,cycles}:S" ...
>>
>> $ perf report -D | grep THROTTLE | tail -2
>> THROTTLE events: 426 ( 9.0%)
>> UNTHROTTLE events: 425 ( 9.0%)
>>
>> $ perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
>> 0 1020120874009167 0x74970 [0x68]: PERF_RECORD_SAMPLE(IP, 0x1):
>> ... sample_read:
>> .... group nr 2
>> ..... id 0000000000000327, value 000000000cbb993a, lost 0
>> ..... id 0000000000000328, value 00000002211c26df, lost 0
>>
>> The second cycles event has a much larger value than the first cycles
>> event in the same group.
>>
>> The current throttle logic in the generic code only logs the THROTTLE
>> event. It relies on the specific driver implementation to disable
>> events. For all ARCHs, the implementation is similar. Only the event is
>> disabled, rather than the group.
>>
>> The logic to disable the group should be generic for all ARCHs. Add the
>> logic in the generic code. The following patch will remove the buggy
>> driver-specific implementation.
>>
>> The throttle only happens when an event is overflowed. Stop the entire
>> group when any event in the group triggers the throttle.
>> The MAX_INTERRUPTS is set to all throttle events.
>>
>> The unthrottled could happen in 3 places.
>> - event/group sched. All events in the group are scheduled one by one.
>> All of them will be unthrottled eventually. Nothing needs to be
>> changed.
>> - The perf_adjust_freq_unthr_events for each tick. Needs to restart the
>> group altogether.
>> - The __perf_event_period(). The whole group needs to be restarted
>> altogether as well.
>>
>> With the fix,
>> $ sudo perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
>> 0 3573470770332 0x12f5f8 [0x70]: PERF_RECORD_SAMPLE(IP, 0x2):
>> ... sample_read:
>> .... group nr 2
>> ..... id 0000000000000a28, value 00000004fd3dfd8f, lost 0
>> ..... id 0000000000000a29, value 00000004fd3dfd8f, lost 0
>>
>> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>> ---
>> kernel/events/core.c | 66 ++++++++++++++++++++++++++++++--------------
>> 1 file changed, 46 insertions(+), 20 deletions(-)
>
> This patch breaks perf hw events somehow.
>
> After merging this into bpf trees we see random "watchdog: BUG: soft lockup"
> with various stack traces followed up:
> [ 78.620749] Sending NMI from CPU 8 to CPUs 0:
> [ 76.387722] NMI backtrace for cpu 0
> [ 76.387722] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O L 6.15.0-10818-ge0f0ee1c31de #1163 PREEMPT
> [ 76.387722] Tainted: [O]=OOT_MODULE, [L]=SOFTLOCKUP
> [ 76.387722] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
> [ 76.387722] RIP: 0010:_raw_spin_lock_irqsave+0xc/0x40
> [ 76.387722] Call Trace:
> [ 76.387722] <IRQ>
> [ 76.387722] hrtimer_try_to_cancel.part.0+0x24/0xe0
> [ 76.387722] hrtimer_cancel+0x21/0x40
> [ 76.387722] cpu_clock_event_stop+0x64/0x70
The issues should be fixed by the patch.
https://lore.kernel.org/lkml/20250528175832.2999139-1-kan.liang@linux.intel.com/
Could you please give it a try?
Thanks,
Kan
> [ 76.387722] __perf_event_account_interrupt+0xcf/0x140
> [ 76.387722] __perf_event_overflow+0x36/0x340
> [ 76.387722] ? hrtimer_start_range_ns+0x2c1/0x420
> [ 76.387722] ? kvm_sched_clock_read+0x11/0x20
> [ 76.387722] perf_swevent_hrtimer+0xaf/0x100
> [ 76.387722] ? cpu_clock_event_add+0x6e/0x90
> [ 76.387722] ? event_sched_in+0xc3/0x190
> [ 76.387722] ? update_load_avg+0x87/0x3d0
> [ 76.387722] ? _raw_spin_unlock+0xe/0x20
> [ 76.387722] ? sched_balance_update_blocked_averages+0x59b/0x6a0
> [ 76.387722] ? ctx_sched_in+0x184/0x210
> [ 76.387722] ? kvm_sched_clock_read+0x11/0x20
> [ 76.387722] ? sched_clock_cpu+0x55/0x190
> [ 76.387722] ? perf_exclude_event+0x50/0x50
> [ 76.387722] __hrtimer_run_queues+0x111/0x290
> [ 76.387722] hrtimer_interrupt+0xff/0x240
> [ 76.387722] __sysvec_apic_timer_interrupt+0x4f/0x110
> [ 76.387722] sysvec_apic_timer_interrupt+0x6c/0x90
>
> After reverting:
> commit e800ac51202f ("perf: Only dump the throttle log for the leader")
> commit 9734e25fbf5a ("perf: Fix the throttle logic for a group")
> everything is back to normal.
>
> There are many ways to reproduce.
> Any test that sets up perf hw event followed up by tests that IPIs all cpus.
> One way:
> selftests/bpf/test_progs -t stacktrace_build_id_nmi
> selftests/bpf/test_progs -t unpriv_bpf_disabled
>
> Please take a look.
>
On Mon, Jun 2, 2025 at 5:55 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
> Hi Alexei,
>
> On 2025-06-01 8:30 p.m., Alexei Starovoitov wrote:
> > On Tue, May 20, 2025 at 11:16:29AM -0700, kan.liang@linux.intel.com wrote:
> >> From: Kan Liang <kan.liang@linux.intel.com>
> >>
> >> The current throttle logic doesn't work well with a group, e.g., the
> >> following sampling-read case.
> >>
> >> $ perf record -e "{cycles,cycles}:S" ...
> >>
> >> $ perf report -D | grep THROTTLE | tail -2
> >> THROTTLE events: 426 ( 9.0%)
> >> UNTHROTTLE events: 425 ( 9.0%)
> >>
> >> $ perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
> >> 0 1020120874009167 0x74970 [0x68]: PERF_RECORD_SAMPLE(IP, 0x1):
> >> ... sample_read:
> >> .... group nr 2
> >> ..... id 0000000000000327, value 000000000cbb993a, lost 0
> >> ..... id 0000000000000328, value 00000002211c26df, lost 0
> >>
> >> The second cycles event has a much larger value than the first cycles
> >> event in the same group.
> >>
> >> The current throttle logic in the generic code only logs the THROTTLE
> >> event. It relies on the specific driver implementation to disable
> >> events. For all ARCHs, the implementation is similar. Only the event is
> >> disabled, rather than the group.
> >>
> >> The logic to disable the group should be generic for all ARCHs. Add the
> >> logic in the generic code. The following patch will remove the buggy
> >> driver-specific implementation.
> >>
> >> The throttle only happens when an event is overflowed. Stop the entire
> >> group when any event in the group triggers the throttle.
> >> The MAX_INTERRUPTS is set to all throttle events.
> >>
> >> The unthrottled could happen in 3 places.
> >> - event/group sched. All events in the group are scheduled one by one.
> >> All of them will be unthrottled eventually. Nothing needs to be
> >> changed.
> >> - The perf_adjust_freq_unthr_events for each tick. Needs to restart the
> >> group altogether.
> >> - The __perf_event_period(). The whole group needs to be restarted
> >> altogether as well.
> >>
> >> With the fix,
> >> $ sudo perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
> >> 0 3573470770332 0x12f5f8 [0x70]: PERF_RECORD_SAMPLE(IP, 0x2):
> >> ... sample_read:
> >> .... group nr 2
> >> ..... id 0000000000000a28, value 00000004fd3dfd8f, lost 0
> >> ..... id 0000000000000a29, value 00000004fd3dfd8f, lost 0
> >>
> >> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> >> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> >> ---
> >> kernel/events/core.c | 66 ++++++++++++++++++++++++++++++--------------
> >> 1 file changed, 46 insertions(+), 20 deletions(-)
> >
> > This patch breaks perf hw events somehow.
> >
> > After merging this into bpf trees we see random "watchdog: BUG: soft lockup"
> > with various stack traces followed up:
> > [ 78.620749] Sending NMI from CPU 8 to CPUs 0:
> > [ 76.387722] NMI backtrace for cpu 0
> > [ 76.387722] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O L 6.15.0-10818-ge0f0ee1c31de #1163 PREEMPT
> > [ 76.387722] Tainted: [O]=OOT_MODULE, [L]=SOFTLOCKUP
> > [ 76.387722] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
> > [ 76.387722] RIP: 0010:_raw_spin_lock_irqsave+0xc/0x40
> > [ 76.387722] Call Trace:
> > [ 76.387722] <IRQ>
> > [ 76.387722] hrtimer_try_to_cancel.part.0+0x24/0xe0
> > [ 76.387722] hrtimer_cancel+0x21/0x40
> > [ 76.387722] cpu_clock_event_stop+0x64/0x70
>
>
> The issues should be fixed by the patch.
> https://lore.kernel.org/lkml/20250528175832.2999139-1-kan.liang@linux.intel.com/
>
> Could you please give it a try?
Thanks. It fixes it, but the commit log says that
only cpu-clock and task_clock are affected,
which are SW events.
While our tests are locking up while setting up:
struct perf_event_attr attr = {
.freq = 1,
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
};
Is it because we run in x86 VM and HW_CPU_CYCLES is mapped
to cpu-clock sw ?
On 2025-06-02 12:24 p.m., Alexei Starovoitov wrote:
> On Mon, Jun 2, 2025 at 5:55 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>
>> Hi Alexei,
>>
>> On 2025-06-01 8:30 p.m., Alexei Starovoitov wrote:
>>> On Tue, May 20, 2025 at 11:16:29AM -0700, kan.liang@linux.intel.com wrote:
>>>> From: Kan Liang <kan.liang@linux.intel.com>
>>>>
>>>> The current throttle logic doesn't work well with a group, e.g., the
>>>> following sampling-read case.
>>>>
>>>> $ perf record -e "{cycles,cycles}:S" ...
>>>>
>>>> $ perf report -D | grep THROTTLE | tail -2
>>>> THROTTLE events: 426 ( 9.0%)
>>>> UNTHROTTLE events: 425 ( 9.0%)
>>>>
>>>> $ perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
>>>> 0 1020120874009167 0x74970 [0x68]: PERF_RECORD_SAMPLE(IP, 0x1):
>>>> ... sample_read:
>>>> .... group nr 2
>>>> ..... id 0000000000000327, value 000000000cbb993a, lost 0
>>>> ..... id 0000000000000328, value 00000002211c26df, lost 0
>>>>
>>>> The second cycles event has a much larger value than the first cycles
>>>> event in the same group.
>>>>
>>>> The current throttle logic in the generic code only logs the THROTTLE
>>>> event. It relies on the specific driver implementation to disable
>>>> events. For all ARCHs, the implementation is similar. Only the event is
>>>> disabled, rather than the group.
>>>>
>>>> The logic to disable the group should be generic for all ARCHs. Add the
>>>> logic in the generic code. The following patch will remove the buggy
>>>> driver-specific implementation.
>>>>
>>>> The throttle only happens when an event is overflowed. Stop the entire
>>>> group when any event in the group triggers the throttle.
>>>> The MAX_INTERRUPTS is set to all throttle events.
>>>>
>>>> The unthrottled could happen in 3 places.
>>>> - event/group sched. All events in the group are scheduled one by one.
>>>> All of them will be unthrottled eventually. Nothing needs to be
>>>> changed.
>>>> - The perf_adjust_freq_unthr_events for each tick. Needs to restart the
>>>> group altogether.
>>>> - The __perf_event_period(). The whole group needs to be restarted
>>>> altogether as well.
>>>>
>>>> With the fix,
>>>> $ sudo perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
>>>> 0 3573470770332 0x12f5f8 [0x70]: PERF_RECORD_SAMPLE(IP, 0x2):
>>>> ... sample_read:
>>>> .... group nr 2
>>>> ..... id 0000000000000a28, value 00000004fd3dfd8f, lost 0
>>>> ..... id 0000000000000a29, value 00000004fd3dfd8f, lost 0
>>>>
>>>> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>>>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>>>> ---
>>>> kernel/events/core.c | 66 ++++++++++++++++++++++++++++++--------------
>>>> 1 file changed, 46 insertions(+), 20 deletions(-)
>>>
>>> This patch breaks perf hw events somehow.
>>>
>>> After merging this into bpf trees we see random "watchdog: BUG: soft lockup"
>>> with various stack traces followed up:
>>> [ 78.620749] Sending NMI from CPU 8 to CPUs 0:
>>> [ 76.387722] NMI backtrace for cpu 0
>>> [ 76.387722] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O L 6.15.0-10818-ge0f0ee1c31de #1163 PREEMPT
>>> [ 76.387722] Tainted: [O]=OOT_MODULE, [L]=SOFTLOCKUP
>>> [ 76.387722] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
>>> [ 76.387722] RIP: 0010:_raw_spin_lock_irqsave+0xc/0x40
>>> [ 76.387722] Call Trace:
>>> [ 76.387722] <IRQ>
>>> [ 76.387722] hrtimer_try_to_cancel.part.0+0x24/0xe0
>>> [ 76.387722] hrtimer_cancel+0x21/0x40
>>> [ 76.387722] cpu_clock_event_stop+0x64/0x70
>>
>>
>> The issues should be fixed by the patch.
>> https://lore.kernel.org/lkml/20250528175832.2999139-1-kan.liang@linux.intel.com/
>>
>> Could you please give it a try?
>
> Thanks. It fixes it, but the commit log says that
> only cpu-clock and task_clock are affected,
> which are SW events.
Yes, only the two SW events are affected.
>
> While our tests are locking up while setting up:
>
> struct perf_event_attr attr = {
> .freq = 1,
> .type = PERF_TYPE_HARDWARE,
> .config = PERF_COUNT_HW_CPU_CYCLES,
> };
>
> Is it because we run in x86 VM and HW_CPU_CYCLES is mapped
> to cpu-clock sw ?
No, that's from different PMU. We never map HW_CPU_CYCLES to a SW event.
It will error out if the PMU is not available.
I'm not familiar with your test case and env. At least, I saw
PERF_COUNT_SW_CPU_CLOCK is used in the case unpriv_bpf_disabled.
Thanks,
Kan
On Mon, Jun 2, 2025 at 10:51 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
>
>
> On 2025-06-02 12:24 p.m., Alexei Starovoitov wrote:
> > On Mon, Jun 2, 2025 at 5:55 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >>
> >> Hi Alexei,
> >>
> >> On 2025-06-01 8:30 p.m., Alexei Starovoitov wrote:
> >>> On Tue, May 20, 2025 at 11:16:29AM -0700, kan.liang@linux.intel.com wrote:
> >>>> From: Kan Liang <kan.liang@linux.intel.com>
> >>>>
> >>>> The current throttle logic doesn't work well with a group, e.g., the
> >>>> following sampling-read case.
> >>>>
> >>>> $ perf record -e "{cycles,cycles}:S" ...
> >>>>
> >>>> $ perf report -D | grep THROTTLE | tail -2
> >>>> THROTTLE events: 426 ( 9.0%)
> >>>> UNTHROTTLE events: 425 ( 9.0%)
> >>>>
> >>>> $ perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
> >>>> 0 1020120874009167 0x74970 [0x68]: PERF_RECORD_SAMPLE(IP, 0x1):
> >>>> ... sample_read:
> >>>> .... group nr 2
> >>>> ..... id 0000000000000327, value 000000000cbb993a, lost 0
> >>>> ..... id 0000000000000328, value 00000002211c26df, lost 0
> >>>>
> >>>> The second cycles event has a much larger value than the first cycles
> >>>> event in the same group.
> >>>>
> >>>> The current throttle logic in the generic code only logs the THROTTLE
> >>>> event. It relies on the specific driver implementation to disable
> >>>> events. For all ARCHs, the implementation is similar. Only the event is
> >>>> disabled, rather than the group.
> >>>>
> >>>> The logic to disable the group should be generic for all ARCHs. Add the
> >>>> logic in the generic code. The following patch will remove the buggy
> >>>> driver-specific implementation.
> >>>>
> >>>> The throttle only happens when an event is overflowed. Stop the entire
> >>>> group when any event in the group triggers the throttle.
> >>>> The MAX_INTERRUPTS is set to all throttle events.
> >>>>
> >>>> The unthrottled could happen in 3 places.
> >>>> - event/group sched. All events in the group are scheduled one by one.
> >>>> All of them will be unthrottled eventually. Nothing needs to be
> >>>> changed.
> >>>> - The perf_adjust_freq_unthr_events for each tick. Needs to restart the
> >>>> group altogether.
> >>>> - The __perf_event_period(). The whole group needs to be restarted
> >>>> altogether as well.
> >>>>
> >>>> With the fix,
> >>>> $ sudo perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
> >>>> 0 3573470770332 0x12f5f8 [0x70]: PERF_RECORD_SAMPLE(IP, 0x2):
> >>>> ... sample_read:
> >>>> .... group nr 2
> >>>> ..... id 0000000000000a28, value 00000004fd3dfd8f, lost 0
> >>>> ..... id 0000000000000a29, value 00000004fd3dfd8f, lost 0
> >>>>
> >>>> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> >>>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> >>>> ---
> >>>> kernel/events/core.c | 66 ++++++++++++++++++++++++++++++--------------
> >>>> 1 file changed, 46 insertions(+), 20 deletions(-)
> >>>
> >>> This patch breaks perf hw events somehow.
> >>>
> >>> After merging this into bpf trees we see random "watchdog: BUG: soft lockup"
> >>> with various stack traces followed up:
> >>> [ 78.620749] Sending NMI from CPU 8 to CPUs 0:
> >>> [ 76.387722] NMI backtrace for cpu 0
> >>> [ 76.387722] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O L 6.15.0-10818-ge0f0ee1c31de #1163 PREEMPT
> >>> [ 76.387722] Tainted: [O]=OOT_MODULE, [L]=SOFTLOCKUP
> >>> [ 76.387722] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
> >>> [ 76.387722] RIP: 0010:_raw_spin_lock_irqsave+0xc/0x40
> >>> [ 76.387722] Call Trace:
> >>> [ 76.387722] <IRQ>
> >>> [ 76.387722] hrtimer_try_to_cancel.part.0+0x24/0xe0
> >>> [ 76.387722] hrtimer_cancel+0x21/0x40
> >>> [ 76.387722] cpu_clock_event_stop+0x64/0x70
> >>
> >>
> >> The issues should be fixed by the patch.
> >> https://lore.kernel.org/lkml/20250528175832.2999139-1-kan.liang@linux.intel.com/
> >>
> >> Could you please give it a try?
> >
> > Thanks. It fixes it, but the commit log says that
> > only cpu-clock and task_clock are affected,
> > which are SW events.
>
> Yes, only the two SW events are affected.
>
> >
> > While our tests are locking up while setting up:
> >
> > struct perf_event_attr attr = {
> > .freq = 1,
> > .type = PERF_TYPE_HARDWARE,
> > .config = PERF_COUNT_HW_CPU_CYCLES,
> > };
> >
> > Is it because we run in x86 VM and HW_CPU_CYCLES is mapped
> > to cpu-clock sw ?
>
> No, that's from different PMU. We never map HW_CPU_CYCLES to a SW event.
> It will error out if the PMU is not available.
>
> I'm not familiar with your test case and env. At least, I saw
> PERF_COUNT_SW_CPU_CLOCK is used in the case unpriv_bpf_disabled.
I see. The first test was necessary to create throttle conditions
for the 2nd test that actually used cpu-clock.
Feel free to add
Tested-by: Alexei Starovoitov <ast@kernel.org>
I've applied your patch to bpf tree for now to stop the bleeding.
Will drop it when the fix gets to Linus through perf trees.
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 9734e25fbf5ae68eb04234b2cd14a4b36ab89141
Gitweb: https://git.kernel.org/tip/9734e25fbf5ae68eb04234b2cd14a4b36ab89141
Author: Kan Liang <kan.liang@linux.intel.com>
AuthorDate: Tue, 20 May 2025 11:16:29 -07:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 21 May 2025 13:57:42 +02:00
perf: Fix the throttle logic for a group
The current throttle logic doesn't work well with a group, e.g., the
following sampling-read case.
$ perf record -e "{cycles,cycles}:S" ...
$ perf report -D | grep THROTTLE | tail -2
THROTTLE events: 426 ( 9.0%)
UNTHROTTLE events: 425 ( 9.0%)
$ perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
0 1020120874009167 0x74970 [0x68]: PERF_RECORD_SAMPLE(IP, 0x1):
... sample_read:
.... group nr 2
..... id 0000000000000327, value 000000000cbb993a, lost 0
..... id 0000000000000328, value 00000002211c26df, lost 0
The second cycles event has a much larger value than the first cycles
event in the same group.
The current throttle logic in the generic code only logs the THROTTLE
event. It relies on the specific driver implementation to disable
events. For all ARCHs, the implementation is similar. Only the event is
disabled, rather than the group.
The logic to disable the group should be generic for all ARCHs. Add the
logic in the generic code. The following patch will remove the buggy
driver-specific implementation.
The throttle only happens when an event is overflowed. Stop the entire
group when any event in the group triggers the throttle.
The MAX_INTERRUPTS is set to all throttle events.
The unthrottling could happen in 3 places.
- event/group sched. All events in the group are scheduled one by one.
All of them will be unthrottled eventually. Nothing needs to be
changed.
- The perf_adjust_freq_unthr_events for each tick. Needs to restart the
group altogether.
- The __perf_event_period(). The whole group needs to be restarted
altogether as well.
With the fix,
$ sudo perf report -D | grep PERF_RECORD_SAMPLE -a4 | tail -n 5
0 3573470770332 0x12f5f8 [0x70]: PERF_RECORD_SAMPLE(IP, 0x2):
... sample_read:
.... group nr 2
..... id 0000000000000a28, value 00000004fd3dfd8f, lost 0
..... id 0000000000000a29, value 00000004fd3dfd8f, lost 0
Suggested-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20250520181644.2673067-2-kan.liang@linux.intel.com
---
kernel/events/core.c | 66 +++++++++++++++++++++++++++++--------------
1 file changed, 46 insertions(+), 20 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 952340f..8327ab0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2645,6 +2645,39 @@ void perf_event_disable_inatomic(struct perf_event *event)
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
+static void perf_event_unthrottle(struct perf_event *event, bool start)
+{
+ event->hw.interrupts = 0;
+ if (start)
+ event->pmu->start(event, 0);
+ perf_log_throttle(event, 1);
+}
+
+static void perf_event_throttle(struct perf_event *event)
+{
+ event->pmu->stop(event, 0);
+ event->hw.interrupts = MAX_INTERRUPTS;
+ perf_log_throttle(event, 0);
+}
+
+static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_unthrottle(leader, skip_start_event ? leader != event : true);
+ for_each_sibling_event(sibling, leader)
+ perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true);
+}
+
+static void perf_event_throttle_group(struct perf_event *event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_throttle(leader);
+ for_each_sibling_event(sibling, leader)
+ perf_event_throttle(sibling);
+}
+
static int
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -2673,10 +2706,8 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
* ticks already, also for a heavily scheduling task there is little
* guarantee it'll get a tick in a timely manner.
*/
- if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
- perf_log_throttle(event, 1);
- event->hw.interrupts = 0;
- }
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS))
+ perf_event_unthrottle(event, false);
perf_pmu_disable(event->pmu);
@@ -4254,12 +4285,8 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
hwc = &event->hw;
- if (hwc->interrupts == MAX_INTERRUPTS) {
- hwc->interrupts = 0;
- perf_log_throttle(event, 1);
- if (!is_event_in_freq_mode(event))
- event->pmu->start(event, 0);
- }
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, is_event_in_freq_mode(event));
if (!is_event_in_freq_mode(event))
continue;
@@ -6181,14 +6208,6 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
perf_pmu_disable(event->pmu);
- /*
- * We could be throttled; unthrottle now to avoid the tick
- * trying to unthrottle while we already re-started the event.
- */
- if (event->hw.interrupts == MAX_INTERRUPTS) {
- event->hw.interrupts = 0;
- perf_log_throttle(event, 1);
- }
event->pmu->stop(event, PERF_EF_UPDATE);
}
@@ -6196,6 +6215,14 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
+ /*
+ * Once the period is force-reset, the event starts immediately.
+ * But the event/group could be throttled. Unthrottle the
+ * event/group now to avoid the next tick trying to unthrottle
+ * while we already re-started the event/group.
+ */
+ if (event->hw.interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, true);
perf_pmu_enable(event->pmu);
}
}
@@ -10084,8 +10111,7 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- hwc->interrupts = MAX_INTERRUPTS;
- perf_log_throttle(event, 0);
+ perf_event_throttle_group(event);
ret = 1;
}
© 2016 - 2025 Red Hat, Inc.