Currently ctx_resched() reschedules all events in every PMU in the
context even if it only needs to do so for a single event. This is the
case when it opens a new event or enables an existing one. What we
want is to reschedule events in the affected PMU only. Also,
perf_pmu_resched() currently calls ctx_resched() without PMU information.
Let's add a pmu argument to ctx_resched() so it does the work for the
given PMU only. Also change __pmu_ctx_sched_in() to take the same
arguments as its _sched_out() counterpart so that it can easily be
called from __perf_pmu_resched().
Note that __perf_install_in_context() should call ctx_resched() for the
very first event in the context in order to set ctx->is_active. Later
events can be handled by __perf_pmu_resched().
Care should be taken when installing a task event for a PMU while
there is no CPU event for that PMU. __perf_pmu_resched() will ask the
CPU PMU context to schedule any events in it according to the group
info. But as the PMU context was never activated, its event context
pointer was not set. So I added new NULL checks in
__pmu_ctx_sched_{in,out}.
With this change I get a 4x speedup (it's actually proportional to the
number of uncore PMU events) on a 2-socket Intel EMR machine when
opening and closing a perf event for the core PMU in a loop while a
bunch of uncore PMU events are active on the CPU. The test code
(stress-pmu) follows below.
Before)
# ./stress-pmu
delta: 0.087068 sec (870 usec/op)
After)
# ./stress-pmu
delta: 0.021440 sec (214 usec/op)
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
$ cat stress-pmu.c
#include <stdio.h>
#include <unistd.h>
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/time.h>
/* from uncore cpumask on EMR */
#define TARGET_CPU 60
#define LOOP 100
#define US2S 1000000
int open_perf_event(int type, int config)
{
	struct perf_event_attr attr = {
		.type = type,
		.config = config,
	};
	int fd;

	fd = syscall(SYS_perf_event_open, &attr, /*pid=*/-1, TARGET_CPU,
		     /*group_fd=*/-1, /*flags=*/0);
	if (fd < 0)
		printf("perf_event_open failed (type=%d, config=%d): %m\n", type, config);
	return fd;
}

int main(int argc, char *argv[])
{
	struct timeval ts1, ts2;
	unsigned long long delta;
	int target_cpu = TARGET_CPU;

	/* open random uncore PMU events */
	for (int i = 0; i < 100; i++)
		open_perf_event(/*type=*/i + 20, /*config=*/0);

	gettimeofday(&ts1, NULL);
	for (int i = 0; i < LOOP; i++)
		close(open_perf_event(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES));
	gettimeofday(&ts2, NULL);

	delta = ts2.tv_sec * US2S + ts2.tv_usec - (ts1.tv_sec * US2S + ts1.tv_usec);
	printf("delta: %llu.%06llu sec (%llu usec/op)\n",
	       delta / US2S, delta % US2S, delta / LOOP);
	return 0;
}
---
v2) add 'pmu' argument to ctx_resched() to reduce duplication
kernel/events/core.c | 118 ++++++++++++++++++++++++++++++++++---------
1 file changed, 93 insertions(+), 25 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f64c30e7d5da..41e2533859a4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -709,6 +709,10 @@ static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type);
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type);
#ifdef CONFIG_CGROUP_PERF
@@ -2668,6 +2672,17 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
ctx_sched_in(ctx, EVENT_FLEXIBLE);
}
+static void perf_pmu_sched_in(struct perf_cpu_pmu_context *cpc,
+ struct perf_event_pmu_context *task_epc)
+{
+ __pmu_ctx_sched_in(&cpc->epc, EVENT_PINNED);
+ if (task_epc)
+ __pmu_ctx_sched_in(task_epc, EVENT_PINNED);
+ __pmu_ctx_sched_in(&cpc->epc, EVENT_FLEXIBLE);
+ if (task_epc)
+ __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
+}
+
/*
* We want to maintain the following priority of scheduling:
* - CPU pinned (EVENT_CPU | EVENT_PINNED)
@@ -2683,16 +2698,13 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
- enum event_type_t event_type)
+ struct pmu *pmu, enum event_type_t event_type)
{
bool cpu_event = !!(event_type & EVENT_CPU);
+ struct perf_cpu_pmu_context *cpc = NULL;
+ struct perf_event_pmu_context *epc = NULL;
/*
* If pinned groups are involved, flexible groups also need to be
@@ -2703,10 +2715,24 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx, false);
+ if (pmu) {
+ cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ perf_pmu_disable(pmu);
+ } else {
+ perf_ctx_disable(&cpuctx->ctx, false);
+ }
+
if (task_ctx) {
- perf_ctx_disable(task_ctx, false);
- task_ctx_sched_out(task_ctx, event_type);
+ if (pmu) {
+ if (WARN_ON_ONCE(!cpc->task_epc || cpc->task_epc->ctx != task_ctx))
+ goto out;
+
+ epc = cpc->task_epc;
+ __pmu_ctx_sched_out(epc, event_type);
+ } else {
+ perf_ctx_disable(task_ctx, false);
+ task_ctx_sched_out(task_ctx, event_type);
+ }
}
/*
@@ -2716,15 +2742,30 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
* - otherwise, do nothing more.
*/
- if (cpu_event)
- ctx_sched_out(&cpuctx->ctx, event_type);
- else if (event_type & EVENT_PINNED)
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ if (cpu_event) {
+ if (pmu)
+ __pmu_ctx_sched_out(&cpc->epc, event_type);
+ else
+ ctx_sched_out(&cpuctx->ctx, event_type);
+ } else if (event_type & EVENT_PINNED) {
+ if (pmu)
+ __pmu_ctx_sched_out(&cpc->epc, EVENT_FLEXIBLE);
+ else
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ }
+
+ if (pmu)
+ perf_pmu_sched_in(cpc, epc);
+ else
+ perf_event_sched_in(cpuctx, task_ctx);
- perf_event_sched_in(cpuctx, task_ctx);
+out:
+ if (pmu)
+ perf_pmu_enable(pmu);
+ else
+ perf_ctx_enable(&cpuctx->ctx, false);
- perf_ctx_enable(&cpuctx->ctx, false);
- if (task_ctx)
+ if (task_ctx && !pmu)
perf_ctx_enable(task_ctx, false);
}
@@ -2734,7 +2775,7 @@ void perf_pmu_resched(struct pmu *pmu)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
- ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
perf_ctx_unlock(cpuctx, task_ctx);
}
@@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
if (reprogram) {
ctx_sched_out(ctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ if (ctx->nr_events == 1) {
+ /* The first event needs to set ctx->is_active. */
+ ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
+ } else {
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+ get_event_type(event));
+ ctx_sched_in(ctx, EVENT_TIME);
+ }
} else {
add_event_to_ctx(event, ctx);
}
@@ -2962,7 +3010,8 @@ static void __perf_event_enable(struct perf_event *event,
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
+ ctx_sched_in(ctx, EVENT_TIME);
}
/*
@@ -3230,6 +3279,13 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
struct perf_event *event, *tmp;
struct pmu *pmu = pmu_ctx->pmu;
+ /*
+ * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
+ * for task events and there's no cpu events.
+ */
+ if (ctx == NULL)
+ return;
+
if (ctx->task && !ctx->is_active) {
struct perf_cpu_pmu_context *cpc;
@@ -3872,10 +3928,22 @@ static void ctx_groups_sched_in(struct perf_event_context *ctx,
}
}
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
- struct pmu *pmu)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
- pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+ struct perf_event_context *ctx = pmu_ctx->ctx;
+
+ /*
+ * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
+ * for task events and there's no cpu events.
+ */
+ if (ctx == NULL)
+ return;
+
+ if (event_type & EVENT_PINNED)
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ if (event_type & EVENT_FLEXIBLE)
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
@@ -4309,14 +4377,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
update_context_time(&cpuctx->ctx);
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
- __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
}
if (task_event)
rotate_ctx(task_epc->ctx, task_event);
if (task_event || (task_epc && cpu_event))
- __pmu_ctx_sched_in(task_epc->ctx, pmu);
+ __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4394,7 +4462,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx, event_type);
+ ctx_resched(cpuctx, ctx, NULL, event_type);
} else {
ctx_sched_in(ctx, EVENT_TIME);
}
--
2.46.0.rc1.232.g9752f9e123-goog
On 2024-07-30 8:06 p.m., Namhyung Kim wrote:
> Current ctx_resched() reschedules every events in all PMUs in the
> context even if it only needs to do it for a single event. This is the
> case when it opens a new event or enables an existing one. What we
> want is to reschedule events in the PMU only. Also perf_pmu_resched()
> currently calls ctx_resched() without PMU information.
>
> Let's add pmu argument to ctx_resched() to do the work for the given
> PMU only. And change the __pmu_ctx_sched_in() to be symmetrical to the
> _sched_out() counterpart for its arguments so that it can be called
> easily in the __perf_pmu_resched().
>
> Note that __perf_install_in_context() should call ctx_resched() for the
> very first event in the context in order to set ctx->is_active. Later
> events can be handled by __perf_pmu_resched().
>
> Care should be taken when it installs a task event for a PMU and
> there's no CPU event for the PMU. __perf_pmu_resched() will ask the
> CPU PMU context to schedule any events in it according to the group
> info. But as the PMU context was not activated, it didn't set the
> event context pointer. So I added new NULL checks in the
> __pmu_ctx_sched_{in,out}.
>
> With this change I can get 4x speed up (but actually it's proportional
> to the number of uncore PMU events) on a 2-socket Intel EMR machine in
> opening and closing a perf event for the core PMU in a loop while there
> are a bunch of uncore PMU events active on the CPU. The test code
> (stress-pmu) follows below.
>
> Before)
> # ./stress-pmu
> delta: 0.087068 sec (870 usec/op)
>
> After)
> # ./stress-pmu
> delta: 0.021440 sec (214 usec/op)
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> $ cat stress-pmu.c
> #include <stdio.h>
> #include <unistd.h>
> #include <linux/perf_event.h>
> #include <sys/syscall.h>
> #include <sys/time.h>
>
> /* from uncore cpumask on EMR */
> #define TARGET_CPU 60
>
> #define LOOP 100
> #define US2S 1000000
>
> int open_perf_event(int type, int config)
> {
> struct perf_event_attr attr = {
> .type = type,
> .config = config,
> };
> int fd;
>
> fd = syscall(SYS_perf_event_open, &attr, /*pid=*/-1, TARGET_CPU,
> /*group_fd=*/-1, /*flags=*/0);
> if (fd < 0)
> printf("perf_event_open failed (type=%d, config=%d): %m\n", type, config);
> return fd;
> }
>
> int main(int argc, char *argv[])
> {
> struct timeval ts1, ts2;
> unsigned long long delta;
> int target_cpu = TARGET_CPU;
>
> /* open random uncore PMU events */
> for (int i = 0; i < 100; i++)
> open_perf_event(/*type=*/i + 20, /*config=*/0);
>
> gettimeofday(&ts1, NULL);
> for (int i = 0; i < LOOP; i++)
> close(open_perf_event(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES));
> gettimeofday(&ts2, NULL);
>
> delta = ts2.tv_sec * US2S + ts2.tv_usec - (ts1.tv_sec * US2S + ts1.tv_usec);
> printf("delta: %llu.%06llu sec (%llu usec/op)\n",
> delta / US2S, delta % US2S, delta / LOOP);
> return 0;
> }
> ---
> v2) add 'pmu' argument to ctx_resched() to reduce duplication
>
> kernel/events/core.c | 118 ++++++++++++++++++++++++++++++++++---------
> 1 file changed, 93 insertions(+), 25 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index f64c30e7d5da..41e2533859a4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -709,6 +709,10 @@ static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
>
> static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
> static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
> +static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
> + enum event_type_t event_type);
> +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> + enum event_type_t event_type);
>
> #ifdef CONFIG_CGROUP_PERF
>
> @@ -2668,6 +2672,17 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
> ctx_sched_in(ctx, EVENT_FLEXIBLE);
> }
>
> +static void perf_pmu_sched_in(struct perf_cpu_pmu_context *cpc,
> + struct perf_event_pmu_context *task_epc)
> +{
> + __pmu_ctx_sched_in(&cpc->epc, EVENT_PINNED);
> + if (task_epc)
> + __pmu_ctx_sched_in(task_epc, EVENT_PINNED);
> + __pmu_ctx_sched_in(&cpc->epc, EVENT_FLEXIBLE);
> + if (task_epc)
> + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
> +}
> +
> /*
> * We want to maintain the following priority of scheduling:
> * - CPU pinned (EVENT_CPU | EVENT_PINNED)
> @@ -2683,16 +2698,13 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
> * event_type is a bit mask of the types of events involved. For CPU events,
> * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
> */
> -/*
> - * XXX: ctx_resched() reschedule entire perf_event_context while adding new
> - * event to the context or enabling existing event in the context. We can
> - * probably optimize it by rescheduling only affected pmu_ctx.
> - */
> static void ctx_resched(struct perf_cpu_context *cpuctx,
> struct perf_event_context *task_ctx,
> - enum event_type_t event_type)
> + struct pmu *pmu, enum event_type_t event_type)
> {
> bool cpu_event = !!(event_type & EVENT_CPU);
> + struct perf_cpu_pmu_context *cpc = NULL;
> + struct perf_event_pmu_context *epc = NULL;
>
> /*
> * If pinned groups are involved, flexible groups also need to be
> @@ -2703,10 +2715,24 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>
> event_type &= EVENT_ALL;
>
> - perf_ctx_disable(&cpuctx->ctx, false);
> + if (pmu) {
> + cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> + perf_pmu_disable(pmu);
> + } else {
> + perf_ctx_disable(&cpuctx->ctx, false);
> + }
> +
> if (task_ctx) {
> - perf_ctx_disable(task_ctx, false);
> - task_ctx_sched_out(task_ctx, event_type);
> + if (pmu) {
> + if (WARN_ON_ONCE(!cpc->task_epc || cpc->task_epc->ctx != task_ctx))
> + goto out;
> +
> + epc = cpc->task_epc;
> + __pmu_ctx_sched_out(epc, event_type);
> + } else {
> + perf_ctx_disable(task_ctx, false);
> + task_ctx_sched_out(task_ctx, event_type);
> + }
> }
>
> /*
> @@ -2716,15 +2742,30 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
> * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
> * - otherwise, do nothing more.
> */
> - if (cpu_event)
> - ctx_sched_out(&cpuctx->ctx, event_type);
> - else if (event_type & EVENT_PINNED)
> - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> + if (cpu_event) {
> + if (pmu)
> + __pmu_ctx_sched_out(&cpc->epc, event_type);
> + else
> + ctx_sched_out(&cpuctx->ctx, event_type);
> + } else if (event_type & EVENT_PINNED) {
> + if (pmu)
> + __pmu_ctx_sched_out(&cpc->epc, EVENT_FLEXIBLE);
> + else
> + ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> + }
> +
> + if (pmu)
> + perf_pmu_sched_in(cpc, epc);
> + else
> + perf_event_sched_in(cpuctx, task_ctx);
>
> - perf_event_sched_in(cpuctx, task_ctx);
> +out:
> + if (pmu)
> + perf_pmu_enable(pmu);
> + else
> + perf_ctx_enable(&cpuctx->ctx, false);
>
> - perf_ctx_enable(&cpuctx->ctx, false);
> - if (task_ctx)
> + if (task_ctx && !pmu)
> perf_ctx_enable(task_ctx, false);
> }
>
> @@ -2734,7 +2775,7 @@ void perf_pmu_resched(struct pmu *pmu)
> struct perf_event_context *task_ctx = cpuctx->task_ctx;
>
> perf_ctx_lock(cpuctx, task_ctx);
> - ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
> + ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
> perf_ctx_unlock(cpuctx, task_ctx);
> }
>
> @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> if (reprogram) {
> ctx_sched_out(ctx, EVENT_TIME);
> add_event_to_ctx(event, ctx);
> - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> + if (ctx->nr_events == 1) {
> + /* The first event needs to set ctx->is_active. */
> + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> + } else {
> + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> + get_event_type(event));
> + ctx_sched_in(ctx, EVENT_TIME);
The changelog doesn't say much about the time handling. As I
understand it, the time is shared among PMUs in the same ctx. When
perf does ctx_resched(), the time is deducted.
There is no problem with stopping and restarting the global time when
perf reschedules all PMUs.
But if only one PMU is rescheduled while the others are still running,
stopping and restarting the global time may be a problem. The other
PMUs will be impacted.
Thanks,
Kan
> + }
> } else {
> add_event_to_ctx(event, ctx);
> }
> @@ -2962,7 +3010,8 @@ static void __perf_event_enable(struct perf_event *event,
> if (ctx->task)
> WARN_ON_ONCE(task_ctx != ctx);
>
> - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
> + ctx_sched_in(ctx, EVENT_TIME);
> }
>
> /*
> @@ -3230,6 +3279,13 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
> struct perf_event *event, *tmp;
> struct pmu *pmu = pmu_ctx->pmu;
>
> + /*
> + * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
> + * for task events and there's no cpu events.
> + */
> + if (ctx == NULL)
> + return;
> +
> if (ctx->task && !ctx->is_active) {
> struct perf_cpu_pmu_context *cpc;
>
> @@ -3872,10 +3928,22 @@ static void ctx_groups_sched_in(struct perf_event_context *ctx,
> }
> }
>
> -static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
> - struct pmu *pmu)
> +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> + enum event_type_t event_type)
> {
> - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> + struct perf_event_context *ctx = pmu_ctx->ctx;
> +
> + /*
> + * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
> + * for task events and there's no cpu events.
> + */
> + if (ctx == NULL)
> + return;
> +
> + if (event_type & EVENT_PINNED)
> + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
> + if (event_type & EVENT_FLEXIBLE)
> + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
> }
>
> static void
> @@ -4309,14 +4377,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
> update_context_time(&cpuctx->ctx);
> __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
> rotate_ctx(&cpuctx->ctx, cpu_event);
> - __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
> + __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
> }
>
> if (task_event)
> rotate_ctx(task_epc->ctx, task_event);
>
> if (task_event || (task_epc && cpu_event))
> - __pmu_ctx_sched_in(task_epc->ctx, pmu);
> + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
>
> perf_pmu_enable(pmu);
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> @@ -4394,7 +4462,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
> */
> if (enabled) {
> clone_ctx = unclone_ctx(ctx);
> - ctx_resched(cpuctx, ctx, event_type);
> + ctx_resched(cpuctx, ctx, NULL, event_type);
> } else {
> ctx_sched_in(ctx, EVENT_TIME);
> }
On Fri, Aug 02, 2024 at 02:30:19PM -0400, Liang, Kan wrote:
> > @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> > if (reprogram) {
> > ctx_sched_out(ctx, EVENT_TIME);
> > add_event_to_ctx(event, ctx);
> > - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> > + if (ctx->nr_events == 1) {
> > + /* The first event needs to set ctx->is_active. */
> > + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> > + } else {
> > + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> > + get_event_type(event));
> > + ctx_sched_in(ctx, EVENT_TIME);
>
> The changelog doesn't mention the time difference much. As my
> understanding, the time is shared among PMUs in the same ctx.
> When perf does ctx_resched(), the time is deducted.
> There is no problem to stop and restart the global time when perf
> re-schedule all PMUs.
> But if only one PMU is re-scheduled while others are still running, it
> may be a problem to stop and restart the global time. Other PMUs will be
> impacted.
So afaict, since we hold ctx->lock, nobody can observe EVENT_TIME was
cleared for a little while.
So the point was to make all the various ctx_sched_out() calls have the
same timestamp. It does this by clearing EVENT_TIME first. Then the
first ctx_sched_in() will set it again, and later ctx_sched_in() won't
touch time.
That leaves a little hole, because the time between
ctx_sched_out(EVENT_TIME) and the first ctx_sched_in() gets lost.
This isn't typically a problem, but it's not very nice. Let me go find an
alternative solution for this. The simple update I did on Saturday is
broken as per the perf test.
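To make the scheme above concrete, here is a minimal user-space sketch
(toy types and an assumed flag value, not the kernel's code) of how
clearing EVENT_TIME freezes a single timestamp for every later
sched-out, while only the first sched-in re-arms it, leaving the small
window in between unaccounted:

#include <stdio.h>

#define EVENT_TIME 0x10	/* assumed bit value, for illustration only */

struct toy_ctx {
	unsigned int is_active;
	unsigned long long timestamp;
};

static unsigned long long fake_clock;	/* stand-in for the real clock */

static void toy_sched_out_time(struct toy_ctx *ctx)
{
	if (ctx->is_active & EVENT_TIME) {
		ctx->timestamp = fake_clock;	/* capture one timestamp ... */
		ctx->is_active &= ~EVENT_TIME;	/* ... and stop updating it */
	}
	/* all later sched-outs read the same ctx->timestamp */
}

static void toy_sched_in(struct toy_ctx *ctx)
{
	if (!(ctx->is_active & EVENT_TIME))
		ctx->is_active |= EVENT_TIME;	/* only the first sched-in re-arms time */
	/* later sched-ins leave the timestamp alone */
}

int main(void)
{
	struct toy_ctx ctx = { .is_active = EVENT_TIME };

	fake_clock = 100;
	toy_sched_out_time(&ctx);	/* every sched-out now sees t=100 */
	fake_clock = 105;		/* this window is the "little hole" */
	toy_sched_in(&ctx);		/* time runs again from here */
	printf("frozen at %llu, running=%d\n",
	       ctx.timestamp, !!(ctx.is_active & EVENT_TIME));
	return 0;
}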
On Mon, Aug 05, 2024 at 11:20:58AM +0200, Peter Zijlstra wrote:
> On Fri, Aug 02, 2024 at 02:30:19PM -0400, Liang, Kan wrote:
> > > @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> > > if (reprogram) {
> > > ctx_sched_out(ctx, EVENT_TIME);
> > > add_event_to_ctx(event, ctx);
> > > - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> > > + if (ctx->nr_events == 1) {
> > > + /* The first event needs to set ctx->is_active. */
> > > + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> > > + } else {
> > > + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> > > + get_event_type(event));
> > > + ctx_sched_in(ctx, EVENT_TIME);
> >
> > The changelog doesn't mention the time difference much. As my
> > understanding, the time is shared among PMUs in the same ctx.
> > When perf does ctx_resched(), the time is deducted.
> > There is no problem to stop and restart the global time when perf
> > re-schedule all PMUs.
> > But if only one PMU is re-scheduled while others are still running, it
> > may be a problem to stop and restart the global time. Other PMUs will be
> > impacted.
>
> So afaict, since we hold ctx->lock, nobody can observe EVENT_TIME was
> cleared for a little while.
>
> So the point was to make all the various ctx_sched_out() calls have the
> same timestamp. It does this by clearing EVENT_TIME first. Then the
> first ctx_sched_in() will set it again, and later ctx_sched_in() won't
> touch time.
>
> That leaves a little hole, because the time between
> ctx_sched_out(EVENT_TIME) and the first ctx_sched_in() gets lost.
>
> This isn't typically a problem, but not very nice. Let me go find an
> alternative solution for this. The simple update I did saturday is
> broken as per the perf test.
OK, took a little longer than I would have liked, nor is it entirely
pretty, but it seems to pass 'perf test'.
Please look at: queue.git perf/resched
I'll try and post it all tomorrow.
On 2024-08-05 10:58 a.m., Peter Zijlstra wrote:
> On Mon, Aug 05, 2024 at 11:20:58AM +0200, Peter Zijlstra wrote:
>> On Fri, Aug 02, 2024 at 02:30:19PM -0400, Liang, Kan wrote:
>>>> @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
>>>> if (reprogram) {
>>>> ctx_sched_out(ctx, EVENT_TIME);
>>>> add_event_to_ctx(event, ctx);
>>>> - ctx_resched(cpuctx, task_ctx, get_event_type(event));
>>>> + if (ctx->nr_events == 1) {
>>>> + /* The first event needs to set ctx->is_active. */
>>>> + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
>>>> + } else {
>>>> + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
>>>> + get_event_type(event));
>>>> + ctx_sched_in(ctx, EVENT_TIME);
>>>
>>> The changelog doesn't mention the time difference much. As my
>>> understanding, the time is shared among PMUs in the same ctx.
>>> When perf does ctx_resched(), the time is deducted.
>>> There is no problem to stop and restart the global time when perf
>>> re-schedule all PMUs.
>>> But if only one PMU is re-scheduled while others are still running, it
>>> may be a problem to stop and restart the global time. Other PMUs will be
>>> impacted.
>>
>> So afaict, since we hold ctx->lock, nobody can observe EVENT_TIME was
>> cleared for a little while.
>>
>> So the point was to make all the various ctx_sched_out() calls have the
>> same timestamp. It does this by clearing EVENT_TIME first. Then the
>> first ctx_sched_in() will set it again, and later ctx_sched_in() won't
>> touch time.
>>
>> That leaves a little hole, because the time between
>> ctx_sched_out(EVENT_TIME) and the first ctx_sched_in() gets lost.
>>
>> This isn't typically a problem, but not very nice. Let me go find an
>> alternative solution for this. The simple update I did saturday is
>> broken as per the perf test.
>
> OK, took a little longer than I would have liked, nor is it entirely
> pretty, but it seems to pass 'perf test'.
>
> Please look at: queue.git perf/resched
Thanks. If I understand correctly, the freeze doesn't mean the time is
deducted. The other PMUs can still see the time advance during the
reschedule of the specific PMU, right? If so, the general idea looks
good to me.
I'm also thinking about the vPMU time. For that case, perf has to
deduct the vPMU time. The freeze bit cannot be used in the vPMU case.
But I probably have to rebase the patch below on top of EVENT_FROZEN.
https://lore.kernel.org/kvm/20240801045907.4010984-10-mizhang@google.com/
Thanks,
Kan
On Mon, Aug 5, 2024 at 7:58 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Aug 05, 2024 at 11:20:58AM +0200, Peter Zijlstra wrote:
> > On Fri, Aug 02, 2024 at 02:30:19PM -0400, Liang, Kan wrote:
> > > > @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> > > > if (reprogram) {
> > > > ctx_sched_out(ctx, EVENT_TIME);
> > > > add_event_to_ctx(event, ctx);
> > > > - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> > > > + if (ctx->nr_events == 1) {
> > > > + /* The first event needs to set ctx->is_active. */
> > > > + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> > > > + } else {
> > > > + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> > > > + get_event_type(event));
> > > > + ctx_sched_in(ctx, EVENT_TIME);
> > >
> > > The changelog doesn't mention the time difference much. As my
> > > understanding, the time is shared among PMUs in the same ctx.
> > > When perf does ctx_resched(), the time is deducted.
> > > There is no problem to stop and restart the global time when perf
> > > re-schedule all PMUs.
> > > But if only one PMU is re-scheduled while others are still running, it
> > > may be a problem to stop and restart the global time. Other PMUs will be
> > > impacted.
> >
> > So afaict, since we hold ctx->lock, nobody can observe EVENT_TIME was
> > cleared for a little while.
> >
> > So the point was to make all the various ctx_sched_out() calls have the
> > same timestamp. It does this by clearing EVENT_TIME first. Then the
> > first ctx_sched_in() will set it again, and later ctx_sched_in() won't
> > touch time.
> >
> > That leaves a little hole, because the time between
> > ctx_sched_out(EVENT_TIME) and the first ctx_sched_in() gets lost.
> >
> > This isn't typically a problem, but not very nice. Let me go find an
> > alternative solution for this. The simple update I did saturday is
> > broken as per the perf test.
>
> OK, took a little longer than I would have liked, nor is it entirely
> pretty, but it seems to pass 'perf test'.
>
> Please look at: queue.git perf/resched
>
> I'll try and post it all tomorrow.
Thanks for doing this. But some of my tests are still failing.
I'm seeing some system-wide events are not counted.
Let me take a deeper look at it.
Thanks,
Namhyung
On Mon, Aug 05, 2024 at 11:19:48PM -0700, Namhyung Kim wrote:
> On Mon, Aug 5, 2024 at 7:58 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Aug 05, 2024 at 11:20:58AM +0200, Peter Zijlstra wrote:
> > > On Fri, Aug 02, 2024 at 02:30:19PM -0400, Liang, Kan wrote:
> > > > > @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> > > > > if (reprogram) {
> > > > > ctx_sched_out(ctx, EVENT_TIME);
> > > > > add_event_to_ctx(event, ctx);
> > > > > - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> > > > > + if (ctx->nr_events == 1) {
> > > > > + /* The first event needs to set ctx->is_active. */
> > > > > + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> > > > > + } else {
> > > > > + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> > > > > + get_event_type(event));
> > > > > + ctx_sched_in(ctx, EVENT_TIME);
> > > >
> > > > The changelog doesn't mention the time difference much. As my
> > > > understanding, the time is shared among PMUs in the same ctx.
> > > > When perf does ctx_resched(), the time is deducted.
> > > > There is no problem to stop and restart the global time when perf
> > > > re-schedule all PMUs.
> > > > But if only one PMU is re-scheduled while others are still running, it
> > > > may be a problem to stop and restart the global time. Other PMUs will be
> > > > impacted.
> > >
> > > So afaict, since we hold ctx->lock, nobody can observe EVENT_TIME was
> > > cleared for a little while.
> > >
> > > So the point was to make all the various ctx_sched_out() calls have the
> > > same timestamp. It does this by clearing EVENT_TIME first. Then the
> > > first ctx_sched_in() will set it again, and later ctx_sched_in() won't
> > > touch time.
> > >
> > > That leaves a little hole, because the time between
> > > ctx_sched_out(EVENT_TIME) and the first ctx_sched_in() gets lost.
> > >
> > > This isn't typically a problem, but not very nice. Let me go find an
> > > alternative solution for this. The simple update I did saturday is
> > > broken as per the perf test.
> >
> > OK, took a little longer than I would have liked, nor is it entirely
> > pretty, but it seems to pass 'perf test'.
> >
> > Please look at: queue.git perf/resched
> >
> > I'll try and post it all tomorrow.
>
> Thanks for doing this. But some of my tests are still failing.
> I'm seeing some system-wide events are not counted.
> Let me take a deeper look at it.
Does this help? What would be an easy reproducer?
---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c67fc43fe877..4a04611333d9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -179,23 +179,27 @@ static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
}
}
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+ /*
+ * If ctx_sched_in() didn't again set any ALL flags, clean up
+ * after ctx_sched_out() by clearing is_active.
+ */
+ if (ctx->is_active & EVENT_FROZEN) {
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+ else
+ ctx->is_active &= ~EVENT_FROZEN;
+ }
+ raw_spin_unlock(&ctx->lock);
+}
+
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- if (ctx) {
- /*
- * If ctx_sched_in() didn't again set any ALL flags, clean up
- * after ctx_sched_out() by clearing is_active.
- */
- if (ctx->is_active & EVENT_FROZEN) {
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
- else
- ctx->is_active &= ~EVENT_FROZEN;
- }
- raw_spin_unlock(&ctx->lock);
- }
- raw_spin_unlock(&cpuctx->ctx.lock);
+ if (ctx)
+ __perf_ctx_unlock(ctx);
+ __perf_ctx_unlock(&cpuctx->ctx.lock);
}
#define TASK_TOMBSTONE ((void *)-1L)
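As a minimal illustration of the EVENT_FROZEN cleanup in the hunk above
(bit values here are assumed for the sketch, not taken from the
kernel): if ctx_sched_in() set no ALL bits again, is_active is cleared
entirely; otherwise only the FROZEN marker is dropped:

#include <stdio.h>

/* assumed bit values, for illustration only */
#define EVENT_FLEXIBLE	0x01u
#define EVENT_PINNED	0x02u
#define EVENT_ALL	(EVENT_FLEXIBLE | EVENT_PINNED)
#define EVENT_FROZEN	0x80u

static unsigned int unlock_cleanup(unsigned int is_active)
{
	if (is_active & EVENT_FROZEN) {
		if (!(is_active & EVENT_ALL))
			is_active = 0;			/* nothing was scheduled back in */
		else
			is_active &= ~EVENT_FROZEN;	/* keep the re-set ALL bits */
	}
	return is_active;
}

int main(void)
{
	/* frozen and nothing rescheduled: is_active drops to 0 */
	printf("%#x -> %#x\n", EVENT_FROZEN, unlock_cleanup(EVENT_FROZEN));
	/* frozen but pinned events were scheduled back in: only FROZEN clears */
	printf("%#x -> %#x\n", EVENT_FROZEN | EVENT_PINNED,
	       unlock_cleanup(EVENT_FROZEN | EVENT_PINNED));
	return 0;
}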
On Tue, Aug 06, 2024 at 09:56:30AM +0200, Peter Zijlstra wrote:
> Does this help? What would be an easy reproducer?
>
> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index c67fc43fe877..4a04611333d9 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -179,23 +179,27 @@ static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
> }
> }
>
> +static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
> +{
> + /*
> + * If ctx_sched_in() didn't again set any ALL flags, clean up
> + * after ctx_sched_out() by clearing is_active.
> + */
> + if (ctx->is_active & EVENT_FROZEN) {
> + if (!(ctx->is_active & EVENT_ALL))
> + ctx->is_active = 0;
> + else
> + ctx->is_active &= ~EVENT_FROZEN;
> + }
> + raw_spin_unlock(&ctx->lock);
> +}
> +
> static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
> struct perf_event_context *ctx)
> {
> - if (ctx) {
> - /*
> - * If ctx_sched_in() didn't again set any ALL flags, clean up
> - * after ctx_sched_out() by clearing is_active.
> - */
> - if (ctx->is_active & EVENT_FROZEN) {
> - if (!(ctx->is_active & EVENT_ALL))
> - ctx->is_active = 0;
> - else
> - ctx->is_active &= ~EVENT_FROZEN;
> - }
> - raw_spin_unlock(&ctx->lock);
> - }
> - raw_spin_unlock(&cpuctx->ctx.lock);
> + if (ctx)
> + __perf_ctx_unlock(ctx);
> + __perf_ctx_unlock(&cpuctx->ctx.lock);
Obviously that wants to be just: &cpuctx->ctx :-)
> }
>
> #define TASK_TOMBSTONE ((void *)-1L)
On Tue, Aug 6, 2024 at 1:08 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Aug 06, 2024 at 09:56:30AM +0200, Peter Zijlstra wrote:
>
> > Does this help? What would be an easy reproducer?
Yep, it helps! I guess you can reproduce it easily with:
$ sudo perf stat -a sleep 1
> >
> > ---
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index c67fc43fe877..4a04611333d9 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -179,23 +179,27 @@ static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
> > }
> > }
> >
> > +static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
> > +{
> > + /*
> > + * If ctx_sched_in() didn't again set any ALL flags, clean up
> > + * after ctx_sched_out() by clearing is_active.
> > + */
> > + if (ctx->is_active & EVENT_FROZEN) {
> > + if (!(ctx->is_active & EVENT_ALL))
> > + ctx->is_active = 0;
> > + else
> > + ctx->is_active &= ~EVENT_FROZEN;
> > + }
> > + raw_spin_unlock(&ctx->lock);
> > +}
> > +
> > static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
> > struct perf_event_context *ctx)
> > {
> > - if (ctx) {
> > - /*
> > - * If ctx_sched_in() didn't again set any ALL flags, clean up
> > - * after ctx_sched_out() by clearing is_active.
> > - */
> > - if (ctx->is_active & EVENT_FROZEN) {
> > - if (!(ctx->is_active & EVENT_ALL))
> > - ctx->is_active = 0;
> > - else
> > - ctx->is_active &= ~EVENT_FROZEN;
> > - }
> > - raw_spin_unlock(&ctx->lock);
> > - }
> > - raw_spin_unlock(&cpuctx->ctx.lock);
> > + if (ctx)
> > + __perf_ctx_unlock(ctx);
> > + __perf_ctx_unlock(&cpuctx->ctx.lock);
>
> Obviously that wants to be just: &cpuctx->ctx :-)
Sure. And I think we need this for perf_ctx_lock() too.
Thanks,
Namhyung
>
> > }
> >
> > #define TASK_TOMBSTONE ((void *)-1L)
On Fri, Aug 02, 2024 at 02:30:19PM -0400, Liang, Kan wrote:
> > @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> > if (reprogram) {
> > ctx_sched_out(ctx, EVENT_TIME);
> > add_event_to_ctx(event, ctx);
> > - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> > + if (ctx->nr_events == 1) {
> > + /* The first event needs to set ctx->is_active. */
> > + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> > + } else {
> > + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> > + get_event_type(event));
> > + ctx_sched_in(ctx, EVENT_TIME);
>
> The changelog doesn't mention the time difference much. As my
> understanding, the time is shared among PMUs in the same ctx.
> When perf does ctx_resched(), the time is deducted.
> There is no problem to stop and restart the global time when perf
> re-schedule all PMUs.
> But if only one PMU is re-scheduled while others are still running, it
> may be a problem to stop and restart the global time. Other PMUs will be
> impacted.
ctx_sched_in(EVENT_TIME) will only start time if it wasn't already
running. Also note that none of the sched_out calls have EVENT_TIME on.
So it will not stop time, only start time if it wasn't already running.
At the same time, that's likely only the case when nr_events==1, and in
that case it already calls the full fat version.
Anyway, this exception is part of the reason I don't like any of this
much. I'm staring at it to see if there's not something saner hiding
inside all this.
On Fri, Aug 02, 2024 at 08:38:41PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 02, 2024 at 02:30:19PM -0400, Liang, Kan wrote:
> > > @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> > > if (reprogram) {
> > > ctx_sched_out(ctx, EVENT_TIME);
Clearly I should read better...
> > > add_event_to_ctx(event, ctx);
> > > - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> > > + if (ctx->nr_events == 1) {
> > > + /* The first event needs to set ctx->is_active. */
> > > + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> > > + } else {
> > > + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> > > + get_event_type(event));
> > > + ctx_sched_in(ctx, EVENT_TIME);
> >
> > The changelog doesn't mention the time difference much. As my
> > understanding, the time is shared among PMUs in the same ctx.
> > When perf does ctx_resched(), the time is deducted.
> > There is no problem to stop and restart the global time when perf
> > re-schedule all PMUs.
> > But if only one PMU is re-scheduled while others are still running, it
> > may be a problem to stop and restart the global time. Other PMUs will be
> > impacted.
So yeah, this stops ctx time but not all PMUs.
On Fri, Aug 02, 2024 at 08:43:50PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 02, 2024 at 08:38:41PM +0200, Peter Zijlstra wrote:
> > On Fri, Aug 02, 2024 at 02:30:19PM -0400, Liang, Kan wrote:
> > > > @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> > > > if (reprogram) {
> > > > ctx_sched_out(ctx, EVENT_TIME);
>
> Clearly I should read better...
>
> > > > add_event_to_ctx(event, ctx);
> > > > - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> > > > + if (ctx->nr_events == 1) {
> > > > + /* The first event needs to set ctx->is_active. */
> > > > + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> > > > + } else {
> > > > + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> > > > + get_event_type(event));
> > > > + ctx_sched_in(ctx, EVENT_TIME);
> > >
> > > The changelog doesn't mention the time difference much. As my
> > > understanding, the time is shared among PMUs in the same ctx.
> > > When perf does ctx_resched(), the time is deducted.
> > > There is no problem to stop and restart the global time when perf
> > > re-schedule all PMUs.
> > > But if only one PMU is re-scheduled while others are still running, it
> > > may be a problem to stop and restart the global time. Other PMUs will be
> > > impacted.
>
> So yeah, this stops ctx time but not all PMUs.
But isn't this already the case? We don't have perf_ctx_disable() here
currently.
Bah, this heat is melting my brain.
On Fri, Aug 02, 2024 at 08:50:23PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 02, 2024 at 08:43:50PM +0200, Peter Zijlstra wrote:
> > On Fri, Aug 02, 2024 at 08:38:41PM +0200, Peter Zijlstra wrote:
> > > On Fri, Aug 02, 2024 at 02:30:19PM -0400, Liang, Kan wrote:
> > > > > @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> > > > > if (reprogram) {
> > > > > ctx_sched_out(ctx, EVENT_TIME);
> >
> > Clearly I should read better...
> >
> > > > > add_event_to_ctx(event, ctx);
> > > > > - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> > > > > + if (ctx->nr_events == 1) {
> > > > > + /* The first event needs to set ctx->is_active. */
> > > > > + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> > > > > + } else {
> > > > > + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> > > > > + get_event_type(event));
> > > > > + ctx_sched_in(ctx, EVENT_TIME);
> > > >
> > > > The changelog doesn't mention the time difference much. As my
> > > > understanding, the time is shared among PMUs in the same ctx.
> > > > When perf does ctx_resched(), the time is deducted.
> > > > There is no problem to stop and restart the global time when perf
> > > > re-schedule all PMUs.
> > > > But if only one PMU is re-scheduled while others are still running, it
> > > > may be a problem to stop and restart the global time. Other PMUs will be
> > > > impacted.
> >
> > So yeah, this stops ctx time but not all PMUs.
>
> But isn't this already the case? We don't have perf_ctx_disable() here
> currently.
>
> Bah, this heat is melting my brain.
I think all it wants is to update time and ensure the added event and
the resched all use the same time, which could be done differently.
But I'll have to continue staring at this later.
On Fri, Aug 02, 2024 at 09:11:23PM +0200, Peter Zijlstra wrote:
> But I'll have to continue staring at this later.
OK, I have the below, which boots and seems able to do:
perf stat -ae power/energy-pkg/ -- sleep 1
and
perf top
also still works, so it must be perfect, right, right?
It should be split up in at least 3, possibly more patches, but that's
for Monday. Now I get to mow the lawn or any of the other real-life
things weekends are for :-)
It also isn't ideal in that it still has a ton of pmu_ctx_list
iteration, but at least it will skip all the expensive parts.
---
kernel/events/core.c | 210 +++++++++++++++++++++++++++------------------------
1 file changed, 110 insertions(+), 100 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c01a32687dad..2e30ac0fbaf6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -685,30 +685,32 @@ do { \
___p; \
})
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
+ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+ if (_cgroup && !_epc->nr_cgroups) \
+ continue; \
+ else if (_pmu && _epc->pmu != _pmu) \
+ continue; \
+ else
+
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_disable(pmu_ctx->pmu);
- }
}
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_enable(pmu_ctx->pmu);
- }
}
-static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
-static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
#ifdef CONFIG_CGROUP_PERF
@@ -865,7 +867,7 @@ static void perf_cgroup_switch(struct task_struct *task)
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_ctx_disable(&cpuctx->ctx, true);
- ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@@ -877,7 +879,7 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -2328,6 +2330,24 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
event_sched_out(event, ctx);
}
+static void
+ctx_update_time(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, false);
+ }
+}
+
+static void
+ctx_update_event_time(struct perf_event_context *ctx, struct perf_event *event)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+}
+
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
#define DETACH_DEAD 0x04UL
@@ -2347,10 +2367,7 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, false);
- }
+ ctx_update_time(cpuctx, ctx);
/*
* Ensure event_sched_out() switches to OFF, at the very least
@@ -2435,12 +2452,8 @@ static void __perf_event_disable(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
-
perf_pmu_disable(event->pmu_ctx->pmu);
+ ctx_update_event_time(ctx, event);
if (event == event->group_leader)
group_sched_out(event, ctx);
@@ -2656,7 +2669,8 @@ static void add_event_to_ctx(struct perf_event *event,
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
- enum event_type_t event_type)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
@@ -2666,18 +2680,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, event_type);
+ ctx_sched_out(ctx, pmu, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, EVENT_PINNED);
- ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}
/*
@@ -2695,16 +2710,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
- enum event_type_t event_type)
+ struct pmu *pmu, enum event_type_t event_type)
{
bool cpu_event = !!(event_type & EVENT_CPU);
+ struct perf_event_pmu_context *epc;
/*
* If pinned groups are involved, flexible groups also need to be
@@ -2715,10 +2726,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx, false);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
if (task_ctx) {
- perf_ctx_disable(task_ctx, false);
- task_ctx_sched_out(task_ctx, event_type);
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
+ task_ctx_sched_out(task_ctx, pmu, event_type);
}
/*
@@ -2729,15 +2744,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- ctx_sched_out(&cpuctx->ctx, event_type);
+ ctx_sched_out(&cpuctx->ctx, pmu, event_type);
else if (event_type & EVENT_PINNED)
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, task_ctx);
+ perf_event_sched_in(cpuctx, task_ctx, pmu);
- perf_ctx_enable(&cpuctx->ctx, false);
- if (task_ctx)
- perf_ctx_enable(task_ctx, false);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+
+ if (task_ctx) {
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+ }
}
void perf_pmu_resched(struct pmu *pmu)
@@ -2746,7 +2765,7 @@ void perf_pmu_resched(struct pmu *pmu)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
- ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
perf_ctx_unlock(cpuctx, task_ctx);
}
@@ -2802,9 +2821,10 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_update_time(cpuctx, ctx);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+ get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
@@ -2947,8 +2967,7 @@ static void __perf_event_enable(struct perf_event *event,
event->state <= PERF_EVENT_STATE_ERROR)
return;
- if (ctx->is_active)
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_update_time(cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2956,25 +2975,21 @@ static void __perf_event_enable(struct perf_event *event,
if (!ctx->is_active)
return;
- if (!event_filter_match(event)) {
- ctx_sched_in(ctx, EVENT_TIME);
+ if (!event_filter_match(event))
return;
- }
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, EVENT_TIME);
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
return;
- }
task_ctx = cpuctx->task_ctx;
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
}
/*
@@ -3250,7 +3265,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
cpc->task_epc = NULL;
}
- if (!event_type)
+ if (!(event_type & EVENT_ALL))
return;
perf_pmu_disable(pmu);
@@ -3276,8 +3291,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
perf_pmu_enable(pmu);
}
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
@@ -3331,11 +3355,8 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
is_active ^= ctx->is_active; /* changed bits */
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_out(pmu_ctx, is_active);
- }
}
/*
@@ -3579,7 +3600,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
- task_ctx_sched_out(ctx, EVENT_ALL);
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
@@ -3877,29 +3898,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx,
merge_sched_in, &can_add_hw);
}
-static void ctx_groups_sched_in(struct perf_event_context *ctx,
- struct perf_event_groups *groups,
- bool cgroup)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
- struct perf_event_pmu_context *pmu_ctx;
-
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
- pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
- }
-}
+ struct perf_event_context *ctx = pmu_ctx->ctx;
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
- struct pmu *pmu)
-{
- pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+ if (event_type & EVENT_PINNED)
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ if (event_type & EVENT_FLEXIBLE)
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
bool cgroup = event_type & EVENT_CGROUP;
@@ -3935,12 +3949,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (is_active & EVENT_PINNED)
- ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+ if (is_active & EVENT_PINNED) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ }
/* Then walk through the lower prio flexible groups */
- if (is_active & EVENT_FLEXIBLE)
- ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+ if (is_active & EVENT_FLEXIBLE) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ }
}
static void perf_event_context_sched_in(struct task_struct *task)
@@ -3983,10 +4001,10 @@ static void perf_event_context_sched_in(struct task_struct *task)
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
perf_ctx_disable(&cpuctx->ctx, false);
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
- perf_event_sched_in(cpuctx, ctx);
+ perf_event_sched_in(cpuctx, ctx, NULL);
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
@@ -4327,14 +4345,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
update_context_time(&cpuctx->ctx);
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
- __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
}
if (task_event)
rotate_ctx(task_epc->ctx, task_event);
if (task_event || (task_epc && cpu_event))
- __pmu_ctx_sched_in(task_epc->ctx, pmu);
+ __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4400,7 +4418,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_update_time(cpuctx, ctx);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
@@ -4412,9 +4430,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx, event_type);
- } else {
- ctx_sched_in(ctx, EVENT_TIME);
+ ctx_resched(cpuctx, ctx, NULL, event_type);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4517,10 +4533,7 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_update_event_time(ctx, event);
perf_event_update_time(event);
if (data->group)
@@ -4720,10 +4733,7 @@ static int perf_event_read(struct perf_event *event, bool group)
* May read while context is not active (e.g., thread is
* blocked), in that case we cannot update context time
*/
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_update_event_time(ctx, event);
perf_event_update_time(event);
if (group)
@@ -13202,7 +13212,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
@@ -13751,7 +13761,7 @@ static void __perf_event_exit_context(void *__info)
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
On Sat, Aug 3, 2024 at 3:32 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Aug 02, 2024 at 09:11:23PM +0200, Peter Zijlstra wrote:
>
> > But I'll have to continue staring at this later.
>
> OK, I have the below, which boots and seems able to do:
>
> perf stat -ae power/energy-pkg/ -- sleep 1
>
> and
>
> perf top
>
> also still works, so it must be perfect, right, right?
I really hope so. :) I'll test it over the weekend.
>
> It should be split up in at least 3, possibly more patches, but that's
> for Monday. Now I get to mow the lawn or any of the other real-life
> things weekends are for :-)
Sure.
>
> It also isn't ideal in that it still has a ton of pmu_ctx_list
> iteration, but at least it will skip all the expensive parts.
Yep, I think it's good enough.
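Just to spell out the "skip all the expensive parts" bit for myself, the
filtering boils down to this pattern -- a userspace toy with invented names,
not the kernel macro itself:

#include <stdio.h>

struct toy_epc {
        int pmu;                        /* which PMU this entry belongs to */
        int nr_cgroups;                 /* number of cgroup events on it   */
        struct toy_epc *next;
};

/* visit every entry, but bail out of non-matching ones before any real work */
#define for_each_toy_epc(_epc, _head, _pmu, _cgroup)                    \
        for ((_epc) = (_head); (_epc); (_epc) = (_epc)->next)           \
                if ((_cgroup) && !(_epc)->nr_cgroups)                   \
                        continue;                                       \
                else if ((_pmu) && (_epc)->pmu != (_pmu))               \
                        continue;                                       \
                else

int main(void)
{
        struct toy_epc c = { .pmu = 1, .nr_cgroups = 0, .next = NULL };
        struct toy_epc b = { .pmu = 2, .nr_cgroups = 0, .next = &c };
        struct toy_epc a = { .pmu = 1, .nr_cgroups = 3, .next = &b };
        struct toy_epc *epc;

        /* still walks all three entries, but only acts on pmu 2 */
        for_each_toy_epc(epc, &a, 2, /*cgroup=*/0)
                printf("expensive work on epc with pmu %d\n", epc->pmu);
        return 0;
}

So every epc on the list is still visited, but the mismatching ones fall out
through the continue branches before anything expensive runs.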
>
> ---
> kernel/events/core.c | 210 +++++++++++++++++++++++++++------------------------
> 1 file changed, 110 insertions(+), 100 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index c01a32687dad..2e30ac0fbaf6 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -685,30 +685,32 @@ do { \
> ___p; \
> })
>
> +#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
> + list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
> + if (_cgroup && !_epc->nr_cgroups) \
> + continue; \
> + else if (_pmu && _epc->pmu != _pmu) \
> + continue; \
> + else
> +
> static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
> {
> struct perf_event_pmu_context *pmu_ctx;
>
> - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> - if (cgroup && !pmu_ctx->nr_cgroups)
> - continue;
> + for_each_epc(pmu_ctx, ctx, NULL, cgroup)
> perf_pmu_disable(pmu_ctx->pmu);
> - }
> }
>
> static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
> {
> struct perf_event_pmu_context *pmu_ctx;
>
> - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> - if (cgroup && !pmu_ctx->nr_cgroups)
> - continue;
> + for_each_epc(pmu_ctx, ctx, NULL, cgroup)
> perf_pmu_enable(pmu_ctx->pmu);
> - }
> }
>
> -static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
> -static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
> +static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
> +static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
>
> #ifdef CONFIG_CGROUP_PERF
>
> @@ -865,7 +867,7 @@ static void perf_cgroup_switch(struct task_struct *task)
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> perf_ctx_disable(&cpuctx->ctx, true);
>
> - ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
> + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
> /*
> * must not be done before ctxswout due
> * to update_cgrp_time_from_cpuctx() in
> @@ -877,7 +879,7 @@ static void perf_cgroup_switch(struct task_struct *task)
> * perf_cgroup_set_timestamp() in ctx_sched_in()
> * to not have to pass task around
> */
> - ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
> + ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
>
> perf_ctx_enable(&cpuctx->ctx, true);
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> @@ -2328,6 +2330,24 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
> event_sched_out(event, ctx);
> }
>
> +static void
> +ctx_update_time(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +{
> + if (ctx->is_active & EVENT_TIME) {
> + update_context_time(ctx);
> + update_cgrp_time_from_cpuctx(cpuctx, false);
> + }
> +}
> +
> +static void
> +ctx_update_event_time(struct perf_event_context *ctx, struct perf_event *event)
> +{
> + if (ctx->is_active & EVENT_TIME) {
> + update_context_time(ctx);
> + update_cgrp_time_from_event(event);
> + }
> +}
> +
> #define DETACH_GROUP 0x01UL
> #define DETACH_CHILD 0x02UL
> #define DETACH_DEAD 0x04UL
> @@ -2347,10 +2367,7 @@ __perf_remove_from_context(struct perf_event *event,
> struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
> unsigned long flags = (unsigned long)info;
>
> - if (ctx->is_active & EVENT_TIME) {
> - update_context_time(ctx);
> - update_cgrp_time_from_cpuctx(cpuctx, false);
> - }
> + ctx_update_time(cpuctx, ctx);
>
> /*
> * Ensure event_sched_out() switches to OFF, at the very least
> @@ -2435,12 +2452,8 @@ static void __perf_event_disable(struct perf_event *event,
> if (event->state < PERF_EVENT_STATE_INACTIVE)
> return;
>
> - if (ctx->is_active & EVENT_TIME) {
> - update_context_time(ctx);
> - update_cgrp_time_from_event(event);
> - }
> -
> perf_pmu_disable(event->pmu_ctx->pmu);
> + ctx_update_event_time(ctx, event);
>
> if (event == event->group_leader)
> group_sched_out(event, ctx);
> @@ -2656,7 +2669,8 @@ static void add_event_to_ctx(struct perf_event *event,
> }
>
> static void task_ctx_sched_out(struct perf_event_context *ctx,
> - enum event_type_t event_type)
> + struct pmu *pmu,
> + enum event_type_t event_type)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>
> @@ -2666,18 +2680,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
> if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
> return;
>
> - ctx_sched_out(ctx, event_type);
> + ctx_sched_out(ctx, pmu, event_type);
> }
>
> static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
> - struct perf_event_context *ctx)
> + struct perf_event_context *ctx,
> + struct pmu *pmu)
> {
> - ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
> + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
> if (ctx)
> - ctx_sched_in(ctx, EVENT_PINNED);
> - ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
> + ctx_sched_in(ctx, pmu, EVENT_PINNED);
> + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
> if (ctx)
> - ctx_sched_in(ctx, EVENT_FLEXIBLE);
> + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
> }
>
> /*
> @@ -2695,16 +2710,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
> * event_type is a bit mask of the types of events involved. For CPU events,
> * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
> */
> -/*
> - * XXX: ctx_resched() reschedule entire perf_event_context while adding new
> - * event to the context or enabling existing event in the context. We can
> - * probably optimize it by rescheduling only affected pmu_ctx.
> - */
> static void ctx_resched(struct perf_cpu_context *cpuctx,
> struct perf_event_context *task_ctx,
> - enum event_type_t event_type)
> + struct pmu *pmu, enum event_type_t event_type)
> {
> bool cpu_event = !!(event_type & EVENT_CPU);
> + struct perf_event_pmu_context *epc;
>
> /*
> * If pinned groups are involved, flexible groups also need to be
> @@ -2715,10 +2726,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>
> event_type &= EVENT_ALL;
>
> - perf_ctx_disable(&cpuctx->ctx, false);
> + for_each_epc(epc, &cpuctx->ctx, pmu, false)
> + perf_pmu_disable(epc->pmu);
> +
> if (task_ctx) {
> - perf_ctx_disable(task_ctx, false);
> - task_ctx_sched_out(task_ctx, event_type);
> + for_each_epc(epc, task_ctx, pmu, false)
> + perf_pmu_disable(epc->pmu);
> +
> + task_ctx_sched_out(task_ctx, pmu, event_type);
> }
>
> /*
> @@ -2729,15 +2744,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
> * - otherwise, do nothing more.
> */
> if (cpu_event)
> - ctx_sched_out(&cpuctx->ctx, event_type);
> + ctx_sched_out(&cpuctx->ctx, pmu, event_type);
> else if (event_type & EVENT_PINNED)
> - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> + ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
>
> - perf_event_sched_in(cpuctx, task_ctx);
> + perf_event_sched_in(cpuctx, task_ctx, pmu);
>
> - perf_ctx_enable(&cpuctx->ctx, false);
> - if (task_ctx)
> - perf_ctx_enable(task_ctx, false);
> + for_each_epc(epc, &cpuctx->ctx, pmu, false)
> + perf_pmu_enable(epc->pmu);
> +
> + if (task_ctx) {
> + for_each_epc(epc, task_ctx, pmu, false)
> + perf_pmu_enable(epc->pmu);
> + }
> }
>
> void perf_pmu_resched(struct pmu *pmu)
> @@ -2746,7 +2765,7 @@ void perf_pmu_resched(struct pmu *pmu)
> struct perf_event_context *task_ctx = cpuctx->task_ctx;
>
> perf_ctx_lock(cpuctx, task_ctx);
> - ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
> + ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
> perf_ctx_unlock(cpuctx, task_ctx);
> }
>
> @@ -2802,9 +2821,10 @@ static int __perf_install_in_context(void *info)
> #endif
>
> if (reprogram) {
> - ctx_sched_out(ctx, EVENT_TIME);
> + ctx_update_time(cpuctx, ctx);
So you don't want to stop the context time while adding or enabling a new
event, right? Then I'm not sure if it's needed to update the time here as
it'll be updated in the ctx_sched_out() again.
Also calling ctx_sched_out() will clear EVENT_TIME when is_active has
no EVENT_ALL and it'll stop the context time anyway, right?
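To spell out the difference I'm worried about, here is a toy model
(userspace, invented names -- obviously not the kernel code): the
EVENT_TIME sched-out folds the elapsed time in and stops the clock,
while ctx_update_time() only folds it in and leaves it running.

#include <stdio.h>

#define TOY_EVENT_TIME  0x1

struct toy_ctx {
        int is_active;                  /* TOY_EVENT_TIME set while the clock runs */
        unsigned long long time;        /* accumulated context time          */
        unsigned long long stamp;       /* last point the clock was advanced */
};

/* roughly the ctx_update_time() flavor: fold time in, keep the clock running */
static void toy_update_time(struct toy_ctx *ctx, unsigned long long now)
{
        if (ctx->is_active & TOY_EVENT_TIME) {
                ctx->time += now - ctx->stamp;
                ctx->stamp = now;
        }
}

/* roughly what the EVENT_TIME sched-out does here: fold time in, stop the clock */
static void toy_sched_out_time(struct toy_ctx *ctx, unsigned long long now)
{
        toy_update_time(ctx, now);
        ctx->is_active &= ~TOY_EVENT_TIME;
}

int main(void)
{
        struct toy_ctx ctx = { .is_active = TOY_EVENT_TIME };

        toy_update_time(&ctx, 100);     /* time = 100, clock still running */
        toy_sched_out_time(&ctx, 150);  /* time = 150, clock stopped       */
        toy_update_time(&ctx, 200);     /* no effect, the clock is stopped */
        printf("time=%llu active=%d\n", ctx.time, ctx.is_active);
        return 0;
}

With the toy_update_time() flavor the clock never stops across the resched,
which is what made me wonder whether the extra update buys anything here.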
Thanks,
Namhyung
> add_event_to_ctx(event, ctx);
> - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> + get_event_type(event));
> } else {
> add_event_to_ctx(event, ctx);
> }
> @@ -2947,8 +2967,7 @@ static void __perf_event_enable(struct perf_event *event,
> event->state <= PERF_EVENT_STATE_ERROR)
> return;
>
> - if (ctx->is_active)
> - ctx_sched_out(ctx, EVENT_TIME);
> + ctx_update_time(cpuctx, ctx);
>
> perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
> perf_cgroup_event_enable(event, ctx);
> @@ -2956,25 +2975,21 @@ static void __perf_event_enable(struct perf_event *event,
> if (!ctx->is_active)
> return;
>
> - if (!event_filter_match(event)) {
> - ctx_sched_in(ctx, EVENT_TIME);
> + if (!event_filter_match(event))
> return;
> - }
>
> /*
> * If the event is in a group and isn't the group leader,
> * then don't put it on unless the group is on.
> */
> - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
> - ctx_sched_in(ctx, EVENT_TIME);
> + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
> return;
> - }
>
> task_ctx = cpuctx->task_ctx;
> if (ctx->task)
> WARN_ON_ONCE(task_ctx != ctx);
>
> - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
> }
>
> /*
> @@ -3250,7 +3265,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
> cpc->task_epc = NULL;
> }
>
> - if (!event_type)
> + if (!(event_type & EVENT_ALL))
> return;
>
> perf_pmu_disable(pmu);
> @@ -3276,8 +3291,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
> perf_pmu_enable(pmu);
> }
>
> +/*
> + * Be very careful with the @pmu argument since this will change ctx state.
> + * The @pmu argument works for ctx_resched(), because that is symmetric in
> + * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
> + *
> + * However, if you were to be asymmetrical, you could end up with messed up
> + * state, eg. ctx->is_active cleared even though most EPCs would still actually
> + * be active.
> + */
> static void
> -ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
> +ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> struct perf_event_pmu_context *pmu_ctx;
> @@ -3331,11 +3355,8 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
>
> is_active ^= ctx->is_active; /* changed bits */
>
> - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> - if (cgroup && !pmu_ctx->nr_cgroups)
> - continue;
> + for_each_epc(pmu_ctx, ctx, pmu, cgroup)
> __pmu_ctx_sched_out(pmu_ctx, is_active);
> - }
> }
>
> /*
> @@ -3579,7 +3600,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
>
> inside_switch:
> perf_ctx_sched_task_cb(ctx, false);
> - task_ctx_sched_out(ctx, EVENT_ALL);
> + task_ctx_sched_out(ctx, NULL, EVENT_ALL);
>
> perf_ctx_enable(ctx, false);
> raw_spin_unlock(&ctx->lock);
> @@ -3877,29 +3898,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx,
> merge_sched_in, &can_add_hw);
> }
>
> -static void ctx_groups_sched_in(struct perf_event_context *ctx,
> - struct perf_event_groups *groups,
> - bool cgroup)
> +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> + enum event_type_t event_type)
> {
> - struct perf_event_pmu_context *pmu_ctx;
> -
> - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> - if (cgroup && !pmu_ctx->nr_cgroups)
> - continue;
> - pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
> - }
> -}
> + struct perf_event_context *ctx = pmu_ctx->ctx;
>
> -static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
> - struct pmu *pmu)
> -{
> - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> + if (event_type & EVENT_PINNED)
> + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
> + if (event_type & EVENT_FLEXIBLE)
> + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
> }
>
> static void
> -ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
> +ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> + struct perf_event_pmu_context *pmu_ctx;
> int is_active = ctx->is_active;
> bool cgroup = event_type & EVENT_CGROUP;
>
> @@ -3935,12 +3949,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
> * First go through the list and put on any pinned groups
> * in order to give them the best chance of going on.
> */
> - if (is_active & EVENT_PINNED)
> - ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
> + if (is_active & EVENT_PINNED) {
> + for_each_epc(pmu_ctx, ctx, pmu, cgroup)
> + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
> + }
>
> /* Then walk through the lower prio flexible groups */
> - if (is_active & EVENT_FLEXIBLE)
> - ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
> + if (is_active & EVENT_FLEXIBLE) {
> + for_each_epc(pmu_ctx, ctx, pmu, cgroup)
> + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
> + }
> }
>
> static void perf_event_context_sched_in(struct task_struct *task)
> @@ -3983,10 +4001,10 @@ static void perf_event_context_sched_in(struct task_struct *task)
> */
> if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
> perf_ctx_disable(&cpuctx->ctx, false);
> - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
> }
>
> - perf_event_sched_in(cpuctx, ctx);
> + perf_event_sched_in(cpuctx, ctx, NULL);
>
> perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
>
> @@ -4327,14 +4345,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
> update_context_time(&cpuctx->ctx);
> __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
> rotate_ctx(&cpuctx->ctx, cpu_event);
> - __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
> + __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
> }
>
> if (task_event)
> rotate_ctx(task_epc->ctx, task_event);
>
> if (task_event || (task_epc && cpu_event))
> - __pmu_ctx_sched_in(task_epc->ctx, pmu);
> + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
>
> perf_pmu_enable(pmu);
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> @@ -4400,7 +4418,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
>
> cpuctx = this_cpu_ptr(&perf_cpu_context);
> perf_ctx_lock(cpuctx, ctx);
> - ctx_sched_out(ctx, EVENT_TIME);
> + ctx_update_time(cpuctx, ctx);
>
> list_for_each_entry(event, &ctx->event_list, event_entry) {
> enabled |= event_enable_on_exec(event, ctx);
> @@ -4412,9 +4430,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
> */
> if (enabled) {
> clone_ctx = unclone_ctx(ctx);
> - ctx_resched(cpuctx, ctx, event_type);
> - } else {
> - ctx_sched_in(ctx, EVENT_TIME);
> + ctx_resched(cpuctx, ctx, NULL, event_type);
> }
> perf_ctx_unlock(cpuctx, ctx);
>
> @@ -4517,10 +4533,7 @@ static void __perf_event_read(void *info)
> return;
>
> raw_spin_lock(&ctx->lock);
> - if (ctx->is_active & EVENT_TIME) {
> - update_context_time(ctx);
> - update_cgrp_time_from_event(event);
> - }
> + ctx_update_event_time(ctx, event);
>
> perf_event_update_time(event);
> if (data->group)
> @@ -4720,10 +4733,7 @@ static int perf_event_read(struct perf_event *event, bool group)
> * May read while context is not active (e.g., thread is
> * blocked), in that case we cannot update context time
> */
> - if (ctx->is_active & EVENT_TIME) {
> - update_context_time(ctx);
> - update_cgrp_time_from_event(event);
> - }
> + ctx_update_event_time(ctx, event);
>
> perf_event_update_time(event);
> if (group)
> @@ -13202,7 +13212,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
> * in.
> */
> raw_spin_lock_irq(&child_ctx->lock);
> - task_ctx_sched_out(child_ctx, EVENT_ALL);
> + task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
>
> /*
> * Now that the context is inactive, destroy the task <-> ctx relation
> @@ -13751,7 +13761,7 @@ static void __perf_event_exit_context(void *__info)
> struct perf_event *event;
>
> raw_spin_lock(&ctx->lock);
> - ctx_sched_out(ctx, EVENT_TIME);
> + ctx_sched_out(ctx, NULL, EVENT_TIME);
> list_for_each_entry(event, &ctx->event_list, event_entry)
> __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
> raw_spin_unlock(&ctx->lock);
>
On Sat, Aug 03, 2024 at 10:08:32AM -0700, Namhyung Kim wrote:
> > @@ -2802,9 +2821,10 @@ static int __perf_install_in_context(void *info)
> > #endif
> >
> > if (reprogram) {
> > - ctx_sched_out(ctx, EVENT_TIME);
> > + ctx_update_time(cpuctx, ctx);
>
> So you don't want to stop the context time while adding or enabling a new
> event, right? Then I'm not sure if it's needed to update the time here as
> it'll be updated in the ctx_sched_out() again.
>
> Also calling ctx_sched_out() will clear EVENT_TIME when is_active has
> no EVENT_ALL and it'll stop the context time anyway, right?
>
Hmm, I knew I was missing something. Let me ponder that a bit more, and
maybe read back some of the history on this thing to remember what exact
issue was being fixed here.
>
>
> > add_event_to_ctx(event, ctx);
> > - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> > + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> > + get_event_type(event));
> > } else {
> > add_event_to_ctx(event, ctx);
> > }
On Sat, Aug 3, 2024 at 10:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Sat, Aug 3, 2024 at 3:32 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Fri, Aug 02, 2024 at 09:11:23PM +0200, Peter Zijlstra wrote:
> >
> > > But I'll have to continue staring at this later.
> >
> > OK, I have the below, which boots and seems able to do:
> >
> > perf stat -ae power/energy-pkg/ -- sleep 1
> >
> > and
> >
> > perf top
> >
> > also still works, so it must be perfect, right, right?
>
> I really hope so. :) I'll test it over the weekend.
I found a failing test about the context time - it complained about
difference in enabled vs running time of a software event.
Thanks,
Namhyung
$ perf test -v times
46: Event times:
--- start ---
test child forked, pid 325425
Using CPUID GenuineIntel-6-55-7
...
attaching to current thread as disabled
Opening: cpu-clock:u
------------------------------------------------------------
perf_event_attr:
  type                1 (software)
  size                136
  config              0 (PERF_COUNT_SW_CPU_CLOCK)
  read_format         TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
  disabled            1
  exclude_kernel      1
  exclude_hv          1
  exclude_guest       1
------------------------------------------------------------
sys_perf_event_open: pid 325425  cpu -1  group_fd -1  flags 0x8 = 3
FAILED: ena 1026, run 881
attaching to CPU 0 as enabled
Opening: cpu-clock:u
------------------------------------------------------------
perf_event_attr:
  type                1 (software)
  size                136
  config              0 (PERF_COUNT_SW_CPU_CLOCK)
  read_format         TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
  disabled            1
  exclude_kernel      1
  exclude_hv          1
  exclude_guest       1
------------------------------------------------------------
sys_perf_event_open: pid -1  cpu 0  group_fd -1  flags 0x8 = 3
FAILED: ena 6554, run 6079
...
On Sun, Aug 04, 2024 at 11:39:18PM -0700, Namhyung Kim wrote:
> On Sat, Aug 3, 2024 at 10:08 AM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Sat, Aug 3, 2024 at 3:32 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Fri, Aug 02, 2024 at 09:11:23PM +0200, Peter Zijlstra wrote:
> > >
> > > > But I'll have to continue staring at this later.
> > >
> > > OK, I have the below, which boots and seems able to do:
> > >
> > > perf stat -ae power/energy-pkg/ -- sleep 1
> > >
> > > and
> > >
> > > perf top
> > >
> > > also still works, so it must be perfect, right, right?
> >
> > I really hope so. :) I'll test it over the weekend.
>
> I found a failing test about the context time - it complained about
> difference in enabled vs running time of a software event.
Yeah, it's that ctx_sched_out(EVENT_TIME) thing, that's really needed.
I'll make those changes go away when I split it all up.
On Fri, Aug 2, 2024 at 12:12 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Aug 02, 2024 at 08:50:23PM +0200, Peter Zijlstra wrote:
> > On Fri, Aug 02, 2024 at 08:43:50PM +0200, Peter Zijlstra wrote:
> > > On Fri, Aug 02, 2024 at 08:38:41PM +0200, Peter Zijlstra wrote:
> > > > On Fri, Aug 02, 2024 at 02:30:19PM -0400, Liang, Kan wrote:
> > > > > > @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> > > > > > if (reprogram) {
> > > > > > ctx_sched_out(ctx, EVENT_TIME);
> > >
> > > Clearly I should read better...
> > >
> > > > > > add_event_to_ctx(event, ctx);
> > > > > > - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> > > > > > + if (ctx->nr_events == 1) {
> > > > > > + /* The first event needs to set ctx->is_active. */
> > > > > > + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> > > > > > + } else {
> > > > > > + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> > > > > > + get_event_type(event));
> > > > > > + ctx_sched_in(ctx, EVENT_TIME);
> > > > >
> > > > > The changelog doesn't mention the time difference much. As my
> > > > > understanding, the time is shared among PMUs in the same ctx.
> > > > > When perf does ctx_resched(), the time is deducted.
> > > > > There is no problem to stop and restart the global time when perf
> > > > > re-schedule all PMUs.
> > > > > But if only one PMU is re-scheduled while others are still running, it
> > > > > may be a problem to stop and restart the global time. Other PMUs will be
> > > > > impacted.
> > >
> > > So yeah, this stops ctx time but not all PMUs.
> >
> > But isn't this already the case? We don't have perf_ctx_disable() here
> > currently.
> >
> > Bah, this heat is melting my brain.
>
> I think all it wants is to update time and ensure the added event and
> the resched all use the same time, which could be done differently.
>
> But I'll have to continue staring at this later.
Right, probably we can move time tracking to pmu_ctx.
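Something along these lines, maybe -- shape only, field and helper names
invented, and the consistency with the existing context time accounting is
hand-waved away:

#include <linux/types.h>

/*
 * If each perf_event_pmu_context carried its own clock, rescheduling a
 * single PMU would not have to stop and restart the context-wide time
 * that events on the other PMUs are charged against.
 */
struct epc_time {
        u64     time;           /* accumulated enabled time for this epc */
        u64     timestamp;      /* last point this clock was advanced    */
};

static inline void epc_time_advance(struct epc_time *t, u64 now)
{
        t->time += now - t->timestamp;
        t->timestamp = now;
}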
thanks,
Namhyung
On 2024-08-02 3:11 p.m., Peter Zijlstra wrote:
> On Fri, Aug 02, 2024 at 08:50:23PM +0200, Peter Zijlstra wrote:
>> On Fri, Aug 02, 2024 at 08:43:50PM +0200, Peter Zijlstra wrote:
>>> On Fri, Aug 02, 2024 at 08:38:41PM +0200, Peter Zijlstra wrote:
>>>> On Fri, Aug 02, 2024 at 02:30:19PM -0400, Liang, Kan wrote:
>>>>>> @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
>>>>>> if (reprogram) {
>>>>>> ctx_sched_out(ctx, EVENT_TIME);
>>>
>>> Clearly I should read better...
>>>
>>>>>> add_event_to_ctx(event, ctx);
>>>>>> - ctx_resched(cpuctx, task_ctx, get_event_type(event));
>>>>>> + if (ctx->nr_events == 1) {
>>>>>> + /* The first event needs to set ctx->is_active. */
>>>>>> + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
>>>>>> + } else {
>>>>>> + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
>>>>>> + get_event_type(event));
>>>>>> + ctx_sched_in(ctx, EVENT_TIME);
>>>>>
>>>>> The changelog doesn't mention the time difference much. As my
>>>>> understanding, the time is shared among PMUs in the same ctx.
>>>>> When perf does ctx_resched(), the time is deducted.
>>>>> There is no problem to stop and restart the global time when perf
>>>>> re-schedule all PMUs.
>>>>> But if only one PMU is re-scheduled while others are still running, it
>>>>> may be a problem to stop and restart the global time. Other PMUs will be
>>>>> impacted.
>>>
>>> So yeah, this stops ctx time but not all PMUs.
>>
>> But isn't this already the case? We don't have perf_ctx_disable() here
>> currently.
>>
>> Bah, this heat is melting my brain.
>
> I think all it wants is to update time and ensure the added event and
> the resched all use the same time, which could be done differently.
>
Yes. I think that's what the current code tries to do.
But it seems the current code doesn't do it clearly either.
ctx_sched_out(ctx, EVENT_TIME);         <-- disable the time
ctx_resched()
    perf_ctx_disable()                  <-- disable all PMUs
    perf_event_sched_in()
        ctx_sched_in()                  <-- enable the time
    perf_ctx_enable()                   <-- enable all PMUs
I think the ctx_sched_out(ctx, EVENT_TIME) should be moved after the
perf_ctx_disable().
Hope it can be fixed in a different way.
> But I'll have to continue staring at this later.
Sure.
Thanks,
Kan
Hi Peter,
On Tue, Jul 30, 2024 at 5:06 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Current ctx_resched() reschedules every events in all PMUs in the
> context even if it only needs to do it for a single event. This is the
> case when it opens a new event or enables an existing one. What we
> want is to reschedule events in the PMU only. Also perf_pmu_resched()
> currently calls ctx_resched() without PMU information.
>
> Let's add pmu argument to ctx_resched() to do the work for the given
> PMU only. And change the __pmu_ctx_sched_in() to be symmetrical to the
> _sched_out() counterpart for its arguments so that it can be called
> easily in the __perf_pmu_resched().
>
> Note that __perf_install_in_context() should call ctx_resched() for the
> very first event in the context in order to set ctx->is_active. Later
> events can be handled by __perf_pmu_resched().
>
> Care should be taken when it installs a task event for a PMU and
> there's no CPU event for the PMU. __perf_pmu_resched() will ask the
> CPU PMU context to schedule any events in it according to the group
> info. But as the PMU context was not activated, it didn't set the
> event context pointer. So I added new NULL checks in the
> __pmu_ctx_sched_{in,out}.
>
> With this change I can get 4x speed up (but actually it's proportional
> to the number of uncore PMU events) on a 2-socket Intel EMR machine in
> opening and closing a perf event for the core PMU in a loop while there
> are a bunch of uncore PMU events active on the CPU. The test code
> (stress-pmu) follows below.
>
> Before)
> # ./stress-pmu
> delta: 0.087068 sec (870 usec/op)
>
> After)
> # ./stress-pmu
> delta: 0.021440 sec (214 usec/op)
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> $ cat stress-pmu.c
> #include <stdio.h>
> #include <unistd.h>
> #include <linux/perf_event.h>
> #include <sys/syscall.h>
> #include <sys/time.h>
>
> /* from uncore cpumask on EMR */
> #define TARGET_CPU 60
>
> #define LOOP 100
> #define US2S 1000000
>
> int open_perf_event(int type, int config)
> {
> struct perf_event_attr attr = {
> .type = type,
> .config = config,
> };
> int fd;
>
> fd = syscall(SYS_perf_event_open, &attr, /*pid=*/-1, TARGET_CPU,
> /*group_fd=*/-1, /*flags=*/0);
> if (fd < 0)
> printf("perf_event_open failed (type=%d, config=%d): %m\n", type, config);
> return fd;
> }
>
> int main(int argc, char *argv[])
> {
> struct timeval ts1, ts2;
> unsigned long long delta;
> int target_cpu = TARGET_CPU;
>
> /* open random uncore PMU events */
> for (int i = 0; i < 100; i++)
> open_perf_event(/*type=*/i + 20, /*config=*/0);
>
> gettimeofday(&ts1, NULL);
> for (int i = 0; i < LOOP; i++)
> close(open_perf_event(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES));
> gettimeofday(&ts2, NULL);
>
> delta = ts2.tv_sec * US2S + ts2.tv_usec - (ts1.tv_sec * US2S + ts1.tv_usec);
> printf("delta: %llu.%06llu sec (%llu usec/op)\n",
> delta / US2S, delta % US2S, delta / LOOP);
> return 0;
> }
> ---
> v2) add 'pmu' argument to ctx_resched() to reduce duplication
Are you ok with this?
Thanks,
Namhyung
>
> kernel/events/core.c | 118 ++++++++++++++++++++++++++++++++++---------
> 1 file changed, 93 insertions(+), 25 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index f64c30e7d5da..41e2533859a4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -709,6 +709,10 @@ static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
>
> static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
> static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
> +static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
> + enum event_type_t event_type);
> +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> + enum event_type_t event_type);
>
> #ifdef CONFIG_CGROUP_PERF
>
> @@ -2668,6 +2672,17 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
> ctx_sched_in(ctx, EVENT_FLEXIBLE);
> }
>
> +static void perf_pmu_sched_in(struct perf_cpu_pmu_context *cpc,
> + struct perf_event_pmu_context *task_epc)
> +{
> + __pmu_ctx_sched_in(&cpc->epc, EVENT_PINNED);
> + if (task_epc)
> + __pmu_ctx_sched_in(task_epc, EVENT_PINNED);
> + __pmu_ctx_sched_in(&cpc->epc, EVENT_FLEXIBLE);
> + if (task_epc)
> + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
> +}
> +
> /*
> * We want to maintain the following priority of scheduling:
> * - CPU pinned (EVENT_CPU | EVENT_PINNED)
> @@ -2683,16 +2698,13 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
> * event_type is a bit mask of the types of events involved. For CPU events,
> * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
> */
> -/*
> - * XXX: ctx_resched() reschedule entire perf_event_context while adding new
> - * event to the context or enabling existing event in the context. We can
> - * probably optimize it by rescheduling only affected pmu_ctx.
> - */
> static void ctx_resched(struct perf_cpu_context *cpuctx,
> struct perf_event_context *task_ctx,
> - enum event_type_t event_type)
> + struct pmu *pmu, enum event_type_t event_type)
> {
> bool cpu_event = !!(event_type & EVENT_CPU);
> + struct perf_cpu_pmu_context *cpc = NULL;
> + struct perf_event_pmu_context *epc = NULL;
>
> /*
> * If pinned groups are involved, flexible groups also need to be
> @@ -2703,10 +2715,24 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>
> event_type &= EVENT_ALL;
>
> - perf_ctx_disable(&cpuctx->ctx, false);
> + if (pmu) {
> + cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> + perf_pmu_disable(pmu);
> + } else {
> + perf_ctx_disable(&cpuctx->ctx, false);
> + }
> +
> if (task_ctx) {
> - perf_ctx_disable(task_ctx, false);
> - task_ctx_sched_out(task_ctx, event_type);
> + if (pmu) {
> + if (WARN_ON_ONCE(!cpc->task_epc || cpc->task_epc->ctx != task_ctx))
> + goto out;
> +
> + epc = cpc->task_epc;
> + __pmu_ctx_sched_out(epc, event_type);
> + } else {
> + perf_ctx_disable(task_ctx, false);
> + task_ctx_sched_out(task_ctx, event_type);
> + }
> }
>
> /*
> @@ -2716,15 +2742,30 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
> * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
> * - otherwise, do nothing more.
> */
> - if (cpu_event)
> - ctx_sched_out(&cpuctx->ctx, event_type);
> - else if (event_type & EVENT_PINNED)
> - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> + if (cpu_event) {
> + if (pmu)
> + __pmu_ctx_sched_out(&cpc->epc, event_type);
> + else
> + ctx_sched_out(&cpuctx->ctx, event_type);
> + } else if (event_type & EVENT_PINNED) {
> + if (pmu)
> + __pmu_ctx_sched_out(&cpc->epc, EVENT_FLEXIBLE);
> + else
> + ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> + }
> +
> + if (pmu)
> + perf_pmu_sched_in(cpc, epc);
> + else
> + perf_event_sched_in(cpuctx, task_ctx);
>
> - perf_event_sched_in(cpuctx, task_ctx);
> +out:
> + if (pmu)
> + perf_pmu_enable(pmu);
> + else
> + perf_ctx_enable(&cpuctx->ctx, false);
>
> - perf_ctx_enable(&cpuctx->ctx, false);
> - if (task_ctx)
> + if (task_ctx && !pmu)
> perf_ctx_enable(task_ctx, false);
> }
>
> @@ -2734,7 +2775,7 @@ void perf_pmu_resched(struct pmu *pmu)
> struct perf_event_context *task_ctx = cpuctx->task_ctx;
>
> perf_ctx_lock(cpuctx, task_ctx);
> - ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
> + ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
> perf_ctx_unlock(cpuctx, task_ctx);
> }
>
> @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> if (reprogram) {
> ctx_sched_out(ctx, EVENT_TIME);
> add_event_to_ctx(event, ctx);
> - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> + if (ctx->nr_events == 1) {
> + /* The first event needs to set ctx->is_active. */
> + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> + } else {
> + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> + get_event_type(event));
> + ctx_sched_in(ctx, EVENT_TIME);
> + }
> } else {
> add_event_to_ctx(event, ctx);
> }
> @@ -2962,7 +3010,8 @@ static void __perf_event_enable(struct perf_event *event,
> if (ctx->task)
> WARN_ON_ONCE(task_ctx != ctx);
>
> - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
> + ctx_sched_in(ctx, EVENT_TIME);
> }
>
> /*
> @@ -3230,6 +3279,13 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
> struct perf_event *event, *tmp;
> struct pmu *pmu = pmu_ctx->pmu;
>
> + /*
> + * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
> + * for task events and there's no cpu events.
> + */
> + if (ctx == NULL)
> + return;
> +
> if (ctx->task && !ctx->is_active) {
> struct perf_cpu_pmu_context *cpc;
>
> @@ -3872,10 +3928,22 @@ static void ctx_groups_sched_in(struct perf_event_context *ctx,
> }
> }
>
> -static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
> - struct pmu *pmu)
> +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> + enum event_type_t event_type)
> {
> - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> + struct perf_event_context *ctx = pmu_ctx->ctx;
> +
> + /*
> + * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
> + * for task events and there's no cpu events.
> + */
> + if (ctx == NULL)
> + return;
> +
> + if (event_type & EVENT_PINNED)
> + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
> + if (event_type & EVENT_FLEXIBLE)
> + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
> }
>
> static void
> @@ -4309,14 +4377,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
> update_context_time(&cpuctx->ctx);
> __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
> rotate_ctx(&cpuctx->ctx, cpu_event);
> - __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
> + __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
> }
>
> if (task_event)
> rotate_ctx(task_epc->ctx, task_event);
>
> if (task_event || (task_epc && cpu_event))
> - __pmu_ctx_sched_in(task_epc->ctx, pmu);
> + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
>
> perf_pmu_enable(pmu);
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> @@ -4394,7 +4462,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
> */
> if (enabled) {
> clone_ctx = unclone_ctx(ctx);
> - ctx_resched(cpuctx, ctx, event_type);
> + ctx_resched(cpuctx, ctx, NULL, event_type);
> } else {
> ctx_sched_in(ctx, EVENT_TIME);
> }
> --
> 2.46.0.rc1.232.g9752f9e123-goog
>