Profiling QEMU during Fedora 35 for PPC64 boot revealed that
6.39% of total time was being spent in helper_insns_inc(), on a
POWER9 machine. To avoid calling this helper every time PMCs had
to be incremented, an inline implementation of PMC5 increment and
check for overflow was developed. This led to a reduction of
about 12% in Fedora's boot time.
Signed-off-by: Leandro Lupori <leandro.lupori@eldorado.org.br>
---
target/ppc/helper.h | 1 +
target/ppc/power8-pmu.c | 74 +++++++++++++++++++++--------------------
target/ppc/power8-pmu.h | 3 ++
target/ppc/translate.c | 28 ++++++++++++++--
4 files changed, 67 insertions(+), 39 deletions(-)
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 57eee07256..f8cd00c976 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -29,6 +29,7 @@ DEF_HELPER_2(store_mmcr1, void, env, tl)
DEF_HELPER_3(store_pmc, void, env, i32, i64)
DEF_HELPER_2(read_pmc, tl, env, i32)
DEF_HELPER_2(insns_inc, void, env, i32)
+DEF_HELPER_1(handle_pmc5_overflow, void, env)
#endif
DEF_HELPER_1(check_tlb_flush_local, void, env)
DEF_HELPER_1(check_tlb_flush_global, void, env)
diff --git a/target/ppc/power8-pmu.c b/target/ppc/power8-pmu.c
index beeab5c494..1381072b9e 100644
--- a/target/ppc/power8-pmu.c
+++ b/target/ppc/power8-pmu.c
@@ -22,8 +22,6 @@
#if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
-#define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL
-
static bool pmc_has_overflow_enabled(CPUPPCState *env, int sprn)
{
if (sprn == SPR_POWER_PMC1) {
@@ -88,49 +86,47 @@ static bool pmu_increment_insns(CPUPPCState *env, uint32_t num_insns)
bool overflow_triggered = false;
target_ulong tmp;
- if (unlikely(ins_cnt & 0x1e)) {
- if (ins_cnt & (1 << 1)) {
- tmp = env->spr[SPR_POWER_PMC1];
- tmp += num_insns;
- if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMC1CE)) {
- tmp = PMC_COUNTER_NEGATIVE_VAL;
- overflow_triggered = true;
- }
- env->spr[SPR_POWER_PMC1] = tmp;
+ if (ins_cnt & (1 << 1)) {
+ tmp = env->spr[SPR_POWER_PMC1];
+ tmp += num_insns;
+ if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMC1CE)) {
+ tmp = PMC_COUNTER_NEGATIVE_VAL;
+ overflow_triggered = true;
}
+ env->spr[SPR_POWER_PMC1] = tmp;
+ }
- if (ins_cnt & (1 << 2)) {
- tmp = env->spr[SPR_POWER_PMC2];
- tmp += num_insns;
- if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
- tmp = PMC_COUNTER_NEGATIVE_VAL;
- overflow_triggered = true;
- }
- env->spr[SPR_POWER_PMC2] = tmp;
+ if (ins_cnt & (1 << 2)) {
+ tmp = env->spr[SPR_POWER_PMC2];
+ tmp += num_insns;
+ if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
+ tmp = PMC_COUNTER_NEGATIVE_VAL;
+ overflow_triggered = true;
+ }
+ env->spr[SPR_POWER_PMC2] = tmp;
+ }
+
+ if (ins_cnt & (1 << 3)) {
+ tmp = env->spr[SPR_POWER_PMC3];
+ tmp += num_insns;
+ if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
+ tmp = PMC_COUNTER_NEGATIVE_VAL;
+ overflow_triggered = true;
}
+ env->spr[SPR_POWER_PMC3] = tmp;
+ }
- if (ins_cnt & (1 << 3)) {
- tmp = env->spr[SPR_POWER_PMC3];
+ if (ins_cnt & (1 << 4)) {
+ target_ulong mmcr1 = env->spr[SPR_POWER_MMCR1];
+ int sel = extract64(mmcr1, MMCR1_PMC4EVT_EXTR, MMCR1_EVT_SIZE);
+ if (sel == 0x02 || (env->spr[SPR_CTRL] & CTRL_RUN)) {
+ tmp = env->spr[SPR_POWER_PMC4];
tmp += num_insns;
if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
tmp = PMC_COUNTER_NEGATIVE_VAL;
overflow_triggered = true;
}
- env->spr[SPR_POWER_PMC3] = tmp;
- }
-
- if (ins_cnt & (1 << 4)) {
- target_ulong mmcr1 = env->spr[SPR_POWER_MMCR1];
- int sel = extract64(mmcr1, MMCR1_PMC4EVT_EXTR, MMCR1_EVT_SIZE);
- if (sel == 0x02 || (env->spr[SPR_CTRL] & CTRL_RUN)) {
- tmp = env->spr[SPR_POWER_PMC4];
- tmp += num_insns;
- if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
- tmp = PMC_COUNTER_NEGATIVE_VAL;
- overflow_triggered = true;
- }
- env->spr[SPR_POWER_PMC4] = tmp;
- }
+ env->spr[SPR_POWER_PMC4] = tmp;
}
}
@@ -310,6 +306,12 @@ static void fire_PMC_interrupt(PowerPCCPU *cpu)
raise_ebb_perfm_exception(env);
}
+void helper_handle_pmc5_overflow(CPUPPCState *env)
+{
+ env->spr[SPR_POWER_PMC5] = PMC_COUNTER_NEGATIVE_VAL;
+ fire_PMC_interrupt(env_archcpu(env));
+}
+
/* This helper assumes that the PMC is running. */
void helper_insns_inc(CPUPPCState *env, uint32_t num_insns)
{
diff --git a/target/ppc/power8-pmu.h b/target/ppc/power8-pmu.h
index 9692dd765e..c0093e2219 100644
--- a/target/ppc/power8-pmu.h
+++ b/target/ppc/power8-pmu.h
@@ -14,6 +14,9 @@
#define POWER8_PMU_H
#if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
+
+#define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL
+
void cpu_ppc_pmu_init(CPUPPCState *env);
void pmu_update_summaries(CPUPPCState *env);
#else
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 8fda2cf836..5c74684eee 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -36,6 +36,7 @@
#include "exec/log.h"
#include "qemu/atomic128.h"
#include "spr_common.h"
+#include "power8-pmu.h"
#include "qemu/qemu-print.h"
#include "qapi/error.h"
@@ -4263,6 +4264,9 @@ static void pmu_count_insns(DisasContext *ctx)
}
#if !defined(CONFIG_USER_ONLY)
+ TCGLabel *l;
+ TCGv t0;
+
/*
* The PMU insns_inc() helper stops the internal PMU timer if a
* counter overflows happens. In that case, if the guest is
@@ -4271,8 +4275,26 @@ static void pmu_count_insns(DisasContext *ctx)
*/
gen_icount_io_start(ctx);
- gen_helper_insns_inc(cpu_env, tcg_constant_i32(ctx->base.num_insns));
-#else
+ /* Avoid helper calls when only PMC5-6 are enabled. */
+ if (!ctx->pmc_other) {
+ l = gen_new_label();
+ t0 = tcg_temp_new();
+
+ gen_load_spr(t0, SPR_POWER_PMC5);
+ tcg_gen_addi_tl(t0, t0, ctx->base.num_insns);
+ gen_store_spr(SPR_POWER_PMC5, t0);
+ /* Check for overflow, if it's enabled */
+ if (ctx->mmcr0_pmcjce) {
+ tcg_gen_brcondi_tl(TCG_COND_LT, t0, PMC_COUNTER_NEGATIVE_VAL, l);
+ gen_helper_handle_pmc5_overflow(cpu_env);
+ }
+
+ gen_set_label(l);
+ tcg_temp_free(t0);
+ } else {
+ gen_helper_insns_inc(cpu_env, tcg_constant_i32(ctx->base.num_insns));
+ }
+ #else
/*
* User mode can read (but not write) PMC5 and start/stop
* the PMU via MMCR0_FC. In this case just increment
@@ -4285,7 +4307,7 @@ static void pmu_count_insns(DisasContext *ctx)
gen_store_spr(SPR_POWER_PMC5, t0);
tcg_temp_free(t0);
-#endif /* #if !defined(CONFIG_USER_ONLY) */
+ #endif /* #if !defined(CONFIG_USER_ONLY) */
}
#else
static void pmu_count_insns(DisasContext *ctx)
--
2.25.1
On 10/21/22 14:01, Leandro Lupori wrote:
> Profiling QEMU during Fedora 35 for PPC64 boot revealed that
> 6.39% of total time was being spent in helper_insns_inc(), on a
> POWER9 machine. To avoid calling this helper every time PMCs had
> to be incremented, an inline implementation of PMC5 increment and
> check for overflow was developed. This led to a reduction of
> about 12% in Fedora's boot time.
>
> Signed-off-by: Leandro Lupori <leandro.lupori@eldorado.org.br>
> ---
Given that PMC5 is the counter that is most likely to be active, yeah,
isolating the case where PMC5 is incremented standalone makes sense.
Still, a 12% performance gain is not too shabby. Not too shabby at all.
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> target/ppc/helper.h | 1 +
> target/ppc/power8-pmu.c | 74 +++++++++++++++++++++--------------------
> target/ppc/power8-pmu.h | 3 ++
> target/ppc/translate.c | 28 ++++++++++++++--
> 4 files changed, 67 insertions(+), 39 deletions(-)
>
> diff --git a/target/ppc/helper.h b/target/ppc/helper.h
> index 57eee07256..f8cd00c976 100644
> --- a/target/ppc/helper.h
> +++ b/target/ppc/helper.h
> @@ -29,6 +29,7 @@ DEF_HELPER_2(store_mmcr1, void, env, tl)
> DEF_HELPER_3(store_pmc, void, env, i32, i64)
> DEF_HELPER_2(read_pmc, tl, env, i32)
> DEF_HELPER_2(insns_inc, void, env, i32)
> +DEF_HELPER_1(handle_pmc5_overflow, void, env)
> #endif
> DEF_HELPER_1(check_tlb_flush_local, void, env)
> DEF_HELPER_1(check_tlb_flush_global, void, env)
> diff --git a/target/ppc/power8-pmu.c b/target/ppc/power8-pmu.c
> index beeab5c494..1381072b9e 100644
> --- a/target/ppc/power8-pmu.c
> +++ b/target/ppc/power8-pmu.c
> @@ -22,8 +22,6 @@
>
> #if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
>
> -#define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL
> -
> static bool pmc_has_overflow_enabled(CPUPPCState *env, int sprn)
> {
> if (sprn == SPR_POWER_PMC1) {
> @@ -88,49 +86,47 @@ static bool pmu_increment_insns(CPUPPCState *env, uint32_t num_insns)
> bool overflow_triggered = false;
> target_ulong tmp;
>
> - if (unlikely(ins_cnt & 0x1e)) {
> - if (ins_cnt & (1 << 1)) {
> - tmp = env->spr[SPR_POWER_PMC1];
> - tmp += num_insns;
> - if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMC1CE)) {
> - tmp = PMC_COUNTER_NEGATIVE_VAL;
> - overflow_triggered = true;
> - }
> - env->spr[SPR_POWER_PMC1] = tmp;
> + if (ins_cnt & (1 << 1)) {
> + tmp = env->spr[SPR_POWER_PMC1];
> + tmp += num_insns;
> + if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMC1CE)) {
> + tmp = PMC_COUNTER_NEGATIVE_VAL;
> + overflow_triggered = true;
> }
> + env->spr[SPR_POWER_PMC1] = tmp;
> + }
>
> - if (ins_cnt & (1 << 2)) {
> - tmp = env->spr[SPR_POWER_PMC2];
> - tmp += num_insns;
> - if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
> - tmp = PMC_COUNTER_NEGATIVE_VAL;
> - overflow_triggered = true;
> - }
> - env->spr[SPR_POWER_PMC2] = tmp;
> + if (ins_cnt & (1 << 2)) {
> + tmp = env->spr[SPR_POWER_PMC2];
> + tmp += num_insns;
> + if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
> + tmp = PMC_COUNTER_NEGATIVE_VAL;
> + overflow_triggered = true;
> + }
> + env->spr[SPR_POWER_PMC2] = tmp;
> + }
> +
> + if (ins_cnt & (1 << 3)) {
> + tmp = env->spr[SPR_POWER_PMC3];
> + tmp += num_insns;
> + if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
> + tmp = PMC_COUNTER_NEGATIVE_VAL;
> + overflow_triggered = true;
> }
> + env->spr[SPR_POWER_PMC3] = tmp;
> + }
>
> - if (ins_cnt & (1 << 3)) {
> - tmp = env->spr[SPR_POWER_PMC3];
> + if (ins_cnt & (1 << 4)) {
> + target_ulong mmcr1 = env->spr[SPR_POWER_MMCR1];
> + int sel = extract64(mmcr1, MMCR1_PMC4EVT_EXTR, MMCR1_EVT_SIZE);
> + if (sel == 0x02 || (env->spr[SPR_CTRL] & CTRL_RUN)) {
> + tmp = env->spr[SPR_POWER_PMC4];
> tmp += num_insns;
> if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
> tmp = PMC_COUNTER_NEGATIVE_VAL;
> overflow_triggered = true;
> }
> - env->spr[SPR_POWER_PMC3] = tmp;
> - }
> -
> - if (ins_cnt & (1 << 4)) {
> - target_ulong mmcr1 = env->spr[SPR_POWER_MMCR1];
> - int sel = extract64(mmcr1, MMCR1_PMC4EVT_EXTR, MMCR1_EVT_SIZE);
> - if (sel == 0x02 || (env->spr[SPR_CTRL] & CTRL_RUN)) {
> - tmp = env->spr[SPR_POWER_PMC4];
> - tmp += num_insns;
> - if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
> - tmp = PMC_COUNTER_NEGATIVE_VAL;
> - overflow_triggered = true;
> - }
> - env->spr[SPR_POWER_PMC4] = tmp;
> - }
> + env->spr[SPR_POWER_PMC4] = tmp;
> }
> }
>
> @@ -310,6 +306,12 @@ static void fire_PMC_interrupt(PowerPCCPU *cpu)
> raise_ebb_perfm_exception(env);
> }
>
> +void helper_handle_pmc5_overflow(CPUPPCState *env)
> +{
> + env->spr[SPR_POWER_PMC5] = PMC_COUNTER_NEGATIVE_VAL;
> + fire_PMC_interrupt(env_archcpu(env));
> +}
> +
> /* This helper assumes that the PMC is running. */
> void helper_insns_inc(CPUPPCState *env, uint32_t num_insns)
> {
> diff --git a/target/ppc/power8-pmu.h b/target/ppc/power8-pmu.h
> index 9692dd765e..c0093e2219 100644
> --- a/target/ppc/power8-pmu.h
> +++ b/target/ppc/power8-pmu.h
> @@ -14,6 +14,9 @@
> #define POWER8_PMU_H
>
> #if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
> +
> +#define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL
> +
> void cpu_ppc_pmu_init(CPUPPCState *env);
> void pmu_update_summaries(CPUPPCState *env);
> #else
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index 8fda2cf836..5c74684eee 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -36,6 +36,7 @@
> #include "exec/log.h"
> #include "qemu/atomic128.h"
> #include "spr_common.h"
> +#include "power8-pmu.h"
>
> #include "qemu/qemu-print.h"
> #include "qapi/error.h"
> @@ -4263,6 +4264,9 @@ static void pmu_count_insns(DisasContext *ctx)
> }
>
> #if !defined(CONFIG_USER_ONLY)
> + TCGLabel *l;
> + TCGv t0;
> +
> /*
> * The PMU insns_inc() helper stops the internal PMU timer if a
> * counter overflows happens. In that case, if the guest is
> @@ -4271,8 +4275,26 @@ static void pmu_count_insns(DisasContext *ctx)
> */
> gen_icount_io_start(ctx);
>
> - gen_helper_insns_inc(cpu_env, tcg_constant_i32(ctx->base.num_insns));
> -#else
> + /* Avoid helper calls when only PMC5-6 are enabled. */
> + if (!ctx->pmc_other) {
> + l = gen_new_label();
> + t0 = tcg_temp_new();
> +
> + gen_load_spr(t0, SPR_POWER_PMC5);
> + tcg_gen_addi_tl(t0, t0, ctx->base.num_insns);
> + gen_store_spr(SPR_POWER_PMC5, t0);
> + /* Check for overflow, if it's enabled */
> + if (ctx->mmcr0_pmcjce) {
> + tcg_gen_brcondi_tl(TCG_COND_LT, t0, PMC_COUNTER_NEGATIVE_VAL, l);
> + gen_helper_handle_pmc5_overflow(cpu_env);
> + }
> +
> + gen_set_label(l);
> + tcg_temp_free(t0);
> + } else {
> + gen_helper_insns_inc(cpu_env, tcg_constant_i32(ctx->base.num_insns));
> + }
> + #else
> /*
> * User mode can read (but not write) PMC5 and start/stop
> * the PMU via MMCR0_FC. In this case just increment
> @@ -4285,7 +4307,7 @@ static void pmu_count_insns(DisasContext *ctx)
> gen_store_spr(SPR_POWER_PMC5, t0);
>
> tcg_temp_free(t0);
> -#endif /* #if !defined(CONFIG_USER_ONLY) */
> + #endif /* #if !defined(CONFIG_USER_ONLY) */
> }
> #else
> static void pmu_count_insns(DisasContext *ctx)
On 10/25/22 16:29, Daniel Henrique Barboza wrote:
> On 10/21/22 14:01, Leandro Lupori wrote:
>> Profiling QEMU during Fedora 35 for PPC64 boot revealed that
>> 6.39% of total time was being spent in helper_insns_inc(), on a
>> POWER9 machine. To avoid calling this helper every time PMCs had
>> to be incremented, an inline implementation of PMC5 increment and
>> check for overflow was developed. This led to a reduction of
>> about 12% in Fedora's boot time.
>>
>> Signed-off-by: Leandro Lupori <leandro.lupori@eldorado.org.br>
>> ---
>
> Given that PMC5 is the counter that is most likely to be active, yeah,
> isolating the case where PMC5 is incremented standalone makes sense.
>
> Still, a 12% performance gain is not too shabby. Not too shabby at all.
>

I've tried to move more of helper_insns_inc() to the inline
implementation, but then performance started to decrease. Initially I
found this strange, but perf revealed a considerable increase of time
spent in functions such as tcg_gen_code and liveness_pass_1. Since this
code has to be generated and optimized for most TBs, it seems that
making the inline sequence too big slows down code generation.

> Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
>
© 2016 - 2026 Red Hat, Inc.