[PATCH v12 02/15] accel: collecting TB execution count

Fei Wu posted 15 patches 2 years, 8 months ago
Maintainers: Richard Henderson <richard.henderson@linaro.org>, Paolo Bonzini <pbonzini@redhat.com>, "Dr. David Alan Gilbert" <dave@treblig.org>, Laurent Vivier <laurent@vivier.eu>, "Marc-André Lureau" <marcandre.lureau@redhat.com>, "Daniel P. Berrangé" <berrange@redhat.com>, Thomas Huth <thuth@redhat.com>, "Philippe Mathieu-Daudé" <philmd@linaro.org>, Markus Armbruster <armbru@redhat.com>
There is a newer version of this series
[PATCH v12 02/15] accel: collecting TB execution count
Posted by Fei Wu 2 years, 8 months ago
From: "Vanderson M. do Rosario" <vandersonmr2@gmail.com>

If a TB has a TBS (TBStatistics) with the TB_EXEC_STATS
enabled, then we instrument the start code of this TB
to atomically count the number of times it is executed.
We count both the number of "normal" executions and atomic
executions of a TB.

The execution count of the TB is stored in its respective
TBS.

All TBStatistics are created by default with the flags from
default_tbstats_flag.

Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
Message-Id: <20190829173437.5926-3-vandersonmr2@gmail.com>
[AJB: Fix author]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Fei Wu <fei2.wu@intel.com>
---
 accel/tcg/cpu-exec.c          |  6 ++++++
 accel/tcg/tb-stats.c          |  6 ++++++
 accel/tcg/tcg-runtime.c       |  8 ++++++++
 accel/tcg/tcg-runtime.h       |  1 +
 accel/tcg/translate-all.c     |  7 +++++--
 accel/tcg/translator.c        | 10 ++++++++++
 include/exec/gen-icount.h     |  1 +
 include/exec/tb-stats-flags.h |  5 +++++
 include/exec/tb-stats.h       | 13 +++++++++++++
 9 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index bc0e1c3299..a3f24c62a0 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -25,6 +25,7 @@
 #include "trace.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
+#include "exec/tb-stats.h"
 #include "tcg/tcg.h"
 #include "qemu/atomic.h"
 #include "qemu/rcu.h"
@@ -564,7 +565,12 @@ void cpu_exec_step_atomic(CPUState *cpu)
             mmap_unlock();
         }
 
+        if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
+            tb->tb_stats->executions.atomic++;
+        }
+
         cpu_exec_enter(cpu);
+
         /* execute the generated code */
         trace_exec_tb(tb, pc);
         cpu_tb_exec(cpu, tb, &tb_exit);
diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c
index f988bd8a31..143a52ef5c 100644
--- a/accel/tcg/tb-stats.c
+++ b/accel/tcg/tb-stats.c
@@ -22,6 +22,7 @@ enum TBStatsStatus {
 };
 
 static enum TBStatsStatus tcg_collect_tb_stats;
+static uint32_t default_tbstats_flag;
 
 void init_tb_stats_htable(void)
 {
@@ -56,3 +57,8 @@ bool tb_stats_collection_paused(void)
 {
     return tcg_collect_tb_stats == TB_STATS_PAUSED;
 }
+
+uint32_t get_default_tbstats_flag(void)
+{
+    return default_tbstats_flag;
+}
diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
index e4e030043f..85cb795732 100644
--- a/accel/tcg/tcg-runtime.c
+++ b/accel/tcg/tcg-runtime.c
@@ -27,6 +27,7 @@
 #include "exec/helper-proto.h"
 #include "exec/cpu_ldst.h"
 #include "exec/exec-all.h"
+#include "exec/tb-stats.h"
 #include "disas/disas.h"
 #include "exec/log.h"
 #include "tcg/tcg.h"
@@ -148,3 +149,10 @@ void HELPER(exit_atomic)(CPUArchState *env)
 {
     cpu_loop_exit_atomic(env_cpu(env), GETPC());
 }
+
+void HELPER(inc_exec_freq)(void *ptr)
+{
+    TBStatistics *stats = (TBStatistics *) ptr;
+    tcg_debug_assert(stats);
+    ++stats->executions.normal;
+}
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index 6f8c2061d0..8d310e39c0 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -41,6 +41,7 @@ DEF_HELPER_FLAGS_3(memset, TCG_CALL_NO_RWG, ptr, ptr, int, ptr)
 
 DEF_HELPER_FLAGS_3(ld_i128, TCG_CALL_NO_WG, i128, env, i64, i32)
 DEF_HELPER_FLAGS_4(st_i128, TCG_CALL_NO_WG, void, env, i64, i128, i32)
+DEF_HELPER_FLAGS_1(inc_exec_freq, TCG_CALL_NO_RWG, void, ptr)
 
 DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG,
                    i32, env, i64, i32, i32, i32)
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index a5ebc5e9e2..31d4ddf4ab 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -312,6 +312,7 @@ static TBStatistics *tb_get_stats(tb_page_addr_t phys_pc, target_ulong pc,
     new_stats->pc = pc;
     new_stats->cs_base = cs_base;
     new_stats->flags = flags;
+    new_stats->stats_enabled = get_default_tbstats_flag();
 
     /*
      * All initialisation must be complete before we insert into qht
@@ -408,9 +409,11 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     /*
      * We want to fetch the stats structure before we start code
      * generation so we can count interesting things about this
-     * generation.
+     * generation. If dfilter is in effect we will only collect stats
+     * for the specified range.
      */
-    if (tb_stats_collection_enabled()) {
+    if (tb_stats_collection_enabled() &&
+        qemu_log_in_addr_range(tb->pc)) {
         tb->tb_stats = tb_get_stats(phys_pc, pc, cs_base, flags);
     } else {
         tb->tb_stats = NULL;
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index 7bda43ff61..165072c8c3 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -18,6 +18,15 @@
 #include "exec/plugin-gen.h"
 #include "exec/replay-core.h"
 
+static inline void gen_tb_exec_count(TranslationBlock *tb)
+{
+    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
+        TCGv_ptr ptr = tcg_temp_new_ptr();
+        tcg_gen_movi_ptr(ptr, (intptr_t)tb->tb_stats);
+        gen_helper_inc_exec_freq(ptr);
+    }
+}
+
 bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
 {
     /* Suppress goto_tb if requested. */
@@ -56,6 +65,7 @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
 
     /* Start translating.  */
     gen_tb_start(db->tb);
+    gen_tb_exec_count(tb);
     ops->tb_start(db, cpu);
     tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */
 
diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index f6de79a6b4..20e7835ff0 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -2,6 +2,7 @@
 #define GEN_ICOUNT_H
 
 #include "exec/exec-all.h"
+#include "exec/tb-stats.h"
 
 /* Helpers for instruction counting code generation.  */
 
diff --git a/include/exec/tb-stats-flags.h b/include/exec/tb-stats-flags.h
index 87ee3d902e..fa71eb6f0c 100644
--- a/include/exec/tb-stats-flags.h
+++ b/include/exec/tb-stats-flags.h
@@ -11,6 +11,9 @@
 #ifndef TB_STATS_FLAGS
 #define TB_STATS_FLAGS
 
+#define TB_NOTHING    (1 << 0)
+#define TB_EXEC_STATS (1 << 1)
+
 /* TBStatistic collection controls */
 void enable_collect_tb_stats(void);
 void disable_collect_tb_stats(void);
@@ -18,4 +21,6 @@ void pause_collect_tb_stats(void);
 bool tb_stats_collection_enabled(void);
 bool tb_stats_collection_paused(void);
 
+uint32_t get_default_tbstats_flag(void);
+
 #endif
diff --git a/include/exec/tb-stats.h b/include/exec/tb-stats.h
index b519465665..eb1fa92a4e 100644
--- a/include/exec/tb-stats.h
+++ b/include/exec/tb-stats.h
@@ -31,6 +31,9 @@
 #include "exec/tb-stats-flags.h"
 #include "tcg/tcg.h"
 
+#define tb_stats_enabled(tb, JIT_STATS) \
+    (tb && tb->tb_stats && (tb->tb_stats->stats_enabled & JIT_STATS))
+
 typedef struct TBStatistics TBStatistics;
 
 /*
@@ -47,6 +50,16 @@ struct TBStatistics {
     uint32_t     flags;
     /* cs_base isn't included in the hash but we do check for matches */
     target_ulong cs_base;
+
+    /* which stats are enabled for this TBStats */
+    uint32_t stats_enabled;
+
+    /* Execution stats */
+    struct {
+        unsigned long normal;
+        unsigned long atomic;
+    } executions;
+
 };
 
 bool tb_stats_cmp(const void *ap, const void *bp);
-- 
2.25.1


Re: [PATCH v12 02/15] accel: collecting TB execution count
Posted by Richard Henderson 2 years, 8 months ago
On 5/18/23 06:57, Fei Wu wrote:
> +void HELPER(inc_exec_freq)(void *ptr)
> +{
> +    TBStatistics *stats = (TBStatistics *) ptr;
> +    tcg_debug_assert(stats);
> +    ++stats->executions.normal;
> +}
...
> +static inline void gen_tb_exec_count(TranslationBlock *tb)
> +{
> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
> +        TCGv_ptr ptr = tcg_temp_new_ptr();
> +        tcg_gen_movi_ptr(ptr, (intptr_t)tb->tb_stats);
> +        gen_helper_inc_exec_freq(ptr);
> +    }
> +}

This is 3 host instructions, easily expanded inline:

--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -11,6 +11,7 @@
  #include "qemu/error-report.h"
  #include "tcg/tcg.h"
  #include "tcg/tcg-op.h"
+#include "tcg/tcg-temp-internal.h"
  #include "exec/exec-all.h"
  #include "exec/gen-icount.h"
  #include "exec/log.h"
@@ -18,6 +19,30 @@
  #include "exec/plugin-gen.h"
  #include "exec/replay-core.h"

+
+static void gen_tb_exec_count(TranslationBlock *tb)
+{
+    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
+        TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
+
+        tcg_gen_movi_ptr(ptr, (intptr_t)&tb->tb_stats->executions.normal);
+        if (sizeof(tb->tb_stats->executions.normal) == 4) {
+            TCGv_i32 t = tcg_temp_ebb_new_i32();
+            tcg_gen_ld_i32(t, ptr, 0);
+            tcg_gen_addi_i32(t, t, 1);
+            tcg_gen_st_i32(t, ptr, 0);
+            tcg_temp_free_i32(t);
+        } else {
+            TCGv_i64 t = tcg_temp_ebb_new_i64();
+            tcg_gen_ld_i64(t, ptr, 0);
+            tcg_gen_addi_i64(t, t, 1);
+            tcg_gen_st_i64(t, ptr, 0);
+            tcg_temp_free_i64(t);
+        }
+        tcg_temp_free_ptr(ptr);
+    }
+}
+
  bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
  {
      /* Suppress goto_tb if requested. */


I'm not expecially keen on embedding the TBStatistics pointer directly like this; for most 
hosts we will have to put this constant into the constant pool.  Whereas the pointer 
already exists at tb->tb_stats, and tb is at a constant displacement prior to the code, so 
we already have mechanisms for generating pc-relative addresses.

However, that's premature optimization.  Let's get it working first.


r~
Re: [PATCH v12 02/15] accel: collecting TB execution count
Posted by Wu, Fei 2 years, 8 months ago
On 5/23/2023 8:45 AM, Richard Henderson wrote:
> On 5/18/23 06:57, Fei Wu wrote:
>> +void HELPER(inc_exec_freq)(void *ptr)
>> +{
>> +    TBStatistics *stats = (TBStatistics *) ptr;
>> +    tcg_debug_assert(stats);
>> +    ++stats->executions.normal;
>> +}
> ...
>> +static inline void gen_tb_exec_count(TranslationBlock *tb)
>> +{
>> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>> +        TCGv_ptr ptr = tcg_temp_new_ptr();
>> +        tcg_gen_movi_ptr(ptr, (intptr_t)tb->tb_stats);
>> +        gen_helper_inc_exec_freq(ptr);
>> +    }
>> +}
> 
> This is 3 host instructions, easily expanded inline:
> 
> --- a/accel/tcg/translator.c
> +++ b/accel/tcg/translator.c
> @@ -11,6 +11,7 @@
>  #include "qemu/error-report.h"
>  #include "tcg/tcg.h"
>  #include "tcg/tcg-op.h"
> +#include "tcg/tcg-temp-internal.h"
>  #include "exec/exec-all.h"
>  #include "exec/gen-icount.h"
>  #include "exec/log.h"
> @@ -18,6 +19,30 @@
>  #include "exec/plugin-gen.h"
>  #include "exec/replay-core.h"
> 
> +
> +static void gen_tb_exec_count(TranslationBlock *tb)
> +{
> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
> +        TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
> +
> +        tcg_gen_movi_ptr(ptr, (intptr_t)&tb->tb_stats->executions.normal);
> +        if (sizeof(tb->tb_stats->executions.normal) == 4) {
> +            TCGv_i32 t = tcg_temp_ebb_new_i32();
> +            tcg_gen_ld_i32(t, ptr, 0);
> +            tcg_gen_addi_i32(t, t, 1);
> +            tcg_gen_st_i32(t, ptr, 0);
> +            tcg_temp_free_i32(t);
> +        } else {
> +            TCGv_i64 t = tcg_temp_ebb_new_i64();
> +            tcg_gen_ld_i64(t, ptr, 0);
> +            tcg_gen_addi_i64(t, t, 1);
> +            tcg_gen_st_i64(t, ptr, 0);
> +            tcg_temp_free_i64(t);
> +        }
> +        tcg_temp_free_ptr(ptr);
> +    }
> +}
> +
>  bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
>  {
>      /* Suppress goto_tb if requested. */
> 
> 
> I'm not expecially keen on embedding the TBStatistics pointer directly
> like this; for most hosts we will have to put this constant into the
> constant pool.  Whereas the pointer already exists at tb->tb_stats, and
> tb is at a constant displacement prior to the code, so we already have
> mechanisms for generating pc-relative addresses.
> 
> However, that's premature optimization.  Let's get it working first.
> 
Richard, have you reviewed the whole series? I will integrate your
change to next version.

Thanks,
Fei.
> 
> r~
> 


Re: [PATCH v12 02/15] accel: collecting TB execution count
Posted by Richard Henderson 2 years, 8 months ago
On 5/24/23 06:35, Wu, Fei wrote:
> On 5/23/2023 8:45 AM, Richard Henderson wrote:
>> On 5/18/23 06:57, Fei Wu wrote:
>>> +void HELPER(inc_exec_freq)(void *ptr)
>>> +{
>>> +    TBStatistics *stats = (TBStatistics *) ptr;
>>> +    tcg_debug_assert(stats);
>>> +    ++stats->executions.normal;
>>> +}
>> ...
>>> +static inline void gen_tb_exec_count(TranslationBlock *tb)
>>> +{
>>> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>>> +        TCGv_ptr ptr = tcg_temp_new_ptr();
>>> +        tcg_gen_movi_ptr(ptr, (intptr_t)tb->tb_stats);
>>> +        gen_helper_inc_exec_freq(ptr);
>>> +    }
>>> +}
>>
>> This is 3 host instructions, easily expanded inline:
>>
>> --- a/accel/tcg/translator.c
>> +++ b/accel/tcg/translator.c
>> @@ -11,6 +11,7 @@
>>   #include "qemu/error-report.h"
>>   #include "tcg/tcg.h"
>>   #include "tcg/tcg-op.h"
>> +#include "tcg/tcg-temp-internal.h"
>>   #include "exec/exec-all.h"
>>   #include "exec/gen-icount.h"
>>   #include "exec/log.h"
>> @@ -18,6 +19,30 @@
>>   #include "exec/plugin-gen.h"
>>   #include "exec/replay-core.h"
>>
>> +
>> +static void gen_tb_exec_count(TranslationBlock *tb)
>> +{
>> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>> +        TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
>> +
>> +        tcg_gen_movi_ptr(ptr, (intptr_t)&tb->tb_stats->executions.normal);
>> +        if (sizeof(tb->tb_stats->executions.normal) == 4) {
>> +            TCGv_i32 t = tcg_temp_ebb_new_i32();
>> +            tcg_gen_ld_i32(t, ptr, 0);
>> +            tcg_gen_addi_i32(t, t, 1);
>> +            tcg_gen_st_i32(t, ptr, 0);
>> +            tcg_temp_free_i32(t);
>> +        } else {
>> +            TCGv_i64 t = tcg_temp_ebb_new_i64();
>> +            tcg_gen_ld_i64(t, ptr, 0);
>> +            tcg_gen_addi_i64(t, t, 1);
>> +            tcg_gen_st_i64(t, ptr, 0);
>> +            tcg_temp_free_i64(t);
>> +        }
>> +        tcg_temp_free_ptr(ptr);
>> +    }
>> +}
>> +
>>   bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
>>   {
>>       /* Suppress goto_tb if requested. */
>>
>>
>> I'm not expecially keen on embedding the TBStatistics pointer directly
>> like this; for most hosts we will have to put this constant into the
>> constant pool.  Whereas the pointer already exists at tb->tb_stats, and
>> tb is at a constant displacement prior to the code, so we already have
>> mechanisms for generating pc-relative addresses.
>>
>> However, that's premature optimization.  Let's get it working first.
>>
> Richard, have you reviewed the whole series? I will integrate your
> change to next version.

No, it's difficult to see what's going on.
In your next revision, please remove CONFIG_PROFILER entirely first, which was what I was 
planning to do locally.


r~

Re: [PATCH v12 02/15] accel: collecting TB execution count
Posted by Wu, Fei 2 years, 8 months ago
On 5/25/2023 1:02 AM, Richard Henderson wrote:
> On 5/24/23 06:35, Wu, Fei wrote:
>> On 5/23/2023 8:45 AM, Richard Henderson wrote:
>>> On 5/18/23 06:57, Fei Wu wrote:
>>>> +void HELPER(inc_exec_freq)(void *ptr)
>>>> +{
>>>> +    TBStatistics *stats = (TBStatistics *) ptr;
>>>> +    tcg_debug_assert(stats);
>>>> +    ++stats->executions.normal;
>>>> +}
>>> ...
>>>> +static inline void gen_tb_exec_count(TranslationBlock *tb)
>>>> +{
>>>> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>>>> +        TCGv_ptr ptr = tcg_temp_new_ptr();
>>>> +        tcg_gen_movi_ptr(ptr, (intptr_t)tb->tb_stats);
>>>> +        gen_helper_inc_exec_freq(ptr);
>>>> +    }
>>>> +}
>>>
>>> This is 3 host instructions, easily expanded inline:
>>>
>>> --- a/accel/tcg/translator.c
>>> +++ b/accel/tcg/translator.c
>>> @@ -11,6 +11,7 @@
>>>   #include "qemu/error-report.h"
>>>   #include "tcg/tcg.h"
>>>   #include "tcg/tcg-op.h"
>>> +#include "tcg/tcg-temp-internal.h"
>>>   #include "exec/exec-all.h"
>>>   #include "exec/gen-icount.h"
>>>   #include "exec/log.h"
>>> @@ -18,6 +19,30 @@
>>>   #include "exec/plugin-gen.h"
>>>   #include "exec/replay-core.h"
>>>
>>> +
>>> +static void gen_tb_exec_count(TranslationBlock *tb)
>>> +{
>>> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>>> +        TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
>>> +
>>> +        tcg_gen_movi_ptr(ptr,
>>> (intptr_t)&tb->tb_stats->executions.normal);
>>> +        if (sizeof(tb->tb_stats->executions.normal) == 4) {
>>> +            TCGv_i32 t = tcg_temp_ebb_new_i32();
>>> +            tcg_gen_ld_i32(t, ptr, 0);
>>> +            tcg_gen_addi_i32(t, t, 1);
>>> +            tcg_gen_st_i32(t, ptr, 0);
>>> +            tcg_temp_free_i32(t);
>>> +        } else {
>>> +            TCGv_i64 t = tcg_temp_ebb_new_i64();
>>> +            tcg_gen_ld_i64(t, ptr, 0);
>>> +            tcg_gen_addi_i64(t, t, 1);
>>> +            tcg_gen_st_i64(t, ptr, 0);
>>> +            tcg_temp_free_i64(t);
>>> +        }
>>> +        tcg_temp_free_ptr(ptr);
>>> +    }
>>> +}
>>> +
>>>   bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
>>>   {
>>>       /* Suppress goto_tb if requested. */
>>>
>>>
>>> I'm not expecially keen on embedding the TBStatistics pointer directly
>>> like this; for most hosts we will have to put this constant into the
>>> constant pool.  Whereas the pointer already exists at tb->tb_stats, and
>>> tb is at a constant displacement prior to the code, so we already have
>>> mechanisms for generating pc-relative addresses.
>>>
>>> However, that's premature optimization.  Let's get it working first.
>>>
>> Richard, have you reviewed the whole series? I will integrate your
>> change to next version.
> 
> No, it's difficult to see what's going on.
> In your next revision, please remove CONFIG_PROFILER entirely first,
> which was what I was planning to do locally.
> 
This will change the series dramatically, the patches probably need to
be reorganized, a lot of rebases are required, and some functions such
as tcg_dump_op_count which is in the scope of CONFIG_PROFILER will be
removed first then added backed. It takes time, I will try to send it
out asap.

Thanks,
Fei.
> 
> r~


Re: [PATCH v12 02/15] accel: collecting TB execution count
Posted by Wu, Fei 2 years, 8 months ago
On 5/25/2023 1:02 AM, Richard Henderson wrote:
> On 5/24/23 06:35, Wu, Fei wrote:
>> On 5/23/2023 8:45 AM, Richard Henderson wrote:
>>> On 5/18/23 06:57, Fei Wu wrote:
>>>> +void HELPER(inc_exec_freq)(void *ptr)
>>>> +{
>>>> +    TBStatistics *stats = (TBStatistics *) ptr;
>>>> +    tcg_debug_assert(stats);
>>>> +    ++stats->executions.normal;
>>>> +}
>>> ...
>>>> +static inline void gen_tb_exec_count(TranslationBlock *tb)
>>>> +{
>>>> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>>>> +        TCGv_ptr ptr = tcg_temp_new_ptr();
>>>> +        tcg_gen_movi_ptr(ptr, (intptr_t)tb->tb_stats);
>>>> +        gen_helper_inc_exec_freq(ptr);
>>>> +    }
>>>> +}
>>>
>>> This is 3 host instructions, easily expanded inline:
>>>
>>> --- a/accel/tcg/translator.c
>>> +++ b/accel/tcg/translator.c
>>> @@ -11,6 +11,7 @@
>>>   #include "qemu/error-report.h"
>>>   #include "tcg/tcg.h"
>>>   #include "tcg/tcg-op.h"
>>> +#include "tcg/tcg-temp-internal.h"
>>>   #include "exec/exec-all.h"
>>>   #include "exec/gen-icount.h"
>>>   #include "exec/log.h"
>>> @@ -18,6 +19,30 @@
>>>   #include "exec/plugin-gen.h"
>>>   #include "exec/replay-core.h"
>>>
>>> +
>>> +static void gen_tb_exec_count(TranslationBlock *tb)
>>> +{
>>> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>>> +        TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
>>> +
>>> +        tcg_gen_movi_ptr(ptr,
>>> (intptr_t)&tb->tb_stats->executions.normal);
>>> +        if (sizeof(tb->tb_stats->executions.normal) == 4) {
>>> +            TCGv_i32 t = tcg_temp_ebb_new_i32();
>>> +            tcg_gen_ld_i32(t, ptr, 0);
>>> +            tcg_gen_addi_i32(t, t, 1);
>>> +            tcg_gen_st_i32(t, ptr, 0);
>>> +            tcg_temp_free_i32(t);
>>> +        } else {
>>> +            TCGv_i64 t = tcg_temp_ebb_new_i64();
>>> +            tcg_gen_ld_i64(t, ptr, 0);
>>> +            tcg_gen_addi_i64(t, t, 1);
>>> +            tcg_gen_st_i64(t, ptr, 0);
>>> +            tcg_temp_free_i64(t);
>>> +        }
>>> +        tcg_temp_free_ptr(ptr);
>>> +    }
>>> +}
>>> +
>>>   bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
>>>   {
>>>       /* Suppress goto_tb if requested. */
>>>
>>>
>>> I'm not expecially keen on embedding the TBStatistics pointer directly
>>> like this; for most hosts we will have to put this constant into the
>>> constant pool.  Whereas the pointer already exists at tb->tb_stats, and
>>> tb is at a constant displacement prior to the code, so we already have
>>> mechanisms for generating pc-relative addresses.
>>>
>>> However, that's premature optimization.  Let's get it working first.
>>>
>> Richard, have you reviewed the whole series? I will integrate your
>> change to next version.
> 
> No, it's difficult to see what's going on.
> In your next revision, please remove CONFIG_PROFILER entirely first,
> which was what I was planning to do locally.
> 
OK, I will update it.

Thanks,
Fei.

> 
> r~


Re: [PATCH v12 02/15] accel: collecting TB execution count
Posted by Wu, Fei 2 years, 8 months ago
On 5/23/2023 8:45 AM, Richard Henderson wrote:
> On 5/18/23 06:57, Fei Wu wrote:
>> +void HELPER(inc_exec_freq)(void *ptr)
>> +{
>> +    TBStatistics *stats = (TBStatistics *) ptr;
>> +    tcg_debug_assert(stats);
>> +    ++stats->executions.normal;
>> +}
> ...
>> +static inline void gen_tb_exec_count(TranslationBlock *tb)
>> +{
>> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>> +        TCGv_ptr ptr = tcg_temp_new_ptr();
>> +        tcg_gen_movi_ptr(ptr, (intptr_t)tb->tb_stats);
>> +        gen_helper_inc_exec_freq(ptr);
>> +    }
>> +}
> 
> This is 3 host instructions, easily expanded inline:
> 
> --- a/accel/tcg/translator.c
> +++ b/accel/tcg/translator.c
> @@ -11,6 +11,7 @@
>  #include "qemu/error-report.h"
>  #include "tcg/tcg.h"
>  #include "tcg/tcg-op.h"
> +#include "tcg/tcg-temp-internal.h"
>  #include "exec/exec-all.h"
>  #include "exec/gen-icount.h"
>  #include "exec/log.h"
> @@ -18,6 +19,30 @@
>  #include "exec/plugin-gen.h"
>  #include "exec/replay-core.h"
> 
> +
> +static void gen_tb_exec_count(TranslationBlock *tb)
> +{
> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
> +        TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
> +
> +        tcg_gen_movi_ptr(ptr, (intptr_t)&tb->tb_stats->executions.normal);
> +        if (sizeof(tb->tb_stats->executions.normal) == 4) {
> +            TCGv_i32 t = tcg_temp_ebb_new_i32();
> +            tcg_gen_ld_i32(t, ptr, 0);
> +            tcg_gen_addi_i32(t, t, 1);
> +            tcg_gen_st_i32(t, ptr, 0);
> +            tcg_temp_free_i32(t);
> +        } else {
> +            TCGv_i64 t = tcg_temp_ebb_new_i64();
> +            tcg_gen_ld_i64(t, ptr, 0);
> +            tcg_gen_addi_i64(t, t, 1);
> +            tcg_gen_st_i64(t, ptr, 0);
> +            tcg_temp_free_i64(t);
> +        }
> +        tcg_temp_free_ptr(ptr);
> +    }
> +}
> +
>  bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
>  {
>      /* Suppress goto_tb if requested. */
> 
> 
> I'm not expecially keen on embedding the TBStatistics pointer directly
> like this; for most hosts we will have to put this constant into the
> constant pool.  Whereas the pointer already exists at tb->tb_stats, and
> tb is at a constant displacement prior to the code, so we already have
> mechanisms for generating pc-relative addresses.
> 
> However, that's premature optimization.  Let's get it working first.
> 
Here is the coremark results on a 4c qemu-system-riscv64, coremark takes
1cpu in this test.

(tb_stats stop)                         Iterations/Sec   : 5358.012664
(tb_stats start all)
    helper - qatomic_inc(normal)        Iterations/Sec   : 2416.626390
    helper - ++normal                   Iterations/Sec   : 4307.559767
    no helper - inline add              Iterations/Sec   : 5168.930031

Also if coremark runs on 4cpu, tb_stats will cost much more even in the
inline case, it's an extreme case though.

Thanks,
Fei.

> 
> r~
> 


Re: [PATCH v12 02/15] accel: collecting TB execution count
Posted by Wu, Fei 2 years, 8 months ago
On 5/23/2023 8:45 AM, Richard Henderson wrote:
> On 5/18/23 06:57, Fei Wu wrote:
>> +void HELPER(inc_exec_freq)(void *ptr)
>> +{
>> +    TBStatistics *stats = (TBStatistics *) ptr;
>> +    tcg_debug_assert(stats);
>> +    ++stats->executions.normal;
>> +}
> ...
>> +static inline void gen_tb_exec_count(TranslationBlock *tb)
>> +{
>> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>> +        TCGv_ptr ptr = tcg_temp_new_ptr();
>> +        tcg_gen_movi_ptr(ptr, (intptr_t)tb->tb_stats);
>> +        gen_helper_inc_exec_freq(ptr);
>> +    }
>> +}
> 
> This is 3 host instructions, easily expanded inline:
> 
> --- a/accel/tcg/translator.c
> +++ b/accel/tcg/translator.c
> @@ -11,6 +11,7 @@
>  #include "qemu/error-report.h"
>  #include "tcg/tcg.h"
>  #include "tcg/tcg-op.h"
> +#include "tcg/tcg-temp-internal.h"
>  #include "exec/exec-all.h"
>  #include "exec/gen-icount.h"
>  #include "exec/log.h"
> @@ -18,6 +19,30 @@
>  #include "exec/plugin-gen.h"
>  #include "exec/replay-core.h"
> 
> +
> +static void gen_tb_exec_count(TranslationBlock *tb)
> +{
> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
> +        TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
> +
> +        tcg_gen_movi_ptr(ptr, (intptr_t)&tb->tb_stats->executions.normal);
> +        if (sizeof(tb->tb_stats->executions.normal) == 4) {
> +            TCGv_i32 t = tcg_temp_ebb_new_i32();
> +            tcg_gen_ld_i32(t, ptr, 0);
> +            tcg_gen_addi_i32(t, t, 1);
> +            tcg_gen_st_i32(t, ptr, 0);
> +            tcg_temp_free_i32(t);
> +        } else {
> +            TCGv_i64 t = tcg_temp_ebb_new_i64();
> +            tcg_gen_ld_i64(t, ptr, 0);
> +            tcg_gen_addi_i64(t, t, 1);
> +            tcg_gen_st_i64(t, ptr, 0);
> +            tcg_temp_free_i64(t);
> +        }
> +        tcg_temp_free_ptr(ptr);
> +    }
> +}
> +

Thank you for the method, I will try it and measure the gain, it is
indeed the hot path and usually takes a lot of time.

Thanks,
Fei.

>  bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
>  {
>      /* Suppress goto_tb if requested. */
> 
> 
> I'm not expecially keen on embedding the TBStatistics pointer directly
> like this; for most hosts we will have to put this constant into the
> constant pool.  Whereas the pointer already exists at tb->tb_stats, and
> tb is at a constant displacement prior to the code, so we already have
> mechanisms for generating pc-relative addresses.
> 
> However, that's premature optimization.  Let's get it working first.
> 
> 
> r~
>