[RFC PATCH 2/6] accel/tcg: Avoid uncessary call overhead from qemu_plugin_vcpu_mem_cb

Max Chou posted 6 patches 8 months, 2 weeks ago
Maintainers: Richard Henderson <richard.henderson@linaro.org>, Paolo Bonzini <pbonzini@redhat.com>, Riku Voipio <riku.voipio@iki.fi>, Palmer Dabbelt <palmer@dabbelt.com>, Alistair Francis <alistair.francis@wdc.com>, Bin Meng <bin.meng@windriver.com>, Weiwei Li <liwei1518@gmail.com>, Daniel Henrique Barboza <dbarboza@ventanamicro.com>, Liu Zhiwei <zhiwei_liu@linux.alibaba.com>
[RFC PATCH 2/6] accel/tcg: Avoid uncessary call overhead from qemu_plugin_vcpu_mem_cb
Posted by Max Chou 8 months, 2 weeks ago
If there are not any QEMU plugin memory callback functions, checking
before calling the qemu_plugin_vcpu_mem_cb function can reduce the
function call overhead.

Signed-off-by: Max Chou <max.chou@sifive.com>
---
 accel/tcg/ldst_common.c.inc | 40 +++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc
index c82048e377e..bf24986c562 100644
--- a/accel/tcg/ldst_common.c.inc
+++ b/accel/tcg/ldst_common.c.inc
@@ -134,7 +134,9 @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra)
 
     tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_UB);
     ret = do_ld1_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
-    plugin_load_cb(env, addr, oi);
+    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
+        plugin_load_cb(env, addr, oi);
+    }
     return ret;
 }
 
@@ -145,7 +147,9 @@ uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr addr,
 
     tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
     ret = do_ld2_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
-    plugin_load_cb(env, addr, oi);
+    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
+        plugin_load_cb(env, addr, oi);
+    }
     return ret;
 }
 
@@ -156,7 +160,9 @@ uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr addr,
 
     tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
     ret = do_ld4_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
-    plugin_load_cb(env, addr, oi);
+    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
+        plugin_load_cb(env, addr, oi);
+    }
     return ret;
 }
 
@@ -167,7 +173,9 @@ uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr addr,
 
     tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64);
     ret = do_ld8_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
-    plugin_load_cb(env, addr, oi);
+    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
+        plugin_load_cb(env, addr, oi);
+    }
     return ret;
 }
 
@@ -178,7 +186,9 @@ Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr,
 
     tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128);
     ret = do_ld16_mmu(env_cpu(env), addr, oi, ra);
-    plugin_load_cb(env, addr, oi);
+    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
+        plugin_load_cb(env, addr, oi);
+    }
     return ret;
 }
 
@@ -195,7 +205,9 @@ void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val,
                  MemOpIdx oi, uintptr_t retaddr)
 {
     helper_stb_mmu(env, addr, val, oi, retaddr);
-    plugin_store_cb(env, addr, oi);
+    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
+        plugin_store_cb(env, addr, oi);
+    }
 }
 
 void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
@@ -203,7 +215,9 @@ void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
 {
     tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
     do_st2_mmu(env_cpu(env), addr, val, oi, retaddr);
-    plugin_store_cb(env, addr, oi);
+    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
+        plugin_store_cb(env, addr, oi);
+    }
 }
 
 void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
@@ -211,7 +225,9 @@ void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
 {
     tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
     do_st4_mmu(env_cpu(env), addr, val, oi, retaddr);
-    plugin_store_cb(env, addr, oi);
+    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
+        plugin_store_cb(env, addr, oi);
+    }
 }
 
 void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
@@ -219,7 +235,9 @@ void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
 {
     tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64);
     do_st8_mmu(env_cpu(env), addr, val, oi, retaddr);
-    plugin_store_cb(env, addr, oi);
+    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
+        plugin_store_cb(env, addr, oi);
+    }
 }
 
 void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
@@ -227,7 +245,9 @@ void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
 {
     tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128);
     do_st16_mmu(env_cpu(env), addr, val, oi, retaddr);
-    plugin_store_cb(env, addr, oi);
+    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
+        plugin_store_cb(env, addr, oi);
+    }
 }
 
 /*
-- 
2.34.1
Re: [RFC PATCH 2/6] accel/tcg: Avoid uncessary call overhead from qemu_plugin_vcpu_mem_cb
Posted by Daniel Henrique Barboza 8 months, 2 weeks ago

On 2/15/24 16:28, Max Chou wrote:
> If there are not any QEMU plugin memory callback functions, checking
> before calling the qemu_plugin_vcpu_mem_cb function can reduce the
> function call overhead.
> 
> Signed-off-by: Max Chou <max.chou@sifive.com>
> ---

This was in my TODO list for some time. Thanks for looking it up.

Can't we avoid all callbacks, not just qemu_plugin_vcpu_mem_cb, if there's no
plugin loaded? The performance increase when building with --disable-plugins
shouldn't be a thing - if the user isn't using plug-ins it should have a
penalty to it.

Thanks,

Daniel

>   accel/tcg/ldst_common.c.inc | 40 +++++++++++++++++++++++++++----------
>   1 file changed, 30 insertions(+), 10 deletions(-)
> 
> diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc
> index c82048e377e..bf24986c562 100644
> --- a/accel/tcg/ldst_common.c.inc
> +++ b/accel/tcg/ldst_common.c.inc
> @@ -134,7 +134,9 @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra)
>   
>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_UB);
>       ret = do_ld1_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
> -    plugin_load_cb(env, addr, oi);
> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
> +        plugin_load_cb(env, addr, oi);
> +    }
>       return ret;
>   }
>   
> @@ -145,7 +147,9 @@ uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr addr,
>   
>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
>       ret = do_ld2_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
> -    plugin_load_cb(env, addr, oi);
> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
> +        plugin_load_cb(env, addr, oi);
> +    }
>       return ret;
>   }
>   
> @@ -156,7 +160,9 @@ uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr addr,
>   
>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
>       ret = do_ld4_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
> -    plugin_load_cb(env, addr, oi);
> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
> +        plugin_load_cb(env, addr, oi);
> +    }
>       return ret;
>   }
>   
> @@ -167,7 +173,9 @@ uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr addr,
>   
>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64);
>       ret = do_ld8_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
> -    plugin_load_cb(env, addr, oi);
> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
> +        plugin_load_cb(env, addr, oi);
> +    }
>       return ret;
>   }
>   
> @@ -178,7 +186,9 @@ Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr,
>   
>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128);
>       ret = do_ld16_mmu(env_cpu(env), addr, oi, ra);
> -    plugin_load_cb(env, addr, oi);
> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
> +        plugin_load_cb(env, addr, oi);
> +    }
>       return ret;
>   }
>   
> @@ -195,7 +205,9 @@ void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val,
>                    MemOpIdx oi, uintptr_t retaddr)
>   {
>       helper_stb_mmu(env, addr, val, oi, retaddr);
> -    plugin_store_cb(env, addr, oi);
> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
> +        plugin_store_cb(env, addr, oi);
> +    }
>   }
>   
>   void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
> @@ -203,7 +215,9 @@ void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
>   {
>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
>       do_st2_mmu(env_cpu(env), addr, val, oi, retaddr);
> -    plugin_store_cb(env, addr, oi);
> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
> +        plugin_store_cb(env, addr, oi);
> +    }
>   }
>   
>   void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
> @@ -211,7 +225,9 @@ void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
>   {
>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
>       do_st4_mmu(env_cpu(env), addr, val, oi, retaddr);
> -    plugin_store_cb(env, addr, oi);
> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
> +        plugin_store_cb(env, addr, oi);
> +    }
>   }
>   
>   void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
> @@ -219,7 +235,9 @@ void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
>   {
>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64);
>       do_st8_mmu(env_cpu(env), addr, val, oi, retaddr);
> -    plugin_store_cb(env, addr, oi);
> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
> +        plugin_store_cb(env, addr, oi);
> +    }
>   }
>   
>   void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
> @@ -227,7 +245,9 @@ void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
>   {
>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128);
>       do_st16_mmu(env_cpu(env), addr, val, oi, retaddr);
> -    plugin_store_cb(env, addr, oi);
> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
> +        plugin_store_cb(env, addr, oi);
> +    }
>   }
>   
>   /*
Re: [RFC PATCH 2/6] accel/tcg: Avoid uncessary call overhead from qemu_plugin_vcpu_mem_cb
Posted by Max Chou 8 months, 2 weeks ago
Hi Daniel,

I think the the overhead from the plugin callbacks depending on the type 
of plugin callbacks.
Because there is only qemu_plugin_vcpu_mem_cb in the perf report of the 
glibc memcpy benchtest, so I just looked it up for the the memory 
callbacks in this RFC.
I'll try to get more experiment results to check the status of other 
plugin callbacks.

Thanks,
Max

On 2024/2/16 4:21 AM, Daniel Henrique Barboza wrote:
>
>
> On 2/15/24 16:28, Max Chou wrote:
>> If there are not any QEMU plugin memory callback functions, checking
>> before calling the qemu_plugin_vcpu_mem_cb function can reduce the
>> function call overhead.
>>
>> Signed-off-by: Max Chou <max.chou@sifive.com>
>> ---
>
> This was in my TODO list for some time. Thanks for looking it up.
>
> Can't we avoid all callbacks, not just qemu_plugin_vcpu_mem_cb, if 
> there's no
> plugin loaded? The performance increase when building with 
> --disable-plugins
> shouldn't be a thing - if the user isn't using plug-ins it should have a
> penalty to it.
>
> Thanks,
>
> Daniel
>
>>   accel/tcg/ldst_common.c.inc | 40 +++++++++++++++++++++++++++----------
>>   1 file changed, 30 insertions(+), 10 deletions(-)
>>
>> diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc
>> index c82048e377e..bf24986c562 100644
>> --- a/accel/tcg/ldst_common.c.inc
>> +++ b/accel/tcg/ldst_common.c.inc
>> @@ -134,7 +134,9 @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr 
>> addr, MemOpIdx oi, uintptr_t ra)
>>         tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_UB);
>>       ret = do_ld1_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
>> -    plugin_load_cb(env, addr, oi);
>> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
>> +        plugin_load_cb(env, addr, oi);
>> +    }
>>       return ret;
>>   }
>>   @@ -145,7 +147,9 @@ uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr 
>> addr,
>>         tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
>>       ret = do_ld2_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
>> -    plugin_load_cb(env, addr, oi);
>> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
>> +        plugin_load_cb(env, addr, oi);
>> +    }
>>       return ret;
>>   }
>>   @@ -156,7 +160,9 @@ uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr 
>> addr,
>>         tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
>>       ret = do_ld4_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
>> -    plugin_load_cb(env, addr, oi);
>> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
>> +        plugin_load_cb(env, addr, oi);
>> +    }
>>       return ret;
>>   }
>>   @@ -167,7 +173,9 @@ uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr 
>> addr,
>>         tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64);
>>       ret = do_ld8_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
>> -    plugin_load_cb(env, addr, oi);
>> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
>> +        plugin_load_cb(env, addr, oi);
>> +    }
>>       return ret;
>>   }
>>   @@ -178,7 +186,9 @@ Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr 
>> addr,
>>         tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128);
>>       ret = do_ld16_mmu(env_cpu(env), addr, oi, ra);
>> -    plugin_load_cb(env, addr, oi);
>> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
>> +        plugin_load_cb(env, addr, oi);
>> +    }
>>       return ret;
>>   }
>>   @@ -195,7 +205,9 @@ void cpu_stb_mmu(CPUArchState *env, abi_ptr 
>> addr, uint8_t val,
>>                    MemOpIdx oi, uintptr_t retaddr)
>>   {
>>       helper_stb_mmu(env, addr, val, oi, retaddr);
>> -    plugin_store_cb(env, addr, oi);
>> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
>> +        plugin_store_cb(env, addr, oi);
>> +    }
>>   }
>>     void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
>> @@ -203,7 +215,9 @@ void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, 
>> uint16_t val,
>>   {
>>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
>>       do_st2_mmu(env_cpu(env), addr, val, oi, retaddr);
>> -    plugin_store_cb(env, addr, oi);
>> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
>> +        plugin_store_cb(env, addr, oi);
>> +    }
>>   }
>>     void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
>> @@ -211,7 +225,9 @@ void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, 
>> uint32_t val,
>>   {
>>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
>>       do_st4_mmu(env_cpu(env), addr, val, oi, retaddr);
>> -    plugin_store_cb(env, addr, oi);
>> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
>> +        plugin_store_cb(env, addr, oi);
>> +    }
>>   }
>>     void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
>> @@ -219,7 +235,9 @@ void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, 
>> uint64_t val,
>>   {
>>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64);
>>       do_st8_mmu(env_cpu(env), addr, val, oi, retaddr);
>> -    plugin_store_cb(env, addr, oi);
>> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
>> +        plugin_store_cb(env, addr, oi);
>> +    }
>>   }
>>     void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
>> @@ -227,7 +245,9 @@ void cpu_st16_mmu(CPUArchState *env, abi_ptr 
>> addr, Int128 val,
>>   {
>>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128);
>>       do_st16_mmu(env_cpu(env), addr, val, oi, retaddr);
>> -    plugin_store_cb(env, addr, oi);
>> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
>> +        plugin_store_cb(env, addr, oi);
>> +    }
>>   }
>>     /*

Re: [RFC PATCH 2/6] accel/tcg: Avoid uncessary call overhead from qemu_plugin_vcpu_mem_cb
Posted by Richard Henderson 8 months, 2 weeks ago
On 2/15/24 09:28, Max Chou wrote:
> If there are not any QEMU plugin memory callback functions, checking
> before calling the qemu_plugin_vcpu_mem_cb function can reduce the
> function call overhead.
> 
> Signed-off-by: Max Chou <max.chou@sifive.com>
> ---
>   accel/tcg/ldst_common.c.inc | 40 +++++++++++++++++++++++++++----------
>   1 file changed, 30 insertions(+), 10 deletions(-)
> 
> diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc
> index c82048e377e..bf24986c562 100644
> --- a/accel/tcg/ldst_common.c.inc
> +++ b/accel/tcg/ldst_common.c.inc
> @@ -134,7 +134,9 @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra)
>   
>       tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_UB);
>       ret = do_ld1_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
> -    plugin_load_cb(env, addr, oi);
> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
> +        plugin_load_cb(env, addr, oi);
> +    }
>       return ret;
>   }

Rather than repeating N times, modify plugin_load_cb and plugin_store_cb just above to 
avoid the call to cpu_plugin_mem_cbs_enabled().  I expect the compiler is inlining those 
functions already.


r~
Re: [RFC PATCH 2/6] accel/tcg: Avoid uncessary call overhead from qemu_plugin_vcpu_mem_cb
Posted by Max Chou 8 months, 2 weeks ago
Hi Richard,

Thank you for the suggestion. I'll do a v2 with this.

Thanks,
Max

On 2024/2/16 4:03 AM, Richard Henderson wrote:
> On 2/15/24 09:28, Max Chou wrote:
>> If there are not any QEMU plugin memory callback functions, checking
>> before calling the qemu_plugin_vcpu_mem_cb function can reduce the
>> function call overhead.
>>
>> Signed-off-by: Max Chou <max.chou@sifive.com>
>> ---
>>   accel/tcg/ldst_common.c.inc | 40 +++++++++++++++++++++++++++----------
>>   1 file changed, 30 insertions(+), 10 deletions(-)
>>
>> diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc
>> index c82048e377e..bf24986c562 100644
>> --- a/accel/tcg/ldst_common.c.inc
>> +++ b/accel/tcg/ldst_common.c.inc
>> @@ -134,7 +134,9 @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr 
>> addr, MemOpIdx oi, uintptr_t ra)
>>         tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_UB);
>>       ret = do_ld1_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD);
>> -    plugin_load_cb(env, addr, oi);
>> +    if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) {
>> +        plugin_load_cb(env, addr, oi);
>> +    }
>>       return ret;
>>   }
>
> Rather than repeating N times, modify plugin_load_cb and 
> plugin_store_cb just above to avoid the call to 
> cpu_plugin_mem_cbs_enabled().  I expect the compiler is inlining those 
> functions already.
>
>
> r~