[PATCH] plugins: optimize cpu_index code generation

Pierrick Bouvier posted 1 patch 1 week, 3 days ago
There is a newer version of this series
accel/tcg/plugin-gen.c |  7 +++++++
plugins/core.c         | 13 +++++++++++++
2 files changed, 20 insertions(+)
[PATCH] plugins: optimize cpu_index code generation
Posted by Pierrick Bouvier 1 week, 3 days ago
When running with a single vcpu, we can return a constant instead of a
load when accessing cpu_index.
A side effect is that all tcg operations using it are optimized, most
notably scoreboard access.
When running a simple loop in user-mode, the speedup is around 20%.

Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
---
 accel/tcg/plugin-gen.c |  7 +++++++
 plugins/core.c         | 13 +++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index 0f47bfbb489..2eabeecbdcf 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -102,6 +102,13 @@ static void gen_disable_mem_helper(void)
 
 static TCGv_i32 gen_cpu_index(void)
 {
+    /*
+     * Optimize when we run with a single vcpu. All values using cpu_index,
+     * including scoreboard index, will be optimized out.
+     */
+    if (qemu_plugin_num_vcpus() == 1) {
+        return tcg_constant_i32(0);
+    }
     TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
     tcg_gen_ld_i32(cpu_index, tcg_env,
                    -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
diff --git a/plugins/core.c b/plugins/core.c
index bb105e8e688..8e32ca5ee08 100644
--- a/plugins/core.c
+++ b/plugins/core.c
@@ -266,6 +266,19 @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused)
 
     assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
     qemu_rec_mutex_lock(&plugin.lock);
+
+    /*
+     * We want to flush tb when a second cpu appear.
+     * When generating plugin code, we optimize cpu_index for num_vcpus == 1.
+     */
+    if (plugin.num_vcpus == 1) {
+        qemu_rec_mutex_unlock(&plugin.lock);
+        start_exclusive();
+        qemu_rec_mutex_lock(&plugin.lock);
+        tb_flush(cpu);
+        end_exclusive();
+    }
+
     plugin.num_vcpus = MAX(plugin.num_vcpus, cpu->cpu_index + 1);
     plugin_cpu_update__locked(&cpu->cpu_index, NULL, NULL);
     success = g_hash_table_insert(plugin.cpu_ht, &cpu->cpu_index,
-- 
2.39.5
Re: [PATCH] plugins: optimize cpu_index code generation
Posted by Richard Henderson 1 week, 2 days ago
On 11/26/24 13:02, Pierrick Bouvier wrote:
> @@ -266,6 +266,19 @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused)
>   
>       assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
>       qemu_rec_mutex_lock(&plugin.lock);
> +
> +    /*
> +     * We want to flush tb when a second cpu appear.
> +     * When generating plugin code, we optimize cpu_index for num_vcpus == 1.
> +     */
> +    if (plugin.num_vcpus == 1) {
> +        qemu_rec_mutex_unlock(&plugin.lock);
> +        start_exclusive();
> +        qemu_rec_mutex_lock(&plugin.lock);
> +        tb_flush(cpu);
> +        end_exclusive();
> +    }

We already did this when creating the new thread.
Though we're using slightly different tests:

         /*
          * If this is our first additional thread, we need to ensure we
          * generate code for parallel execution and flush old translations.
          * Do this now so that the copy gets CF_PARALLEL too.
          */
         if (!tcg_cflags_has(cpu, CF_PARALLEL)) {
             tcg_cflags_set(cpu, CF_PARALLEL);
             tb_flush(cpu);
         }


r~
Re: [PATCH] plugins: optimize cpu_index code generation
Posted by Pierrick Bouvier 1 week, 1 day ago
On 11/27/24 09:53, Richard Henderson wrote:
> On 11/26/24 13:02, Pierrick Bouvier wrote:
>> @@ -266,6 +266,19 @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused)
>>    
>>        assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
>>        qemu_rec_mutex_lock(&plugin.lock);
>> +
>> +    /*
>> +     * We want to flush tb when a second cpu appear.
>> +     * When generating plugin code, we optimize cpu_index for num_vcpus == 1.
>> +     */
>> +    if (plugin.num_vcpus == 1) {
>> +        qemu_rec_mutex_unlock(&plugin.lock);
>> +        start_exclusive();
>> +        qemu_rec_mutex_lock(&plugin.lock);
>> +        tb_flush(cpu);
>> +        end_exclusive();
>> +    }
> 
> We already did this when creating the new thread.
> Though we're using slightly different tests:
> 
>           /*
>            * If this is our first additional thread, we need to ensure we
>            * generate code for parallel execution and flush old translations.
>            * Do this now so that the copy gets CF_PARALLEL too.
>            */
>           if (!tcg_cflags_has(cpu, CF_PARALLEL)) {
>               tcg_cflags_set(cpu, CF_PARALLEL);
>               tb_flush(cpu);
>           }
> 

After removing the explicit flush, and relying on flush to honor 
CF_PARALLEL flags, I ran into random errors on values expected by 
'inline' plugin, when running a program that spawns multiple threads.

It seems that, when spawning them at once, we may execute old code 
waiting for the flush to happen.

> 
> r~
Re: [PATCH] plugins: optimize cpu_index code generation
Posted by Pierrick Bouvier 1 week, 2 days ago
Hi Richard,

On 11/27/24 09:53, Richard Henderson wrote:
> On 11/26/24 13:02, Pierrick Bouvier wrote:
>> @@ -266,6 +266,19 @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused)
>>    
>>        assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
>>        qemu_rec_mutex_lock(&plugin.lock);
>> +
>> +    /*
>> +     * We want to flush tb when a second cpu appear.
>> +     * When generating plugin code, we optimize cpu_index for num_vcpus == 1.
>> +     */
>> +    if (plugin.num_vcpus == 1) {
>> +        qemu_rec_mutex_unlock(&plugin.lock);
>> +        start_exclusive();
>> +        qemu_rec_mutex_lock(&plugin.lock);
>> +        tb_flush(cpu);
>> +        end_exclusive();
>> +    }
> 
> We already did this when creating the new thread.
> Though we're using slightly different tests:
> 
>           /*
>            * If this is our first additional thread, we need to ensure we
>            * generate code for parallel execution and flush old translations.
>            * Do this now so that the copy gets CF_PARALLEL too.
>            */
>           if (!tcg_cflags_has(cpu, CF_PARALLEL)) {
>               tcg_cflags_set(cpu, CF_PARALLEL);
>               tb_flush(cpu);
>           }
> 
> 
> r~

I noticed that it was redundant (for user-mode at least), but it seemed 
too implicit to rely on this.
As well, I didn't observe such a flush in system-mode, does it work the 
same as user-mode (regarding the CF_PARALLEL flag)?
Re: [PATCH] plugins: optimize cpu_index code generation
Posted by Richard Henderson 1 week, 2 days ago
On 11/27/24 11:57, Pierrick Bouvier wrote:
> I noticed that it was redundant (for user-mode at least), but it seemed too implicit to 
> rely on this.
> As well, I didn't observe such a flush in system-mode, does it work the same as user-mode 
> (regarding the CF_PARALLEL flag)?

Yes, we set CF_PARALLEL for system mode too.  We do it early, because we can tell the 1 vs 
many cpu count from the command-line.  Thus no flush required.


r~
Re: [PATCH] plugins: optimize cpu_index code generation
Posted by Pierrick Bouvier 1 week, 2 days ago
On 11/27/24 10:27, Richard Henderson wrote:
> On 11/27/24 11:57, Pierrick Bouvier wrote:
>> I noticed that it was redundant (for user-mode at least), but it seemed too implicit to
>> rely on this.
>> As well, I didn't observe such a flush in system-mode, does it work the same as user-mode
>> (regarding the CF_PARALLEL flag)?
> 
> Yes, we set CF_PARALLEL for system mode too.  We do it early, because we can tell the 1 vs
> many cpu count from the command-line.  Thus no flush required.
> 
> 
> r~

Yes I saw that now, we directly boot with the given number of vcpus, and 
they are initialized before any code generation is made.

I'll remove the flush part in v2 then.
Re: [PATCH] plugins: optimize cpu_index code generation
Posted by Pierrick Bouvier 1 week, 3 days ago
On 11/26/24 11:02, Pierrick Bouvier wrote:
> When running with a single vcpu, we can return a constant instead of a
> load when accessing cpu_index.
> A side effect is that all tcg operations using it are optimized, most
> notably scoreboard access.
> When running a simple loop in user-mode, the speedup is around 20%.
> 
> Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
> ---
>   accel/tcg/plugin-gen.c |  7 +++++++
>   plugins/core.c         | 13 +++++++++++++
>   2 files changed, 20 insertions(+)
> 
> diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
> index 0f47bfbb489..2eabeecbdcf 100644
> --- a/accel/tcg/plugin-gen.c
> +++ b/accel/tcg/plugin-gen.c
> @@ -102,6 +102,13 @@ static void gen_disable_mem_helper(void)
>   
>   static TCGv_i32 gen_cpu_index(void)
>   {
> +    /*
> +     * Optimize when we run with a single vcpu. All values using cpu_index,
> +     * including scoreboard index, will be optimized out.
> +     */
> +    if (qemu_plugin_num_vcpus() == 1) {
> +        return tcg_constant_i32(0);
> +    }
>       TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
>       tcg_gen_ld_i32(cpu_index, tcg_env,
>                      -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
> diff --git a/plugins/core.c b/plugins/core.c
> index bb105e8e688..8e32ca5ee08 100644
> --- a/plugins/core.c
> +++ b/plugins/core.c
> @@ -266,6 +266,19 @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused)
>   
>       assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
>       qemu_rec_mutex_lock(&plugin.lock);
> +
> +    /*
> +     * We want to flush tb when a second cpu appear.
> +     * When generating plugin code, we optimize cpu_index for num_vcpus == 1.
> +     */
> +    if (plugin.num_vcpus == 1) {
> +        qemu_rec_mutex_unlock(&plugin.lock);
> +        start_exclusive();
> +        qemu_rec_mutex_lock(&plugin.lock);
> +        tb_flush(cpu);
> +        end_exclusive();
> +    }
> +
>       plugin.num_vcpus = MAX(plugin.num_vcpus, cpu->cpu_index + 1);
>       plugin_cpu_update__locked(&cpu->cpu_index, NULL, NULL);
>       success = g_hash_table_insert(plugin.cpu_ht, &cpu->cpu_index,

This patch is for 10.0.