:p
atchew
Login
When running with a single vcpu, we can return a constant instead of a load when accessing cpu_index. A side effect is that all tcg operations using it are optimized, most notably scoreboard access. When running a simple loop in user-mode, the speedup is around 20%. Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org> --- accel/tcg/plugin-gen.c | 7 +++++++ plugins/core.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c index XXXXXXX..XXXXXXX 100644 --- a/accel/tcg/plugin-gen.c +++ b/accel/tcg/plugin-gen.c @@ -XXX,XX +XXX,XX @@ static void gen_disable_mem_helper(void) static TCGv_i32 gen_cpu_index(void) { + /* + * Optimize when we run with a single vcpu. All values using cpu_index, + * including scoreboard index, will be optimized out. + */ + if (qemu_plugin_num_vcpus() == 1) { + return tcg_constant_i32(0); + } TCGv_i32 cpu_index = tcg_temp_ebb_new_i32(); tcg_gen_ld_i32(cpu_index, tcg_env, -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index)); diff --git a/plugins/core.c b/plugins/core.c index XXXXXXX..XXXXXXX 100644 --- a/plugins/core.c +++ b/plugins/core.c @@ -XXX,XX +XXX,XX @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused) assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX); qemu_rec_mutex_lock(&plugin.lock); + + /* + * We want to flush tb when a second cpu appears. + * When generating plugin code, we optimize cpu_index for num_vcpus == 1. + */ + if (plugin.num_vcpus == 1) { + qemu_rec_mutex_unlock(&plugin.lock); + start_exclusive(); + qemu_rec_mutex_lock(&plugin.lock); + tb_flush(cpu); + end_exclusive(); + } + plugin.num_vcpus = MAX(plugin.num_vcpus, cpu->cpu_index + 1); plugin_cpu_update__locked(&cpu->cpu_index, NULL, NULL); success = g_hash_table_insert(plugin.cpu_ht, &cpu->cpu_index, -- 2.39.5
When running with a single vcpu, we can return a constant instead of a load when accessing cpu_index. A side effect is that all tcg operations using it are optimized, most notably scoreboard access. When running a simple loop in user-mode, the speedup is around 20%. Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org> --- v2: - no need to do a flush, as user-mode already does it when spawning a second cpu (to honor CF_PARALLEL flags). - change condition detection to use CF_PARALLEL instead --- accel/tcg/plugin-gen.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c index XXXXXXX..XXXXXXX 100644 --- a/accel/tcg/plugin-gen.c +++ b/accel/tcg/plugin-gen.c @@ -XXX,XX +XXX,XX @@ static void gen_disable_mem_helper(void) static TCGv_i32 gen_cpu_index(void) { + /* + * Optimize when we run with a single vcpu. All values using cpu_index, + * including scoreboard index, will be optimized out. + * User-mode calls tb_flush when setting this flag. In system-mode, all + * vcpus are created before generating code. + */ + if (!tcg_cflags_has(current_cpu, CF_PARALLEL)) { + return tcg_constant_i32(current_cpu->cpu_index); + } TCGv_i32 cpu_index = tcg_temp_ebb_new_i32(); tcg_gen_ld_i32(cpu_index, tcg_env, -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index)); -- 2.39.5