tcg/arm/tcg-target.inc.c | 32 +++++++++++++-------------------
1 file changed, 13 insertions(+), 19 deletions(-)
It is, after all, just two instructions.
Profiling on a cortex-a15, using -d nochain to increase the number
of exit_tb that are executed, shows a minor improvement of 0.5%.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/arm/tcg-target.inc.c | 32 +++++++++++++-------------------
1 file changed, 13 insertions(+), 19 deletions(-)
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index 94d80d79d1..2a9ebfe25a 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -1745,24 +1745,18 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
#endif
}
-static tcg_insn_unit *tb_ret_addr;
+static void tcg_out_epilogue(TCGContext *s);
-static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
- const TCGArg *args, const int *const_args)
+static void tcg_out_op(TCGContext *s, TCGOpcode opc,
+ const TCGArg *args, const int *const_args)
{
TCGArg a0, a1, a2, a3, a4, a5;
int c;
switch (opc) {
case INDEX_op_exit_tb:
- /* Reuse the zeroing that exists for goto_ptr. */
- a0 = args[0];
- if (a0 == 0) {
- tcg_out_goto(s, COND_AL, s->code_gen_epilogue);
- } else {
- tcg_out_movi32(s, COND_AL, TCG_REG_R0, args[0]);
- tcg_out_goto(s, COND_AL, tb_ret_addr);
- }
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]);
+ tcg_out_epilogue(s);
break;
case INDEX_op_goto_tb:
{
@@ -2284,19 +2278,17 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
+ TCG_TARGET_STACK_ALIGN - 1) \
& -TCG_TARGET_STACK_ALIGN)
+#define STACK_ADDEND (FRAME_SIZE - PUSH_SIZE)
+
static void tcg_target_qemu_prologue(TCGContext *s)
{
- int stack_addend;
-
/* Calling convention requires us to save r4-r11 and lr. */
/* stmdb sp!, { r4 - r11, lr } */
tcg_out32(s, (COND_AL << 28) | 0x092d4ff0);
/* Reserve callee argument and tcg temp space. */
- stack_addend = FRAME_SIZE - PUSH_SIZE;
-
tcg_out_dat_rI(s, COND_AL, ARITH_SUB, TCG_REG_CALL_STACK,
- TCG_REG_CALL_STACK, stack_addend, 1);
+ TCG_REG_CALL_STACK, STACK_ADDEND, 1);
tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
CPU_TEMP_BUF_NLONGS * sizeof(long));
@@ -2310,11 +2302,13 @@ static void tcg_target_qemu_prologue(TCGContext *s)
*/
s->code_gen_epilogue = s->code_ptr;
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 0);
+ tcg_out_epilogue(s);
+}
- /* TB epilogue */
- tb_ret_addr = s->code_ptr;
+static void tcg_out_epilogue(TCGContext *s)
+{
tcg_out_dat_rI(s, COND_AL, ARITH_ADD, TCG_REG_CALL_STACK,
- TCG_REG_CALL_STACK, stack_addend, 1);
+ TCG_REG_CALL_STACK, STACK_ADDEND, 1);
/* ldmia sp!, { r4 - r11, pc } */
tcg_out32(s, (COND_AL << 28) | 0x08bd8ff0);
--
2.17.1
Hi Richard, On 10/15/19 3:29 AM, Richard Henderson wrote: > It is, after all, just two instructions. > > Profiling on a cortex-a15, using -d nochain to increase the number > of exit_tb that are executed, shows a minor improvement of 0.5%. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > tcg/arm/tcg-target.inc.c | 32 +++++++++++++------------------- > 1 file changed, 13 insertions(+), 19 deletions(-) > > diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c > index 94d80d79d1..2a9ebfe25a 100644 > --- a/tcg/arm/tcg-target.inc.c > +++ b/tcg/arm/tcg-target.inc.c > @@ -1745,24 +1745,18 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64) > #endif > } > > -static tcg_insn_unit *tb_ret_addr; > +static void tcg_out_epilogue(TCGContext *s); > > -static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, > - const TCGArg *args, const int *const_args) > +static void tcg_out_op(TCGContext *s, TCGOpcode opc, > + const TCGArg *args, const int *const_args) > { > TCGArg a0, a1, a2, a3, a4, a5; > int c; > > switch (opc) { > case INDEX_op_exit_tb: > - /* Reuse the zeroing that exists for goto_ptr. */ > - a0 = args[0]; > - if (a0 == 0) { > - tcg_out_goto(s, COND_AL, s->code_gen_epilogue); > - } else { > - tcg_out_movi32(s, COND_AL, TCG_REG_R0, args[0]); > - tcg_out_goto(s, COND_AL, tb_ret_addr); > - } > + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]); > + tcg_out_epilogue(s); > break; > case INDEX_op_goto_tb: > { > @@ -2284,19 +2278,17 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count) > + TCG_TARGET_STACK_ALIGN - 1) \ > & -TCG_TARGET_STACK_ALIGN) > > +#define STACK_ADDEND (FRAME_SIZE - PUSH_SIZE) > + > static void tcg_target_qemu_prologue(TCGContext *s) > { > - int stack_addend; > - > /* Calling convention requires us to save r4-r11 and lr. */ > /* stmdb sp!, { r4 - r11, lr } */ > tcg_out32(s, (COND_AL << 28) | 0x092d4ff0); > > /* Reserve callee argument and tcg temp space. 
*/ > - stack_addend = FRAME_SIZE - PUSH_SIZE; > - > tcg_out_dat_rI(s, COND_AL, ARITH_SUB, TCG_REG_CALL_STACK, > - TCG_REG_CALL_STACK, stack_addend, 1); > + TCG_REG_CALL_STACK, STACK_ADDEND, 1); > tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, > CPU_TEMP_BUF_NLONGS * sizeof(long)); > > @@ -2310,11 +2302,13 @@ static void tcg_target_qemu_prologue(TCGContext *s) > */ > s->code_gen_epilogue = s->code_ptr; > tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 0); > + tcg_out_epilogue(s); > +} > > - /* TB epilogue */ > - tb_ret_addr = s->code_ptr; > +static void tcg_out_epilogue(TCGContext *s) Do you mind splitting this patch in 2? First use tcg_out_epilogue(), then optimize tcg_out_op(). > +{ > tcg_out_dat_rI(s, COND_AL, ARITH_ADD, TCG_REG_CALL_STACK, > - TCG_REG_CALL_STACK, stack_addend, 1); > + TCG_REG_CALL_STACK, STACK_ADDEND, 1); > > /* ldmia sp!, { r4 - r11, pc } */ > tcg_out32(s, (COND_AL << 28) | 0x08bd8ff0); >
© 2016 - 2024 Red Hat, Inc.