tcg/arm: Expand epilogue inline

[PATCH] tcg/arm: Expand epilogue inline

Posted by Richard Henderson 6 years, 3 months ago

It is, after all, just two instructions.

Profiling on a cortex-a15, using -d nochain to increase the number
of exit_tb that are executed, shows a minor improvement of 0.5%.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.inc.c | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index 94d80d79d1..2a9ebfe25a 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -1745,24 +1745,18 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 #endif
 }
 
-static tcg_insn_unit *tb_ret_addr;
+static void tcg_out_epilogue(TCGContext *s);
 
-static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
-                const TCGArg *args, const int *const_args)
+static void tcg_out_op(TCGContext *s, TCGOpcode opc,
+                       const TCGArg *args, const int *const_args)
 {
     TCGArg a0, a1, a2, a3, a4, a5;
     int c;
 
     switch (opc) {
     case INDEX_op_exit_tb:
-        /* Reuse the zeroing that exists for goto_ptr.  */
-        a0 = args[0];
-        if (a0 == 0) {
-            tcg_out_goto(s, COND_AL, s->code_gen_epilogue);
-        } else {
-            tcg_out_movi32(s, COND_AL, TCG_REG_R0, args[0]);
-            tcg_out_goto(s, COND_AL, tb_ret_addr);
-        }
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]);
+        tcg_out_epilogue(s);
         break;
     case INDEX_op_goto_tb:
         {
@@ -2284,19 +2278,17 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
       + TCG_TARGET_STACK_ALIGN - 1) \
      & -TCG_TARGET_STACK_ALIGN)
 
+#define STACK_ADDEND  (FRAME_SIZE - PUSH_SIZE)
+
 static void tcg_target_qemu_prologue(TCGContext *s)
 {
-    int stack_addend;
-
     /* Calling convention requires us to save r4-r11 and lr.  */
     /* stmdb sp!, { r4 - r11, lr } */
     tcg_out32(s, (COND_AL << 28) | 0x092d4ff0);
 
     /* Reserve callee argument and tcg temp space.  */
-    stack_addend = FRAME_SIZE - PUSH_SIZE;
-
     tcg_out_dat_rI(s, COND_AL, ARITH_SUB, TCG_REG_CALL_STACK,
-                   TCG_REG_CALL_STACK, stack_addend, 1);
+                   TCG_REG_CALL_STACK, STACK_ADDEND, 1);
     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                   CPU_TEMP_BUF_NLONGS * sizeof(long));
 
@@ -2310,11 +2302,13 @@ static void tcg_target_qemu_prologue(TCGContext *s)
      */
     s->code_gen_epilogue = s->code_ptr;
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 0);
+    tcg_out_epilogue(s);
+}
 
-    /* TB epilogue */
-    tb_ret_addr = s->code_ptr;
+static void tcg_out_epilogue(TCGContext *s)
+{
     tcg_out_dat_rI(s, COND_AL, ARITH_ADD, TCG_REG_CALL_STACK,
-                   TCG_REG_CALL_STACK, stack_addend, 1);
+                   TCG_REG_CALL_STACK, STACK_ADDEND, 1);
 
     /* ldmia sp!, { r4 - r11, pc } */
     tcg_out32(s, (COND_AL << 28) | 0x08bd8ff0);
-- 
2.17.1

Re: [PATCH] tcg/arm: Expand epilogue inline

Posted by Philippe Mathieu-Daudé 6 years, 3 months ago

Hi Richard,

On 10/15/19 3:29 AM, Richard Henderson wrote:
> It is, after all, just two instructions.
> 
> Profiling on a cortex-a15, using -d nochain to increase the number
> of exit_tb that are executed, shows a minor improvement of 0.5%.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   tcg/arm/tcg-target.inc.c | 32 +++++++++++++-------------------
>   1 file changed, 13 insertions(+), 19 deletions(-)
> 
> diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
> index 94d80d79d1..2a9ebfe25a 100644
> --- a/tcg/arm/tcg-target.inc.c
> +++ b/tcg/arm/tcg-target.inc.c
> @@ -1745,24 +1745,18 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>   #endif
>   }
>   
> -static tcg_insn_unit *tb_ret_addr;
> +static void tcg_out_epilogue(TCGContext *s);
>   
> -static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
> -                const TCGArg *args, const int *const_args)
> +static void tcg_out_op(TCGContext *s, TCGOpcode opc,
> +                       const TCGArg *args, const int *const_args)
>   {
>       TCGArg a0, a1, a2, a3, a4, a5;
>       int c;
>   
>       switch (opc) {
>       case INDEX_op_exit_tb:
> -        /* Reuse the zeroing that exists for goto_ptr.  */
> -        a0 = args[0];
> -        if (a0 == 0) {
> -            tcg_out_goto(s, COND_AL, s->code_gen_epilogue);
> -        } else {
> -            tcg_out_movi32(s, COND_AL, TCG_REG_R0, args[0]);
> -            tcg_out_goto(s, COND_AL, tb_ret_addr);
> -        }
> +        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]);
> +        tcg_out_epilogue(s);
>           break;
>       case INDEX_op_goto_tb:
>           {
> @@ -2284,19 +2278,17 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
>         + TCG_TARGET_STACK_ALIGN - 1) \
>        & -TCG_TARGET_STACK_ALIGN)
>   
> +#define STACK_ADDEND  (FRAME_SIZE - PUSH_SIZE)
> +
>   static void tcg_target_qemu_prologue(TCGContext *s)
>   {
> -    int stack_addend;
> -
>       /* Calling convention requires us to save r4-r11 and lr.  */
>       /* stmdb sp!, { r4 - r11, lr } */
>       tcg_out32(s, (COND_AL << 28) | 0x092d4ff0);
>   
>       /* Reserve callee argument and tcg temp space.  */
> -    stack_addend = FRAME_SIZE - PUSH_SIZE;
> -
>       tcg_out_dat_rI(s, COND_AL, ARITH_SUB, TCG_REG_CALL_STACK,
> -                   TCG_REG_CALL_STACK, stack_addend, 1);
> +                   TCG_REG_CALL_STACK, STACK_ADDEND, 1);
>       tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
>                     CPU_TEMP_BUF_NLONGS * sizeof(long));
>   
> @@ -2310,11 +2302,13 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>        */
>       s->code_gen_epilogue = s->code_ptr;
>       tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 0);
> +    tcg_out_epilogue(s);
> +}
>   
> -    /* TB epilogue */
> -    tb_ret_addr = s->code_ptr;
> +static void tcg_out_epilogue(TCGContext *s)

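Would you mind splitting this patch in two?
First factor the epilogue out into tcg_out_epilogue() and use it in the
prologue; then, in a second patch, change the exit_tb case in tcg_out_op()
to expand the epilogue inline.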

> +{
>       tcg_out_dat_rI(s, COND_AL, ARITH_ADD, TCG_REG_CALL_STACK,
> -                   TCG_REG_CALL_STACK, stack_addend, 1);
> +                   TCG_REG_CALL_STACK, STACK_ADDEND, 1);
>   
>       /* ldmia sp!, { r4 - r11, pc } */
>       tcg_out32(s, (COND_AL << 28) | 0x08bd8ff0);
>