target/mips/translate: Simplify PCPYH using deposit/extract

[PATCH] target/mips/translate: Simplify PCPYH using deposit/extract

Posted by Philippe Mathieu-Daudé 5 years, 2 months ago

Simplify (and optimize) the Parallel Copy Halfword
instruction using deposit() / extract() helpers.

Ref: C790-Specific Instruction Set, Appendix B-63.

Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
---
 target/mips/translate.c | 35 ++++++++++-------------------------
 1 file changed, 10 insertions(+), 25 deletions(-)

diff --git a/target/mips/translate.c b/target/mips/translate.c
index c64a1bc42e1..17a28557c2c 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -25220,34 +25220,19 @@ static void gen_mmi_pcpyh(DisasContext *ctx)
         tcg_gen_movi_i64(cpu_mmr[rd], 0);
     } else {
         TCGv_i64 t0 = tcg_temp_new();
-        TCGv_i64 t1 = tcg_temp_new();
-        uint64_t mask = (1ULL << 16) - 1;
 
-        tcg_gen_andi_i64(t0, cpu_gpr[rt], mask);
-        tcg_gen_movi_i64(t1, 0);
-        tcg_gen_or_i64(t1, t0, t1);
-        tcg_gen_shli_i64(t0, t0, 16);
-        tcg_gen_or_i64(t1, t0, t1);
-        tcg_gen_shli_i64(t0, t0, 16);
-        tcg_gen_or_i64(t1, t0, t1);
-        tcg_gen_shli_i64(t0, t0, 16);
-        tcg_gen_or_i64(t1, t0, t1);
+        tcg_gen_extract_i64(t0, cpu_gpr[a->rt], 0, 16);
+        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 0, 16);
+        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 16, 16);
+        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 32, 16);
+        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 48, 16);
 
-        tcg_gen_mov_i64(cpu_gpr[rd], t1);
+        tcg_gen_extract_i64(t0, cpu_gpr_hi[a->rt], 0, 16);
+        tcg_gen_deposit_i64(cpu_gpr_hi[a->rd], cpu_gpr_hi[a->rd], t0, 0, 16);
+        tcg_gen_deposit_i64(cpu_gpr_hi[a->rd], cpu_gpr_hi[a->rd], t0, 16, 16);
+        tcg_gen_deposit_i64(cpu_gpr_hi[a->rd], cpu_gpr_hi[a->rd], t0, 32, 16);
+        tcg_gen_deposit_i64(cpu_gpr_hi[a->rd], cpu_gpr_hi[a->rd], t0, 48, 16);
 
-        tcg_gen_andi_i64(t0, cpu_mmr[rt], mask);
-        tcg_gen_movi_i64(t1, 0);
-        tcg_gen_or_i64(t1, t0, t1);
-        tcg_gen_shli_i64(t0, t0, 16);
-        tcg_gen_or_i64(t1, t0, t1);
-        tcg_gen_shli_i64(t0, t0, 16);
-        tcg_gen_or_i64(t1, t0, t1);
-        tcg_gen_shli_i64(t0, t0, 16);
-        tcg_gen_or_i64(t1, t0, t1);
-
-        tcg_gen_mov_i64(cpu_mmr[rd], t1);
-
         tcg_temp_free(t0);
-        tcg_temp_free(t1);
     }
 }
-- 
2.26.2

Re: [PATCH] target/mips/translate: Simplify PCPYH using deposit/extract

Posted by Richard Henderson 5 years, 2 months ago

On 11/24/20 2:38 AM, Philippe Mathieu-Daudé wrote:
> Simplify (and optimize) the Parallel Copy Halfword
> instruction using deposit() / extract() helpers.
> 
> Ref: C790-Specific Instruction Set, Appendix B-63.
> 
> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
> ---
>  target/mips/translate.c | 35 ++++++++++-------------------------
>  1 file changed, 10 insertions(+), 25 deletions(-)
> 
> diff --git a/target/mips/translate.c b/target/mips/translate.c
> index c64a1bc42e1..17a28557c2c 100644
> --- a/target/mips/translate.c
> +++ b/target/mips/translate.c
> @@ -25220,34 +25220,19 @@ static void gen_mmi_pcpyh(DisasContext *ctx)
>          tcg_gen_movi_i64(cpu_mmr[rd], 0);
>      } else {
>          TCGv_i64 t0 = tcg_temp_new();
> -        TCGv_i64 t1 = tcg_temp_new();
> -        uint64_t mask = (1ULL << 16) - 1;
>  
> -        tcg_gen_andi_i64(t0, cpu_gpr[rt], mask);
> -        tcg_gen_movi_i64(t1, 0);
> -        tcg_gen_or_i64(t1, t0, t1);
> -        tcg_gen_shli_i64(t0, t0, 16);
> -        tcg_gen_or_i64(t1, t0, t1);
> -        tcg_gen_shli_i64(t0, t0, 16);
> -        tcg_gen_or_i64(t1, t0, t1);
> -        tcg_gen_shli_i64(t0, t0, 16);
> -        tcg_gen_or_i64(t1, t0, t1);
> +        tcg_gen_extract_i64(t0, cpu_gpr[a->rt], 0, 16);
> +        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 0, 16);
> +        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 16, 16);
> +        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 32, 16);
> +        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 48, 16);

Actually, this would be better as

   tcg_gen_ext16u_i64(t0, cpu_gpr[rt]);
   tcg_gen_muli_i64(cpu_gpr[a->rd], t0, dup_const(1, MO_16));


r~

Re: [PATCH] target/mips/translate: Simplify PCPYH using deposit/extract

Posted by Richard Henderson 5 years, 2 months ago

On 11/24/20 9:28 AM, Richard Henderson wrote:
>> +        tcg_gen_extract_i64(t0, cpu_gpr[a->rt], 0, 16);
>> +        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 0, 16);
>> +        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 16, 16);
>> +        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 32, 16);
>> +        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t0, 48, 16);
> 
> Actually, this would be better as
> 
>    tcg_gen_ext16u_i64(t0, cpu_gpr[rt]);
>    tcg_gen_muli_i64(cpu_gpr[a->rd], t0, dup_const(1, MO_16));

Hmm, while that's fine for 64-bit hosts (and ideal for x86_64), it's not ideal
for the 32-bit hosts we have left.

This can also be done with

  // replicate the low 16 bits into bits 16..31; bits 32..63 hold garbage for now.
  tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rt],
                      cpu_gpr[a->rt], 16, 48);
  // replicate lower 32 bits
  tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd],
                      cpu_gpr[a->rd], 32, 32);


r~