[RFC PATCH 18/42] target/mips/tx79: Introduce PEXTU[BHW] opcodes (Parallel Extend Lower)

Philippe Mathieu-Daudé posted 42 patches 4 years, 9 months ago
There is a newer version of this series
[RFC PATCH 18/42] target/mips/tx79: Introduce PEXTU[BHW] opcodes (Parallel Extend Lower)
Posted by Philippe Mathieu-Daudé 4 years, 9 months ago
Introduce the 'Parallel Extend Lower' opcodes:

 - PEXTLB (Parallel Extend Lower from Byte)
 - PEXTLH (Parallel Extend Lower from Halfword)
 - PEXTLW (Parallel Extend Lower from Word)

Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
---
 target/mips/tx79.decode      |  3 ++
 target/mips/tx79_translate.c | 78 ++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+)

diff --git a/target/mips/tx79.decode b/target/mips/tx79.decode
index ead5f8281e5..98f21d33e3f 100644
--- a/target/mips/tx79.decode
+++ b/target/mips/tx79.decode
@@ -34,6 +34,9 @@ MTLO1           011100 .....  0000000000 00000 010011   @rs
 PSUBW           011100 ..... ..... ..... 00001 001000   @rs_rt_rd
 PSUBH           011100 ..... ..... ..... 00101 001000   @rs_rt_rd
 PSUBB           011100 ..... ..... ..... 01001 001000   @rs_rt_rd
+PEXTLW          011100 ..... ..... ..... 10010 001000   @rs_rt_rd
+PEXTLH          011100 ..... ..... ..... 10110 001000   @rs_rt_rd
+PEXTLB          011100 ..... ..... ..... 11010 001000   @rs_rt_rd
 
 # MMI1
 
diff --git a/target/mips/tx79_translate.c b/target/mips/tx79_translate.c
index 0a2fb28600b..11968d6edab 100644
--- a/target/mips/tx79_translate.c
+++ b/target/mips/tx79_translate.c
@@ -332,6 +332,84 @@ static bool trans_PNOR(DisasContext *ctx, arg_rtype *a)
  * PEXTLW  rd, rs, rt        Parallel Extend Lower from Word
  */
 
+static bool trans_PEXTLx(DisasContext *ctx, arg_rtype *a, unsigned wlen)
+{
+    TCGv_i64 ax, bx;
+
+    if (a->rd == 0) {
+        /* nop: no code is emitted when the destination is register 0 */
+        return true;
+    }
+
+    ax = tcg_temp_new_i64();
+    bx = tcg_temp_new_i64();
+
+    gen_load_gpr(ax, a->rs);
+    gen_load_gpr(bx, a->rt);
+
+    /* Lower half: interleave wlen-bit fields, rt in even slots, rs in odd */
+    for (int i = 0; i < 64 / (2 * wlen); i++) {
+        tcg_gen_deposit_i64(cpu_gpr[a->rd],
+                            cpu_gpr[a->rd], bx, 2 * wlen * i, wlen);
+        tcg_gen_deposit_i64(cpu_gpr[a->rd],
+                            cpu_gpr[a->rd], ax, 2 * wlen * i + wlen, wlen);
+        tcg_gen_shri_i64(bx, bx, wlen);
+        tcg_gen_shri_i64(ax, ax, wlen);
+    }
+    /* Upper half: same, using the source fields not yet consumed above */
+    for (int i = 0; i < 64 / (2 * wlen); i++) {
+        tcg_gen_deposit_i64(cpu_gpr_hi[a->rd],
+                            cpu_gpr_hi[a->rd], bx, 2 * wlen * i, wlen);
+        tcg_gen_deposit_i64(cpu_gpr_hi[a->rd],
+                            cpu_gpr_hi[a->rd], ax, 2 * wlen * i + wlen, wlen);
+        tcg_gen_shri_i64(bx, bx, wlen);
+        tcg_gen_shri_i64(ax, ax, wlen);
+    }
+
+    tcg_temp_free(bx);
+    tcg_temp_free(ax);
+
+    return true;
+}
+
+/* Parallel Extend Lower from Byte: trans_PEXTLx() with 8-bit fields */
+static bool trans_PEXTLB(DisasContext *ctx, arg_rtype *a)
+{
+    return trans_PEXTLx(ctx, a, 8);
+}
+
+/* Parallel Extend Lower from Halfword: trans_PEXTLx() with 16-bit fields */
+static bool trans_PEXTLH(DisasContext *ctx, arg_rtype *a)
+{
+    return trans_PEXTLx(ctx, a, 16);
+}
+
+/* Parallel Extend Lower from Word: rd = rs.w1 : rt.w1 : rs.w0 : rt.w0 */
+static bool trans_PEXTLW(DisasContext *ctx, arg_rtype *a)
+{
+    TCGv_i64 ax, bx;
+
+    if (a->rd == 0) {
+        /* nop: no code is emitted when the destination is register 0 */
+        return true;
+    }
+
+    ax = tcg_temp_new_i64();
+    bx = tcg_temp_new_i64();
+
+    gen_load_gpr(ax, a->rs);
+    gen_load_gpr(bx, a->rt);
+
+    tcg_gen_deposit_i64(cpu_gpr[a->rd], bx, ax, 32, 32); /* rs.w0 : rt.w0 */
+    tcg_gen_shri_i64(bx, bx, 32); /* bx = rt.w1 in its low 32 bits */
+    tcg_gen_deposit_i64(cpu_gpr_hi[a->rd], ax, bx, 0, 32); /* rs.w1 : rt.w1 */
+
+    tcg_temp_free(bx);
+    tcg_temp_free(ax);
+
+    return true;
+}
+
 /* Parallel Extend Upper from Word */
 static bool trans_PEXTUW(DisasContext *ctx, arg_rtype *a)
 {
-- 
2.26.2

Re: [RFC PATCH 18/42] target/mips/tx79: Introduce PEXTU[BHW] opcodes (Parallel Extend Lower)
Posted by Richard Henderson 4 years, 9 months ago
On 2/14/21 9:58 AM, Philippe Mathieu-Daudé wrote:
> Introduce the 'Parallel Extend Lower' opcodes:

$SUBJECT s/PEXTU/PEXTL/.

> +    /* Lower half */
> +    for (int i = 0; i < 64 / (2 * wlen); i++) {
> +        tcg_gen_deposit_i64(cpu_gpr[a->rd],
> +                            cpu_gpr[a->rd], bx, 2 * wlen * i, wlen);
> +        tcg_gen_deposit_i64(cpu_gpr[a->rd],
> +                            cpu_gpr[a->rd], ax, 2 * wlen * i + wlen, wlen);
> +        tcg_gen_shri_i64(bx, bx, wlen);
> +        tcg_gen_shri_i64(ax, ax, wlen);
> +    }
> +    /* Upper half */
> +    for (int i = 0; i < 64 / (2 * wlen); i++) {
> +        tcg_gen_deposit_i64(cpu_gpr_hi[a->rd],
> +                            cpu_gpr_hi[a->rd], bx, 2 * wlen * i, wlen);
> +        tcg_gen_deposit_i64(cpu_gpr_hi[a->rd],
> +                            cpu_gpr_hi[a->rd], ax, 2 * wlen * i + wlen, wlen);
> +        tcg_gen_shri_i64(bx, bx, wlen);
> +        tcg_gen_shri_i64(ax, ax, wlen);
> +    }

Right, so, this expands to (4 * 4 * 2) = 32 operations for pextlb, if deposit
is supported, or ((4*2 + 2) * 4 * 2) = 80 operations if not (4 per deposit).

We can do a bit better, though, exploiting parallelism.

/*
 * Zero-extend each of the four low bytes of @s into its own 16-bit lane
 * of @d.  Writing @s as bytes abcdefgh (most significant first), the
 * result is 0e0f0g0h; the upper four bytes of @s are ignored.
 *
 * 5 or 8 operations, w/ or w/o deposit
 */
void gen_widen_b(TCGv_i64 d, TCGv_i64 s)
{
    TCGv_i64 x = tcg_temp_new_i64();
    TCGv_i64 y = tcg_temp_new_i64();
    /* m0 selects the bytes that still have to move up by one position. */
    TCGv_i64 m0 = tcg_constant_i64(0x0000ff000000ff00ull);
    /* m1 selects the bytes that are already in their final position. */
    TCGv_i64 m1 = tcg_constant_i64(0x000000ff000000ffull);

    /* s = abcdefgh */
    tcg_gen_deposit_i64(x, s, s, 16, 48);
    /* x = cdefghgh */
    tcg_gen_and_i64(y, x, m0);
    /* y = 00e000g0 */
    /*
     * A plain AND with m1 is required here: clearing only the m0 bytes
     * (andc) would leave c, d, g and h behind and corrupt the result.
     */
    tcg_gen_and_i64(x, x, m1);
    /* x = 000f000h */
    tcg_gen_shli_i64(y, y, 8);
    /* y = 0e000g00 */
    tcg_gen_or_i64(d, x, y);
    /* d = 0e0f0g0h */

    tcg_temp_free_i64(x);
    tcg_temp_free_i64(y);
}

/*
 * Interleave the four low bytes of @s and @t into the 64-bit @d:
 * bytes of @t land in the even byte positions, bytes of @s in the odd
 * ones (d = s3 t3 s2 t2 s1 t1 s0 t0, most significant first).
 *
 * 12 or 18 operations w/ or w/o deposit
 */
void gen_pextb(TCGv_i64 d, TCGv_i64 s, TCGv_i64 t)
{
    TCGv_i64 x = tcg_temp_new_i64();

    /* x = 0 s3 0 s2 0 s1 0 s0 */
    gen_widen_b(x, s);
    /* d = 0 t3 0 t2 0 t1 0 t0 -- must widen @t here, not @s again */
    gen_widen_b(d, t);
    /* Move the @s bytes into the odd positions and merge. */
    tcg_gen_shli_i64(x, x, 8);
    tcg_gen_or_i64(d, d, x);

    tcg_temp_free_i64(x);
}

then

    /*
     * s and t are presumably scratch temporaries: they are shifted in
     * place below, so they must not alias guest registers.
     */
    gen_load_gpr(s, a->rs);
    gen_load_gpr(t, a->rt);
    /* Low 64 bits of rd: interleave source bytes 0..3. */
    gen_pextb(cpu_gpr[a->rd], s, t);

    /* High 64 bits of rd: bring source bytes 4..7 down and repeat. */
    tcg_gen_shri_i64(s, s, 32);
    tcg_gen_shri_i64(t, t, 32);
    gen_pextb(cpu_gpr_hi[a->rd], s, t);

gives you the result in 26 or 38 operations.

Similarly

/*
 * Zero-extend the two low halfwords of @s into 32-bit lanes of @d.
 * Writing @s as halfwords abcd (most significant first), d = 0c0d.
 */
void gen_widen_h(TCGv_i64 d, TCGv_i64 s)
{
    TCGv_i64 second = tcg_temp_new_i64();

    /* Isolate halfword c where it currently sits: second = 00c0. */
    tcg_gen_andi_i64(second, s, 0xffff0000u);

    /*
     * Keep halfword d from s in bits 0..15 and place the isolated value
     * 16 bits higher, which moves c to bits 32..47: d = 0c0d.
     */
    tcg_gen_deposit_i64(d, s, second, 16, 48);

    tcg_temp_free_i64(second);
}


r~