[PATCH v3 10/21] target/riscv: support for 128-bit loads and stores

Posted by Frédéric Pétrot 4 years, 3 months ago
The 128-bit ISA adds ldu, lq and sq.  We provide support for these
instructions here.  Note that although we compute a 128-bit address, we
only use the lower 64 bits to actually address memory, cowardly reusing
QEMU's existing address translation mechanism.
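
For illustration, the gist of the approach (a sketch only; the real code
is in trans_rvi.c.inc below): the base register pair and the sign-extended
immediate go through a 128-bit add, and only the low half of the result
reaches the memory subsystem:

    /* rs1 is held as two 64-bit TCG halves, src1l/src1h */
    tcg_gen_add2_tl(addrl, addrh, src1l, src1h, imml, immh); /* 128-bit add */
    tcg_gen_qemu_ld_tl(destl, addrl, ctx->mem_idx, memop);   /* addrh unused */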

Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
---
 target/riscv/insn16.decode              |  32 +++++-
 target/riscv/insn32.decode              |   4 +
 target/riscv/translate.c                |   7 --
 target/riscv/insn_trans/trans_rvi.c.inc | 146 ++++++++++++++++++++++--
 4 files changed, 171 insertions(+), 18 deletions(-)

diff --git a/target/riscv/insn16.decode b/target/riscv/insn16.decode
index 2e9212663c..151fc6e567 100644
--- a/target/riscv/insn16.decode
+++ b/target/riscv/insn16.decode
@@ -39,6 +39,10 @@
 %imm_addi16sp  12:s1 3:2 5:1 2:1 6:1 !function=ex_shift_4
 %imm_lui       12:s1 2:5             !function=ex_shift_12
 
+# Added for 128-bit support
+%uimm_cl_q    5:2 10:3               !function=ex_shift_3
+%uimm_6bit_lq 2:3 12:1 5:2           !function=ex_shift_3
+%uimm_6bit_sq 7:3 10:3               !function=ex_shift_3
 
 # Argument sets imported from insn32.decode:
 &empty                  !extern
@@ -54,16 +58,20 @@
 # Formats 16:
 @cr        ....  ..... .....  .. &r      rs2=%rs2_5       rs1=%rd     %rd
 @ci        ... . ..... .....  .. &i      imm=%imm_ci      rs1=%rd     %rd
+@cl_q      ... . .....  ..... .. &i      imm=%uimm_6bit_lq rs1=2 %rd
 @cl_d      ... ... ... .. ... .. &i      imm=%uimm_cl_d   rs1=%rs1_3  rd=%rs2_3
 @cl_w      ... ... ... .. ... .. &i      imm=%uimm_cl_w   rs1=%rs1_3  rd=%rs2_3
 @cs_2      ... ... ... .. ... .. &r      rs2=%rs2_3       rs1=%rs1_3  rd=%rs1_3
+@cs_q      ... ... ... .. ... .. &s      imm=%uimm_cl_q   rs1=%rs1_3  rs2=%rs2_3
 @cs_d      ... ... ... .. ... .. &s      imm=%uimm_cl_d   rs1=%rs1_3  rs2=%rs2_3
 @cs_w      ... ... ... .. ... .. &s      imm=%uimm_cl_w   rs1=%rs1_3  rs2=%rs2_3
 @cj        ...    ........... .. &j      imm=%imm_cj
 @cb_z      ... ... ... .. ... .. &b      imm=%imm_cb      rs1=%rs1_3  rs2=0
 
+@c_lqsp    ... . .....  ..... .. &i      imm=%uimm_6bit_lq rs1=2 %rd
 @c_ldsp    ... . .....  ..... .. &i      imm=%uimm_6bit_ld rs1=2 %rd
 @c_lwsp    ... . .....  ..... .. &i      imm=%uimm_6bit_lw rs1=2 %rd
+@c_sqsp    ... . .....  ..... .. &s      imm=%uimm_6bit_sq rs1=2 rs2=%rs2_5
 @c_sdsp    ... . .....  ..... .. &s      imm=%uimm_6bit_sd rs1=2 rs2=%rs2_5
 @c_swsp    ... . .....  ..... .. &s      imm=%uimm_6bit_sw rs1=2 rs2=%rs2_5
 @c_li      ... . .....  ..... .. &i      imm=%imm_ci rs1=0 %rd
@@ -87,9 +95,17 @@
   illegal         000  000 000 00 --- 00
   addi            000  ... ... .. ... 00 @c_addi4spn
 }
-fld               001  ... ... .. ... 00 @cl_d
+{
+  fld             001  ... ... .. ... 00 @cl_d
+  # *** RV128C specific Standard Extension (Quadrant 0) ***
+  lq              001  ... ... .. ... 00 @cl_q
+}
 lw                010  ... ... .. ... 00 @cl_w
-fsd               101  ... ... .. ... 00 @cs_d
+{
+  fsd             101  ... ... .. ... 00 @cs_d
+  # *** RV128C specific Standard Extension (Quadrant 0) ***
+  sq              101  ... ... .. ... 00 @cs_q
+}
 sw                110  ... ... .. ... 00 @cs_w
 
 # *** RV32C and RV64C specific Standard Extension (Quadrant 0) ***
@@ -132,7 +148,11 @@ addw              100 1 11 ... 01 ... 01 @cs_2
 
 # *** RV32/64C Standard Extension (Quadrant 2) ***
 slli              000 .  .....  ..... 10 @c_shift2
-fld               001 .  .....  ..... 10 @c_ldsp
+{
+  fld             001 .  .....  ..... 10 @c_ldsp
+  # *** RV128C specific Standard Extension (Quadrant 2) ***
+  lq              001  ... ... .. ... 10 @c_lqsp
+}
 {
   illegal         010 -  00000  ----- 10 # c.lwsp, RES rd=0
   lw              010 .  .....  ..... 10 @c_lwsp
@@ -147,7 +167,11 @@ fld               001 .  .....  ..... 10 @c_ldsp
   jalr            100 1  .....  00000 10 @c_jalr rd=1  # C.JALR
   add             100 1  .....  ..... 10 @cr
 }
-fsd               101   ......  ..... 10 @c_sdsp
+{
+  fsd             101   ......  ..... 10 @c_sdsp
+  # *** RV128C specific Standard Extension (Quadrant 2) ***
+  sq              101  ... ... .. ... 10 @c_sqsp
+}
 sw                110 .  .....  ..... 10 @c_swsp
 
 # *** RV32C and RV64C specific Standard Extension (Quadrant 2) ***
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 2f251dac1b..1e7ddecc22 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -163,6 +163,10 @@ sllw     0000000 .....  ..... 001 ..... 0111011 @r
 srlw     0000000 .....  ..... 101 ..... 0111011 @r
 sraw     0100000 .....  ..... 101 ..... 0111011 @r
 
+# *** RV128I Base Instruction Set (in addition to RV64I) ***
+ldu      ............   ..... 111 ..... 0000011 @i
+lq       ............   ..... 010 ..... 0001111 @i
+sq       ............   ..... 100 ..... 0100011 @s
 # *** RV32M Standard Extension ***
 mul      0000001 .....  ..... 000 ..... 0110011 @r
 mulh     0000001 .....  ..... 001 ..... 0110011 @r
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index b6ddcf7a10..e8f08f921e 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -429,13 +429,6 @@ static bool gen_logic_imm_fn(DisasContext *ctx, arg_i *a, DisasExtend ext,
 
     gen_set_gpr(ctx, a->rd, dest);
 
-    /* devilish temporary code so that the patch compiles */
-    if (get_xl_max(ctx) == MXL_RV128) {
-        (void)get_gprh(ctx, 6);
-        (void)dest_gprh(ctx, 6);
-        gen_set_gprh(ctx, 6, NULL);
-    }
-
     return true;
 }
 
diff --git a/target/riscv/insn_trans/trans_rvi.c.inc b/target/riscv/insn_trans/trans_rvi.c.inc
index 5c2a117a70..92f41f3a86 100644
--- a/target/riscv/insn_trans/trans_rvi.c.inc
+++ b/target/riscv/insn_trans/trans_rvi.c.inc
@@ -134,7 +134,15 @@ static bool trans_bgeu(DisasContext *ctx, arg_bgeu *a)
     return gen_branch(ctx, a, TCG_COND_GEU);
 }
 
-static bool gen_load(DisasContext *ctx, arg_lb *a, MemOp memop)
+static void gen_addi2_i128(TCGv retl, TCGv reth,
+                           TCGv srcl, TCGv srch, target_long imm)
+{
+    TCGv imml  = tcg_constant_tl(imm),
+         immh  = tcg_constant_tl(-(imm < 0));
+    tcg_gen_add2_tl(retl, reth, srcl, srch, imml, immh);
+}
+
+static bool gen_load_tl(DisasContext *ctx, arg_lb *a, MemOp memop)
 {
     TCGv dest = dest_gpr(ctx, a->rd);
     TCGv addr = get_gpr(ctx, a->rs1, EXT_NONE);
@@ -150,6 +158,63 @@ static bool gen_load(DisasContext *ctx, arg_lb *a, MemOp memop)
     return true;
 }
 
+/*
+ * TODO: we should assert that src1h == 0, as we do not change the
+ *       address translation mechanism
+ */
+static bool gen_load_i128(DisasContext *ctx, arg_lb *a, MemOp memop)
+{
+    TCGv src1l = get_gpr(ctx, a->rs1, EXT_NONE);
+    TCGv src1h = get_gprh(ctx, a->rs1);
+    TCGv destl = dest_gpr(ctx, a->rd);
+    TCGv desth = dest_gprh(ctx, a->rd);
+    TCGv addrl = tcg_temp_new();
+    TCGv addrh = tcg_temp_new();
+    TCGv imml = tcg_temp_new();
+    TCGv immh = tcg_constant_tl(-(a->imm < 0));
+
+    /* Build a 128-bit address */
+    if (a->imm != 0) {
+        tcg_gen_movi_tl(imml, a->imm);
+        tcg_gen_add2_tl(addrl, addrh, src1l, src1h, imml, immh);
+    } else {
+        tcg_gen_mov_tl(addrl, src1l);
+        tcg_gen_mov_tl(addrh, src1h);
+    }
+
+    if (memop != (MemOp)MO_TEO) {
+        tcg_gen_qemu_ld_tl(destl, addrl, ctx->mem_idx, memop);
+        if (memop & MO_SIGN) {
+            tcg_gen_sari_tl(desth, destl, 63);
+        } else {
+            tcg_gen_movi_tl(desth, 0);
+        }
+    } else {
+        tcg_gen_qemu_ld_tl(memop & MO_BSWAP ? desth : destl, addrl,
+                           ctx->mem_idx, MO_TEQ);
+        gen_addi2_i128(addrl, addrh, addrl, addrh, 8);
+        tcg_gen_qemu_ld_tl(memop & MO_BSWAP ? destl : desth, addrl,
+                           ctx->mem_idx, MO_TEQ);
+    }
+
+    gen_set_gpr(ctx, a->rd, destl);
+    gen_set_gprh(ctx, a->rd, desth);
+
+    tcg_temp_free(addrl);
+    tcg_temp_free(addrh);
+    tcg_temp_free(imml);
+    return true;
+}
+
+static bool gen_load(DisasContext *ctx, arg_lb *a, MemOp memop)
+{
+    if (get_xl(ctx) == MXL_RV128) {
+        return gen_load_i128(ctx, a, memop);
+    } else {
+        return gen_load_tl(ctx, a, memop);
+    }
+}
+
 static bool trans_lb(DisasContext *ctx, arg_lb *a)
 {
     return gen_load(ctx, a, MO_SB);
@@ -165,6 +230,18 @@ static bool trans_lw(DisasContext *ctx, arg_lw *a)
     return gen_load(ctx, a, MO_TESL);
 }
 
+static bool trans_ld(DisasContext *ctx, arg_ld *a)
+{
+    REQUIRE_64_OR_128BIT(ctx);
+    return gen_load(ctx, a, MO_TESQ);
+}
+
+static bool trans_lq(DisasContext *ctx, arg_lq *a)
+{
+    REQUIRE_128BIT(ctx);
+    return gen_load(ctx, a, MO_TEO);
+}
+
 static bool trans_lbu(DisasContext *ctx, arg_lbu *a)
 {
     return gen_load(ctx, a, MO_UB);
@@ -177,17 +254,17 @@ static bool trans_lhu(DisasContext *ctx, arg_lhu *a)
 
 static bool trans_lwu(DisasContext *ctx, arg_lwu *a)
 {
-    REQUIRE_64BIT(ctx);
+    REQUIRE_64_OR_128BIT(ctx);
     return gen_load(ctx, a, MO_TEUL);
 }
 
-static bool trans_ld(DisasContext *ctx, arg_ld *a)
+static bool trans_ldu(DisasContext *ctx, arg_ldu *a)
 {
-    REQUIRE_64BIT(ctx);
-    return gen_load(ctx, a, MO_TEQ);
+    REQUIRE_128BIT(ctx);
+    return gen_load(ctx, a, MO_TEUQ);
 }
 
-static bool gen_store(DisasContext *ctx, arg_sb *a, MemOp memop)
+static bool gen_store_tl(DisasContext *ctx, arg_sb *a, MemOp memop)
 {
     TCGv addr = get_gpr(ctx, a->rs1, EXT_NONE);
     TCGv data = get_gpr(ctx, a->rs2, EXT_NONE);
@@ -202,6 +279,55 @@ static bool gen_store(DisasContext *ctx, arg_sb *a, MemOp memop)
     return true;
 }
 
+/*
+ * TODO: we should assert that src1h == 0, as we do not change the
+ *       address translation mechanism
+ */
+static bool gen_store_i128(DisasContext *ctx, arg_sb *a, MemOp memop)
+{
+    TCGv src1l = get_gpr(ctx, a->rs1, EXT_NONE);
+    TCGv src1h = get_gprh(ctx, a->rs1);
+    TCGv src2l = get_gpr(ctx, a->rs2, EXT_NONE);
+    TCGv src2h = get_gprh(ctx, a->rs2);
+    TCGv addrl = tcg_temp_new();
+    TCGv addrh = tcg_temp_new();
+    TCGv imml = tcg_temp_new();
+    TCGv immh = tcg_constant_tl(-(a->imm < 0));
+
+    /* Build a 128-bit address */
+    if (a->imm != 0) {
+        tcg_gen_movi_tl(imml, a->imm);
+        tcg_gen_add2_tl(addrl, addrh, src1l, src1h, imml, immh);
+    } else {
+        tcg_gen_mov_tl(addrl, src1l);
+        tcg_gen_mov_tl(addrh, src1h);
+    }
+
+    if (memop != (MemOp)MO_TEO) {
+        tcg_gen_qemu_st_tl(src2l, addrl, ctx->mem_idx, memop);
+    } else {
+        tcg_gen_qemu_st_tl(memop & MO_BSWAP ? src2h : src2l, addrl,
+            ctx->mem_idx, MO_TEQ);
+        gen_addi2_i128(addrl, addrh, addrl, addrh, 8);
+        tcg_gen_qemu_st_tl(memop & MO_BSWAP ? src2l : src2h, addrl,
+            ctx->mem_idx, MO_TEQ);
+    }
+
+    tcg_temp_free(addrl);
+    tcg_temp_free(addrh);
+    tcg_temp_free(imml);
+    return true;
+}
+
+static bool gen_store(DisasContext *ctx, arg_sb *a, MemOp memop)
+{
+    if (get_xl(ctx) == MXL_RV128) {
+        return gen_store_i128(ctx, a, memop);
+    } else {
+        return gen_store_tl(ctx, a, memop);
+    }
+}
+
 static bool trans_sb(DisasContext *ctx, arg_sb *a)
 {
     return gen_store(ctx, a, MO_SB);
@@ -219,10 +345,16 @@ static bool trans_sw(DisasContext *ctx, arg_sw *a)
 
 static bool trans_sd(DisasContext *ctx, arg_sd *a)
 {
-    REQUIRE_64BIT(ctx);
+    REQUIRE_64_OR_128BIT(ctx);
     return gen_store(ctx, a, MO_TEQ);
 }
 
+static bool trans_sq(DisasContext *ctx, arg_sq *a)
+{
+    REQUIRE_128BIT(ctx);
+    return gen_store(ctx, a, MO_TEO);
+}
+
 static bool trans_addi(DisasContext *ctx, arg_addi *a)
 {
     return gen_arith_imm_fn(ctx, a, EXT_NONE, tcg_gen_addi_tl);
-- 
2.33.0


Re: [PATCH v3 10/21] target/riscv: support for 128-bit loads and stores
Posted by Richard Henderson 4 years, 3 months ago
On 10/19/21 2:48 AM, Frédéric Pétrot wrote:
> +# Added for 128-bit support
> +%uimm_cl_q    5:2 10:3               !function=ex_shift_3
> +%uimm_6bit_lq 2:3 12:1 5:2           !function=ex_shift_3
> +%uimm_6bit_sq 7:3 10:3               !function=ex_shift_3
>   

These are incorrect.  LQ and LQSP are scaled by shift 4, not 3.  And the immediate bits 
are swizzled differently than in LD and LW.
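
For reference, a sketch of what corrected extractors might look like,
derived from the RVC spec encodings (C.LQ: uimm[5:4|8] in inst[12:10],
uimm[7:6] in inst[6:5]; C.LQSP: uimm[5] in inst[12], uimm[4|9:6] in
inst[6:2]; C.SQSP: uimm[5:4|9:6] in inst[12:7]):

    # sketch: quadword immediates are scaled by 16, hence ex_shift_4
    %uimm_cl_q    10:1 5:2 11:2          !function=ex_shift_4
    %uimm_6bit_lq  2:4 12:1 6:1          !function=ex_shift_4
    %uimm_6bit_sq  7:4 11:2              !function=ex_shift_4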


> -fld               001  ... ... .. ... 00 @cl_d
> +{
> +  fld             001  ... ... .. ... 00 @cl_d
> +  # *** RV128C specific Standard Extension (Quadrant 0) ***
> +  lq              001  ... ... .. ... 00 @cl_q
> +}

You need to move lq first, so that it overrides fld when RV128 is enabled.  Otherwise you 
have to invent some c_fld_not_rv32 pattern with the proper XLEN predicate inside.
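
For the first group, a sketch of that reordering (in a decodetree overlap
group, members are tried in order, and decode falls through when a
translator returns false, as trans_lq does when RV128 is not enabled):

    {
      # lq first, so it takes precedence when RV128C accepts it;
      # otherwise its translator rejects and decode falls through to fld
      lq              001  ... ... .. ... 00 @cl_q
      fld             001  ... ... .. ... 00 @cl_d
    }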

Likewise for all of the other groups.

> +/*
> + * TODO: we should assert that src1h == 0, as we do not change the
> + *       address translation mechanism
> + */
> +static bool gen_load_i128(DisasContext *ctx, arg_lb *a, MemOp memop)
> +{
> +    TCGv src1l = get_gpr(ctx, a->rs1, EXT_NONE);
> +    TCGv src1h = get_gprh(ctx, a->rs1);
> +    TCGv destl = dest_gpr(ctx, a->rd);
> +    TCGv desth = dest_gprh(ctx, a->rd);
> +    TCGv addrl = tcg_temp_new();
> +    TCGv addrh = tcg_temp_new();
> +    TCGv imml = tcg_temp_new();
> +    TCGv immh = tcg_constant_tl(-(a->imm < 0));
> +
> +    /* Build a 128-bit address */
> +    if (a->imm != 0) {
> +        tcg_gen_movi_tl(imml, a->imm);
> +        tcg_gen_add2_tl(addrl, addrh, src1l, src1h, imml, immh);
> +    } else {
> +        tcg_gen_mov_tl(addrl, src1l);
> +        tcg_gen_mov_tl(addrh, src1h);
> +    }

Hmm.. I thought I remembered some clause by which the top N bits of the address could be 
ignored, but I can't find it now.

In any case, even if it should be done eventually, I don't think it's worthwhile to 
compute addrh at all right now.
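
Something along these lines would do for now (a sketch that keeps only the
low half; addrh and the imml/immh pair go away entirely):

    TCGv addr = tcg_temp_new();
    /* 64-bit address arithmetic only; the high half of rs1 is ignored */
    tcg_gen_addi_tl(addr, src1l, a->imm);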

> +    if (memop != (MemOp)MO_TEO) {

Why the cast?  MO_TEO is a MemOp enumerator.
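
That is, the comparison can simply read:

    if (memop != MO_TEO) {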

> +        tcg_gen_qemu_ld_tl(memop & MO_BSWAP ? desth : destl, addrl,
> +                           ctx->mem_idx, MO_TEQ);
> +        gen_addi2_i128(addrl, addrh, addrl, addrh, 8);
> +        tcg_gen_qemu_ld_tl(memop & MO_BSWAP ? destl : desth, addrl,
> +                           ctx->mem_idx, MO_TEQ);

In addition... we need an atomic load here for aligned 128-bit addresses (unaligned 
addresses are allowed to be non-atomic).

We don't currently have such an operation in TCG, though we need one (the Power8 LQ 
instruction is also only atomic when aligned).

We should either add this right away (shouldn't be too hard), or change the default to 
thread=single for -cpu rv128.  We should disable thread=multi if !HAVE_ATOMIC128, because 
we will be constantly trapping with EXCP_ATOMIC.
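
A sketch of the latter option, rejecting MTTCG at realize time (this
assumes the existing qemu_tcg_mttcg_enabled() and error_setg() helpers;
the placement and the mxl variable are placeholders):

    /* sketch: 128-bit accesses lack aligned atomicity under MTTCG */
    if (mxl == MXL_RV128 && qemu_tcg_mttcg_enabled()) {
        error_setg(errp, "128-bit RISC-V does not yet work with MTTCG; "
                   "please use -accel tcg,thread=single");
        return;
    }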

Similarly for store, of course.


r~