From: Philipp Tomsich <philipp.tomsich@vrull.eu>
The 1.0.0 version of Zbb does not contain gorc/gorci. Instead, a
orc.b instruction (equivalent to the orc.b pseudo-instruction built on
gorci from pre-0.93 draft-B) is available, mainly targeting
string-processing workloads.
This commit adds the new orc.b instruction and removed gorc/gorci.
Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
Message-id: 20210911140016.834071-12-philipp.tomsich@vrull.eu
Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
---
target/riscv/helper.h | 2 --
target/riscv/insn32.decode | 6 +---
target/riscv/bitmanip_helper.c | 26 -----------------
target/riscv/insn_trans/trans_rvb.c.inc | 39 +++++++++++--------------
4 files changed, 18 insertions(+), 55 deletions(-)
diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index 8a318a2dbc..a9bda2c8ac 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -61,8 +61,6 @@ DEF_HELPER_FLAGS_1(fclass_d, TCG_CALL_NO_RWG_SE, tl, i64)
/* Bitmanip */
DEF_HELPER_FLAGS_2(grev, TCG_CALL_NO_RWG_SE, tl, tl, tl)
DEF_HELPER_FLAGS_2(grevw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
-DEF_HELPER_FLAGS_2(gorc, TCG_CALL_NO_RWG_SE, tl, tl, tl)
-DEF_HELPER_FLAGS_2(gorcw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
DEF_HELPER_FLAGS_2(clmul, TCG_CALL_NO_RWG_SE, tl, tl, tl)
DEF_HELPER_FLAGS_2(clmulr, TCG_CALL_NO_RWG_SE, tl, tl, tl)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index a509cfee11..59202196dc 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -681,6 +681,7 @@ max 0000101 .......... 110 ..... 0110011 @r
maxu 0000101 .......... 111 ..... 0110011 @r
min 0000101 .......... 100 ..... 0110011 @r
minu 0000101 .......... 101 ..... 0110011 @r
+orc_b 001010 000111 ..... 101 ..... 0010011 @r2
orn 0100000 .......... 110 ..... 0110011 @r
rol 0110000 .......... 001 ..... 0110011 @r
ror 0110000 .......... 101 ..... 0110011 @r
@@ -702,19 +703,14 @@ pack 0000100 .......... 100 ..... 0110011 @r
packu 0100100 .......... 100 ..... 0110011 @r
packh 0000100 .......... 111 ..... 0110011 @r
grev 0110100 .......... 101 ..... 0110011 @r
-gorc 0010100 .......... 101 ..... 0110011 @r
-
grevi 01101. ........... 101 ..... 0010011 @sh
-gorci 00101. ........... 101 ..... 0010011 @sh
# *** RV64B Standard Extension (in addition to RV32B) ***
packw 0000100 .......... 100 ..... 0111011 @r
packuw 0100100 .......... 100 ..... 0111011 @r
grevw 0110100 .......... 101 ..... 0111011 @r
-gorcw 0010100 .......... 101 ..... 0111011 @r
greviw 0110100 .......... 101 ..... 0011011 @sh5
-gorciw 0010100 .......... 101 ..... 0011011 @sh5
# *** RV32 Zbc Standard Extension ***
clmul 0000101 .......... 001 ..... 0110011 @r
diff --git a/target/riscv/bitmanip_helper.c b/target/riscv/bitmanip_helper.c
index 73be5a81c7..bb48388fcd 100644
--- a/target/riscv/bitmanip_helper.c
+++ b/target/riscv/bitmanip_helper.c
@@ -64,32 +64,6 @@ target_ulong HELPER(grevw)(target_ulong rs1, target_ulong rs2)
return do_grev(rs1, rs2, 32);
}
-static target_ulong do_gorc(target_ulong rs1,
- target_ulong rs2,
- int bits)
-{
- target_ulong x = rs1;
- int i, shift;
-
- for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) {
- if (rs2 & shift) {
- x |= do_swap(x, adjacent_masks[i], shift);
- }
- }
-
- return x;
-}
-
-target_ulong HELPER(gorc)(target_ulong rs1, target_ulong rs2)
-{
- return do_gorc(rs1, rs2, TARGET_LONG_BITS);
-}
-
-target_ulong HELPER(gorcw)(target_ulong rs1, target_ulong rs2)
-{
- return do_gorc(rs1, rs2, 32);
-}
-
target_ulong HELPER(clmul)(target_ulong rs1, target_ulong rs2)
{
target_ulong result = 0;
diff --git a/target/riscv/insn_trans/trans_rvb.c.inc b/target/riscv/insn_trans/trans_rvb.c.inc
index bdfb495f24..d32af5915a 100644
--- a/target/riscv/insn_trans/trans_rvb.c.inc
+++ b/target/riscv/insn_trans/trans_rvb.c.inc
@@ -295,16 +295,27 @@ static bool trans_grevi(DisasContext *ctx, arg_grevi *a)
return gen_shift_imm_fn(ctx, a, EXT_NONE, gen_grevi);
}
-static bool trans_gorc(DisasContext *ctx, arg_gorc *a)
+static void gen_orc_b(TCGv ret, TCGv source1)
{
- REQUIRE_EXT(ctx, RVB);
- return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
+ TCGv tmp = tcg_temp_new();
+ TCGv ones = tcg_constant_tl(dup_const_tl(MO_8, 0x01));
+
+ /* Set lsb in each byte if the byte was zero. */
+ tcg_gen_sub_tl(tmp, source1, ones);
+ tcg_gen_andc_tl(tmp, tmp, source1);
+ tcg_gen_shri_tl(tmp, tmp, 7);
+ tcg_gen_andc_tl(tmp, ones, tmp);
+
+ /* Replicate the lsb of each byte across the byte. */
+ tcg_gen_muli_tl(ret, tmp, 0xff);
+
+ tcg_temp_free(tmp);
}
-static bool trans_gorci(DisasContext *ctx, arg_gorci *a)
+static bool trans_orc_b(DisasContext *ctx, arg_orc_b *a)
{
- REQUIRE_EXT(ctx, RVB);
- return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
+ REQUIRE_ZBB(ctx);
+ return gen_unary(ctx, a, EXT_ZERO, gen_orc_b);
}
#define GEN_SHADD(SHAMT) \
@@ -476,22 +487,6 @@ static bool trans_greviw(DisasContext *ctx, arg_greviw *a)
return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_grev);
}
-static bool trans_gorcw(DisasContext *ctx, arg_gorcw *a)
-{
- REQUIRE_64BIT(ctx);
- REQUIRE_EXT(ctx, RVB);
- ctx->w = true;
- return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
-}
-
-static bool trans_gorciw(DisasContext *ctx, arg_gorciw *a)
-{
- REQUIRE_64BIT(ctx);
- REQUIRE_EXT(ctx, RVB);
- ctx->w = true;
- return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
-}
-
#define GEN_SHADD_UW(SHAMT) \
static void gen_sh##SHAMT##add_uw(TCGv ret, TCGv arg1, TCGv arg2) \
{ \
--
2.31.1
On Thu, Oct 7, 2021 at 8:58 AM Alistair Francis
<alistair.francis@opensource.wdc.com> wrote:
>
> From: Philipp Tomsich <philipp.tomsich@vrull.eu>
>
> The 1.0.0 version of Zbb does not contain gorc/gorci. Instead, a
> orc.b instruction (equivalent to the orc.b pseudo-instruction built on
> gorci from pre-0.93 draft-B) is available, mainly targeting
> string-processing workloads.
>
> This commit adds the new orc.b instruction and removed gorc/gorci.
>
> Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
> Message-id: 20210911140016.834071-12-philipp.tomsich@vrull.eu
> Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
> ---
> target/riscv/helper.h | 2 --
> target/riscv/insn32.decode | 6 +---
> target/riscv/bitmanip_helper.c | 26 -----------------
> target/riscv/insn_trans/trans_rvb.c.inc | 39 +++++++++++--------------
> 4 files changed, 18 insertions(+), 55 deletions(-)
>
> diff --git a/target/riscv/helper.h b/target/riscv/helper.h
> index 8a318a2dbc..a9bda2c8ac 100644
> --- a/target/riscv/helper.h
> +++ b/target/riscv/helper.h
> @@ -61,8 +61,6 @@ DEF_HELPER_FLAGS_1(fclass_d, TCG_CALL_NO_RWG_SE, tl, i64)
> /* Bitmanip */
> DEF_HELPER_FLAGS_2(grev, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> DEF_HELPER_FLAGS_2(grevw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> -DEF_HELPER_FLAGS_2(gorc, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> -DEF_HELPER_FLAGS_2(gorcw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> DEF_HELPER_FLAGS_2(clmul, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> DEF_HELPER_FLAGS_2(clmulr, TCG_CALL_NO_RWG_SE, tl, tl, tl)
>
> diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
> index a509cfee11..59202196dc 100644
> --- a/target/riscv/insn32.decode
> +++ b/target/riscv/insn32.decode
> @@ -681,6 +681,7 @@ max 0000101 .......... 110 ..... 0110011 @r
> maxu 0000101 .......... 111 ..... 0110011 @r
> min 0000101 .......... 100 ..... 0110011 @r
> minu 0000101 .......... 101 ..... 0110011 @r
> +orc_b 001010 000111 ..... 101 ..... 0010011 @r2
> orn 0100000 .......... 110 ..... 0110011 @r
> rol 0110000 .......... 001 ..... 0110011 @r
> ror 0110000 .......... 101 ..... 0110011 @r
> @@ -702,19 +703,14 @@ pack 0000100 .......... 100 ..... 0110011 @r
> packu 0100100 .......... 100 ..... 0110011 @r
> packh 0000100 .......... 111 ..... 0110011 @r
> grev 0110100 .......... 101 ..... 0110011 @r
> -gorc 0010100 .......... 101 ..... 0110011 @r
> -
> grevi 01101. ........... 101 ..... 0010011 @sh
> -gorci 00101. ........... 101 ..... 0010011 @sh
>
> # *** RV64B Standard Extension (in addition to RV32B) ***
> packw 0000100 .......... 100 ..... 0111011 @r
> packuw 0100100 .......... 100 ..... 0111011 @r
> grevw 0110100 .......... 101 ..... 0111011 @r
> -gorcw 0010100 .......... 101 ..... 0111011 @r
>
> greviw 0110100 .......... 101 ..... 0011011 @sh5
> -gorciw 0010100 .......... 101 ..... 0011011 @sh5
>
> # *** RV32 Zbc Standard Extension ***
> clmul 0000101 .......... 001 ..... 0110011 @r
> diff --git a/target/riscv/bitmanip_helper.c b/target/riscv/bitmanip_helper.c
> index 73be5a81c7..bb48388fcd 100644
> --- a/target/riscv/bitmanip_helper.c
> +++ b/target/riscv/bitmanip_helper.c
> @@ -64,32 +64,6 @@ target_ulong HELPER(grevw)(target_ulong rs1, target_ulong rs2)
> return do_grev(rs1, rs2, 32);
> }
>
> -static target_ulong do_gorc(target_ulong rs1,
> - target_ulong rs2,
> - int bits)
> -{
> - target_ulong x = rs1;
> - int i, shift;
> -
> - for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) {
> - if (rs2 & shift) {
> - x |= do_swap(x, adjacent_masks[i], shift);
> - }
> - }
> -
> - return x;
> -}
> -
> -target_ulong HELPER(gorc)(target_ulong rs1, target_ulong rs2)
> -{
> - return do_gorc(rs1, rs2, TARGET_LONG_BITS);
> -}
> -
> -target_ulong HELPER(gorcw)(target_ulong rs1, target_ulong rs2)
> -{
> - return do_gorc(rs1, rs2, 32);
> -}
> -
> target_ulong HELPER(clmul)(target_ulong rs1, target_ulong rs2)
> {
> target_ulong result = 0;
> diff --git a/target/riscv/insn_trans/trans_rvb.c.inc b/target/riscv/insn_trans/trans_rvb.c.inc
> index bdfb495f24..d32af5915a 100644
> --- a/target/riscv/insn_trans/trans_rvb.c.inc
> +++ b/target/riscv/insn_trans/trans_rvb.c.inc
> @@ -295,16 +295,27 @@ static bool trans_grevi(DisasContext *ctx, arg_grevi *a)
> return gen_shift_imm_fn(ctx, a, EXT_NONE, gen_grevi);
> }
>
> -static bool trans_gorc(DisasContext *ctx, arg_gorc *a)
> +static void gen_orc_b(TCGv ret, TCGv source1)
> {
> - REQUIRE_EXT(ctx, RVB);
> - return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
> + TCGv tmp = tcg_temp_new();
> + TCGv ones = tcg_constant_tl(dup_const_tl(MO_8, 0x01));
> +
> + /* Set lsb in each byte if the byte was zero. */
> + tcg_gen_sub_tl(tmp, source1, ones);
> + tcg_gen_andc_tl(tmp, tmp, source1);
> + tcg_gen_shri_tl(tmp, tmp, 7);
> + tcg_gen_andc_tl(tmp, ones, tmp);
> +
> + /* Replicate the lsb of each byte across the byte. */
> + tcg_gen_muli_tl(ret, tmp, 0xff);
> +
> + tcg_temp_free(tmp);
> }
It seems there is a bug in the current orc.b implementation,
the following 7 hexadecimal patterns return one wrong byte (0x00
instead of 0xff):
orc.b(0x............01..) = 0x............00.. (instead of 0x............ff..)
orc.b(0x..........01....) = 0x..........00.... (instead of 0x..........ff....)
orc.b(0x........01......) = 0x........00...... (instead of 0x........ff......)
orc.b(0x......01........) = 0x......00........ (instead of 0x......ff........)
orc.b(0x....01..........) = 0x....00.......... (instead of 0x....ff..........)
orc.b(0x..01............) = 0x..00............ (instead of 0x..ff............)
orc.b(0x01..............) = 0x00.............. (instead of 0xff..............)
(see test cases below)
The issue seems to be related to the propagation of the carry.
I had a hard time fixing it. With some help, I have added a prolog
which basically computes:
(source1 | ((source1 << 1) & ~ones)) in order to avoid the carry.
I will send the patch as a follow-up in this thread as 'Patch 1A'.
But it's notably less optimized than the current code, so feel free
to come up with better options.
Actually my initial stab at fixing it was implementing a more
straightforward but less astute 'divide and conquer' method
where bits are or'ed by pairs, then the pairs are or'ed by pair ...
using the following formula:
tmp = source1 | (source1 >> 1)
tmp = tmp | (tmp >> 2)
tmp = tmp | (tmp >> 4)
ret = tmp & 0x0101010101010101
ret = tmp * 0xff
as it's notably less optimized than the current code when converted in
tcg_gen_ primitives but de par with the fixed version.
I'm also sending in this thread as 'Patch 1B' as I feel it's slightly
easier to grasp.
Test cases run on current implementation:
orc.b(0x0000000000000000) = 0x0000000000000000 OK (expect 0x0000000000000000)
orc.b(0x0000000000000001) = 0x00000000000000ff OK (expect 0x00000000000000ff)
orc.b(0x0000000000000002) = 0x00000000000000ff OK (expect 0x00000000000000ff)
orc.b(0x0000000000000004) = 0x00000000000000ff OK (expect 0x00000000000000ff)
orc.b(0x0000000000000008) = 0x00000000000000ff OK (expect 0x00000000000000ff)
orc.b(0x0000000000000010) = 0x00000000000000ff OK (expect 0x00000000000000ff)
orc.b(0x0000000000000020) = 0x00000000000000ff OK (expect 0x00000000000000ff)
orc.b(0x0000000000000040) = 0x00000000000000ff OK (expect 0x00000000000000ff)
orc.b(0x0000000000000080) = 0x00000000000000ff OK (expect 0x00000000000000ff)
orc.b(0x0000000000000100) = 0x0000000000000000 FAIL (expect 0x000000000000ff00)
orc.b(0x0000000000000200) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
orc.b(0x0000000000000400) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
orc.b(0x0000000000000800) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
orc.b(0x0000000000001000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
orc.b(0x0000000000002000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
orc.b(0x0000000000004000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
orc.b(0x0000000000008000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
orc.b(0x0000000000010000) = 0x0000000000000000 FAIL (expect 0x0000000000ff0000)
orc.b(0x0000000000020000) = 0x0000000000ff0000 OK (expect 0x0000000000ff0000)
orc.b(0x0000000001000000) = 0x0000000000000000 FAIL (expect 0x00000000ff000000)
orc.b(0x0000000002000000) = 0x00000000ff000000 OK (expect 0x00000000ff000000)
orc.b(0x0000000100000000) = 0x0000000000000000 FAIL (expect 0x000000ff00000000)
orc.b(0x0000000200000000) = 0x000000ff00000000 OK (expect 0x000000ff00000000)
orc.b(0x0000010000000000) = 0x0000000000000000 FAIL (expect 0x0000ff0000000000)
orc.b(0x0000020000000000) = 0x0000ff0000000000 OK (expect 0x0000ff0000000000)
orc.b(0x0001000000000000) = 0x0000000000000000 FAIL (expect 0x00ff000000000000)
orc.b(0x0002000000000000) = 0x00ff000000000000 OK (expect 0x00ff000000000000)
orc.b(0x0100000000000000) = 0x0000000000000000 FAIL (expect 0xff00000000000000)
orc.b(0x0200000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
orc.b(0x0400000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
orc.b(0x0800000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
orc.b(0x1000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
orc.b(0x2000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
orc.b(0x4000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
orc.b(0x8000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
orc.b(0xffffffffffffffff) = 0xffffffffffffffff OK (expect 0xffffffffffffffff)
orc.b(0x00ff00ff00ff00ff) = 0x00ff00ff00ff00ff OK (expect 0x00ff00ff00ff00ff)
orc.b(0xff00ff00ff00ff00) = 0xff00ff00ff00ff00 OK (expect 0xff00ff00ff00ff00)
orc.b(0x0001000100010001) = 0x00000000000000ff FAIL (expect 0x00ff00ff00ff00ff)
orc.b(0x0100010001000100) = 0x0000000000000000 FAIL (expect 0xff00ff00ff00ff00)
orc.b(0x8040201008040201) = 0xffffffffffffffff OK (expect 0xffffffffffffffff)
orc.b(0x0804020180402010) = 0xffffffffffffffff OK (expect 0xffffffffffffffff)
orc.b(0x010055aa00401100) = 0x0000ffff00ffff00 FAIL (expect 0xff00ffff00ffff00)
>
> -static bool trans_gorci(DisasContext *ctx, arg_gorci *a)
> +static bool trans_orc_b(DisasContext *ctx, arg_orc_b *a)
> {
> - REQUIRE_EXT(ctx, RVB);
> - return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
> + REQUIRE_ZBB(ctx);
> + return gen_unary(ctx, a, EXT_ZERO, gen_orc_b);
> }
>
> #define GEN_SHADD(SHAMT) \
> @@ -476,22 +487,6 @@ static bool trans_greviw(DisasContext *ctx, arg_greviw *a)
> return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_grev);
> }
>
> -static bool trans_gorcw(DisasContext *ctx, arg_gorcw *a)
> -{
> - REQUIRE_64BIT(ctx);
> - REQUIRE_EXT(ctx, RVB);
> - ctx->w = true;
> - return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
> -}
> -
> -static bool trans_gorciw(DisasContext *ctx, arg_gorciw *a)
> -{
> - REQUIRE_64BIT(ctx);
> - REQUIRE_EXT(ctx, RVB);
> - ctx->w = true;
> - return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
> -}
> -
> #define GEN_SHADD_UW(SHAMT) \
> static void gen_sh##SHAMT##add_uw(TCGv ret, TCGv arg1, TCGv arg2) \
> { \
> --
> 2.31.1
>
>
I had a much simpler version initially (using 3 x mask/shift/or, for
12 instructions after setup of constants), but took up the suggestion
to optimize based on haszero(v)...
Indeed this appears to not do what we expect, when there's only 0x01
set in a byte.
The less optimized form, with a single constant, that would still do
what we want is:
/* set high-bit for non-zero bytes */
constant = dup_const_tl(MO_8, 0x7f);
tmp = v & constant; // AND
tmp += constant; // ADD
tmp |= v; // OR
/* extract high-bit to low-bit, for each word */
tmp &= ~constant; // ANDC
tmp >>= 7; // SHR
/* multiply with 0xff to populate entire byte where the low-bit is set */
tmp *= 0xff; // MUL
I'll submit a patch with this one later today, once I had a chance to
pass this through a full test.
Thanks,
Philipp.
On Wed, 13 Oct 2021 at 11:36, Vincent Palatin <vpalatin@rivosinc.com> wrote:
>
> On Thu, Oct 7, 2021 at 8:58 AM Alistair Francis
> <alistair.francis@opensource.wdc.com> wrote:
> >
> > From: Philipp Tomsich <philipp.tomsich@vrull.eu>
> >
> > The 1.0.0 version of Zbb does not contain gorc/gorci. Instead, a
> > orc.b instruction (equivalent to the orc.b pseudo-instruction built on
> > gorci from pre-0.93 draft-B) is available, mainly targeting
> > string-processing workloads.
> >
> > This commit adds the new orc.b instruction and removed gorc/gorci.
> >
> > Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
> > Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> > Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
> > Message-id: 20210911140016.834071-12-philipp.tomsich@vrull.eu
> > Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
> > ---
> > target/riscv/helper.h | 2 --
> > target/riscv/insn32.decode | 6 +---
> > target/riscv/bitmanip_helper.c | 26 -----------------
> > target/riscv/insn_trans/trans_rvb.c.inc | 39 +++++++++++--------------
> > 4 files changed, 18 insertions(+), 55 deletions(-)
> >
> > diff --git a/target/riscv/helper.h b/target/riscv/helper.h
> > index 8a318a2dbc..a9bda2c8ac 100644
> > --- a/target/riscv/helper.h
> > +++ b/target/riscv/helper.h
> > @@ -61,8 +61,6 @@ DEF_HELPER_FLAGS_1(fclass_d, TCG_CALL_NO_RWG_SE, tl, i64)
> > /* Bitmanip */
> > DEF_HELPER_FLAGS_2(grev, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > DEF_HELPER_FLAGS_2(grevw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > -DEF_HELPER_FLAGS_2(gorc, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > -DEF_HELPER_FLAGS_2(gorcw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > DEF_HELPER_FLAGS_2(clmul, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > DEF_HELPER_FLAGS_2(clmulr, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> >
> > diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
> > index a509cfee11..59202196dc 100644
> > --- a/target/riscv/insn32.decode
> > +++ b/target/riscv/insn32.decode
> > @@ -681,6 +681,7 @@ max 0000101 .......... 110 ..... 0110011 @r
> > maxu 0000101 .......... 111 ..... 0110011 @r
> > min 0000101 .......... 100 ..... 0110011 @r
> > minu 0000101 .......... 101 ..... 0110011 @r
> > +orc_b 001010 000111 ..... 101 ..... 0010011 @r2
> > orn 0100000 .......... 110 ..... 0110011 @r
> > rol 0110000 .......... 001 ..... 0110011 @r
> > ror 0110000 .......... 101 ..... 0110011 @r
> > @@ -702,19 +703,14 @@ pack 0000100 .......... 100 ..... 0110011 @r
> > packu 0100100 .......... 100 ..... 0110011 @r
> > packh 0000100 .......... 111 ..... 0110011 @r
> > grev 0110100 .......... 101 ..... 0110011 @r
> > -gorc 0010100 .......... 101 ..... 0110011 @r
> > -
> > grevi 01101. ........... 101 ..... 0010011 @sh
> > -gorci 00101. ........... 101 ..... 0010011 @sh
> >
> > # *** RV64B Standard Extension (in addition to RV32B) ***
> > packw 0000100 .......... 100 ..... 0111011 @r
> > packuw 0100100 .......... 100 ..... 0111011 @r
> > grevw 0110100 .......... 101 ..... 0111011 @r
> > -gorcw 0010100 .......... 101 ..... 0111011 @r
> >
> > greviw 0110100 .......... 101 ..... 0011011 @sh5
> > -gorciw 0010100 .......... 101 ..... 0011011 @sh5
> >
> > # *** RV32 Zbc Standard Extension ***
> > clmul 0000101 .......... 001 ..... 0110011 @r
> > diff --git a/target/riscv/bitmanip_helper.c b/target/riscv/bitmanip_helper.c
> > index 73be5a81c7..bb48388fcd 100644
> > --- a/target/riscv/bitmanip_helper.c
> > +++ b/target/riscv/bitmanip_helper.c
> > @@ -64,32 +64,6 @@ target_ulong HELPER(grevw)(target_ulong rs1, target_ulong rs2)
> > return do_grev(rs1, rs2, 32);
> > }
> >
> > -static target_ulong do_gorc(target_ulong rs1,
> > - target_ulong rs2,
> > - int bits)
> > -{
> > - target_ulong x = rs1;
> > - int i, shift;
> > -
> > - for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) {
> > - if (rs2 & shift) {
> > - x |= do_swap(x, adjacent_masks[i], shift);
> > - }
> > - }
> > -
> > - return x;
> > -}
> > -
> > -target_ulong HELPER(gorc)(target_ulong rs1, target_ulong rs2)
> > -{
> > - return do_gorc(rs1, rs2, TARGET_LONG_BITS);
> > -}
> > -
> > -target_ulong HELPER(gorcw)(target_ulong rs1, target_ulong rs2)
> > -{
> > - return do_gorc(rs1, rs2, 32);
> > -}
> > -
> > target_ulong HELPER(clmul)(target_ulong rs1, target_ulong rs2)
> > {
> > target_ulong result = 0;
> > diff --git a/target/riscv/insn_trans/trans_rvb.c.inc b/target/riscv/insn_trans/trans_rvb.c.inc
> > index bdfb495f24..d32af5915a 100644
> > --- a/target/riscv/insn_trans/trans_rvb.c.inc
> > +++ b/target/riscv/insn_trans/trans_rvb.c.inc
> > @@ -295,16 +295,27 @@ static bool trans_grevi(DisasContext *ctx, arg_grevi *a)
> > return gen_shift_imm_fn(ctx, a, EXT_NONE, gen_grevi);
> > }
> >
> > -static bool trans_gorc(DisasContext *ctx, arg_gorc *a)
> > +static void gen_orc_b(TCGv ret, TCGv source1)
> > {
> > - REQUIRE_EXT(ctx, RVB);
> > - return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
> > + TCGv tmp = tcg_temp_new();
> > + TCGv ones = tcg_constant_tl(dup_const_tl(MO_8, 0x01));
> > +
> > + /* Set lsb in each byte if the byte was zero. */
> > + tcg_gen_sub_tl(tmp, source1, ones);
> > + tcg_gen_andc_tl(tmp, tmp, source1);
> > + tcg_gen_shri_tl(tmp, tmp, 7);
> > + tcg_gen_andc_tl(tmp, ones, tmp);
> > +
> > + /* Replicate the lsb of each byte across the byte. */
> > + tcg_gen_muli_tl(ret, tmp, 0xff);
> > +
> > + tcg_temp_free(tmp);
> > }
>
> It seems there is a bug in the current orc.b implementation,
> the following 7 hexadecimal patterns return one wrong byte (0x00
> instead of 0xff):
> orc.b(0x............01..) = 0x............00.. (instead of 0x............ff..)
> orc.b(0x..........01....) = 0x..........00.... (instead of 0x..........ff....)
> orc.b(0x........01......) = 0x........00...... (instead of 0x........ff......)
> orc.b(0x......01........) = 0x......00........ (instead of 0x......ff........)
> orc.b(0x....01..........) = 0x....00.......... (instead of 0x....ff..........)
> orc.b(0x..01............) = 0x..00............ (instead of 0x..ff............)
> orc.b(0x01..............) = 0x00.............. (instead of 0xff..............)
> (see test cases below)
>
> The issue seems to be related to the propagation of the carry.
> I had a hard time fixing it. With some help, I have added a prolog
> which basically computes:
> (source1 | ((source1 << 1) & ~ones)) in order to avoid the carry.
> I will send the patch as a follow-up in this thread as 'Patch 1A'.
>
> But it's notably less optimized than the current code, so feel free
> to come up with better options.
> Actually my initial stab at fixing it was implementing a more
> straightforward but less astute 'divide and conquer' method
> where bits are or'ed by pairs, then the pairs are or'ed by pair ...
> using the following formula:
> tmp = source1 | (source1 >> 1)
> tmp = tmp | (tmp >> 2)
> tmp = tmp | (tmp >> 4)
> ret = tmp & 0x0101010101010101
> ret = tmp * 0xff
> as it's notably less optimized than the current code when converted in
> tcg_gen_ primitives but de par with the fixed version.
> I'm also sending in this thread as 'Patch 1B' as I feel it's slightly
> easier to grasp.
>
>
> Test cases run on current implementation:
> orc.b(0x0000000000000000) = 0x0000000000000000 OK (expect 0x0000000000000000)
> orc.b(0x0000000000000001) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> orc.b(0x0000000000000002) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> orc.b(0x0000000000000004) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> orc.b(0x0000000000000008) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> orc.b(0x0000000000000010) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> orc.b(0x0000000000000020) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> orc.b(0x0000000000000040) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> orc.b(0x0000000000000080) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> orc.b(0x0000000000000100) = 0x0000000000000000 FAIL (expect 0x000000000000ff00)
> orc.b(0x0000000000000200) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> orc.b(0x0000000000000400) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> orc.b(0x0000000000000800) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> orc.b(0x0000000000001000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> orc.b(0x0000000000002000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> orc.b(0x0000000000004000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> orc.b(0x0000000000008000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> orc.b(0x0000000000010000) = 0x0000000000000000 FAIL (expect 0x0000000000ff0000)
> orc.b(0x0000000000020000) = 0x0000000000ff0000 OK (expect 0x0000000000ff0000)
> orc.b(0x0000000001000000) = 0x0000000000000000 FAIL (expect 0x00000000ff000000)
> orc.b(0x0000000002000000) = 0x00000000ff000000 OK (expect 0x00000000ff000000)
> orc.b(0x0000000100000000) = 0x0000000000000000 FAIL (expect 0x000000ff00000000)
> orc.b(0x0000000200000000) = 0x000000ff00000000 OK (expect 0x000000ff00000000)
> orc.b(0x0000010000000000) = 0x0000000000000000 FAIL (expect 0x0000ff0000000000)
> orc.b(0x0000020000000000) = 0x0000ff0000000000 OK (expect 0x0000ff0000000000)
> orc.b(0x0001000000000000) = 0x0000000000000000 FAIL (expect 0x00ff000000000000)
> orc.b(0x0002000000000000) = 0x00ff000000000000 OK (expect 0x00ff000000000000)
> orc.b(0x0100000000000000) = 0x0000000000000000 FAIL (expect 0xff00000000000000)
> orc.b(0x0200000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> orc.b(0x0400000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> orc.b(0x0800000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> orc.b(0x1000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> orc.b(0x2000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> orc.b(0x4000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> orc.b(0x8000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> orc.b(0xffffffffffffffff) = 0xffffffffffffffff OK (expect 0xffffffffffffffff)
> orc.b(0x00ff00ff00ff00ff) = 0x00ff00ff00ff00ff OK (expect 0x00ff00ff00ff00ff)
> orc.b(0xff00ff00ff00ff00) = 0xff00ff00ff00ff00 OK (expect 0xff00ff00ff00ff00)
> orc.b(0x0001000100010001) = 0x00000000000000ff FAIL (expect 0x00ff00ff00ff00ff)
> orc.b(0x0100010001000100) = 0x0000000000000000 FAIL (expect 0xff00ff00ff00ff00)
> orc.b(0x8040201008040201) = 0xffffffffffffffff OK (expect 0xffffffffffffffff)
> orc.b(0x0804020180402010) = 0xffffffffffffffff OK (expect 0xffffffffffffffff)
> orc.b(0x010055aa00401100) = 0x0000ffff00ffff00 FAIL (expect 0xff00ffff00ffff00)
>
>
> >
> > -static bool trans_gorci(DisasContext *ctx, arg_gorci *a)
> > +static bool trans_orc_b(DisasContext *ctx, arg_orc_b *a)
> > {
> > - REQUIRE_EXT(ctx, RVB);
> > - return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
> > + REQUIRE_ZBB(ctx);
> > + return gen_unary(ctx, a, EXT_ZERO, gen_orc_b);
> > }
> >
> > #define GEN_SHADD(SHAMT) \
> > @@ -476,22 +487,6 @@ static bool trans_greviw(DisasContext *ctx, arg_greviw *a)
> > return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_grev);
> > }
> >
> > -static bool trans_gorcw(DisasContext *ctx, arg_gorcw *a)
> > -{
> > - REQUIRE_64BIT(ctx);
> > - REQUIRE_EXT(ctx, RVB);
> > - ctx->w = true;
> > - return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
> > -}
> > -
> > -static bool trans_gorciw(DisasContext *ctx, arg_gorciw *a)
> > -{
> > - REQUIRE_64BIT(ctx);
> > - REQUIRE_EXT(ctx, RVB);
> > - ctx->w = true;
> > - return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
> > -}
> > -
> > #define GEN_SHADD_UW(SHAMT) \
> > static void gen_sh##SHAMT##add_uw(TCGv ret, TCGv arg1, TCGv arg2) \
> > { \
> > --
> > 2.31.1
> >
> >
On Wed, Oct 13, 2021 at 3:13 PM Philipp Tomsich
<philipp.tomsich@vrull.eu> wrote:
>
> I had a much simpler version initially (using 3 x mask/shift/or, for
> 12 instructions after setup of constants), but took up the suggestion
> to optimize based on haszero(v)...
> Indeed this appears to not do what we expect, when there's only 0x01
> set in a byte.
>
> The less optimized form, with a single constant, that would still do
> what we want is:
> /* set high-bit for non-zero bytes */
> constant = dup_const_tl(MO_8, 0x7f);
> tmp = v & constant; // AND
> tmp += constant; // ADD
> tmp |= v; // OR
> /* extract high-bit to low-bit, for each word */
> tmp &= ~constant; // ANDC
> tmp >>= 7; // SHR
> /* multiply with 0xff to populate entire byte where the low-bit is set */
> tmp *= 0xff; // MUL
>
> I'll submit a patch with this one later today, once I had a chance to
> pass this through a full test.
Thanks for the insight.
I have tried it, implemented as:
```
static void gen_orc_b(TCGv ret, TCGv source1)
{
TCGv tmp = tcg_temp_new();
TCGv constant = tcg_constant_tl(dup_const_tl(MO_8, 0x7f));
/* set high-bit for non-zero bytes */
tcg_gen_and_tl(tmp, source1, constant);
tcg_gen_add_tl(tmp, tmp, constant);
tcg_gen_or_tl(tmp, tmp, source1);
/* extract high-bit to low-bit, for each word */
tcg_gen_andc_tl(tmp, tmp, constant);
tcg_gen_shri_tl(tmp, tmp, 7);
/* Replicate the lsb of each byte across the byte. */
tcg_gen_muli_tl(ret, tmp, 0xff);
tcg_temp_free(tmp);
}
```
It does pass my own test sequences.
>
> On Wed, 13 Oct 2021 at 11:36, Vincent Palatin <vpalatin@rivosinc.com> wrote:
> >
> > On Thu, Oct 7, 2021 at 8:58 AM Alistair Francis
> > <alistair.francis@opensource.wdc.com> wrote:
> > >
> > > From: Philipp Tomsich <philipp.tomsich@vrull.eu>
> > >
> > > The 1.0.0 version of Zbb does not contain gorc/gorci. Instead, a
> > > orc.b instruction (equivalent to the orc.b pseudo-instruction built on
> > > gorci from pre-0.93 draft-B) is available, mainly targeting
> > > string-processing workloads.
> > >
> > > This commit adds the new orc.b instruction and removed gorc/gorci.
> > >
> > > Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
> > > Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> > > Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
> > > Message-id: 20210911140016.834071-12-philipp.tomsich@vrull.eu
> > > Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
> > > ---
> > > target/riscv/helper.h | 2 --
> > > target/riscv/insn32.decode | 6 +---
> > > target/riscv/bitmanip_helper.c | 26 -----------------
> > > target/riscv/insn_trans/trans_rvb.c.inc | 39 +++++++++++--------------
> > > 4 files changed, 18 insertions(+), 55 deletions(-)
> > >
> > > diff --git a/target/riscv/helper.h b/target/riscv/helper.h
> > > index 8a318a2dbc..a9bda2c8ac 100644
> > > --- a/target/riscv/helper.h
> > > +++ b/target/riscv/helper.h
> > > @@ -61,8 +61,6 @@ DEF_HELPER_FLAGS_1(fclass_d, TCG_CALL_NO_RWG_SE, tl, i64)
> > > /* Bitmanip */
> > > DEF_HELPER_FLAGS_2(grev, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > > DEF_HELPER_FLAGS_2(grevw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > > -DEF_HELPER_FLAGS_2(gorc, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > > -DEF_HELPER_FLAGS_2(gorcw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > > DEF_HELPER_FLAGS_2(clmul, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > > DEF_HELPER_FLAGS_2(clmulr, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > >
> > > diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
> > > index a509cfee11..59202196dc 100644
> > > --- a/target/riscv/insn32.decode
> > > +++ b/target/riscv/insn32.decode
> > > @@ -681,6 +681,7 @@ max 0000101 .......... 110 ..... 0110011 @r
> > > maxu 0000101 .......... 111 ..... 0110011 @r
> > > min 0000101 .......... 100 ..... 0110011 @r
> > > minu 0000101 .......... 101 ..... 0110011 @r
> > > +orc_b 001010 000111 ..... 101 ..... 0010011 @r2
> > > orn 0100000 .......... 110 ..... 0110011 @r
> > > rol 0110000 .......... 001 ..... 0110011 @r
> > > ror 0110000 .......... 101 ..... 0110011 @r
> > > @@ -702,19 +703,14 @@ pack 0000100 .......... 100 ..... 0110011 @r
> > > packu 0100100 .......... 100 ..... 0110011 @r
> > > packh 0000100 .......... 111 ..... 0110011 @r
> > > grev 0110100 .......... 101 ..... 0110011 @r
> > > -gorc 0010100 .......... 101 ..... 0110011 @r
> > > -
> > > grevi 01101. ........... 101 ..... 0010011 @sh
> > > -gorci 00101. ........... 101 ..... 0010011 @sh
> > >
> > > # *** RV64B Standard Extension (in addition to RV32B) ***
> > > packw 0000100 .......... 100 ..... 0111011 @r
> > > packuw 0100100 .......... 100 ..... 0111011 @r
> > > grevw 0110100 .......... 101 ..... 0111011 @r
> > > -gorcw 0010100 .......... 101 ..... 0111011 @r
> > >
> > > greviw 0110100 .......... 101 ..... 0011011 @sh5
> > > -gorciw 0010100 .......... 101 ..... 0011011 @sh5
> > >
> > > # *** RV32 Zbc Standard Extension ***
> > > clmul 0000101 .......... 001 ..... 0110011 @r
> > > diff --git a/target/riscv/bitmanip_helper.c b/target/riscv/bitmanip_helper.c
> > > index 73be5a81c7..bb48388fcd 100644
> > > --- a/target/riscv/bitmanip_helper.c
> > > +++ b/target/riscv/bitmanip_helper.c
> > > @@ -64,32 +64,6 @@ target_ulong HELPER(grevw)(target_ulong rs1, target_ulong rs2)
> > > return do_grev(rs1, rs2, 32);
> > > }
> > >
> > > -static target_ulong do_gorc(target_ulong rs1,
> > > - target_ulong rs2,
> > > - int bits)
> > > -{
> > > - target_ulong x = rs1;
> > > - int i, shift;
> > > -
> > > - for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) {
> > > - if (rs2 & shift) {
> > > - x |= do_swap(x, adjacent_masks[i], shift);
> > > - }
> > > - }
> > > -
> > > - return x;
> > > -}
> > > -
> > > -target_ulong HELPER(gorc)(target_ulong rs1, target_ulong rs2)
> > > -{
> > > - return do_gorc(rs1, rs2, TARGET_LONG_BITS);
> > > -}
> > > -
> > > -target_ulong HELPER(gorcw)(target_ulong rs1, target_ulong rs2)
> > > -{
> > > - return do_gorc(rs1, rs2, 32);
> > > -}
> > > -
> > > target_ulong HELPER(clmul)(target_ulong rs1, target_ulong rs2)
> > > {
> > > target_ulong result = 0;
> > > diff --git a/target/riscv/insn_trans/trans_rvb.c.inc b/target/riscv/insn_trans/trans_rvb.c.inc
> > > index bdfb495f24..d32af5915a 100644
> > > --- a/target/riscv/insn_trans/trans_rvb.c.inc
> > > +++ b/target/riscv/insn_trans/trans_rvb.c.inc
> > > @@ -295,16 +295,27 @@ static bool trans_grevi(DisasContext *ctx, arg_grevi *a)
> > > return gen_shift_imm_fn(ctx, a, EXT_NONE, gen_grevi);
> > > }
> > >
> > > -static bool trans_gorc(DisasContext *ctx, arg_gorc *a)
> > > +static void gen_orc_b(TCGv ret, TCGv source1)
> > > {
> > > - REQUIRE_EXT(ctx, RVB);
> > > - return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
> > > + TCGv tmp = tcg_temp_new();
> > > + TCGv ones = tcg_constant_tl(dup_const_tl(MO_8, 0x01));
> > > +
> > > + /* Set lsb in each byte if the byte was zero. */
> > > + tcg_gen_sub_tl(tmp, source1, ones);
> > > + tcg_gen_andc_tl(tmp, tmp, source1);
> > > + tcg_gen_shri_tl(tmp, tmp, 7);
> > > + tcg_gen_andc_tl(tmp, ones, tmp);
> > > +
> > > + /* Replicate the lsb of each byte across the byte. */
> > > + tcg_gen_muli_tl(ret, tmp, 0xff);
> > > +
> > > + tcg_temp_free(tmp);
> > > }
> >
> > It seems there is a bug in the current orc.b implementation,
> > the following 7 hexadecimal patterns return one wrong byte (0x00
> > instead of 0xff):
> > orc.b(0x............01..) = 0x............00.. (instead of 0x............ff..)
> > orc.b(0x..........01....) = 0x..........00.... (instead of 0x..........ff....)
> > orc.b(0x........01......) = 0x........00...... (instead of 0x........ff......)
> > orc.b(0x......01........) = 0x......00........ (instead of 0x......ff........)
> > orc.b(0x....01..........) = 0x....00.......... (instead of 0x....ff..........)
> > orc.b(0x..01............) = 0x..00............ (instead of 0x..ff............)
> > orc.b(0x01..............) = 0x00.............. (instead of 0xff..............)
> > (see test cases below)
> >
> > The issue seems to be related to the propagation of the carry.
> > I had a hard time fixing it. With some help, I have added a prolog
> > which basically computes:
> > (source1 | ((source1 << 1) & ~ones)) in order to avoid the carry.
> > I will send the patch as a follow-up in this thread as 'Patch 1A'.
> >
> > But it's notably less optimized than the current code, so feel free
> > to come up with better options.
> > Actually my initial stab at fixing it was implementing a more
> > straightforward but less astute 'divide and conquer' method
> > where bits are or'ed by pairs, then the pairs are or'ed by pair ...
> > using the following formula:
> > tmp = source1 | (source1 >> 1)
> > tmp = tmp | (tmp >> 2)
> > tmp = tmp | (tmp >> 4)
> > ret = tmp & 0x0101010101010101
> > ret = tmp * 0xff
> > as it's notably less optimized than the current code when converted in
> > tcg_gen_ primitives but de par with the fixed version.
> > I'm also sending in this thread as 'Patch 1B' as I feel it's slightly
> > easier to grasp.
> >
> >
> > Test cases run on current implementation:
> > orc.b(0x0000000000000000) = 0x0000000000000000 OK (expect 0x0000000000000000)
> > orc.b(0x0000000000000001) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> > orc.b(0x0000000000000002) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> > orc.b(0x0000000000000004) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> > orc.b(0x0000000000000008) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> > orc.b(0x0000000000000010) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> > orc.b(0x0000000000000020) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> > orc.b(0x0000000000000040) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> > orc.b(0x0000000000000080) = 0x00000000000000ff OK (expect 0x00000000000000ff)
> > orc.b(0x0000000000000100) = 0x0000000000000000 FAIL (expect 0x000000000000ff00)
> > orc.b(0x0000000000000200) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> > orc.b(0x0000000000000400) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> > orc.b(0x0000000000000800) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> > orc.b(0x0000000000001000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> > orc.b(0x0000000000002000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> > orc.b(0x0000000000004000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> > orc.b(0x0000000000008000) = 0x000000000000ff00 OK (expect 0x000000000000ff00)
> > orc.b(0x0000000000010000) = 0x0000000000000000 FAIL (expect 0x0000000000ff0000)
> > orc.b(0x0000000000020000) = 0x0000000000ff0000 OK (expect 0x0000000000ff0000)
> > orc.b(0x0000000001000000) = 0x0000000000000000 FAIL (expect 0x00000000ff000000)
> > orc.b(0x0000000002000000) = 0x00000000ff000000 OK (expect 0x00000000ff000000)
> > orc.b(0x0000000100000000) = 0x0000000000000000 FAIL (expect 0x000000ff00000000)
> > orc.b(0x0000000200000000) = 0x000000ff00000000 OK (expect 0x000000ff00000000)
> > orc.b(0x0000010000000000) = 0x0000000000000000 FAIL (expect 0x0000ff0000000000)
> > orc.b(0x0000020000000000) = 0x0000ff0000000000 OK (expect 0x0000ff0000000000)
> > orc.b(0x0001000000000000) = 0x0000000000000000 FAIL (expect 0x00ff000000000000)
> > orc.b(0x0002000000000000) = 0x00ff000000000000 OK (expect 0x00ff000000000000)
> > orc.b(0x0100000000000000) = 0x0000000000000000 FAIL (expect 0xff00000000000000)
> > orc.b(0x0200000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> > orc.b(0x0400000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> > orc.b(0x0800000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> > orc.b(0x1000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> > orc.b(0x2000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> > orc.b(0x4000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> > orc.b(0x8000000000000000) = 0xff00000000000000 OK (expect 0xff00000000000000)
> > orc.b(0xffffffffffffffff) = 0xffffffffffffffff OK (expect 0xffffffffffffffff)
> > orc.b(0x00ff00ff00ff00ff) = 0x00ff00ff00ff00ff OK (expect 0x00ff00ff00ff00ff)
> > orc.b(0xff00ff00ff00ff00) = 0xff00ff00ff00ff00 OK (expect 0xff00ff00ff00ff00)
> > orc.b(0x0001000100010001) = 0x00000000000000ff FAIL (expect 0x00ff00ff00ff00ff)
> > orc.b(0x0100010001000100) = 0x0000000000000000 FAIL (expect 0xff00ff00ff00ff00)
> > orc.b(0x8040201008040201) = 0xffffffffffffffff OK (expect 0xffffffffffffffff)
> > orc.b(0x0804020180402010) = 0xffffffffffffffff OK (expect 0xffffffffffffffff)
> > orc.b(0x010055aa00401100) = 0x0000ffff00ffff00 FAIL (expect 0xff00ffff00ffff00)
> >
> >
> > >
> > > -static bool trans_gorci(DisasContext *ctx, arg_gorci *a)
> > > +static bool trans_orc_b(DisasContext *ctx, arg_orc_b *a)
> > > {
> > > - REQUIRE_EXT(ctx, RVB);
> > > - return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
> > > + REQUIRE_ZBB(ctx);
> > > + return gen_unary(ctx, a, EXT_ZERO, gen_orc_b);
> > > }
> > >
> > > #define GEN_SHADD(SHAMT) \
> > > @@ -476,22 +487,6 @@ static bool trans_greviw(DisasContext *ctx, arg_greviw *a)
> > > return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_grev);
> > > }
> > >
> > > -static bool trans_gorcw(DisasContext *ctx, arg_gorcw *a)
> > > -{
> > > - REQUIRE_64BIT(ctx);
> > > - REQUIRE_EXT(ctx, RVB);
> > > - ctx->w = true;
> > > - return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
> > > -}
> > > -
> > > -static bool trans_gorciw(DisasContext *ctx, arg_gorciw *a)
> > > -{
> > > - REQUIRE_64BIT(ctx);
> > > - REQUIRE_EXT(ctx, RVB);
> > > - ctx->w = true;
> > > - return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
> > > -}
> > > -
> > > #define GEN_SHADD_UW(SHAMT) \
> > > static void gen_sh##SHAMT##add_uw(TCGv ret, TCGv arg1, TCGv arg2) \
> > > { \
> > > --
> > > 2.31.1
> > >
> > >
On Wed, 13 Oct 2021 at 15:44, Vincent Palatin <vpalatin@rivosinc.com> wrote:
>
> On Wed, Oct 13, 2021 at 3:13 PM Philipp Tomsich
> <philipp.tomsich@vrull.eu> wrote:
> >
> > I had a much simpler version initially (using 3 x mask/shift/or, for
> > 12 instructions after setup of constants), but took up the suggestion
> > to optimize based on haszero(v)...
> > Indeed this appears to not do what we expect, when there's only 0x01
> > set in a byte.
> >
> > The less optimized form, with a single constant, that would still do
> > what we want is:
> > /* set high-bit for non-zero bytes */
> > constant = dup_const_tl(MO_8, 0x7f);
> > tmp = v & constant; // AND
> > tmp += constant; // ADD
> > tmp |= v; // OR
> > /* extract high-bit to low-bit, for each word */
> > tmp &= ~constant; // ANDC
> > tmp >>= 7; // SHR
> > /* multiply with 0xff to populate entire byte where the low-bit is set */
> > tmp *= 0xff; // MUL
> >
> > I'll submit a patch with this one later today, once I had a chance to
> > pass this through a full test.
>
>
> Thanks for the insight.
>
> I have tried it, implemented as:
> ```
> static void gen_orc_b(TCGv ret, TCGv source1)
> {
> TCGv tmp = tcg_temp_new();
> TCGv constant = tcg_constant_tl(dup_const_tl(MO_8, 0x7f));
>
> /* set high-bit for non-zero bytes */
> tcg_gen_and_tl(tmp, source1, constant);
> tcg_gen_add_tl(tmp, tmp, constant);
> tcg_gen_or_tl(tmp, tmp, source1);
> /* extract high-bit to low-bit, for each word */
> tcg_gen_andc_tl(tmp, tmp, constant);
> tcg_gen_shri_tl(tmp, tmp, 7);
>
> /* Replicate the lsb of each byte across the byte. */
> tcg_gen_muli_tl(ret, tmp, 0xff);
>
> tcg_temp_free(tmp);
> }
> ```
>
> It does pass my own test sequences.
I am running it against SPEC at the moment, using optimized
strlen/strcpy/strcmp functions using orc.b.
The verdict on that should be available later today...
Philipp.
On 10/13/21 6:49 AM, Philipp Tomsich wrote:
> On Wed, 13 Oct 2021 at 15:44, Vincent Palatin <vpalatin@rivosinc.com> wrote:
>>
>> On Wed, Oct 13, 2021 at 3:13 PM Philipp Tomsich
>> <philipp.tomsich@vrull.eu> wrote:
>>>
>>> I had a much simpler version initially (using 3 x mask/shift/or, for
>>> 12 instructions after setup of constants), but took up the suggestion
>>> to optimize based on haszero(v)...
>>> Indeed this appears to not do what we expect, when there's only 0x01
>>> set in a byte.
>>>
>>> The less optimized form, with a single constant, that would still do
>>> what we want is:
>>> /* set high-bit for non-zero bytes */
>>> constant = dup_const_tl(MO_8, 0x7f);
>>> tmp = v & constant; // AND
>>> tmp += constant; // ADD
>>> tmp |= v; // OR
>>> /* extract high-bit to low-bit, for each word */
>>> tmp &= ~constant; // ANDC
>>> tmp >>= 7; // SHR
>>> /* multiply with 0xff to populate entire byte where the low-bit is set */
>>> tmp *= 0xff; // MUL
>>>
>>> I'll submit a patch with this one later today, once I had a chance to
>>> pass this through a full test.
>>
>>
>> Thanks for the insight.
>>
>> I have tried it, implemented as:
>> ```
>> static void gen_orc_b(TCGv ret, TCGv source1)
>> {
>> TCGv tmp = tcg_temp_new();
>> TCGv constant = tcg_constant_tl(dup_const_tl(MO_8, 0x7f));
>>
>> /* set high-bit for non-zero bytes */
>> tcg_gen_and_tl(tmp, source1, constant);
>> tcg_gen_add_tl(tmp, tmp, constant);
>> tcg_gen_or_tl(tmp, tmp, source1);
>> /* extract high-bit to low-bit, for each word */
>> tcg_gen_andc_tl(tmp, tmp, constant);
>> tcg_gen_shri_tl(tmp, tmp, 7);
>>
>> /* Replicate the lsb of each byte across the byte. */
>> tcg_gen_muli_tl(ret, tmp, 0xff);
>>
>> tcg_temp_free(tmp);
>> }
>> ```
>>
>> It does pass my own test sequences.
>
> I am running it against SPEC at the moment, using optimized
> strlen/strcpy/strcmp functions using orc.b.
> The verdict on that should be available later today...
off topic but relates, for Zb (and similar things in the future) whats
the strategy for change management/discovery. I understand you can
hardcode things for quick test, but for a proper glibc implementation
this would be an IFUNC but there seems to be no architectural way per
spec (for software/kernel) to discover this.
Same issue is with building linux kernel with Zb - how do we make sure
that hardware/sim supports Zb when running corresponding software.
It seems some generic discovery/enumeration scheme is in works but what
to do in the interim.
Thx,
-Vineet
On 10/13/21 9:20 AM, Vineet Gupta wrote: > off topic but relates, for Zb (and similar things in the future) whats the strategy for > change management/discovery. I understand you can hardcode things for quick test, but for > a proper glibc implementation this would be an IFUNC but there seems to be no > architectural way per spec (for software/kernel) to discover this. Since the architecture restricted access to these CSRs, you do have to coordinate with the kernel. There is an AT_HWCAP value that is given to userland, but it is currently masked to only provide a few of the MISA bits. This will need to be extended for both V and Zb. It doesn't help that Zb has been split into lots of smaller extensions, which (if done simplistically) will quickly consume all of the bits within AT_HWCAP. So: I strongly suggest that RISC-V spend a few moments considering a way to represent this that will easily support the myriad extensions. One possibility is to add more AT_* entries straight away -- AT_HWCAP_ZB, which contains one bit for all of the Zb[abcs] extensions. Possibly set the "main" AT_HWCAP "b" bit if Zb is present at some minimal level. > Same issue is with building linux kernel with Zb - how do we make sure that hardware/sim > supports Zb when running corresponding software. On the kernel side this is easier -- read the CSRs then patch the kernel. There are existing ways to manage this sort of thing. r~
On Wed, 13 Oct 2021 at 18:51, Richard Henderson < richard.henderson@linaro.org> wrote: > On 10/13/21 9:20 AM, Vineet Gupta wrote: > > off topic but relates, for Zb (and similar things in the future) whats > the strategy for > > change management/discovery. I understand you can hardcode things for > quick test, but for > > a proper glibc implementation this would be an IFUNC but there seems to > be no > > architectural way per spec (for software/kernel) to discover this. > > Since the architecture restricted access to these CSRs, you do have to > coordinate with the > kernel. > Zb[abcs] will not be discoverable via MISA bits. A unified low-level discovery mechanisms (and a way to inject this information to userspace via the auxiliary vector) are being developed at the moment. There is an AT_HWCAP value that is given to userland, but it is currently > masked to only > provide a few of the MISA bits. This will need to be extended for both V > and Zb. It > doesn't help that Zb has been split into lots of smaller extensions, which > (if done > simplistically) will quickly consume all of the bits within AT_HWCAP. > It looks like HWCAP, HWCAP2 and AT_PLATFORM and AT_BASE_PLATFORM will be used. Kito presented the (then current) state of thinking at the Linux Plumbers Conference… > So: I strongly suggest that RISC-V spend a few moments considering a way > to represent this > that will easily support the myriad extensions. One possibility is to add > more AT_* > entries straight away -- AT_HWCAP_ZB, which contains one bit for all of > the Zb[abcs] > extensions. Possibly set the "main" AT_HWCAP "b" bit if Zb is present at > some minimal level. > > > Same issue is with building linux kernel with Zb - how do we make sure > that hardware/sim > > supports Zb when running corresponding software. > > On the kernel side this is easier -- read the CSRs then patch the kernel. > There are existing ways to manage this sort of thing. > > > r~ >
The implementation was failing for the following 7 hexadecimal patterns
which return one wrong byte (0x00 instead of 0xff):
orc.b(0x............01..) = 0x............00.. (instead of 0x............ff..)
orc.b(0x..........01....) = 0x..........00.... (instead of 0x..........ff....)
orc.b(0x........01......) = 0x........00...... (instead of 0x........ff......)
orc.b(0x......01........) = 0x......00........ (instead of 0x......ff........)
orc.b(0x....01..........) = 0x....00.......... (instead of 0x....ff..........)
orc.b(0x..01............) = 0x..00............ (instead of 0x..ff............)
orc.b(0x01..............) = 0x00.............. (instead of 0xff..............)
Implement a simpler but less astute/optimized 'divide and conquer' method
where bits are or'ed by pairs, then the pairs are or'ed by pair ...
Signed-off-by: Vincent Palatin <vpalatin@rivosinc.com>
---
target/riscv/insn_trans/trans_rvb.c.inc | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/target/riscv/insn_trans/trans_rvb.c.inc b/target/riscv/insn_trans/trans_rvb.c.inc
index 185c3e9a60..04f795652d 100644
--- a/target/riscv/insn_trans/trans_rvb.c.inc
+++ b/target/riscv/insn_trans/trans_rvb.c.inc
@@ -249,18 +249,26 @@ static bool trans_rev8_64(DisasContext *ctx, arg_rev8_64 *a)
static void gen_orc_b(TCGv ret, TCGv source1)
{
TCGv tmp = tcg_temp_new();
+ TCGv shifted = tcg_temp_new();
TCGv ones = tcg_constant_tl(dup_const_tl(MO_8, 0x01));
- /* Set lsb in each byte if the byte was zero. */
- tcg_gen_sub_tl(tmp, source1, ones);
- tcg_gen_andc_tl(tmp, tmp, source1);
- tcg_gen_shri_tl(tmp, tmp, 7);
- tcg_gen_andc_tl(tmp, ones, tmp);
+ /*
+ * Divide and conquer: show one byte of the word in the comments,
+ * with U meaning Useful or'ed bit, X Junk content bit, . don't care.
+ */
+ tcg_gen_shri_tl(shifted, source1, 1);
+ tcg_gen_or_tl(tmp, source1, shifted); /* tmp[15:8] = XU.U.U.U */
+ tcg_gen_shri_tl(shifted, tmp, 2);
+ tcg_gen_or_tl(tmp, shifted, tmp); /* tmp[15:8] = XXXU...U */
+ tcg_gen_shri_tl(shifted, tmp, 4);
+ tcg_gen_or_tl(tmp, shifted, tmp); /* tmp[15:8] = XXXXXXXU */
+ tcg_gen_and_tl(tmp, ones, tmp); /* tmp[15:8] = 0000000U */
/* Replicate the lsb of each byte across the byte. */
tcg_gen_muli_tl(ret, tmp, 0xff);
tcg_temp_free(tmp);
+ tcg_temp_free(shifted);
}
static bool trans_orc_b(DisasContext *ctx, arg_orc_b *a)
--
2.25.1
The implementation was failing for the following 7 hexadecimal patterns
which return one wrong byte (0x00 instead of 0xff):
orc.b(0x............01..) = 0x............00.. (instead of 0x............ff..)
orc.b(0x..........01....) = 0x..........00.... (instead of 0x..........ff....)
orc.b(0x........01......) = 0x........00...... (instead of 0x........ff......)
orc.b(0x......01........) = 0x......00........ (instead of 0x......ff........)
orc.b(0x....01..........) = 0x....00.......... (instead of 0x....ff..........)
orc.b(0x..01............) = 0x..00............ (instead of 0x..ff............)
orc.b(0x01..............) = 0x00.............. (instead of 0xff..............)
Try to keep the carry from propagating and triggering the incorrect
results.
Signed-off-by: Vincent Palatin <vpalatin@rivosinc.com>
---
target/riscv/insn_trans/trans_rvb.c.inc | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/target/riscv/insn_trans/trans_rvb.c.inc b/target/riscv/insn_trans/trans_rvb.c.inc
index 185c3e9a60..b9fc272789 100644
--- a/target/riscv/insn_trans/trans_rvb.c.inc
+++ b/target/riscv/insn_trans/trans_rvb.c.inc
@@ -249,11 +249,17 @@ static bool trans_rev8_64(DisasContext *ctx, arg_rev8_64 *a)
static void gen_orc_b(TCGv ret, TCGv source1)
{
TCGv tmp = tcg_temp_new();
+ TCGv tmp2 = tcg_temp_new();
TCGv ones = tcg_constant_tl(dup_const_tl(MO_8, 0x01));
+ /* avoid carry propagation */
+ tcg_gen_shli_tl(tmp, source1, 1);
+ tcg_gen_or_tl(tmp, source1, tmp);
+ tcg_gen_andc_tl(tmp2, tmp, ones);
+
/* Set lsb in each byte if the byte was zero. */
- tcg_gen_sub_tl(tmp, source1, ones);
- tcg_gen_andc_tl(tmp, tmp, source1);
+ tcg_gen_sub_tl(tmp, tmp2, ones);
+ tcg_gen_andc_tl(tmp, tmp, tmp2);
tcg_gen_shri_tl(tmp, tmp, 7);
tcg_gen_andc_tl(tmp, ones, tmp);
@@ -261,6 +267,7 @@ static void gen_orc_b(TCGv ret, TCGv source1)
tcg_gen_muli_tl(ret, tmp, 0xff);
tcg_temp_free(tmp);
+ tcg_temp_free(tmp2);
}
static bool trans_orc_b(DisasContext *ctx, arg_orc_b *a)
--
2.25.1
© 2016 - 2025 Red Hat, Inc.