[v2] Add LoongArch LSX instructions

[RFC PATCH v2 30/44] target/loongarch: Implement vclo vclz

Posted by Song Gao 2 years, 10 months ago

This patch includes:
- VCLO.{B/H/W/D};
- VCLZ.{B/H/W/D}.

Signed-off-by: Song Gao <gaosong@loongson.cn>
---
 target/loongarch/disas.c                    |  9 ++++++
 target/loongarch/helper.h                   |  9 ++++++
 target/loongarch/insn_trans/trans_lsx.c.inc |  9 ++++++
 target/loongarch/insns.decode               |  9 ++++++
 target/loongarch/lsx_helper.c               | 31 +++++++++++++++++++++
 5 files changed, 67 insertions(+)

diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index 405e8885cd..0c82a1d9d1 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -1258,3 +1258,12 @@ INSN_LSX(vssrarni_bu_h,    vv_i)
 INSN_LSX(vssrarni_hu_w,    vv_i)
 INSN_LSX(vssrarni_wu_d,    vv_i)
 INSN_LSX(vssrarni_du_q,    vv_i)
+
+INSN_LSX(vclo_b,           vv)
+INSN_LSX(vclo_h,           vv)
+INSN_LSX(vclo_w,           vv)
+INSN_LSX(vclo_d,           vv)
+INSN_LSX(vclz_b,           vv)
+INSN_LSX(vclz_h,           vv)
+INSN_LSX(vclz_w,           vv)
+INSN_LSX(vclz_d,           vv)
diff --git a/target/loongarch/helper.h b/target/loongarch/helper.h
index d602de390b..a7facc6bc1 100644
--- a/target/loongarch/helper.h
+++ b/target/loongarch/helper.h
@@ -486,3 +486,12 @@ DEF_HELPER_4(vssrarni_bu_h, void, env, i32, i32, i32)
 DEF_HELPER_4(vssrarni_hu_w, void, env, i32, i32, i32)
 DEF_HELPER_4(vssrarni_wu_d, void, env, i32, i32, i32)
 DEF_HELPER_4(vssrarni_du_q, void, env, i32, i32, i32)
+
+DEF_HELPER_3(vclo_b, void, env, i32, i32)
+DEF_HELPER_3(vclo_h, void, env, i32, i32)
+DEF_HELPER_3(vclo_w, void, env, i32, i32)
+DEF_HELPER_3(vclo_d, void, env, i32, i32)
+DEF_HELPER_3(vclz_b, void, env, i32, i32)
+DEF_HELPER_3(vclz_h, void, env, i32, i32)
+DEF_HELPER_3(vclz_w, void, env, i32, i32)
+DEF_HELPER_3(vclz_d, void, env, i32, i32)
diff --git a/target/loongarch/insn_trans/trans_lsx.c.inc b/target/loongarch/insn_trans/trans_lsx.c.inc
index c732c43580..5d81c02103 100644
--- a/target/loongarch/insn_trans/trans_lsx.c.inc
+++ b/target/loongarch/insn_trans/trans_lsx.c.inc
@@ -2785,3 +2785,12 @@ TRANS(vssrarni_bu_h, gen_vv_i, gen_helper_vssrarni_bu_h)
 TRANS(vssrarni_hu_w, gen_vv_i, gen_helper_vssrarni_hu_w)
 TRANS(vssrarni_wu_d, gen_vv_i, gen_helper_vssrarni_wu_d)
 TRANS(vssrarni_du_q, gen_vv_i, gen_helper_vssrarni_du_q)
+
+TRANS(vclo_b, gen_vv, gen_helper_vclo_b)
+TRANS(vclo_h, gen_vv, gen_helper_vclo_h)
+TRANS(vclo_w, gen_vv, gen_helper_vclo_w)
+TRANS(vclo_d, gen_vv, gen_helper_vclo_d)
+TRANS(vclz_b, gen_vv, gen_helper_vclz_b)
+TRANS(vclz_h, gen_vv, gen_helper_vclz_h)
+TRANS(vclz_w, gen_vv, gen_helper_vclz_w)
+TRANS(vclz_d, gen_vv, gen_helper_vclz_d)
diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index bb4b2a8632..7591ec1bab 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -959,3 +959,12 @@ vssrarni_bu_h    0111 00110110 11000 1 .... ..... .....   @vv_ui4
 vssrarni_hu_w    0111 00110110 11001 ..... ..... .....    @vv_ui5
 vssrarni_wu_d    0111 00110110 1101 ...... ..... .....    @vv_ui6
 vssrarni_du_q    0111 00110110 111 ....... ..... .....    @vv_ui7
+
+vclo_b           0111 00101001 11000 00000 ..... .....    @vv
+vclo_h           0111 00101001 11000 00001 ..... .....    @vv
+vclo_w           0111 00101001 11000 00010 ..... .....    @vv
+vclo_d           0111 00101001 11000 00011 ..... .....    @vv
+vclz_b           0111 00101001 11000 00100 ..... .....    @vv
+vclz_h           0111 00101001 11000 00101 ..... .....    @vv
+vclz_w           0111 00101001 11000 00110 ..... .....    @vv
+vclz_d           0111 00101001 11000 00111 ..... .....    @vv
diff --git a/target/loongarch/lsx_helper.c b/target/loongarch/lsx_helper.c
index 4b933f8a69..8ec479dc2d 100644
--- a/target/loongarch/lsx_helper.c
+++ b/target/loongarch/lsx_helper.c
@@ -2170,3 +2170,34 @@ void HELPER(vssrarni_du_q)(CPULoongArchState *env,
 VSSRARNUI(vssrarni_bu_h, 16, B, H)
 VSSRARNUI(vssrarni_hu_w, 32, H, W)
 VSSRARNUI(vssrarni_wu_d, 64, W, D)
+
+#define DO_2OP(NAME, BIT, E, T, DO_OP)                              \
+void HELPER(NAME)(CPULoongArchState *env, uint32_t vd, uint32_t vj) \
+{                                                                   \
+    int i;                                                          \
+    VReg *Vd = &(env->fpr[vd].vreg);                                \
+    VReg *Vj = &(env->fpr[vj].vreg);                                \
+                                                                    \
+    for (i = 0; i < LSX_LEN/BIT; i++)                               \
+    {                                                               \
+        Vd->E(i) = DO_OP((T)Vj->E(i));                              \
+    }                                                               \
+}
+
+#define DO_CLO_B(N)  (clz32((uint8_t)~N) - 24)
+#define DO_CLO_H(N)  (clz32((uint16_t)~N) - 16)
+#define DO_CLO_W(N)  (clz32((uint32_t)~N))
+#define DO_CLO_D(N)  (clz64((uint64_t)~N))
+#define DO_CLZ_B(N)  (clz32(N) - 24)
+#define DO_CLZ_H(N)  (clz32(N) - 16)
+#define DO_CLZ_W(N)  (clz32(N))
+#define DO_CLZ_D(N)  (clz64(N))
+
+DO_2OP(vclo_b, 8, B, uint8_t, DO_CLO_B)
+DO_2OP(vclo_h, 16, H, uint16_t, DO_CLO_H)
+DO_2OP(vclo_w, 32, W, uint32_t, DO_CLO_W)
+DO_2OP(vclo_d, 64, D, uint64_t, DO_CLO_D)
+DO_2OP(vclz_b, 8, B, uint8_t, DO_CLZ_B)
+DO_2OP(vclz_h, 16, H, uint16_t, DO_CLZ_H)
+DO_2OP(vclz_w, 32, W, uint32_t, DO_CLZ_W)
+DO_2OP(vclz_d, 64, D, uint64_t, DO_CLZ_D)
-- 
2.31.1

Re: [RFC PATCH v2 30/44] target/loongarch: Implement vclo vclz

Posted by Richard Henderson 2 years, 10 months ago

On 3/27/23 20:06, Song Gao wrote:
> +#define DO_CLO_B(N)  (clz32((uint8_t)~N) - 24)
> +#define DO_CLO_H(N)  (clz32((uint16_t)~N) - 16)

I think this is wrong.  You *want* the high bits to be set, so that they are ones, and 
included in the count, which you then subtract off.  You want the "real" count to start 
after the 24th leading 1.

r~

Re: [RFC PATCH v2 30/44] target/loongarch: Implement vclo vclz

Posted by gaosong 2 years, 10 months ago

在 2023/4/2 上午11:34, Richard Henderson 写道:
> On 3/27/23 20:06, Song Gao wrote:
>> +#define DO_CLO_B(N)  (clz32((uint8_t)~N) - 24)
>> +#define DO_CLO_H(N)  (clz32((uint16_t)~N) - 16)
>
> I think this is wrong. 
It is weird, the result is always right. :-\
and (clz32(~N) - 24) or (clz32((uint32_t)~N) - 24) is wrong.
> You *want* the high bits to be set, so that they are ones, and 
> included in the count, which you then subtract off.  You want the 
> "real" count to start after the 24th leading 1.
>
Yes,
and since we use clz32(), how about the following formulation?

#define DO_CLO_B(N)  (clz32( ~N & 0xff) -24)
#define DO_CLO_H(N)  (clz32( ~N & 0xffff) -16)

Thanks.
Song Gao

Re: [RFC PATCH v2 30/44] target/loongarch: Implement vclo vclz

Posted by Richard Henderson 2 years, 10 months ago

On 4/7/23 00:40, gaosong wrote:
> 
> 在 2023/4/2 上午11:34, Richard Henderson 写道:
>> On 3/27/23 20:06, Song Gao wrote:
>>> +#define DO_CLO_B(N)  (clz32((uint8_t)~N) - 24)
>>> +#define DO_CLO_H(N)  (clz32((uint16_t)~N) - 16)
>>
>> I think this is wrong. 
> It is weird, the result is always right. :-\
> and (clz32(~N) - 24) or (clz32((uint32_t)~N) - 24) is wrong.
>> You *want* the high bits to be set, so that they are ones, and included in the count, 
>> which you then subtract off.  You want the "real" count to start after the 24th leading 1.
>>
> Yes,
> and since we use clz32(), how about the following formulation?
> 
> #define DO_CLO_B(N)  (clz32( ~N & 0xff) -24)
> #define DO_CLO_H(N)  (clz32( ~N & 0xffff) -16)

Ah yes, I see.  My mistake.  Either old or new formulation is fine.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~