All of the inputs to these instructions are 32 bits. Rather than
extend each input to 64 bits and then extract the high 32 bits of
the output, use tcg_gen_muls2_i32 and other 32-bit generator functions.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/translate.c | 72 +++++++++++++++---------------------------
1 file changed, 26 insertions(+), 46 deletions(-)
diff --git a/target/arm/translate.c b/target/arm/translate.c
index ddc54e77e4..77154be743 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -391,34 +391,6 @@ static void gen_revsh(TCGv_i32 var)
tcg_gen_ext16s_i32(var, var);
}
-/* Return (b << 32) + a. Mark inputs as dead */
-static TCGv_i64 gen_addq_msw(TCGv_i64 a, TCGv_i32 b)
-{
- TCGv_i64 tmp64 = tcg_temp_new_i64();
-
- tcg_gen_extu_i32_i64(tmp64, b);
- tcg_temp_free_i32(b);
- tcg_gen_shli_i64(tmp64, tmp64, 32);
- tcg_gen_add_i64(a, tmp64, a);
-
- tcg_temp_free_i64(tmp64);
- return a;
-}
-
-/* Return (b << 32) - a. Mark inputs as dead. */
-static TCGv_i64 gen_subq_msw(TCGv_i64 a, TCGv_i32 b)
-{
- TCGv_i64 tmp64 = tcg_temp_new_i64();
-
- tcg_gen_extu_i32_i64(tmp64, b);
- tcg_temp_free_i32(b);
- tcg_gen_shli_i64(tmp64, tmp64, 32);
- tcg_gen_sub_i64(a, tmp64, a);
-
- tcg_temp_free_i64(tmp64);
- return a;
-}
-
/* 32x32->64 multiply. Marks inputs as dead. */
static TCGv_i64 gen_mulu_i64_i32(TCGv_i32 a, TCGv_i32 b)
{
@@ -8872,23 +8844,27 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
(SMMUL, SMMLA, SMMLS) */
tmp = load_reg(s, rm);
tmp2 = load_reg(s, rs);
- tmp64 = gen_muls_i64_i32(tmp, tmp2);
+ tcg_gen_muls2_i32(tmp2, tmp, tmp, tmp2);
if (rd != 15) {
- tmp = load_reg(s, rd);
+ tmp3 = load_reg(s, rd);
if (insn & (1 << 6)) {
- tmp64 = gen_subq_msw(tmp64, tmp);
+ tcg_gen_sub_i32(tmp, tmp, tmp3);
} else {
- tmp64 = gen_addq_msw(tmp64, tmp);
+ tcg_gen_add_i32(tmp, tmp, tmp3);
}
+ tcg_temp_free_i32(tmp3);
}
if (insn & (1 << 5)) {
- tcg_gen_addi_i64(tmp64, tmp64, 0x80000000u);
+ /*
+ * Adding 0x80000000 to the 64-bit quantity
+ * means that we have carry in to the high
+ * word when the low word has the high bit set.
+ */
+ tcg_gen_shri_i32(tmp2, tmp2, 31);
+ tcg_gen_add_i32(tmp, tmp, tmp2);
}
- tcg_gen_shri_i64(tmp64, tmp64, 32);
- tmp = tcg_temp_new_i32();
- tcg_gen_extrl_i64_i32(tmp, tmp64);
- tcg_temp_free_i64(tmp64);
+ tcg_temp_free_i32(tmp2);
store_reg(s, rn, tmp);
break;
case 0:
@@ -10114,22 +10090,26 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn)
}
break;
case 5: case 6: /* 32 * 32 -> 32msb (SMMUL, SMMLA, SMMLS) */
- tmp64 = gen_muls_i64_i32(tmp, tmp2);
+ tcg_gen_muls2_i32(tmp2, tmp, tmp, tmp2);
if (rs != 15) {
- tmp = load_reg(s, rs);
+ tmp3 = load_reg(s, rs);
if (insn & (1 << 20)) {
- tmp64 = gen_addq_msw(tmp64, tmp);
+ tcg_gen_add_i32(tmp, tmp, tmp3);
} else {
- tmp64 = gen_subq_msw(tmp64, tmp);
+ tcg_gen_sub_i32(tmp, tmp, tmp3);
}
+ tcg_temp_free_i32(tmp3);
}
if (insn & (1 << 4)) {
- tcg_gen_addi_i64(tmp64, tmp64, 0x80000000u);
+ /*
+ * Adding 0x80000000 to the 64-bit quantity
+ * means that we have carry in to the high
+ * word when the low word has the high bit set.
+ */
+ tcg_gen_shri_i32(tmp2, tmp2, 31);
+ tcg_gen_add_i32(tmp, tmp, tmp2);
}
- tcg_gen_shri_i64(tmp64, tmp64, 32);
- tmp = tcg_temp_new_i32();
- tcg_gen_extrl_i64_i32(tmp, tmp64);
- tcg_temp_free_i64(tmp64);
+ tcg_temp_free_i32(tmp2);
break;
case 7: /* Unsigned sum of absolute differences. */
gen_helper_usad8(tmp, tmp, tmp2);
--
2.17.1
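
For the rounding step, the equivalence the patch's comment relies on —
adding 0x80000000 to the 64-bit product carries into the high word
exactly when bit 31 of the low word is set — can be checked in plain C
(a standalone sketch, not QEMU code; the test values are arbitrary):

    #include <stdint.h>
    #include <stdio.h>
    #include <assert.h>

    int main(void)
    {
        int32_t tests[] = { 0, 1, -1, 0x7fffffff, INT32_MIN, 12345, -98765 };
        size_t n = sizeof(tests) / sizeof(tests[0]);
        for (size_t i = 0; i < n; i++) {
            for (size_t j = 0; j < n; j++) {
                /* hi:lo split of the signed product, as muls2 produces it */
                int64_t prod = (int64_t)tests[i] * tests[j];
                uint32_t lo = (uint32_t)prod;
                uint32_t hi = (uint32_t)((uint64_t)prod >> 32);
                /* 64-bit reference: add 0x80000000, keep the high word */
                uint32_t ref = (uint32_t)(((uint64_t)prod + 0x80000000u) >> 32);
                /* 32-bit sequence from the patch: hi += lo >> 31 */
                uint32_t got = hi + (lo >> 31);
                assert(ref == got);
            }
        }
        puts("rounding identity holds");
        return 0;
    }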
Hi Richard,
On Thu, Aug 8, 2019 at 10:28 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> All of the inputs to these instructions are 32 bits. Rather than
> extend each input to 64 bits and then extract the high 32 bits of
> the output, use tcg_gen_muls2_i32 and other 32-bit generator functions.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> target/arm/translate.c | 72 +++++++++++++++---------------------------
> 1 file changed, 26 insertions(+), 46 deletions(-)
>
> diff --git a/target/arm/translate.c b/target/arm/translate.c
> index ddc54e77e4..77154be743 100644
> --- a/target/arm/translate.c
> +++ b/target/arm/translate.c
> @@ -391,34 +391,6 @@ static void gen_revsh(TCGv_i32 var)
> tcg_gen_ext16s_i32(var, var);
> }
>
> -/* Return (b << 32) + a. Mark inputs as dead */
> -static TCGv_i64 gen_addq_msw(TCGv_i64 a, TCGv_i32 b)
> -{
> - TCGv_i64 tmp64 = tcg_temp_new_i64();
> -
> - tcg_gen_extu_i32_i64(tmp64, b);
> - tcg_temp_free_i32(b);
> - tcg_gen_shli_i64(tmp64, tmp64, 32);
> - tcg_gen_add_i64(a, tmp64, a);
> -
> - tcg_temp_free_i64(tmp64);
> - return a;
> -}
> -
> -/* Return (b << 32) - a. Mark inputs as dead. */
> -static TCGv_i64 gen_subq_msw(TCGv_i64 a, TCGv_i32 b)
> -{
> - TCGv_i64 tmp64 = tcg_temp_new_i64();
> -
> - tcg_gen_extu_i32_i64(tmp64, b);
> - tcg_temp_free_i32(b);
> - tcg_gen_shli_i64(tmp64, tmp64, 32);
> - tcg_gen_sub_i64(a, tmp64, a);
> -
> - tcg_temp_free_i64(tmp64);
> - return a;
> -}
> -
> /* 32x32->64 multiply. Marks inputs as dead. */
> static TCGv_i64 gen_mulu_i64_i32(TCGv_i32 a, TCGv_i32 b)
> {
> @@ -8872,23 +8844,27 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
> (SMMUL, SMMLA, SMMLS) */
> tmp = load_reg(s, rm);
> tmp2 = load_reg(s, rs);
> - tmp64 = gen_muls_i64_i32(tmp, tmp2);
> + tcg_gen_muls2_i32(tmp2, tmp, tmp, tmp2);
>
> if (rd != 15) {
> - tmp = load_reg(s, rd);
> + tmp3 = load_reg(s, rd);
> if (insn & (1 << 6)) {
> - tmp64 = gen_subq_msw(tmp64, tmp);
> + tcg_gen_sub_i32(tmp, tmp, tmp3);
Shouldn't you subtract tmp from tmp3? SMMLS computes
(Ra << 32) - (Rn * Rm), so the high word of the result should be the
accumulator minus the product's high half, not the other way around.
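(i.e. something like tcg_gen_sub_i32(tmp, tmp3, tmp); — though, as
noted further down, that alone would still lose the borrow out of the
low half of the product.)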
> } else {
> - tmp64 = gen_addq_msw(tmp64, tmp);
> + tcg_gen_add_i32(tmp, tmp, tmp3);
> }
> + tcg_temp_free_i32(tmp3);
> }
> if (insn & (1 << 5)) {
> - tcg_gen_addi_i64(tmp64, tmp64, 0x80000000u);
> + /*
> + * Adding 0x80000000 to the 64-bit quantity
> + * means that we have carry in to the high
> + * word when the low word has the high bit set.
> + */
> + tcg_gen_shri_i32(tmp2, tmp2, 31);
> + tcg_gen_add_i32(tmp, tmp, tmp2);
> }
> - tcg_gen_shri_i64(tmp64, tmp64, 32);
> - tmp = tcg_temp_new_i32();
> - tcg_gen_extrl_i64_i32(tmp, tmp64);
> - tcg_temp_free_i64(tmp64);
> + tcg_temp_free_i32(tmp2);
> store_reg(s, rn, tmp);
> break;
> case 0:
> @@ -10114,22 +10090,26 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn)
> }
> break;
> case 5: case 6: /* 32 * 32 -> 32msb (SMMUL, SMMLA, SMMLS) */
> - tmp64 = gen_muls_i64_i32(tmp, tmp2);
> + tcg_gen_muls2_i32(tmp2, tmp, tmp, tmp2);
> if (rs != 15) {
> - tmp = load_reg(s, rs);
> + tmp3 = load_reg(s, rs);
> if (insn & (1 << 20)) {
> - tmp64 = gen_addq_msw(tmp64, tmp);
> + tcg_gen_add_i32(tmp, tmp, tmp3);
> } else {
> - tmp64 = gen_subq_msw(tmp64, tmp);
> + tcg_gen_sub_i32(tmp, tmp, tmp3);
Same here.
Also, the way you do the computation means you don't propagate the
borrow from the lower 32 bits of the 64-bit product when doing the
subtraction.
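One way to get both the operand order and the borrow right while
staying with 32-bit ops might be the double-word subtract (untested
sketch; tcg_gen_sub2_i32 computes {rh:rl} = {ah:al} - {bh:bl}):

    /* SMMLS: {tmp3:0} - {tmp:tmp2}, i.e. (Ra << 32) - product,
     * propagating the borrow out of the low half. */
    TCGv_i32 zero = tcg_const_i32(0);
    tcg_gen_sub2_i32(tmp2, tmp, zero, tmp3, tmp2, tmp);
    tcg_temp_free_i32(zero);

The result's low half lands in tmp2, so the rounding step that follows
would still work unchanged.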
Thanks,
Laurent
> }
> + tcg_temp_free_i32(tmp3);
> }
> if (insn & (1 << 4)) {
> - tcg_gen_addi_i64(tmp64, tmp64, 0x80000000u);
> + /*
> + * Adding 0x80000000 to the 64-bit quantity
> + * means that we have carry in to the high
> + * word when the low word has the high bit set.
> + */
> + tcg_gen_shri_i32(tmp2, tmp2, 31);
> + tcg_gen_add_i32(tmp, tmp, tmp2);
> }
> - tcg_gen_shri_i64(tmp64, tmp64, 32);
> - tmp = tcg_temp_new_i32();
> - tcg_gen_extrl_i64_i32(tmp, tmp64);
> - tcg_temp_free_i64(tmp64);
> + tcg_temp_free_i32(tmp2);
> break;
> case 7: /* Unsigned sum of absolute differences. */
> gen_helper_usad8(tmp, tmp, tmp2);
> --
> 2.17.1
>
>
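
To make the SMMLS problem concrete, here is a plain-C sketch
(hypothetical operand values) comparing the architectural result
against the patch's 32-bit sequence, with and without the operand
order fixed:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int32_t rn = 3, rm = 5, ra = 7;   /* hypothetical register values */
        int64_t prod = (int64_t)rn * rm;  /* 15: hi = 0, lo = 15 */

        /* Architectural SMMLS: ((Ra << 32) - Rn*Rm) >> 32 */
        int32_t ref = (int32_t)((((int64_t)ra << 32) - prod) >> 32);  /* 6 */

        /* Patch's sequence: hi(prod) - Ra, no borrow handling */
        int32_t hi = (int32_t)(prod >> 32);
        int32_t patch = hi - ra;                                      /* -7 */

        /* Order fixed but borrow still dropped: Ra - hi(prod) */
        int32_t no_borrow = ra - hi;                                  /* 7 */

        printf("ref=%d patch=%d order-fixed=%d\n",
               (int)ref, (int)patch, (int)no_borrow);
        return 0;
    }

Only the 64-bit reference yields 6; the borrow out of the low half is
what takes the high word from 7 down to 6.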