On 3/7/23 15:34, Richard Henderson wrote:
> Compute all carry bits in parallel instead of a loop.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
Hmmmm the function was added by 6addef4d27268 9 months ago. All tcg ops you used
here were available back then.
I guess this existing implementation was an oversight on our end.
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> Cc: Daniel Henrique Barboza <danielhb413@gmail.com>
> Cc: Cédric Le Goater <clg@kaod.org>
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Cc: Greg Kurz <groug@kaod.org>
> Cc: qemu-ppc@nongnu.org
> ---
> target/ppc/translate/fixedpoint-impl.c.inc | 44 +++++++++++-----------
> 1 file changed, 23 insertions(+), 21 deletions(-)
>
> diff --git a/target/ppc/translate/fixedpoint-impl.c.inc b/target/ppc/translate/fixedpoint-impl.c.inc
> index 20ea484c3d..02d86b77a8 100644
> --- a/target/ppc/translate/fixedpoint-impl.c.inc
> +++ b/target/ppc/translate/fixedpoint-impl.c.inc
> @@ -484,33 +484,35 @@ static bool trans_PEXTD(DisasContext *ctx, arg_X *a)
>
> static bool trans_ADDG6S(DisasContext *ctx, arg_X *a)
> {
> - const uint64_t carry_bits = 0x1111111111111111ULL;
> - TCGv t0, t1, carry, zero = tcg_constant_tl(0);
> + const target_ulong carry_bits = (target_ulong)-1 / 0xf;
> + TCGv in1, in2, carryl, carryh, tmp;
> + TCGv zero = tcg_constant_tl(0);
>
> REQUIRE_INSNS_FLAGS2(ctx, BCDA_ISA206);
>
> - t0 = tcg_temp_new();
> - t1 = tcg_const_tl(0);
> - carry = tcg_const_tl(0);
> + in1 = cpu_gpr[a->ra];
> + in2 = cpu_gpr[a->rb];
> + tmp = tcg_temp_new();
> + carryl = tcg_temp_new();
> + carryh = tcg_temp_new();
>
> - for (int i = 0; i < 16; i++) {
> - tcg_gen_shri_tl(t0, cpu_gpr[a->ra], i * 4);
> - tcg_gen_andi_tl(t0, t0, 0xf);
> - tcg_gen_add_tl(t1, t1, t0);
> + /* Addition with carry. */
> + tcg_gen_add2_tl(carryl, carryh, in1, zero, in2, zero);
> + /* Addition without carry. */
> + tcg_gen_xor_tl(tmp, in1, in2);
> + /* Difference between the two is carry in to each bit. */
> + tcg_gen_xor_tl(carryl, carryl, tmp);
>
> - tcg_gen_shri_tl(t0, cpu_gpr[a->rb], i * 4);
> - tcg_gen_andi_tl(t0, t0, 0xf);
> - tcg_gen_add_tl(t1, t1, t0);
> + /*
> + * The carry-out that we're looking for is the carry-in to
> + * the next nibble. Shift the double-word down one nibble,
> + * which puts all of the bits back into one word.
> + */
> + tcg_gen_extract2_tl(carryl, carryl, carryh, 4);
>
> - tcg_gen_andi_tl(t1, t1, 0x10);
> - tcg_gen_setcond_tl(TCG_COND_NE, t1, t1, zero);
> -
> - tcg_gen_shli_tl(t0, t1, i * 4);
> - tcg_gen_or_tl(carry, carry, t0);
> - }
> -
> - tcg_gen_xori_tl(carry, carry, (target_long)carry_bits);
> - tcg_gen_muli_tl(cpu_gpr[a->rt], carry, 6);
> + /* Invert, isolate the carry bits, and produce 6's. */
> + tcg_gen_andc_tl(carryl, tcg_constant_tl(carry_bits), carryl);
> + tcg_gen_muli_tl(cpu_gpr[a->rt], carryl, 6);
> return true;
> }
>