On 2/14/21 9:58 AM, Philippe Mathieu-Daudé wrote:
> Introduce the 'Parallel Extend Lower' opcodes:
In $SUBJECT: s/PEXTU/PEXTL/ (the subject says PEXTU, but this patch adds the 'Parallel Extend Lower' opcodes).
> + /* Lower halve */
> + for (int i = 0; i < 64 / (2 * wlen); i++) {
> + tcg_gen_deposit_i64(cpu_gpr[a->rd],
> + cpu_gpr[a->rd], bx, 2 * wlen * i, wlen);
> + tcg_gen_deposit_i64(cpu_gpr[a->rd],
> + cpu_gpr[a->rd], ax, 2 * wlen * i + wlen, wlen);
> + tcg_gen_shri_i64(bx, bx, wlen);
> + tcg_gen_shri_i64(ax, ax, wlen);
> + }
> + /* Upper halve */
> + for (int i = 0; i < 64 / (2 * wlen); i++) {
> + tcg_gen_deposit_i64(cpu_gpr_hi[a->rd],
> + cpu_gpr_hi[a->rd], bx, 2 * wlen * i, wlen);
> + tcg_gen_deposit_i64(cpu_gpr_hi[a->rd],
> + cpu_gpr_hi[a->rd], ax, 2 * wlen * i + wlen, wlen);
> + tcg_gen_shri_i64(bx, bx, wlen);
> + tcg_gen_shri_i64(ax, ax, wlen);
> + }
Right, so, this expands to (4 * 4 * 2) = 32 operations for pextlb, if deposit
is supported, or ((4*2 + 2) * 4 * 2) = 80 operations if not (4 per deposit).
We can do a bit better, though, exploiting parallelism.
/*
 * Zero-extend each of the four low bytes of @s into its own 16-bit lane
 * of @d: with s = abcdefgh (one letter per byte, most significant first)
 * the result is d = 0e0f0g0h.  The upper four bytes of @s are ignored.
 *
 * 5 or 8 operations, w/ or w/o deposit.
 */
void gen_widen_b(TCGv_i64 d, TCGv_i64 s)
{
    TCGv_i64 x = tcg_temp_new_i64();
    TCGv_i64 y = tcg_temp_new_i64();
    /* Bytes 5 and 1: the bytes that still have to move up by one. */
    TCGv_i64 m_move = tcg_constant_i64(0x0000ff000000ff00ull);
    /* Bytes 4 and 0: the bytes that are already in their final place. */
    TCGv_i64 m_keep = tcg_constant_i64(0x000000ff000000ffull);

    /* s = abcdefgh */
    tcg_gen_deposit_i64(x, s, s, 16, 48);
    /* x = cdefghgh */
    tcg_gen_and_i64(y, x, m_move);
    /* y = 00e000g0 */
    /*
     * A plain AND with the second mask is required here: clearing only
     * the m_move bytes (andc) would leave c, d, g, h behind in x and
     * corrupt the final OR.
     */
    tcg_gen_and_i64(x, x, m_keep);
    /* x = 000f000h */
    tcg_gen_shli_i64(y, y, 8);
    /* y = 0e000g00 */
    tcg_gen_or_i64(d, x, y);
    /* d = 0e0f0g0h */

    tcg_temp_free_i64(x);
    tcg_temp_free_i64(y);
}
/*
 * Interleave the four low bytes of @s with the four low bytes of @t
 * into the 64-bit value @d.  Both inputs are first widened with
 * gen_widen_b(); the widened @s is then shifted up by 8 bits so that,
 * in every 16-bit lane of @d, the byte from @s sits above the byte
 * from @t.
 *
 * 12 or 18 operations w/ or w/o deposit.
 */
void gen_pextb(TCGv_i64 d, TCGv_i64 s, TCGv_i64 t)
{
    TCGv_i64 x = tcg_temp_new_i64();

    gen_widen_b(x, s);
    /* The second operand must come from @t, not from @s again. */
    gen_widen_b(d, t);
    tcg_gen_shli_i64(x, x, 8);
    tcg_gen_or_i64(d, d, x);

    tcg_temp_free_i64(x);
}
then
/*
 * Call-site fragment.  NOTE(review): s and t are declared outside this
 * fragment -- presumably TCGv_i64 temporaries; confirm in the caller.
 */
/* Load the rs and rt operands into s and t. */
gen_read_gpr(s, a->rs);
gen_read_gpr(t, a->rt);
/* First gen_pextb() call writes cpu_gpr[rd] from s and t as loaded. */
gen_pextb(cpu_gpr[a->rd], s, t);
/* Shift both operands right by 32 bits before the second call. */
tcg_gen_shri_i64(s, s, 32);
tcg_gen_shri_i64(t, t, 32);
/* Second gen_pextb() call writes cpu_gpr_hi[rd] from the shifted operands. */
gen_pextb(cpu_gpr_hi[a->rd], s, t);
gives you the result in 26 or 38 operations.
Similarly
/*
 * Zero-extend the two low halfwords of @s into the two 32-bit lanes of
 * @d: with s = abcd (one letter per halfword, most significant first)
 * the result is d = 0c0d.
 */
void gen_widen_h(TCGv_i64 d, TCGv_i64 s)
{
    TCGv_i64 hw1 = tcg_temp_new_i64();

    /* s   = abcd */
    /* Keep only halfword 1 (bits 16..31), still in its original place. */
    tcg_gen_andi_i64(hw1, s, 0xffffu << 16);
    /* hw1 = 00c0 */

    /*
     * Bits 0..15 of the result come from s; bits 16..63 are replaced by
     * the low 48 bits of hw1, which moves c up into halfword 2.
     */
    tcg_gen_deposit_i64(d, s, hw1, 16, 48);
    /* d   = 0c0d */

    tcg_temp_free_i64(hw1);
}
r~