The expansions that we chose in tcg-op.c may be less than optimal.
Delay lowering until optimize, so that we have propagated constants
and have computed known zero/one masks.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
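For readers without the surrounding context: when the host lacks a usable
deposit instruction, the new fold_deposit() falls back to the familiar
and/shl/or sequence described in the comment near the end of the function
(tmp = arg2 & mask; ret = arg1 & ~(mask << ofs); tmp <<= ofs; ret |= tmp).
Below is a minimal standalone C sketch of that arithmetic, assuming plain
uint64_t operands; the helper name deposit64_lowered and the test values
are illustrative only and do not appear in the patch, where the sequence is
emitted as INDEX_op_and / INDEX_op_shl / INDEX_op_or TCG ops.

    #include <assert.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Generic fallback lowering of deposit(arg1, arg2, ofs, len). */
    static uint64_t deposit64_lowered(uint64_t arg1, uint64_t arg2,
                                      unsigned ofs, unsigned len)
    {
        /* len_mask covers the low LEN bits of the inserted value. */
        uint64_t len_mask = len == 64 ? ~(uint64_t)0
                                      : ((uint64_t)1 << len) - 1;
        uint64_t tmp = (arg2 & len_mask) << ofs;

        /* Clear the field in arg1, then OR in the shifted field. */
        return (arg1 & ~(len_mask << ofs)) | tmp;
    }

    int main(void)
    {
        /* Insert the low 8 bits of 0xab at bit offset 16 of 0x11223344. */
        uint64_t r = deposit64_lowered(0x11223344, 0xab, 16, 8);

        assert(r == 0x11ab3344);
        printf("0x%016" PRIx64 "\n", r);
        return 0;
    }

The earlier special cases in fold_deposit() (inserting all-zero or all-one,
inserting into zero) are just cheaper forms of this same arithmetic, chosen
when the propagated constants or the known zero/one masks prove parts of it
redundant.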
tcg/optimize.c | 194 +++++++++++++++++++++++++++++++++++++++++++------
tcg/tcg-op.c | 189 +++++++++++++++--------------------------------
2 files changed, 230 insertions(+), 153 deletions(-)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index f69702b26e..5df57049c2 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -1865,12 +1865,17 @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
static bool fold_deposit(OptContext *ctx, TCGOp *op)
{
- TempOptInfo *t1 = arg_info(op->args[1]);
- TempOptInfo *t2 = arg_info(op->args[2]);
+ TCGArg ret = op->args[0];
+ TCGArg arg1 = op->args[1];
+ TCGArg arg2 = op->args[2];
int ofs = op->args[3];
int len = op->args[4];
- int width = 8 * tcg_type_size(ctx->type);
- uint64_t z_mask, o_mask, s_mask;
+ TempOptInfo *t1 = arg_info(arg1);
+ TempOptInfo *t2 = arg_info(arg2);
+ int width;
+ uint64_t z_mask, o_mask, s_mask, type_mask, len_mask;
+ TCGOp *op2;
+ bool valid;
if (ti_is_const(t1) && ti_is_const(t2)) {
return tcg_opt_gen_movi(ctx, op, op->args[0],
@@ -1878,35 +1883,182 @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
ti_const_val(t2)));
}
- /* Inserting a value into zero at offset 0. */
- if (ti_is_const_val(t1, 0) && ofs == 0) {
- uint64_t mask = MAKE_64BIT_MASK(0, len);
+ width = 8 * tcg_type_size(ctx->type);
+ type_mask = MAKE_64BIT_MASK(0, width);
+ len_mask = MAKE_64BIT_MASK(0, len);
+ /* Inserting all-zero into a value. */
+ if ((t2->z_mask & len_mask) == 0) {
op->opc = INDEX_op_and;
- op->args[1] = op->args[2];
- op->args[2] = arg_new_constant(ctx, mask);
+ op->args[2] = arg_new_constant(ctx, ~(len_mask << ofs));
return fold_and(ctx, op);
}
- /* Inserting zero into a value. */
- if (ti_is_const_val(t2, 0)) {
- uint64_t mask = deposit64(-1, ofs, len, 0);
-
- op->opc = INDEX_op_and;
- op->args[2] = arg_new_constant(ctx, mask);
- return fold_and(ctx, op);
+ /* Inserting all-one into a value. */
+ if ((t2->o_mask & len_mask) == len_mask) {
+ op->opc = INDEX_op_or;
+ op->args[2] = arg_new_constant(ctx, len_mask << ofs);
+ return fold_or(ctx, op);
}
- /* The s_mask from the top portion of the deposit is still valid. */
- if (ofs + len == width) {
- s_mask = t2->s_mask << ofs;
- } else {
- s_mask = t1->s_mask & ~MAKE_64BIT_MASK(0, ofs + len);
+ valid = TCG_TARGET_deposit_valid(ctx->type, ofs, len);
+
+ /* Lower invalid deposit of constant as AND + OR. */
+ if (!valid && ti_is_const(t2)) {
+ uint64_t ins_val = (ti_const_val(t2) & len_mask) << ofs;
+
+ op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
+ op2->args[0] = ret;
+ op2->args[1] = arg1;
+ op2->args[2] = arg_new_constant(ctx, ~(len_mask << ofs));
+ fold_and(ctx, op2);
+
+ op->opc = INDEX_op_or;
+ op->args[1] = ret;
+ op->args[2] = arg_new_constant(ctx, ins_val);
+ return fold_or(ctx, op);
}
+ /*
+ * Compute result masks before calling other fold_* subroutines
+ * which could modify the masks of our inputs.
+ */
z_mask = deposit64(t1->z_mask, ofs, len, t2->z_mask);
o_mask = deposit64(t1->o_mask, ofs, len, t2->o_mask);
+ if (ofs + len < width) {
+ s_mask = t1->s_mask & ~MAKE_64BIT_MASK(0, ofs + len);
+ } else {
+ s_mask = t2->s_mask << ofs;
+ }
+ /* Inserting a value into zero. */
+ if (ti_is_const_val(t1, 0)) {
+ uint64_t need_mask;
+
+ /* Always lower deposit into zero at 0 as AND. */
+ if (ofs == 0) {
+ op->opc = INDEX_op_and;
+ op->args[1] = arg2;
+ op->args[2] = arg_new_constant(ctx, len_mask);
+ return fold_and(ctx, op);
+ }
+
+ /*
+ * If the portion of the value outside len that remains after
+ * shifting is zero, we can elide the mask and just shift.
+ */
+ need_mask = t2->z_mask & ~len_mask;
+ need_mask = (need_mask << ofs) & type_mask;
+ if (!need_mask) {
+ op->opc = INDEX_op_shl;
+ op->args[1] = arg2;
+ op->args[2] = arg_new_constant(ctx, ofs);
+ goto done;
+ }
+
+ /* Lower invalid deposit into zero as AND + SHL or SHL + AND. */
+ if (!valid) {
+ if (TCG_TARGET_extract_valid(ctx->type, 0, ofs + len) &&
+ !TCG_TARGET_extract_valid(ctx->type, 0, len)) {
+ op2 = opt_insert_before(ctx, op, INDEX_op_shl, 3);
+ op2->args[0] = ret;
+ op2->args[1] = arg2;
+ op2->args[2] = arg_new_constant(ctx, ofs);
+
+ op->opc = INDEX_op_extract;
+ op->args[1] = ret;
+ op->args[2] = 0;
+ op->args[3] = ofs + len;
+ goto done;
+ }
+
+ op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
+ op2->args[0] = ret;
+ op2->args[1] = arg2;
+ op2->args[2] = arg_new_constant(ctx, len_mask);
+ fold_and(ctx, op2);
+
+ op->opc = INDEX_op_shl;
+ op->args[1] = ret;
+ op->args[2] = arg_new_constant(ctx, ofs);
+ goto done;
+ }
+ }
+
+ /* After special cases, lower invalid deposit. */
+ if (!valid) {
+ TCGArg tmp;
+ bool has_ext2 = tcg_op_supported(INDEX_op_extract2, ctx->type, 0);
+ bool has_rotl = tcg_op_supported(INDEX_op_rotl, ctx->type, 0);
+
+ /*
+ * ret = arg2:arg1 >> len
+ * ret = rotl(ret, len)
+ */
+ if (ofs == 0 && has_ext2 && has_rotl) {
+ op2 = opt_insert_before(ctx, op, INDEX_op_extract2, 4);
+ op2->args[0] = ret;
+ op2->args[1] = arg1;
+ op2->args[2] = arg2;
+ op2->args[3] = len;
+
+ op->opc = INDEX_op_rotl;
+ op->args[1] = ret;
+ op->args[2] = arg_new_constant(ctx, len);
+ goto done;
+ }
+
+ /*
+ * tmp = arg1 << len
+ * ret = arg2:tmp >> len
+ */
+ if (ofs + len == width && has_ext2) {
+ tmp = ret == arg2 ? arg_new_temp(ctx) : ret;
+
+ op2 = opt_insert_before(ctx, op, INDEX_op_shl, 4);
+ op2->args[0] = tmp;
+ op2->args[1] = arg1;
+ op2->args[2] = arg_new_constant(ctx, len);
+
+ op->opc = INDEX_op_extract2;
+ op->args[0] = ret;
+ op->args[1] = tmp;
+ op->args[2] = arg2;
+ op->args[3] = len;
+ goto done;
+ }
+
+ /*
+ * tmp = arg2 & mask
+ * ret = arg1 & ~(mask << ofs)
+ * tmp = tmp << ofs
+ * ret = ret | tmp
+ */
+ tmp = arg_new_temp(ctx);
+
+ op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
+ op2->args[0] = tmp;
+ op2->args[1] = arg2;
+ op2->args[2] = arg_new_constant(ctx, len_mask);
+ fold_and(ctx, op2);
+
+ op2 = opt_insert_before(ctx, op, INDEX_op_shl, 3);
+ op2->args[0] = tmp;
+ op2->args[1] = tmp;
+ op2->args[2] = arg_new_constant(ctx, ofs);
+
+ op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
+ op2->args[0] = ret;
+ op2->args[1] = arg1;
+ op2->args[2] = arg_new_constant(ctx, ~(len_mask << ofs));
+ fold_and(ctx, op2);
+
+ op->opc = INDEX_op_or;
+ op->args[1] = ret;
+ op->args[2] = tmp;
+ }
+
+ done:
return fold_masks_zos(ctx, op, z_mask, o_mask, s_mask);
}
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index ab7b409be6..abce307f26 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -884,9 +884,6 @@ void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
unsigned int ofs, unsigned int len)
{
- uint32_t mask;
- TCGv_i32 t1;
-
tcg_debug_assert(ofs < 32);
tcg_debug_assert(len > 0);
tcg_debug_assert(len <= 32);
@@ -894,39 +891,9 @@ void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
if (len == 32) {
tcg_gen_mov_i32(ret, arg2);
- return;
- }
- if (TCG_TARGET_deposit_valid(TCG_TYPE_I32, ofs, len)) {
- tcg_gen_op5ii_i32(INDEX_op_deposit, ret, arg1, arg2, ofs, len);
- return;
- }
-
- t1 = tcg_temp_ebb_new_i32();
-
- if (tcg_op_supported(INDEX_op_extract2, TCG_TYPE_I32, 0)) {
- if (ofs + len == 32) {
- tcg_gen_shli_i32(t1, arg1, len);
- tcg_gen_extract2_i32(ret, t1, arg2, len);
- goto done;
- }
- if (ofs == 0) {
- tcg_gen_extract2_i32(ret, arg1, arg2, len);
- tcg_gen_rotli_i32(ret, ret, len);
- goto done;
- }
- }
-
- mask = (1u << len) - 1;
- if (ofs + len < 32) {
- tcg_gen_andi_i32(t1, arg2, mask);
- tcg_gen_shli_i32(t1, t1, ofs);
} else {
- tcg_gen_shli_i32(t1, arg2, ofs);
+ tcg_gen_op5ii_i32(INDEX_op_deposit, ret, arg1, arg2, ofs, len);
}
- tcg_gen_andi_i32(ret, arg1, ~(mask << ofs));
- tcg_gen_or_i32(ret, ret, t1);
- done:
- tcg_temp_free_i32(t1);
}
void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
@@ -940,28 +907,10 @@ void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
if (ofs + len == 32) {
tcg_gen_shli_i32(ret, arg, ofs);
} else if (ofs == 0) {
- tcg_gen_andi_i32(ret, arg, (1u << len) - 1);
- } else if (TCG_TARGET_deposit_valid(TCG_TYPE_I32, ofs, len)) {
+ tcg_gen_extract_i32(ret, arg, 0, len);
+ } else {
TCGv_i32 zero = tcg_constant_i32(0);
tcg_gen_op5ii_i32(INDEX_op_deposit, ret, zero, arg, ofs, len);
- } else {
- /*
- * To help two-operand hosts we prefer to zero-extend first,
- * which allows ARG to stay live.
- */
- if (TCG_TARGET_extract_valid(TCG_TYPE_I32, 0, len)) {
- tcg_gen_extract_i32(ret, arg, 0, len);
- tcg_gen_shli_i32(ret, ret, ofs);
- return;
- }
- /* Otherwise prefer zero-extension over AND for code size. */
- if (TCG_TARGET_extract_valid(TCG_TYPE_I32, 0, ofs + len)) {
- tcg_gen_shli_i32(ret, arg, ofs);
- tcg_gen_extract_i32(ret, ret, 0, ofs + len);
- return;
- }
- tcg_gen_andi_i32(ret, arg, (1u << len) - 1);
- tcg_gen_shli_i32(ret, ret, ofs);
}
}
@@ -2523,9 +2472,6 @@ void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
unsigned int ofs, unsigned int len)
{
- uint64_t mask;
- TCGv_i64 t1;
-
tcg_debug_assert(ofs < 64);
tcg_debug_assert(len > 0);
tcg_debug_assert(len <= 64);
@@ -2533,55 +2479,40 @@ void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
if (len == 64) {
tcg_gen_mov_i64(ret, arg2);
- return;
- }
-
- if (TCG_TARGET_REG_BITS == 64) {
- if (TCG_TARGET_deposit_valid(TCG_TYPE_I64, ofs, len)) {
- tcg_gen_op5ii_i64(INDEX_op_deposit, ret, arg1, arg2, ofs, len);
- return;
- }
+ } else if (TCG_TARGET_REG_BITS == 64) {
+ tcg_gen_op5ii_i64(INDEX_op_deposit, ret, arg1, arg2, ofs, len);
+ } else if (ofs >= 32) {
+ tcg_gen_deposit_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1),
+ TCGV_LOW(arg2), ofs - 32, len);
+ tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg1));
+ } else if (ofs + len <= 32) {
+ tcg_gen_deposit_i32(TCGV_LOW(ret), TCGV_LOW(arg1),
+ TCGV_LOW(arg2), ofs, len);
+ tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1));
+ } else if (ofs == 0) {
+ tcg_gen_deposit_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1),
+ TCGV_HIGH(arg2), 0, len - 32);
+ tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg2));
} else {
- if (ofs >= 32) {
- tcg_gen_deposit_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1),
- TCGV_LOW(arg2), ofs - 32, len);
- tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg1));
- return;
- }
- if (ofs + len <= 32) {
- tcg_gen_deposit_i32(TCGV_LOW(ret), TCGV_LOW(arg1),
- TCGV_LOW(arg2), ofs, len);
- tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1));
- return;
- }
- }
+ /* The 64-bit deposit is split across the 32-bit halves. */
+ unsigned lo_len = 32 - ofs;
+ unsigned hi_len = len - lo_len;
+ TCGv_i32 tl = tcg_temp_ebb_new_i32();
+ TCGv_i32 th = tcg_temp_ebb_new_i32();
- t1 = tcg_temp_ebb_new_i64();
-
- if (tcg_op_supported(INDEX_op_extract2, TCG_TYPE_I64, 0)) {
- if (ofs + len == 64) {
- tcg_gen_shli_i64(t1, arg1, len);
- tcg_gen_extract2_i64(ret, t1, arg2, len);
- goto done;
+ tcg_gen_deposit_i32(tl, TCGV_LOW(arg1), TCGV_LOW(arg2), ofs, lo_len);
+ if (len <= 32) {
+ tcg_gen_shri_i32(th, TCGV_LOW(arg2), lo_len);
+ } else {
+ tcg_gen_extract2_i32(th, TCGV_LOW(arg2), TCGV_HIGH(arg2), lo_len);
}
- if (ofs == 0) {
- tcg_gen_extract2_i64(ret, arg1, arg2, len);
- tcg_gen_rotli_i64(ret, ret, len);
- goto done;
- }
- }
+ tcg_gen_deposit_i32(th, TCGV_HIGH(arg1), th, 0, hi_len);
- mask = (1ull << len) - 1;
- if (ofs + len < 64) {
- tcg_gen_andi_i64(t1, arg2, mask);
- tcg_gen_shli_i64(t1, t1, ofs);
- } else {
- tcg_gen_shli_i64(t1, arg2, ofs);
+ tcg_gen_mov_i32(TCGV_LOW(ret), tl);
+ tcg_gen_mov_i32(TCGV_HIGH(ret), th);
+ tcg_temp_free_i32(tl);
+ tcg_temp_free_i32(th);
}
- tcg_gen_andi_i64(ret, arg1, ~(mask << ofs));
- tcg_gen_or_i64(ret, ret, t1);
- done:
- tcg_temp_free_i64(t1);
}
void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
@@ -2596,41 +2527,35 @@ void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
tcg_gen_shli_i64(ret, arg, ofs);
} else if (ofs == 0) {
tcg_gen_andi_i64(ret, arg, (1ull << len) - 1);
- } else if (TCG_TARGET_REG_BITS == 64 &&
- TCG_TARGET_deposit_valid(TCG_TYPE_I64, ofs, len)) {
+ } else if (TCG_TARGET_REG_BITS == 64) {
TCGv_i64 zero = tcg_constant_i64(0);
tcg_gen_op5ii_i64(INDEX_op_deposit, ret, zero, arg, ofs, len);
+ } else if (ofs >= 32) {
+ tcg_gen_deposit_z_i32(TCGV_HIGH(ret), TCGV_LOW(arg), ofs - 32, len);
+ tcg_gen_movi_i32(TCGV_LOW(ret), 0);
+ } else if (ofs + len <= 32) {
+ tcg_gen_deposit_z_i32(TCGV_LOW(ret), TCGV_LOW(arg), ofs, len);
+ tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+ } else if (ofs == 0) {
+ tcg_gen_deposit_z_i32(TCGV_HIGH(ret), TCGV_HIGH(arg), 0, len - 32);
+ tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
} else {
- if (TCG_TARGET_REG_BITS == 32) {
- if (ofs >= 32) {
- tcg_gen_deposit_z_i32(TCGV_HIGH(ret), TCGV_LOW(arg),
- ofs - 32, len);
- tcg_gen_movi_i32(TCGV_LOW(ret), 0);
- return;
- }
- if (ofs + len <= 32) {
- tcg_gen_deposit_z_i32(TCGV_LOW(ret), TCGV_LOW(arg), ofs, len);
- tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
- return;
- }
+ /* The 64-bit deposit is split across the 32-bit halves. */
+ unsigned lo_len = 32 - ofs;
+ unsigned hi_len = len - lo_len;
+ TCGv_i32 tl = tcg_temp_ebb_new_i32();
+ TCGv_i32 th = TCGV_HIGH(ret);
+
+ tcg_gen_shli_i32(tl, TCGV_LOW(arg), ofs);
+ if (len <= 32) {
+ tcg_gen_extract_i32(th, TCGV_LOW(arg), lo_len, hi_len);
+ } else {
+ tcg_gen_extract2_i32(th, TCGV_LOW(arg), TCGV_HIGH(arg), lo_len);
+ tcg_gen_extract_i32(th, th, 0, hi_len);
}
- /*
- * To help two-operand hosts we prefer to zero-extend first,
- * which allows ARG to stay live.
- */
- if (TCG_TARGET_extract_valid(TCG_TYPE_I64, 0, len)) {
- tcg_gen_extract_i64(ret, arg, 0, len);
- tcg_gen_shli_i64(ret, ret, ofs);
- return;
- }
- /* Otherwise prefer zero-extension over AND for code size. */
- if (TCG_TARGET_extract_valid(TCG_TYPE_I64, 0, ofs + len)) {
- tcg_gen_shli_i64(ret, arg, ofs);
- tcg_gen_extract_i64(ret, ret, 0, ofs + len);
- return;
- }
- tcg_gen_andi_i64(ret, arg, (1ull << len) - 1);
- tcg_gen_shli_i64(ret, ret, ofs);
+
+ tcg_gen_mov_i32(TCGV_LOW(ret), tl);
+ tcg_temp_free_i32(tl);
}
}
--
2.43.0