On x86, this is more efficient because it enables generation of
more LEA instructions.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/tcg/tcg.h     |  4 ++++
 tcg/i386/tcg-target.h |  2 ++
 tcg/tcg-op.c          | 20 ++++++++++++++++----
3 files changed, 22 insertions(+), 4 deletions(-)
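As an illustration of the LEA point in the commit message (a hand-written
sketch, not actual generated code; registers are made up), consider
t0 = t1 - 8 with t0 and t1 in different host registers:

    # sub form: needs a copy first, because x86 sub is two-operand
    mov   %rbx, %rax
    sub   $8, %rax

    # add-of-negated-constant form: a single, non-destructive LEA
    lea   -8(%rbx), %rax

The i386 backend already uses LEA for three-operand addition with an
immediate, so expressing subi as an add of -arg2 lets more cases fall
into that path.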
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index a9282cdcc60..48e5aeef173 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -109,6 +109,10 @@ typedef uint64_t TCGRegSet;
 #define TCG_TARGET_HAS_sub2_i32 1
 #endif
 
+#ifndef TCG_TARGET_PREFERS_addi
+#define TCG_TARGET_PREFERS_addi 0
+#endif
+
 #ifndef TCG_TARGET_deposit_i32_valid
 #define TCG_TARGET_deposit_i32_valid(ofs, len) 1
 #endif
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 8417ea4899e..452c6eba2d6 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -227,6 +227,8 @@ typedef enum {
 #define TCG_TARGET_HAS_bitsel_vec have_avx512vl
 #define TCG_TARGET_HAS_cmpsel_vec -1
 
+#define TCG_TARGET_PREFERS_addi 1
+
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
     (((ofs) == 0 && ((len) == 8 || (len) == 16)) || \
      (TCG_TARGET_REG_BITS == 32 && (ofs) == 8 && (len) == 8))
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 828eb9ee460..48c667032de 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -151,6 +151,8 @@ void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
+    } else if (TCG_TARGET_PREFERS_addi) {
+        tcg_gen_add_i32(ret, arg1, tcg_constant_i32(-arg2));
     } else {
         tcg_gen_sub_i32(ret, arg1, tcg_constant_i32(arg2));
     }
@@ -1369,11 +1371,21 @@ void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
     if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
     } else if (TCG_TARGET_REG_BITS == 64) {
-        tcg_gen_sub_i64(ret, arg1, tcg_constant_i64(arg2));
+        if (TCG_TARGET_PREFERS_addi) {
+            tcg_gen_add_i64(ret, arg1, tcg_constant_i64(-arg2));
+        } else {
+            tcg_gen_sub_i64(ret, arg1, tcg_constant_i64(arg2));
+        }
     } else {
-        tcg_gen_sub2_i32(TCGV_LOW(ret), TCGV_HIGH(ret),
-                         TCGV_LOW(arg1), TCGV_HIGH(arg1),
-                         tcg_constant_i32(arg2), tcg_constant_i32(arg2 >> 32));
+        if (TCG_TARGET_PREFERS_addi) {
+            tcg_gen_add2_i32(TCGV_LOW(ret), TCGV_HIGH(ret),
+                             TCGV_LOW(arg1), TCGV_HIGH(arg1),
+                             tcg_constant_i32(-arg2), tcg_constant_i32(-arg2 >> 32));
+        } else {
+            tcg_gen_sub2_i32(TCGV_LOW(ret), TCGV_HIGH(ret),
+                             TCGV_LOW(arg1), TCGV_HIGH(arg1),
+                             tcg_constant_i32(arg2), tcg_constant_i32(arg2 >> 32));
+        }
     }
 }
 
--
2.41.0
On 10/25/23 11:59, Paolo Bonzini wrote:
> On x86, this is more efficient because it enables generation of
> more LEA instructions.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  include/tcg/tcg.h     |  4 ++++
>  tcg/i386/tcg-target.h |  2 ++
>  tcg/tcg-op.c          | 20 ++++++++++++++++----
>  3 files changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
> index a9282cdcc60..48e5aeef173 100644
> --- a/include/tcg/tcg.h
> +++ b/include/tcg/tcg.h
> @@ -109,6 +109,10 @@ typedef uint64_t TCGRegSet;
>  #define TCG_TARGET_HAS_sub2_i32 1
>  #endif
>
> +#ifndef TCG_TARGET_PREFERS_addi
> +#define TCG_TARGET_PREFERS_addi 0
> +#endif

I would rather do this unconditionally.

Many of the tcg backends do this manually, e.g. s390x:

    case INDEX_op_sub_i64:
        a0 = args[0], a1 = args[1], a2 = args[2];
        if (const_args[2]) {
            a2 = -a2;
            goto do_addi_64;
        } else {
            tcg_out_insn(s, RRFa, SGRK, a0, a1, a2);
        }
        break;

While we could do something similar for i386, it would be better to not
require such hoops in each backend.

We would also want to perform this transformation in optimize.c.

r~
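For reference, a rough sketch of what that canonicalization might look
like in tcg/optimize.c's fold_sub() (illustrative only; existing foldings
such as sub-to-neg are elided, and helper names like arg_new_constant()
are assumptions that may not match the actual tree):

    static bool fold_sub(OptContext *ctx, TCGOp *op)
    {
        if (fold_const2(ctx, op) || fold_xx_to_i(ctx, op, 0)) {
            return true;
        }

        /* Canonicalize "sub r, x, const" into "add r, x, -const". */
        if (arg_is_const(op->args[2])) {
            uint64_t val = arg_info(op->args[2])->val;

            op->opc = (ctx->type == TCG_TYPE_I32
                       ? INDEX_op_add_i32 : INDEX_op_add_i64);
            op->args[2] = arg_new_constant(ctx, -val);
        }
        return false;
    }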