tcg/loongarch64/tcg-target.c.inc | 56 +++++++++++++++++++++++++++++--- tcg/loongarch64/tcg-target.h | 5 ++- 2 files changed, 53 insertions(+), 8 deletions(-)
Similar to the ARM64, LoongArch has PC-relative instructions such as
PCADDU18I. These instructions can be used to support direct jump for
LoongArch. Additionally, if instruction "B offset" can cover the target
address(target is within ±128MB range), a single "B offset" plus a nop
will be used by "tb_target_set_jump_target".
Signed-off-by: Qi Hu <huqi@loongson.cn>
---
tcg/loongarch64/tcg-target.c.inc | 56 +++++++++++++++++++++++++++++---
tcg/loongarch64/tcg-target.h | 5 ++-
2 files changed, 53 insertions(+), 8 deletions(-)
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index f5a214a17f..2a068a52cc 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -1031,6 +1031,35 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
#endif
}
+/* LoongArch use `andi zero, zero, 0` as NOP. */
+#define NOP OPC_ANDI
+static void tcg_out_nop(TCGContext *s)
+{
+ tcg_out32(s, NOP);
+}
+
+void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
+ uintptr_t jmp_rw, uintptr_t addr)
+{
+ tcg_insn_unit i1, i2;
+ ptrdiff_t upper, lower;
+ ptrdiff_t offset = (addr - jmp_rx) >> 2;
+
+ if (offset == sextreg(offset, 0, 28)) {
+ i1 = encode_sd10k16_insn(OPC_B, offset);
+ i2 = NOP;
+ } else {
+ lower = (int16_t)offset;
+ upper = (offset - lower) >> 16;
+
+ i1 = encode_dsj20_insn(OPC_PCADDU18I, TCG_REG_T0, upper);
+ i2 = encode_djsk16_insn(OPC_JIRL, TCG_REG_ZERO, TCG_REG_T0, lower);
+ }
+ uint64_t pair = ((uint64_t)i2 << 32) | i1;
+ qatomic_set((uint64_t *)jmp_rw, pair);
+ flush_idcache_range(jmp_rx, jmp_rw, 8);
+}
+
/*
* Entry-points
*/
@@ -1058,11 +1087,28 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
break;
case INDEX_op_goto_tb:
- assert(s->tb_jmp_insn_offset == 0);
- /* indirect jump method */
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
- (uintptr_t)(s->tb_jmp_target_addr + a0));
- tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
+ if (s->tb_jmp_insn_offset != NULL) {
+ /* TCG_TARGET_HAS_direct_jump */
+ /*
+ * Ensure that "patch area" are 8-byte aligned so that an
+ * atomic write can be used to patch the target address.
+ */
+ if ((uintptr_t)s->code_ptr & 7) {
+ tcg_out_nop(s);
+ }
+ s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
+ /*
+ * actual branch destination will be patched by
+ * tb_target_set_jmp_target later
+ */
+ tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
+ tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
+ } else {
+ /* !TCG_TARGET_HAS_direct_jump */
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
+ (uintptr_t)(s->tb_jmp_target_addr + a0));
+ tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
+ }
set_jmp_reset_offset(s, a0);
break;
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index 67380b2432..ee207ec32c 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -42,7 +42,7 @@
#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_NB_REGS 32
-#define MAX_CODE_GEN_BUFFER_SIZE SIZE_MAX
+#define MAX_CODE_GEN_BUFFER_SIZE (128 * GiB)
typedef enum {
TCG_REG_ZERO,
@@ -123,7 +123,7 @@ typedef enum {
#define TCG_TARGET_HAS_clz_i32 1
#define TCG_TARGET_HAS_ctz_i32 1
#define TCG_TARGET_HAS_ctpop_i32 0
-#define TCG_TARGET_HAS_direct_jump 0
+#define TCG_TARGET_HAS_direct_jump 1
#define TCG_TARGET_HAS_brcond2 0
#define TCG_TARGET_HAS_setcond2 0
#define TCG_TARGET_HAS_qemu_st8_i32 0
@@ -166,7 +166,6 @@ typedef enum {
#define TCG_TARGET_HAS_muluh_i64 1
#define TCG_TARGET_HAS_mulsh_i64 1
-/* not defined -- call should be eliminated at compile time */
void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
#define TCG_TARGET_DEFAULT_MO (0)
--
2.37.3
On 10/14/22 11:40, Qi Hu wrote: > Similar to the ARM64, LoongArch has PC-relative instructions such as > PCADDU18I. These instructions can be used to support direct jump for > LoongArch. Additionally, if instruction "B offset" can cover the target > address(target is within ±128MB range), a single "B offset" plus a nop > will be used by "tb_target_set_jump_target". > > Signed-off-by: Qi Hu <huqi@loongson.cn> > --- > tcg/loongarch64/tcg-target.c.inc | 56 +++++++++++++++++++++++++++++--- > tcg/loongarch64/tcg-target.h | 5 ++- > 2 files changed, 53 insertions(+), 8 deletions(-) > > diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc > index f5a214a17f..2a068a52cc 100644 > --- a/tcg/loongarch64/tcg-target.c.inc > +++ b/tcg/loongarch64/tcg-target.c.inc > @@ -1031,6 +1031,35 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args) > #endif > } > > +/* LoongArch use `andi zero, zero, 0` as NOP. */ "uses" > +#define NOP OPC_ANDI > +static void tcg_out_nop(TCGContext *s) > +{ > + tcg_out32(s, NOP); > +} > + > +void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx, > + uintptr_t jmp_rw, uintptr_t addr) > +{ > + tcg_insn_unit i1, i2; > + ptrdiff_t upper, lower; > + ptrdiff_t offset = (addr - jmp_rx) >> 2; > + > + if (offset == sextreg(offset, 0, 28)) { You've already shifted by 2 above, so here you should check if `offset` fits in 26 bits instead. > + i1 = encode_sd10k16_insn(OPC_B, offset); > + i2 = NOP; > + } else { Could also add an assertion that `offset` fits in 36 bits, as we're now bounded by `MAX_CODEGEN_BUFFER_SIZE` and don't really have the capability to go larger than that any more, unlike when indirect jumps were used. > + lower = (int16_t)offset; > + upper = (offset - lower) >> 16; > + > + i1 = encode_dsj20_insn(OPC_PCADDU18I, TCG_REG_T0, upper); > + i2 = encode_djsk16_insn(OPC_JIRL, TCG_REG_ZERO, TCG_REG_T0, lower); Instead of hard-coding T0, what about simply using the TMP0 that's explicitly reserved as a scratch register? > + } > + uint64_t pair = ((uint64_t)i2 << 32) | i1; > + qatomic_set((uint64_t *)jmp_rw, pair); > + flush_idcache_range(jmp_rx, jmp_rw, 8); > +} > + > /* > * Entry-points > */ > @@ -1058,11 +1087,28 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, > break; > > case INDEX_op_goto_tb: > - assert(s->tb_jmp_insn_offset == 0); > - /* indirect jump method */ > - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO, > - (uintptr_t)(s->tb_jmp_target_addr + a0)); > - tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0); > + if (s->tb_jmp_insn_offset != NULL) { > + /* TCG_TARGET_HAS_direct_jump */ > + /* > + * Ensure that "patch area" are 8-byte aligned so that an "Ensure that the patch area is" > + * atomic write can be used to patch the target address. > + */ > + if ((uintptr_t)s->code_ptr & 7) { > + tcg_out_nop(s); > + } > + s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); > + /* > + * actual branch destination will be patched by > + * tb_target_set_jmp_target later > + */ > + tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0); > + tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0); > + } else { > + /* !TCG_TARGET_HAS_direct_jump */ > + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO, > + (uintptr_t)(s->tb_jmp_target_addr + a0)); > + tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0); Why not remove the else block altogether? Removing the respective unused code for aarch64 could be considered a separate logical change and I think we can do the loongarch64 part independently of whether it's appropriate/accepted for aarch64. > + } > set_jmp_reset_offset(s, a0); > break; > > diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h > index 67380b2432..ee207ec32c 100644 > --- a/tcg/loongarch64/tcg-target.h > +++ b/tcg/loongarch64/tcg-target.h > @@ -42,7 +42,7 @@ > > #define TCG_TARGET_INSN_UNIT_SIZE 4 > #define TCG_TARGET_NB_REGS 32 > -#define MAX_CODE_GEN_BUFFER_SIZE SIZE_MAX > +#define MAX_CODE_GEN_BUFFER_SIZE (128 * GiB) If I were you I'd put a comment above saying the 128GiB limit is due to the PCADDU18I + JIRL sequence giving a total of 20 + 16 + 2 = 38 bits for signed offsets, which is +/- 128GiB. But I'm fine without it too. > > typedef enum { > TCG_REG_ZERO, > @@ -123,7 +123,7 @@ typedef enum { > #define TCG_TARGET_HAS_clz_i32 1 > #define TCG_TARGET_HAS_ctz_i32 1 > #define TCG_TARGET_HAS_ctpop_i32 0 > -#define TCG_TARGET_HAS_direct_jump 0 > +#define TCG_TARGET_HAS_direct_jump 1 > #define TCG_TARGET_HAS_brcond2 0 > #define TCG_TARGET_HAS_setcond2 0 > #define TCG_TARGET_HAS_qemu_st8_i32 0 > @@ -166,7 +166,6 @@ typedef enum { > #define TCG_TARGET_HAS_muluh_i64 1 > #define TCG_TARGET_HAS_mulsh_i64 1 > > -/* not defined -- call should be eliminated at compile time */ > void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t); > > #define TCG_TARGET_DEFAULT_MO (0) With the nits addressed, Reviewed-by: WANG Xuerui <git@xen0n.name>
On 10/14/22 14:40, Qi Hu wrote: > Similar to the ARM64, LoongArch has PC-relative instructions such as > PCADDU18I. These instructions can be used to support direct jump for > LoongArch. Additionally, if instruction "B offset" can cover the target > address(target is within ±128MB range), a single "B offset" plus a nop > will be used by "tb_target_set_jump_target". > > Signed-off-by: Qi Hu<huqi@loongson.cn> > --- > tcg/loongarch64/tcg-target.c.inc | 56 +++++++++++++++++++++++++++++--- > tcg/loongarch64/tcg-target.h | 5 ++- > 2 files changed, 53 insertions(+), 8 deletions(-) Reviewed-by: Richard Henderson <richard.henderson@linaro.org> r~
© 2016 - 2024 Red Hat, Inc.