Direct block chaining is documented here:
https://qemu.readthedocs.io/en/latest/devel/tcg.html#direct-block-chaining
Recall that Hexagon allows packets with multiple jumps, where only the first
one with a true predicate will actually jump. So, we can only use direct
block chaining when the packet contains a single PC-relative jump. We add
the following fields to DisasContext in order to perform direct block
chaining at the end of packet commit (in gen_end_tb):
    has_single_direct_branch
        Indicates that we can use direct block chaining
    branch_cond
        The condition under which the branch is taken
    branch_dest
        The destination of the branch
Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
---
target/hexagon/translate.h | 3 +++
target/hexagon/genptr.c | 13 ++++++-------
target/hexagon/translate.c | 39 +++++++++++++++++++++++++++++++++++++-
3 files changed, 47 insertions(+), 8 deletions(-)
diff --git a/target/hexagon/translate.h b/target/hexagon/translate.h
index eae358cf33..e60dbf0e7a 100644
--- a/target/hexagon/translate.h
+++ b/target/hexagon/translate.h
@@ -54,6 +54,9 @@ typedef struct DisasContext {
bool qreg_is_predicated[NUM_QREGS];
int qreg_log_idx;
bool pre_commit;
+ bool has_single_direct_branch;
+ TCGv branch_cond;
+ target_ulong branch_dest;
} DisasContext;
static inline void ctx_log_reg_write(DisasContext *ctx, int rnum)
diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
index fba76d3b38..07b4326e56 100644
--- a/target/hexagon/genptr.c
+++ b/target/hexagon/genptr.c
@@ -505,15 +505,14 @@ static void gen_write_new_pc_pcrel(DisasContext *ctx, Packet *pkt,
gen_set_label(pred_false);
}
} else {
- TCGLabel *pred_false = NULL;
+ /* Defer this jump to the end of the TB */
+ g_assert(ctx->branch_cond == NULL);
+ ctx->has_single_direct_branch = true;
if (pred != NULL) {
- pred_false = gen_new_label();
- tcg_gen_brcondi_tl(TCG_COND_EQ, pred, 0, pred_false);
- }
- tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], dest);
- if (pred != NULL) {
- gen_set_label(pred_false);
+ ctx->branch_cond = tcg_temp_local_new();
+ tcg_gen_mov_tl(ctx->branch_cond, pred);
}
+ ctx->branch_dest = dest;
}
}
diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
index 71ad2da682..29e2caaf0f 100644
--- a/target/hexagon/translate.c
+++ b/target/hexagon/translate.c
@@ -116,10 +116,44 @@ static void gen_exec_counters(DisasContext *ctx)
hex_gpr[HEX_REG_QEMU_HVX_CNT], ctx->num_hvx_insns);
}
+static bool use_goto_tb(DisasContext *ctx, target_ulong dest)
+{
+ return translator_use_goto_tb(&ctx->base, dest);
+}
+
+static void gen_goto_tb(DisasContext *ctx, int idx, target_ulong dest)
+{
+ if (use_goto_tb(ctx, dest)) {
+ tcg_gen_goto_tb(idx);
+ tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], dest);
+ tcg_gen_exit_tb(ctx->base.tb, idx);
+ } else {
+ tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], dest);
+ tcg_gen_lookup_and_goto_ptr();
+ }
+}
+
static void gen_end_tb(DisasContext *ctx)
{
gen_exec_counters(ctx);
- tcg_gen_exit_tb(NULL, 0);
+
+ if (ctx->has_single_direct_branch) {
+ if (ctx->branch_cond != NULL) {
+ TCGLabel *skip = gen_new_label();
+ tcg_gen_brcondi_tl(TCG_COND_EQ, ctx->branch_cond, 0, skip);
+ gen_goto_tb(ctx, 0, ctx->branch_dest);
+ gen_set_label(skip);
+ gen_goto_tb(ctx, 1, ctx->next_PC);
+ tcg_temp_free(ctx->branch_cond);
+ ctx->branch_cond = NULL;
+ } else {
+ gen_goto_tb(ctx, 0, ctx->branch_dest);
+ }
+ } else {
+ tcg_gen_lookup_and_goto_ptr();
+ }
+
+ g_assert(ctx->branch_cond == NULL);
ctx->base.is_jmp = DISAS_NORETURN;
}
@@ -803,6 +837,9 @@ static void hexagon_tr_init_disas_context(DisasContextBase *dcbase,
static void hexagon_tr_tb_start(DisasContextBase *db, CPUState *cpu)
{
+ DisasContext *ctx = container_of(db, DisasContext, base);
+ ctx->has_single_direct_branch = false;
+ ctx->branch_cond = NULL;
}
static void hexagon_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu)
--
2.17.1
Taylor Simpson <tsimpson@quicinc.com> wrote:
>
> diff --git a/target/hexagon/translate.h b/target/hexagon/translate.h
> index eae358cf33..e60dbf0e7a 100644
> --- a/target/hexagon/translate.h
> +++ b/target/hexagon/translate.h
> @@ -54,6 +54,9 @@ typedef struct DisasContext {
> bool qreg_is_predicated[NUM_QREGS];
> int qreg_log_idx;
> bool pre_commit;
> + bool has_single_direct_branch;
> + TCGv branch_cond;
> + target_ulong branch_dest;
> } DisasContext;
>
> static inline void ctx_log_reg_write(DisasContext *ctx, int rnum)
> diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
> index fba76d3b38..07b4326e56 100644
> --- a/target/hexagon/genptr.c
> +++ b/target/hexagon/genptr.c
> @@ -505,15 +505,14 @@ static void gen_write_new_pc_pcrel(DisasContext *ctx, Packet *pkt,
> gen_set_label(pred_false);
> }
> } else {
> - TCGLabel *pred_false = NULL;
> + /* Defer this jump to the end of the TB */
> + g_assert(ctx->branch_cond == NULL);
> + ctx->has_single_direct_branch = true;
> if (pred != NULL) {
> - pred_false = gen_new_label();
> - tcg_gen_brcondi_tl(TCG_COND_EQ, pred, 0, pred_false);
> - }
> - tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], dest);
> - if (pred != NULL) {
> - gen_set_label(pred_false);
> + ctx->branch_cond = tcg_temp_local_new();
> + tcg_gen_mov_tl(ctx->branch_cond, pred);
> }
> + ctx->branch_dest = dest;
> }
> }
Do we want to perform this logic at gen_write_new_pc_addr() as well?
Although, in that case, we would need a separate ctx->branch_dest to
hold a TCGv instead of target_ulong...
Or have a single variable (TCGv) but add an extra
tcg_gen_addi(ctx->branch_dest, tcg_gen_constant_tl(pkt->pc), pc_off)
call to gen_write_new_pc_pcrel(). (In which case, we could also
unify the two gen_write_new_pc_* functions and have one as a thin
wrapper around the other.) IDK about the extra overhead from
tcg_gen_addi(), though.
> -----Original Message-----
> From: Matheus Tavares Bernardino <quic_mathbern@quicinc.com>
> Sent: Thursday, October 20, 2022 10:25 AM
> To: Taylor Simpson <tsimpson@quicinc.com>
> Cc: ale@rev.ng; anjo@rev.ng; Brian Cain <bcain@quicinc.com>;
> philmd@linaro.org; qemu-devel@nongnu.org; Matheus Bernardino (QUIC)
> <quic_mathbern@quicinc.com>; richard.henderson@linaro.org
> Subject: Re: [PATCH 7/8] Hexagon (target/hexagon) Use direct block chaining
> for direct jump/branch
>
>
> Do we want to perform this logic at gen_write_new_pc_addr() as well?
>
> Although, in that case, we would need a separate ctx->branch_dest to hold a
> TCGv instead of target_ulong...
>
> Or have a single variable (TCGv) but add an extra
> tcg_gen_addi(ctx->branch_dest, tcg_gen_constant_tl(pkt->pc), pc_off) call to
> gen_write_new_pc_pcrel(). (In which case, we could also unify the two
> gen_write_new_pc_* functions and have one as a thin wrapper around the
> other.) IDK about the extra overhead from tcg_gen_addi(), though.

We get the best performance from direct block chaining when the destination
is a constant at translation time (i.e., a direct branch) because we can use
goto_tb + exit_tb. Take a look at gen_end_tb in translate.c to see how this
is ultimately done. For a single direct branch and a tight loop, we get the
best performance. Otherwise, we use tcg_gen_lookup_and_goto_ptr which is
still better than what we were doing before.

Thanks,
Taylor
© 2016 - 2026 Red Hat, Inc.