[Qemu-devel] [RFC] target/xtensa: rework zero overhead loops implementation
Posted by Max Filippov 5 years, 5 months ago
Don't invalidate the TB containing the end of a zero overhead loop when
LBEG or LEND change. Instead, encode the distance from the TB start to
LEND in the TB flags and generate the loopback code when the offset of
the next PC from the TB start equals that distance. A distance not
greater than the maximal instruction length is encoded literally;
greater distances are capped at the target page size and encoded as the
maximal instruction length plus the exponent of the greatest power of 2
not greater than the distance.
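
Concretely, the encode/decode pair amounts to the following (a sketch
only, mirroring the cpu.h and translate.c hunks below; the helper names
are illustrative, with MAX_INSN_LENGTH, TARGET_PAGE_BITS and clz32() as
defined elsewhere in QEMU):

    static uint32_t encode_lend_dist(uint32_t dist)
    {
        if (dist > (1u << TARGET_PAGE_BITS)) {
            /* sentinel: LEND is more than a page away */
            return MAX_INSN_LENGTH + 31 - TARGET_PAGE_BITS;
        } else if (dist > MAX_INSN_LENGTH) {
            /* MAX_INSN_LENGTH + floor(log2(dist)) */
            return MAX_INSN_LENGTH + 31 - clz32(dist);
        }
        return dist;    /* short distances are encoded exactly */
    }

    static uint32_t decode_lend_dist(uint32_t enc)
    {
        if (enc > MAX_INSN_LENGTH) {
            /* approximate: only used to end the TB before LEND */
            return (1u << (enc - MAX_INSN_LENGTH)) + 1;
        }
        return enc;
    }

Only the exact (enc <= MAX_INSN_LENGTH) case triggers loopback
generation; the approximate values merely stop translation early
enough that a subsequent TB starts close enough to LEND to get an
exact encoding.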

Although this change adds a dynamic TB search at the end of each zero
overhead loop, the resulting emulation speed is about 10% higher in
uClibc-ng and LTP tests.

Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
---
 dtc                       |  2 +-
 target/xtensa/cpu.h       | 11 ++++++++++
 target/xtensa/helper.h    |  2 --
 target/xtensa/op_helper.c | 24 ---------------------
 target/xtensa/translate.c | 53 ++++++++++++++++++++---------------------------
 5 files changed, 34 insertions(+), 58 deletions(-)

diff --git a/dtc b/dtc
index 88f18909db73..e54388015af1 160000
--- a/dtc
+++ b/dtc
@@ -1 +1 @@
-Subproject commit 88f18909db731a627456f26d779445f84e449536
+Subproject commit e54388015af1fb4bf04d0bca99caba1074d9cc42
diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h
index 34e5ccd9f1d6..ad76d75aadde 100644
--- a/target/xtensa/cpu.h
+++ b/target/xtensa/cpu.h
@@ -694,6 +694,8 @@ static inline int cpu_mmu_index(CPUXtensaState *env, bool ifetch)
 #define XTENSA_TBFLAG_CWOE 0x40000
 #define XTENSA_TBFLAG_CALLINC_MASK 0x180000
 #define XTENSA_TBFLAG_CALLINC_SHIFT 19
+#define XTENSA_TBFLAG_LEND_MASK 0xfe00000
+#define XTENSA_TBFLAG_LEND_SHIFT 21
 
 static inline void cpu_get_tb_cpu_state(CPUXtensaState *env, target_ulong *pc,
         target_ulong *cs_base, uint32_t *flags)
@@ -706,6 +708,15 @@ static inline void cpu_get_tb_cpu_state(CPUXtensaState *env, target_ulong *pc,
     *flags |= xtensa_get_ring(env);
     if (env->sregs[PS] & PS_EXCM) {
         *flags |= XTENSA_TBFLAG_EXCM;
+    } else if (xtensa_option_enabled(env->config, XTENSA_OPTION_LOOP)) {
+        target_ulong lend_dist = env->sregs[LEND] - env->pc;
+
+        if (lend_dist > (1u << TARGET_PAGE_BITS)) {
+            lend_dist = MAX_INSN_LENGTH + 31 - TARGET_PAGE_BITS;
+        } else if (lend_dist > MAX_INSN_LENGTH) {
+            lend_dist = MAX_INSN_LENGTH + 31 - clz32(lend_dist);
+        }
+        *flags |= lend_dist << XTENSA_TBFLAG_LEND_SHIFT;
     }
     if (xtensa_option_enabled(env->config, XTENSA_OPTION_EXTENDED_L32R) &&
             (env->sregs[LITBASE] & 1)) {
diff --git a/target/xtensa/helper.h b/target/xtensa/helper.h
index 10153c245360..2ebba0b2c2bf 100644
--- a/target/xtensa/helper.h
+++ b/target/xtensa/helper.h
@@ -12,8 +12,6 @@ DEF_HELPER_2(rotw, void, env, i32)
 DEF_HELPER_3(window_check, noreturn, env, i32, i32)
 DEF_HELPER_1(restore_owb, void, env)
 DEF_HELPER_2(movsp, void, env, i32)
-DEF_HELPER_2(wsr_lbeg, void, env, i32)
-DEF_HELPER_2(wsr_lend, void, env, i32)
 #ifndef CONFIG_USER_ONLY
 DEF_HELPER_1(simcall, void, env)
 #endif
diff --git a/target/xtensa/op_helper.c b/target/xtensa/op_helper.c
index e4b42ab3e56c..078aeb6c2c94 100644
--- a/target/xtensa/op_helper.c
+++ b/target/xtensa/op_helper.c
@@ -107,13 +107,6 @@ static void tb_invalidate_virtual_addr(CPUXtensaState *env, uint32_t vaddr)
     }
 }
 
-#else
-
-static void tb_invalidate_virtual_addr(CPUXtensaState *env, uint32_t vaddr)
-{
-    tb_invalidate_phys_addr(vaddr);
-}
-
 #endif
 
 void HELPER(exception)(CPUXtensaState *env, uint32_t excp)
@@ -370,23 +363,6 @@ void HELPER(movsp)(CPUXtensaState *env, uint32_t pc)
     }
 }
 
-void HELPER(wsr_lbeg)(CPUXtensaState *env, uint32_t v)
-{
-    if (env->sregs[LBEG] != v) {
-        tb_invalidate_virtual_addr(env, env->sregs[LEND] - 1);
-        env->sregs[LBEG] = v;
-    }
-}
-
-void HELPER(wsr_lend)(CPUXtensaState *env, uint32_t v)
-{
-    if (env->sregs[LEND] != v) {
-        tb_invalidate_virtual_addr(env, env->sregs[LEND] - 1);
-        env->sregs[LEND] = v;
-        tb_invalidate_virtual_addr(env, env->sregs[LEND] - 1);
-    }
-}
-
 void HELPER(dump_state)(CPUXtensaState *env)
 {
     XtensaCPU *cpu = xtensa_env_get_cpu(env);
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index 46e13384488e..c48285ce207e 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -54,7 +54,7 @@ struct DisasContext {
     int cring;
     int ring;
     uint32_t lbeg;
-    uint32_t lend;
+    uint32_t lend_dist;
 
     bool sar_5bit;
     bool sar_m32_5bit;
@@ -431,14 +431,13 @@ static void gen_callwi(DisasContext *dc, int callinc, uint32_t dest, int slot)
 
 static bool gen_check_loop_end(DisasContext *dc, int slot)
 {
-    if (option_enabled(dc, XTENSA_OPTION_LOOP) &&
-            !(dc->base.tb->flags & XTENSA_TBFLAG_EXCM) &&
-            dc->base.pc_next == dc->lend) {
+    if (dc->lend_dist && dc->lend_dist <= MAX_INSN_LENGTH &&
+        dc->base.pc_next - dc->base.pc_first == dc->lend_dist) {
         TCGLabel *label = gen_new_label();
 
         tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_SR[LCOUNT], 0, label);
         tcg_gen_subi_i32(cpu_SR[LCOUNT], cpu_SR[LCOUNT], 1);
-        gen_jumpi(dc, dc->lbeg, slot);
+        gen_jump(dc, cpu_SR[LBEG]);
         gen_set_label(label);
         gen_jumpi(dc, dc->base.pc_next, -1);
         return true;
@@ -534,16 +533,6 @@ static void gen_rsr(DisasContext *dc, TCGv_i32 d, uint32_t sr)
     }
 }
 
-static void gen_wsr_lbeg(DisasContext *dc, uint32_t sr, TCGv_i32 s)
-{
-    gen_helper_wsr_lbeg(cpu_env, s);
-}
-
-static void gen_wsr_lend(DisasContext *dc, uint32_t sr, TCGv_i32 s)
-{
-    gen_helper_wsr_lend(cpu_env, s);
-}
-
 static void gen_wsr_sar(DisasContext *dc, uint32_t sr, TCGv_i32 s)
 {
     tcg_gen_andi_i32(cpu_SR[sr], s, 0x3f);
@@ -743,8 +732,6 @@ static void gen_wsr(DisasContext *dc, uint32_t sr, TCGv_i32 s)
 {
     static void (* const wsr_handler[256])(DisasContext *dc,
                                            uint32_t sr, TCGv_i32 v) = {
-        [LBEG] = gen_wsr_lbeg,
-        [LEND] = gen_wsr_lend,
         [SAR] = gen_wsr_sar,
         [BR] = gen_wsr_br,
         [LITBASE] = gen_wsr_litbase,
@@ -1098,7 +1085,11 @@ static void xtensa_tr_init_disas_context(DisasContextBase *dcbase,
     dc->ring = tb_flags & XTENSA_TBFLAG_RING_MASK;
     dc->cring = (tb_flags & XTENSA_TBFLAG_EXCM) ? 0 : dc->ring;
     dc->lbeg = env->sregs[LBEG];
-    dc->lend = env->sregs[LEND];
+    dc->lend_dist = (tb_flags & XTENSA_TBFLAG_LEND_MASK) >>
+        XTENSA_TBFLAG_LEND_SHIFT;
+    if (dc->lend_dist > MAX_INSN_LENGTH) {
+        dc->lend_dist = (1 << (dc->lend_dist - MAX_INSN_LENGTH)) + 1;
+    }
     dc->debug = tb_flags & XTENSA_TBFLAG_DEBUG;
     dc->icount = tb_flags & XTENSA_TBFLAG_ICOUNT;
     dc->cpenable = (tb_flags & XTENSA_TBFLAG_CPENABLE_MASK) >>
@@ -1150,7 +1141,6 @@ static void xtensa_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
 {
     DisasContext *dc = container_of(dcbase, DisasContext, base);
     CPUXtensaState *env = cpu->env_ptr;
-    target_ulong page_start;
 
     /* These two conditions only apply to the first insn in the TB,
        but this is the first TranslateOps hook that allows exiting.  */
@@ -1189,11 +1179,16 @@ static void xtensa_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
     }
 
     /* End the TB if the next insn will cross into the next page.  */
-    page_start = dc->base.pc_first & TARGET_PAGE_MASK;
-    if (dc->base.is_jmp == DISAS_NEXT &&
-        (dc->pc - page_start >= TARGET_PAGE_SIZE ||
-         dc->pc - page_start + xtensa_insn_len(env, dc) > TARGET_PAGE_SIZE)) {
-        dc->base.is_jmp = DISAS_TOO_MANY;
+    if (dc->base.is_jmp == DISAS_NEXT) {
+        target_ulong page_start = dc->base.pc_first & TARGET_PAGE_MASK;
+        unsigned next_insn_len = xtensa_insn_len(env, dc);
+
+        if (dc->pc - page_start >= TARGET_PAGE_SIZE ||
+            dc->pc - page_start + next_insn_len > TARGET_PAGE_SIZE ||
+            (dc->lend_dist &&
+             dc->pc - dc->base.pc_first + next_insn_len > dc->lend_dist)) {
+            dc->base.is_jmp = DISAS_TOO_MANY;
+        }
     }
 }
 
@@ -1712,12 +1707,10 @@ static void translate_loop(DisasContext *dc, const uint32_t arg[],
                            const uint32_t par[])
 {
     uint32_t lend = arg[1];
-    TCGv_i32 tmp = tcg_const_i32(lend);
 
     tcg_gen_subi_i32(cpu_SR[LCOUNT], cpu_R[arg[0]], 1);
     tcg_gen_movi_i32(cpu_SR[LBEG], dc->base.pc_next);
-    gen_helper_wsr_lend(cpu_env, tmp);
-    tcg_temp_free(tmp);
+    tcg_gen_movi_i32(cpu_SR[LEND], lend);
 
     if (par[0] != TCG_COND_NEVER) {
         TCGLabel *label = gen_new_label();
@@ -4609,7 +4602,6 @@ static const XtensaOpcodeOps core_ops[] = {
         .translate = translate_wsr,
         .test_ill = test_ill_wsr,
         .par = (const uint32_t[]){LBEG},
-        .op_flags = XTENSA_OP_EXIT_TB_0,
         .windowed_register_op = 0x1,
     }, {
         .name = "wsr.lcount",
@@ -4622,7 +4614,7 @@ static const XtensaOpcodeOps core_ops[] = {
         .translate = translate_wsr,
         .test_ill = test_ill_wsr,
         .par = (const uint32_t[]){LEND},
-        .op_flags = XTENSA_OP_EXIT_TB_0,
+        .op_flags = XTENSA_OP_EXIT_TB_M1,
         .windowed_register_op = 0x1,
     }, {
         .name = "wsr.litbase",
@@ -5183,7 +5175,6 @@ static const XtensaOpcodeOps core_ops[] = {
         .translate = translate_xsr,
         .test_ill = test_ill_xsr,
         .par = (const uint32_t[]){LBEG},
-        .op_flags = XTENSA_OP_EXIT_TB_0,
         .windowed_register_op = 0x1,
     }, {
         .name = "xsr.lcount",
@@ -5196,7 +5187,7 @@ static const XtensaOpcodeOps core_ops[] = {
         .translate = translate_xsr,
         .test_ill = test_ill_xsr,
         .par = (const uint32_t[]){LEND},
-        .op_flags = XTENSA_OP_EXIT_TB_0,
+        .op_flags = XTENSA_OP_EXIT_TB_M1,
         .windowed_register_op = 0x1,
     }, {
         .name = "xsr.litbase",
-- 
2.11.0


Re: [Qemu-devel] [RFC] target/xtensa: rework zero overhead loops implementation
Posted by Max Filippov 5 years, 5 months ago
On Wed, Oct 3, 2018 at 6:05 PM Max Filippov <jcmvbkbc@gmail.com> wrote:
>
> Don't invalidate the TB containing the end of a zero overhead loop when
> LBEG or LEND change. Instead, encode the distance from the TB start to
> LEND in the TB flags and generate the loopback code when the offset of
> the next PC from the TB start equals that distance. A distance not
> greater than the maximal instruction length is encoded literally;
> greater distances are capped at the target page size and encoded as the
> maximal instruction length plus the exponent of the greatest power of 2
> not greater than the distance.
>
> Although this change adds a dynamic TB search at the end of each zero
> overhead loop, the resulting emulation speed is about 10% higher in
> uClibc-ng and LTP tests.

I thought about it some more and it looks like this is not going to work
in the general case in the presence of TB linking: a block with a big (and
thus not precise) LEND distance may be linked to a block with a small (and
thus precise) LEND distance. LEND may then change so that execution still
enters the first TB the next time. In that case it should no longer chain
from the first TB to the second, but with this scheme it will.

-- 
Thanks.
-- Max

Re: [Qemu-devel] [RFC] target/xtensa: rework zero overhead loops implementation
Posted by Richard Henderson 5 years, 5 months ago
On 10/4/18 3:14 PM, Max Filippov wrote:
> I thought about it some more and it looks like this is not going to work
> in the general case in the presence of TB linking: a block with a big (and
> thus not precise) LEND distance may be linked to a block with a small (and
> thus precise) LEND distance. LEND may then change so that execution still
> enters the first TB the next time. In that case it should no longer chain
> from the first TB to the second, but with this scheme it will.

Indeed.  Perhaps think of ways in which LBEG and LEND can be represented
relative to each other and to PC, and store that in the 32 bits you have
available in the CS_BASE field.

Think first about how, if PC >= LEND or LEND - PC > PAGE_SIZE, all of the
loop stuff is irrelevant because we won't hit LEND within this TB.

Think second about how to represent the common case -- how LOOP sets LBEG and
LEND together.  That is, LEND - LBEG <= 256.  So, usually, we can also have an
exact representation of LBEG and have a direct link rather than an indirect
link.  But I presume that one can play games with special registers to create
ranges that LOOP won't.  So we need some setting that will indicate that.

Consider CS_BASE fields:

  [12: 0]  EDIF = LEND - PC, if PC < LEND && LEND - PC < 2*PAGE_SIZE, or 0.
  [20:13]  BDIF = LEND - LBEG, if LEND - LBEG < 256, or 0.

So you can tell if advancing PC within a TB will exactly match LEND.  You can
tell what LBEG should be, except if BDIF == 0.  In that, presumably rare case,
you load LBEG at runtime as you did in this patch.

Note that if CS_BASE == 0, and thus EDIF == 0, looping is disabled for the TB.

I'll note that this also makes XTENSA_TBFLAG_EXCM redundant.  Simply skip
setting CS_BASE to a non-zero value instead.
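
As a rough sketch of that packing (field layout as above; the macro and
helper names are illustrative, not existing QEMU code):

    /* Illustrative packing for the proposed cs_base layout. */
    #define EDIF_MASK   0x1fffu     /* bits [12:0]:  LEND - PC   */
    #define BDIF_SHIFT  13
    #define BDIF_MASK   0xffu       /* bits [20:13]: LEND - LBEG */

    static uint32_t xtensa_loop_cs_base(uint32_t pc, uint32_t lbeg,
                                        uint32_t lend)
    {
        uint32_t cs_base = 0;

        if (pc < lend && lend - pc < 2 * TARGET_PAGE_SIZE) {
            cs_base = (lend - pc) & EDIF_MASK;    /* EDIF, never 0 here */
            if (lend - lbeg < 256) {
                cs_base |= ((lend - lbeg) & BDIF_MASK) << BDIF_SHIFT;
            }
        }
        return cs_base;    /* 0 means looping is disabled for this TB */
    }

At translation time EDIF == 0 then means "no loop handling in this TB",
and a nonzero BDIF lets the translator compute LBEG = LEND - BDIF and
emit a direct link.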


r~

Re: [Qemu-devel] [RFC] target/xtensa: rework zero overhead loops implementation
Posted by Max Filippov 5 years, 5 months ago
Hi Richard,

thank you for taking a look.

On Thu, Oct 4, 2018 at 2:13 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
> Think first about how, if PC >= LEND or LEND - PC > PAGE_SIZE, all of the
> loop stuff is irrelevant because we won't hit LEND within this TB.
>
> Think second about how to represent the common case -- how LOOP sets LBEG and
> LEND together.  That is, LEND - LBEG <= 256.  So, usually, we can also have an
> exact representation of LBEG and have a direct link rather than an indirect
> link.

I tried a similar thing as a follow-up to my original patch, recording the
entire LBEG in cs_base -- surprisingly to me, it made no measurable
difference in my tests.

>  But I presume that one can play games with special registers to create
> ranges that LOOP won't.  So we need some setting that will indicate that.
>
> Consider CS_BASE fields:
>
>   [12: 0]  EDIF = LEND - PC, if PC < LEND && LEND - PC < 2*PAGE_SIZE, or 0.

That has the same issue as my original patch: two TBs, each with PC > LEND,
may be linked together, and then LEND may change so that PC1 > LEND and
PC2 < LEND.
But I guess always storing the low 13 bits of LEND - PC should work?
I'll play with it...
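
Something like this, roughly (a sketch; the truncation only
disambiguates distances modulo 8 KiB, hence the caveat in the
follow-up below):

    /* Sketch: always fold the low 13 bits of LEND - PC into cs_base,
     * so that moving LEND (almost) always changes the translation key
     * of every affected TB, precise or not. */
    cs_base |= (env->sregs[LEND] - env->pc) & 0x1fff;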

-- 
Thanks.
-- Max

Re: [Qemu-devel] [RFC] target/xtensa: rework zero overhead loops implementation
Posted by Max Filippov 5 years, 5 months ago
On Thu, Oct 4, 2018 at 3:03 PM Max Filippov <jcmvbkbc@gmail.com> wrote:
> But I guess always storing the low 13 bits of LEND - PC should work?

...when they're within two pages of each other...

-- 
Thanks.
-- Max