[PATCH 11/13] target/i386: execute multiple REP/REPZ iterations without leaving TB

Paolo Bonzini posted 13 patches 3 months, 3 weeks ago
[PATCH 11/13] target/i386: execute multiple REP/REPZ iterations without leaving TB
Posted by Paolo Bonzini 3 months, 3 weeks ago
Use a TCG loop so that it is not necessary to go through the setup steps
of REP and through the I/O check on every iteration.  Interestingly, this
is not a particularly effective optimization on its own, though it avoids
the cost of correct RF emulation that was added in the previous patch.
The main benefit lies in allowing the hoisting of loop invariants outside
the loop, which will happen separately.

The loop exits when the low 16 bits of CX/ECX/RCX are zero (so generally
speaking the string operation runs in 65536 iteration batches) to give
the main loop an opportunity to pick up interrupts.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/translate.c | 55 +++++++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 2ea8a418612..e0f9f7748bc 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -1331,13 +1331,28 @@ static void gen_outs(DisasContext *s, MemOp ot)
     gen_bpt_io(s, s->tmp2_i32, ot);
 }
 
+#define REP_MAX 65535
+
 static void do_gen_rep(DisasContext *s, MemOp ot,
                        void (*fn)(DisasContext *s, MemOp ot),
                        bool is_repz_nz)
 {
+    TCGLabel *last = gen_new_label();
+    TCGLabel *loop = gen_new_label();
     TCGLabel *done = gen_new_label();
+
     target_ulong cx_mask = MAKE_64BIT_MASK(0, 8 << s->aflag);
     TCGv cx_next = tcg_temp_new();
+
+    /*
+     * Check if we must translate a single iteration only.  Normally, HF_RF_MASK
+     * would also limit translation blocks to one instruction, so that gen_eob
+     * can reset the flag; here however RF is set throughout the repetition, so
+     * we can plow through until CX/ECX/RCX is zero.
+     */
+    bool can_loop =
+        (!(tb_cflags(s->base.tb) & (CF_USE_ICOUNT | CF_SINGLE_STEP))
+	 && !(s->flags & (HF_TF_MASK | HF_INHIBIT_IRQ_MASK)));
     bool had_rf = s->flags & HF_RF_MASK;
 
     /*
@@ -1362,19 +1377,29 @@ static void do_gen_rep(DisasContext *s, MemOp ot,
     /* Any iteration at all?  */
     tcg_gen_brcondi_tl(TCG_COND_TSTEQ, cpu_regs[R_ECX], cx_mask, done);
 
-    fn(s, ot);
-
-    tcg_gen_subi_tl(cx_next, cpu_regs[R_ECX], 1);
-
     /*
-     * Write back cx_next to CX/ECX/RCX.  There can be no carry, so zero
-     * extend if needed but do not do expensive deposit operations.
+     * From now on we operate on the value of CX/ECX/RCX that will be written
+     * back, which is stored in cx_next.  There can be no carry, so we can zero
+     * extend here if needed and not do any expensive deposit operations later.
      */
+    tcg_gen_subi_tl(cx_next, cpu_regs[R_ECX], 1);
 #ifdef TARGET_X86_64
     if (s->aflag == MO_32) {
         tcg_gen_ext32u_tl(cx_next, cx_next);
+        cx_mask = ~0;
     }
 #endif
+
+    /*
+     * The last iteration is handled outside the loop, so that cx_next
+     * can never underflow.
+     */
+    if (can_loop) {
+        tcg_gen_brcondi_tl(TCG_COND_TSTEQ, cx_next, cx_mask, last);
+    }
+
+    gen_set_label(loop);
+    fn(s, ot);
     tcg_gen_mov_tl(cpu_regs[R_ECX], cx_next);
     gen_update_cc_op(s);
 
@@ -1384,6 +1409,12 @@ static void do_gen_rep(DisasContext *s, MemOp ot,
         gen_jcc_noeob(s, (JCC_Z << 1) | (nz ^ 1), done);
     }
 
+    if (can_loop) {
+        tcg_gen_subi_tl(cx_next, cpu_regs[R_ECX], 1);
+        tcg_gen_brcondi_tl(TCG_COND_TSTNE, cx_next, REP_MAX, loop);
+        tcg_gen_brcondi_tl(TCG_COND_TSTEQ, cx_next, cx_mask, last);
+    }
+
     /*
      * Traps or interrupts set RF_MASK if they happen after any iteration
      * but the last.  Set it here before giving the main loop a chance to
@@ -1396,6 +1427,18 @@ static void do_gen_rep(DisasContext *s, MemOp ot,
     /* Go to the main loop but reenter the same instruction.  */
     gen_jmp_rel_csize(s, -cur_insn_len(s), 0);
 
+    if (can_loop) {
+        /*
+         * The last iteration needs no conditional jump, even if is_repz_nz,
+         * because the repeats are ending anyway.
+         */
+        gen_set_label(last);
+        set_cc_op(s, CC_OP_DYNAMIC);
+        fn(s, ot);
+        tcg_gen_mov_tl(cpu_regs[R_ECX], cx_next);
+        gen_update_cc_op(s);
+    }
+
     /* CX/ECX/RCX is zero, or REPZ/REPNZ broke the repetition.  */
     gen_set_label(done);
     set_cc_op(s, CC_OP_DYNAMIC);
-- 
2.47.1
Re: [PATCH 11/13] target/i386: execute multiple REP/REPZ iterations without leaving TB
Posted by Richard Henderson 3 months, 3 weeks ago
On 12/15/24 03:06, Paolo Bonzini wrote:
> +
> +    /*
> +     * The last iteration is handled outside the loop, so that cx_next
> +     * can never underflow.
> +     */
> +    if (can_loop) {
> +        tcg_gen_brcondi_tl(TCG_COND_TSTEQ, cx_next, cx_mask, last);
> +    }
> +
> +    gen_set_label(loop);

I know unused labels will get eliminated, but clearer if you emit the label within the IF 
as well.

> @@ -1384,6 +1409,12 @@ static void do_gen_rep(DisasContext *s, MemOp ot,
>           gen_jcc_noeob(s, (JCC_Z << 1) | (nz ^ 1), done);
>       }
>   
> +    if (can_loop) {
> +        tcg_gen_subi_tl(cx_next, cpu_regs[R_ECX], 1);

Since we've just written back cx_next to ECX, this is the same as cx_next -= 1, yes?


Anyway,
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~
Re: [PATCH 11/13] target/i386: execute multiple REP/REPZ iterations without leaving TB
Posted by Paolo Bonzini 3 months, 3 weeks ago
Il dom 15 dic 2024, 16:07 Richard Henderson <richard.henderson@linaro.org>
ha scritto:

> > @@ -1384,6 +1409,12 @@ static void do_gen_rep(DisasContext *s, MemOp ot,

>           gen_jcc_noeob(s, (JCC_Z << 1) | (nz ^ 1), done);
> >       }
> >
> > +    if (can_loop) {
> > +        tcg_gen_subi_tl(cx_next, cpu_regs[R_ECX], 1);
>
> Since we've just written back cx_next to ECX, this is the same as cx_next
> -= 1, yes?
>

Yeah, I wanted to make cx_next die at the assignment to ECX but it probably
does not make a difference to generated code.

Paolo

>
>
> Anyway,
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
>
>
> r~
>
>
Re: [PATCH 11/13] target/i386: execute multiple REP/REPZ iterations without leaving TB
Posted by Richard Henderson 3 months, 3 weeks ago
On 12/15/24 09:17, Paolo Bonzini wrote:
> 
> 
> Il dom 15 dic 2024, 16:07 Richard Henderson <richard.henderson@linaro.org 
> <mailto:richard.henderson@linaro.org>> ha scritto:
> 
>      > @@ -1384,6 +1409,12 @@ static void do_gen_rep(DisasContext *s, MemOp ot,
> 
>      >           gen_jcc_noeob(s, (JCC_Z << 1) | (nz ^ 1), done);
>      >       }
>      >
>      > +    if (can_loop) {
>      > +        tcg_gen_subi_tl(cx_next, cpu_regs[R_ECX], 1);
> 
>     Since we've just written back cx_next to ECX, this is the same as cx_next -= 1, yes?
> 
> 
> Yeah, I wanted to make cx_next die at the assignment to ECX but it probably does not make 
> a difference to generated code.

Not really.  It would only make a difference if cx_next was never live outside the EBB. 
But it is live across the branches to LOOP and LAST.

What might make a difference is to use the knowledge of known values in ECX, but less 
usage of cx_next itself.  Let cx_next die at the two

+        tcg_gen_brcondi_tl(TCG_COND_TSTEQ, cx_next, cx_mask, last);

by repeating the subtraction when updating ECX, i.e.

-    tcg_gen_mov_tl(cpu_regs[R_ECX], cx_next);
+    tcg_gen_subi_tl(cpu_regs[R_ECX], cpu_regs[R_ECX], 1);

This would avoid spilling cx_next to the stack.

There's a the ext32u to place somewhere.

I guess you can't hoist outside the loop before the first invocation of FN, due to the 
fault path.  To eliminate it from the main loop you'd have to unroll once.

	// no iteration
	brcond tsteq ecx, mask, done

	sub cxnext, ecx, 1
	brcond tsteq cxnext, mask, last

	// first iteration
	fn
	sub ecx, ecx, 1
	extu ecx, ecx

	sub cxnext, ecx, 1
	brcond eq cxnext, 0, last

	// subsequent iterations, ecx now known zero-extended.
  loop:
	fn
	sub ecx, ecx, 1

	sub cxnext, ecx, 1
	brcond tstne, cxnext, max, loop
	brcond eq cxnext, 0, last

etc.  It doesn't seem worthwhile to eliminate one ext32u, which will almost certainly be 
scheduled into the noise.


r~