[PATCH 12/13] target/i386: pull computation of string update value out of loop

Paolo Bonzini posted 13 patches 3 months, 3 weeks ago
[PATCH 12/13] target/i386: pull computation of string update value out of loop
Posted by Paolo Bonzini 3 months, 3 weeks ago
This is a common operation that is executed many times in rep
movs or rep stos loops.  It can improve performance by several
percentage points.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/translate.c | 54 ++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 28 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index e0f9f7748bc..4b652cc23e1 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -831,16 +831,13 @@ static bool gen_check_io(DisasContext *s, MemOp ot, TCGv_i32 port,
 #endif
 }
 
-static void gen_movs(DisasContext *s, MemOp ot)
+static void gen_movs(DisasContext *s, MemOp ot, TCGv dshift)
 {
-    TCGv dshift;
-
     gen_string_movl_A0_ESI(s);
     gen_op_ld_v(s, ot, s->T0, s->A0);
     gen_string_movl_A0_EDI(s);
     gen_op_st_v(s, ot, s->T0, s->A0);
 
-    dshift = gen_compute_Dshift(s, ot);
     gen_op_add_reg(s, s->aflag, R_ESI, dshift);
     gen_op_add_reg(s, s->aflag, R_EDI, dshift);
 }
@@ -1244,22 +1241,22 @@ static inline void gen_jcc(DisasContext *s, int b, TCGLabel *l1)
     }
 }
 
-static void gen_stos(DisasContext *s, MemOp ot)
+static void gen_stos(DisasContext *s, MemOp ot, TCGv dshift)
 {
     gen_string_movl_A0_EDI(s);
     gen_op_st_v(s, ot, s->T0, s->A0);
-    gen_op_add_reg(s, s->aflag, R_EDI, gen_compute_Dshift(s, ot));
+    gen_op_add_reg(s, s->aflag, R_EDI, dshift);
 }
 
-static void gen_lods(DisasContext *s, MemOp ot)
+static void gen_lods(DisasContext *s, MemOp ot, TCGv dshift)
 {
     gen_string_movl_A0_ESI(s);
     gen_op_ld_v(s, ot, s->T0, s->A0);
     gen_op_mov_reg_v(s, ot, R_EAX, s->T0);
-    gen_op_add_reg(s, s->aflag, R_ESI, gen_compute_Dshift(s, ot));
+    gen_op_add_reg(s, s->aflag, R_ESI, dshift);
 }
 
-static void gen_scas(DisasContext *s, MemOp ot)
+static void gen_scas(DisasContext *s, MemOp ot, TCGv dshift)
 {
     gen_string_movl_A0_EDI(s);
     gen_op_ld_v(s, ot, s->T1, s->A0);
@@ -1268,13 +1265,11 @@ static void gen_scas(DisasContext *s, MemOp ot)
     tcg_gen_sub_tl(cpu_cc_dst, s->T0, s->T1);
     set_cc_op(s, CC_OP_SUBB + ot);
 
-    gen_op_add_reg(s, s->aflag, R_EDI, gen_compute_Dshift(s, ot));
+    gen_op_add_reg(s, s->aflag, R_EDI, dshift);
 }
 
-static void gen_cmps(DisasContext *s, MemOp ot)
+static void gen_cmps(DisasContext *s, MemOp ot, TCGv dshift)
 {
-    TCGv dshift;
-
     gen_string_movl_A0_EDI(s);
     gen_op_ld_v(s, ot, s->T1, s->A0);
     gen_string_movl_A0_ESI(s);
@@ -1284,7 +1279,6 @@ static void gen_cmps(DisasContext *s, MemOp ot)
     tcg_gen_sub_tl(cpu_cc_dst, s->T0, s->T1);
     set_cc_op(s, CC_OP_SUBB + ot);
 
-    dshift = gen_compute_Dshift(s, ot);
     gen_op_add_reg(s, s->aflag, R_ESI, dshift);
     gen_op_add_reg(s, s->aflag, R_EDI, dshift);
 }
@@ -1303,7 +1297,7 @@ static void gen_bpt_io(DisasContext *s, TCGv_i32 t_port, int ot)
     }
 }
 
-static void gen_ins(DisasContext *s, MemOp ot)
+static void gen_ins(DisasContext *s, MemOp ot, TCGv dshift)
 {
     gen_string_movl_A0_EDI(s);
     /* Note: we must do this dummy write first to be restartable in
@@ -1314,11 +1308,11 @@ static void gen_ins(DisasContext *s, MemOp ot)
     tcg_gen_andi_i32(s->tmp2_i32, s->tmp2_i32, 0xffff);
     gen_helper_in_func(ot, s->T0, s->tmp2_i32);
     gen_op_st_v(s, ot, s->T0, s->A0);
-    gen_op_add_reg(s, s->aflag, R_EDI, gen_compute_Dshift(s, ot));
+    gen_op_add_reg(s, s->aflag, R_EDI, dshift);
     gen_bpt_io(s, s->tmp2_i32, ot);
 }
 
-static void gen_outs(DisasContext *s, MemOp ot)
+static void gen_outs(DisasContext *s, MemOp ot, TCGv dshift)
 {
     gen_string_movl_A0_ESI(s);
     gen_op_ld_v(s, ot, s->T0, s->A0);
@@ -1327,14 +1321,14 @@ static void gen_outs(DisasContext *s, MemOp ot)
     tcg_gen_andi_i32(s->tmp2_i32, s->tmp2_i32, 0xffff);
     tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T0);
     gen_helper_out_func(ot, s->tmp2_i32, s->tmp3_i32);
-    gen_op_add_reg(s, s->aflag, R_ESI, gen_compute_Dshift(s, ot));
+    gen_op_add_reg(s, s->aflag, R_ESI, dshift);
     gen_bpt_io(s, s->tmp2_i32, ot);
 }
 
 #define REP_MAX 65535
 
-static void do_gen_rep(DisasContext *s, MemOp ot,
-                       void (*fn)(DisasContext *s, MemOp ot),
+static void do_gen_rep(DisasContext *s, MemOp ot, TCGv dshift,
+                       void (*fn)(DisasContext *s, MemOp ot, TCGv dshift),
                        bool is_repz_nz)
 {
     TCGLabel *last = gen_new_label();
@@ -1399,7 +1393,7 @@ static void do_gen_rep(DisasContext *s, MemOp ot,
     }
 
     gen_set_label(loop);
-    fn(s, ot);
+    fn(s, ot, dshift);
     tcg_gen_mov_tl(cpu_regs[R_ECX], cx_next);
     gen_update_cc_op(s);
 
@@ -1434,7 +1428,7 @@ static void do_gen_rep(DisasContext *s, MemOp ot,
          */
         gen_set_label(last);
         set_cc_op(s, CC_OP_DYNAMIC);
-        fn(s, ot);
+        fn(s, ot, dshift);
         tcg_gen_mov_tl(cpu_regs[R_ECX], cx_next);
         gen_update_cc_op(s);
     }
@@ -1449,23 +1443,27 @@ static void do_gen_rep(DisasContext *s, MemOp ot,
 }
 
 static void gen_repz(DisasContext *s, MemOp ot,
-                     void (*fn)(DisasContext *s, MemOp ot))
+                     void (*fn)(DisasContext *s, MemOp ot, TCGv dshift))
 
 {
+    TCGv dshift = gen_compute_Dshift(s, ot);
+
     if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
-        do_gen_rep(s, ot, fn, false);
+        do_gen_rep(s, ot, dshift, fn, false);
     } else {
-        fn(s, ot);
+        fn(s, ot, dshift);
     }
 }
 
 static void gen_repz_nz(DisasContext *s, MemOp ot,
-                        void (*fn)(DisasContext *s, MemOp ot))
+                        void (*fn)(DisasContext *s, MemOp ot, TCGv dshift))
 {
+    TCGv dshift = gen_compute_Dshift(s, ot);
+
     if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
-        do_gen_rep(s, ot, fn, true);
+        do_gen_rep(s, ot, dshift, fn, true);
     } else {
-        fn(s, ot);
+        fn(s, ot, dshift);
     }
 }
 
-- 
2.47.1
Re: [PATCH 12/13] target/i386: pull computation of string update value out of loop
Posted by Richard Henderson 3 months, 3 weeks ago
On 12/15/24 03:06, Paolo Bonzini wrote:
> This is a common operation that is executed many times in rep
> movs or rep stos loops.  It can improve performance by several
> percentage points.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   target/i386/tcg/translate.c | 54 ++++++++++++++++++-------------------
>   1 file changed, 26 insertions(+), 28 deletions(-)
> 
> diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
> index e0f9f7748bc..4b652cc23e1 100644
> --- a/target/i386/tcg/translate.c
> +++ b/target/i386/tcg/translate.c
> @@ -831,16 +831,13 @@ static bool gen_check_io(DisasContext *s, MemOp ot, TCGv_i32 port,
>   #endif
>   }
>   
> -static void gen_movs(DisasContext *s, MemOp ot)
> +static void gen_movs(DisasContext *s, MemOp ot, TCGv dshift)
>   {
> -    TCGv dshift;
> -
>       gen_string_movl_A0_ESI(s);
>       gen_op_ld_v(s, ot, s->T0, s->A0);
>       gen_string_movl_A0_EDI(s);
>       gen_op_st_v(s, ot, s->T0, s->A0);
>   
> -    dshift = gen_compute_Dshift(s, ot);
>       gen_op_add_reg(s, s->aflag, R_ESI, dshift);
>       gen_op_add_reg(s, s->aflag, R_EDI, dshift);
>   }
> @@ -1244,22 +1241,22 @@ static inline void gen_jcc(DisasContext *s, int b, TCGLabel *l1)
>       }
>   }
>   
> -static void gen_stos(DisasContext *s, MemOp ot)
> +static void gen_stos(DisasContext *s, MemOp ot, TCGv dshift)
>   {
>       gen_string_movl_A0_EDI(s);
>       gen_op_st_v(s, ot, s->T0, s->A0);
> -    gen_op_add_reg(s, s->aflag, R_EDI, gen_compute_Dshift(s, ot));
> +    gen_op_add_reg(s, s->aflag, R_EDI, dshift);
>   }
>   
> -static void gen_lods(DisasContext *s, MemOp ot)
> +static void gen_lods(DisasContext *s, MemOp ot, TCGv dshift)
>   {
>       gen_string_movl_A0_ESI(s);
>       gen_op_ld_v(s, ot, s->T0, s->A0);
>       gen_op_mov_reg_v(s, ot, R_EAX, s->T0);
> -    gen_op_add_reg(s, s->aflag, R_ESI, gen_compute_Dshift(s, ot));
> +    gen_op_add_reg(s, s->aflag, R_ESI, dshift);
>   }
>   
> -static void gen_scas(DisasContext *s, MemOp ot)
> +static void gen_scas(DisasContext *s, MemOp ot, TCGv dshift)
>   {
>       gen_string_movl_A0_EDI(s);
>       gen_op_ld_v(s, ot, s->T1, s->A0);
> @@ -1268,13 +1265,11 @@ static void gen_scas(DisasContext *s, MemOp ot)
>       tcg_gen_sub_tl(cpu_cc_dst, s->T0, s->T1);
>       set_cc_op(s, CC_OP_SUBB + ot);
>   
> -    gen_op_add_reg(s, s->aflag, R_EDI, gen_compute_Dshift(s, ot));
> +    gen_op_add_reg(s, s->aflag, R_EDI, dshift);
>   }
>   
> -static void gen_cmps(DisasContext *s, MemOp ot)
> +static void gen_cmps(DisasContext *s, MemOp ot, TCGv dshift)
>   {
> -    TCGv dshift;
> -
>       gen_string_movl_A0_EDI(s);
>       gen_op_ld_v(s, ot, s->T1, s->A0);
>       gen_string_movl_A0_ESI(s);
> @@ -1284,7 +1279,6 @@ static void gen_cmps(DisasContext *s, MemOp ot)
>       tcg_gen_sub_tl(cpu_cc_dst, s->T0, s->T1);
>       set_cc_op(s, CC_OP_SUBB + ot);
>   
> -    dshift = gen_compute_Dshift(s, ot);
>       gen_op_add_reg(s, s->aflag, R_ESI, dshift);
>       gen_op_add_reg(s, s->aflag, R_EDI, dshift);
>   }
> @@ -1303,7 +1297,7 @@ static void gen_bpt_io(DisasContext *s, TCGv_i32 t_port, int ot)
>       }
>   }
>   
> -static void gen_ins(DisasContext *s, MemOp ot)
> +static void gen_ins(DisasContext *s, MemOp ot, TCGv dshift)
>   {
>       gen_string_movl_A0_EDI(s);
>       /* Note: we must do this dummy write first to be restartable in
> @@ -1314,11 +1308,11 @@ static void gen_ins(DisasContext *s, MemOp ot)
>       tcg_gen_andi_i32(s->tmp2_i32, s->tmp2_i32, 0xffff);
>       gen_helper_in_func(ot, s->T0, s->tmp2_i32);
>       gen_op_st_v(s, ot, s->T0, s->A0);
> -    gen_op_add_reg(s, s->aflag, R_EDI, gen_compute_Dshift(s, ot));
> +    gen_op_add_reg(s, s->aflag, R_EDI, dshift);
>       gen_bpt_io(s, s->tmp2_i32, ot);
>   }
>   
> -static void gen_outs(DisasContext *s, MemOp ot)
> +static void gen_outs(DisasContext *s, MemOp ot, TCGv dshift)
>   {
>       gen_string_movl_A0_ESI(s);
>       gen_op_ld_v(s, ot, s->T0, s->A0);
> @@ -1327,14 +1321,14 @@ static void gen_outs(DisasContext *s, MemOp ot)
>       tcg_gen_andi_i32(s->tmp2_i32, s->tmp2_i32, 0xffff);
>       tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T0);
>       gen_helper_out_func(ot, s->tmp2_i32, s->tmp3_i32);
> -    gen_op_add_reg(s, s->aflag, R_ESI, gen_compute_Dshift(s, ot));
> +    gen_op_add_reg(s, s->aflag, R_ESI, dshift);
>       gen_bpt_io(s, s->tmp2_i32, ot);
>   }
>   
>   #define REP_MAX 65535
>   
> -static void do_gen_rep(DisasContext *s, MemOp ot,
> -                       void (*fn)(DisasContext *s, MemOp ot),
> +static void do_gen_rep(DisasContext *s, MemOp ot, TCGv dshift,
> +                       void (*fn)(DisasContext *s, MemOp ot, TCGv dshift),
>                          bool is_repz_nz)
>   {
>       TCGLabel *last = gen_new_label();
> @@ -1399,7 +1393,7 @@ static void do_gen_rep(DisasContext *s, MemOp ot,
>       }
>   
>       gen_set_label(loop);
> -    fn(s, ot);
> +    fn(s, ot, dshift);
>       tcg_gen_mov_tl(cpu_regs[R_ECX], cx_next);
>       gen_update_cc_op(s);
>   
> @@ -1434,7 +1428,7 @@ static void do_gen_rep(DisasContext *s, MemOp ot,
>            */
>           gen_set_label(last);
>           set_cc_op(s, CC_OP_DYNAMIC);
> -        fn(s, ot);
> +        fn(s, ot, dshift);
>           tcg_gen_mov_tl(cpu_regs[R_ECX], cx_next);
>           gen_update_cc_op(s);
>       }
> @@ -1449,23 +1443,27 @@ static void do_gen_rep(DisasContext *s, MemOp ot,
>   }
>   
>   static void gen_repz(DisasContext *s, MemOp ot,
> -                     void (*fn)(DisasContext *s, MemOp ot))
> +                     void (*fn)(DisasContext *s, MemOp ot, TCGv dshift))
>   
>   {
> +    TCGv dshift = gen_compute_Dshift(s, ot);
> +
>       if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
> -        do_gen_rep(s, ot, fn, false);
> +        do_gen_rep(s, ot, dshift, fn, false);
>       } else {
> -        fn(s, ot);
> +        fn(s, ot, dshift);
>       }
>   }
>   
>   static void gen_repz_nz(DisasContext *s, MemOp ot,
> -                        void (*fn)(DisasContext *s, MemOp ot))
> +                        void (*fn)(DisasContext *s, MemOp ot, TCGv dshift))
>   {
> +    TCGv dshift = gen_compute_Dshift(s, ot);
> +
>       if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
> -        do_gen_rep(s, ot, fn, true);
> +        do_gen_rep(s, ot, dshift, fn, true);
>       } else {
> -        fn(s, ot);
> +        fn(s, ot, dshift);
>       }
>   }
>   

Still not a fan of the repetition.  I think this could easily be

static void do_gen_rep(DisasContext *s, MemOp ot,
                        void (*fn)(DisasContext *s, MemOp ot, TCGv dshift),
                        bool is_repz_nz)
{
     TCGv dshift = gen_compute_Dshift(s, ot);
     TCGLabel *last, *loop, *done;
     target_ulong cx_mask;
     TCGv cx_next;
     bool can_loop, had_rf;

     if (!(s->prefix & (REPZ | REPNZ))) {
         fn(s, ot, dshift);
         return;
     }

     done = gen_new_label();
     can_loop = ...
     everything else.
}

Or, if you prefer, use two functions gen_repz_nz{0,1}.


r~