ppc-for-2.12 queue 20180108

[Qemu-devel] [PULL 01/12] target-ppc: optimize cmp translation

Posted by David Gibson 8 years, 1 month ago

From: "pbonzini@redhat.com" <pbonzini@redhat.com>

We know that exactly one of the LT, GT and EQ bits (in addition to SO)
is going to be set in the condition register field, so do two movconds
instead of three setconds, three shifts and two ORs.

For ppc64-linux-user, the code size reduction is around 5% and the
performance improvement slightly less than 10%.  For softmmu, the
improvement is around 5%.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 target/ppc/translate.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 4075fc8589..8a6bd329d0 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -605,27 +605,22 @@ static opc_handler_t invalid_handler = {
 static inline void gen_op_cmp(TCGv arg0, TCGv arg1, int s, int crf)
 {
     TCGv t0 = tcg_temp_new();
-    TCGv_i32 t1 = tcg_temp_new_i32();
-
-    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_so);
-
-    tcg_gen_setcond_tl((s ? TCG_COND_LT: TCG_COND_LTU), t0, arg0, arg1);
-    tcg_gen_trunc_tl_i32(t1, t0);
-    tcg_gen_shli_i32(t1, t1, CRF_LT_BIT);
-    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
+    TCGv t1 = tcg_temp_new();
+    TCGv_i32 t = tcg_temp_new_i32();
 
-    tcg_gen_setcond_tl((s ? TCG_COND_GT: TCG_COND_GTU), t0, arg0, arg1);
-    tcg_gen_trunc_tl_i32(t1, t0);
-    tcg_gen_shli_i32(t1, t1, CRF_GT_BIT);
-    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
+    tcg_gen_movi_tl(t0, CRF_EQ);
+    tcg_gen_movi_tl(t1, CRF_LT);
+    tcg_gen_movcond_tl((s ? TCG_COND_LT : TCG_COND_LTU), t0, arg0, arg1, t1, t0);
+    tcg_gen_movi_tl(t1, CRF_GT);
+    tcg_gen_movcond_tl((s ? TCG_COND_GT : TCG_COND_GTU), t0, arg0, arg1, t1, t0);
 
-    tcg_gen_setcond_tl(TCG_COND_EQ, t0, arg0, arg1);
-    tcg_gen_trunc_tl_i32(t1, t0);
-    tcg_gen_shli_i32(t1, t1, CRF_EQ_BIT);
-    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
+    tcg_gen_trunc_tl_i32(t, t0);
+    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_so);
+    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t);
 
     tcg_temp_free(t0);
-    tcg_temp_free_i32(t1);
+    tcg_temp_free(t1);
+    tcg_temp_free_i32(t);
 }
 
 static inline void gen_op_cmpi(TCGv arg0, target_ulong arg1, int s, int crf)
-- 
2.14.3

Re: [Qemu-devel] [PULL 01/12] target-ppc: optimize cmp translation

Posted by Paolo Bonzini 8 years ago

On 08/01/2018 06:53, David Gibson wrote:
> From: "pbonzini@redhat.com" <pbonzini@redhat.com>
> 
> We know that only one bit (in addition to SO) is going to be set in
> the condition register, so do two movconds instead of three setconds,
> three shifts and two ORs.
> 
> For ppc64-linux-user, the code size reduction is around 5% and the
> performance improvement slightly less than 10%.  For softmmu, the
> improvement is around 5%.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  target/ppc/translate.c | 29 ++++++++++++-----------------
>  1 file changed, 12 insertions(+), 17 deletions(-)
> 
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index 4075fc8589..8a6bd329d0 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -605,27 +605,22 @@ static opc_handler_t invalid_handler = {
>  static inline void gen_op_cmp(TCGv arg0, TCGv arg1, int s, int crf)
>  {
>      TCGv t0 = tcg_temp_new();
> -    TCGv_i32 t1 = tcg_temp_new_i32();
> -
> -    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_so);
> -
> -    tcg_gen_setcond_tl((s ? TCG_COND_LT: TCG_COND_LTU), t0, arg0, arg1);
> -    tcg_gen_trunc_tl_i32(t1, t0);
> -    tcg_gen_shli_i32(t1, t1, CRF_LT_BIT);
> -    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
> +    TCGv t1 = tcg_temp_new();
> +    TCGv_i32 t = tcg_temp_new_i32();
>  
> -    tcg_gen_setcond_tl((s ? TCG_COND_GT: TCG_COND_GTU), t0, arg0, arg1);
> -    tcg_gen_trunc_tl_i32(t1, t0);
> -    tcg_gen_shli_i32(t1, t1, CRF_GT_BIT);
> -    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
> +    tcg_gen_movi_tl(t0, CRF_EQ);
> +    tcg_gen_movi_tl(t1, CRF_LT);
> +    tcg_gen_movcond_tl((s ? TCG_COND_LT : TCG_COND_LTU), t0, arg0, arg1, t1, t0);
> +    tcg_gen_movi_tl(t1, CRF_GT);
> +    tcg_gen_movcond_tl((s ? TCG_COND_GT : TCG_COND_GTU), t0, arg0, arg1, t1, t0);
>  
> -    tcg_gen_setcond_tl(TCG_COND_EQ, t0, arg0, arg1);
> -    tcg_gen_trunc_tl_i32(t1, t0);
> -    tcg_gen_shli_i32(t1, t1, CRF_EQ_BIT);
> -    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
> +    tcg_gen_trunc_tl_i32(t, t0);
> +    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_so);
> +    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t);
>  
>      tcg_temp_free(t0);
> -    tcg_temp_free_i32(t1);
> +    tcg_temp_free(t1);
> +    tcg_temp_free_i32(t);
>  }
>  
>  static inline void gen_op_cmpi(TCGv arg0, target_ulong arg1, int s, int crf)
> 

David, can you queue this again now that the ARM backend has been fixed?

Thanks,

Paolo

Re: [Qemu-devel] [PULL 01/12] target-ppc: optimize cmp translation

Posted by David Gibson 8 years ago

On Thu, Jan 18, 2018 at 05:36:40PM +0100, Paolo Bonzini wrote:
> On 08/01/2018 06:53, David Gibson wrote:
> > From: "pbonzini@redhat.com" <pbonzini@redhat.com>
> > 
> > We know that only one bit (in addition to SO) is going to be set in
> > the condition register, so do two movconds instead of three setconds,
> > three shifts and two ORs.
> > 
> > For ppc64-linux-user, the code size reduction is around 5% and the
> > performance improvement slightly less than 10%.  For softmmu, the
> > improvement is around 5%.
> > 
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  target/ppc/translate.c | 29 ++++++++++++-----------------
> >  1 file changed, 12 insertions(+), 17 deletions(-)
> > 
> > diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> > index 4075fc8589..8a6bd329d0 100644
> > --- a/target/ppc/translate.c
> > +++ b/target/ppc/translate.c
> > @@ -605,27 +605,22 @@ static opc_handler_t invalid_handler = {
> >  static inline void gen_op_cmp(TCGv arg0, TCGv arg1, int s, int crf)
> >  {
> >      TCGv t0 = tcg_temp_new();
> > -    TCGv_i32 t1 = tcg_temp_new_i32();
> > -
> > -    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_so);
> > -
> > -    tcg_gen_setcond_tl((s ? TCG_COND_LT: TCG_COND_LTU), t0, arg0, arg1);
> > -    tcg_gen_trunc_tl_i32(t1, t0);
> > -    tcg_gen_shli_i32(t1, t1, CRF_LT_BIT);
> > -    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
> > +    TCGv t1 = tcg_temp_new();
> > +    TCGv_i32 t = tcg_temp_new_i32();
> >  
> > -    tcg_gen_setcond_tl((s ? TCG_COND_GT: TCG_COND_GTU), t0, arg0, arg1);
> > -    tcg_gen_trunc_tl_i32(t1, t0);
> > -    tcg_gen_shli_i32(t1, t1, CRF_GT_BIT);
> > -    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
> > +    tcg_gen_movi_tl(t0, CRF_EQ);
> > +    tcg_gen_movi_tl(t1, CRF_LT);
> > +    tcg_gen_movcond_tl((s ? TCG_COND_LT : TCG_COND_LTU), t0, arg0, arg1, t1, t0);
> > +    tcg_gen_movi_tl(t1, CRF_GT);
> > +    tcg_gen_movcond_tl((s ? TCG_COND_GT : TCG_COND_GTU), t0, arg0, arg1, t1, t0);
> >  
> > -    tcg_gen_setcond_tl(TCG_COND_EQ, t0, arg0, arg1);
> > -    tcg_gen_trunc_tl_i32(t1, t0);
> > -    tcg_gen_shli_i32(t1, t1, CRF_EQ_BIT);
> > -    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
> > +    tcg_gen_trunc_tl_i32(t, t0);
> > +    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_so);
> > +    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t);
> >  
> >      tcg_temp_free(t0);
> > -    tcg_temp_free_i32(t1);
> > +    tcg_temp_free(t1);
> > +    tcg_temp_free_i32(t);
> >  }
> >  
> >  static inline void gen_op_cmpi(TCGv arg0, target_ulong arg1, int s, int crf)
> > 
> 
> David, can you queue this again now that the ARM backend has been
> fixed?

Done, thanks.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson