[PATCH] tcg/i386: use 8-bit OR or XOR for unsigned 8-bit immediates

Paolo Bonzini posted 1 patch 11 months ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20231228120524.70239-1-pbonzini@redhat.com
Maintainers: Richard Henderson <richard.henderson@linaro.org>
tcg/i386/tcg-target.c.inc | 7 +++++++
1 file changed, 7 insertions(+)
[PATCH] tcg/i386: use 8-bit OR or XOR for unsigned 8-bit immediates
Posted by Paolo Bonzini 11 months ago
In the case where OR or XOR has an 8-bit immediate between 128 and 255, we can
operate on a low-byte register and shorten the output by two or three bytes
(two if a prefix byte is needed for REX.B).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tcg/i386/tcg-target.c.inc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 1791b959738..a24a23f43b1 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -244,6 +244,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
 #define P_VEXL          0x80000         /* Set VEX.L = 1 */
 #define P_EVEX          0x100000        /* Requires EVEX encoding */
 
+#define OPC_ARITH_EbIb	(0x80)
 #define OPC_ARITH_EvIz	(0x81)
 #define OPC_ARITH_EvIb	(0x83)
 #define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
@@ -1366,6 +1367,12 @@ static void tgen_arithi(TCGContext *s, int c, int r0,
         tcg_out8(s, val);
         return;
     }
+    if (val == (uint8_t)val && (c == ARITH_OR || c == ARITH_XOR) &&
+        (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
+        tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
+        tcg_out8(s, val);
+        return;
+    }
     if (rexw == 0 || val == (int32_t)val) {
         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
         tcg_out32(s, val);
-- 
2.43.0
Re: [PATCH] tcg/i386: use 8-bit OR or XOR for unsigned 8-bit immediates
Posted by Richard Henderson 11 months ago
On 12/28/23 23:05, Paolo Bonzini wrote:
> In the case where OR or XOR has an 8-bit immediate between 128 and 255, we can
> operate on a low-byte register and shorten the output by two or three bytes
> (two if a prefix byte is needed for REX.B).
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   tcg/i386/tcg-target.c.inc | 7 +++++++
>   1 file changed, 7 insertions(+)

Queued.  I adjusted the function a bit to use a switch.


r~

> 
> diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
> index 1791b959738..a24a23f43b1 100644
> --- a/tcg/i386/tcg-target.c.inc
> +++ b/tcg/i386/tcg-target.c.inc
> @@ -244,6 +244,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
>   #define P_VEXL          0x80000         /* Set VEX.L = 1 */
>   #define P_EVEX          0x100000        /* Requires EVEX encoding */
>   
> +#define OPC_ARITH_EbIb	(0x80)
>   #define OPC_ARITH_EvIz	(0x81)
>   #define OPC_ARITH_EvIb	(0x83)
>   #define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
> @@ -1366,6 +1367,12 @@ static void tgen_arithi(TCGContext *s, int c, int r0,
>           tcg_out8(s, val);
>           return;
>       }
> +    if (val == (uint8_t)val && (c == ARITH_OR || c == ARITH_XOR) &&
> +        (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
> +        tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
> +        tcg_out8(s, val);
> +        return;
> +    }
>       if (rexw == 0 || val == (int32_t)val) {
>           tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
>           tcg_out32(s, val);
Re: [PATCH] tcg/i386: use 8-bit OR or XOR for unsigned 8-bit immediates
Posted by Richard Henderson 11 months ago
On 12/28/23 23:05, Paolo Bonzini wrote:
> In the case where OR or XOR has an 8-bit immediate between 128 and 255, we can
> operate on a low-byte register and shorten the output by two or three bytes
> (two if a prefix byte is needed for REX.B).
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   tcg/i386/tcg-target.c.inc | 7 +++++++
>   1 file changed, 7 insertions(+)

At least once upon a time, a partial register stall like this was quite slow.  IIRC there
have been improvements in the last couple of generations, but it's still slower.

Data to show this is worthwhile?


r~

> 
> diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
> index 1791b959738..a24a23f43b1 100644
> --- a/tcg/i386/tcg-target.c.inc
> +++ b/tcg/i386/tcg-target.c.inc
> @@ -244,6 +244,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
>   #define P_VEXL          0x80000         /* Set VEX.L = 1 */
>   #define P_EVEX          0x100000        /* Requires EVEX encoding */
>   
> +#define OPC_ARITH_EbIb	(0x80)
>   #define OPC_ARITH_EvIz	(0x81)
>   #define OPC_ARITH_EvIb	(0x83)
>   #define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
> @@ -1366,6 +1367,12 @@ static void tgen_arithi(TCGContext *s, int c, int r0,
>           tcg_out8(s, val);
>           return;
>       }
> +    if (val == (uint8_t)val && (c == ARITH_OR || c == ARITH_XOR) &&
> +        (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
> +        tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
> +        tcg_out8(s, val);
> +        return;
> +    }
>       if (rexw == 0 || val == (int32_t)val) {
>           tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
>           tcg_out32(s, val);
Re: [PATCH] tcg/i386: use 8-bit OR or XOR for unsigned 8-bit immediates
Posted by Paolo Bonzini 11 months ago
On Thu, Dec 28, 2023, 21:45 Richard Henderson <richard.henderson@linaro.org>
wrote:

> On 12/28/23 23:05, Paolo Bonzini wrote:
> > In the case where OR or XOR has an 8-bit immediate between 128 and 255, we can
> > operate on a low-byte register and shorten the output by two or three bytes
> > (two if a prefix byte is needed for REX.B).
> >
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >   tcg/i386/tcg-target.c.inc | 7 +++++++
> >   1 file changed, 7 insertions(+)
>
> At least once upon a time the partial register stall like this was quite slow.  IIRC there
> have been improvements in the last couple of generations, but it's still slower.
>
> Data to show this is worthwhile?
>

To be honest, I had simply noticed that GCC generates it just fine these
days.

However, Agner Fog says that the (previously very high) penalty for partial
register access became just 1 uop starting with the Pentium D, and it's
gone completely except for AH/BH/CH/DH starting with Haswell.

On Atom and AMD processors there's a false dependency on the rest of the
register, but you'd have a (true) dependency anyway for OR r32, imm. The
only case where the false dependency matters is for instructions such as
MOV AL, imm; these have such a dependency on Atom and AMD processors but
not on recent Intel processors.

Paolo


>
> r~
>
> >
> > diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
> > index 1791b959738..a24a23f43b1 100644
> > --- a/tcg/i386/tcg-target.c.inc
> > +++ b/tcg/i386/tcg-target.c.inc
> > @@ -244,6 +244,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
> >   #define P_VEXL          0x80000         /* Set VEX.L = 1 */
> >   #define P_EVEX          0x100000        /* Requires EVEX encoding */
> >
> > +#define OPC_ARITH_EbIb       (0x80)
> >   #define OPC_ARITH_EvIz      (0x81)
> >   #define OPC_ARITH_EvIb      (0x83)
> >   #define OPC_ARITH_GvEv      (0x03)          /* ... plus (ARITH_FOO << 3) */
> > @@ -1366,6 +1367,12 @@ static void tgen_arithi(TCGContext *s, int c, int r0,
> >           tcg_out8(s, val);
> >           return;
> >       }
> > +    if (val == (uint8_t)val && (c == ARITH_OR || c == ARITH_XOR) &&
> > +        (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
> > +        tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
> > +        tcg_out8(s, val);
> > +        return;
> > +    }
> >       if (rexw == 0 || val == (int32_t)val) {
> >           tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
> >           tcg_out32(s, val);
>
>