In the case where OR or XOR has an 8-bit immediate between 128 and 255, we can
operate on a low-byte register and shorten the output by two or three bytes
(two if a prefix byte is needed for REX.B).
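For illustration, hand-assembled encodings for OR with imm = 0x80 (byte counts
assume no other prefixes are in play; these are not literal TCG output):

  or   $0x80, %ebx      81 cb 80 00 00 00       6 bytes
  or   $0x80, %bl       80 cb 80                3 bytes (saves 3)
  or   $0x80, %esi      81 ce 80 00 00 00       6 bytes
  or   $0x80, %sil      40 80 ce 80             4 bytes (saves 2, REX prefix needed)
  or   $0x80, %r8d      41 81 c8 80 00 00 00    7 bytes
  or   $0x80, %r8b      41 80 c8 80             4 bytes (saves 3)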
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
tcg/i386/tcg-target.c.inc | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 1791b959738..a24a23f43b1 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -244,6 +244,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
#define P_VEXL 0x80000 /* Set VEX.L = 1 */
#define P_EVEX 0x100000 /* Requires EVEX encoding */

+#define OPC_ARITH_EbIb (0x80)
#define OPC_ARITH_EvIz (0x81)
#define OPC_ARITH_EvIb (0x83)
#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
@@ -1366,6 +1367,12 @@ static void tgen_arithi(TCGContext *s, int c, int r0,
tcg_out8(s, val);
return;
}
+ if (val == (uint8_t)val && (c == ARITH_OR || c == ARITH_XOR) &&
+ (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
+ tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
+ tcg_out8(s, val);
+ return;
+ }
if (rexw == 0 || val == (int32_t)val) {
tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
tcg_out32(s, val);
--
2.43.0
On 12/28/23 23:05, Paolo Bonzini wrote:
> In the case where OR or XOR has an 8-bit immediate between 128 and 255, we can
> operate on a low-byte register and shorten the output by two or three bytes
> (two if a prefix byte is needed for REX.B).
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  tcg/i386/tcg-target.c.inc | 7 +++++++
>  1 file changed, 7 insertions(+)

Queued.  I adjusted the function a bit to use a switch.

r~
On 12/28/23 23:05, Paolo Bonzini wrote:
> In the case where OR or XOR has an 8-bit immediate between 128 and 255, we can
> operate on a low-byte register and shorten the output by two or three bytes
> (two if a prefix byte is needed for REX.B).
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  tcg/i386/tcg-target.c.inc | 7 +++++++
>  1 file changed, 7 insertions(+)

At least once upon a time the partial register stall like this was quite slow.
IIRC there have been improvements in the last couple of generations, but it's
still slower.

Data to show this is worthwhile?

r~
On Thu, Dec 28, 2023, 21:45 Richard Henderson <richard.henderson@linaro.org> wrote:
> On 12/28/23 23:05, Paolo Bonzini wrote:
> > In the case where OR or XOR has an 8-bit immediate between 128 and 255,
> > we can operate on a low-byte register and shorten the output by two or
> > three bytes (two if a prefix byte is needed for REX.B).
> >
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >  tcg/i386/tcg-target.c.inc | 7 +++++++
> >  1 file changed, 7 insertions(+)
>
> At least once upon a time the partial register stall like this was quite
> slow.  IIRC there have been improvements in the last couple of generations,
> but it's still slower.
>
> Data to show this is worthwhile?

To be honest, I had simply noticed that GCC generates it just fine these days.
However, Agner Fog says that the (previously very high) penalty for partial
register access became just 1 uop starting with the Pentium D, and it is gone
completely, except for AH/BH/CH/DH, starting with Haswell.

On Atom and AMD processors there is a false dependency on the rest of the
register, but you would have a (true) dependency anyway for OR r32, imm.  The
only case where the false dependency matters is for instructions such as
MOV AL, imm; those have such a dependency on Atom and AMD processors but not
on recent Intel processors.

Paolo
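As a rough sketch of the dependency argument (hand-written examples, not
instructions emitted by this patch):

  or   $0x80, %bl    # reads BL before writing it: a true dependency on the
                     # previous register value exists regardless of operand size
  or   $0x80, %ebx   # same true dependency, only a longer encoding

  mov  $0x80, %bl    # writes BL without reading it: only here would a false
                     # dependency on the rest of the register cost anything,
                     # and this patch does not add such instructions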