[PATCH for-6.1] tcg/i386: Split P_VEXW from P_REXW

Richard Henderson posted 1 patch 2 years, 9 months ago
Test checkpatch passed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20210810232530.1033519-1-richard.henderson@linaro.org
Maintainers: Richard Henderson <richard.henderson@linaro.org>
tcg/i386/tcg-target.c.inc | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
[PATCH for-6.1] tcg/i386: Split P_VEXW from P_REXW
Posted by Richard Henderson 2 years, 9 months ago
We need to be able to represent VEX.W on a 32-bit host, where REX.W
will always be zero.  Fixes the encoding for VPSLLVQ and VPSRLVQ.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/385
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 98d924b91a..997510109d 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -241,8 +241,9 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define P_EXT		0x100		/* 0x0f opcode prefix */
 #define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
 #define P_DATA16        0x400           /* 0x66 opcode prefix */
+#define P_VEXW          0x1000          /* Set VEX.W = 1 */
 #if TCG_TARGET_REG_BITS == 64
-# define P_REXW         0x1000          /* Set REX.W = 1 */
+# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
 # define P_REXB_R       0x2000          /* REG field as byte register */
 # define P_REXB_RM      0x4000          /* R/M field as byte register */
 # define P_GS           0x8000          /* gs segment override */
@@ -410,13 +411,13 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
-#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
+#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
 #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
 #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
-#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
+#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
 #define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
 #define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
-#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
+#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
 #define OPC_VZEROUPPER  (0x77 | P_EXT)
 #define OPC_XCHG_ax_r32	(0x90)
 
@@ -576,7 +577,7 @@ static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 
     /* Use the two byte form if possible, which cannot encode
        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
-    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
+    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
         && ((rm | index) & 8) == 0) {
         /* Two byte VEX prefix.  */
         tcg_out8(s, 0xc5);
@@ -601,7 +602,7 @@ static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
         tcg_out8(s, tmp);
 
-        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
+        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
     }
 
     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
-- 
2.25.1


Re: [PATCH for-6.1] tcg/i386: Split P_VEXW from P_REXW
Posted by Peter Maydell 2 years, 8 months ago
On Wed, 11 Aug 2021 at 00:26, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> We need to be able to represent VEX.W on a 32-bit host, where REX.W
> will always be zero.  Fixes the encoding for VPSLLVQ and VPSRLVQ.
>
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/385
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

This patch fixes the "wget https://github.com/ -O /dev/null" part of
the test case in issue 385, but not the "apt update" part: I see
this with an i686 qemu-arm binary:

root@e104462:/# apt-get update
Get:1 http://archive.raspberrypi.org/debian buster InRelease [32.6 kB]
Get:2 http://raspbian.raspberrypi.org/raspbian buster InRelease [15.0 kB]
Err:1 http://archive.raspberrypi.org/debian buster InRelease
  At least one invalid signature was encountered.
Err:2 http://raspbian.raspberrypi.org/raspbian buster InRelease
  At least one invalid signature was encountered.
Fetched 47.6 kB in 1s (91.3 kB/s)
Reading package lists... Done
W: An error occurred during the signature verification. The repository
is not updated and the previous index files will be used. GPG error:
http://archive.raspberrypi.org/debian buster InRelease: At least one
invalid signature was encountered.
W: An error occurred during the signature verification. The repository
is not updated and the previous index files will be used. GPG error:
http://raspbian.raspberrypi.org/raspbian buster InRelease: At least
one invalid signature was encountered.
W: Failed to fetch
http://raspbian.raspberrypi.org/raspbian/dists/buster/InRelease  At
least one invalid signature was encountered.
W: Failed to fetch
http://archive.raspberrypi.org/debian/dists/buster/InRelease  At least
one invalid signature was encountered.
W: Some index files failed to download. They have been ignored, or old
ones used instead.

whereas an x86-64 binary downloads everything without errors:

root@e104462:/# apt update
Get:1 http://archive.raspberrypi.org/debian buster InRelease [32.6 kB]
Get:2 http://raspbian.raspberrypi.org/raspbian buster InRelease [15.0 kB]
Get:3 http://archive.raspberrypi.org/debian buster/main armhf Packages
[378 kB]
Get:4 http://raspbian.raspberrypi.org/raspbian buster/main armhf
Packages [13.0 MB]
Fetched 13.4 MB in 49s (272 kB/s)
Reading package lists... Done
Building dependency tree
Reading state information... Done
44 packages can be upgraded. Run 'apt list --upgradable' to see them.


So there must still be another bug here...

-- PMM

Re: [PATCH for-6.1] tcg/i386: Split P_VEXW from P_REXW
Posted by Peter Maydell 2 years, 8 months ago
On Wed, 11 Aug 2021 at 00:26, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> We need to be able to represent VEX.W on a 32-bit host, where REX.W
> will always be zero.  Fixes the encoding for VPSLLVQ and VPSRLVQ.
>
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/385
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/i386/tcg-target.c.inc | 13 +++++++------
>  1 file changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
> index 98d924b91a..997510109d 100644
> --- a/tcg/i386/tcg-target.c.inc
> +++ b/tcg/i386/tcg-target.c.inc
> @@ -241,8 +241,9 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
>  #define P_EXT          0x100           /* 0x0f opcode prefix */
>  #define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
>  #define P_DATA16        0x400           /* 0x66 opcode prefix */
> +#define P_VEXW          0x1000          /* Set VEX.W = 1 */
>  #if TCG_TARGET_REG_BITS == 64
> -# define P_REXW         0x1000          /* Set REX.W = 1 */
> +# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
>  # define P_REXB_R       0x2000          /* REG field as byte register */
>  # define P_REXB_RM      0x4000          /* R/M field as byte register */
>  # define P_GS           0x8000          /* gs segment override */
> @@ -410,13 +411,13 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
>  #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
>  #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
>  #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
> -#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
> +#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
>  #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
>  #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
> -#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
> +#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
>  #define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
>  #define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
> -#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
> +#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
>  #define OPC_VZEROUPPER  (0x77 | P_EXT)
>  #define OPC_XCHG_ax_r32        (0x90)
>
> @@ -576,7 +577,7 @@ static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
>
>      /* Use the two byte form if possible, which cannot encode
>         VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
> -    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
> +    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
>          && ((rm | index) & 8) == 0) {
>          /* Two byte VEX prefix.  */
>          tcg_out8(s, 0xc5);
> @@ -601,7 +602,7 @@ static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
>          tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
>          tcg_out8(s, tmp);
>
> -        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
> +        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
>      }
>
>      tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */

These changes look OK as far as they go, but it's not clear to
me why the other places that set P_REXW are all OK to use P_REXW
and not P_VEXW. For instance tcg_out_mov() sets rexw = P_REXW
and some of the codepaths there will then pass that into
tcg_out_vex_modrm() which ends up in tcg_out_vex_opc().

More generally, is there somewhere we can assert that we
didn't try to use a REXW prefix for i386 codegen rather
than just silently ignoring it ?

thanks
-- PMM

Re: [PATCH for-6.1] tcg/i386: Split P_VEXW from P_REXW
Posted by Richard Henderson 2 years, 8 months ago
On 8/13/21 12:37 AM, Peter Maydell wrote:
> These changes look OK as far as they go, but it's not clear to
> me why the other places that set P_REXW are all OK to use P_REXW
> and not P_VEXW. For instance tcg_out_mov() sets rexw = P_REXW
> and some of the codepaths there will then pass that into
> tcg_out_vex_modrm() which ends up in tcg_out_vex_opc().

This distinguishes between 32-bit and 64-bit transfers between a vector register and a
general register, which of course doesn't make sense for i386.

> More generally, is there somewhere we can assert that we
> didn't try to use a REXW prefix for i386 codegen rather
> than just silently ignoring it ?

I guess tcg_out_opc might be a place.  But mostly we try to avoid generating such uses
in the first place.  E.g.

#if TCG_TARGET_REG_BITS == 64
# define OP_32_64(x) \
         case glue(glue(INDEX_op_, x), _i64): \
             rexw = P_REXW; /* FALLTHRU */    \
         case glue(glue(INDEX_op_, x), _i32)
#else
# define OP_32_64(x) \
         case glue(glue(INDEX_op_, x), _i32)
#endif


r~

Re: [PATCH for-6.1] tcg/i386: Split P_VEXW from P_REXW
Posted by Richard Henderson 2 years, 8 months ago
On 8/13/21 6:59 AM, Richard Henderson wrote:
> On 8/13/21 12:37 AM, Peter Maydell wrote:
>> These changes look OK as far as they go, but it's not clear to
>> me why the other places that set P_REXW are all OK to use P_REXW
>> and not P_VEXW. For instance tcg_out_mov() sets rexw = P_REXW
>> and some of the codepaths there will then pass that into
>> tcg_out_vex_modrm() which ends up in tcg_out_vex_opc().
> 
> This distinguishes between 32-bit and 64-bit transfer between vector and general register. 
>   Which of course doesn't make sense for i386.

I read this again and realized it doesn't really clear things up.

The older opcodes which originated with SSE used completely separate opcodes to talk about 
64-bit quantities within the vector registers.  E.g.

#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)

or even

#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)

It's only with the newest AVX2 instructions that they decided to use VEX.W to select
the size of the vector element, as opposed to the size of the general register on the
other end.  This includes the two vector shifts whose shift amount comes from a vector
argument (as opposed to an immediate):

#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)

I guess I can add
Fixes: a2ce146a068 ("tcg/i386: Support vector variable shift opcodes")

because I failed to consider that P_REXW was always 0 for 32-bit.


r~

Re: [PATCH for-6.1] tcg/i386: Split P_VEXW from P_REXW
Posted by Peter Maydell 2 years, 8 months ago
On Fri, 13 Aug 2021 at 17:59, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 8/13/21 12:37 AM, Peter Maydell wrote:
> > These changes look OK as far as they go, but it's not clear to
> > me why the other places that set P_REXW are all OK to use P_REXW
> > and not P_VEXW. For instance tcg_out_mov() sets rexw = P_REXW
> > and some of the codepaths there will then pass that into
> > tcg_out_vex_modrm() which ends up in tcg_out_vex_opc().
>
> This distinguishes between 32-bit and 64-bit transfer between vector and general register.
>   Which of course doesn't make sense for i386.
>
> > More generally, is there somewhere we can assert that we
> > didn't try to use a REXW prefix for i386 codegen rather
> > than just silently ignoring it ?
>
> I guess tcg_out_opc might be a place.  But mostly we try to avoid generating those places
> in the first place.  E.g.
>
> #if TCG_TARGET_REG_BITS == 64
> # define OP_32_64(x) \
>          case glue(glue(INDEX_op_, x), _i64): \
>              rexw = P_REXW; /* FALLTHRU */    \
>          case glue(glue(INDEX_op_, x), _i32)
> #else
> # define OP_32_64(x) \
>          case glue(glue(INDEX_op_, x), _i32)
> #endif

Right, if we do that everywhere we could make P_REXW the same value
on 32-bit and 64-bit hosts and assert that P_REXW doesn't ever actually
get passed to the functions where we look at it to generate
code. It's only if there are code paths which rely on P_REXW being 0
on i386 in order to not generate invalid code that an assert would
get awkward...

-- PMM

Re: [PATCH for-6.1] tcg/i386: Split P_VEXW from P_REXW
Posted by Richard Henderson 2 years, 8 months ago
Ping for review, or this slips to 6.2.

On 8/10/21 1:25 PM, Richard Henderson wrote:
> We need to be able to represent VEX.W on a 32-bit host, where REX.W
> will always be zero.  Fixes the encoding for VPSLLVQ and VPSRLVQ.
> 
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/385
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   tcg/i386/tcg-target.c.inc | 13 +++++++------
>   1 file changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
> index 98d924b91a..997510109d 100644
> --- a/tcg/i386/tcg-target.c.inc
> +++ b/tcg/i386/tcg-target.c.inc
> @@ -241,8 +241,9 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
>   #define P_EXT		0x100		/* 0x0f opcode prefix */
>   #define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
>   #define P_DATA16        0x400           /* 0x66 opcode prefix */
> +#define P_VEXW          0x1000          /* Set VEX.W = 1 */
>   #if TCG_TARGET_REG_BITS == 64
> -# define P_REXW         0x1000          /* Set REX.W = 1 */
> +# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
>   # define P_REXB_R       0x2000          /* REG field as byte register */
>   # define P_REXB_RM      0x4000          /* R/M field as byte register */
>   # define P_GS           0x8000          /* gs segment override */
> @@ -410,13 +411,13 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
>   #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
>   #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
>   #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
> -#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
> +#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
>   #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
>   #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
> -#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
> +#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
>   #define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
>   #define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
> -#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
> +#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
>   #define OPC_VZEROUPPER  (0x77 | P_EXT)
>   #define OPC_XCHG_ax_r32	(0x90)
>   
> @@ -576,7 +577,7 @@ static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
>   
>       /* Use the two byte form if possible, which cannot encode
>          VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
> -    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
> +    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
>           && ((rm | index) & 8) == 0) {
>           /* Two byte VEX prefix.  */
>           tcg_out8(s, 0xc5);
> @@ -601,7 +602,7 @@ static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
>           tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
>           tcg_out8(s, tmp);
>   
> -        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
> +        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
>       }
>   
>       tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
> 


Re: [PATCH for-6.1] tcg/i386: Split P_VEXW from P_REXW
Posted by Peter Maydell 2 years, 8 months ago
On Thu, 12 Aug 2021 at 19:29, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Ping for review, or this slips to 6.2.
>
> On 8/10/21 1:25 PM, Richard Henderson wrote:
> > We need to be able to represent VEX.W on a 32-bit host, where REX.W
> > will always be zero.  Fixes the encoding for VPSLLVQ and VPSRLVQ.
> >
> > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/385
> > Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

The bug report says this isn't a regression since 6.0, and the reporter
says there are still issues with this fix, so I think this is
probably going to go into 6.2 anyway. At any rate, I don't
think it's sufficiently release-critical to make us spin an rc4 if we
weren't going to anyway.

-- PMM