tcg/i386: Fix dupi/dupm for avx1 and 32-bit hosts

[Qemu-devel] [PATCH] tcg/i386: Fix dupi/dupm for avx1 and 32-bit hosts

Posted by Richard Henderson 6 years, 8 months ago

The VBROADCASTSD instruction only allows %ymm registers as destination.
Rather than forcing VEX.L and writing to the entire 256-bit register,
revert to using MOVDDUP with an %xmm register.  This is sufficient for
an avx1 host since we do not support TCG_TYPE_V256 for that case.

Also fix the 32-bit avx2 case, which should have used VPBROADCASTW.

Fixes: 1e262b49b533
Reported-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.inc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index aafd01cb49..b3601446cd 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -358,6 +358,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
 #define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
 #define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
+#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
@@ -921,7 +922,7 @@ static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
     } else {
         switch (vece) {
         case MO_64:
-            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSD, r, 0, base, offset);
+            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
             break;
         case MO_32:
             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
@@ -963,12 +964,12 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
         } else if (have_avx2) {
             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
         } else {
-            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSD, ret);
+            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
         }
         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
     } else {
         if (have_avx2) {
-            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSD + vex_l, ret);
+            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTW + vex_l, ret);
         } else {
             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
         }
-- 
2.17.1

Re: [Qemu-devel] [PATCH] tcg/i386: Fix dupi/dupm for avx1 and 32-bit hosts

Posted by Mark Cave-Ayland 6 years, 8 months ago

On 16/05/2019 23:50, Richard Henderson wrote:

> The VBROADCASTSD instruction only allows %ymm registers as destination.
> Rather than forcing VEX.L and writing to the entire 256-bit register,
> revert to using MOVDDUP with an %xmm register.  This is sufficient for
> an avx1 host since we do not support TCG_TYPE_V256 for that case.
> 
> Also fix the 32-bit avx2, which should have used VPBROADCASTW.
> 
> Fixes: 1e262b49b533
> Reported-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/i386/tcg-target.inc.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index aafd01cb49..b3601446cd 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> @@ -358,6 +358,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
>  #define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
>  #define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
>  #define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
> +#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
>  #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
>  #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
>  #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
> @@ -921,7 +922,7 @@ static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
>      } else {
>          switch (vece) {
>          case MO_64:
> -            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSD, r, 0, base, offset);
> +            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
>              break;
>          case MO_32:
>              tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
> @@ -963,12 +964,12 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
>          } else if (have_avx2) {
>              tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
>          } else {
> -            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSD, ret);
> +            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
>          }
>          new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
>      } else {
>          if (have_avx2) {
> -            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSD + vex_l, ret);
> +            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTW + vex_l, ret);
>          } else {
>              tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
>          }

Indeed, this fixes the issue for me here - thank you!

Tested-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>


ATB,

Mark.