[PATCH 18/25] target/i386: convert LZCNT/TZCNT/BSF/BSR/POPCNT to new decoder

Paolo Bonzini posted 25 patches 5 months, 2 weeks ago
There is a newer version of this series
[PATCH 18/25] target/i386: convert LZCNT/TZCNT/BSF/BSR/POPCNT to new decoder
Posted by Paolo Bonzini 5 months, 2 weeks ago
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/decode-new.h     |  1 +
 target/i386/tcg/translate.c      | 74 ----------------------------
 target/i386/tcg/decode-new.c.inc | 51 +++++++++++++++++++-
 target/i386/tcg/emit.c.inc       | 82 ++++++++++++++++++++++++++++++++
 4 files changed, 132 insertions(+), 76 deletions(-)

diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index f781bb5bbec..13be23145a8 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -119,6 +119,7 @@ typedef enum X86CPUIDFeature {
     X86_FEAT_FXSR,
     X86_FEAT_MOVBE,
     X86_FEAT_PCLMULQDQ,
+    X86_FEAT_POPCNT,
     X86_FEAT_SHA_NI,
     X86_FEAT_SSE,
     X86_FEAT_SSE2,
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 1e9036eb6e3..a9cf1332b43 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -818,11 +818,6 @@ static void gen_movs(DisasContext *s, MemOp ot)
     gen_op_add_reg(s, s->aflag, R_EDI, dshift);
 }
 
-static void gen_op_update1_cc(DisasContext *s)
-{
-    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
-}
-
 static void gen_op_update2_cc(DisasContext *s)
 {
     tcg_gen_mov_tl(cpu_cc_src, s->T1);
@@ -3167,56 +3162,6 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b)
         }
         break;
 
-    case 0x1bc: /* bsf / tzcnt */
-    case 0x1bd: /* bsr / lzcnt */
-        ot = dflag;
-        modrm = x86_ldub_code(env, s);
-        reg = ((modrm >> 3) & 7) | REX_R(s);
-        gen_ld_modrm(env, s, modrm, ot);
-        gen_extu(ot, s->T0);
-
-        /* Note that lzcnt and tzcnt are in different extensions.  */
-        if ((prefixes & PREFIX_REPZ)
-            && (b & 1
-                ? s->cpuid_ext3_features & CPUID_EXT3_ABM
-                : s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
-            int size = 8 << ot;
-            /* For lzcnt/tzcnt, C bit is defined related to the input. */
-            tcg_gen_mov_tl(cpu_cc_src, s->T0);
-            if (b & 1) {
-                /* For lzcnt, reduce the target_ulong result by the
-                   number of zeros that we expect to find at the top.  */
-                tcg_gen_clzi_tl(s->T0, s->T0, TARGET_LONG_BITS);
-                tcg_gen_subi_tl(s->T0, s->T0, TARGET_LONG_BITS - size);
-            } else {
-                /* For tzcnt, a zero input must return the operand size.  */
-                tcg_gen_ctzi_tl(s->T0, s->T0, size);
-            }
-            /* For lzcnt/tzcnt, Z bit is defined related to the result.  */
-            gen_op_update1_cc(s);
-            set_cc_op(s, CC_OP_BMILGB + ot);
-        } else {
-            /* For bsr/bsf, only the Z bit is defined and it is related
-               to the input and not the result.  */
-            tcg_gen_mov_tl(cpu_cc_dst, s->T0);
-            set_cc_op(s, CC_OP_LOGICB + ot);
-
-            /* ??? The manual says that the output is undefined when the
-               input is zero, but real hardware leaves it unchanged, and
-               real programs appear to depend on that.  Accomplish this
-               by passing the output as the value to return upon zero.  */
-            if (b & 1) {
-                /* For bsr, return the bit index of the first 1 bit,
-                   not the count of leading zeros.  */
-                tcg_gen_xori_tl(s->T1, cpu_regs[reg], TARGET_LONG_BITS - 1);
-                tcg_gen_clz_tl(s->T0, s->T0, s->T1);
-                tcg_gen_xori_tl(s->T0, s->T0, TARGET_LONG_BITS - 1);
-            } else {
-                tcg_gen_ctz_tl(s->T0, s->T0, cpu_regs[reg]);
-            }
-        }
-        gen_op_mov_reg_v(s, ot, reg, s->T0);
-        break;
     case 0x100:
         modrm = x86_ldub_code(env, s);
         mod = (modrm >> 6) & 3;
@@ -3811,25 +3756,6 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b)
         }
         gen_nop_modrm(env, s, modrm);
         break;
-    case 0x1b8: /* SSE4.2 popcnt */
-        if ((prefixes & (PREFIX_REPZ | PREFIX_LOCK | PREFIX_REPNZ)) !=
-             PREFIX_REPZ)
-            goto illegal_op;
-        if (!(s->cpuid_ext_features & CPUID_EXT_POPCNT))
-            goto illegal_op;
-
-        modrm = x86_ldub_code(env, s);
-        reg = ((modrm >> 3) & 7) | REX_R(s);
-
-        ot = dflag;
-        gen_ld_modrm(env, s, modrm, ot);
-        gen_extu(ot, s->T0);
-        tcg_gen_mov_tl(cpu_cc_src, s->T0);
-        tcg_gen_ctpop_tl(s->T0, s->T0);
-        gen_op_mov_reg_v(s, ot, reg, s->T0);
-
-        set_cc_op(s, CC_OP_POPCNT);
-        break;
     default:
         g_assert_not_reached();
     }
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index bd9e7cd4df9..64ec731bf4a 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -469,6 +469,50 @@ static void decode_0F7F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
     *entry = *decode_by_prefix(s, opcodes_0F7F);
 }
 
+static void decode_0FB8(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry popcnt =
+        X86_OP_ENTRYwr(POPCNT,    G,v, E,v,  cpuid(POPCNT) zextT0);
+    /* Only the REPZ-prefixed form is POPCNT; otherwise zero out the entry.  */
+    if (s->prefix & PREFIX_REPZ) {
+        *entry = popcnt;
+    } else {
+        memset(entry, 0, sizeof(*entry));
+    }
+}
+
+static void decode_0FBC(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    /* For BSF, pass 2op as the third operand so that we can use zextT0 */
+    static const X86OpEntry opcodes_0FBC[4] = {
+        X86_OP_ENTRY3(BSF,    G,v, E,v, 2op,v, zextT0),
+        X86_OP_ENTRY3(BSF,    G,v, E,v, 2op,v, zextT0), /* 0x66 */
+        X86_OP_ENTRYwr(TZCNT, G,v, E,v,        zextT0), /* 0xf3 */
+        X86_OP_ENTRY3(BSF,    G,v, E,v, 2op,v, zextT0), /* 0xf2 */
+    };
+    if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
+        *entry = opcodes_0FBC[0]; /* TZCNT needs BMI1; else always BSF */
+    } else {
+        *entry = *decode_by_prefix(s, opcodes_0FBC);
+    }
+}
+
+static void decode_0FBD(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    /* For BSR, pass 2op as the third operand so that we can use zextT0 */
+    static const X86OpEntry opcodes_0FBD[4] = {
+        X86_OP_ENTRY3(BSR,    G,v, E,v, 2op,v, zextT0),
+        X86_OP_ENTRY3(BSR,    G,v, E,v, 2op,v, zextT0), /* 0x66 */
+        X86_OP_ENTRYwr(LZCNT, G,v, E,v,        zextT0), /* 0xf3 */
+        X86_OP_ENTRY3(BSR,    G,v, E,v, 2op,v, zextT0), /* 0xf2 */
+    };
+    if (!(s->cpuid_ext3_features & CPUID_EXT3_ABM)) {
+        *entry = opcodes_0FBD[0]; /* LZCNT needs ABM; else always BSR */
+    } else {
+        *entry = *decode_by_prefix(s, opcodes_0FBD);
+    }
+}
+
 static void decode_0FD6(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
     static const X86OpEntry movq[4] = {
@@ -1273,10 +1317,13 @@ static const X86OpEntry opcodes_0F[256] = {
      */
     [0xaf] = X86_OP_ENTRY3(IMUL3,  G,v, E,v, 2op,v, sextT0),
 
+    [0xb8] = X86_OP_GROUP0(0FB8),
     /* decoded as modrm, which is visible as a difference between page fault and #UD */
     [0xb9] = X86_OP_ENTRYr(UD,     nop,v),                        /* UD1 */
     [0xba] = X86_OP_GROUP2(group8, E,v, I,b),
     [0xbb] = X86_OP_ENTRY2(BTC,    E,v, G,v,             btEvGv),
+    [0xbc] = X86_OP_GROUP0(0FBC),
+    [0xbd] = X86_OP_GROUP0(0FBD),
     [0xbe] = X86_OP_ENTRY3(MOV,    G,v, E,b, None, None, sextT0), /* MOVSX */
     [0xbf] = X86_OP_ENTRY3(MOV,    G,v, E,w, None, None, sextT0), /* MOVSX */
 
@@ -2174,6 +2221,8 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
         return (s->cpuid_ext_features & CPUID_EXT_MOVBE);
     case X86_FEAT_PCLMULQDQ:
         return (s->cpuid_ext_features & CPUID_EXT_PCLMULQDQ);
+    case X86_FEAT_POPCNT:
+        return (s->cpuid_ext_features & CPUID_EXT_POPCNT);
     case X86_FEAT_SSE:
         return (s->cpuid_features & CPUID_SSE);
     case X86_FEAT_SSE2:
@@ -2562,8 +2611,6 @@ static void disas_insn(DisasContext *s, CPUState *cpu)
             case 0x00 ... 0x01: /* mostly privileged instructions */
             case 0x1a ... 0x1b: /* MPX */
             case 0xb0 ... 0xb1: /* cmpxchg */
-            case 0xb8:          /* POPCNT */
-            case 0xbc ... 0xbd: /* LZCNT/TZCNT */
             case 0xc0 ... 0xc1: /* xadd */
             case 0xc7:          /* grp9 */
                 disas_insn_old(s, cpu, b + 0x100);
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index aabc86669c2..2fbf2a5ce8c 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -1345,6 +1345,47 @@ static void gen_BOUND(DisasContext *s, X86DecodedInsn *decode)
     }
 }
 
+/* Non-standard convention - on entry T0 is zero-extended input, T1 is the output.  */
+static void gen_BSF(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* Only the Z bit is defined and it is related to the input.  */
+    decode->cc_dst = tcg_temp_new();
+    decode->cc_op = CC_OP_LOGICB + ot;
+    tcg_gen_mov_tl(decode->cc_dst, s->T0);
+
+    /*
+     * The manual says that the output is undefined when the
+     * input is zero, but real hardware leaves it unchanged, and
+     * real programs appear to depend on that.  Accomplish this
+     * by passing the output as the value to return upon zero.
+     */
+    tcg_gen_ctz_tl(s->T0, s->T0, s->T1); /* yields T1 when T0 is zero */
+}
+
+/* Non-standard convention - on entry T0 is zero-extended input, T1 is the output.  */
+static void gen_BSR(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* Only the Z bit is defined and it is related to the input.  */
+    decode->cc_dst = tcg_temp_new();
+    decode->cc_op = CC_OP_LOGICB + ot;
+    tcg_gen_mov_tl(decode->cc_dst, s->T0);
+
+    /*
+     * The manual says that the output is undefined when the input is
+     * zero, but real hardware leaves it unchanged, and real programs
+     * appear to depend on that; so pass the output (T1) as the value
+     * to return upon zero.  XOR with TARGET_LONG_BITS - 1 turns the clz
+     * count into a bit index; T1 is pre-XORed so the last XOR undoes it.
+     */
+    tcg_gen_xori_tl(s->T1, s->T1, TARGET_LONG_BITS - 1);
+    tcg_gen_clz_tl(s->T0, s->T0, s->T1);
+    tcg_gen_xori_tl(s->T0, s->T0, TARGET_LONG_BITS - 1);
+}
+
 static void gen_BSWAP(DisasContext *s, X86DecodedInsn *decode)
 {
 #ifdef TARGET_X86_64
@@ -2254,6 +2295,24 @@ static void gen_LSS(DisasContext *s, X86DecodedInsn *decode)
     gen_lxx_seg(s, decode, R_SS);
 }
 
+static void gen_LZCNT(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* C (cc_src) is defined by the input, Z (cc_dst = T0) by the result.  */
+    decode->cc_src = tcg_temp_new();
+    decode->cc_dst = s->T0;
+    decode->cc_op = CC_OP_BMILGB + ot;
+    tcg_gen_mov_tl(decode->cc_src, s->T0);
+
+    /*
+     * Reduce the target_ulong result by the number of zeros that
+     * we expect to find at the top.
+     */
+    tcg_gen_clzi_tl(s->T0, s->T0, TARGET_LONG_BITS);
+    tcg_gen_subi_tl(s->T0, s->T0, TARGET_LONG_BITS - (8 << ot));
+}
+
 static void gen_MFENCE(DisasContext *s, X86DecodedInsn *decode)
 {
     tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
@@ -2812,6 +2871,15 @@ static void gen_POPA(DisasContext *s, X86DecodedInsn *decode)
     gen_popa(s);
 }
 
+static void gen_POPCNT(DisasContext *s, X86DecodedInsn *decode)
+{
+    decode->cc_src = tcg_temp_new();
+    decode->cc_op = CC_OP_POPCNT;
+    /* Copy the input to cc_src first: the ctpop below overwrites T0.  */
+    tcg_gen_mov_tl(decode->cc_src, s->T0);
+    tcg_gen_ctpop_tl(s->T0, s->T0);
+}
+
 static void gen_POPF(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot;
@@ -3893,6 +3961,20 @@ static void gen_SYSRET(DisasContext *s, X86DecodedInsn *decode)
     s->base.is_jmp = DISAS_EOB_RECHECK_TF;
 }
 
+static void gen_TZCNT(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* C (cc_src) is defined by the input, Z (cc_dst = T0) by the result.  */
+    decode->cc_src = tcg_temp_new();
+    decode->cc_dst = s->T0;
+    decode->cc_op = CC_OP_BMILGB + ot;
+    tcg_gen_mov_tl(decode->cc_src, s->T0);
+
+    /* A zero input returns the operand size.  */
+    tcg_gen_ctzi_tl(s->T0, s->T0, 8 << ot);
+}
+
 static void gen_UD(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_illegal_opcode(s);
-- 
2.45.1
Re: [PATCH 18/25] target/i386: convert LZCNT/TZCNT/BSF/BSR/POPCNT to new decoder
Posted by Richard Henderson 5 months, 2 weeks ago
On 6/8/24 01:41, Paolo Bonzini wrote:
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   target/i386/tcg/decode-new.h     |  1 +
>   target/i386/tcg/translate.c      | 74 ----------------------------
>   target/i386/tcg/decode-new.c.inc | 51 +++++++++++++++++++-
>   target/i386/tcg/emit.c.inc       | 82 ++++++++++++++++++++++++++++++++
>   4 files changed, 132 insertions(+), 76 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~