[PATCH 06/10] target/i386: convert CMPXCHG8B/CMPXCHG16B to new decoder

Paolo Bonzini posted 10 patches 5 months, 1 week ago
Maintainers: Paolo Bonzini <pbonzini@redhat.com>, Richard Henderson <richard.henderson@linaro.org>, Eduardo Habkost <eduardo@habkost.net>
[PATCH 06/10] target/i386: convert CMPXCHG8B/CMPXCHG16B to new decoder
Posted by Paolo Bonzini 5 months, 1 week ago
This moves the last LOCK-enabled instructions to the new decoder.  It is now
possible to assume that PREFIX_LOCK gen_multi0F is called only after checking
that LOCK was not specified.

The gen_cmpxchg8b and gen_cmpxchg16b functions even have the correct
prototype already; the only thing that needs to be done is removing the
gen_lea_modrm() call.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/decode-new.h     |   2 +
 target/i386/tcg/translate.c      | 121 +------------------------------
 target/i386/tcg/decode-new.c.inc |  34 ++++++---
 target/i386/tcg/emit.c.inc       |  96 ++++++++++++++++++++++++
 4 files changed, 124 insertions(+), 129 deletions(-)

diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index bebc77bd54b..7f23d373ea7 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -114,6 +114,8 @@ typedef enum X86CPUIDFeature {
     X86_FEAT_CLWB,
     X86_FEAT_CMOV,
     X86_FEAT_CMPCCXADD,
+    X86_FEAT_CX8,
+    X86_FEAT_CX16,
     X86_FEAT_F16C,
     X86_FEAT_FMA,
     X86_FEAT_FSGSBASE,
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 1d845ff66bb..c60f18c7482 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2298,104 +2298,6 @@ static void gen_sty_env_A0(DisasContext *s, int offset, bool align)
     tcg_gen_qemu_st_i128(t, s->tmp0, mem_index, mop);
 }
 
-static void gen_cmpxchg8b(DisasContext *s, X86DecodedInsn *decode)
-{
-    TCGv_i64 cmp, val, old;
-    TCGv Z;
-
-    gen_lea_modrm(s, decode);
-
-    cmp = tcg_temp_new_i64();
-    val = tcg_temp_new_i64();
-    old = tcg_temp_new_i64();
-
-    /* Construct the comparison values from the register pair. */
-    tcg_gen_concat_tl_i64(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
-    tcg_gen_concat_tl_i64(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
-
-    /* Only require atomic with LOCK; non-parallel handled in generator. */
-    if (s->prefix & PREFIX_LOCK) {
-        tcg_gen_atomic_cmpxchg_i64(old, s->A0, cmp, val, s->mem_index, MO_TEUQ);
-    } else {
-        tcg_gen_nonatomic_cmpxchg_i64(old, s->A0, cmp, val,
-                                      s->mem_index, MO_TEUQ);
-    }
-
-    /* Set tmp0 to match the required value of Z. */
-    tcg_gen_setcond_i64(TCG_COND_EQ, cmp, old, cmp);
-    Z = tcg_temp_new();
-    tcg_gen_trunc_i64_tl(Z, cmp);
-
-    /*
-     * Extract the result values for the register pair.
-     * For 32-bit, we may do this unconditionally, because on success (Z=1),
-     * the old value matches the previous value in EDX:EAX.  For x86_64,
-     * the store must be conditional, because we must leave the source
-     * registers unchanged on success, and zero-extend the writeback
-     * on failure (Z=0).
-     */
-    if (TARGET_LONG_BITS == 32) {
-        tcg_gen_extr_i64_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], old);
-    } else {
-        TCGv zero = tcg_constant_tl(0);
-
-        tcg_gen_extr_i64_tl(s->T0, s->T1, old);
-        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EAX], Z, zero,
-                           s->T0, cpu_regs[R_EAX]);
-        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EDX], Z, zero,
-                           s->T1, cpu_regs[R_EDX]);
-    }
-
-    /* Update Z. */
-    gen_compute_eflags(s);
-    tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, Z, ctz32(CC_Z), 1);
-}
-
-#ifdef TARGET_X86_64
-static void gen_cmpxchg16b(DisasContext *s, X86DecodedInsn *decode)
-{
-    MemOp mop = MO_TE | MO_128 | MO_ALIGN;
-    TCGv_i64 t0, t1;
-    TCGv_i128 cmp, val;
-
-    gen_lea_modrm(s, decode);
-
-    cmp = tcg_temp_new_i128();
-    val = tcg_temp_new_i128();
-    tcg_gen_concat_i64_i128(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
-    tcg_gen_concat_i64_i128(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
-
-    /* Only require atomic with LOCK; non-parallel handled in generator. */
-    if (s->prefix & PREFIX_LOCK) {
-        tcg_gen_atomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop);
-    } else {
-        tcg_gen_nonatomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop);
-    }
-
-    tcg_gen_extr_i128_i64(s->T0, s->T1, val);
-
-    /* Determine success after the fact. */
-    t0 = tcg_temp_new_i64();
-    t1 = tcg_temp_new_i64();
-    tcg_gen_xor_i64(t0, s->T0, cpu_regs[R_EAX]);
-    tcg_gen_xor_i64(t1, s->T1, cpu_regs[R_EDX]);
-    tcg_gen_or_i64(t0, t0, t1);
-
-    /* Update Z. */
-    gen_compute_eflags(s);
-    tcg_gen_setcondi_i64(TCG_COND_EQ, t0, t0, 0);
-    tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, t0, ctz32(CC_Z), 1);
-
-    /*
-     * Extract the result values for the register pair.  We may do this
-     * unconditionally, because on success (Z=1), the old value matches
-     * the previous value in RDX:RAX.
-     */
-    tcg_gen_mov_i64(cpu_regs[R_EAX], s->T0);
-    tcg_gen_mov_i64(cpu_regs[R_EDX], s->T1);
-}
-#endif
-
 #include "emit.c.inc"
 
 static void gen_x87(DisasContext *s, X86DecodedInsn *decode)
@@ -2971,29 +2873,10 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode)
 
     /* now check op code */
     switch (b) {
-    case 0x1c7: /* cmpxchg8b */
+    case 0x1c7: /* RDSEED, RDPID with f3 prefix */
         mod = (modrm >> 6) & 3;
         switch ((modrm >> 3) & 7) {
-        case 1: /* CMPXCHG8, CMPXCHG16 */
-            if (mod == 3) {
-                goto illegal_op;
-            }
-#ifdef TARGET_X86_64
-            if (dflag == MO_64) {
-                if (!(s->cpuid_ext_features & CPUID_EXT_CX16)) {
-                    goto illegal_op;
-                }
-                gen_cmpxchg16b(s, decode);
-                break;
-            }
-#endif
-            if (!(s->cpuid_features & CPUID_CX8)) {
-                goto illegal_op;
-            }
-            gen_cmpxchg8b(s, decode);
-            break;
-
-        case 7: /* RDSEED, RDPID with f3 prefix */
+        case 7:
             if (mod != 3 ||
                 (s->prefix & (PREFIX_LOCK | PREFIX_REPNZ))) {
                 goto illegal_op;
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 45f4aed4611..fa51aadfcf2 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -288,6 +288,25 @@ static void decode_group8(DisasContext *s, CPUX86State *env, X86OpEntry *entry,
     }
 }
 
+static void decode_group9(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry group9_reg =
+        X86_OP_ENTRY0(multi0F);  /* unconverted */
+    static const X86OpEntry cmpxchg8b =
+        X86_OP_ENTRY1(CMPXCHG8B,  M,q,  lock p_00 cpuid(CX8));
+    static const X86OpEntry cmpxchg16b =
+        X86_OP_ENTRY1(CMPXCHG16B, M,dq, lock p_00 cpuid(CX16));
+
+    int modrm = get_modrm(s, env);
+    int op = (modrm >> 3) & 7;
+
+    if ((modrm >> 6) == 3) {
+        *entry = group9_reg;
+    } else if (op == 1) {
+        *entry = REX_W(s) ? cmpxchg16b : cmpxchg8b;
+    }
+}
+
 static void decode_group15(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
     static const X86OpEntry group15_reg[8] = {
@@ -1203,7 +1222,7 @@ static const X86OpEntry opcodes_0F[256] = {
     [0xc4] = X86_OP_ENTRY4(PINSRW,     V,dq,H,dq,E,w,       vex5 mmx p_00_66),
     [0xc5] = X86_OP_ENTRY3(PEXTRW,     G,d, U,dq,I,b,       vex5 mmx p_00_66),
     [0xc6] = X86_OP_ENTRY4(VSHUF,      V,x, H,x, W,x,       vex4 p_00_66),
-    [0xc7] = X86_OP_ENTRY1(multi0F,    nop,v,               nolea), /* unconverted */
+    [0xc7] = X86_OP_GROUP0(group9),
 
     [0xd0] = X86_OP_ENTRY3(VADDSUB,   V,x, H,x, W,x,        vex2 cpuid(SSE3) p_66_f2),
     [0xd1] = X86_OP_ENTRY3(PSRLW_r,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
@@ -2241,8 +2260,12 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
         return (s->cpuid_features & CPUID_CMOV);
     case X86_FEAT_CLFLUSH:
         return (s->cpuid_features & CPUID_CLFLUSH);
+    case X86_FEAT_CX8:
+        return (s->cpuid_features & CPUID_CX8);
     case X86_FEAT_FXSR:
         return (s->cpuid_features & CPUID_FXSR);
+    case X86_FEAT_CX16:
+        return (s->cpuid_ext_features & CPUID_EXT_CX16);
     case X86_FEAT_F16C:
         return (s->cpuid_ext_features & CPUID_EXT_F16C);
     case X86_FEAT_FMA:
@@ -2722,15 +2745,6 @@ static void disas_insn(DisasContext *s, CPUState *cpu)
         break;
     }
 
-    /*
-     * hack for old decoder: 0F C7 has both instructions that accept LOCK
-     * and instructions that don't, but also needs X86_SPECIAL_NoLoadEA.
-     * Keep this here until CMPXCHG8B/CMPXCHG16B is separated from the
-     * other unconverted opcodes.
-     */
-    if (decode.e.gen == gen_multi0F) {
-        accept_lock = true;
-    }
     if ((s->prefix & PREFIX_LOCK) && !accept_lock) {
         goto illegal_op;
     }
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index edadb51ae89..98f10ad8b4e 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -1782,6 +1782,102 @@ static void gen_CMPXCHG(DisasContext *s, X86DecodedInsn *decode)
     decode->cc_op = CC_OP_SUBB + ot;
 }
 
+static void gen_CMPXCHG16B(DisasContext *s, X86DecodedInsn *decode)
+{
+#ifdef TARGET_X86_64
+    MemOp mop = MO_TE | MO_128 | MO_ALIGN;
+    TCGv_i64 t0, t1;
+    TCGv_i128 cmp, val;
+
+    cmp = tcg_temp_new_i128();
+    val = tcg_temp_new_i128();
+    tcg_gen_concat_i64_i128(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
+    tcg_gen_concat_i64_i128(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
+
+    /* Only require atomic with LOCK; non-parallel handled in generator. */
+    if (s->prefix & PREFIX_LOCK) {
+        tcg_gen_atomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop);
+    } else {
+        tcg_gen_nonatomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop);
+    }
+
+    tcg_gen_extr_i128_i64(s->T0, s->T1, val);
+
+    /* Determine success after the fact. */
+    t0 = tcg_temp_new_i64();
+    t1 = tcg_temp_new_i64();
+    tcg_gen_xor_i64(t0, s->T0, cpu_regs[R_EAX]);
+    tcg_gen_xor_i64(t1, s->T1, cpu_regs[R_EDX]);
+    tcg_gen_or_i64(t0, t0, t1);
+
+    /* Update Z. */
+    gen_compute_eflags(s);
+    tcg_gen_setcondi_i64(TCG_COND_EQ, t0, t0, 0);
+    tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, t0, ctz32(CC_Z), 1);
+
+    /*
+     * Extract the result values for the register pair.  We may do this
+     * unconditionally, because on success (Z=1), the old value matches
+     * the previous value in RDX:RAX.
+     */
+    tcg_gen_mov_i64(cpu_regs[R_EAX], s->T0);
+    tcg_gen_mov_i64(cpu_regs[R_EDX], s->T1);
+#else
+    abort();
+#endif
+}
+
+static void gen_CMPXCHG8B(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv_i64 cmp, val, old;
+    TCGv Z;
+
+    cmp = tcg_temp_new_i64();
+    val = tcg_temp_new_i64();
+    old = tcg_temp_new_i64();
+
+    /* Construct the comparison values from the register pair. */
+    tcg_gen_concat_tl_i64(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
+    tcg_gen_concat_tl_i64(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
+
+    /* Only require atomic with LOCK; non-parallel handled in generator. */
+    if (s->prefix & PREFIX_LOCK) {
+        tcg_gen_atomic_cmpxchg_i64(old, s->A0, cmp, val, s->mem_index, MO_TEUQ);
+    } else {
+        tcg_gen_nonatomic_cmpxchg_i64(old, s->A0, cmp, val,
+                                      s->mem_index, MO_TEUQ);
+    }
+
+    /* Set tmp0 to match the required value of Z. */
+    tcg_gen_setcond_i64(TCG_COND_EQ, cmp, old, cmp);
+    Z = tcg_temp_new();
+    tcg_gen_trunc_i64_tl(Z, cmp);
+
+    /*
+     * Extract the result values for the register pair.
+     * For 32-bit, we may do this unconditionally, because on success (Z=1),
+     * the old value matches the previous value in EDX:EAX.  For x86_64,
+     * the store must be conditional, because we must leave the source
+     * registers unchanged on success, and zero-extend the writeback
+     * on failure (Z=0).
+     */
+    if (TARGET_LONG_BITS == 32) {
+        tcg_gen_extr_i64_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], old);
+    } else {
+        TCGv zero = tcg_constant_tl(0);
+
+        tcg_gen_extr_i64_tl(s->T0, s->T1, old);
+        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EAX], Z, zero,
+                           s->T0, cpu_regs[R_EAX]);
+        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EDX], Z, zero,
+                           s->T1, cpu_regs[R_EDX]);
+    }
+
+    /* Update Z. */
+    gen_compute_eflags(s);
+    tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, Z, ctz32(CC_Z), 1);
+}
+
 static void gen_CPUID(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
-- 
2.45.2
Re: [PATCH 06/10] target/i386: convert CMPXCHG8B/CMPXCHG16B to new decoder
Posted by Richard Henderson 5 months, 1 week ago
On 6/20/24 02:54, Paolo Bonzini wrote:
> This moves the last LOCK-enabled instructions to the new decoder.  It is now
> possible to assume that PREFIX_LOCK gen_multi0F is called only after checking
> that LOCK was not specified.
> 
> The gen_cmpxchg8b and gen_cmpxchg16b functions even have the correct
> prototype already; the only thing that needs to be done is removing the
> gen_lea_modrm() call.
> 
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   target/i386/tcg/decode-new.h     |   2 +
>   target/i386/tcg/translate.c      | 121 +------------------------------
>   target/i386/tcg/decode-new.c.inc |  34 ++++++---
>   target/i386/tcg/emit.c.inc       |  96 ++++++++++++++++++++++++
>   4 files changed, 124 insertions(+), 129 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~