[PATCH 08/18] target/i386: implement CMPccXADD

Paolo Bonzini posted 18 patches 1 year, 1 month ago
Maintainers: Paolo Bonzini <pbonzini@redhat.com>, Richard Henderson <richard.henderson@linaro.org>, Eduardo Habkost <eduardo@habkost.net>
There is a newer version of this series
[PATCH 08/18] target/i386: implement CMPccXADD
Posted by Paolo Bonzini 1 year, 1 month ago
The main difficulty here is that a page fault when writing to the destination
must not overwrite the flags.  Therefore, the compute-flags helper must be
called with a temporary destination instead of using gen_jcc1*.

For simplicity, I am using an unconditional cmpxchg operation, which becomes
a NOP (it stores the old value back) if the comparison fails.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.c                |  2 +-
 target/i386/tcg/decode-new.c.inc | 30 ++++++++++
 target/i386/tcg/decode-new.h     |  2 +
 target/i386/tcg/emit.c.inc       | 98 ++++++++++++++++++++++++++++++++
 target/i386/tcg/translate.c      |  2 +
 5 files changed, 133 insertions(+), 1 deletion(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 8beb989701c..80f0445301b 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -738,7 +738,7 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
 #define TCG_7_0_EDX_FEATURES (CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_KERNEL_FEATURES)
 
 #define TCG_7_1_EAX_FEATURES (CPUID_7_1_EAX_FZRM | CPUID_7_1_EAX_FSRS | \
-          CPUID_7_1_EAX_FSRC)
+          CPUID_7_1_EAX_FSRC | CPUID_7_1_EAX_CMPCCXADD)
 #define TCG_7_1_EDX_FEATURES 0
 #define TCG_7_2_EDX_FEATURES 0
 #define TCG_APM_FEATURES 0
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index bad561ff66d..01c46e6a789 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -516,6 +516,28 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = {
     [0xdd] = X86_OP_ENTRY3(VAESENCLAST, V,x,  H,x,       W,x,  vex4 cpuid(AES) p_66),
     [0xde] = X86_OP_ENTRY3(VAESDEC,     V,x,  H,x,       W,x,  vex4 cpuid(AES) p_66),
     [0xdf] = X86_OP_ENTRY3(VAESDECLAST, V,x,  H,x,       W,x,  vex4 cpuid(AES) p_66),
+
+    /*
+     * REG selects srcdest2 operand, VEX.vvvv selects src3.  VEX class not found
+     * in manual, assumed to be 13 from the VEX.L0 = constraint.
+     */
+    [0xe0] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xe1] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xe2] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xe3] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xe4] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xe5] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xe6] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xe7] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+
+    [0xe8] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xe9] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xea] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xeb] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xec] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xed] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xee] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+    [0xef] = X86_OP_ENTRY3(CMPccXADD,   EM,y, G,y, B,y,  vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
 };
 
 /* five rows for no prefix, 66, F3, F2, 66+F2  */
@@ -1273,8 +1295,13 @@ static bool decode_op(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
 
     case X86_TYPE_WM:  /* modrm byte selects an XMM/YMM memory operand */
         op->unit = X86_OP_SSE;
+        goto get_modrm_mem;
+
+    case X86_TYPE_EM:  /* modrm byte selects an ALU memory operand */
+        op->unit = X86_OP_INT;
         /* fall through */
     case X86_TYPE_M:  /* modrm byte selects a memory operand */
+    get_modrm_mem:
         modrm = get_modrm(s, env);
         if ((modrm >> 6) == 3) {
             return false;
@@ -1511,6 +1538,9 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2);
     case X86_FEAT_SHA_NI:
         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_SHA_NI);
+
+    case X86_FEAT_CMPCCXADD:
+        return (s->cpuid_7_1_eax_features & CPUID_7_1_EAX_CMPCCXADD);
     }
     g_assert_not_reached();
 }
diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index b2879136614..b22de02ce54 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -47,6 +47,7 @@ typedef enum X86OpType {
     X86_TYPE_Y, /* string destination */
 
     /* Custom */
+    X86_TYPE_EM, /* modrm byte selects an ALU memory operand */
     X86_TYPE_WM, /* modrm byte selects an XMM/YMM memory operand */
     X86_TYPE_2op, /* 2-operand RMW instruction */
     X86_TYPE_LoBits, /* encoded in bits 0-2 of the operand + REX.B */
@@ -104,6 +105,7 @@ typedef enum X86CPUIDFeature {
     X86_FEAT_AVX2,
     X86_FEAT_BMI1,
     X86_FEAT_BMI2,
+    X86_FEAT_CMPCCXADD,
     X86_FEAT_F16C,
     X86_FEAT_FMA,
     X86_FEAT_MOVBE,
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index b5dfdc409e5..9f70e9dbaa6 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -1168,6 +1168,104 @@ static void gen_BZHI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     prepare_update1_cc(decode, s, CC_OP_BMILGB + ot);
 }
 
+static void gen_CMPccXADD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGv z_tl = tcg_constant_tl(0);              /* constant zero */
+    TCGLabel *label_top = gen_new_label();       /* start of the cmpxchg retry loop */
+    TCGLabel *label_bottom = gen_new_label();    /* loop exit */
+    TCGv oldv = tcg_temp_new();                  /* value returned by the atomic cmpxchg */
+    TCGv memv = tcg_temp_new();                  /* value loaded from s->A0 */
+    TCGv newv = tcg_temp_new();                  /* value offered to the cmpxchg */
+    TCGv cmpv = tcg_temp_new();                  /* extended copy of register operand 1 */
+    TCGv tmp_cc = tcg_temp_new();                /* output of the cc_compute_all helper */
+
+    TCGv cmp_lhs, cmp_rhs;    /* operands of the TCG comparison that decides the store */
+    MemOp ot, ot_full;        /* ot: operand 0 size; ot_full: ot | MO_LE, optionally MO_SIGN */
+
+    int jcc_op = (decode->b >> 1) & 7;    /* condition selector, ignoring opcode bit 0 */
+    static const uint8_t cond[16] = {     /* TCG condition, indexed by decode->b & 15 */
+        TCG_COND_NE,  /* o, just test OF=1 */
+        TCG_COND_EQ,  /* no, just test OF=0 */
+        TCG_COND_LTU, /* b */
+        TCG_COND_GEU, /* ae (nb) */
+        TCG_COND_EQ,  /* z */
+        TCG_COND_NE,  /* nz */
+        TCG_COND_LEU, /* be */
+        TCG_COND_GTU, /* a (nbe) */
+        TCG_COND_LT,  /* s, compares result against 0 */
+        TCG_COND_GE,  /* ns, compares result against 0 */
+        TCG_COND_NE,  /* p, just test PF=1 */
+        TCG_COND_EQ,  /* np, just test PF=0 */
+        TCG_COND_LT,  /* l */
+        TCG_COND_GE,  /* ge (nl) */
+        TCG_COND_LE,  /* le */
+        TCG_COND_GT,  /* g (nle) */
+    };
+
+    ot = decode->op[0].ot;
+    ot_full = ot | MO_LE;
+    if (jcc_op >= JCC_S) {
+        /*
+         * Sign-extend values before subtracting for S, P (where zero vs. sign
+         * extension does not matter), L, LE and their inverses.
+         */
+        ot_full |= MO_SIGN;
+    }
+
+    gen_ext_tl(cmpv, cpu_regs[decode->op[1].n], ot_full);    /* compare operand */
+
+    /*
+     * Cmpxchg loop starts here; each pass reloads memv and redoes the comparison.
+     * s->A0: dest address; cmpv: compare operand; s->T1: addition operand.
+     */
+    gen_set_label(label_top);
+    gen_op_ld_v(s, ot_full, memv, s->A0);
+    tcg_gen_sub_tl(s->T0, memv, cmpv);    /* s->T0 = memv - cmpv */
+
+    /* Pick cmp_lhs/cmp_rhs for the comparison but do not clobber cc_* yet.  */
+    switch (jcc_op) {
+    case JCC_O:
+    case JCC_P:
+        tcg_gen_sub_tl(s->T0, memv, cmpv);    /* NOTE(review): repeats the sub above */
+        gen_helper_cc_compute_all(tmp_cc, s->T0, cmpv, z_tl,
+                                  tcg_constant_i32(CC_OP_SUBB + ot));
+        decode->cc_src = tmp_cc;
+        set_cc_op(s, CC_OP_EFLAGS);
+
+        tcg_gen_andi_tl(s->T0, tmp_cc, (jcc_op == JCC_O ? CC_O : CC_P));    /* keep one flag bit */
+        cmp_lhs = s->T0, cmp_rhs = z_tl;    /* test that flag bit against zero */
+        break;
+
+    case JCC_S:
+        cmp_lhs = s->T0, cmp_rhs = z_tl;    /* compare the difference against zero */
+        goto cc_sub;
+
+    default:
+        cmp_lhs = memv, cmp_rhs = cmpv;     /* compare the two operands directly */
+    cc_sub:
+        decode->cc_dst = s->T0;
+        decode->cc_src = cmpv;
+        decode->cc_srcT = memv;
+        set_cc_op(s, CC_OP_SUBB + ot);
+        break;
+    }
+
+    /* Compute new value: if the condition does not hold, just store back memv.  */
+    tcg_gen_add_tl(newv, memv, s->T1);
+    tcg_gen_movcond_tl(cond[decode->b & 15], newv, cmp_lhs, cmp_rhs, newv, memv);
+    tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, memv, newv, s->mem_index, ot_full);
+
+    /* Exit unconditionally if cmpxchg succeeded (memory still held memv).  */
+    tcg_gen_brcond_tl(TCG_COND_EQ, oldv, memv, label_bottom);
+
+    /* Cmpxchg failed: try again only if the condition holds, i.e. there was a store to make.  */
+    tcg_gen_brcond_tl(cond[decode->b & 15], cmp_lhs, cmp_rhs, label_top);
+    gen_set_label(label_bottom);
+
+    /* Write the value loaded from memory back to operand 1 only once the loop has finished.  */
+    gen_writeback(s, decode, 1, memv);
+}
+
 static void gen_CRC32(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[2].ot;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index d7d6c85877d..038151a8c3e 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -122,6 +122,7 @@ typedef struct DisasContext {
     int cpuid_ext3_features;
     int cpuid_7_0_ebx_features;
     int cpuid_7_0_ecx_features;
+    int cpuid_7_1_eax_features;
     int cpuid_xsave_features;
 
     /* TCG local temps */
@@ -6957,6 +6958,7 @@ static void i386_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
     dc->cpuid_ext3_features = env->features[FEAT_8000_0001_ECX];
     dc->cpuid_7_0_ebx_features = env->features[FEAT_7_0_EBX];
     dc->cpuid_7_0_ecx_features = env->features[FEAT_7_0_ECX];
+    dc->cpuid_7_1_eax_features = env->features[FEAT_7_1_EAX];
     dc->cpuid_xsave_features = env->features[FEAT_XSAVE];
     dc->jmp_opt = !((cflags & CF_NO_GOTO_TB) ||
                     (flags & (HF_TF_MASK | HF_INHIBIT_IRQ_MASK)));
-- 
2.41.0
Re: [PATCH 08/18] target/i386: implement CMPccXADD
Posted by Richard Henderson 1 year, 1 month ago
On 10/14/23 03:01, Paolo Bonzini wrote:
> +static void gen_CMPccXADD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
> +{
> +    TCGv z_tl = tcg_constant_tl(0);
> +    TCGLabel *label_top = gen_new_label();
> +    TCGLabel *label_bottom = gen_new_label();
> +    TCGv oldv = tcg_temp_new();
> +    TCGv memv = tcg_temp_new();
> +    TCGv newv = tcg_temp_new();
> +    TCGv cmpv = tcg_temp_new();
> +    TCGv tmp_cc = tcg_temp_new();
> +
> +    TCGv cmp_lhs, cmp_rhs;
> +    MemOp ot, ot_full;
> +
> +    int jcc_op = (decode->b >> 1) & 7;
> +    static const uint8_t cond[16] = {

Use TCGCond as the element type here, not uint8_t.

> +        TCG_COND_NE,  /* o, just test OF=1 */
> +        TCG_COND_EQ,  /* no, just test OF=0 */
> +        TCG_COND_LTU, /* b */
> +        TCG_COND_GEU, /* ae (nb) */
> +        TCG_COND_EQ,  /* z */
> +        TCG_COND_NE,  /* nz */
> +        TCG_COND_LEU, /* be */
> +        TCG_COND_GTU, /* a (nbe) */
> +        TCG_COND_LT,  /* s, compares result against 0 */
> +        TCG_COND_GE,  /* ns, compares result against 0 */
> +        TCG_COND_NE,  /* p, just test PF=1 */
> +        TCG_COND_EQ,  /* np, just test PF=0 */
> +        TCG_COND_LT,  /* l */
> +        TCG_COND_GE,  /* ge (nl) */
> +        TCG_COND_LE,  /* le */
> +        TCG_COND_GT,  /* g (nle) */
> +    };

You don't need the full table here:

     cond = cond_table[jcc_op];
     if (decode->b & 1)
         cond = tcg_invert_cond(cond);


> +    /* Compute comparison result but do not clobber cc_* yet.  */
> +    switch (jcc_op) {
> +    case JCC_O:
> +    case JCC_P:
> +        tcg_gen_sub_tl(s->T0, memv, cmpv);
> +        gen_helper_cc_compute_all(tmp_cc, s->T0, cmpv, z_tl,
> +                                  tcg_constant_i32(CC_OP_SUBB + ot));
> +        decode->cc_src = tmp_cc;
> +        set_cc_op(s, CC_OP_EFLAGS);
> +
> +        tcg_gen_andi_tl(s->T0, tmp_cc, (jcc_op == JCC_O ? CC_O : CC_P));
> +        cmp_lhs = s->T0, cmp_rhs = z_tl;

I'm not keen on the weight of the helper function within a cmpxchg loop.
I think you should compute these two cases explicitly:

     JCC_O:
         // Need operands sign-extended.
         // cond_table[JCC_O] = TCG_COND_LT -- sign bit set.
         tcg_gen_xor_tl(tmp, cmpv, memv);
         tcg_gen_xor_tl(cmp_lhs, cmpv, s->T0);
         tcg_gen_and_tl(cmp_lhs, cmp_lhs, tmp);
         cmp_rhs = z_tl;
         break;

     JCC_P:
         // cond_table[JCC_P] = TCG_COND_EQ -- even parity.
         tcg_gen_ext8u_tl(cmp_lhs, s->T0);
         tcg_gen_ctpop_tl(cmp_lhs, cmp_lhs);
         tcg_gen_andi_tl(cmp_lhs, cmp_lhs, 1);
         cmp_rhs = z_tl;
         break;

> +    cc_sub:
> +        decode->cc_dst = s->T0;
> +        decode->cc_src = cmpv;
> +        decode->cc_srcT = memv;
> +        set_cc_op(s, CC_OP_SUBB + ot);
> +        break;

At which point this is common to all cases.

> +    }
> +
> +    /* Compute new value: if condition does not hold, just store back memv */
> +    tcg_gen_add_tl(newv, memv, s->T1);
> +    tcg_gen_movcond_tl(cond[decode->b & 15], newv, cmp_lhs, cmp_rhs, newv, memv);
> +    tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, memv, newv, s->mem_index, ot_full);
> +
> +    /* Exit unconditionally if cmpxchg succeeded.  */
> +    tcg_gen_brcond_tl(TCG_COND_EQ, oldv, memv, label_bottom);
> +
> +    /* Try again if there was actually a store to make.  */
> +    tcg_gen_brcond_tl(cond[decode->b & 15], cmp_lhs, cmp_rhs, label_top);

I'm tempted to have this unlikely case sync the pc and exit the tb.
This would restart the current instruction after testing for exit request.

But I suppose we have plenty of other places with unbounded cmpxchg loops...


r~