The main difficulty here is that a page fault when writing to the destination
must not overwrite the flags. Therefore, the compute-flags helper must be
called with a temporary destination instead of using gen_jcc1*.
For simplicity, I am using an unconditional cmpxchg operation, which becomes
a NOP if the comparison fails.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/cpu.c | 2 +-
target/i386/tcg/decode-new.c.inc | 30 ++++++++++
target/i386/tcg/decode-new.h | 2 +
target/i386/tcg/emit.c.inc | 98 ++++++++++++++++++++++++++++++++
target/i386/tcg/translate.c | 2 +
5 files changed, 133 insertions(+), 1 deletion(-)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 8beb989701c..80f0445301b 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -738,7 +738,7 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
#define TCG_7_0_EDX_FEATURES (CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_KERNEL_FEATURES)
#define TCG_7_1_EAX_FEATURES (CPUID_7_1_EAX_FZRM | CPUID_7_1_EAX_FSRS | \
- CPUID_7_1_EAX_FSRC)
+ CPUID_7_1_EAX_FSRC | CPUID_7_1_EAX_CMPCCXADD)
#define TCG_7_1_EDX_FEATURES 0
#define TCG_7_2_EDX_FEATURES 0
#define TCG_APM_FEATURES 0
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index bad561ff66d..01c46e6a789 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -516,6 +516,28 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = {
[0xdd] = X86_OP_ENTRY3(VAESENCLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
[0xde] = X86_OP_ENTRY3(VAESDEC, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
[0xdf] = X86_OP_ENTRY3(VAESDECLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
+
+ /*
+ * REG selects srcdest2 operand, VEX.vvvv selects src3. VEX class not found
+ * in manual, assumed to be 13 from the VEX.L=0 constraint.
+ */
+ [0xe0] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xe1] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xe2] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xe3] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xe4] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xe5] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xe6] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xe7] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+
+ [0xe8] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xe9] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xea] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xeb] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xec] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xed] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xee] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
+ [0xef] = X86_OP_ENTRY3(CMPccXADD, EM,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66),
};
/* five rows for no prefix, 66, F3, F2, 66+F2 */
@@ -1273,8 +1295,13 @@ static bool decode_op(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
case X86_TYPE_WM: /* modrm byte selects an XMM/YMM memory operand */
op->unit = X86_OP_SSE;
+ goto get_modrm_mem;
+
+ case X86_TYPE_EM: /* modrm byte selects an ALU memory operand */
+ op->unit = X86_OP_INT;
/* fall through */
case X86_TYPE_M: /* modrm byte selects a memory operand */
+ get_modrm_mem:
modrm = get_modrm(s, env);
if ((modrm >> 6) == 3) {
return false;
@@ -1511,6 +1538,9 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2);
case X86_FEAT_SHA_NI:
return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_SHA_NI);
+
+ case X86_FEAT_CMPCCXADD:
+ return (s->cpuid_7_1_eax_features & CPUID_7_1_EAX_CMPCCXADD);
}
g_assert_not_reached();
}
diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index b2879136614..b22de02ce54 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -47,6 +47,7 @@ typedef enum X86OpType {
X86_TYPE_Y, /* string destination */
/* Custom */
+ X86_TYPE_EM, /* modrm byte selects an ALU memory operand */
X86_TYPE_WM, /* modrm byte selects an XMM/YMM memory operand */
X86_TYPE_2op, /* 2-operand RMW instruction */
X86_TYPE_LoBits, /* encoded in bits 0-2 of the operand + REX.B */
@@ -104,6 +105,7 @@ typedef enum X86CPUIDFeature {
X86_FEAT_AVX2,
X86_FEAT_BMI1,
X86_FEAT_BMI2,
+ X86_FEAT_CMPCCXADD,
X86_FEAT_F16C,
X86_FEAT_FMA,
X86_FEAT_MOVBE,
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index b5dfdc409e5..9f70e9dbaa6 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -1168,6 +1168,104 @@ static void gen_BZHI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
prepare_update1_cc(decode, s, CC_OP_BMILGB + ot);
}
+static void gen_CMPccXADD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ TCGv z_tl = tcg_constant_tl(0);
+ TCGLabel *label_top = gen_new_label();
+ TCGLabel *label_bottom = gen_new_label();
+ TCGv oldv = tcg_temp_new();
+ TCGv memv = tcg_temp_new();
+ TCGv newv = tcg_temp_new();
+ TCGv cmpv = tcg_temp_new();
+ TCGv tmp_cc = tcg_temp_new();
+
+ TCGv cmp_lhs, cmp_rhs;
+ MemOp ot, ot_full;
+
+ int jcc_op = (decode->b >> 1) & 7;
+ static const uint8_t cond[16] = {
+ TCG_COND_NE, /* o, just test OF=1 */
+ TCG_COND_EQ, /* no, just test OF=0 */
+ TCG_COND_LTU, /* b */
+ TCG_COND_GEU, /* ae (nb) */
+ TCG_COND_EQ, /* z */
+ TCG_COND_NE, /* nz */
+ TCG_COND_LEU, /* be */
+ TCG_COND_GTU, /* a (nbe) */
+ TCG_COND_LT, /* s, compares result against 0 */
+ TCG_COND_GE, /* ns, compares result against 0 */
+ TCG_COND_NE, /* p, just test PF=1 */
+ TCG_COND_EQ, /* np, just test PF=0 */
+ TCG_COND_LT, /* l */
+ TCG_COND_GE, /* ge (nl) */
+ TCG_COND_LE, /* le */
+ TCG_COND_GT, /* g (nle) */
+ };
+
+ ot = decode->op[0].ot;
+ ot_full = ot | MO_LE;
+ if (jcc_op >= JCC_S) {
+ /*
+ * Sign-extend values before subtracting for S, P (zero/sign extension
+ * does not matter there), L, LE and their inverses.
+ */
+ ot_full |= MO_SIGN;
+ }
+
+ gen_ext_tl(cmpv, cpu_regs[decode->op[1].n], ot_full);
+
+ /*
+ * Cmpxchg loop starts here.
+ * s->A0: dest address; cmpv: compare operand; s->T1: addition operand.
+ */
+ gen_set_label(label_top);
+ gen_op_ld_v(s, ot_full, memv, s->A0);
+ tcg_gen_sub_tl(s->T0, memv, cmpv);
+
+ /* Compute comparison result but do not clobber cc_* yet. */
+ switch (jcc_op) {
+ case JCC_O:
+ case JCC_P:
+ tcg_gen_sub_tl(s->T0, memv, cmpv);
+ gen_helper_cc_compute_all(tmp_cc, s->T0, cmpv, z_tl,
+ tcg_constant_i32(CC_OP_SUBB + ot));
+ decode->cc_src = tmp_cc;
+ set_cc_op(s, CC_OP_EFLAGS);
+
+ tcg_gen_andi_tl(s->T0, tmp_cc, (jcc_op == JCC_O ? CC_O : CC_P));
+ cmp_lhs = s->T0, cmp_rhs = z_tl;
+ break;
+
+ case JCC_S:
+ cmp_lhs = s->T0, cmp_rhs = z_tl;
+ goto cc_sub;
+
+ default:
+ cmp_lhs = memv, cmp_rhs = cmpv;
+ cc_sub:
+ decode->cc_dst = s->T0;
+ decode->cc_src = cmpv;
+ decode->cc_srcT = memv;
+ set_cc_op(s, CC_OP_SUBB + ot);
+ break;
+ }
+
+ /* Compute new value: if condition does not hold, just store back memv */
+ tcg_gen_add_tl(newv, memv, s->T1);
+ tcg_gen_movcond_tl(cond[decode->b & 15], newv, cmp_lhs, cmp_rhs, newv, memv);
+ tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, memv, newv, s->mem_index, ot_full);
+
+ /* Exit unconditionally if cmpxchg succeeded. */
+ tcg_gen_brcond_tl(TCG_COND_EQ, oldv, memv, label_bottom);
+
+ /* Try again if there was actually a store to make. */
+ tcg_gen_brcond_tl(cond[decode->b & 15], cmp_lhs, cmp_rhs, label_top);
+ gen_set_label(label_bottom);
+
+ /* Store old value only after a successful store. */
+ gen_writeback(s, decode, 1, memv);
+}
+
static void gen_CRC32(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
MemOp ot = decode->op[2].ot;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index d7d6c85877d..038151a8c3e 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -122,6 +122,7 @@ typedef struct DisasContext {
int cpuid_ext3_features;
int cpuid_7_0_ebx_features;
int cpuid_7_0_ecx_features;
+ int cpuid_7_1_eax_features;
int cpuid_xsave_features;
/* TCG local temps */
@@ -6957,6 +6958,7 @@ static void i386_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
dc->cpuid_ext3_features = env->features[FEAT_8000_0001_ECX];
dc->cpuid_7_0_ebx_features = env->features[FEAT_7_0_EBX];
dc->cpuid_7_0_ecx_features = env->features[FEAT_7_0_ECX];
+ dc->cpuid_7_1_eax_features = env->features[FEAT_7_1_EAX];
dc->cpuid_xsave_features = env->features[FEAT_XSAVE];
dc->jmp_opt = !((cflags & CF_NO_GOTO_TB) ||
(flags & (HF_TF_MASK | HF_INHIBIT_IRQ_MASK)));
--
2.41.0
On 10/14/23 03:01, Paolo Bonzini wrote: > +static void gen_CMPccXADD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) > +{ > + TCGv z_tl = tcg_constant_tl(0); > + TCGLabel *label_top = gen_new_label(); > + TCGLabel *label_bottom = gen_new_label(); > + TCGv oldv = tcg_temp_new(); > + TCGv memv = tcg_temp_new(); > + TCGv newv = tcg_temp_new(); > + TCGv cmpv = tcg_temp_new(); > + TCGv tmp_cc = tcg_temp_new(); > + > + TCGv cmp_lhs, cmp_rhs; > + MemOp ot, ot_full; > + > + int jcc_op = (decode->b >> 1) & 7; > + static const uint8_t cond[16] = { TCGCond. > + TCG_COND_NE, /* o, just test OF=1 */ > + TCG_COND_EQ, /* no, just test OF=0 */ > + TCG_COND_LTU, /* b */ > + TCG_COND_GEU, /* ae (nb) */ > + TCG_COND_EQ, /* z */ > + TCG_COND_NE, /* nz */ > + TCG_COND_LEU, /* be */ > + TCG_COND_GTU, /* a (nbe) */ > + TCG_COND_LT, /* s, compares result against 0 */ > + TCG_COND_GE, /* ns, compares result against 0 */ > + TCG_COND_NE, /* p, just test PF=1 */ > + TCG_COND_EQ, /* np, just test PF=0 */ > + TCG_COND_LT, /* l */ > + TCG_COND_GE, /* ge (nl) */ > + TCG_COND_LE, /* le */ > + TCG_COND_GT, /* g (nle) */ > + }; You don't need the full table here: cond = cond_table[jcc_op]; if (decode->b & 1) cond = tcg_invert_cond(cond) > + /* Compute comparison result but do not clobber cc_* yet. */ > + switch (jcc_op) { > + case JCC_O: > + case JCC_P: > + tcg_gen_sub_tl(s->T0, memv, cmpv); > + gen_helper_cc_compute_all(tmp_cc, s->T0, cmpv, z_tl, > + tcg_constant_i32(CC_OP_SUBB + ot)); > + decode->cc_src = tmp_cc; > + set_cc_op(s, CC_OP_EFLAGS); > + > + tcg_gen_andi_tl(s->T0, tmp_cc, (jcc_op == JCC_O ? CC_O : CC_P)); > + cmp_lhs = s->T0, cmp_rhs = z_tl; I'm not keen on the weight of the helper function within a cmpxchg loop. I think you should compute these two cases explicitly: JCC_O: // Need operands sign-extended. // cond_table[JCC_O] = TCG_COND_LT -- sign bit set. 
tcg_gen_xor_tl(tmp, cmpv, memv); tcg_gen_xor_tl(cmp_lhs, cmpv, s->T0); tcg_gen_and_tl(cmp_lhs, cmp_lhs, tmp); cmp_rhs = z_tl; break; JCC_P: // cond_table[JCC_P] = TCG_COND_EQ -- even parity. tcg_gen_ext8u_tl(cmp_lhs, s->T0); tcg_gen_ctpop_tl(cmp_lhs, cmp_lhs); tcg_gen_andi_tl(cmp_lhs, cmp_lhs, 1); cmp_rhs = z_tl; break; > + cc_sub: > + decode->cc_dst = s->T0; > + decode->cc_src = cmpv; > + decode->cc_srcT = memv; > + set_cc_op(s, CC_OP_SUBB + ot); > + break; At which point this is common to all cases. > + } > + > + /* Compute new value: if condition does not hold, just store back memv */ > + tcg_gen_add_tl(newv, memv, s->T1); > + tcg_gen_movcond_tl(cond[decode->b & 15], newv, cmp_lhs, cmp_rhs, newv, memv); > + tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, memv, newv, s->mem_index, ot_full); > + > + /* Exit unconditionally if cmpxchg succeeded. */ > + tcg_gen_brcond_tl(TCG_COND_EQ, oldv, memv, label_bottom); > + > + /* Try again if there was actually a store to make. */ > + tcg_gen_brcond_tl(cond[decode->b & 15], cmp_lhs, cmp_rhs, label_top); I'm tempted to have this unlikely case sync the pc and exit the tb. This would restart the current instruction after testing for exit request. But I suppose we have plenty of other places with unbounded cmpxchg loops... r~
© 2016 - 2024 Red Hat, Inc.