target/i386/tcg: implement APX

[PATCH 18/18] target/i386/tcg: optimize CCMP
Posted by Paolo Bonzini 18 hours ago
Use CC_OP_SUB* and CC_OP_LOGIC* if specific dfv values make it possible;
this avoids having to compute the carry.  This happens relatively often
for dfv=0, and also for dfv=sf for code produced by clang.

Of the combinations that cannot be optimized, both GCC and clang
generate Z.  GCC also generates OS and OSZ.

Do not bother doing this for CTEST; the savings are modest because it
does not need complex code to compute the carry-out vector (CC_SRC
is always 0).  In addition, trivially replacing the arguments to the AND
would only support dfv=0 (produced by -2 & 2) or dfv=S (produced by -2 &
-2), because TEST cannot produce any value where CF or OF are not zero.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/emit.c.inc | 63 +++++++++++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 7 deletions(-)

diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index a1c3680db3c..da0e00e6652 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -1748,6 +1748,56 @@ static void gen_dfv_movcond(DisasContext *s, X86DecodedInsn *decode, int cond, u
                        decode->cc_dst, tcg_constant_tl(dst));
 }
 
+static bool gen_ccmp_movcond(DisasContext *s, int cond, uint32_t dfv, MemOp ot)
+{
+    target_ulong op0, op1, max_int;
+    if (cond == (CCMP_T << 1)) {
+        return true; /* always true: plain SUB, operands untouched */
+    }
+    /*
+     * For some values of dfv, it's possible to overwrite operands instead of
+     * using CC_OP_CCMP: op0 - op1 produces exactly the flags in dfv.  Return
+     * false if dfv has no such pair.  max_int is the largest signed value.
+     */
+    max_int = ((target_ulong)1 << ((8 << ot) - 1)) - 1;
+    switch (dfv) {
+    case 0:
+        op0 = 1, op1 = 0;
+        break;
+    case CC_P | CC_C:
+        op0 = 1, op1 = -16;
+        break;
+    case CC_S:
+        op0 = -3, op1 = 0;
+        break;
+    case CC_S | CC_P | CC_C:
+        op0 = 0, op1 = 16;
+        break;
+    case CC_O:
+        /*
+         * For 8-bit results the sign bit is in bit 7 and, having no copies to the
+         * right, it flips the parity flag.
+         */
+        op0 = -max_int, op1 = 16 + (ot == MO_8);
+        break;
+    case CC_O | CC_S | CC_P | CC_C:
+        /* Same here. */
+        op0 = max_int, op1 = -1 - (ot == MO_8);
+        break;
+    default:
+        return false;
+    }
+
+    CCPrepare cc = gen_prepare_cc(s, cond, NULL);
+    if (!cc.use_reg2) {
+        cc.reg2 = tcg_constant_tl(cc.imm);
+    }
+    /* Keep T0/T1 if the condition holds, else substitute the constants. */
+    tcg_gen_movcond_tl(cc.cond, s->T0, cc.reg, cc.reg2, s->T0, tcg_constant_tl(op0));
+    tcg_gen_movcond_tl(cc.cond, s->T1, cc.reg, cc.reg2, s->T1, tcg_constant_tl(op1));
+    return true;
+}
+
 static void gen_SUB(DisasContext *s, X86DecodedInsn *decode);
 static void gen_CMP(DisasContext *s, X86DecodedInsn *decode)
 {
@@ -1756,16 +1806,15 @@ static void gen_CMP(DisasContext *s, X86DecodedInsn *decode)
     MemOp ot = decode->op[1].ot;
     TCGv cout;
 
-    switch (cond) {
-    case CCMP_T << 1:
-        gen_SUB(s, decode);
-        return;
-    case (CCMP_T << 1) | 1:
+    if (cond == ((CCMP_T << 1) | 1)) {
         decode->cc_op = CC_OP_EFLAGS;
         decode->cc_src = tcg_constant_tl(dfv);
         return;
-    default:
-        break;
+    }
+
+    if (gen_ccmp_movcond(s, cond, dfv, ot)) {
+        gen_SUB(s, decode);
+        return;
     }
 
     decode->cc_op = CC_OP_CCMPB + ot;
-- 
2.52.0