The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:

  Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027

for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:

  tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)

----------------------------------------------------------------
Improvements to qemu/int128
Fixes for 128/64 division.
Cleanup tcg/optimize.c
Optimize redundant sign extensions

----------------------------------------------------------------
Frédéric Pétrot (1):
      qemu/int128: Add int128_{not,xor}

Luis Pires (4):
      host-utils: move checks out of divu128/divs128
      host-utils: move udiv_qrnnd() to host-utils
      host-utils: add 128-bit quotient support to divu128/divs128
      host-utils: add unit tests for divu128/divs128

Richard Henderson (51):
      tcg/optimize: Rename "mask" to "z_mask"
      tcg/optimize: Split out OptContext
      tcg/optimize: Remove do_default label
      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
      tcg/optimize: Move prev_mb into OptContext
      tcg/optimize: Split out init_arguments
      tcg/optimize: Split out copy_propagate
      tcg/optimize: Split out fold_call
      tcg/optimize: Drop nb_oargs, nb_iargs locals
      tcg/optimize: Change fail return for do_constant_folding_cond*
      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
      tcg/optimize: Split out finish_folding
      tcg/optimize: Use a boolean to avoid a mass of continues
      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
      tcg/optimize: Split out fold_const{1,2}
      tcg/optimize: Split out fold_setcond2
      tcg/optimize: Split out fold_brcond2
      tcg/optimize: Split out fold_brcond
      tcg/optimize: Split out fold_setcond
      tcg/optimize: Split out fold_mulu2_i32
      tcg/optimize: Split out fold_addsub2_i32
      tcg/optimize: Split out fold_movcond
      tcg/optimize: Split out fold_extract2
      tcg/optimize: Split out fold_extract, fold_sextract
      tcg/optimize: Split out fold_deposit
      tcg/optimize: Split out fold_count_zeros
      tcg/optimize: Split out fold_bswap
      tcg/optimize: Split out fold_dup, fold_dup2
      tcg/optimize: Split out fold_mov
      tcg/optimize: Split out fold_xx_to_i
      tcg/optimize: Split out fold_xx_to_x
      tcg/optimize: Split out fold_xi_to_i
      tcg/optimize: Add type to OptContext
      tcg/optimize: Split out fold_to_not
      tcg/optimize: Split out fold_sub_to_neg
      tcg/optimize: Split out fold_xi_to_x
      tcg/optimize: Split out fold_ix_to_i
      tcg/optimize: Split out fold_masks
      tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
      tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
      tcg/optimize: Sink commutative operand swapping into fold functions
      tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
      tcg/optimize: Use fold_xx_to_i for orc
      tcg/optimize: Use fold_xi_to_x for mul
      tcg/optimize: Use fold_xi_to_x for div
      tcg/optimize: Use fold_xx_to_i for rem
      tcg/optimize: Optimize sign extensions
      tcg/optimize: Propagate sign info for logical operations
      tcg/optimize: Propagate sign info for setcond
      tcg/optimize: Propagate sign info for bit counting
      tcg/optimize: Propagate sign info for shifting

 include/fpu/softfloat-macros.h |   82 --
 include/hw/clock.h             |    5 +-
 include/qemu/host-utils.h      |  121 +-
 include/qemu/int128.h          |   20 +
 target/ppc/int_helper.c        |   23 +-
 tcg/optimize.c                 | 2644 ++++++++++++++++++++++++----------------
 tests/unit/test-div128.c       |  197 +++
 util/host-utils.c              |  147 ++-
 tests/unit/meson.build         |    1 +
 9 files changed, 2053 insertions(+), 1187 deletions(-)
 create mode 100644 tests/unit/test-div128.c

The following changes since commit 36eae3a732a1f2aa81391e871ac0e9bb3233e7d7:

  Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-migration-20220302b' into staging (2022-03-02 20:55:48 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220303

for you to fetch changes up to f23e6de25c31cadd9a3b7122f9384e6b259ce37f:

  tcg/loongarch64: Support TCG_TARGET_SIGNED_ADDR32 (2022-03-03 10:47:20 -1000)

----------------------------------------------------------------
Reorder do_constant_folding_cond test to satisfy valgrind.
Fix value of MAX_OPC_PARAM_IARGS.
Add opcodes for vector nand, nor, eqv.
Support vector nand, nor, eqv on PPC and S390X hosts.
Support AVX512VL, AVX512BW, AVX512DQ, and AVX512VBMI2.
Support 32-bit guest addresses as signed values.

----------------------------------------------------------------
Alex Bennée (1):
      tcg/optimize: only read val after const check

Richard Henderson (28):
      tcg: Add opcodes for vector nand, nor, eqv
      tcg/ppc: Implement vector NAND, NOR, EQV
      tcg/s390x: Implement vector NAND, NOR, EQV
      tcg/i386: Detect AVX512
      tcg/i386: Add tcg_out_evex_opc
      tcg/i386: Use tcg_can_emit_vec_op in expand_vec_cmp_noinv
      tcg/i386: Implement avx512 variable shifts
      tcg/i386: Implement avx512 scalar shift
      tcg/i386: Implement avx512 immediate sari shift
      tcg/i386: Implement avx512 immediate rotate
      tcg/i386: Implement avx512 variable rotate
      tcg/i386: Support avx512vbmi2 vector shift-double instructions
      tcg/i386: Expand vector word rotate as avx512vbmi2 shift-double
      tcg/i386: Remove rotls_vec from tcg_target_op_def
      tcg/i386: Expand scalar rotate with avx512 insns
      tcg/i386: Implement avx512 min/max/abs
      tcg/i386: Implement avx512 multiply
      tcg/i386: Implement more logical operations for avx512
      tcg/i386: Implement bitsel for avx512
      tcg: Add TCG_TARGET_SIGNED_ADDR32
      accel/tcg: Split out g2h_tlbe
      accel/tcg: Support TCG_TARGET_SIGNED_ADDR32 for softmmu
      accel/tcg: Add guest_base_signed_addr32 for user-only
      linux-user: Support TCG_TARGET_SIGNED_ADDR32
      tcg/aarch64: Support TCG_TARGET_SIGNED_ADDR32
      tcg/mips: Support TCG_TARGET_SIGNED_ADDR32
      tcg/riscv: Support TCG_TARGET_SIGNED_ADDR32
      tcg/loongarch64: Support TCG_TARGET_SIGNED_ADDR32

Ziqiao Kong (1):
      tcg: Set MAX_OPC_PARAM_IARGS to 7

 include/exec/cpu-all.h            |  20 +-
 include/exec/cpu_ldst.h           |   3 +-
 include/qemu/cpuid.h              |  20 +-
 include/tcg/tcg-opc.h             |   3 +
 include/tcg/tcg.h                 |   5 +-
 tcg/aarch64/tcg-target-sa32.h     |   7 +
 tcg/aarch64/tcg-target.h          |   3 +
 tcg/arm/tcg-target-sa32.h         |   1 +
 tcg/arm/tcg-target.h              |   3 +
 tcg/i386/tcg-target-con-set.h     |   1 +
 tcg/i386/tcg-target-sa32.h        |   1 +
 tcg/i386/tcg-target.h             |  17 +-
 tcg/i386/tcg-target.opc.h         |   3 +
 tcg/loongarch64/tcg-target-sa32.h |   1 +
 tcg/mips/tcg-target-sa32.h        |   9 +
 tcg/ppc/tcg-target-sa32.h         |   1 +
 tcg/ppc/tcg-target.h              |   3 +
 tcg/riscv/tcg-target-sa32.h       |   5 +
 tcg/s390x/tcg-target-sa32.h       |   1 +
 tcg/s390x/tcg-target.h            |   3 +
 tcg/sparc/tcg-target-sa32.h       |   1 +
 tcg/tci/tcg-target-sa32.h         |   1 +
 accel/tcg/cputlb.c                |  36 ++--
 bsd-user/main.c                   |   4 +
 linux-user/elfload.c              |  62 ++++--
 linux-user/main.c                 |   3 +
 tcg/optimize.c                    |  20 +-
 tcg/tcg-op-vec.c                  |  27 ++-
 tcg/tcg.c                         |  10 +
 tcg/aarch64/tcg-target.c.inc      |  81 +++++---
 tcg/i386/tcg-target.c.inc         | 387 +++++++++++++++++++++++++++++++-------
 tcg/loongarch64/tcg-target.c.inc  |  15 +-
 tcg/mips/tcg-target.c.inc         |  10 +-
 tcg/ppc/tcg-target.c.inc          |  15 ++
 tcg/riscv/tcg-target.c.inc        |   8 +-
 tcg/s390x/tcg-target.c.inc        |  17 ++
 tcg/tci/tcg-target.c.inc          |   2 +-
 37 files changed, 640 insertions(+), 169 deletions(-)
 create mode 100644 tcg/aarch64/tcg-target-sa32.h
 create mode 100644 tcg/arm/tcg-target-sa32.h
 create mode 100644 tcg/i386/tcg-target-sa32.h
 create mode 100644 tcg/loongarch64/tcg-target-sa32.h
 create mode 100644 tcg/mips/tcg-target-sa32.h
 create mode 100644 tcg/ppc/tcg-target-sa32.h
 create mode 100644 tcg/riscv/tcg-target-sa32.h
 create mode 100644 tcg/s390x/tcg-target-sa32.h
 create mode 100644 tcg/sparc/tcg-target-sa32.h
 create mode 100644 tcg/tci/tcg-target-sa32.h

Deleted patch
From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>

Addition of not and xor on 128-bit integers.

Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
[rth: Split out logical operations.]
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/int128.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return a;
 }

+static inline Int128 int128_not(Int128 a)
+{
+    return ~a;
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return a & b;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return a | b;
 }

+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return a ^ b;
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     return a >> n;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return int128_make128(a, (a < 0) ? -1 : 0);
 }

+static inline Int128 int128_not(Int128 a)
+{
+    return int128_make128(~a.lo, ~a.hi);
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return int128_make128(a.lo & b.lo, a.hi & b.hi);
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return int128_make128(a.lo | b.lo, a.hi | b.hi);
 }

+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     int64_t h;
--
2.25.1
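For readers skimming the digest, here is a minimal, illustrative use of the two helpers added above. It is not part of the patch and only assumes the existing Int128 API from include/qemu/int128.h:

    #include "qemu/int128.h"

    /* Combine the new helpers with the existing ones: the bitwise
     * complement of (a XOR b), i.e. "eqv" on 128 bits.  This works the
     * same whether Int128 is a native __int128_t or the two-member
     * struct fallback. */
    static Int128 int128_eqv_example(Int128 a, Int128 b)
    {
        return int128_not(int128_xor(a, b));
    }
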
Compute the type of the operation early.

There are at least 4 places that used a def->flags ladder
to determine the type of the operation being optimized.

There were two places that assumed !TCG_OPF_64BIT means
TCG_TYPE_I32, and so could potentially compute incorrect
results for vector operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
 1 file changed, 89 insertions(+), 60 deletions(-)

From: Alex Bennée <alex.bennee@linaro.org>

valgrind pointed out that arg_info()->val can be undefined which will
be the case if the arguments are not constant. The ordering of the
checks will have ensured we never relied on an undefined value but for
the sake of completeness re-order the code to be clear.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <20220209112142.3367525-1-alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
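A self-contained sketch of the reordering Alex's message describes; the real change is in the tcg/optimize.c hunks below, and the TempInfo type here is hypothetical, standing in for the optimizer's TempOptInfo:

    #include <stdbool.h>
    #include <stdint.h>

    typedef struct { bool is_const; uint64_t val; } TempInfo;

    /* Read .val only after the constant check has succeeded, so the
     * field is never loaded while it is still undefined. */
    static int fold_cond_example(const TempInfo *x, const TempInfo *y)
    {
        if (x->is_const && y->is_const) {
            return x->val == y->val;   /* both values defined here */
        }
        return -1;                     /* cannot fold at compile time */
    }
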
16
diff --git a/tcg/optimize.c b/tcg/optimize.c
16
diff --git a/tcg/optimize.c b/tcg/optimize.c
17
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
18
--- a/tcg/optimize.c
18
--- a/tcg/optimize.c
19
+++ b/tcg/optimize.c
19
+++ b/tcg/optimize.c
20
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
21
22
/* In flight values from optimization. */
23
uint64_t z_mask;
24
+ TCGType type;
25
} OptContext;
26
27
static inline TempOptInfo *ts_info(TCGTemp *ts)
28
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
29
{
30
TCGTemp *dst_ts = arg_temp(dst);
31
TCGTemp *src_ts = arg_temp(src);
32
- const TCGOpDef *def;
33
TempOptInfo *di;
34
TempOptInfo *si;
35
uint64_t z_mask;
36
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
37
reset_ts(dst_ts);
38
di = ts_info(dst_ts);
39
si = ts_info(src_ts);
40
- def = &tcg_op_defs[op->opc];
41
- if (def->flags & TCG_OPF_VECTOR) {
42
- new_op = INDEX_op_mov_vec;
43
- } else if (def->flags & TCG_OPF_64BIT) {
44
- new_op = INDEX_op_mov_i64;
45
- } else {
46
+
47
+ switch (ctx->type) {
48
+ case TCG_TYPE_I32:
49
new_op = INDEX_op_mov_i32;
50
+ break;
51
+ case TCG_TYPE_I64:
52
+ new_op = INDEX_op_mov_i64;
53
+ break;
54
+ case TCG_TYPE_V64:
55
+ case TCG_TYPE_V128:
56
+ case TCG_TYPE_V256:
57
+ /* TCGOP_VECL and TCGOP_VECE remain unchanged. */
58
+ new_op = INDEX_op_mov_vec;
59
+ break;
60
+ default:
61
+ g_assert_not_reached();
62
}
63
op->opc = new_op;
64
- /* TCGOP_VECL and TCGOP_VECE remain unchanged. */
65
op->args[0] = dst;
66
op->args[1] = src;
67
68
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
69
static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
70
TCGArg dst, uint64_t val)
71
{
72
- const TCGOpDef *def = &tcg_op_defs[op->opc];
73
- TCGType type;
74
- TCGTemp *tv;
75
-
76
- if (def->flags & TCG_OPF_VECTOR) {
77
- type = TCGOP_VECL(op) + TCG_TYPE_V64;
78
- } else if (def->flags & TCG_OPF_64BIT) {
79
- type = TCG_TYPE_I64;
80
- } else {
81
- type = TCG_TYPE_I32;
82
- }
83
-
84
/* Convert movi to mov with constant temp. */
85
- tv = tcg_constant_internal(type, val);
86
+ TCGTemp *tv = tcg_constant_internal(ctx->type, val);
87
+
88
init_ts_info(ctx, tv);
89
return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
90
}
91
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
92
}
93
}
94
95
-static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
96
+static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
97
+ uint64_t x, uint64_t y)
98
{
99
- const TCGOpDef *def = &tcg_op_defs[op];
100
uint64_t res = do_constant_folding_2(op, x, y);
101
- if (!(def->flags & TCG_OPF_64BIT)) {
102
+ if (type == TCG_TYPE_I32) {
103
res = (int32_t)res;
104
}
105
return res;
106
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
20
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
107
* Return -1 if the condition can't be simplified,
21
static int do_constant_folding_cond(TCGType type, TCGArg x,
108
* and the result of the condition (0 or 1) if it can.
109
*/
110
-static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
111
+static int do_constant_folding_cond(TCGType type, TCGArg x,
112
TCGArg y, TCGCond c)
22
TCGArg y, TCGCond c)
113
{
23
{
114
uint64_t xv = arg_info(x)->val;
24
- uint64_t xv = arg_info(x)->val;
115
uint64_t yv = arg_info(y)->val;
25
- uint64_t yv = arg_info(y)->val;
116
26
-
117
if (arg_is_const(x) && arg_is_const(y)) {
27
if (arg_is_const(x) && arg_is_const(y)) {
118
- const TCGOpDef *def = &tcg_op_defs[op];
28
+ uint64_t xv = arg_info(x)->val;
119
- tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
29
+ uint64_t yv = arg_info(y)->val;
120
- if (def->flags & TCG_OPF_64BIT) {
30
+
121
- return do_constant_folding_cond_64(xv, yv, c);
31
switch (type) {
122
- } else {
32
case TCG_TYPE_I32:
123
+ switch (type) {
124
+ case TCG_TYPE_I32:
125
return do_constant_folding_cond_32(xv, yv, c);
33
return do_constant_folding_cond_32(xv, yv, c);
126
+ case TCG_TYPE_I64:
34
@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond(TCGType type, TCGArg x,
127
+ return do_constant_folding_cond_64(xv, yv, c);
128
+ default:
129
+ /* Only scalar comparisons are optimizable */
130
+ return -1;
131
}
35
}
132
} else if (args_are_copies(x, y)) {
36
} else if (args_are_copies(x, y)) {
133
return do_constant_folding_cond_eq(c);
37
return do_constant_folding_cond_eq(c);
134
@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
38
- } else if (arg_is_const(y) && yv == 0) {
135
uint64_t t;
39
+ } else if (arg_is_const(y) && arg_info(y)->val == 0) {
136
40
switch (c) {
137
t = arg_info(op->args[1])->val;
41
case TCG_COND_LTU:
138
- t = do_constant_folding(op->opc, t, 0);
42
return 0;
139
+ t = do_constant_folding(op->opc, ctx->type, t, 0);
140
return tcg_opt_gen_movi(ctx, op, op->args[0], t);
141
}
142
return false;
143
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
144
uint64_t t1 = arg_info(op->args[1])->val;
145
uint64_t t2 = arg_info(op->args[2])->val;
146
147
- t1 = do_constant_folding(op->opc, t1, t2);
148
+ t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
149
return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
150
}
151
return false;
152
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
153
static bool fold_brcond(OptContext *ctx, TCGOp *op)
154
{
155
TCGCond cond = op->args[2];
156
- int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
157
+ int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
158
159
if (i == 0) {
160
tcg_op_remove(ctx->tcg, op);
161
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
162
* Simplify EQ/NE comparisons where one of the pairs
163
* can be simplified.
164
*/
165
- i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
166
+ i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
167
op->args[2], cond);
168
switch (i ^ inv) {
169
case 0:
170
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
171
goto do_brcond_high;
172
}
173
174
- i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
175
+ i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
176
op->args[3], cond);
177
switch (i ^ inv) {
178
case 0:
179
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
180
if (arg_is_const(op->args[1])) {
181
uint64_t t = arg_info(op->args[1])->val;
182
183
- t = do_constant_folding(op->opc, t, op->args[2]);
184
+ t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
185
return tcg_opt_gen_movi(ctx, op, op->args[0], t);
186
}
187
return false;
188
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
189
uint64_t t = arg_info(op->args[1])->val;
190
191
if (t != 0) {
192
- t = do_constant_folding(op->opc, t, 0);
193
+ t = do_constant_folding(op->opc, ctx->type, t, 0);
194
return tcg_opt_gen_movi(ctx, op, op->args[0], t);
195
}
196
return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
197
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
198
199
static bool fold_movcond(OptContext *ctx, TCGOp *op)
200
{
201
- TCGOpcode opc = op->opc;
202
TCGCond cond = op->args[5];
203
- int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
204
+ int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
205
206
if (i >= 0) {
207
return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
208
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
209
if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
210
uint64_t tv = arg_info(op->args[3])->val;
211
uint64_t fv = arg_info(op->args[4])->val;
212
+ TCGOpcode opc;
213
214
- opc = (opc == INDEX_op_movcond_i32
215
- ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
216
+ switch (ctx->type) {
217
+ case TCG_TYPE_I32:
218
+ opc = INDEX_op_setcond_i32;
219
+ break;
220
+ case TCG_TYPE_I64:
221
+ opc = INDEX_op_setcond_i64;
222
+ break;
223
+ default:
224
+ g_assert_not_reached();
225
+ }
226
227
if (tv == 1 && fv == 0) {
228
op->opc = opc;
229
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
230
static bool fold_setcond(OptContext *ctx, TCGOp *op)
231
{
232
TCGCond cond = op->args[3];
233
- int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
234
+ int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
235
236
if (i >= 0) {
237
return tcg_opt_gen_movi(ctx, op, op->args[0], i);
238
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
239
* Simplify EQ/NE comparisons where one of the pairs
240
* can be simplified.
241
*/
242
- i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
243
+ i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
244
op->args[3], cond);
245
switch (i ^ inv) {
246
case 0:
247
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
248
goto do_setcond_high;
249
}
250
251
- i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
252
+ i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
253
op->args[4], cond);
254
switch (i ^ inv) {
255
case 0:
256
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
257
init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
258
copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
259
260
+ /* Pre-compute the type of the operation. */
261
+ if (def->flags & TCG_OPF_VECTOR) {
262
+ ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
263
+ } else if (def->flags & TCG_OPF_64BIT) {
264
+ ctx.type = TCG_TYPE_I64;
265
+ } else {
266
+ ctx.type = TCG_TYPE_I32;
267
+ }
268
+
269
/* For commutative operations make constant second argument */
270
switch (opc) {
271
CASE_OP_32_64_VEC(add):
272
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
273
/* Proceed with possible constant folding. */
274
break;
275
}
276
- if (opc == INDEX_op_sub_i32) {
277
+ switch (ctx.type) {
278
+ case TCG_TYPE_I32:
279
neg_op = INDEX_op_neg_i32;
280
have_neg = TCG_TARGET_HAS_neg_i32;
281
- } else if (opc == INDEX_op_sub_i64) {
282
+ break;
283
+ case TCG_TYPE_I64:
284
neg_op = INDEX_op_neg_i64;
285
have_neg = TCG_TARGET_HAS_neg_i64;
286
- } else if (TCG_TARGET_HAS_neg_vec) {
287
- TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
288
- unsigned vece = TCGOP_VECE(op);
289
- neg_op = INDEX_op_neg_vec;
290
- have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
291
- } else {
292
break;
293
+ case TCG_TYPE_V64:
294
+ case TCG_TYPE_V128:
295
+ case TCG_TYPE_V256:
296
+ neg_op = INDEX_op_neg_vec;
297
+ have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
298
+ TCGOP_VECE(op)) > 0;
299
+ break;
300
+ default:
301
+ g_assert_not_reached();
302
}
303
if (!have_neg) {
304
break;
305
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
306
TCGOpcode not_op;
307
bool have_not;
308
309
- if (def->flags & TCG_OPF_VECTOR) {
310
- not_op = INDEX_op_not_vec;
311
- have_not = TCG_TARGET_HAS_not_vec;
312
- } else if (def->flags & TCG_OPF_64BIT) {
313
- not_op = INDEX_op_not_i64;
314
- have_not = TCG_TARGET_HAS_not_i64;
315
- } else {
316
+ switch (ctx.type) {
317
+ case TCG_TYPE_I32:
318
not_op = INDEX_op_not_i32;
319
have_not = TCG_TARGET_HAS_not_i32;
320
+ break;
321
+ case TCG_TYPE_I64:
322
+ not_op = INDEX_op_not_i64;
323
+ have_not = TCG_TARGET_HAS_not_i64;
324
+ break;
325
+ case TCG_TYPE_V64:
326
+ case TCG_TYPE_V128:
327
+ case TCG_TYPE_V256:
328
+ not_op = INDEX_op_not_vec;
329
+ have_not = TCG_TARGET_HAS_not_vec;
330
+ break;
331
+ default:
332
+ g_assert_not_reached();
333
}
334
if (!have_not) {
335
break;
336
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
337
below, we can ignore high bits, but for further optimizations we
338
need to record that the high bits contain garbage. */
339
partmask = z_mask;
340
- if (!(def->flags & TCG_OPF_64BIT)) {
341
+ if (ctx.type == TCG_TYPE_I32) {
342
z_mask |= ~(tcg_target_ulong)0xffffffffu;
343
partmask &= 0xffffffffu;
344
affected &= 0xffffffffu;
345
--
43
--
346
2.25.1
44
2.25.1
347
45
348
46
From: Luis Pires <luis.pires@eldorado.org.br>

In preparation for changing the divu128/divs128 implementations
to allow for quotients larger than 64 bits, move the div-by-zero
and overflow checks to the callers.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |  5 +++--
 include/qemu/host-utils.h | 34 ++++++++++++---------------------
 target/ppc/int_helper.c   | 14 +++++++++-----
 util/host-utils.c         | 40 ++++++++++++++++++---------------------
 4 files changed, 42 insertions(+), 51 deletions(-)

From: Ziqiao Kong <ziqiaokong@gmail.com>

The last entry of DEF_HELPERS_FLAGS_n is DEF_HELPER_FLAGS_7 and
thus the MAX_OPC_PARAM_IARGS should be 7.

Reviewed-by: Taylor Simpson <tsimpson@quicinc.com>
Signed-off-by: Ziqiao Kong <ziqiaokong@gmail.com>
Message-Id: <20220227113127.414533-2-ziqiaokong@gmail.com>
Fixes: e6cadf49c3d ("tcg: Add support for a helper with 7 arguments")
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h        | 2 +-
 tcg/tci/tcg-target.c.inc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
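A compact sketch of the caller-side checking described above, modelled on the helper_divdeu() hunk visible further down. The wrapper name is made up for illustration:

    #include <stdbool.h>
    #include <stdint.h>
    #include "qemu/host-utils.h"   /* divu128() returns void after this patch */

    /* Reject divide-by-zero and quotient overflow before dividing.
     * On entry 'hi' is the high half of the dividend; divu128() then
     * leaves the 64-bit quotient in 'lo' and the remainder in 'hi'. */
    static uint64_t div128by64_checked(uint64_t lo, uint64_t hi,
                                       uint64_t divisor, bool *overflow)
    {
        if (divisor == 0 || hi >= divisor) {
            *overflow = true;      /* quotient would not fit in 64 bits */
            return 0;
        }
        *overflow = false;
        divu128(&lo, &hi, divisor);
        return lo;
    }
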
18
diff --git a/include/hw/clock.h b/include/hw/clock.h
16
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
19
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
20
--- a/include/hw/clock.h
18
--- a/include/tcg/tcg.h
21
+++ b/include/hw/clock.h
19
+++ b/include/tcg/tcg.h
22
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
20
@@ -XXX,XX +XXX,XX @@
23
return 0;
21
#else
24
}
22
#define MAX_OPC_PARAM_PER_ARG 1
25
/*
23
#endif
26
- * Ignore divu128() return value as we've caught div-by-zero and don't
24
-#define MAX_OPC_PARAM_IARGS 6
27
- * need different behaviour for overflow.
25
+#define MAX_OPC_PARAM_IARGS 7
28
+ * BUG: when CONFIG_INT128 is not defined, the current implementation of
26
#define MAX_OPC_PARAM_OARGS 1
29
+ * divu128 does not return a valid truncated quotient, so the result will
27
#define MAX_OPC_PARAM_ARGS (MAX_OPC_PARAM_IARGS + MAX_OPC_PARAM_OARGS)
30
+ * be wrong.
28
31
*/
29
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
32
divu128(&lo, &hi, clk->period);
33
return lo;
34
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
35
index XXXXXXX..XXXXXXX 100644
30
index XXXXXXX..XXXXXXX 100644
36
--- a/include/qemu/host-utils.h
31
--- a/tcg/tci/tcg-target.c.inc
37
+++ b/include/qemu/host-utils.h
32
+++ b/tcg/tci/tcg-target.c.inc
38
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
33
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_reg_alloc_order[] = {
39
return (__int128_t)a * b / c;
34
TCG_REG_R0,
40
}
35
};
41
36
42
-static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
37
-#if MAX_OPC_PARAM_IARGS != 6
43
+static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
38
+#if MAX_OPC_PARAM_IARGS != 7
44
{
39
# error Fix needed, number of supported input arguments changed!
45
- if (divisor == 0) {
46
- return 1;
47
- } else {
48
- __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
49
- __uint128_t result = dividend / divisor;
50
- *plow = result;
51
- *phigh = dividend % divisor;
52
- return result > UINT64_MAX;
53
- }
54
+ __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
55
+ __uint128_t result = dividend / divisor;
56
+ *plow = result;
57
+ *phigh = dividend % divisor;
58
}
59
60
-static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
61
+static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
62
{
63
- if (divisor == 0) {
64
- return 1;
65
- } else {
66
- __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
67
- __int128_t result = dividend / divisor;
68
- *plow = result;
69
- *phigh = dividend % divisor;
70
- return result != *plow;
71
- }
72
+ __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
73
+ __int128_t result = dividend / divisor;
74
+ *plow = result;
75
+ *phigh = dividend % divisor;
76
}
77
#else
78
void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
79
void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
80
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
81
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
82
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
83
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
84
85
static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
86
{
87
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
88
index XXXXXXX..XXXXXXX 100644
89
--- a/target/ppc/int_helper.c
90
+++ b/target/ppc/int_helper.c
91
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
92
uint64_t rt = 0;
93
int overflow = 0;
94
95
- overflow = divu128(&rt, &ra, rb);
96
-
97
- if (unlikely(overflow)) {
98
+ if (unlikely(rb == 0 || ra >= rb)) {
99
+ overflow = 1;
100
rt = 0; /* Undefined */
101
+ } else {
102
+ divu128(&rt, &ra, rb);
103
}
104
105
if (oe) {
106
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
107
int64_t rt = 0;
108
int64_t ra = (int64_t)rau;
109
int64_t rb = (int64_t)rbu;
110
- int overflow = divs128(&rt, &ra, rb);
111
+ int overflow = 0;
112
113
- if (unlikely(overflow)) {
114
+ if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
115
+ overflow = 1;
116
rt = 0; /* Undefined */
117
+ } else {
118
+ divs128(&rt, &ra, rb);
119
}
120
121
if (oe) {
122
diff --git a/util/host-utils.c b/util/host-utils.c
123
index XXXXXXX..XXXXXXX 100644
124
--- a/util/host-utils.c
125
+++ b/util/host-utils.c
126
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
127
*phigh = rh;
128
}
129
130
-/* Unsigned 128x64 division. Returns 1 if overflow (divide by zero or */
131
-/* quotient exceeds 64 bits). Otherwise returns quotient via plow and */
132
-/* remainder via phigh. */
133
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
134
+/*
135
+ * Unsigned 128-by-64 division. Returns quotient via plow and
136
+ * remainder via phigh.
137
+ * The result must fit in 64 bits (plow) - otherwise, the result
138
+ * is undefined.
139
+ * This function will cause a division by zero if passed a zero divisor.
140
+ */
141
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
142
{
143
uint64_t dhi = *phigh;
144
uint64_t dlo = *plow;
145
unsigned i;
146
uint64_t carry = 0;
147
148
- if (divisor == 0) {
149
- return 1;
150
- } else if (dhi == 0) {
151
+ if (divisor == 0 || dhi == 0) {
152
*plow = dlo / divisor;
153
*phigh = dlo % divisor;
154
- return 0;
155
- } else if (dhi >= divisor) {
156
- return 1;
157
} else {
158
159
for (i = 0; i < 64; i++) {
160
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
161
162
*plow = dlo;
163
*phigh = dhi;
164
- return 0;
165
}
166
}
167
168
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
169
+/*
170
+ * Signed 128-by-64 division. Returns quotient via plow and
171
+ * remainder via phigh.
172
+ * The result must fit in 64 bits (plow) - otherwise, the result
173
+ * is undefined.
174
+ * This function will cause a division by zero if passed a zero divisor.
175
+ */
176
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
177
{
178
int sgn_dvdnd = *phigh < 0;
179
int sgn_divsr = divisor < 0;
180
- int overflow = 0;
181
182
if (sgn_dvdnd) {
183
*plow = ~(*plow);
184
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
185
divisor = 0 - divisor;
186
}
187
188
- overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
189
+ divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
190
191
if (sgn_dvdnd ^ sgn_divsr) {
192
*plow = 0 - *plow;
193
}
194
-
195
- if (!overflow) {
196
- if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
197
- overflow = 1;
198
- }
199
- }
200
-
201
- return overflow;
202
}
203
#endif
40
#endif
204
41
205
--
42
--
206
2.25.1
43
2.25.1
207
208
Deleted patch
From: Luis Pires <luis.pires@eldorado.org.br>

Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
so it can be reused by divu128().

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
11
include/fpu/softfloat-macros.h | 82 ----------------------------------
12
include/qemu/host-utils.h | 81 +++++++++++++++++++++++++++++++++
13
2 files changed, 81 insertions(+), 82 deletions(-)
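For orientation, a minimal use of the helper being moved (illustrative only; it assumes the usual udiv_qrnnd() precondition that the high word is already smaller than the divisor):

    #include <stdint.h>
    #include "qemu/host-utils.h"   /* udiv_qrnnd() lives here after this patch */

    /* Divide the 128-bit value (n1:n0) by d, with n1 < d so the 64-bit
     * quotient cannot overflow; the remainder is stored in *rem. */
    static uint64_t div_128_by_64_example(uint64_t n1, uint64_t n0,
                                          uint64_t d, uint64_t *rem)
    {
        return udiv_qrnnd(rem, n1, n0, d);
    }
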
14
15
diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
16
index XXXXXXX..XXXXXXX 100644
17
--- a/include/fpu/softfloat-macros.h
18
+++ b/include/fpu/softfloat-macros.h
19
@@ -XXX,XX +XXX,XX @@
20
* so some portions are provided under:
21
* the SoftFloat-2a license
22
* the BSD license
23
- * GPL-v2-or-later
24
*
25
* Any future contributions to this file after December 1st 2014 will be
26
* taken to be licensed under the Softfloat-2a license unless specifically
27
@@ -XXX,XX +XXX,XX @@ this code that are retained.
28
* THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
-/* Portions of this work are licensed under the terms of the GNU GPL,
32
- * version 2 or later. See the COPYING file in the top-level directory.
33
- */
34
-
35
#ifndef FPU_SOFTFLOAT_MACROS_H
36
#define FPU_SOFTFLOAT_MACROS_H
37
38
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
39
40
}
41
42
-/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
43
- * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
44
- *
45
- * Licensed under the GPLv2/LGPLv3
46
- */
47
-static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
48
- uint64_t n0, uint64_t d)
49
-{
50
-#if defined(__x86_64__)
51
- uint64_t q;
52
- asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
53
- return q;
54
-#elif defined(__s390x__) && !defined(__clang__)
55
- /* Need to use a TImode type to get an even register pair for DLGR. */
56
- unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
57
- asm("dlgr %0, %1" : "+r"(n) : "r"(d));
58
- *r = n >> 64;
59
- return n;
60
-#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
61
- /* From Power ISA 2.06, programming note for divdeu. */
62
- uint64_t q1, q2, Q, r1, r2, R;
63
- asm("divdeu %0,%2,%4; divdu %1,%3,%4"
64
- : "=&r"(q1), "=r"(q2)
65
- : "r"(n1), "r"(n0), "r"(d));
66
- r1 = -(q1 * d); /* low part of (n1<<64) - (q1 * d) */
67
- r2 = n0 - (q2 * d);
68
- Q = q1 + q2;
69
- R = r1 + r2;
70
- if (R >= d || R < r2) { /* overflow implies R > d */
71
- Q += 1;
72
- R -= d;
73
- }
74
- *r = R;
75
- return Q;
76
-#else
77
- uint64_t d0, d1, q0, q1, r1, r0, m;
78
-
79
- d0 = (uint32_t)d;
80
- d1 = d >> 32;
81
-
82
- r1 = n1 % d1;
83
- q1 = n1 / d1;
84
- m = q1 * d0;
85
- r1 = (r1 << 32) | (n0 >> 32);
86
- if (r1 < m) {
87
- q1 -= 1;
88
- r1 += d;
89
- if (r1 >= d) {
90
- if (r1 < m) {
91
- q1 -= 1;
92
- r1 += d;
93
- }
94
- }
95
- }
96
- r1 -= m;
97
-
98
- r0 = r1 % d1;
99
- q0 = r1 / d1;
100
- m = q0 * d0;
101
- r0 = (r0 << 32) | (uint32_t)n0;
102
- if (r0 < m) {
103
- q0 -= 1;
104
- r0 += d;
105
- if (r0 >= d) {
106
- if (r0 < m) {
107
- q0 -= 1;
108
- r0 += d;
109
- }
110
- }
111
- }
112
- r0 -= m;
113
-
114
- *r = r0;
115
- return (q1 << 32) | q0;
116
-#endif
117
-}
118
-
119
/*----------------------------------------------------------------------------
120
| Returns an approximation to the square root of the 32-bit significand given
121
| by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
122
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
123
index XXXXXXX..XXXXXXX 100644
124
--- a/include/qemu/host-utils.h
125
+++ b/include/qemu/host-utils.h
126
@@ -XXX,XX +XXX,XX @@
127
* THE SOFTWARE.
128
*/
129
130
+/* Portions of this work are licensed under the terms of the GNU GPL,
131
+ * version 2 or later. See the COPYING file in the top-level directory.
132
+ */
133
+
134
#ifndef HOST_UTILS_H
135
#define HOST_UTILS_H
136
137
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
138
*/
139
void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
140
141
+/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
142
+ * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
143
+ *
144
+ * Licensed under the GPLv2/LGPLv3
145
+ */
146
+static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
147
+ uint64_t n0, uint64_t d)
148
+{
149
+#if defined(__x86_64__)
150
+ uint64_t q;
151
+ asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
152
+ return q;
153
+#elif defined(__s390x__) && !defined(__clang__)
154
+ /* Need to use a TImode type to get an even register pair for DLGR. */
155
+ unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
156
+ asm("dlgr %0, %1" : "+r"(n) : "r"(d));
157
+ *r = n >> 64;
158
+ return n;
159
+#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
160
+ /* From Power ISA 2.06, programming note for divdeu. */
161
+ uint64_t q1, q2, Q, r1, r2, R;
162
+ asm("divdeu %0,%2,%4; divdu %1,%3,%4"
163
+ : "=&r"(q1), "=r"(q2)
164
+ : "r"(n1), "r"(n0), "r"(d));
165
+ r1 = -(q1 * d); /* low part of (n1<<64) - (q1 * d) */
166
+ r2 = n0 - (q2 * d);
167
+ Q = q1 + q2;
168
+ R = r1 + r2;
169
+ if (R >= d || R < r2) { /* overflow implies R > d */
170
+ Q += 1;
171
+ R -= d;
172
+ }
173
+ *r = R;
174
+ return Q;
175
+#else
176
+ uint64_t d0, d1, q0, q1, r1, r0, m;
177
+
178
+ d0 = (uint32_t)d;
179
+ d1 = d >> 32;
180
+
181
+ r1 = n1 % d1;
182
+ q1 = n1 / d1;
183
+ m = q1 * d0;
184
+ r1 = (r1 << 32) | (n0 >> 32);
185
+ if (r1 < m) {
186
+ q1 -= 1;
187
+ r1 += d;
188
+ if (r1 >= d) {
189
+ if (r1 < m) {
190
+ q1 -= 1;
191
+ r1 += d;
192
+ }
193
+ }
194
+ }
195
+ r1 -= m;
196
+
197
+ r0 = r1 % d1;
198
+ q0 = r1 / d1;
199
+ m = q0 * d0;
200
+ r0 = (r0 << 32) | (uint32_t)n0;
201
+ if (r0 < m) {
202
+ q0 -= 1;
203
+ r0 += d;
204
+ if (r0 >= d) {
205
+ if (r0 < m) {
206
+ q0 -= 1;
207
+ r0 += d;
208
+ }
209
+ }
210
+ }
211
+ r0 -= m;
212
+
213
+ *r = r0;
214
+ return (q1 << 32) | q0;
215
+#endif
216
+}
217
+
218
#endif
219
--
220
2.25.1
221
222
Deleted patch
From: Luis Pires <luis.pires@eldorado.org.br>

These will be used to implement new decimal floating point
instructions from Power ISA 3.1.

The remainder is now returned directly by divu128/divs128,
freeing up phigh to receive the high 64 bits of the quotient.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
14
include/hw/clock.h | 6 +-
15
include/qemu/host-utils.h | 20 ++++--
16
target/ppc/int_helper.c | 9 +--
17
util/host-utils.c | 133 +++++++++++++++++++++++++-------------
18
4 files changed, 108 insertions(+), 60 deletions(-)
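A short sketch of the widened interface this message describes, where the 128-bit quotient comes back through plow/phigh and the remainder becomes the return value. This is an illustrative wrapper, not part of the patch:

    #include <stdint.h>
    #include "qemu/host-utils.h"

    static uint64_t quot128_example(uint64_t lo, uint64_t hi,
                                    uint64_t divisor,
                                    uint64_t *q_lo, uint64_t *q_hi)
    {
        uint64_t rem = divu128(&lo, &hi, divisor);  /* quotient in lo:hi */
        *q_lo = lo;
        *q_hi = hi;
        return rem;
    }
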
19
20
diff --git a/include/hw/clock.h b/include/hw/clock.h
21
index XXXXXXX..XXXXXXX 100644
22
--- a/include/hw/clock.h
23
+++ b/include/hw/clock.h
24
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
25
if (clk->period == 0) {
26
return 0;
27
}
28
- /*
29
- * BUG: when CONFIG_INT128 is not defined, the current implementation of
30
- * divu128 does not return a valid truncated quotient, so the result will
31
- * be wrong.
32
- */
33
+
34
divu128(&lo, &hi, clk->period);
35
return lo;
36
}
37
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
38
index XXXXXXX..XXXXXXX 100644
39
--- a/include/qemu/host-utils.h
40
+++ b/include/qemu/host-utils.h
41
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
42
return (__int128_t)a * b / c;
43
}
44
45
-static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
46
+static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
47
+ uint64_t divisor)
48
{
49
__uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
50
__uint128_t result = dividend / divisor;
51
+
52
*plow = result;
53
- *phigh = dividend % divisor;
54
+ *phigh = result >> 64;
55
+ return dividend % divisor;
56
}
57
58
-static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
59
+static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
60
+ int64_t divisor)
61
{
62
- __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
63
+ __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
64
__int128_t result = dividend / divisor;
65
+
66
*plow = result;
67
- *phigh = dividend % divisor;
68
+ *phigh = result >> 64;
69
+ return dividend % divisor;
70
}
71
#else
72
void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
73
void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
74
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
75
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
76
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
77
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
78
79
static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
80
{
81
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
82
index XXXXXXX..XXXXXXX 100644
83
--- a/target/ppc/int_helper.c
84
+++ b/target/ppc/int_helper.c
85
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
86
87
uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
88
{
89
- int64_t rt = 0;
90
+ uint64_t rt = 0;
91
int64_t ra = (int64_t)rau;
92
int64_t rb = (int64_t)rbu;
93
int overflow = 0;
94
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
95
int cr;
96
uint64_t lo_value;
97
uint64_t hi_value;
98
+ uint64_t rem;
99
ppc_avr_t ret = { .u64 = { 0, 0 } };
100
101
if (b->VsrSD(0) < 0) {
102
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
103
* In that case, we leave r unchanged.
104
*/
105
} else {
106
- divu128(&lo_value, &hi_value, 1000000000000000ULL);
107
+ rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
108
109
- for (i = 1; i < 16; hi_value /= 10, i++) {
110
- bcd_put_digit(&ret, hi_value % 10, i);
111
+ for (i = 1; i < 16; rem /= 10, i++) {
112
+ bcd_put_digit(&ret, rem % 10, i);
113
}
114
115
for (; i < 32; lo_value /= 10, i++) {
116
diff --git a/util/host-utils.c b/util/host-utils.c
117
index XXXXXXX..XXXXXXX 100644
118
--- a/util/host-utils.c
119
+++ b/util/host-utils.c
120
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
121
}
122
123
/*
124
- * Unsigned 128-by-64 division. Returns quotient via plow and
125
- * remainder via phigh.
126
- * The result must fit in 64 bits (plow) - otherwise, the result
127
- * is undefined.
128
- * This function will cause a division by zero if passed a zero divisor.
129
+ * Unsigned 128-by-64 division.
130
+ * Returns the remainder.
131
+ * Returns quotient via plow and phigh.
132
+ * Also returns the remainder via the function return value.
133
*/
134
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
135
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
136
{
137
uint64_t dhi = *phigh;
138
uint64_t dlo = *plow;
139
- unsigned i;
140
- uint64_t carry = 0;
141
+ uint64_t rem, dhighest;
142
+ int sh;
143
144
if (divisor == 0 || dhi == 0) {
145
*plow = dlo / divisor;
146
- *phigh = dlo % divisor;
147
+ *phigh = 0;
148
+ return dlo % divisor;
149
} else {
150
+ sh = clz64(divisor);
151
152
- for (i = 0; i < 64; i++) {
153
- carry = dhi >> 63;
154
- dhi = (dhi << 1) | (dlo >> 63);
155
- if (carry || (dhi >= divisor)) {
156
- dhi -= divisor;
157
- carry = 1;
158
- } else {
159
- carry = 0;
160
+ if (dhi < divisor) {
161
+ if (sh != 0) {
162
+ /* normalize the divisor, shifting the dividend accordingly */
163
+ divisor <<= sh;
164
+ dhi = (dhi << sh) | (dlo >> (64 - sh));
165
+ dlo <<= sh;
166
}
167
- dlo = (dlo << 1) | carry;
168
+
169
+ *phigh = 0;
170
+ *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
171
+ } else {
172
+ if (sh != 0) {
173
+ /* normalize the divisor, shifting the dividend accordingly */
174
+ divisor <<= sh;
175
+ dhighest = dhi >> (64 - sh);
176
+ dhi = (dhi << sh) | (dlo >> (64 - sh));
177
+ dlo <<= sh;
178
+
179
+ *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
180
+ } else {
181
+ /**
182
+ * dhi >= divisor
183
+ * Since the MSB of divisor is set (sh == 0),
184
+ * (dhi - divisor) < divisor
185
+ *
186
+ * Thus, the high part of the quotient is 1, and we can
187
+ * calculate the low part with a single call to udiv_qrnnd
188
+ * after subtracting divisor from dhi
189
+ */
190
+ dhi -= divisor;
191
+ *phigh = 1;
192
+ }
193
+
194
+ *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
195
}
196
197
- *plow = dlo;
198
- *phigh = dhi;
199
+ /*
200
+ * since the dividend/divisor might have been normalized,
201
+ * the remainder might also have to be shifted back
202
+ */
203
+ return rem >> sh;
204
}
205
}
206
207
/*
208
- * Signed 128-by-64 division. Returns quotient via plow and
209
- * remainder via phigh.
210
- * The result must fit in 64 bits (plow) - otherwise, the result
211
- * is undefined.
212
- * This function will cause a division by zero if passed a zero divisor.
213
+ * Signed 128-by-64 division.
214
+ * Returns quotient via plow and phigh.
215
+ * Also returns the remainder via the function return value.
216
*/
217
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
218
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
219
{
220
- int sgn_dvdnd = *phigh < 0;
221
- int sgn_divsr = divisor < 0;
222
+ bool neg_quotient = false, neg_remainder = false;
223
+ uint64_t unsig_hi = *phigh, unsig_lo = *plow;
224
+ uint64_t rem;
225
226
- if (sgn_dvdnd) {
227
- *plow = ~(*plow);
228
- *phigh = ~(*phigh);
229
- if (*plow == (int64_t)-1) {
230
+ if (*phigh < 0) {
231
+ neg_quotient = !neg_quotient;
232
+ neg_remainder = !neg_remainder;
233
+
234
+ if (unsig_lo == 0) {
235
+ unsig_hi = -unsig_hi;
236
+ } else {
237
+ unsig_hi = ~unsig_hi;
238
+ unsig_lo = -unsig_lo;
239
+ }
240
+ }
241
+
242
+ if (divisor < 0) {
243
+ neg_quotient = !neg_quotient;
244
+
245
+ divisor = -divisor;
246
+ }
247
+
248
+ rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
249
+
250
+ if (neg_quotient) {
251
+ if (unsig_lo == 0) {
252
+ *phigh = -unsig_hi;
253
*plow = 0;
254
- (*phigh)++;
255
- } else {
256
- (*plow)++;
257
- }
258
+ } else {
259
+ *phigh = ~unsig_hi;
260
+ *plow = -unsig_lo;
261
+ }
262
+ } else {
263
+ *phigh = unsig_hi;
264
+ *plow = unsig_lo;
265
}
266
267
- if (sgn_divsr) {
268
- divisor = 0 - divisor;
269
- }
270
-
271
- divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
272
-
273
- if (sgn_dvdnd ^ sgn_divsr) {
274
- *plow = 0 - *plow;
275
+ if (neg_remainder) {
276
+ return -rem;
277
+ } else {
278
+ return rem;
279
}
280
}
281
#endif
282
--
283
2.25.1
284
285
Rename to fold_addsub2.
Use Int128 to implement the wider operation.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 21 deletions(-)

We've had placeholders for these opcodes for a while,
and should have support on ppc, s390x and avx512 hosts.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-opc.h    |  3 +++
 include/tcg/tcg.h        |  3 +++
 tcg/aarch64/tcg-target.h |  3 +++
 tcg/arm/tcg-target.h     |  3 +++
 tcg/i386/tcg-target.h    |  3 +++
 tcg/ppc/tcg-target.h     |  3 +++
 tcg/s390x/tcg-target.h   |  3 +++
 tcg/optimize.c           | 12 ++++++------
 tcg/tcg-op-vec.c         | 27 ++++++++++++++++++---------
 tcg/tcg.c                |  6 ++++++
 10 files changed, 51 insertions(+), 15 deletions(-)
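For quick reference, the element-wise semantics behind the three new vector opcodes, matching the constant-folding cases added to tcg/optimize.c below (shown on a single 64-bit lane, as an illustration only):

    #include <stdint.h>

    static uint64_t nand64(uint64_t x, uint64_t y) { return ~(x & y); }
    static uint64_t nor64 (uint64_t x, uint64_t y) { return ~(x | y); }
    static uint64_t eqv64 (uint64_t x, uint64_t y) { return ~(x ^ y); }
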
21
diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
22
index XXXXXXX..XXXXXXX 100644
23
--- a/include/tcg/tcg-opc.h
24
+++ b/include/tcg/tcg-opc.h
25
@@ -XXX,XX +XXX,XX @@ DEF(or_vec, 1, 2, 0, IMPLVEC)
26
DEF(xor_vec, 1, 2, 0, IMPLVEC)
27
DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
28
DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
29
+DEF(nand_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_nand_vec))
30
+DEF(nor_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_nor_vec))
31
+DEF(eqv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_eqv_vec))
32
DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
33
34
DEF(shli_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec))
35
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
36
index XXXXXXX..XXXXXXX 100644
37
--- a/include/tcg/tcg.h
38
+++ b/include/tcg/tcg.h
39
@@ -XXX,XX +XXX,XX @@ typedef uint64_t TCGRegSet;
40
#define TCG_TARGET_HAS_not_vec 0
41
#define TCG_TARGET_HAS_andc_vec 0
42
#define TCG_TARGET_HAS_orc_vec 0
43
+#define TCG_TARGET_HAS_nand_vec 0
44
+#define TCG_TARGET_HAS_nor_vec 0
45
+#define TCG_TARGET_HAS_eqv_vec 0
46
#define TCG_TARGET_HAS_roti_vec 0
47
#define TCG_TARGET_HAS_rots_vec 0
48
#define TCG_TARGET_HAS_rotv_vec 0
49
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
50
index XXXXXXX..XXXXXXX 100644
51
--- a/tcg/aarch64/tcg-target.h
52
+++ b/tcg/aarch64/tcg-target.h
53
@@ -XXX,XX +XXX,XX @@ typedef enum {
54
55
#define TCG_TARGET_HAS_andc_vec 1
56
#define TCG_TARGET_HAS_orc_vec 1
57
+#define TCG_TARGET_HAS_nand_vec 0
58
+#define TCG_TARGET_HAS_nor_vec 0
59
+#define TCG_TARGET_HAS_eqv_vec 0
60
#define TCG_TARGET_HAS_not_vec 1
61
#define TCG_TARGET_HAS_neg_vec 1
62
#define TCG_TARGET_HAS_abs_vec 1
63
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
64
index XXXXXXX..XXXXXXX 100644
65
--- a/tcg/arm/tcg-target.h
66
+++ b/tcg/arm/tcg-target.h
67
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
68
69
#define TCG_TARGET_HAS_andc_vec 1
70
#define TCG_TARGET_HAS_orc_vec 1
71
+#define TCG_TARGET_HAS_nand_vec 0
72
+#define TCG_TARGET_HAS_nor_vec 0
73
+#define TCG_TARGET_HAS_eqv_vec 0
74
#define TCG_TARGET_HAS_not_vec 1
75
#define TCG_TARGET_HAS_neg_vec 1
76
#define TCG_TARGET_HAS_abs_vec 1
77
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
78
index XXXXXXX..XXXXXXX 100644
79
--- a/tcg/i386/tcg-target.h
80
+++ b/tcg/i386/tcg-target.h
81
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
82
83
#define TCG_TARGET_HAS_andc_vec 1
84
#define TCG_TARGET_HAS_orc_vec 0
85
+#define TCG_TARGET_HAS_nand_vec 0
86
+#define TCG_TARGET_HAS_nor_vec 0
87
+#define TCG_TARGET_HAS_eqv_vec 0
88
#define TCG_TARGET_HAS_not_vec 0
89
#define TCG_TARGET_HAS_neg_vec 0
90
#define TCG_TARGET_HAS_abs_vec 1
91
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
92
index XXXXXXX..XXXXXXX 100644
93
--- a/tcg/ppc/tcg-target.h
94
+++ b/tcg/ppc/tcg-target.h
95
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
96
97
#define TCG_TARGET_HAS_andc_vec 1
98
#define TCG_TARGET_HAS_orc_vec have_isa_2_07
99
+#define TCG_TARGET_HAS_nand_vec 0
100
+#define TCG_TARGET_HAS_nor_vec 0
101
+#define TCG_TARGET_HAS_eqv_vec 0
102
#define TCG_TARGET_HAS_not_vec 1
103
#define TCG_TARGET_HAS_neg_vec have_isa_3_00
104
#define TCG_TARGET_HAS_abs_vec 0
105
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
106
index XXXXXXX..XXXXXXX 100644
107
--- a/tcg/s390x/tcg-target.h
108
+++ b/tcg/s390x/tcg-target.h
109
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
110
111
#define TCG_TARGET_HAS_andc_vec 1
112
#define TCG_TARGET_HAS_orc_vec HAVE_FACILITY(VECTOR_ENH1)
113
+#define TCG_TARGET_HAS_nand_vec 0
114
+#define TCG_TARGET_HAS_nor_vec 0
115
+#define TCG_TARGET_HAS_eqv_vec 0
116
#define TCG_TARGET_HAS_not_vec 1
117
#define TCG_TARGET_HAS_neg_vec 1
118
#define TCG_TARGET_HAS_abs_vec 1
12
diff --git a/tcg/optimize.c b/tcg/optimize.c
119
diff --git a/tcg/optimize.c b/tcg/optimize.c
13
index XXXXXXX..XXXXXXX 100644
120
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/optimize.c
121
--- a/tcg/optimize.c
15
+++ b/tcg/optimize.c
122
+++ b/tcg/optimize.c
16
@@ -XXX,XX +XXX,XX @@
123
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
17
*/
124
CASE_OP_32_64_VEC(orc):
18
125
return x | ~y;
19
#include "qemu/osdep.h"
126
20
+#include "qemu/int128.h"
127
- CASE_OP_32_64(eqv):
21
#include "tcg/tcg-op.h"
128
+ CASE_OP_32_64_VEC(eqv):
22
#include "tcg-internal.h"
129
return ~(x ^ y);
23
130
24
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
131
- CASE_OP_32_64(nand):
25
return false;
132
+ CASE_OP_32_64_VEC(nand):
133
return ~(x & y);
134
135
- CASE_OP_32_64(nor):
136
+ CASE_OP_32_64_VEC(nor):
137
return ~(x | y);
138
139
case INDEX_op_clz_i32:
140
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
141
case INDEX_op_dup2_vec:
142
done = fold_dup2(&ctx, op);
143
break;
144
- CASE_OP_32_64(eqv):
145
+ CASE_OP_32_64_VEC(eqv):
146
done = fold_eqv(&ctx, op);
147
break;
148
CASE_OP_32_64(extract):
149
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
150
CASE_OP_32_64(mulu2):
151
done = fold_multiply2(&ctx, op);
152
break;
153
- CASE_OP_32_64(nand):
154
+ CASE_OP_32_64_VEC(nand):
155
done = fold_nand(&ctx, op);
156
break;
157
CASE_OP_32_64(neg):
158
done = fold_neg(&ctx, op);
159
break;
160
- CASE_OP_32_64(nor):
161
+ CASE_OP_32_64_VEC(nor):
162
done = fold_nor(&ctx, op);
163
break;
164
CASE_OP_32_64_VEC(not):
165
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
166
index XXXXXXX..XXXXXXX 100644
167
--- a/tcg/tcg-op-vec.c
168
+++ b/tcg/tcg-op-vec.c
169
@@ -XXX,XX +XXX,XX @@ void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
170
171
void tcg_gen_nand_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
172
{
173
- /* TODO: Add TCG_TARGET_HAS_nand_vec when adding a backend supports it. */
174
- tcg_gen_and_vec(0, r, a, b);
175
- tcg_gen_not_vec(0, r, r);
176
+ if (TCG_TARGET_HAS_nand_vec) {
177
+ vec_gen_op3(INDEX_op_nand_vec, 0, r, a, b);
178
+ } else {
179
+ tcg_gen_and_vec(0, r, a, b);
180
+ tcg_gen_not_vec(0, r, r);
181
+ }
26
}
182
}
27
183
28
-static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
184
void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
29
+static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
30
{
185
{
31
if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
186
- /* TODO: Add TCG_TARGET_HAS_nor_vec when adding a backend supports it. */
32
arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
187
- tcg_gen_or_vec(0, r, a, b);
33
- uint32_t al = arg_info(op->args[2])->val;
188
- tcg_gen_not_vec(0, r, r);
34
- uint32_t ah = arg_info(op->args[3])->val;
189
+ if (TCG_TARGET_HAS_nor_vec) {
35
- uint32_t bl = arg_info(op->args[4])->val;
190
+ vec_gen_op3(INDEX_op_nor_vec, 0, r, a, b);
36
- uint32_t bh = arg_info(op->args[5])->val;
191
+ } else {
37
- uint64_t a = ((uint64_t)ah << 32) | al;
192
+ tcg_gen_or_vec(0, r, a, b);
38
- uint64_t b = ((uint64_t)bh << 32) | bl;
193
+ tcg_gen_not_vec(0, r, r);
39
+ uint64_t al = arg_info(op->args[2])->val;
194
+ }
40
+ uint64_t ah = arg_info(op->args[3])->val;
41
+ uint64_t bl = arg_info(op->args[4])->val;
42
+ uint64_t bh = arg_info(op->args[5])->val;
43
TCGArg rl, rh;
44
- TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
45
+ TCGOp *op2;
46
47
- if (add) {
48
- a += b;
49
+ if (ctx->type == TCG_TYPE_I32) {
50
+ uint64_t a = deposit64(al, 32, 32, ah);
51
+ uint64_t b = deposit64(bl, 32, 32, bh);
52
+
53
+ if (add) {
54
+ a += b;
55
+ } else {
56
+ a -= b;
57
+ }
58
+
59
+ al = sextract64(a, 0, 32);
60
+ ah = sextract64(a, 32, 32);
61
} else {
62
- a -= b;
63
+ Int128 a = int128_make128(al, ah);
64
+ Int128 b = int128_make128(bl, bh);
65
+
66
+ if (add) {
67
+ a = int128_add(a, b);
68
+ } else {
69
+ a = int128_sub(a, b);
70
+ }
71
+
72
+ al = int128_getlo(a);
73
+ ah = int128_gethi(a);
74
}
75
76
rl = op->args[0];
77
rh = op->args[1];
78
- tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
79
- tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
80
+
81
+ /* The proper opcode is supplied by tcg_opt_gen_mov. */
82
+ op2 = tcg_op_insert_before(ctx->tcg, op, 0);
83
+
84
+ tcg_opt_gen_movi(ctx, op, rl, al);
85
+ tcg_opt_gen_movi(ctx, op2, rh, ah);
86
return true;
87
}
88
return false;
89
}
195
}
90
196
91
-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
197
void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
92
+static bool fold_add2(OptContext *ctx, TCGOp *op)
93
{
198
{
94
- return fold_addsub2_i32(ctx, op, true);
199
- /* TODO: Add TCG_TARGET_HAS_eqv_vec when adding a backend supports it. */
95
+ return fold_addsub2(ctx, op, true);
200
- tcg_gen_xor_vec(0, r, a, b);
201
- tcg_gen_not_vec(0, r, r);
202
+ if (TCG_TARGET_HAS_eqv_vec) {
203
+ vec_gen_op3(INDEX_op_eqv_vec, 0, r, a, b);
204
+ } else {
205
+ tcg_gen_xor_vec(0, r, a, b);
206
+ tcg_gen_not_vec(0, r, r);
207
+ }
96
}
208
}
97
209
98
static bool fold_and(OptContext *ctx, TCGOp *op)
210
static bool do_op2(unsigned vece, TCGv_vec r, TCGv_vec a, TCGOpcode opc)
99
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
211
diff --git a/tcg/tcg.c b/tcg/tcg.c
100
return false;
212
index XXXXXXX..XXXXXXX 100644
101
}
213
--- a/tcg/tcg.c
102
214
+++ b/tcg/tcg.c
103
-static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
215
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
104
+static bool fold_sub2(OptContext *ctx, TCGOp *op)
216
return have_vec && TCG_TARGET_HAS_andc_vec;
105
{
217
case INDEX_op_orc_vec:
106
- return fold_addsub2_i32(ctx, op, false);
218
return have_vec && TCG_TARGET_HAS_orc_vec;
107
+ return fold_addsub2(ctx, op, false);
219
+ case INDEX_op_nand_vec:
108
}
220
+ return have_vec && TCG_TARGET_HAS_nand_vec;
109
221
+ case INDEX_op_nor_vec:
110
static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
222
+ return have_vec && TCG_TARGET_HAS_nor_vec;
111
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
223
+ case INDEX_op_eqv_vec:
112
CASE_OP_32_64_VEC(add):
224
+ return have_vec && TCG_TARGET_HAS_eqv_vec;
113
done = fold_add(&ctx, op);
225
case INDEX_op_mul_vec:
114
break;
226
return have_vec && TCG_TARGET_HAS_mul_vec;
115
- case INDEX_op_add2_i32:
227
case INDEX_op_shli_vec:
116
- done = fold_add2_i32(&ctx, op);
117
+ CASE_OP_32_64(add2):
118
+ done = fold_add2(&ctx, op);
119
break;
120
CASE_OP_32_64_VEC(and):
121
done = fold_and(&ctx, op);
122
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
123
CASE_OP_32_64_VEC(sub):
124
done = fold_sub(&ctx, op);
125
break;
126
- case INDEX_op_sub2_i32:
127
- done = fold_sub2_i32(&ctx, op);
128
+ CASE_OP_32_64(sub2):
129
+ done = fold_sub2(&ctx, op);
130
break;
131
CASE_OP_32_64_VEC(xor):
132
done = fold_xor(&ctx, op);
133
--
2.25.1

1
Recognize the constant function for remainder.
1
Tested-by: Alex Bennée <alex.bennee@linaro.org>
2
2
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
4
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
5
---
7
tcg/optimize.c | 6 +++++-
6
tcg/ppc/tcg-target.h | 6 +++---
8
1 file changed, 5 insertions(+), 1 deletion(-)
7
tcg/ppc/tcg-target.c.inc | 15 +++++++++++++++
8
2 files changed, 18 insertions(+), 3 deletions(-)
9
9
10
diff --git a/tcg/optimize.c b/tcg/optimize.c
10
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
11
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/optimize.c
12
--- a/tcg/ppc/tcg-target.h
13
+++ b/tcg/optimize.c
13
+++ b/tcg/ppc/tcg-target.h
14
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
14
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
15
15
16
static bool fold_remainder(OptContext *ctx, TCGOp *op)
16
#define TCG_TARGET_HAS_andc_vec 1
17
{
17
#define TCG_TARGET_HAS_orc_vec have_isa_2_07
18
- return fold_const2(ctx, op);
18
-#define TCG_TARGET_HAS_nand_vec 0
19
+ if (fold_const2(ctx, op) ||
19
-#define TCG_TARGET_HAS_nor_vec 0
20
+ fold_xx_to_i(ctx, op, 0)) {
20
-#define TCG_TARGET_HAS_eqv_vec 0
21
+ return true;
21
+#define TCG_TARGET_HAS_nand_vec have_isa_2_07
22
+ }
22
+#define TCG_TARGET_HAS_nor_vec 1
23
+ return false;
23
+#define TCG_TARGET_HAS_eqv_vec have_isa_2_07
24
}
24
#define TCG_TARGET_HAS_not_vec 1
25
25
#define TCG_TARGET_HAS_neg_vec have_isa_3_00
26
static bool fold_setcond(OptContext *ctx, TCGOp *op)
26
#define TCG_TARGET_HAS_abs_vec 0
27
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
28
index XXXXXXX..XXXXXXX 100644
29
--- a/tcg/ppc/tcg-target.c.inc
30
+++ b/tcg/ppc/tcg-target.c.inc
31
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
32
case INDEX_op_xor_vec:
33
case INDEX_op_andc_vec:
34
case INDEX_op_not_vec:
35
+ case INDEX_op_nor_vec:
36
+ case INDEX_op_eqv_vec:
37
+ case INDEX_op_nand_vec:
38
return 1;
39
case INDEX_op_orc_vec:
40
return have_isa_2_07;
41
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
42
case INDEX_op_orc_vec:
43
insn = VORC;
44
break;
45
+ case INDEX_op_nand_vec:
46
+ insn = VNAND;
47
+ break;
48
+ case INDEX_op_nor_vec:
49
+ insn = VNOR;
50
+ break;
51
+ case INDEX_op_eqv_vec:
52
+ insn = VEQV;
53
+ break;
54
55
case INDEX_op_cmp_vec:
56
switch (args[3]) {
57
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
58
case INDEX_op_xor_vec:
59
case INDEX_op_andc_vec:
60
case INDEX_op_orc_vec:
61
+ case INDEX_op_nor_vec:
62
+ case INDEX_op_eqv_vec:
63
+ case INDEX_op_nand_vec:
64
case INDEX_op_cmp_vec:
65
case INDEX_op_ssadd_vec:
66
case INDEX_op_sssub_vec:
27
--
2.25.1

1
Recognize the identity function for division.
1
Tested-by: Alex Bennée <alex.bennee@linaro.org>
2
2
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
5
---
8
tcg/optimize.c | 6 +++++-
6
tcg/s390x/tcg-target.h | 6 +++---
9
1 file changed, 5 insertions(+), 1 deletion(-)
7
tcg/s390x/tcg-target.c.inc | 17 +++++++++++++++++
8
2 files changed, 20 insertions(+), 3 deletions(-)
10
9
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
10
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
12
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
12
--- a/tcg/s390x/tcg-target.h
14
+++ b/tcg/optimize.c
13
+++ b/tcg/s390x/tcg-target.h
15
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
14
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
16
15
17
static bool fold_divide(OptContext *ctx, TCGOp *op)
16
#define TCG_TARGET_HAS_andc_vec 1
18
{
17
#define TCG_TARGET_HAS_orc_vec HAVE_FACILITY(VECTOR_ENH1)
19
- return fold_const2(ctx, op);
18
-#define TCG_TARGET_HAS_nand_vec 0
20
+ if (fold_const2(ctx, op) ||
19
-#define TCG_TARGET_HAS_nor_vec 0
21
+ fold_xi_to_x(ctx, op, 1)) {
20
-#define TCG_TARGET_HAS_eqv_vec 0
22
+ return true;
21
+#define TCG_TARGET_HAS_nand_vec HAVE_FACILITY(VECTOR_ENH1)
23
+ }
22
+#define TCG_TARGET_HAS_nor_vec 1
24
+ return false;
23
+#define TCG_TARGET_HAS_eqv_vec HAVE_FACILITY(VECTOR_ENH1)
25
}
24
#define TCG_TARGET_HAS_not_vec 1
26
25
#define TCG_TARGET_HAS_neg_vec 1
27
static bool fold_dup(OptContext *ctx, TCGOp *op)
26
#define TCG_TARGET_HAS_abs_vec 1
27
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
28
index XXXXXXX..XXXXXXX 100644
29
--- a/tcg/s390x/tcg-target.c.inc
30
+++ b/tcg/s390x/tcg-target.c.inc
31
@@ -XXX,XX +XXX,XX @@ typedef enum S390Opcode {
32
VRRc_VMXL = 0xe7fd,
33
VRRc_VN = 0xe768,
34
VRRc_VNC = 0xe769,
35
+ VRRc_VNN = 0xe76e,
36
VRRc_VNO = 0xe76b,
37
+ VRRc_VNX = 0xe76c,
38
VRRc_VO = 0xe76a,
39
VRRc_VOC = 0xe76f,
40
VRRc_VPKS = 0xe797, /* we leave the m5 cs field 0 */
41
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
42
case INDEX_op_xor_vec:
43
tcg_out_insn(s, VRRc, VX, a0, a1, a2, 0);
44
break;
45
+ case INDEX_op_nand_vec:
46
+ tcg_out_insn(s, VRRc, VNN, a0, a1, a2, 0);
47
+ break;
48
+ case INDEX_op_nor_vec:
49
+ tcg_out_insn(s, VRRc, VNO, a0, a1, a2, 0);
50
+ break;
51
+ case INDEX_op_eqv_vec:
52
+ tcg_out_insn(s, VRRc, VNX, a0, a1, a2, 0);
53
+ break;
54
55
case INDEX_op_shli_vec:
56
tcg_out_insn(s, VRSa, VESL, a0, a2, TCG_REG_NONE, a1, vece);
57
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
58
case INDEX_op_and_vec:
59
case INDEX_op_andc_vec:
60
case INDEX_op_bitsel_vec:
61
+ case INDEX_op_eqv_vec:
62
+ case INDEX_op_nand_vec:
63
case INDEX_op_neg_vec:
64
+ case INDEX_op_nor_vec:
65
case INDEX_op_not_vec:
66
case INDEX_op_or_vec:
67
case INDEX_op_orc_vec:
68
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
69
case INDEX_op_or_vec:
70
case INDEX_op_orc_vec:
71
case INDEX_op_xor_vec:
72
+ case INDEX_op_nand_vec:
73
+ case INDEX_op_nor_vec:
74
+ case INDEX_op_eqv_vec:
75
case INDEX_op_cmp_vec:
76
case INDEX_op_mul_vec:
77
case INDEX_op_rotlv_vec:
28
--
2.25.1

1
Return -1 instead of 2 for failure, so that we can
use comparisons against 0 for all cases.

There are some operation sizes in some subsets of AVX512 that
are missing from previous iterations of AVX. Detect them.
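As a quick standalone illustration of the detection (a sketch, not part of the patch, x86 hosts only): it queries the same leaf-7 feature bits that the patch defines, AVX512F/DQ/BW/VL in %ebx and VBMI2 in %ecx, but for brevity omits the XCR0/xgetbv check for OPMASK and ZMM state that the real tcg_target_init() also performs.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned a, b, c, d;

    if (!__get_cpuid_count(7, 0, &a, &b, &c, &d)) {
        puts("cpuid leaf 7 not supported");
        return 0;
    }
    printf("AVX512F:     %d\n", (b >> 16) & 1);
    printf("AVX512DQ:    %d\n", (b >> 17) & 1);
    printf("AVX512BW:    %d\n", (b >> 30) & 1);
    printf("AVX512VL:    %d\n", (b >> 31) & 1);
    printf("AVX512VBMI2: %d\n", (c >> 6) & 1);
    return 0;
}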
3
3
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
4
Tested-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
7
---
8
tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
8
include/qemu/cpuid.h | 20 +++++++++++++++++---
9
1 file changed, 74 insertions(+), 71 deletions(-)
9
tcg/i386/tcg-target.h | 4 ++++
10
tcg/i386/tcg-target.c.inc | 24 ++++++++++++++++++++++--
11
3 files changed, 43 insertions(+), 5 deletions(-)
10
12
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
13
diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h
12
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
15
--- a/include/qemu/cpuid.h
14
+++ b/tcg/optimize.c
16
+++ b/include/qemu/cpuid.h
15
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
17
@@ -XXX,XX +XXX,XX @@
18
#ifndef bit_AVX2
19
#define bit_AVX2 (1 << 5)
20
#endif
21
-#ifndef bit_AVX512F
22
-#define bit_AVX512F (1 << 16)
23
-#endif
24
#ifndef bit_BMI2
25
#define bit_BMI2 (1 << 8)
26
#endif
27
+#ifndef bit_AVX512F
28
+#define bit_AVX512F (1 << 16)
29
+#endif
30
+#ifndef bit_AVX512DQ
31
+#define bit_AVX512DQ (1 << 17)
32
+#endif
33
+#ifndef bit_AVX512BW
34
+#define bit_AVX512BW (1 << 30)
35
+#endif
36
+#ifndef bit_AVX512VL
37
+#define bit_AVX512VL (1u << 31)
38
+#endif
39
+
40
+/* Leaf 7, %ecx */
41
+#ifndef bit_AVX512VBMI2
42
+#define bit_AVX512VBMI2 (1 << 6)
43
+#endif
44
45
/* Leaf 0x80000001, %ecx */
46
#ifndef bit_LZCNT
47
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
48
index XXXXXXX..XXXXXXX 100644
49
--- a/tcg/i386/tcg-target.h
50
+++ b/tcg/i386/tcg-target.h
51
@@ -XXX,XX +XXX,XX @@ extern bool have_bmi1;
52
extern bool have_popcnt;
53
extern bool have_avx1;
54
extern bool have_avx2;
55
+extern bool have_avx512bw;
56
+extern bool have_avx512dq;
57
+extern bool have_avx512vbmi2;
58
+extern bool have_avx512vl;
59
extern bool have_movbe;
60
61
/* optional instructions */
62
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
63
index XXXXXXX..XXXXXXX 100644
64
--- a/tcg/i386/tcg-target.c.inc
65
+++ b/tcg/i386/tcg-target.c.inc
66
@@ -XXX,XX +XXX,XX @@ bool have_bmi1;
67
bool have_popcnt;
68
bool have_avx1;
69
bool have_avx2;
70
+bool have_avx512bw;
71
+bool have_avx512dq;
72
+bool have_avx512vbmi2;
73
+bool have_avx512vl;
74
bool have_movbe;
75
76
#ifdef CONFIG_CPUID_H
77
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
78
static void tcg_target_init(TCGContext *s)
79
{
80
#ifdef CONFIG_CPUID_H
81
- unsigned a, b, c, d, b7 = 0;
82
+ unsigned a, b, c, d, b7 = 0, c7 = 0;
83
unsigned max = __get_cpuid_max(0, 0);
84
85
if (max >= 7) {
86
/* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
87
- __cpuid_count(7, 0, a, b7, c, d);
88
+ __cpuid_count(7, 0, a, b7, c7, d);
89
have_bmi1 = (b7 & bit_BMI) != 0;
90
have_bmi2 = (b7 & bit_BMI2) != 0;
16
}
91
}
17
}
92
@@ -XXX,XX +XXX,XX @@ static void tcg_target_init(TCGContext *s)
18
93
if ((xcrl & 6) == 6) {
19
-/* Return 2 if the condition can't be simplified, and the result
94
have_avx1 = (c & bit_AVX) != 0;
20
- of the condition (0 or 1) if it can */
95
have_avx2 = (b7 & bit_AVX2) != 0;
21
-static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
96
+
22
- TCGArg y, TCGCond c)
97
+ /*
23
+/*
98
+ * There are interesting instructions in AVX512, so long
24
+ * Return -1 if the condition can't be simplified,
99
+ * as we have AVX512VL, which indicates support for EVEX
25
+ * and the result of the condition (0 or 1) if it can.
100
+ * on sizes smaller than 512 bits. We are required to
26
+ */
101
+ * check that OPMASK and all extended ZMM state are enabled
27
+static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
102
+ * even if we're not using them -- the insns will fault.
28
+ TCGArg y, TCGCond c)
103
+ */
29
{
104
+ if ((xcrl & 0xe0) == 0xe0
30
uint64_t xv = arg_info(x)->val;
105
+ && (b7 & bit_AVX512F)
31
uint64_t yv = arg_info(y)->val;
106
+ && (b7 & bit_AVX512VL)) {
32
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
107
+ have_avx512vl = true;
33
case TCG_COND_GEU:
108
+ have_avx512bw = (b7 & bit_AVX512BW) != 0;
34
return 1;
109
+ have_avx512dq = (b7 & bit_AVX512DQ) != 0;
35
default:
110
+ have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
36
- return 2;
111
+ }
37
+ return -1;
112
}
38
}
113
}
39
}
114
}
40
- return 2;
41
+ return -1;
42
}
43
44
-/* Return 2 if the condition can't be simplified, and the result
45
- of the condition (0 or 1) if it can */
46
-static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
47
+/*
48
+ * Return -1 if the condition can't be simplified,
49
+ * and the result of the condition (0 or 1) if it can.
50
+ */
51
+static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
52
{
53
TCGArg al = p1[0], ah = p1[1];
54
TCGArg bl = p2[0], bh = p2[1];
55
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
56
if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
57
return do_constant_folding_cond_eq(c);
58
}
59
- return 2;
60
+ return -1;
61
}
62
63
static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
64
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
65
break;
66
67
CASE_OP_32_64(setcond):
68
- tmp = do_constant_folding_cond(opc, op->args[1],
69
- op->args[2], op->args[3]);
70
- if (tmp != 2) {
71
- tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
72
+ i = do_constant_folding_cond(opc, op->args[1],
73
+ op->args[2], op->args[3]);
74
+ if (i >= 0) {
75
+ tcg_opt_gen_movi(&ctx, op, op->args[0], i);
76
continue;
77
}
78
break;
79
80
CASE_OP_32_64(brcond):
81
- tmp = do_constant_folding_cond(opc, op->args[0],
82
- op->args[1], op->args[2]);
83
- switch (tmp) {
84
- case 0:
85
+ i = do_constant_folding_cond(opc, op->args[0],
86
+ op->args[1], op->args[2]);
87
+ if (i == 0) {
88
tcg_op_remove(s, op);
89
continue;
90
- case 1:
91
+ } else if (i > 0) {
92
memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
93
op->opc = opc = INDEX_op_br;
94
op->args[0] = op->args[3];
95
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
96
break;
97
98
CASE_OP_32_64(movcond):
99
- tmp = do_constant_folding_cond(opc, op->args[1],
100
- op->args[2], op->args[5]);
101
- if (tmp != 2) {
102
- tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
103
+ i = do_constant_folding_cond(opc, op->args[1],
104
+ op->args[2], op->args[5]);
105
+ if (i >= 0) {
106
+ tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
107
continue;
108
}
109
if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
110
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
111
break;
112
113
case INDEX_op_brcond2_i32:
114
- tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
115
- op->args[4]);
116
- if (tmp == 0) {
117
+ i = do_constant_folding_cond2(&op->args[0], &op->args[2],
118
+ op->args[4]);
119
+ if (i == 0) {
120
do_brcond_false:
121
tcg_op_remove(s, op);
122
continue;
123
}
124
- if (tmp == 1) {
125
+ if (i > 0) {
126
do_brcond_true:
127
op->opc = opc = INDEX_op_br;
128
op->args[0] = op->args[5];
129
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
130
if (op->args[4] == TCG_COND_EQ) {
131
/* Simplify EQ comparisons where one of the pairs
132
can be simplified. */
133
- tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
134
- op->args[0], op->args[2],
135
- TCG_COND_EQ);
136
- if (tmp == 0) {
137
+ i = do_constant_folding_cond(INDEX_op_brcond_i32,
138
+ op->args[0], op->args[2],
139
+ TCG_COND_EQ);
140
+ if (i == 0) {
141
goto do_brcond_false;
142
- } else if (tmp == 1) {
143
+ } else if (i > 0) {
144
goto do_brcond_high;
145
}
146
- tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
147
- op->args[1], op->args[3],
148
- TCG_COND_EQ);
149
- if (tmp == 0) {
150
+ i = do_constant_folding_cond(INDEX_op_brcond_i32,
151
+ op->args[1], op->args[3],
152
+ TCG_COND_EQ);
153
+ if (i == 0) {
154
goto do_brcond_false;
155
- } else if (tmp != 1) {
156
+ } else if (i < 0) {
157
break;
158
}
159
do_brcond_low:
160
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
161
if (op->args[4] == TCG_COND_NE) {
162
/* Simplify NE comparisons where one of the pairs
163
can be simplified. */
164
- tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
165
- op->args[0], op->args[2],
166
- TCG_COND_NE);
167
- if (tmp == 0) {
168
+ i = do_constant_folding_cond(INDEX_op_brcond_i32,
169
+ op->args[0], op->args[2],
170
+ TCG_COND_NE);
171
+ if (i == 0) {
172
goto do_brcond_high;
173
- } else if (tmp == 1) {
174
+ } else if (i > 0) {
175
goto do_brcond_true;
176
}
177
- tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
178
- op->args[1], op->args[3],
179
- TCG_COND_NE);
180
- if (tmp == 0) {
181
+ i = do_constant_folding_cond(INDEX_op_brcond_i32,
182
+ op->args[1], op->args[3],
183
+ TCG_COND_NE);
184
+ if (i == 0) {
185
goto do_brcond_low;
186
- } else if (tmp == 1) {
187
+ } else if (i > 0) {
188
goto do_brcond_true;
189
}
190
}
191
break;
192
193
case INDEX_op_setcond2_i32:
194
- tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
195
- op->args[5]);
196
- if (tmp != 2) {
197
+ i = do_constant_folding_cond2(&op->args[1], &op->args[3],
198
+ op->args[5]);
199
+ if (i >= 0) {
200
do_setcond_const:
201
- tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
202
+ tcg_opt_gen_movi(&ctx, op, op->args[0], i);
203
continue;
204
}
205
if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
206
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
207
if (op->args[5] == TCG_COND_EQ) {
208
/* Simplify EQ comparisons where one of the pairs
209
can be simplified. */
210
- tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
211
- op->args[1], op->args[3],
212
- TCG_COND_EQ);
213
- if (tmp == 0) {
214
+ i = do_constant_folding_cond(INDEX_op_setcond_i32,
215
+ op->args[1], op->args[3],
216
+ TCG_COND_EQ);
217
+ if (i == 0) {
218
goto do_setcond_const;
219
- } else if (tmp == 1) {
220
+ } else if (i > 0) {
221
goto do_setcond_high;
222
}
223
- tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
224
- op->args[2], op->args[4],
225
- TCG_COND_EQ);
226
- if (tmp == 0) {
227
+ i = do_constant_folding_cond(INDEX_op_setcond_i32,
228
+ op->args[2], op->args[4],
229
+ TCG_COND_EQ);
230
+ if (i == 0) {
231
goto do_setcond_high;
232
- } else if (tmp != 1) {
233
+ } else if (i < 0) {
234
break;
235
}
236
do_setcond_low:
237
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
238
if (op->args[5] == TCG_COND_NE) {
239
/* Simplify NE comparisons where one of the pairs
240
can be simplified. */
241
- tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
242
- op->args[1], op->args[3],
243
- TCG_COND_NE);
244
- if (tmp == 0) {
245
+ i = do_constant_folding_cond(INDEX_op_setcond_i32,
246
+ op->args[1], op->args[3],
247
+ TCG_COND_NE);
248
+ if (i == 0) {
249
goto do_setcond_high;
250
- } else if (tmp == 1) {
251
+ } else if (i > 0) {
252
goto do_setcond_const;
253
}
254
- tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
255
- op->args[2], op->args[4],
256
- TCG_COND_NE);
257
- if (tmp == 0) {
258
+ i = do_constant_folding_cond(INDEX_op_setcond_i32,
259
+ op->args[2], op->args[4],
260
+ TCG_COND_NE);
261
+ if (i == 0) {
262
goto do_setcond_low;
263
- } else if (tmp == 1) {
264
+ } else if (i > 0) {
265
goto do_setcond_const;
266
}
267
}
268
--
2.25.1

1
Move all of the known-zero optimizations into the per-opcode
functions. Use fold_masks when there is a possibility of the
result being determined, and simply set ctx->z_mask otherwise.

The evex encoding is added here, for use in a subsequent patch.
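A rough sketch of the decision fold_masks() makes (an illustration, not the QEMU implementation): z_mask has a 0 wherever the result bit is known zero, a_mask has a 0 wherever the result is known to equal the first input, so an all-zero z_mask folds to the constant 0 and an all-zero a_mask folds to a plain copy.

#include <stdint.h>
#include <stdio.h>

enum fold_action { FOLD_NONE, FOLD_TO_ZERO, FOLD_TO_COPY };

static enum fold_action fold_masks_sketch(uint64_t z_mask, uint64_t a_mask)
{
    if (z_mask == 0) {          /* every result bit is known zero */
        return FOLD_TO_ZERO;
    }
    if (a_mask == 0) {          /* result provably equals first input */
        return FOLD_TO_COPY;
    }
    return FOLD_NONE;
}

int main(void)
{
    /* and r, x, 0xff when x is already known to fit in 8 bits:
     * z1 = 0xff, z2 = 0xff, so a_mask = z1 & ~z2 = 0 -> copy. */
    uint64_t z1 = 0xff, z2 = 0xff;
    printf("%d\n", fold_masks_sketch(z1 & z2, z1 & ~z2)); /* 2 = copy */

    /* and r, x, 0 has z_mask = 0 -> constant 0. */
    printf("%d\n", fold_masks_sketch(0, z1));             /* 1 = zero */
    return 0;
}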
4
2
3
Tested-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
6
---
9
tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
7
tcg/i386/tcg-target.c.inc | 51 ++++++++++++++++++++++++++++++++++++++-
10
1 file changed, 294 insertions(+), 251 deletions(-)
8
1 file changed, 50 insertions(+), 1 deletion(-)
11
9
12
diff --git a/tcg/optimize.c b/tcg/optimize.c
10
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
13
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/optimize.c
12
--- a/tcg/i386/tcg-target.c.inc
15
+++ b/tcg/optimize.c
13
+++ b/tcg/i386/tcg-target.c.inc
16
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
14
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
17
TCGTempSet temps_used;
15
#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
18
16
#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
19
/* In flight values from optimization. */
17
#define P_VEXL 0x80000 /* Set VEX.L = 1 */
20
- uint64_t z_mask;
18
+#define P_EVEX 0x100000 /* Requires EVEX encoding */
21
+ uint64_t a_mask; /* mask bit is 0 iff value identical to first input */
19
22
+ uint64_t z_mask; /* mask bit is 0 iff value bit is 0 */
20
#define OPC_ARITH_EvIz    (0x81)
23
TCGType type;
21
#define OPC_ARITH_EvIb    (0x83)
24
} OptContext;
22
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
25
23
tcg_out8(s, opc);
26
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
27
return false;
28
}
24
}
29
25
30
+static bool fold_masks(OptContext *ctx, TCGOp *op)
26
+static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
27
+ int rm, int index)
31
+{
28
+{
32
+ uint64_t a_mask = ctx->a_mask;
29
+ /* The entire 4-byte evex prefix; with R' and V' set. */
33
+ uint64_t z_mask = ctx->z_mask;
30
+ uint32_t p = 0x08041062;
31
+ int mm, pp;
34
+
32
+
35
+ /*
33
+ tcg_debug_assert(have_avx512vl);
36
+ * 32-bit ops generate 32-bit results. For the result is zero test
37
+ * below, we can ignore high bits, but for further optimizations we
38
+ * need to record that the high bits contain garbage.
39
+ */
40
+ if (ctx->type == TCG_TYPE_I32) {
41
+ ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
42
+ a_mask &= MAKE_64BIT_MASK(0, 32);
43
+ z_mask &= MAKE_64BIT_MASK(0, 32);
44
+ }
45
+
34
+
46
+ if (z_mask == 0) {
35
+ /* EVEX.mm */
47
+ return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
36
+ if (opc & P_EXT3A) {
48
+ }
37
+ mm = 3;
49
+ if (a_mask == 0) {
38
+ } else if (opc & P_EXT38) {
50
+ return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
39
+ mm = 2;
51
+ }
40
+ } else if (opc & P_EXT) {
52
+ return false;
41
+ mm = 1;
53
+}
42
+ } else {
54
+
55
/*
56
* Convert @op to NOT, if NOT is supported by the host.
57
* Return true f the conversion is successful, which will still
58
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
59
60
static bool fold_and(OptContext *ctx, TCGOp *op)
61
{
62
+ uint64_t z1, z2;
63
+
64
if (fold_const2(ctx, op) ||
65
fold_xi_to_i(ctx, op, 0) ||
66
fold_xi_to_x(ctx, op, -1) ||
67
fold_xx_to_x(ctx, op)) {
68
return true;
69
}
70
- return false;
71
+
72
+ z1 = arg_info(op->args[1])->z_mask;
73
+ z2 = arg_info(op->args[2])->z_mask;
74
+ ctx->z_mask = z1 & z2;
75
+
76
+ /*
77
+ * Known-zeros does not imply known-ones. Therefore unless
78
+ * arg2 is constant, we can't infer affected bits from it.
79
+ */
80
+ if (arg_is_const(op->args[2])) {
81
+ ctx->a_mask = z1 & ~z2;
82
+ }
83
+
84
+ return fold_masks(ctx, op);
85
}
86
87
static bool fold_andc(OptContext *ctx, TCGOp *op)
88
{
89
+ uint64_t z1;
90
+
91
if (fold_const2(ctx, op) ||
92
fold_xx_to_i(ctx, op, 0) ||
93
fold_xi_to_x(ctx, op, 0) ||
94
fold_ix_to_not(ctx, op, -1)) {
95
return true;
96
}
97
- return false;
98
+
99
+ z1 = arg_info(op->args[1])->z_mask;
100
+
101
+ /*
102
+ * Known-zeros does not imply known-ones. Therefore unless
103
+ * arg2 is constant, we can't infer anything from it.
104
+ */
105
+ if (arg_is_const(op->args[2])) {
106
+ uint64_t z2 = ~arg_info(op->args[2])->z_mask;
107
+ ctx->a_mask = z1 & ~z2;
108
+ z1 &= z2;
109
+ }
110
+ ctx->z_mask = z1;
111
+
112
+ return fold_masks(ctx, op);
113
}
114
115
static bool fold_brcond(OptContext *ctx, TCGOp *op)
116
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
117
118
static bool fold_bswap(OptContext *ctx, TCGOp *op)
119
{
120
+ uint64_t z_mask, sign;
121
+
122
if (arg_is_const(op->args[1])) {
123
uint64_t t = arg_info(op->args[1])->val;
124
125
t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
126
return tcg_opt_gen_movi(ctx, op, op->args[0], t);
127
}
128
- return false;
129
+
130
+ z_mask = arg_info(op->args[1])->z_mask;
131
+ switch (op->opc) {
132
+ case INDEX_op_bswap16_i32:
133
+ case INDEX_op_bswap16_i64:
134
+ z_mask = bswap16(z_mask);
135
+ sign = INT16_MIN;
136
+ break;
137
+ case INDEX_op_bswap32_i32:
138
+ case INDEX_op_bswap32_i64:
139
+ z_mask = bswap32(z_mask);
140
+ sign = INT32_MIN;
141
+ break;
142
+ case INDEX_op_bswap64_i64:
143
+ z_mask = bswap64(z_mask);
144
+ sign = INT64_MIN;
145
+ break;
146
+ default:
147
+ g_assert_not_reached();
43
+ g_assert_not_reached();
148
+ }
44
+ }
149
+
45
+
150
+ switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
46
+ /* EVEX.pp */
151
+ case TCG_BSWAP_OZ:
47
+ if (opc & P_DATA16) {
152
+ break;
48
+ pp = 1; /* 0x66 */
153
+ case TCG_BSWAP_OS:
49
+ } else if (opc & P_SIMDF3) {
154
+ /* If the sign bit may be 1, force all the bits above to 1. */
50
+ pp = 2; /* 0xf3 */
155
+ if (z_mask & sign) {
51
+ } else if (opc & P_SIMDF2) {
156
+ z_mask |= sign;
52
+ pp = 3; /* 0xf2 */
157
+ }
53
+ } else {
158
+ break;
54
+ pp = 0;
159
+ default:
160
+ /* The high bits are undefined: force all bits above the sign to 1. */
161
+ z_mask |= sign << 1;
162
+ break;
163
+ }
164
+ ctx->z_mask = z_mask;
165
+
166
+ return fold_masks(ctx, op);
167
}
168
169
static bool fold_call(OptContext *ctx, TCGOp *op)
170
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
171
172
static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
173
{
174
+ uint64_t z_mask;
175
+
176
if (arg_is_const(op->args[1])) {
177
uint64_t t = arg_info(op->args[1])->val;
178
179
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
180
}
181
return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
182
}
183
+
184
+ switch (ctx->type) {
185
+ case TCG_TYPE_I32:
186
+ z_mask = 31;
187
+ break;
188
+ case TCG_TYPE_I64:
189
+ z_mask = 63;
190
+ break;
191
+ default:
192
+ g_assert_not_reached();
193
+ }
194
+ ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
195
+
196
return false;
197
}
198
199
static bool fold_ctpop(OptContext *ctx, TCGOp *op)
200
{
201
- return fold_const1(ctx, op);
202
+ if (fold_const1(ctx, op)) {
203
+ return true;
204
+ }
55
+ }
205
+
56
+
206
+ switch (ctx->type) {
57
+ p = deposit32(p, 8, 2, mm);
207
+ case TCG_TYPE_I32:
58
+ p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */
208
+ ctx->z_mask = 32 | 31;
59
+ p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */
209
+ break;
60
+ p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */
210
+ case TCG_TYPE_I64:
61
+ p = deposit32(p, 16, 2, pp);
211
+ ctx->z_mask = 64 | 63;
62
+ p = deposit32(p, 19, 4, ~v);
212
+ break;
63
+ p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
213
+ default:
64
+ p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
214
+ g_assert_not_reached();
215
+ }
216
+ return false;
217
}
218
219
static bool fold_deposit(OptContext *ctx, TCGOp *op)
220
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
221
t1 = deposit64(t1, op->args[3], op->args[4], t2);
222
return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
223
}
224
+
65
+
225
+ ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
66
+ tcg_out32(s, p);
226
+ op->args[3], op->args[4],
67
+ tcg_out8(s, opc);
227
+ arg_info(op->args[2])->z_mask);
228
return false;
229
}
230
231
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
232
233
static bool fold_extract(OptContext *ctx, TCGOp *op)
234
{
235
+ uint64_t z_mask_old, z_mask;
236
+
237
if (arg_is_const(op->args[1])) {
238
uint64_t t;
239
240
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
241
t = extract64(t, op->args[2], op->args[3]);
242
return tcg_opt_gen_movi(ctx, op, op->args[0], t);
243
}
244
- return false;
245
+
246
+ z_mask_old = arg_info(op->args[1])->z_mask;
247
+ z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
248
+ if (op->args[2] == 0) {
249
+ ctx->a_mask = z_mask_old ^ z_mask;
250
+ }
251
+ ctx->z_mask = z_mask;
252
+
253
+ return fold_masks(ctx, op);
254
}
255
256
static bool fold_extract2(OptContext *ctx, TCGOp *op)
257
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
258
259
static bool fold_exts(OptContext *ctx, TCGOp *op)
260
{
261
- return fold_const1(ctx, op);
262
+ uint64_t z_mask_old, z_mask, sign;
263
+ bool type_change = false;
264
+
265
+ if (fold_const1(ctx, op)) {
266
+ return true;
267
+ }
268
+
269
+ z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
270
+
271
+ switch (op->opc) {
272
+ CASE_OP_32_64(ext8s):
273
+ sign = INT8_MIN;
274
+ z_mask = (uint8_t)z_mask;
275
+ break;
276
+ CASE_OP_32_64(ext16s):
277
+ sign = INT16_MIN;
278
+ z_mask = (uint16_t)z_mask;
279
+ break;
280
+ case INDEX_op_ext_i32_i64:
281
+ type_change = true;
282
+ QEMU_FALLTHROUGH;
283
+ case INDEX_op_ext32s_i64:
284
+ sign = INT32_MIN;
285
+ z_mask = (uint32_t)z_mask;
286
+ break;
287
+ default:
288
+ g_assert_not_reached();
289
+ }
290
+
291
+ if (z_mask & sign) {
292
+ z_mask |= sign;
293
+ } else if (!type_change) {
294
+ ctx->a_mask = z_mask_old ^ z_mask;
295
+ }
296
+ ctx->z_mask = z_mask;
297
+
298
+ return fold_masks(ctx, op);
299
}
300
301
static bool fold_extu(OptContext *ctx, TCGOp *op)
302
{
303
- return fold_const1(ctx, op);
304
+ uint64_t z_mask_old, z_mask;
305
+ bool type_change = false;
306
+
307
+ if (fold_const1(ctx, op)) {
308
+ return true;
309
+ }
310
+
311
+ z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
312
+
313
+ switch (op->opc) {
314
+ CASE_OP_32_64(ext8u):
315
+ z_mask = (uint8_t)z_mask;
316
+ break;
317
+ CASE_OP_32_64(ext16u):
318
+ z_mask = (uint16_t)z_mask;
319
+ break;
320
+ case INDEX_op_extrl_i64_i32:
321
+ case INDEX_op_extu_i32_i64:
322
+ type_change = true;
323
+ QEMU_FALLTHROUGH;
324
+ case INDEX_op_ext32u_i64:
325
+ z_mask = (uint32_t)z_mask;
326
+ break;
327
+ case INDEX_op_extrh_i64_i32:
328
+ type_change = true;
329
+ z_mask >>= 32;
330
+ break;
331
+ default:
332
+ g_assert_not_reached();
333
+ }
334
+
335
+ ctx->z_mask = z_mask;
336
+ if (!type_change) {
337
+ ctx->a_mask = z_mask_old ^ z_mask;
338
+ }
339
+ return fold_masks(ctx, op);
340
}
341
342
static bool fold_mb(OptContext *ctx, TCGOp *op)
343
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
344
return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
345
}
346
347
+ ctx->z_mask = arg_info(op->args[3])->z_mask
348
+ | arg_info(op->args[4])->z_mask;
349
+
350
if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
351
uint64_t tv = arg_info(op->args[3])->val;
352
uint64_t fv = arg_info(op->args[4])->val;
353
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
354
355
static bool fold_neg(OptContext *ctx, TCGOp *op)
356
{
357
+ uint64_t z_mask;
358
+
359
if (fold_const1(ctx, op)) {
360
return true;
361
}
362
+
363
+ /* Set to 1 all bits to the left of the rightmost. */
364
+ z_mask = arg_info(op->args[1])->z_mask;
365
+ ctx->z_mask = -(z_mask & -z_mask);
366
+
367
/*
368
* Because of fold_sub_to_neg, we want to always return true,
369
* via finish_folding.
370
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
371
fold_xx_to_x(ctx, op)) {
372
return true;
373
}
374
- return false;
375
+
376
+ ctx->z_mask = arg_info(op->args[1])->z_mask
377
+ | arg_info(op->args[2])->z_mask;
378
+ return fold_masks(ctx, op);
379
}
380
381
static bool fold_orc(OptContext *ctx, TCGOp *op)
382
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
383
384
static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
385
{
386
+ const TCGOpDef *def = &tcg_op_defs[op->opc];
387
+ MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
388
+ MemOp mop = get_memop(oi);
389
+ int width = 8 * memop_size(mop);
390
+
391
+ if (!(mop & MO_SIGN) && width < 64) {
392
+ ctx->z_mask = MAKE_64BIT_MASK(0, width);
393
+ }
394
+
395
/* Opcodes that touch guest memory stop the mb optimization. */
396
ctx->prev_mb = NULL;
397
return false;
398
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
399
if (i >= 0) {
400
return tcg_opt_gen_movi(ctx, op, op->args[0], i);
401
}
402
+
403
+ ctx->z_mask = 1;
404
return false;
405
}
406
407
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
408
op->opc = INDEX_op_setcond_i32;
409
break;
410
}
411
+
412
+ ctx->z_mask = 1;
413
return false;
414
415
do_setcond_const:
416
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
417
418
static bool fold_sextract(OptContext *ctx, TCGOp *op)
419
{
420
+ int64_t z_mask_old, z_mask;
421
+
422
if (arg_is_const(op->args[1])) {
423
uint64_t t;
424
425
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
426
t = sextract64(t, op->args[2], op->args[3]);
427
return tcg_opt_gen_movi(ctx, op, op->args[0], t);
428
}
429
- return false;
430
+
431
+ z_mask_old = arg_info(op->args[1])->z_mask;
432
+ z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
433
+ if (op->args[2] == 0 && z_mask >= 0) {
434
+ ctx->a_mask = z_mask_old ^ z_mask;
435
+ }
436
+ ctx->z_mask = z_mask;
437
+
438
+ return fold_masks(ctx, op);
439
}
440
441
static bool fold_shift(OptContext *ctx, TCGOp *op)
442
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
443
fold_xi_to_x(ctx, op, 0)) {
444
return true;
445
}
446
+
447
+ if (arg_is_const(op->args[2])) {
448
+ ctx->z_mask = do_constant_folding(op->opc, ctx->type,
449
+ arg_info(op->args[1])->z_mask,
450
+ arg_info(op->args[2])->val);
451
+ return fold_masks(ctx, op);
452
+ }
453
return false;
454
}
455
456
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
457
return fold_addsub2_i32(ctx, op, false);
458
}
459
460
+static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
461
+{
462
+ /* We can't do any folding with a load, but we can record bits. */
463
+ switch (op->opc) {
464
+ CASE_OP_32_64(ld8u):
465
+ ctx->z_mask = MAKE_64BIT_MASK(0, 8);
466
+ break;
467
+ CASE_OP_32_64(ld16u):
468
+ ctx->z_mask = MAKE_64BIT_MASK(0, 16);
469
+ break;
470
+ case INDEX_op_ld32u_i64:
471
+ ctx->z_mask = MAKE_64BIT_MASK(0, 32);
472
+ break;
473
+ default:
474
+ g_assert_not_reached();
475
+ }
476
+ return false;
477
+}
68
+}
478
+
69
+
479
static bool fold_xor(OptContext *ctx, TCGOp *op)
70
static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
480
{
71
{
481
if (fold_const2(ctx, op) ||
72
- tcg_out_vex_opc(s, opc, r, v, rm, 0);
482
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
73
+ if (opc & P_EVEX) {
483
fold_xi_to_not(ctx, op, -1)) {
74
+ tcg_out_evex_opc(s, opc, r, v, rm, 0);
484
return true;
75
+ } else {
485
}
76
+ tcg_out_vex_opc(s, opc, r, v, rm, 0);
486
- return false;
77
+ }
487
+
78
tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
488
+ ctx->z_mask = arg_info(op->args[1])->z_mask
489
+ | arg_info(op->args[2])->z_mask;
490
+ return fold_masks(ctx, op);
491
}
79
}
492
80
493
/* Propagate constants and copies, fold constant expressions. */
494
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
495
}
496
497
QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
498
- uint64_t z_mask, partmask, affected, tmp;
499
TCGOpcode opc = op->opc;
500
const TCGOpDef *def;
501
bool done = false;
502
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
503
break;
504
}
505
506
- /* Simplify using known-zero bits. Currently only ops with a single
507
- output argument is supported. */
508
- z_mask = -1;
509
- affected = -1;
510
- switch (opc) {
511
- CASE_OP_32_64(ext8s):
512
- if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
513
- break;
514
- }
515
- QEMU_FALLTHROUGH;
516
- CASE_OP_32_64(ext8u):
517
- z_mask = 0xff;
518
- goto and_const;
519
- CASE_OP_32_64(ext16s):
520
- if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
521
- break;
522
- }
523
- QEMU_FALLTHROUGH;
524
- CASE_OP_32_64(ext16u):
525
- z_mask = 0xffff;
526
- goto and_const;
527
- case INDEX_op_ext32s_i64:
528
- if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
529
- break;
530
- }
531
- QEMU_FALLTHROUGH;
532
- case INDEX_op_ext32u_i64:
533
- z_mask = 0xffffffffU;
534
- goto and_const;
535
-
536
- CASE_OP_32_64(and):
537
- z_mask = arg_info(op->args[2])->z_mask;
538
- if (arg_is_const(op->args[2])) {
539
- and_const:
540
- affected = arg_info(op->args[1])->z_mask & ~z_mask;
541
- }
542
- z_mask = arg_info(op->args[1])->z_mask & z_mask;
543
- break;
544
-
545
- case INDEX_op_ext_i32_i64:
546
- if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
547
- break;
548
- }
549
- QEMU_FALLTHROUGH;
550
- case INDEX_op_extu_i32_i64:
551
- /* We do not compute affected as it is a size changing op. */
552
- z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
553
- break;
554
-
555
- CASE_OP_32_64(andc):
556
- /* Known-zeros does not imply known-ones. Therefore unless
557
- op->args[2] is constant, we can't infer anything from it. */
558
- if (arg_is_const(op->args[2])) {
559
- z_mask = ~arg_info(op->args[2])->z_mask;
560
- goto and_const;
561
- }
562
- /* But we certainly know nothing outside args[1] may be set. */
563
- z_mask = arg_info(op->args[1])->z_mask;
564
- break;
565
-
566
- case INDEX_op_sar_i32:
567
- if (arg_is_const(op->args[2])) {
568
- tmp = arg_info(op->args[2])->val & 31;
569
- z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
570
- }
571
- break;
572
- case INDEX_op_sar_i64:
573
- if (arg_is_const(op->args[2])) {
574
- tmp = arg_info(op->args[2])->val & 63;
575
- z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
576
- }
577
- break;
578
-
579
- case INDEX_op_shr_i32:
580
- if (arg_is_const(op->args[2])) {
581
- tmp = arg_info(op->args[2])->val & 31;
582
- z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
583
- }
584
- break;
585
- case INDEX_op_shr_i64:
586
- if (arg_is_const(op->args[2])) {
587
- tmp = arg_info(op->args[2])->val & 63;
588
- z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
589
- }
590
- break;
591
-
592
- case INDEX_op_extrl_i64_i32:
593
- z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
594
- break;
595
- case INDEX_op_extrh_i64_i32:
596
- z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
597
- break;
598
-
599
- CASE_OP_32_64(shl):
600
- if (arg_is_const(op->args[2])) {
601
- tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
602
- z_mask = arg_info(op->args[1])->z_mask << tmp;
603
- }
604
- break;
605
-
606
- CASE_OP_32_64(neg):
607
- /* Set to 1 all bits to the left of the rightmost. */
608
- z_mask = -(arg_info(op->args[1])->z_mask
609
- & -arg_info(op->args[1])->z_mask);
610
- break;
611
-
612
- CASE_OP_32_64(deposit):
613
- z_mask = deposit64(arg_info(op->args[1])->z_mask,
614
- op->args[3], op->args[4],
615
- arg_info(op->args[2])->z_mask);
616
- break;
617
-
618
- CASE_OP_32_64(extract):
619
- z_mask = extract64(arg_info(op->args[1])->z_mask,
620
- op->args[2], op->args[3]);
621
- if (op->args[2] == 0) {
622
- affected = arg_info(op->args[1])->z_mask & ~z_mask;
623
- }
624
- break;
625
- CASE_OP_32_64(sextract):
626
- z_mask = sextract64(arg_info(op->args[1])->z_mask,
627
- op->args[2], op->args[3]);
628
- if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
629
- affected = arg_info(op->args[1])->z_mask & ~z_mask;
630
- }
631
- break;
632
-
633
- CASE_OP_32_64(or):
634
- CASE_OP_32_64(xor):
635
- z_mask = arg_info(op->args[1])->z_mask
636
- | arg_info(op->args[2])->z_mask;
637
- break;
638
-
639
- case INDEX_op_clz_i32:
640
- case INDEX_op_ctz_i32:
641
- z_mask = arg_info(op->args[2])->z_mask | 31;
642
- break;
643
-
644
- case INDEX_op_clz_i64:
645
- case INDEX_op_ctz_i64:
646
- z_mask = arg_info(op->args[2])->z_mask | 63;
647
- break;
648
-
649
- case INDEX_op_ctpop_i32:
650
- z_mask = 32 | 31;
651
- break;
652
- case INDEX_op_ctpop_i64:
653
- z_mask = 64 | 63;
654
- break;
655
-
656
- CASE_OP_32_64(setcond):
657
- case INDEX_op_setcond2_i32:
658
- z_mask = 1;
659
- break;
660
-
661
- CASE_OP_32_64(movcond):
662
- z_mask = arg_info(op->args[3])->z_mask
663
- | arg_info(op->args[4])->z_mask;
664
- break;
665
-
666
- CASE_OP_32_64(ld8u):
667
- z_mask = 0xff;
668
- break;
669
- CASE_OP_32_64(ld16u):
670
- z_mask = 0xffff;
671
- break;
672
- case INDEX_op_ld32u_i64:
673
- z_mask = 0xffffffffu;
674
- break;
675
-
676
- CASE_OP_32_64(qemu_ld):
677
- {
678
- MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
679
- MemOp mop = get_memop(oi);
680
- if (!(mop & MO_SIGN)) {
681
- z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
682
- }
683
- }
684
- break;
685
-
686
- CASE_OP_32_64(bswap16):
687
- z_mask = arg_info(op->args[1])->z_mask;
688
- if (z_mask <= 0xffff) {
689
- op->args[2] |= TCG_BSWAP_IZ;
690
- }
691
- z_mask = bswap16(z_mask);
692
- switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
693
- case TCG_BSWAP_OZ:
694
- break;
695
- case TCG_BSWAP_OS:
696
- z_mask = (int16_t)z_mask;
697
- break;
698
- default: /* undefined high bits */
699
- z_mask |= MAKE_64BIT_MASK(16, 48);
700
- break;
701
- }
702
- break;
703
-
704
- case INDEX_op_bswap32_i64:
705
- z_mask = arg_info(op->args[1])->z_mask;
706
- if (z_mask <= 0xffffffffu) {
707
- op->args[2] |= TCG_BSWAP_IZ;
708
- }
709
- z_mask = bswap32(z_mask);
710
- switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
711
- case TCG_BSWAP_OZ:
712
- break;
713
- case TCG_BSWAP_OS:
714
- z_mask = (int32_t)z_mask;
715
- break;
716
- default: /* undefined high bits */
717
- z_mask |= MAKE_64BIT_MASK(32, 32);
718
- break;
719
- }
720
- break;
721
-
722
- default:
723
- break;
724
- }
725
-
726
- /* 32-bit ops generate 32-bit results. For the result is zero test
727
- below, we can ignore high bits, but for further optimizations we
728
- need to record that the high bits contain garbage. */
729
- partmask = z_mask;
730
- if (ctx.type == TCG_TYPE_I32) {
731
- z_mask |= ~(tcg_target_ulong)0xffffffffu;
732
- partmask &= 0xffffffffu;
733
- affected &= 0xffffffffu;
734
- }
735
- ctx.z_mask = z_mask;
736
-
737
- if (partmask == 0) {
738
- tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
739
- continue;
740
- }
741
- if (affected == 0) {
742
- tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
743
- continue;
744
- }
745
+ /* Assume all bits affected, and no bits known zero. */
746
+ ctx.a_mask = -1;
747
+ ctx.z_mask = -1;
748
749
/*
750
* Process each opcode.
751
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
752
case INDEX_op_extrh_i64_i32:
753
done = fold_extu(&ctx, op);
754
break;
755
+ CASE_OP_32_64(ld8u):
756
+ CASE_OP_32_64(ld16u):
757
+ case INDEX_op_ld32u_i64:
758
+ done = fold_tcg_ld(&ctx, op);
759
+ break;
760
case INDEX_op_mb:
761
done = fold_mb(&ctx, op);
762
break;
763
--
2.25.1

1
Certain targets, like riscv, produce signed 32-bit results.
This can lead to lots of redundant extensions as values are
manipulated.

Begin by tracking only the obvious sign-extensions, and
converting them to simple copies when possible.

The condition for UMIN/UMAX availability is about to change;
use the canonical version.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
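The two mask derivations can be tried in isolation; the sketch below mirrors the smask_from_value()/smask_from_zmask() helpers added in the diff, substituting GCC builtins for QEMU's clrsb64()/clz64() wrappers, and only prints a couple of sample masks.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Left-aligned mask of bits known to repeat the sign bit. */
static uint64_t smask_from_value(uint64_t value)
{
    int rep = __builtin_clrsbll((long long)value);  /* redundant sign bits */
    return ~(~0ull >> rep);
}

/* If the top bits are known zero, all but one of them repeat the sign. */
static uint64_t smask_from_zmask(uint64_t zmask)
{
    int rep = zmask ? __builtin_clzll(zmask) : 64;
    return rep ? ~(~0ull >> (rep - 1)) : 0;
}

int main(void)
{
    /* A 32-bit sign-extended constant: 32 redundant copies of the sign. */
    printf("%016" PRIx64 "\n", smask_from_value(0xffffffff80000000ull));
    /* A value known to fit in 16 bits: 47 redundant (zero) sign bits. */
    printf("%016" PRIx64 "\n", smask_from_zmask(0xffff));
    return 0;
}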
7
8
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
9
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
10
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
---
7
---
12
tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
8
tcg/i386/tcg-target.c.inc | 8 ++++----
13
1 file changed, 102 insertions(+), 21 deletions(-)
9
1 file changed, 4 insertions(+), 4 deletions(-)
14
10
15
diff --git a/tcg/optimize.c b/tcg/optimize.c
11
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
16
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
17
--- a/tcg/optimize.c
13
--- a/tcg/i386/tcg-target.c.inc
18
+++ b/tcg/optimize.c
14
+++ b/tcg/i386/tcg-target.c.inc
19
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
15
@@ -XXX,XX +XXX,XX @@ static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
20
TCGTemp *next_copy;
16
fixup = NEED_SWAP | NEED_INV;
21
uint64_t val;
17
break;
22
uint64_t z_mask; /* mask bit is 0 if and only if value bit is 0 */
18
case TCG_COND_LEU:
23
+ uint64_t s_mask; /* a left-aligned mask of clrsb(value) bits. */
19
- if (vece <= MO_32) {
24
} TempOptInfo;
20
+ if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
25
21
fixup = NEED_UMIN;
26
typedef struct OptContext {
22
} else {
27
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
23
fixup = NEED_BIAS | NEED_INV;
28
/* In flight values from optimization. */
29
uint64_t a_mask; /* mask bit is 0 iff value identical to first input */
30
uint64_t z_mask; /* mask bit is 0 iff value bit is 0 */
31
+ uint64_t s_mask; /* mask of clrsb(value) bits */
32
TCGType type;
33
} OptContext;
34
35
+/* Calculate the smask for a specific value. */
36
+static uint64_t smask_from_value(uint64_t value)
37
+{
38
+ int rep = clrsb64(value);
39
+ return ~(~0ull >> rep);
40
+}
41
+
42
+/*
43
+ * Calculate the smask for a given set of known-zeros.
44
+ * If there are lots of zeros on the left, we can consider the remainder
45
+ * an unsigned field, and thus the corresponding signed field is one bit
46
+ * larger.
47
+ */
48
+static uint64_t smask_from_zmask(uint64_t zmask)
49
+{
50
+ /*
51
+ * Only the 0 bits are significant for zmask, thus the msb itself
52
+ * must be zero, else we have no sign information.
53
+ */
54
+ int rep = clz64(zmask);
55
+ if (rep == 0) {
56
+ return 0;
57
+ }
58
+ rep -= 1;
59
+ return ~(~0ull >> rep);
60
+}
61
+
62
static inline TempOptInfo *ts_info(TCGTemp *ts)
63
{
64
return ts->state_ptr;
65
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
66
ti->prev_copy = ts;
67
ti->is_const = false;
68
ti->z_mask = -1;
69
+ ti->s_mask = 0;
70
}
71
72
static void reset_temp(TCGArg arg)
73
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
74
ti->is_const = true;
75
ti->val = ts->val;
76
ti->z_mask = ts->val;
77
+ ti->s_mask = smask_from_value(ts->val);
78
} else {
79
ti->is_const = false;
80
ti->z_mask = -1;
81
+ ti->s_mask = 0;
82
}
83
}
84
85
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
86
op->args[1] = src;
87
88
di->z_mask = si->z_mask;
89
+ di->s_mask = si->s_mask;
90
91
if (src_ts->type == dst_ts->type) {
92
TempOptInfo *ni = ts_info(si->next_copy);
93
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
94
95
nb_oargs = def->nb_oargs;
96
for (i = 0; i < nb_oargs; i++) {
97
- reset_temp(op->args[i]);
98
+ TCGTemp *ts = arg_temp(op->args[i]);
99
+ reset_ts(ts);
100
/*
101
- * Save the corresponding known-zero bits mask for the
102
+ * Save the corresponding known-zero/sign bits mask for the
103
* first output argument (only one supported so far).
104
*/
105
if (i == 0) {
106
- arg_info(op->args[i])->z_mask = ctx->z_mask;
107
+ ts_info(ts)->z_mask = ctx->z_mask;
108
+ ts_info(ts)->s_mask = ctx->s_mask;
109
}
110
}
111
}
112
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
113
{
114
uint64_t a_mask = ctx->a_mask;
115
uint64_t z_mask = ctx->z_mask;
116
+ uint64_t s_mask = ctx->s_mask;
117
118
/*
119
* 32-bit ops generate 32-bit results, which for the purpose of
120
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
121
if (ctx->type == TCG_TYPE_I32) {
122
a_mask = (int32_t)a_mask;
123
z_mask = (int32_t)z_mask;
124
+ s_mask |= MAKE_64BIT_MASK(32, 32);
125
ctx->z_mask = z_mask;
126
+ ctx->s_mask = s_mask;
127
}
128
129
if (z_mask == 0) {
130
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
131
132
static bool fold_bswap(OptContext *ctx, TCGOp *op)
133
{
134
- uint64_t z_mask, sign;
135
+ uint64_t z_mask, s_mask, sign;
136
137
if (arg_is_const(op->args[1])) {
138
uint64_t t = arg_info(op->args[1])->val;
139
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
140
}
141
142
z_mask = arg_info(op->args[1])->z_mask;
143
+
144
switch (op->opc) {
145
case INDEX_op_bswap16_i32:
146
case INDEX_op_bswap16_i64:
147
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
148
default:
149
g_assert_not_reached();
150
}
151
+ s_mask = smask_from_zmask(z_mask);
152
153
switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
154
case TCG_BSWAP_OZ:
155
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
156
/* If the sign bit may be 1, force all the bits above to 1. */
157
if (z_mask & sign) {
158
z_mask |= sign;
159
+ s_mask = sign << 1;
160
}
24
}
161
break;
25
break;
162
default:
26
case TCG_COND_GTU:
163
/* The high bits are undefined: force all bits above the sign to 1. */
27
- if (vece <= MO_32) {
164
z_mask |= sign << 1;
28
+ if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
165
+ s_mask = 0;
29
fixup = NEED_UMIN | NEED_INV;
30
} else {
31
fixup = NEED_BIAS;
32
}
166
break;
33
break;
167
}
34
case TCG_COND_GEU:
168
ctx->z_mask = z_mask;
35
- if (vece <= MO_32) {
169
+ ctx->s_mask = s_mask;
36
+ if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
170
37
fixup = NEED_UMAX;
171
return fold_masks(ctx, op);
38
} else {
172
}
39
fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
173
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
40
}
174
static bool fold_extract(OptContext *ctx, TCGOp *op)
175
{
176
uint64_t z_mask_old, z_mask;
177
+ int pos = op->args[2];
178
+ int len = op->args[3];
179
180
if (arg_is_const(op->args[1])) {
181
uint64_t t;
182
183
t = arg_info(op->args[1])->val;
184
- t = extract64(t, op->args[2], op->args[3]);
185
+ t = extract64(t, pos, len);
186
return tcg_opt_gen_movi(ctx, op, op->args[0], t);
187
}
188
189
z_mask_old = arg_info(op->args[1])->z_mask;
190
- z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
191
- if (op->args[2] == 0) {
192
+ z_mask = extract64(z_mask_old, pos, len);
193
+ if (pos == 0) {
194
ctx->a_mask = z_mask_old ^ z_mask;
195
}
196
ctx->z_mask = z_mask;
197
+ ctx->s_mask = smask_from_zmask(z_mask);
198
199
return fold_masks(ctx, op);
200
}
201
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
202
203
static bool fold_exts(OptContext *ctx, TCGOp *op)
204
{
205
- uint64_t z_mask_old, z_mask, sign;
206
+ uint64_t s_mask_old, s_mask, z_mask, sign;
207
bool type_change = false;
208
209
if (fold_const1(ctx, op)) {
210
return true;
211
}
212
213
- z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
214
+ z_mask = arg_info(op->args[1])->z_mask;
215
+ s_mask = arg_info(op->args[1])->s_mask;
216
+ s_mask_old = s_mask;
217
218
switch (op->opc) {
219
CASE_OP_32_64(ext8s):
220
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
221
222
if (z_mask & sign) {
223
z_mask |= sign;
224
- } else if (!type_change) {
225
- ctx->a_mask = z_mask_old ^ z_mask;
226
}
227
+ s_mask |= sign << 1;
228
+
229
ctx->z_mask = z_mask;
230
+ ctx->s_mask = s_mask;
231
+ if (!type_change) {
232
+ ctx->a_mask = s_mask & ~s_mask_old;
233
+ }
234
235
return fold_masks(ctx, op);
236
}
237
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
238
}
239
240
ctx->z_mask = z_mask;
241
+ ctx->s_mask = smask_from_zmask(z_mask);
242
if (!type_change) {
243
ctx->a_mask = z_mask_old ^ z_mask;
244
}
245
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
246
MemOp mop = get_memop(oi);
247
int width = 8 * memop_size(mop);
248
249
- if (!(mop & MO_SIGN) && width < 64) {
250
- ctx->z_mask = MAKE_64BIT_MASK(0, width);
251
+ if (width < 64) {
252
+ ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
253
+ if (!(mop & MO_SIGN)) {
254
+ ctx->z_mask = MAKE_64BIT_MASK(0, width);
255
+ ctx->s_mask <<= 1;
256
+ }
257
}
258
259
/* Opcodes that touch guest memory stop the mb optimization. */
260
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
261
262
static bool fold_sextract(OptContext *ctx, TCGOp *op)
263
{
264
- int64_t z_mask_old, z_mask;
265
+ uint64_t z_mask, s_mask, s_mask_old;
266
+ int pos = op->args[2];
267
+ int len = op->args[3];
268
269
if (arg_is_const(op->args[1])) {
270
uint64_t t;
271
272
t = arg_info(op->args[1])->val;
273
- t = sextract64(t, op->args[2], op->args[3]);
274
+ t = sextract64(t, pos, len);
275
return tcg_opt_gen_movi(ctx, op, op->args[0], t);
276
}
277
278
- z_mask_old = arg_info(op->args[1])->z_mask;
279
- z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
280
- if (op->args[2] == 0 && z_mask >= 0) {
281
- ctx->a_mask = z_mask_old ^ z_mask;
282
- }
283
+ z_mask = arg_info(op->args[1])->z_mask;
284
+ z_mask = sextract64(z_mask, pos, len);
285
ctx->z_mask = z_mask;
286
287
+ s_mask_old = arg_info(op->args[1])->s_mask;
288
+ s_mask = sextract64(s_mask_old, pos, len);
289
+ s_mask |= MAKE_64BIT_MASK(len, 64 - len);
290
+ ctx->s_mask = s_mask;
291
+
292
+ if (pos == 0) {
293
+ ctx->a_mask = s_mask & ~s_mask_old;
294
+ }
295
+
296
return fold_masks(ctx, op);
297
}
298
299
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
300
{
301
/* We can't do any folding with a load, but we can record bits. */
302
switch (op->opc) {
303
+ CASE_OP_32_64(ld8s):
304
+ ctx->s_mask = MAKE_64BIT_MASK(8, 56);
305
+ break;
306
CASE_OP_32_64(ld8u):
307
ctx->z_mask = MAKE_64BIT_MASK(0, 8);
308
+ ctx->s_mask = MAKE_64BIT_MASK(9, 55);
309
+ break;
310
+ CASE_OP_32_64(ld16s):
311
+ ctx->s_mask = MAKE_64BIT_MASK(16, 48);
312
break;
41
break;
313
CASE_OP_32_64(ld16u):
42
case TCG_COND_LTU:
314
ctx->z_mask = MAKE_64BIT_MASK(0, 16);
43
- if (vece <= MO_32) {
315
+ ctx->s_mask = MAKE_64BIT_MASK(17, 47);
44
+ if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
316
+ break;
45
fixup = NEED_UMAX | NEED_INV;
317
+ case INDEX_op_ld32s_i64:
46
} else {
318
+ ctx->s_mask = MAKE_64BIT_MASK(32, 32);
47
fixup = NEED_BIAS | NEED_SWAP;
319
break;
320
case INDEX_op_ld32u_i64:
321
ctx->z_mask = MAKE_64BIT_MASK(0, 32);
322
+ ctx->s_mask = MAKE_64BIT_MASK(33, 31);
323
break;
324
default:
325
g_assert_not_reached();
326
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
327
ctx.type = TCG_TYPE_I32;
328
}
329
330
- /* Assume all bits affected, and no bits known zero. */
331
+ /* Assume all bits affected, no bits known zero, no sign reps. */
332
ctx.a_mask = -1;
333
ctx.z_mask = -1;
334
+ ctx.s_mask = 0;
335
336
/*
337
* Process each opcode.
338
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
339
case INDEX_op_extrh_i64_i32:
340
done = fold_extu(&ctx, op);
341
break;
342
+ CASE_OP_32_64(ld8s):
343
CASE_OP_32_64(ld8u):
344
+ CASE_OP_32_64(ld16s):
345
CASE_OP_32_64(ld16u):
346
+ case INDEX_op_ld32s_i64:
347
case INDEX_op_ld32u_i64:
348
done = fold_tcg_ld(&ctx, op);
349
break;
350
--
2.25.1

1
For constant shifts, we can simply shift the s_mask.

For variable shifts, we know that sar does not reduce
the s_mask, which helps for sequences like

    ext32s_i64 t, in
    sar_i64 t, t, v
    ext32s_i64 out, t

allowing the final extend to be eliminated.

AVX512VL has VPSRAVQ, and
AVX512BW has VPSLLVW, VPSRAVW, VPSRLVW.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
11
12
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
13
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
14
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
15
---
7
---
16
tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
8
tcg/i386/tcg-target.c.inc | 32 ++++++++++++++++++++++++--------
17
1 file changed, 47 insertions(+), 3 deletions(-)
9
1 file changed, 24 insertions(+), 8 deletions(-)
18
10
19
diff --git a/tcg/optimize.c b/tcg/optimize.c
11
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
20
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
21
--- a/tcg/optimize.c
13
--- a/tcg/i386/tcg-target.c.inc
22
+++ b/tcg/optimize.c
14
+++ b/tcg/i386/tcg-target.c.inc
23
@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
15
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
24
return ~(~0ull >> rep);
16
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
25
}
17
#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
26
18
#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
27
+/*
19
+#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
28
+ * Recreate a properly left-aligned smask after manipulation.
20
#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16)
29
+ * Some bit-shuffling, particularly shifts and rotates, may
21
#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
30
+ * retain sign bits on the left, but may scatter disconnected
22
+#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
31
+ * sign bits on the right. Retain only what remains to the left.
23
#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16)
32
+ */
24
+#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
33
+static uint64_t smask_from_smask(int64_t smask)
25
+#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
34
+{
26
#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16)
35
+ /* Only the 1 bits are significant for smask */
27
#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
36
+ return smask_from_zmask(~smask);
28
#define OPC_VZEROUPPER (0x77 | P_EXT)
37
+}
29
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
38
+
30
OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
39
static inline TempOptInfo *ts_info(TCGTemp *ts)
31
};
40
{
32
static int const shlv_insn[4] = {
41
return ts->state_ptr;
33
- /* TODO: AVX512 adds support for MO_16. */
42
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
34
- OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
43
35
+ OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
44
static bool fold_shift(OptContext *ctx, TCGOp *op)
36
};
45
{
37
static int const shrv_insn[4] = {
46
+ uint64_t s_mask, z_mask, sign;
38
- /* TODO: AVX512 adds support for MO_16. */
47
+
39
- OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
48
if (fold_const2(ctx, op) ||
40
+ OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
49
fold_ix_to_i(ctx, op, 0) ||
41
};
50
fold_xi_to_x(ctx, op, 0)) {
42
static int const sarv_insn[4] = {
51
return true;
43
- /* TODO: AVX512 adds support for MO_16, MO_64. */
52
}
44
- OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
53
45
+ OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
54
+ s_mask = arg_info(op->args[1])->s_mask;
46
};
55
+ z_mask = arg_info(op->args[1])->z_mask;
47
static int const shls_insn[4] = {
56
+
48
OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
57
if (arg_is_const(op->args[2])) {
49
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
58
- ctx->z_mask = do_constant_folding(op->opc, ctx->type,
50
59
- arg_info(op->args[1])->z_mask,
51
case INDEX_op_shlv_vec:
60
- arg_info(op->args[2])->val);
52
case INDEX_op_shrv_vec:
61
+ int sh = arg_info(op->args[2])->val;
53
- return have_avx2 && vece >= MO_32;
62
+
54
+ switch (vece) {
63
+ ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
55
+ case MO_16:
64
+
56
+ return have_avx512bw;
65
+ s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
57
+ case MO_32:
66
+ ctx->s_mask = smask_from_smask(s_mask);
58
+ case MO_64:
67
+
59
+ return have_avx2;
68
return fold_masks(ctx, op);
69
}
70
+
71
+ switch (op->opc) {
72
+ CASE_OP_32_64(sar):
73
+ /*
74
+ * Arithmetic right shift will not reduce the number of
75
+ * input sign repetitions.
76
+ */
77
+ ctx->s_mask = s_mask;
78
+ break;
79
+ CASE_OP_32_64(shr):
80
+ /*
81
+ * If the sign bit is known zero, then logical right shift
82
+ * will not reduce the number of input sign repetitions.
83
+ */
84
+ sign = (s_mask & -s_mask) >> 1;
85
+ if (!(z_mask & sign)) {
86
+ ctx->s_mask = s_mask;
87
+ }
60
+ }
88
+ break;
61
+ return 0;
89
+ default:
62
case INDEX_op_sarv_vec:
90
+ break;
63
- return have_avx2 && vece == MO_32;
91
+ }
64
+ switch (vece) {
92
+
65
+ case MO_16:
93
return false;
66
+ return have_avx512bw;
94
}
67
+ case MO_32:
95
68
+ return have_avx2;
69
+ case MO_64:
70
+ return have_avx512vl;
71
+ }
72
+ return 0;
73
case INDEX_op_rotlv_vec:
74
case INDEX_op_rotrv_vec:
75
return have_avx2 && vece >= MO_32 ? -1 : 0;
96
--
76
--
97
2.25.1
77
2.25.1
98
78
99
79
diff view generated by jsdifflib
1
The results are generally 6 bit unsigned values, though
1
AVX512VL has VPSRAQ.
2
the count-leading and count-trailing-zeros operations may produce any value
3
for a zero input.
4
2
3
Tested-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
6
---
9
tcg/optimize.c | 3 ++-
7
tcg/i386/tcg-target.c.inc | 12 ++++++++++--
10
1 file changed, 2 insertions(+), 1 deletion(-)
8
1 file changed, 10 insertions(+), 2 deletions(-)
11
9
12
diff --git a/tcg/optimize.c b/tcg/optimize.c
10
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
13
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/optimize.c
12
--- a/tcg/i386/tcg-target.c.inc
15
+++ b/tcg/optimize.c
13
+++ b/tcg/i386/tcg-target.c.inc
16
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
14
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
17
g_assert_not_reached();
15
#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16)
18
}
16
#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16)
19
ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
17
#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16)
20
-
18
+#define OPC_VPSRAQ (0x72 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
21
+ ctx->s_mask = smask_from_zmask(ctx->z_mask);
19
#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16)
22
return false;
20
#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16)
23
}
21
#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16)
24
22
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
25
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
23
OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
26
default:
24
};
27
g_assert_not_reached();
25
static int const sars_insn[4] = {
28
}
26
- OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
29
+ ctx->s_mask = smask_from_zmask(ctx->z_mask);
27
+ OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
30
return false;
28
};
31
}
29
static int const abs_insn[4] = {
30
/* TODO: AVX512 adds support for MO_64. */
31
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
32
case INDEX_op_shrs_vec:
33
return vece >= MO_16;
34
case INDEX_op_sars_vec:
35
- return vece >= MO_16 && vece <= MO_32;
36
+ switch (vece) {
37
+ case MO_16:
38
+ case MO_32:
39
+ return 1;
40
+ case MO_64:
41
+ return have_avx512vl;
42
+ }
43
+ return 0;
44
case INDEX_op_rotls_vec:
45
return vece >= MO_16 ? -1 : 0;
32
46
33
--
47
--
34
2.25.1
48
2.25.1
35
49
36
50
diff view generated by jsdifflib
1
The result is either 0 or 1, which means that we have
1
AVX512 has VPSRAQ with immediate operand, in the same form as
2
a 2 bit signed result, and thus 62 bits of sign.
2
with AVX, but requires EVEX encoding and W1.
3
For clarity, use the smask_from_zmask function.
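
For illustration, a standalone sketch (the sign_copies helper is hypothetical,
not from QEMU) of why a value known to be 0 or 1 carries 62 bits of sign,
matching smask_from_zmask(1):

    #include <assert.h>
    #include <stdint.h>

    /* Count the bits below bit 63 that are copies of the sign bit. */
    static unsigned sign_copies(uint64_t v)
    {
        unsigned n = 0;
        while (n < 63 && (((v >> 62) & 1) == ((v >> 63) & 1))) {
            v <<= 1;
            n++;
        }
        return n;
    }

    int main(void)
    {
        assert(sign_copies(0) == 63);  /* every bit matches the sign bit */
        assert(sign_copies(1) == 62);  /* bits 62..1 match, bit 0 does not */
        return 0;
    }
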
4
3
4
Tested-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
7
---
9
tcg/optimize.c | 2 ++
8
tcg/i386/tcg-target.c.inc | 30 +++++++++++++++++++++---------
10
1 file changed, 2 insertions(+)
9
1 file changed, 21 insertions(+), 9 deletions(-)
11
10
12
diff --git a/tcg/optimize.c b/tcg/optimize.c
11
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
13
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/optimize.c
13
--- a/tcg/i386/tcg-target.c.inc
15
+++ b/tcg/optimize.c
14
+++ b/tcg/i386/tcg-target.c.inc
16
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
15
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
17
}
16
break;
18
17
19
ctx->z_mask = 1;
18
case INDEX_op_shli_vec:
20
+ ctx->s_mask = smask_from_zmask(1);
19
+ insn = shift_imm_insn[vece];
21
return false;
20
sub = 6;
22
}
21
goto gen_shift;
23
22
case INDEX_op_shri_vec:
24
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
23
+ insn = shift_imm_insn[vece];
25
}
24
sub = 2;
26
25
goto gen_shift;
27
ctx->z_mask = 1;
26
case INDEX_op_sari_vec:
28
+ ctx->s_mask = smask_from_zmask(1);
27
- tcg_debug_assert(vece != MO_64);
29
return false;
28
+ if (vece == MO_64) {
30
29
+ insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
31
do_setcond_const:
30
+ } else {
31
+ insn = shift_imm_insn[vece];
32
+ }
33
sub = 4;
34
gen_shift:
35
tcg_debug_assert(vece != MO_8);
36
- insn = shift_imm_insn[vece];
37
if (type == TCG_TYPE_V256) {
38
insn |= P_VEXL;
39
}
40
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
41
return vece == MO_8 ? -1 : 1;
42
43
case INDEX_op_sari_vec:
44
- /* We must expand the operation for MO_8. */
45
- if (vece == MO_8) {
46
+ switch (vece) {
47
+ case MO_8:
48
return -1;
49
- }
50
- /* We can emulate this for MO_64, but it does not pay off
51
- unless we're producing at least 4 values. */
52
- if (vece == MO_64) {
53
+ case MO_16:
54
+ case MO_32:
55
+ return 1;
56
+ case MO_64:
57
+ if (have_avx512vl) {
58
+ return 1;
59
+ }
60
+ /*
61
+ * We can emulate this for MO_64, but it does not pay off
62
+ * unless we're producing at least 4 values.
63
+ */
64
return type >= TCG_TYPE_V256 ? -1 : 0;
65
}
66
- return 1;
67
+ return 0;
68
69
case INDEX_op_shls_vec:
70
case INDEX_op_shrs_vec:
32
--
71
--
33
2.25.1
72
2.25.1
34
73
35
74
diff view generated by jsdifflib
1
Recognize the identity function for low-part multiply.
1
AVX512VL has VPROLD and VPROLQ, layered onto the same
2
opcode as PSHIFTD, but requires EVEX encoding and W1.
2
3
3
Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
5
---
8
tcg/optimize.c | 3 ++-
6
tcg/i386/tcg-target.h | 2 +-
9
1 file changed, 2 insertions(+), 1 deletion(-)
7
tcg/i386/tcg-target.c.inc | 15 +++++++++++++--
8
2 files changed, 14 insertions(+), 3 deletions(-)
10
9
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
10
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
12
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
12
--- a/tcg/i386/tcg-target.h
14
+++ b/tcg/optimize.c
13
+++ b/tcg/i386/tcg-target.h
15
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
14
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
16
static bool fold_mul(OptContext *ctx, TCGOp *op)
15
#define TCG_TARGET_HAS_not_vec 0
17
{
16
#define TCG_TARGET_HAS_neg_vec 0
18
if (fold_const2(ctx, op) ||
17
#define TCG_TARGET_HAS_abs_vec 1
19
- fold_xi_to_i(ctx, op, 0)) {
18
-#define TCG_TARGET_HAS_roti_vec 0
20
+ fold_xi_to_i(ctx, op, 0) ||
19
+#define TCG_TARGET_HAS_roti_vec have_avx512vl
21
+ fold_xi_to_x(ctx, op, 1)) {
20
#define TCG_TARGET_HAS_rots_vec 0
22
return true;
21
#define TCG_TARGET_HAS_rotv_vec 0
23
}
22
#define TCG_TARGET_HAS_shi_vec 1
24
return false;
23
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
24
index XXXXXXX..XXXXXXX 100644
25
--- a/tcg/i386/tcg-target.c.inc
26
+++ b/tcg/i386/tcg-target.c.inc
27
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
28
#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
29
#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
30
#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
31
-#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
32
+#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
33
#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
34
#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16)
35
#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16)
36
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
37
insn = shift_imm_insn[vece];
38
}
39
sub = 4;
40
+ goto gen_shift;
41
+ case INDEX_op_rotli_vec:
42
+ insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */
43
+ if (vece == MO_64) {
44
+ insn |= P_VEXW;
45
+ }
46
+ sub = 1;
47
+ goto gen_shift;
48
gen_shift:
49
tcg_debug_assert(vece != MO_8);
50
if (type == TCG_TYPE_V256) {
51
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
52
case INDEX_op_shli_vec:
53
case INDEX_op_shri_vec:
54
case INDEX_op_sari_vec:
55
+ case INDEX_op_rotli_vec:
56
case INDEX_op_x86_psrldq_vec:
57
return C_O1_I1(x, x);
58
59
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
60
case INDEX_op_xor_vec:
61
case INDEX_op_andc_vec:
62
return 1;
63
- case INDEX_op_rotli_vec:
64
case INDEX_op_cmp_vec:
65
case INDEX_op_cmpsel_vec:
66
return -1;
67
68
+ case INDEX_op_rotli_vec:
69
+ return have_avx512vl && vece >= MO_32 ? 1 : -1;
70
+
71
case INDEX_op_shli_vec:
72
case INDEX_op_shri_vec:
73
/* We must expand the operation for MO_8. */
25
--
74
--
26
2.25.1
75
2.25.1
27
28
diff view generated by jsdifflib
1
Recognize the constant function for or-complement.
1
AVX512VL has VPROLVD and VPRORVQ.
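
As a quick standalone check of the or-complement folding above (arbitrary
value, not part of the patch), orc(x, x) is x | ~x, which is always all-ones:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t x = 0x0123456789abcdefull;   /* arbitrary value */
        assert((x | ~x) == UINT64_MAX);       /* orc(x, x) folds to -1 */
        return 0;
    }
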
2
2
3
Tested-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
6
---
8
tcg/optimize.c | 1 +
7
tcg/i386/tcg-target.h | 2 +-
9
1 file changed, 1 insertion(+)
8
tcg/i386/tcg-target.c.inc | 25 ++++++++++++++++++++++++-
9
2 files changed, 25 insertions(+), 2 deletions(-)
10
10
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
11
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
12
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
13
--- a/tcg/i386/tcg-target.h
14
+++ b/tcg/optimize.c
14
+++ b/tcg/i386/tcg-target.h
15
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
15
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
16
static bool fold_orc(OptContext *ctx, TCGOp *op)
16
#define TCG_TARGET_HAS_abs_vec 1
17
{
17
#define TCG_TARGET_HAS_roti_vec have_avx512vl
18
if (fold_const2(ctx, op) ||
18
#define TCG_TARGET_HAS_rots_vec 0
19
+ fold_xx_to_i(ctx, op, -1) ||
19
-#define TCG_TARGET_HAS_rotv_vec 0
20
fold_xi_to_x(ctx, op, -1) ||
20
+#define TCG_TARGET_HAS_rotv_vec have_avx512vl
21
fold_ix_to_not(ctx, op, 0)) {
21
#define TCG_TARGET_HAS_shi_vec 1
22
return true;
22
#define TCG_TARGET_HAS_shs_vec 1
23
#define TCG_TARGET_HAS_shv_vec have_avx2
24
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
25
index XXXXXXX..XXXXXXX 100644
26
--- a/tcg/i386/tcg-target.c.inc
27
+++ b/tcg/i386/tcg-target.c.inc
28
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
29
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
30
#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
31
#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
32
+#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
33
+#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
34
+#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
35
+#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
36
#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
37
#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16)
38
#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
39
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
40
static int const umax_insn[4] = {
41
OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
42
};
43
+ static int const rotlv_insn[4] = {
44
+ OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
45
+ };
46
+ static int const rotrv_insn[4] = {
47
+ OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
48
+ };
49
static int const shlv_insn[4] = {
50
OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
51
};
52
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
53
case INDEX_op_sarv_vec:
54
insn = sarv_insn[vece];
55
goto gen_simd;
56
+ case INDEX_op_rotlv_vec:
57
+ insn = rotlv_insn[vece];
58
+ goto gen_simd;
59
+ case INDEX_op_rotrv_vec:
60
+ insn = rotrv_insn[vece];
61
+ goto gen_simd;
62
case INDEX_op_shls_vec:
63
insn = shls_insn[vece];
64
goto gen_simd;
65
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
66
case INDEX_op_shlv_vec:
67
case INDEX_op_shrv_vec:
68
case INDEX_op_sarv_vec:
69
+ case INDEX_op_rotlv_vec:
70
+ case INDEX_op_rotrv_vec:
71
case INDEX_op_shls_vec:
72
case INDEX_op_shrs_vec:
73
case INDEX_op_sars_vec:
74
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
75
return 0;
76
case INDEX_op_rotlv_vec:
77
case INDEX_op_rotrv_vec:
78
- return have_avx2 && vece >= MO_32 ? -1 : 0;
79
+ switch (vece) {
80
+ case MO_32:
81
+ case MO_64:
82
+ return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
83
+ }
84
+ return 0;
85
86
case INDEX_op_mul_vec:
87
if (vece == MO_8) {
23
--
88
--
24
2.25.1
89
2.25.1
25
90
26
91
diff view generated by jsdifflib
1
Sign repetitions are perforce all identical, whether they are 1 or 0.
1
We will use VPSHLD, VPSHLDV and VPSHRDV for 16-bit rotates.
2
Bitwise operations preserve the relative quantity of the repetitions.
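
A standalone sketch of that claim (hypothetical 16-bit inputs, not part of the
patch): if both operands are sign-extended from 16 bits, so is every bitwise
combination, which is why the result's s_mask can be taken as the intersection
of the operands' s_masks:

    #include <assert.h>
    #include <stdint.h>

    /* True if v is sign-extended from 16 bits, i.e. the top 48 bits
       are all copies of bit 15. */
    static int sext16_ok(int64_t v)
    {
        return v == (int16_t)v;
    }

    int main(void)
    {
        int64_t x = (int16_t)0x8421;    /* negative 16-bit input */
        int64_t y = (int16_t)0x1234;    /* positive 16-bit input */

        assert(sext16_ok(x & y));
        assert(sext16_ok(x | y));
        assert(sext16_ok(x ^ y));
        assert(sext16_ok(~(x & y)));    /* nand */
        assert(sext16_ok(~(x | y)));    /* nor  */
        assert(sext16_ok(~(x ^ y)));    /* eqv  */
        return 0;
    }
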
3
2
3
Tested-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
6
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
6
---
9
tcg/optimize.c | 29 +++++++++++++++++++++++++++++
7
tcg/i386/tcg-target-con-set.h | 1 +
10
1 file changed, 29 insertions(+)
8
tcg/i386/tcg-target.opc.h | 3 +++
9
tcg/i386/tcg-target.c.inc | 38 +++++++++++++++++++++++++++++++++++
10
3 files changed, 42 insertions(+)
11
11
12
diff --git a/tcg/optimize.c b/tcg/optimize.c
12
diff --git a/tcg/i386/tcg-target-con-set.h b/tcg/i386/tcg-target-con-set.h
13
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/optimize.c
14
--- a/tcg/i386/tcg-target-con-set.h
15
+++ b/tcg/optimize.c
15
+++ b/tcg/i386/tcg-target-con-set.h
16
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
16
@@ -XXX,XX +XXX,XX @@ C_O1_I2(r, r, rI)
17
z2 = arg_info(op->args[2])->z_mask;
17
C_O1_I2(x, x, x)
18
ctx->z_mask = z1 & z2;
18
C_N1_I2(r, r, r)
19
19
C_N1_I2(r, r, rW)
20
+ /*
20
+C_O1_I3(x, 0, x, x)
21
+ * Sign repetitions are perforce all identical, whether they are 1 or 0.
21
C_O1_I3(x, x, x, x)
22
+ * Bitwise operations preserve the relative quantity of the repetitions.
22
C_O1_I4(r, r, re, r, 0)
23
+ */
23
C_O1_I4(r, r, r, ri, ri)
24
+ ctx->s_mask = arg_info(op->args[1])->s_mask
24
diff --git a/tcg/i386/tcg-target.opc.h b/tcg/i386/tcg-target.opc.h
25
+ & arg_info(op->args[2])->s_mask;
25
index XXXXXXX..XXXXXXX 100644
26
--- a/tcg/i386/tcg-target.opc.h
27
+++ b/tcg/i386/tcg-target.opc.h
28
@@ -XXX,XX +XXX,XX @@ DEF(x86_psrldq_vec, 1, 1, 1, IMPLVEC)
29
DEF(x86_vperm2i128_vec, 1, 2, 1, IMPLVEC)
30
DEF(x86_punpckl_vec, 1, 2, 0, IMPLVEC)
31
DEF(x86_punpckh_vec, 1, 2, 0, IMPLVEC)
32
+DEF(x86_vpshldi_vec, 1, 2, 1, IMPLVEC)
33
+DEF(x86_vpshldv_vec, 1, 3, 0, IMPLVEC)
34
+DEF(x86_vpshrdv_vec, 1, 3, 0, IMPLVEC)
35
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
36
index XXXXXXX..XXXXXXX 100644
37
--- a/tcg/i386/tcg-target.c.inc
38
+++ b/tcg/i386/tcg-target.c.inc
39
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
40
#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
41
#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
42
#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
43
+#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
44
+#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
45
+#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
46
+#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
47
+#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
48
+#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
49
+#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
50
+#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
51
+#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
52
#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
53
#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16)
54
#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
55
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
56
static int const sars_insn[4] = {
57
OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
58
};
59
+ static int const vpshldi_insn[4] = {
60
+ OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
61
+ };
62
+ static int const vpshldv_insn[4] = {
63
+ OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
64
+ };
65
+ static int const vpshrdv_insn[4] = {
66
+ OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
67
+ };
68
static int const abs_insn[4] = {
69
/* TODO: AVX512 adds support for MO_64. */
70
OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
71
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
72
case INDEX_op_x86_packus_vec:
73
insn = packus_insn[vece];
74
goto gen_simd;
75
+ case INDEX_op_x86_vpshldv_vec:
76
+ insn = vpshldv_insn[vece];
77
+ a1 = a2;
78
+ a2 = args[3];
79
+ goto gen_simd;
80
+ case INDEX_op_x86_vpshrdv_vec:
81
+ insn = vpshrdv_insn[vece];
82
+ a1 = a2;
83
+ a2 = args[3];
84
+ goto gen_simd;
85
#if TCG_TARGET_REG_BITS == 32
86
case INDEX_op_dup2_vec:
87
/* First merge the two 32-bit inputs to a single 64-bit element. */
88
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
89
insn = OPC_VPERM2I128;
90
sub = args[3];
91
goto gen_simd_imm8;
92
+ case INDEX_op_x86_vpshldi_vec:
93
+ insn = vpshldi_insn[vece];
94
+ sub = args[3];
95
+ goto gen_simd_imm8;
96
gen_simd_imm8:
97
+ tcg_debug_assert(insn != OPC_UD2);
98
if (type == TCG_TYPE_V256) {
99
insn |= P_VEXL;
100
}
101
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
102
case INDEX_op_x86_vperm2i128_vec:
103
case INDEX_op_x86_punpckl_vec:
104
case INDEX_op_x86_punpckh_vec:
105
+ case INDEX_op_x86_vpshldi_vec:
106
#if TCG_TARGET_REG_BITS == 32
107
case INDEX_op_dup2_vec:
108
#endif
109
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
110
case INDEX_op_x86_psrldq_vec:
111
return C_O1_I1(x, x);
112
113
+ case INDEX_op_x86_vpshldv_vec:
114
+ case INDEX_op_x86_vpshrdv_vec:
115
+ return C_O1_I3(x, 0, x, x);
26
+
116
+
27
/*
117
case INDEX_op_x86_vpblendvb_vec:
28
* Known-zeros does not imply known-ones. Therefore unless
118
return C_O1_I3(x, x, x, x);
29
* arg2 is constant, we can't infer affected bits from it.
30
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
31
}
32
ctx->z_mask = z1;
33
34
+ ctx->s_mask = arg_info(op->args[1])->s_mask
35
+ & arg_info(op->args[2])->s_mask;
36
return fold_masks(ctx, op);
37
}
38
39
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
40
fold_xi_to_not(ctx, op, 0)) {
41
return true;
42
}
43
+
44
+ ctx->s_mask = arg_info(op->args[1])->s_mask
45
+ & arg_info(op->args[2])->s_mask;
46
return false;
47
}
48
49
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
50
51
ctx->z_mask = arg_info(op->args[3])->z_mask
52
| arg_info(op->args[4])->z_mask;
53
+ ctx->s_mask = arg_info(op->args[3])->s_mask
54
+ & arg_info(op->args[4])->s_mask;
55
56
if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
57
uint64_t tv = arg_info(op->args[3])->val;
58
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
59
fold_xi_to_not(ctx, op, -1)) {
60
return true;
61
}
62
+
63
+ ctx->s_mask = arg_info(op->args[1])->s_mask
64
+ & arg_info(op->args[2])->s_mask;
65
return false;
66
}
67
68
@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
69
fold_xi_to_not(ctx, op, 0)) {
70
return true;
71
}
72
+
73
+ ctx->s_mask = arg_info(op->args[1])->s_mask
74
+ & arg_info(op->args[2])->s_mask;
75
return false;
76
}
77
78
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
79
return true;
80
}
81
82
+ ctx->s_mask = arg_info(op->args[1])->s_mask;
83
+
84
/* Because of fold_to_not, we want to always return true, via finish. */
85
finish_folding(ctx, op);
86
return true;
87
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
88
89
ctx->z_mask = arg_info(op->args[1])->z_mask
90
| arg_info(op->args[2])->z_mask;
91
+ ctx->s_mask = arg_info(op->args[1])->s_mask
92
+ & arg_info(op->args[2])->s_mask;
93
return fold_masks(ctx, op);
94
}
95
96
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
97
fold_ix_to_not(ctx, op, 0)) {
98
return true;
99
}
100
+
101
+ ctx->s_mask = arg_info(op->args[1])->s_mask
102
+ & arg_info(op->args[2])->s_mask;
103
return false;
104
}
105
106
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
107
108
ctx->z_mask = arg_info(op->args[1])->z_mask
109
| arg_info(op->args[2])->z_mask;
110
+ ctx->s_mask = arg_info(op->args[1])->s_mask
111
+ & arg_info(op->args[2])->s_mask;
112
return fold_masks(ctx, op);
113
}
114
119
115
--
120
--
116
2.25.1
121
2.25.1
117
122
118
123
diff view generated by jsdifflib
1
Most of these are handled by creating a fold_const2_commutative
1
While there are no specific 16-bit rotate instructions, there
2
to handle all of the binary operators. The rest were already
2
are double-word shifts, which can perform the same operation.
3
handled on a case-by-case basis in the switch, and have their
4
own fold function in which to place the call.
5
3
6
We now have only one major switch on TCGOpcode.
4
Tested-by: Alex Bennée <alex.bennee@linaro.org>
7
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
8
Introduce NO_DEST and a block comment for swap_commutative in
9
order to make the handling of brcond and movcond opcodes cleaner.
10
11
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
12
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
13
---
7
---
14
tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
8
tcg/i386/tcg-target.c.inc | 18 +++++++++++++++++-
15
1 file changed, 70 insertions(+), 72 deletions(-)
9
1 file changed, 17 insertions(+), 1 deletion(-)
16
10
17
diff --git a/tcg/optimize.c b/tcg/optimize.c
11
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
18
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
19
--- a/tcg/optimize.c
13
--- a/tcg/i386/tcg-target.c.inc
20
+++ b/tcg/optimize.c
14
+++ b/tcg/i386/tcg-target.c.inc
21
@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
15
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
22
return -1;
16
case INDEX_op_rotlv_vec:
23
}
17
case INDEX_op_rotrv_vec:
24
18
switch (vece) {
25
+/**
19
+ case MO_16:
26
+ * swap_commutative:
20
+ return have_avx512vbmi2 ? -1 : 0;
27
+ * @dest: TCGArg of the destination argument, or NO_DEST.
21
case MO_32:
28
+ * @p1: first paired argument
22
case MO_64:
29
+ * @p2: second paired argument
23
return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
30
+ *
24
@@ -XXX,XX +XXX,XX @@ static void expand_vec_rotli(TCGType type, unsigned vece,
31
+ * If *@p1 is a constant and *@p2 is not, swap.
25
return;
32
+ * If *@p2 matches @dest, swap.
33
+ * Return true if a swap was performed.
34
+ */
35
+
36
+#define NO_DEST temp_arg(NULL)
37
+
38
static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
39
{
40
TCGArg a1 = *p1, a2 = *p2;
41
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
42
return false;
43
}
44
45
+static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
46
+{
47
+ swap_commutative(op->args[0], &op->args[1], &op->args[2]);
48
+ return fold_const2(ctx, op);
49
+}
50
+
51
static bool fold_masks(OptContext *ctx, TCGOp *op)
52
{
53
uint64_t a_mask = ctx->a_mask;
54
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
55
56
static bool fold_add(OptContext *ctx, TCGOp *op)
57
{
58
- if (fold_const2(ctx, op) ||
59
+ if (fold_const2_commutative(ctx, op) ||
60
fold_xi_to_x(ctx, op, 0)) {
61
return true;
62
}
26
}
63
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
27
64
28
+ if (have_avx512vbmi2) {
65
static bool fold_add2(OptContext *ctx, TCGOp *op)
29
+ vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
66
{
30
+ tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
67
+ /* Note that the high and low parts may be independently swapped. */
31
+ return;
68
+ swap_commutative(op->args[0], &op->args[2], &op->args[4]);
69
+ swap_commutative(op->args[1], &op->args[3], &op->args[5]);
70
+
71
return fold_addsub2(ctx, op, true);
72
}
73
74
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
75
{
76
uint64_t z1, z2;
77
78
- if (fold_const2(ctx, op) ||
79
+ if (fold_const2_commutative(ctx, op) ||
80
fold_xi_to_i(ctx, op, 0) ||
81
fold_xi_to_x(ctx, op, -1) ||
82
fold_xx_to_x(ctx, op)) {
83
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
84
static bool fold_brcond(OptContext *ctx, TCGOp *op)
85
{
86
TCGCond cond = op->args[2];
87
- int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
88
+ int i;
89
90
+ if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
91
+ op->args[2] = cond = tcg_swap_cond(cond);
92
+ }
32
+ }
93
+
33
+
94
+ i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
34
t = tcg_temp_new_vec(type);
95
if (i == 0) {
35
tcg_gen_shli_vec(vece, t, v1, imm);
96
tcg_op_remove(ctx->tcg, op);
36
tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
97
return true;
37
@@ -XXX,XX +XXX,XX @@ static void expand_vec_rotls(TCGType type, unsigned vece,
98
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
38
static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
99
static bool fold_brcond2(OptContext *ctx, TCGOp *op)
39
TCGv_vec v1, TCGv_vec sh, bool right)
100
{
40
{
101
TCGCond cond = op->args[4];
41
- TCGv_vec t = tcg_temp_new_vec(type);
102
- int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
42
+ TCGv_vec t;
103
TCGArg label = op->args[5];
43
104
- int inv = 0;
44
+ if (have_avx512vbmi2) {
105
+ int i, inv = 0;
45
+ vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
106
46
+ type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
107
+ if (swap_commutative2(&op->args[0], &op->args[2])) {
47
+ tcgv_vec_arg(v1), tcgv_vec_arg(sh));
108
+ op->args[4] = cond = tcg_swap_cond(cond);
48
+ return;
109
+ }
49
+ }
110
+
50
+
111
+ i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
51
+ t = tcg_temp_new_vec(type);
112
if (i >= 0) {
52
tcg_gen_dupi_vec(vece, t, 8 << vece);
113
goto do_brcond_const;
53
tcg_gen_sub_vec(vece, t, t, sh);
114
}
54
if (right) {
115
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
116
117
static bool fold_eqv(OptContext *ctx, TCGOp *op)
118
{
119
- if (fold_const2(ctx, op) ||
120
+ if (fold_const2_commutative(ctx, op) ||
121
fold_xi_to_x(ctx, op, -1) ||
122
fold_xi_to_not(ctx, op, 0)) {
123
return true;
124
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
125
static bool fold_movcond(OptContext *ctx, TCGOp *op)
126
{
127
TCGCond cond = op->args[5];
128
- int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
129
+ int i;
130
131
+ if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
132
+ op->args[5] = cond = tcg_swap_cond(cond);
133
+ }
134
+ /*
135
+ * Canonicalize the "false" input reg to match the destination reg so
136
+ * that the tcg backend can implement a "move if true" operation.
137
+ */
138
+ if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
139
+ op->args[5] = cond = tcg_invert_cond(cond);
140
+ }
141
+
142
+ i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
143
if (i >= 0) {
144
return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
145
}
146
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
147
148
static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
149
{
150
- if (fold_const2(ctx, op) ||
151
+ if (fold_const2_commutative(ctx, op) ||
152
fold_xi_to_i(ctx, op, 0)) {
153
return true;
154
}
155
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
156
157
static bool fold_multiply2(OptContext *ctx, TCGOp *op)
158
{
159
+ swap_commutative(op->args[0], &op->args[2], &op->args[3]);
160
+
161
if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
162
uint64_t a = arg_info(op->args[2])->val;
163
uint64_t b = arg_info(op->args[3])->val;
164
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
165
166
static bool fold_nand(OptContext *ctx, TCGOp *op)
167
{
168
- if (fold_const2(ctx, op) ||
169
+ if (fold_const2_commutative(ctx, op) ||
170
fold_xi_to_not(ctx, op, -1)) {
171
return true;
172
}
173
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
174
175
static bool fold_nor(OptContext *ctx, TCGOp *op)
176
{
177
- if (fold_const2(ctx, op) ||
178
+ if (fold_const2_commutative(ctx, op) ||
179
fold_xi_to_not(ctx, op, 0)) {
180
return true;
181
}
182
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
183
184
static bool fold_or(OptContext *ctx, TCGOp *op)
185
{
186
- if (fold_const2(ctx, op) ||
187
+ if (fold_const2_commutative(ctx, op) ||
188
fold_xi_to_x(ctx, op, 0) ||
189
fold_xx_to_x(ctx, op)) {
190
return true;
191
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
192
static bool fold_setcond(OptContext *ctx, TCGOp *op)
193
{
194
TCGCond cond = op->args[3];
195
- int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
196
+ int i;
197
198
+ if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
199
+ op->args[3] = cond = tcg_swap_cond(cond);
200
+ }
201
+
202
+ i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
203
if (i >= 0) {
204
return tcg_opt_gen_movi(ctx, op, op->args[0], i);
205
}
206
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
207
static bool fold_setcond2(OptContext *ctx, TCGOp *op)
208
{
209
TCGCond cond = op->args[5];
210
- int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
211
- int inv = 0;
212
+ int i, inv = 0;
213
214
+ if (swap_commutative2(&op->args[1], &op->args[3])) {
215
+ op->args[5] = cond = tcg_swap_cond(cond);
216
+ }
217
+
218
+ i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
219
if (i >= 0) {
220
goto do_setcond_const;
221
}
222
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
223
224
static bool fold_xor(OptContext *ctx, TCGOp *op)
225
{
226
- if (fold_const2(ctx, op) ||
227
+ if (fold_const2_commutative(ctx, op) ||
228
fold_xx_to_i(ctx, op, 0) ||
229
fold_xi_to_x(ctx, op, 0) ||
230
fold_xi_to_not(ctx, op, -1)) {
231
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
232
ctx.type = TCG_TYPE_I32;
233
}
234
235
- /* For commutative operations make constant second argument */
236
- switch (opc) {
237
- CASE_OP_32_64_VEC(add):
238
- CASE_OP_32_64_VEC(mul):
239
- CASE_OP_32_64_VEC(and):
240
- CASE_OP_32_64_VEC(or):
241
- CASE_OP_32_64_VEC(xor):
242
- CASE_OP_32_64(eqv):
243
- CASE_OP_32_64(nand):
244
- CASE_OP_32_64(nor):
245
- CASE_OP_32_64(muluh):
246
- CASE_OP_32_64(mulsh):
247
- swap_commutative(op->args[0], &op->args[1], &op->args[2]);
248
- break;
249
- CASE_OP_32_64(brcond):
250
- if (swap_commutative(-1, &op->args[0], &op->args[1])) {
251
- op->args[2] = tcg_swap_cond(op->args[2]);
252
- }
253
- break;
254
- CASE_OP_32_64(setcond):
255
- if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
256
- op->args[3] = tcg_swap_cond(op->args[3]);
257
- }
258
- break;
259
- CASE_OP_32_64(movcond):
260
- if (swap_commutative(-1, &op->args[1], &op->args[2])) {
261
- op->args[5] = tcg_swap_cond(op->args[5]);
262
- }
263
- /* For movcond, we canonicalize the "false" input reg to match
264
- the destination reg so that the tcg backend can implement
265
- a "move if true" operation. */
266
- if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
267
- op->args[5] = tcg_invert_cond(op->args[5]);
268
- }
269
- break;
270
- CASE_OP_32_64(add2):
271
- swap_commutative(op->args[0], &op->args[2], &op->args[4]);
272
- swap_commutative(op->args[1], &op->args[3], &op->args[5]);
273
- break;
274
- CASE_OP_32_64(mulu2):
275
- CASE_OP_32_64(muls2):
276
- swap_commutative(op->args[0], &op->args[2], &op->args[3]);
277
- break;
278
- case INDEX_op_brcond2_i32:
279
- if (swap_commutative2(&op->args[0], &op->args[2])) {
280
- op->args[4] = tcg_swap_cond(op->args[4]);
281
- }
282
- break;
283
- case INDEX_op_setcond2_i32:
284
- if (swap_commutative2(&op->args[1], &op->args[3])) {
285
- op->args[5] = tcg_swap_cond(op->args[5]);
286
- }
287
- break;
288
- default:
289
- break;
290
- }
291
-
292
/* Assume all bits affected, and no bits known zero. */
293
ctx.a_mask = -1;
294
ctx.z_mask = -1;
295
--
55
--
296
2.25.1
56
2.25.1
297
57
298
58
diff view generated by jsdifflib
1
This "garbage" setting pre-dates the addition of the type
1
There is no such instruction on x86, so we should
2
changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
2
not be pretending it has arguments.
3
and INDEX_op_extr{l,h}_i64_i32.
4
3
5
So now we have definitive points at which to adjust z_mask
4
Tested-by: Alex Bennée <alex.bennee@linaro.org>
6
to eliminate such bits from the 32-bit operands.
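
A standalone illustration of the convention relied on here (hypothetical mask
values, not from the patch): since 32-bit results are tracked sign-extended,
normalizing a 32-bit z_mask is a plain (int32_t) cast rather than OR-ing in
garbage high bits:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* Bit 31 possibly set: the sign-extended form also has the
           high 32 bits set. */
        uint64_t z_mask = 0x00000000ffff00ffull;
        assert((uint64_t)(int32_t)z_mask == 0xffffffffffff00ffull);

        /* Bit 31 known zero: the cast leaves the mask unchanged. */
        uint64_t small = 0x000000000000ffffull;
        assert((uint64_t)(int32_t)small == small);
        return 0;
    }
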
7
8
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
9
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
10
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
---
7
---
12
tcg/optimize.c | 35 ++++++++++++++++-------------------
8
tcg/i386/tcg-target.c.inc | 1 -
13
1 file changed, 16 insertions(+), 19 deletions(-)
9
1 file changed, 1 deletion(-)
14
10
15
diff --git a/tcg/optimize.c b/tcg/optimize.c
11
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
16
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
17
--- a/tcg/optimize.c
13
--- a/tcg/i386/tcg-target.c.inc
18
+++ b/tcg/optimize.c
14
+++ b/tcg/i386/tcg-target.c.inc
19
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
15
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
20
ti->is_const = true;
16
case INDEX_op_shls_vec:
21
ti->val = ts->val;
17
case INDEX_op_shrs_vec:
22
ti->z_mask = ts->val;
18
case INDEX_op_sars_vec:
23
- if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
19
- case INDEX_op_rotls_vec:
24
- /* High bits of a 32-bit quantity are garbage. */
20
case INDEX_op_cmp_vec:
25
- ti->z_mask |= ~0xffffffffull;
21
case INDEX_op_x86_shufps_vec:
26
- }
22
case INDEX_op_x86_blend_vec:
27
} else {
28
ti->is_const = false;
29
ti->z_mask = -1;
30
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
31
TCGTemp *src_ts = arg_temp(src);
32
TempOptInfo *di;
33
TempOptInfo *si;
34
- uint64_t z_mask;
35
TCGOpcode new_op;
36
37
if (ts_are_copies(dst_ts, src_ts)) {
38
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
39
op->args[0] = dst;
40
op->args[1] = src;
41
42
- z_mask = si->z_mask;
43
- if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
44
- /* High bits of the destination are now garbage. */
45
- z_mask |= ~0xffffffffull;
46
- }
47
- di->z_mask = z_mask;
48
+ di->z_mask = si->z_mask;
49
50
if (src_ts->type == dst_ts->type) {
51
TempOptInfo *ni = ts_info(si->next_copy);
52
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
53
static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
54
TCGArg dst, uint64_t val)
55
{
56
- /* Convert movi to mov with constant temp. */
57
- TCGTemp *tv = tcg_constant_internal(ctx->type, val);
58
+ TCGTemp *tv;
59
60
+ if (ctx->type == TCG_TYPE_I32) {
61
+ val = (int32_t)val;
62
+ }
63
+
64
+ /* Convert movi to mov with constant temp. */
65
+ tv = tcg_constant_internal(ctx->type, val);
66
init_ts_info(ctx, tv);
67
return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
68
}
69
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
70
uint64_t z_mask = ctx->z_mask;
71
72
/*
73
- * 32-bit ops generate 32-bit results. For the result is zero test
74
- * below, we can ignore high bits, but for further optimizations we
75
- * need to record that the high bits contain garbage.
76
+ * 32-bit ops generate 32-bit results, which for the purpose of
77
+ * simplifying tcg are sign-extended. Certainly that's how we
78
+ * represent our constants elsewhere. Note that the bits will
79
+ * be reset properly for a 64-bit value when encountering the
80
+ * type changing opcodes.
81
*/
82
if (ctx->type == TCG_TYPE_I32) {
83
- ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
84
- a_mask &= MAKE_64BIT_MASK(0, 32);
85
- z_mask &= MAKE_64BIT_MASK(0, 32);
86
+ a_mask = (int32_t)a_mask;
87
+ z_mask = (int32_t)z_mask;
88
+ ctx->z_mask = z_mask;
89
}
90
91
if (z_mask == 0) {
92
--
23
--
93
2.25.1
24
2.25.1
94
25
95
26
diff view generated by jsdifflib
1
There was no real reason for calls to have separate code here.
1
Expand 32-bit and 64-bit scalar rotate with VPRO[LR]V;
2
Unify init for calls vs non-calls using the call path, which
2
expand 16-bit scalar rotate with VPSHLDV.
3
handles TCG_CALL_DUMMY_ARG.
4
3
4
Tested-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
7
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
---
7
---
10
tcg/optimize.c | 25 +++++++++++--------------
8
tcg/i386/tcg-target.c.inc | 49 +++++++++++++++++++++++----------------
11
1 file changed, 11 insertions(+), 14 deletions(-)
9
1 file changed, 29 insertions(+), 20 deletions(-)
12
10
13
diff --git a/tcg/optimize.c b/tcg/optimize.c
11
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
14
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
15
--- a/tcg/optimize.c
13
--- a/tcg/i386/tcg-target.c.inc
16
+++ b/tcg/optimize.c
14
+++ b/tcg/i386/tcg-target.c.inc
17
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
15
@@ -XXX,XX +XXX,XX @@ static void expand_vec_rotli(TCGType type, unsigned vece,
18
}
16
tcg_temp_free_vec(t);
19
}
17
}
20
18
21
-static void init_arg_info(OptContext *ctx, TCGArg arg)
19
-static void expand_vec_rotls(TCGType type, unsigned vece,
20
- TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
22
-{
21
-{
23
- init_ts_info(ctx, arg_temp(arg));
22
- TCGv_i32 rsh;
23
- TCGv_vec t;
24
-
25
- tcg_debug_assert(vece != MO_8);
26
-
27
- t = tcg_temp_new_vec(type);
28
- rsh = tcg_temp_new_i32();
29
-
30
- tcg_gen_neg_i32(rsh, lsh);
31
- tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
32
- tcg_gen_shls_vec(vece, t, v1, lsh);
33
- tcg_gen_shrs_vec(vece, v0, v1, rsh);
34
- tcg_gen_or_vec(vece, v0, v0, t);
35
- tcg_temp_free_vec(t);
36
- tcg_temp_free_i32(rsh);
24
-}
37
-}
25
-
38
-
26
static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
39
static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
40
TCGv_vec v1, TCGv_vec sh, bool right)
27
{
41
{
28
TCGTemp *i, *g, *l;
42
@@ -XXX,XX +XXX,XX @@ static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
29
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
43
tcg_temp_free_vec(t);
30
return false;
31
}
44
}
32
45
33
+static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
46
+static void expand_vec_rotls(TCGType type, unsigned vece,
47
+ TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
34
+{
48
+{
35
+ for (int i = 0; i < nb_args; i++) {
49
+ TCGv_vec t = tcg_temp_new_vec(type);
36
+ TCGTemp *ts = arg_temp(op->args[i]);
50
+
37
+ if (ts) {
51
+ tcg_debug_assert(vece != MO_8);
38
+ init_ts_info(ctx, ts);
52
+
53
+ if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
54
+ tcg_gen_dup_i32_vec(vece, t, lsh);
55
+ if (vece >= MO_32) {
56
+ tcg_gen_rotlv_vec(vece, v0, v1, t);
57
+ } else {
58
+ expand_vec_rotv(type, vece, v0, v1, t, false);
39
+ }
59
+ }
60
+ } else {
61
+ TCGv_i32 rsh = tcg_temp_new_i32();
62
+
63
+ tcg_gen_neg_i32(rsh, lsh);
64
+ tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
65
+ tcg_gen_shls_vec(vece, t, v1, lsh);
66
+ tcg_gen_shrs_vec(vece, v0, v1, rsh);
67
+ tcg_gen_or_vec(vece, v0, v0, t);
68
+
69
+ tcg_temp_free_i32(rsh);
40
+ }
70
+ }
71
+
72
+ tcg_temp_free_vec(t);
41
+}
73
+}
42
+
74
+
43
/* Propagate constants and copies, fold constant expressions. */
75
static void expand_vec_mul(TCGType type, unsigned vece,
44
void tcg_optimize(TCGContext *s)
76
TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
45
{
77
{
46
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
47
if (opc == INDEX_op_call) {
48
nb_oargs = TCGOP_CALLO(op);
49
nb_iargs = TCGOP_CALLI(op);
50
- for (i = 0; i < nb_oargs + nb_iargs; i++) {
51
- TCGTemp *ts = arg_temp(op->args[i]);
52
- if (ts) {
53
- init_ts_info(&ctx, ts);
54
- }
55
- }
56
} else {
57
nb_oargs = def->nb_oargs;
58
nb_iargs = def->nb_iargs;
59
- for (i = 0; i < nb_oargs + nb_iargs; i++) {
60
- init_arg_info(&ctx, op->args[i]);
61
- }
62
}
63
+ init_arguments(&ctx, op, nb_oargs + nb_iargs);
64
65
/* Do copy propagation */
66
for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
67
--
78
--
68
2.25.1
79
2.25.1
69
80
70
81
diff view generated by jsdifflib
1
Rename to fold_multiply2, and handle muls2_i32, mulu2_i64,
1
AVX512VL has VPABSQ, VPMAXSQ, VPMAXUQ, VPMINSQ, VPMINUQ.
2
and muls2_i64.
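
A standalone sketch of the 32-bit case (illustrative operand values, not the
patch itself): the full product is computed in 64 bits and split into halves,
each stored sign-extended, matching how the optimizer represents 32-bit
constants:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t a = 0xfffffffe;    /* 32-bit operands of mulu2_i32 */
        uint64_t b = 0x00000003;

        uint64_t l = (uint64_t)(uint32_t)a * (uint32_t)b;
        uint64_t h = (int32_t)(l >> 32);    /* high half, sign-extended */
        l = (int32_t)l;                     /* low half, sign-extended  */

        assert(h == 2);
        assert(l == 0xfffffffffffffffaull); /* (int32_t)0xfffffffa */
        return 0;
    }
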
3
2
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
3
Tested-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
6
---
8
tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
7
tcg/i386/tcg-target.c.inc | 18 +++++++++++-------
9
1 file changed, 35 insertions(+), 9 deletions(-)
8
1 file changed, 11 insertions(+), 7 deletions(-)
10
9
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
10
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
12
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
12
--- a/tcg/i386/tcg-target.c.inc
14
+++ b/tcg/optimize.c
13
+++ b/tcg/i386/tcg-target.c.inc
15
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
14
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
16
return false;
15
#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16)
17
}
16
#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16)
18
17
#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16)
19
-static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
18
+#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
20
+static bool fold_multiply2(OptContext *ctx, TCGOp *op)
19
#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
21
{
20
#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
22
if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
21
#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
23
- uint32_t a = arg_info(op->args[2])->val;
22
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
24
- uint32_t b = arg_info(op->args[3])->val;
23
#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
25
- uint64_t r = (uint64_t)a * b;
24
#define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
26
+ uint64_t a = arg_info(op->args[2])->val;
25
#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
27
+ uint64_t b = arg_info(op->args[3])->val;
26
+#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
28
+ uint64_t h, l;
27
#define OPC_PMAXUB (0xde | P_EXT | P_DATA16)
29
TCGArg rl, rh;
28
#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16)
30
- TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
29
#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16)
31
+ TCGOp *op2;
30
+#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
32
+
31
#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16)
33
+ switch (op->opc) {
32
#define OPC_PMINSW (0xea | P_EXT | P_DATA16)
34
+ case INDEX_op_mulu2_i32:
33
#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16)
35
+ l = (uint64_t)(uint32_t)a * (uint32_t)b;
34
+#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
36
+ h = (int32_t)(l >> 32);
35
#define OPC_PMINUB (0xda | P_EXT | P_DATA16)
37
+ l = (int32_t)l;
36
#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16)
38
+ break;
37
#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16)
39
+ case INDEX_op_muls2_i32:
38
+#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
40
+ l = (int64_t)(int32_t)a * (int32_t)b;
39
#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
41
+ h = l >> 32;
40
#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
42
+ l = (int32_t)l;
41
#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
43
+ break;
42
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
44
+ case INDEX_op_mulu2_i64:
43
OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
45
+ mulu64(&l, &h, a, b);
44
};
46
+ break;
45
static int const smin_insn[4] = {
47
+ case INDEX_op_muls2_i64:
46
- OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
48
+ muls64(&l, &h, a, b);
47
+ OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
49
+ break;
48
};
50
+ default:
49
static int const smax_insn[4] = {
51
+ g_assert_not_reached();
50
- OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
52
+ }
51
+ OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
53
52
};
54
rl = op->args[0];
53
static int const umin_insn[4] = {
55
rh = op->args[1];
54
- OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
56
- tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
55
+ OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
57
- tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
56
};
58
+
57
static int const umax_insn[4] = {
59
+ /* The proper opcode is supplied by tcg_opt_gen_mov. */
58
- OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
60
+ op2 = tcg_op_insert_before(ctx->tcg, op, 0);
59
+ OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
61
+
60
};
62
+ tcg_opt_gen_movi(ctx, op, rl, l);
61
static int const rotlv_insn[4] = {
63
+ tcg_opt_gen_movi(ctx, op2, rh, h);
62
OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
64
return true;
63
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
65
}
64
OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
66
return false;
65
};
67
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
66
static int const abs_insn[4] = {
68
CASE_OP_32_64(muluh):
67
- /* TODO: AVX512 adds support for MO_64. */
69
done = fold_mul_highpart(&ctx, op);
68
- OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
70
break;
69
+ OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
71
- case INDEX_op_mulu2_i32:
70
};
72
- done = fold_mulu2_i32(&ctx, op);
71
73
+ CASE_OP_32_64(muls2):
72
TCGType type = vecl + TCG_TYPE_V64;
74
+ CASE_OP_32_64(mulu2):
73
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
75
+ done = fold_multiply2(&ctx, op);
74
case INDEX_op_umin_vec:
76
break;
75
case INDEX_op_umax_vec:
77
CASE_OP_32_64(nand):
76
case INDEX_op_abs_vec:
78
done = fold_nand(&ctx, op);
77
- return vece <= MO_32;
78
+ return vece <= MO_32 || have_avx512vl;
79
80
default:
81
return 0;
79
--
82
--
80
2.25.1
83
2.25.1
81
84
82
85
diff view generated by jsdifflib
1
Split out a whole bunch of placeholder functions, which are
1
AVX512DQ has VPMULLQ.
2
currently identical. That won't last as more code gets moved.
3
2
4
Use CASE_32_64_VEC for some logical operators that previously
3
Tested-by: Alex Bennée <alex.bennee@linaro.org>
5
missed the addition of vectors.
6
7
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
8
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
9
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
---
6
---
11
tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
7
tcg/i386/tcg-target.c.inc | 12 ++++++------
12
1 file changed, 219 insertions(+), 52 deletions(-)
8
1 file changed, 6 insertions(+), 6 deletions(-)
13
9
14
diff --git a/tcg/optimize.c b/tcg/optimize.c
10
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
15
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
16
--- a/tcg/optimize.c
12
--- a/tcg/i386/tcg-target.c.inc
17
+++ b/tcg/optimize.c
13
+++ b/tcg/i386/tcg-target.c.inc
18
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
14
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
19
}
15
#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
20
}
16
#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
21
17
#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
22
+/*
18
+#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
23
+ * The fold_* functions return true when processing is complete,
19
#define OPC_POR (0xeb | P_EXT | P_DATA16)
24
+ * usually by folding the operation to a constant or to a copy,
20
#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
25
+ * and calling tcg_opt_gen_{mov,movi}. They may do other things,
21
#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
26
+ * like collect information about the value produced, for use in
22
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
27
+ * optimizing a subsequent operation.
23
OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
28
+ *
24
};
29
+ * These first fold_* functions are all helpers, used by other
25
static int const mul_insn[4] = {
30
+ * folders for more specific operations.
26
- OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
31
+ */
27
+ OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
32
+
28
};
33
+static bool fold_const1(OptContext *ctx, TCGOp *op)
29
static int const shift_imm_insn[4] = {
34
+{
30
OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
35
+ if (arg_is_const(op->args[1])) {
31
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
36
+ uint64_t t;
32
return 0;
37
+
33
38
+ t = arg_info(op->args[1])->val;
34
case INDEX_op_mul_vec:
39
+ t = do_constant_folding(op->opc, t, 0);
35
- if (vece == MO_8) {
40
+ return tcg_opt_gen_movi(ctx, op, op->args[0], t);
36
- /* We can expand the operation for MO_8. */
41
+ }
37
+ switch (vece) {
42
+ return false;
38
+ case MO_8:
43
+}
39
return -1;
44
+
40
- }
45
+static bool fold_const2(OptContext *ctx, TCGOp *op)
41
- if (vece == MO_64) {
46
+{
42
- return 0;
47
+ if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
43
+ case MO_64:
48
+ uint64_t t1 = arg_info(op->args[1])->val;
44
+ return have_avx512dq;
49
+ uint64_t t2 = arg_info(op->args[2])->val;
50
+
51
+ t1 = do_constant_folding(op->opc, t1, t2);
52
+ return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
53
+ }
54
+ return false;
55
+}
56
+
57
+/*
58
+ * These outermost fold_<op> functions are sorted alphabetically.
59
+ */
60
+
61
+static bool fold_add(OptContext *ctx, TCGOp *op)
62
+{
63
+ return fold_const2(ctx, op);
64
+}
65
+
66
+static bool fold_and(OptContext *ctx, TCGOp *op)
67
+{
68
+ return fold_const2(ctx, op);
69
+}
70
+
71
+static bool fold_andc(OptContext *ctx, TCGOp *op)
72
+{
73
+ return fold_const2(ctx, op);
74
+}
75
+
76
static bool fold_call(OptContext *ctx, TCGOp *op)
77
{
78
TCGContext *s = ctx->tcg;
79
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
80
return true;
81
}
82
83
+static bool fold_ctpop(OptContext *ctx, TCGOp *op)
84
+{
85
+ return fold_const1(ctx, op);
86
+}
87
+
88
+static bool fold_divide(OptContext *ctx, TCGOp *op)
89
+{
90
+ return fold_const2(ctx, op);
91
+}
92
+
93
+static bool fold_eqv(OptContext *ctx, TCGOp *op)
94
+{
95
+ return fold_const2(ctx, op);
96
+}
97
+
98
+static bool fold_exts(OptContext *ctx, TCGOp *op)
99
+{
100
+ return fold_const1(ctx, op);
101
+}
102
+
103
+static bool fold_extu(OptContext *ctx, TCGOp *op)
104
+{
105
+ return fold_const1(ctx, op);
106
+}
107
+
108
static bool fold_mb(OptContext *ctx, TCGOp *op)
109
{
110
/* Eliminate duplicate and redundant fence instructions. */
111
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
112
return true;
113
}
114
115
+static bool fold_mul(OptContext *ctx, TCGOp *op)
116
+{
117
+ return fold_const2(ctx, op);
118
+}
119
+
120
+static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
121
+{
122
+ return fold_const2(ctx, op);
123
+}
124
+
125
+static bool fold_nand(OptContext *ctx, TCGOp *op)
126
+{
127
+ return fold_const2(ctx, op);
128
+}
129
+
130
+static bool fold_neg(OptContext *ctx, TCGOp *op)
131
+{
132
+ return fold_const1(ctx, op);
133
+}
134
+
135
+static bool fold_nor(OptContext *ctx, TCGOp *op)
136
+{
137
+ return fold_const2(ctx, op);
138
+}
139
+
140
+static bool fold_not(OptContext *ctx, TCGOp *op)
141
+{
142
+ return fold_const1(ctx, op);
143
+}
144
+
145
+static bool fold_or(OptContext *ctx, TCGOp *op)
146
+{
147
+ return fold_const2(ctx, op);
148
+}
149
+
150
+static bool fold_orc(OptContext *ctx, TCGOp *op)
151
+{
152
+ return fold_const2(ctx, op);
153
+}
154
+
155
static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
156
{
157
/* Opcodes that touch guest memory stop the mb optimization. */
158
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
159
return false;
160
}
161
162
+static bool fold_remainder(OptContext *ctx, TCGOp *op)
163
+{
164
+ return fold_const2(ctx, op);
165
+}
166
+
167
+static bool fold_shift(OptContext *ctx, TCGOp *op)
168
+{
169
+ return fold_const2(ctx, op);
170
+}
171
+
172
+static bool fold_sub(OptContext *ctx, TCGOp *op)
173
+{
174
+ return fold_const2(ctx, op);
175
+}
176
+
177
+static bool fold_xor(OptContext *ctx, TCGOp *op)
178
+{
179
+ return fold_const2(ctx, op);
180
+}
181
+
182
/* Propagate constants and copies, fold constant expressions. */
183
void tcg_optimize(TCGContext *s)
184
{
185
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
186
}
187
break;
188
189
- CASE_OP_32_64(not):
190
- CASE_OP_32_64(neg):
191
- CASE_OP_32_64(ext8s):
192
- CASE_OP_32_64(ext8u):
193
- CASE_OP_32_64(ext16s):
194
- CASE_OP_32_64(ext16u):
195
- CASE_OP_32_64(ctpop):
196
- case INDEX_op_ext32s_i64:
197
- case INDEX_op_ext32u_i64:
198
- case INDEX_op_ext_i32_i64:
199
- case INDEX_op_extu_i32_i64:
200
- case INDEX_op_extrl_i64_i32:
201
- case INDEX_op_extrh_i64_i32:
202
- if (arg_is_const(op->args[1])) {
203
- tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
204
- tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
205
- continue;
206
- }
207
- break;
208
-
209
CASE_OP_32_64(bswap16):
210
CASE_OP_32_64(bswap32):
211
case INDEX_op_bswap64_i64:
212
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
213
}
214
break;
215
216
- CASE_OP_32_64(add):
217
- CASE_OP_32_64(sub):
218
- CASE_OP_32_64(mul):
219
- CASE_OP_32_64(or):
220
- CASE_OP_32_64(and):
221
- CASE_OP_32_64(xor):
222
- CASE_OP_32_64(shl):
223
- CASE_OP_32_64(shr):
224
- CASE_OP_32_64(sar):
225
- CASE_OP_32_64(rotl):
226
- CASE_OP_32_64(rotr):
227
- CASE_OP_32_64(andc):
228
- CASE_OP_32_64(orc):
229
- CASE_OP_32_64(eqv):
230
- CASE_OP_32_64(nand):
231
- CASE_OP_32_64(nor):
232
- CASE_OP_32_64(muluh):
233
- CASE_OP_32_64(mulsh):
234
- CASE_OP_32_64(div):
235
- CASE_OP_32_64(divu):
236
- CASE_OP_32_64(rem):
237
- CASE_OP_32_64(remu):
238
- if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
239
- tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
240
- arg_info(op->args[2])->val);
241
- tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
242
- continue;
243
- }
244
- break;
245
-
246
CASE_OP_32_64(clz):
247
CASE_OP_32_64(ctz):
248
if (arg_is_const(op->args[1])) {
249
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
250
}
251
break;
252
253
+ default:
254
+ break;
255
+
256
+ /* ---------------------------------------------------------- */
257
+ /* Sorted alphabetically by opcode as much as possible. */
258
+
259
+ CASE_OP_32_64_VEC(add):
260
+ done = fold_add(&ctx, op);
261
+ break;
262
+ CASE_OP_32_64_VEC(and):
263
+ done = fold_and(&ctx, op);
264
+ break;
265
+ CASE_OP_32_64_VEC(andc):
266
+ done = fold_andc(&ctx, op);
267
+ break;
268
+ CASE_OP_32_64(ctpop):
269
+ done = fold_ctpop(&ctx, op);
270
+ break;
271
+ CASE_OP_32_64(div):
272
+ CASE_OP_32_64(divu):
273
+ done = fold_divide(&ctx, op);
274
+ break;
275
+ CASE_OP_32_64(eqv):
276
+ done = fold_eqv(&ctx, op);
277
+ break;
278
+ CASE_OP_32_64(ext8s):
279
+ CASE_OP_32_64(ext16s):
280
+ case INDEX_op_ext32s_i64:
281
+ case INDEX_op_ext_i32_i64:
282
+ done = fold_exts(&ctx, op);
283
+ break;
284
+ CASE_OP_32_64(ext8u):
285
+ CASE_OP_32_64(ext16u):
286
+ case INDEX_op_ext32u_i64:
287
+ case INDEX_op_extu_i32_i64:
288
+ case INDEX_op_extrl_i64_i32:
289
+ case INDEX_op_extrh_i64_i32:
290
+ done = fold_extu(&ctx, op);
291
+ break;
292
case INDEX_op_mb:
293
done = fold_mb(&ctx, op);
294
break;
295
+ CASE_OP_32_64(mul):
296
+ done = fold_mul(&ctx, op);
297
+ break;
298
+ CASE_OP_32_64(mulsh):
299
+ CASE_OP_32_64(muluh):
300
+ done = fold_mul_highpart(&ctx, op);
301
+ break;
302
+ CASE_OP_32_64(nand):
303
+ done = fold_nand(&ctx, op);
304
+ break;
305
+ CASE_OP_32_64(neg):
306
+ done = fold_neg(&ctx, op);
307
+ break;
308
+ CASE_OP_32_64(nor):
309
+ done = fold_nor(&ctx, op);
310
+ break;
311
+ CASE_OP_32_64_VEC(not):
312
+ done = fold_not(&ctx, op);
313
+ break;
314
+ CASE_OP_32_64_VEC(or):
315
+ done = fold_or(&ctx, op);
316
+ break;
317
+ CASE_OP_32_64_VEC(orc):
318
+ done = fold_orc(&ctx, op);
319
+ break;
320
case INDEX_op_qemu_ld_i32:
321
case INDEX_op_qemu_ld_i64:
322
done = fold_qemu_ld(&ctx, op);
323
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
324
case INDEX_op_qemu_st_i64:
325
done = fold_qemu_st(&ctx, op);
326
break;
327
-
328
- default:
329
+ CASE_OP_32_64(rem):
330
+ CASE_OP_32_64(remu):
331
+ done = fold_remainder(&ctx, op);
332
+ break;
333
+ CASE_OP_32_64(rotl):
334
+ CASE_OP_32_64(rotr):
335
+ CASE_OP_32_64(sar):
336
+ CASE_OP_32_64(shl):
337
+ CASE_OP_32_64(shr):
338
+ done = fold_shift(&ctx, op);
339
+ break;
340
+ CASE_OP_32_64_VEC(sub):
341
+ done = fold_sub(&ctx, op);
342
+ break;
343
+ CASE_OP_32_64_VEC(xor):
344
+ done = fold_xor(&ctx, op);
345
break;
346
}
45
}
46
return 1;
347
47
348
--
48
--
349
2.25.1
49
2.25.1
350
50
351
51
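For readers following the fold_* dispatch above: CASE_OP_32_64 and
CASE_OP_32_64_VEC are case-label macros that expand one logical opcode
name into its i32/i64 (and, for the _VEC form, vector) opcodes. A rough
sketch of the idea, not the exact definitions from tcg/optimize.c, using
QEMU's glue() token-pasting helper:

    #define CASE_OP_32_64(x)                      \
            glue(glue(case INDEX_op_, x), _i32):  \
            glue(glue(case INDEX_op_, x), _i64)

    #define CASE_OP_32_64_VEC(x)                  \
            glue(glue(case INDEX_op_, x), _i32):  \
            glue(glue(case INDEX_op_, x), _i64):  \
            glue(glue(case INDEX_op_, x), _vec)

So "CASE_OP_32_64_VEC(add): done = fold_add(&ctx, op);" handles add_i32,
add_i64 and add_vec with a single folder, which is why switching the
logical operators to the _VEC form picks up the vector opcodes they
previously missed.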
1
Pull the "op r, 0, b => movi r, 0" optimization into a function,
1
AVX512VL has a general ternary logic operation, VPTERNLOGQ,
2
and use it in fold_shift.
2
which can implement NOT, ORC, NAND, NOR, EQV.
3
3
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
4
Tested-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
7
---
8
tcg/optimize.c | 28 ++++++++++------------------
8
tcg/i386/tcg-target.h | 10 +++++-----
9
1 file changed, 10 insertions(+), 18 deletions(-)
9
tcg/i386/tcg-target.c.inc | 34 ++++++++++++++++++++++++++++++++++
10
2 files changed, 39 insertions(+), 5 deletions(-)
10
11
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
12
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
12
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
14
--- a/tcg/i386/tcg-target.h
14
+++ b/tcg/optimize.c
15
+++ b/tcg/i386/tcg-target.h
15
@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
16
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
16
return false;
17
#define TCG_TARGET_HAS_v256 have_avx2
17
}
18
18
19
#define TCG_TARGET_HAS_andc_vec 1
19
+/* If the binary operation has first argument @i, fold to @i. */
20
-#define TCG_TARGET_HAS_orc_vec 0
20
+static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
21
-#define TCG_TARGET_HAS_nand_vec 0
21
+{
22
-#define TCG_TARGET_HAS_nor_vec 0
22
+ if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
23
-#define TCG_TARGET_HAS_eqv_vec 0
23
+ return tcg_opt_gen_movi(ctx, op, op->args[0], i);
24
-#define TCG_TARGET_HAS_not_vec 0
24
+ }
25
+#define TCG_TARGET_HAS_orc_vec have_avx512vl
25
+ return false;
26
+#define TCG_TARGET_HAS_nand_vec have_avx512vl
26
+}
27
+#define TCG_TARGET_HAS_nor_vec have_avx512vl
28
+#define TCG_TARGET_HAS_eqv_vec have_avx512vl
29
+#define TCG_TARGET_HAS_not_vec have_avx512vl
30
#define TCG_TARGET_HAS_neg_vec 0
31
#define TCG_TARGET_HAS_abs_vec 1
32
#define TCG_TARGET_HAS_roti_vec have_avx512vl
33
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
34
index XXXXXXX..XXXXXXX 100644
35
--- a/tcg/i386/tcg-target.c.inc
36
+++ b/tcg/i386/tcg-target.c.inc
37
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
38
#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
39
#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16)
40
#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
41
+#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
42
#define OPC_VZEROUPPER (0x77 | P_EXT)
43
#define OPC_XCHG_ax_r32    (0x90)
44
45
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
46
insn = vpshldi_insn[vece];
47
sub = args[3];
48
goto gen_simd_imm8;
27
+
49
+
28
/* If the binary operation has first argument @i, fold to NOT. */
50
+ case INDEX_op_not_vec:
29
static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
51
+ insn = OPC_VPTERNLOGQ;
30
{
52
+ a2 = a1;
31
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
53
+ sub = 0x33; /* !B */
32
static bool fold_shift(OptContext *ctx, TCGOp *op)
54
+ goto gen_simd_imm8;
33
{
55
+ case INDEX_op_nor_vec:
34
if (fold_const2(ctx, op) ||
56
+ insn = OPC_VPTERNLOGQ;
35
+ fold_ix_to_i(ctx, op, 0) ||
57
+ sub = 0x11; /* norCB */
36
fold_xi_to_x(ctx, op, 0)) {
58
+ goto gen_simd_imm8;
37
return true;
59
+ case INDEX_op_nand_vec:
38
}
60
+ insn = OPC_VPTERNLOGQ;
39
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
61
+ sub = 0x77; /* nandCB */
40
break;
62
+ goto gen_simd_imm8;
41
}
63
+ case INDEX_op_eqv_vec:
42
64
+ insn = OPC_VPTERNLOGQ;
43
- /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
65
+ sub = 0x99; /* xnorCB */
44
- and "sub r, 0, a => neg r, a" case. */
66
+ goto gen_simd_imm8;
45
- switch (opc) {
67
+ case INDEX_op_orc_vec:
46
- CASE_OP_32_64(shl):
68
+ insn = OPC_VPTERNLOGQ;
47
- CASE_OP_32_64(shr):
69
+ sub = 0xdd; /* orB!C */
48
- CASE_OP_32_64(sar):
70
+ goto gen_simd_imm8;
49
- CASE_OP_32_64(rotl):
71
+
50
- CASE_OP_32_64(rotr):
72
gen_simd_imm8:
51
- if (arg_is_const(op->args[1])
73
tcg_debug_assert(insn != OPC_UD2);
52
- && arg_info(op->args[1])->val == 0) {
74
if (type == TCG_TYPE_V256) {
53
- tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
75
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
54
- continue;
76
case INDEX_op_or_vec:
55
- }
77
case INDEX_op_xor_vec:
56
- break;
78
case INDEX_op_andc_vec:
57
- default:
79
+ case INDEX_op_orc_vec:
58
- break;
80
+ case INDEX_op_nand_vec:
59
- }
81
+ case INDEX_op_nor_vec:
60
-
82
+ case INDEX_op_eqv_vec:
61
/* Simplify using known-zero bits. Currently only ops with a single
83
case INDEX_op_ssadd_vec:
62
output argument is supported. */
84
case INDEX_op_usadd_vec:
63
z_mask = -1;
85
case INDEX_op_sssub_vec:
86
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
87
88
case INDEX_op_abs_vec:
89
case INDEX_op_dup_vec:
90
+ case INDEX_op_not_vec:
91
case INDEX_op_shli_vec:
92
case INDEX_op_shri_vec:
93
case INDEX_op_sari_vec:
94
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
95
case INDEX_op_or_vec:
96
case INDEX_op_xor_vec:
97
case INDEX_op_andc_vec:
98
+ case INDEX_op_orc_vec:
99
+ case INDEX_op_nand_vec:
100
+ case INDEX_op_nor_vec:
101
+ case INDEX_op_eqv_vec:
102
+ case INDEX_op_not_vec:
103
return 1;
104
case INDEX_op_cmp_vec:
105
case INDEX_op_cmpsel_vec:
64
--
106
--
65
2.25.1
107
2.25.1
66
108
67
109
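The VPTERNLOGQ immediates quoted in the comments above (0x33 for !B,
0x11 for nor(C,B), 0x77 for nand(C,B), 0x99 for xnor(C,B), 0xdd for
B|!C) are 8-entry truth tables: evaluating the desired expression on the
conventional bit patterns B = 0xcc and C = 0xaa (with A = 0xf0 for
three-input forms) yields the immediate directly. A standalone sketch,
not QEMU code, that reproduces the constants:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Canonical truth-table inputs: A = 0xf0, B = 0xcc, C = 0xaa. */
        const uint8_t B = 0xcc, C = 0xaa;

        printf("!B        -> 0x%02x\n", (uint8_t)~B);        /* 0x33 */
        printf("nor(C,B)  -> 0x%02x\n", (uint8_t)~(C | B));  /* 0x11 */
        printf("nand(C,B) -> 0x%02x\n", (uint8_t)~(C & B));  /* 0x77 */
        printf("xnor(C,B) -> 0x%02x\n", (uint8_t)~(C ^ B));  /* 0x99 */
        printf("B | !C    -> 0x%02x\n", (uint8_t)(B | ~C));  /* 0xdd */
        return 0;
    }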
1
Pull the "op r, a, i => mov r, a" optimization into a function,
1
The general ternary logic operation can implement BITSEL.
2
and use it in the outermost logical operations.
2
Funnel the 4-operand operation into three variants of the
3
3-operand instruction, depending on input operand overlap.
3
4
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
5
Tested-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
8
---
7
tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
9
tcg/i386/tcg-target.h | 2 +-
8
1 file changed, 26 insertions(+), 35 deletions(-)
10
tcg/i386/tcg-target.c.inc | 20 +++++++++++++++++++-
11
2 files changed, 20 insertions(+), 2 deletions(-)
9
12
10
diff --git a/tcg/optimize.c b/tcg/optimize.c
13
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
11
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/optimize.c
15
--- a/tcg/i386/tcg-target.h
13
+++ b/tcg/optimize.c
16
+++ b/tcg/i386/tcg-target.h
14
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
17
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
15
return false;
18
#define TCG_TARGET_HAS_mul_vec 1
16
}
19
#define TCG_TARGET_HAS_sat_vec 1
17
20
#define TCG_TARGET_HAS_minmax_vec 1
18
+/* If the binary operation has second argument @i, fold to identity. */
21
-#define TCG_TARGET_HAS_bitsel_vec 0
19
+static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
22
+#define TCG_TARGET_HAS_bitsel_vec have_avx512vl
20
+{
23
#define TCG_TARGET_HAS_cmpsel_vec -1
21
+ if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
24
22
+ return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
25
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
23
+ }
26
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
24
+ return false;
27
index XXXXXXX..XXXXXXX 100644
25
+}
28
--- a/tcg/i386/tcg-target.c.inc
29
+++ b/tcg/i386/tcg-target.c.inc
30
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
31
32
TCGType type = vecl + TCG_TYPE_V64;
33
int insn, sub;
34
- TCGArg a0, a1, a2;
35
+ TCGArg a0, a1, a2, a3;
36
37
a0 = args[0];
38
a1 = args[1];
39
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
40
sub = 0xdd; /* orB!C */
41
goto gen_simd_imm8;
42
43
+ case INDEX_op_bitsel_vec:
44
+ insn = OPC_VPTERNLOGQ;
45
+ a3 = args[3];
46
+ if (a0 == a1) {
47
+ a1 = a2;
48
+ a2 = a3;
49
+ sub = 0xca; /* A?B:C */
50
+ } else if (a0 == a2) {
51
+ a2 = a3;
52
+ sub = 0xe2; /* B?A:C */
53
+ } else {
54
+ tcg_out_mov(s, type, a0, a3);
55
+ sub = 0xb8; /* B?C:A */
56
+ }
57
+ goto gen_simd_imm8;
26
+
58
+
27
/* If the binary operation has second argument @i, fold to NOT. */
59
gen_simd_imm8:
28
static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
60
tcg_debug_assert(insn != OPC_UD2);
29
{
61
if (type == TCG_TYPE_V256) {
30
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
62
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
31
63
case INDEX_op_x86_vpshrdv_vec:
32
static bool fold_add(OptContext *ctx, TCGOp *op)
64
return C_O1_I3(x, 0, x, x);
33
{
65
34
- return fold_const2(ctx, op);
66
+ case INDEX_op_bitsel_vec:
35
+ if (fold_const2(ctx, op) ||
67
case INDEX_op_x86_vpblendvb_vec:
36
+ fold_xi_to_x(ctx, op, 0)) {
68
return C_O1_I3(x, x, x, x);
37
+ return true;
69
38
+ }
70
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
39
+ return false;
71
case INDEX_op_nor_vec:
40
}
72
case INDEX_op_eqv_vec:
41
73
case INDEX_op_not_vec:
42
static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
74
+ case INDEX_op_bitsel_vec:
43
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
75
return 1;
44
{
76
case INDEX_op_cmp_vec:
45
if (fold_const2(ctx, op) ||
77
case INDEX_op_cmpsel_vec:
46
fold_xi_to_i(ctx, op, 0) ||
47
+ fold_xi_to_x(ctx, op, -1) ||
48
fold_xx_to_x(ctx, op)) {
49
return true;
50
}
51
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
52
{
53
if (fold_const2(ctx, op) ||
54
fold_xx_to_i(ctx, op, 0) ||
55
+ fold_xi_to_x(ctx, op, 0) ||
56
fold_ix_to_not(ctx, op, -1)) {
57
return true;
58
}
59
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
60
static bool fold_eqv(OptContext *ctx, TCGOp *op)
61
{
62
if (fold_const2(ctx, op) ||
63
+ fold_xi_to_x(ctx, op, -1) ||
64
fold_xi_to_not(ctx, op, 0)) {
65
return true;
66
}
67
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
68
static bool fold_or(OptContext *ctx, TCGOp *op)
69
{
70
if (fold_const2(ctx, op) ||
71
+ fold_xi_to_x(ctx, op, 0) ||
72
fold_xx_to_x(ctx, op)) {
73
return true;
74
}
75
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
76
static bool fold_orc(OptContext *ctx, TCGOp *op)
77
{
78
if (fold_const2(ctx, op) ||
79
+ fold_xi_to_x(ctx, op, -1) ||
80
fold_ix_to_not(ctx, op, 0)) {
81
return true;
82
}
83
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
84
85
static bool fold_shift(OptContext *ctx, TCGOp *op)
86
{
87
- return fold_const2(ctx, op);
88
+ if (fold_const2(ctx, op) ||
89
+ fold_xi_to_x(ctx, op, 0)) {
90
+ return true;
91
+ }
92
+ return false;
93
}
94
95
static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
96
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
97
{
98
if (fold_const2(ctx, op) ||
99
fold_xx_to_i(ctx, op, 0) ||
100
+ fold_xi_to_x(ctx, op, 0) ||
101
fold_sub_to_neg(ctx, op)) {
102
return true;
103
}
104
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
105
{
106
if (fold_const2(ctx, op) ||
107
fold_xx_to_i(ctx, op, 0) ||
108
+ fold_xi_to_x(ctx, op, 0) ||
109
fold_xi_to_not(ctx, op, -1)) {
110
return true;
111
}
112
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
113
break;
114
}
115
116
- /* Simplify expression for "op r, a, const => mov r, a" cases */
117
- switch (opc) {
118
- CASE_OP_32_64_VEC(add):
119
- CASE_OP_32_64_VEC(sub):
120
- CASE_OP_32_64_VEC(or):
121
- CASE_OP_32_64_VEC(xor):
122
- CASE_OP_32_64_VEC(andc):
123
- CASE_OP_32_64(shl):
124
- CASE_OP_32_64(shr):
125
- CASE_OP_32_64(sar):
126
- CASE_OP_32_64(rotl):
127
- CASE_OP_32_64(rotr):
128
- if (!arg_is_const(op->args[1])
129
- && arg_is_const(op->args[2])
130
- && arg_info(op->args[2])->val == 0) {
131
- tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
132
- continue;
133
- }
134
- break;
135
- CASE_OP_32_64_VEC(and):
136
- CASE_OP_32_64_VEC(orc):
137
- CASE_OP_32_64(eqv):
138
- if (!arg_is_const(op->args[1])
139
- && arg_is_const(op->args[2])
140
- && arg_info(op->args[2])->val == -1) {
141
- tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
142
- continue;
143
- }
144
- break;
145
- default:
146
- break;
147
- }
148
-
149
/* Simplify using known-zero bits. Currently only ops with a single
150
output argument is supported. */
151
z_mask = -1;
152
--
78
--
153
2.25.1
79
2.25.1
154
80
155
81
diff view generated by jsdifflib
1
From: Luis Pires <luis.pires@eldorado.org.br>
1
Define as 0 for all tcg hosts. Put this in a separate header,
2
because we'll want this in places that do not ordinarily have
3
access to all of tcg/tcg.h.
2
4
3
Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
5
Reviewed-by: WANG Xuerui <git@xen0n.name>
4
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
7
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
8
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
10
---
8
tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
11
tcg/aarch64/tcg-target-sa32.h | 1 +
9
tests/unit/meson.build | 1 +
12
tcg/arm/tcg-target-sa32.h | 1 +
10
2 files changed, 198 insertions(+)
13
tcg/i386/tcg-target-sa32.h | 1 +
11
create mode 100644 tests/unit/test-div128.c
14
tcg/loongarch64/tcg-target-sa32.h | 1 +
15
tcg/mips/tcg-target-sa32.h | 1 +
16
tcg/ppc/tcg-target-sa32.h | 1 +
17
tcg/riscv/tcg-target-sa32.h | 1 +
18
tcg/s390x/tcg-target-sa32.h | 1 +
19
tcg/sparc/tcg-target-sa32.h | 1 +
20
tcg/tci/tcg-target-sa32.h | 1 +
21
tcg/tcg.c | 4 ++++
22
11 files changed, 14 insertions(+)
23
create mode 100644 tcg/aarch64/tcg-target-sa32.h
24
create mode 100644 tcg/arm/tcg-target-sa32.h
25
create mode 100644 tcg/i386/tcg-target-sa32.h
26
create mode 100644 tcg/loongarch64/tcg-target-sa32.h
27
create mode 100644 tcg/mips/tcg-target-sa32.h
28
create mode 100644 tcg/ppc/tcg-target-sa32.h
29
create mode 100644 tcg/riscv/tcg-target-sa32.h
30
create mode 100644 tcg/s390x/tcg-target-sa32.h
31
create mode 100644 tcg/sparc/tcg-target-sa32.h
32
create mode 100644 tcg/tci/tcg-target-sa32.h
12
33
13
diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
34
diff --git a/tcg/aarch64/tcg-target-sa32.h b/tcg/aarch64/tcg-target-sa32.h
14
new file mode 100644
35
new file mode 100644
15
index XXXXXXX..XXXXXXX
36
index XXXXXXX..XXXXXXX
16
--- /dev/null
37
--- /dev/null
17
+++ b/tests/unit/test-div128.c
38
+++ b/tcg/aarch64/tcg-target-sa32.h
39
@@ -0,0 +1 @@
40
+#define TCG_TARGET_SIGNED_ADDR32 0
41
diff --git a/tcg/arm/tcg-target-sa32.h b/tcg/arm/tcg-target-sa32.h
42
new file mode 100644
43
index XXXXXXX..XXXXXXX
44
--- /dev/null
45
+++ b/tcg/arm/tcg-target-sa32.h
46
@@ -0,0 +1 @@
47
+#define TCG_TARGET_SIGNED_ADDR32 0
48
diff --git a/tcg/i386/tcg-target-sa32.h b/tcg/i386/tcg-target-sa32.h
49
new file mode 100644
50
index XXXXXXX..XXXXXXX
51
--- /dev/null
52
+++ b/tcg/i386/tcg-target-sa32.h
53
@@ -0,0 +1 @@
54
+#define TCG_TARGET_SIGNED_ADDR32 0
55
diff --git a/tcg/loongarch64/tcg-target-sa32.h b/tcg/loongarch64/tcg-target-sa32.h
56
new file mode 100644
57
index XXXXXXX..XXXXXXX
58
--- /dev/null
59
+++ b/tcg/loongarch64/tcg-target-sa32.h
60
@@ -0,0 +1 @@
61
+#define TCG_TARGET_SIGNED_ADDR32 0
62
diff --git a/tcg/mips/tcg-target-sa32.h b/tcg/mips/tcg-target-sa32.h
63
new file mode 100644
64
index XXXXXXX..XXXXXXX
65
--- /dev/null
66
+++ b/tcg/mips/tcg-target-sa32.h
67
@@ -0,0 +1 @@
68
+#define TCG_TARGET_SIGNED_ADDR32 0
69
diff --git a/tcg/ppc/tcg-target-sa32.h b/tcg/ppc/tcg-target-sa32.h
70
new file mode 100644
71
index XXXXXXX..XXXXXXX
72
--- /dev/null
73
+++ b/tcg/ppc/tcg-target-sa32.h
74
@@ -0,0 +1 @@
75
+#define TCG_TARGET_SIGNED_ADDR32 0
76
diff --git a/tcg/riscv/tcg-target-sa32.h b/tcg/riscv/tcg-target-sa32.h
77
new file mode 100644
78
index XXXXXXX..XXXXXXX
79
--- /dev/null
80
+++ b/tcg/riscv/tcg-target-sa32.h
81
@@ -0,0 +1 @@
82
+#define TCG_TARGET_SIGNED_ADDR32 0
83
diff --git a/tcg/s390x/tcg-target-sa32.h b/tcg/s390x/tcg-target-sa32.h
84
new file mode 100644
85
index XXXXXXX..XXXXXXX
86
--- /dev/null
87
+++ b/tcg/s390x/tcg-target-sa32.h
88
@@ -0,0 +1 @@
89
+#define TCG_TARGET_SIGNED_ADDR32 0
90
diff --git a/tcg/sparc/tcg-target-sa32.h b/tcg/sparc/tcg-target-sa32.h
91
new file mode 100644
92
index XXXXXXX..XXXXXXX
93
--- /dev/null
94
+++ b/tcg/sparc/tcg-target-sa32.h
95
@@ -0,0 +1 @@
96
+#define TCG_TARGET_SIGNED_ADDR32 0
97
diff --git a/tcg/tci/tcg-target-sa32.h b/tcg/tci/tcg-target-sa32.h
98
new file mode 100644
99
index XXXXXXX..XXXXXXX
100
--- /dev/null
101
+++ b/tcg/tci/tcg-target-sa32.h
102
@@ -0,0 +1 @@
103
+#define TCG_TARGET_SIGNED_ADDR32 0
104
diff --git a/tcg/tcg.c b/tcg/tcg.c
105
index XXXXXXX..XXXXXXX 100644
106
--- a/tcg/tcg.c
107
+++ b/tcg/tcg.c
18
@@ -XXX,XX +XXX,XX @@
108
@@ -XXX,XX +XXX,XX @@
19
+/*
109
#include "exec/log.h"
20
+ * Test 128-bit division functions
110
#include "tcg/tcg-ldst.h"
21
+ *
111
#include "tcg-internal.h"
22
+ * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
112
+#include "tcg-target-sa32.h"
23
+ *
24
+ * This library is free software; you can redistribute it and/or
25
+ * modify it under the terms of the GNU Lesser General Public
26
+ * License as published by the Free Software Foundation; either
27
+ * version 2.1 of the License, or (at your option) any later version.
28
+ *
29
+ * This library is distributed in the hope that it will be useful,
30
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
32
+ * Lesser General Public License for more details.
33
+ *
34
+ * You should have received a copy of the GNU Lesser General Public
35
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
36
+ */
37
+
113
+
38
+#include "qemu/osdep.h"
114
+/* Sanity check for TCG_TARGET_SIGNED_ADDR32. */
39
+#include "qemu/host-utils.h"
115
+QEMU_BUILD_BUG_ON(TCG_TARGET_REG_BITS == 32 && TCG_TARGET_SIGNED_ADDR32);
40
+
116
41
+typedef struct {
117
#ifdef CONFIG_TCG_INTERPRETER
42
+ uint64_t high;
118
#include <ffi.h>
43
+ uint64_t low;
44
+ uint64_t rhigh;
45
+ uint64_t rlow;
46
+ uint64_t divisor;
47
+ uint64_t remainder;
48
+} test_data_unsigned;
49
+
50
+typedef struct {
51
+ int64_t high;
52
+ uint64_t low;
53
+ int64_t rhigh;
54
+ uint64_t rlow;
55
+ int64_t divisor;
56
+ int64_t remainder;
57
+} test_data_signed;
58
+
59
+static const test_data_unsigned test_table_unsigned[] = {
60
+ /* Dividend fits in 64 bits */
61
+ { 0x0000000000000000ULL, 0x0000000000000000ULL,
62
+ 0x0000000000000000ULL, 0x0000000000000000ULL,
63
+ 0x0000000000000001ULL, 0x0000000000000000ULL},
64
+ { 0x0000000000000000ULL, 0x0000000000000001ULL,
65
+ 0x0000000000000000ULL, 0x0000000000000001ULL,
66
+ 0x0000000000000001ULL, 0x0000000000000000ULL},
67
+ { 0x0000000000000000ULL, 0x0000000000000003ULL,
68
+ 0x0000000000000000ULL, 0x0000000000000001ULL,
69
+ 0x0000000000000002ULL, 0x0000000000000001ULL},
70
+ { 0x0000000000000000ULL, 0x8000000000000000ULL,
71
+ 0x0000000000000000ULL, 0x8000000000000000ULL,
72
+ 0x0000000000000001ULL, 0x0000000000000000ULL},
73
+ { 0x0000000000000000ULL, 0xa000000000000000ULL,
74
+ 0x0000000000000000ULL, 0x0000000000000002ULL,
75
+ 0x4000000000000000ULL, 0x2000000000000000ULL},
76
+ { 0x0000000000000000ULL, 0x8000000000000000ULL,
77
+ 0x0000000000000000ULL, 0x0000000000000001ULL,
78
+ 0x8000000000000000ULL, 0x0000000000000000ULL},
79
+
80
+ /* Dividend > 64 bits, with MSB 0 */
81
+ { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
82
+ 0x123456789abcdefeULL, 0xefedcba987654321ULL,
83
+ 0x0000000000000001ULL, 0x0000000000000000ULL},
84
+ { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
85
+ 0x0000000000000001ULL, 0x000000000000000dULL,
86
+ 0x123456789abcdefeULL, 0x03456789abcdf03bULL},
87
+ { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
88
+ 0x0123456789abcdefULL, 0xeefedcba98765432ULL,
89
+ 0x0000000000000010ULL, 0x0000000000000001ULL},
90
+
91
+ /* Dividend > 64 bits, with MSB 1 */
92
+ { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
93
+ 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
94
+ 0x0000000000000001ULL, 0x0000000000000000ULL},
95
+ { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
96
+ 0x0000000000000001ULL, 0x0000000000000000ULL,
97
+ 0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
98
+ { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
99
+ 0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
100
+ 0x0000000000000010ULL, 0x000000000000000fULL},
101
+ { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
102
+ 0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
103
+ 0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
104
+
105
+ /**
106
+ * Divisor == 64 bits, with MSB 1
107
+ * and high 64 bits of dividend >= divisor
108
+ * (for testing normalization)
109
+ */
110
+ { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
111
+ 0x0000000000000001ULL, 0x0000000000000000ULL,
112
+ 0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
113
+ { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
114
+ 0x0000000000000001ULL, 0xfddbb9977553310aULL,
115
+ 0x8000000000000001ULL, 0x78899aabbccddf05ULL},
116
+
117
+ /* Dividend > 64 bits, divisor almost as big */
118
+ { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
119
+ 0x0000000000000000ULL, 0x000000000000000fULL,
120
+ 0x123456789abcdefeULL, 0x123456789abcde1fULL},
121
+};
122
+
123
+static const test_data_signed test_table_signed[] = {
124
+ /* Positive dividend, positive/negative divisors */
125
+ { 0x0000000000000000LL, 0x0000000000bc614eULL,
126
+ 0x0000000000000000LL, 0x0000000000bc614eULL,
127
+ 0x0000000000000001LL, 0x0000000000000000LL},
128
+ { 0x0000000000000000LL, 0x0000000000bc614eULL,
129
+ 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
130
+ 0xffffffffffffffffLL, 0x0000000000000000LL},
131
+ { 0x0000000000000000LL, 0x0000000000bc614eULL,
132
+ 0x0000000000000000LL, 0x00000000005e30a7ULL,
133
+ 0x0000000000000002LL, 0x0000000000000000LL},
134
+ { 0x0000000000000000LL, 0x0000000000bc614eULL,
135
+ 0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
136
+ 0xfffffffffffffffeLL, 0x0000000000000000LL},
137
+ { 0x0000000000000000LL, 0x0000000000bc614eULL,
138
+ 0x0000000000000000LL, 0x0000000000178c29ULL,
139
+ 0x0000000000000008LL, 0x0000000000000006LL},
140
+ { 0x0000000000000000LL, 0x0000000000bc614eULL,
141
+ 0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
142
+ 0xfffffffffffffff8LL, 0x0000000000000006LL},
143
+ { 0x0000000000000000LL, 0x0000000000bc614eULL,
144
+ 0x0000000000000000LL, 0x000000000000550dULL,
145
+ 0x0000000000000237LL, 0x0000000000000183LL},
146
+ { 0x0000000000000000LL, 0x0000000000bc614eULL,
147
+ 0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
148
+ 0xfffffffffffffdc9LL, 0x0000000000000183LL},
149
+
150
+ /* Negative dividend, positive/negative divisors */
151
+ { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
152
+ 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
153
+ 0x0000000000000001LL, 0x0000000000000000LL},
154
+ { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
155
+ 0x0000000000000000LL, 0x0000000000bc614eULL,
156
+ 0xffffffffffffffffLL, 0x0000000000000000LL},
157
+ { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
158
+ 0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
159
+ 0x0000000000000002LL, 0x0000000000000000LL},
160
+ { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
161
+ 0x0000000000000000LL, 0x00000000005e30a7ULL,
162
+ 0xfffffffffffffffeLL, 0x0000000000000000LL},
163
+ { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
164
+ 0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
165
+ 0x0000000000000008LL, 0xfffffffffffffffaLL},
166
+ { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
167
+ 0x0000000000000000LL, 0x0000000000178c29ULL,
168
+ 0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
169
+ { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
170
+ 0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
171
+ 0x0000000000000237LL, 0xfffffffffffffe7dLL},
172
+ { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
173
+ 0x0000000000000000LL, 0x000000000000550dULL,
174
+ 0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
175
+};
176
+
177
+static void test_divu128(void)
178
+{
179
+ int i;
180
+ uint64_t rem;
181
+ test_data_unsigned tmp;
182
+
183
+ for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) {
184
+ tmp = test_table_unsigned[i];
185
+
186
+ rem = divu128(&tmp.low, &tmp.high, tmp.divisor);
187
+ g_assert_cmpuint(tmp.low, ==, tmp.rlow);
188
+ g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
189
+ g_assert_cmpuint(rem, ==, tmp.remainder);
190
+ }
191
+}
192
+
193
+static void test_divs128(void)
194
+{
195
+ int i;
196
+ int64_t rem;
197
+ test_data_signed tmp;
198
+
199
+ for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
200
+ tmp = test_table_signed[i];
201
+
202
+ rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
203
+ g_assert_cmpuint(tmp.low, ==, tmp.rlow);
204
+ g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
205
+ g_assert_cmpuint(rem, ==, tmp.remainder);
206
+ }
207
+}
208
+
209
+int main(int argc, char **argv)
210
+{
211
+ g_test_init(&argc, &argv, NULL);
212
+ g_test_add_func("/host-utils/test_divu128", test_divu128);
213
+ g_test_add_func("/host-utils/test_divs128", test_divs128);
214
+ return g_test_run();
215
+}
216
diff --git a/tests/unit/meson.build b/tests/unit/meson.build
217
index XXXXXXX..XXXXXXX 100644
218
--- a/tests/unit/meson.build
219
+++ b/tests/unit/meson.build
220
@@ -XXX,XX +XXX,XX @@ tests = {
221
# all code tested by test-x86-cpuid is inside topology.h
222
'test-x86-cpuid': [],
223
'test-cutils': [],
224
+ 'test-div128': [],
225
'test-shift128': [],
226
'test-mul64': [],
227
# all code tested by test-int128 is inside int128.h
228
--
119
--
229
2.25.1
120
2.25.1
230
121
231
122
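As exercised by the test-div128 unit test above, divu128()/divs128()
divide the full 128-bit value held in {*phigh, *plow} by a 64-bit
divisor, leave the 128-bit quotient in place and return the remainder.
A minimal usage sketch mirroring one entry of the unsigned table,
assuming the same prototypes the test pulls in from qemu/host-utils.h:

    #include "qemu/osdep.h"
    #include "qemu/host-utils.h"

    static void divu128_example(void)
    {
        /* 0x0000000000000000_a000000000000000 / 0x4000000000000000 */
        uint64_t lo = 0xa000000000000000ULL;
        uint64_t hi = 0x0000000000000000ULL;
        uint64_t rem = divu128(&lo, &hi, 0x4000000000000000ULL);

        g_assert_cmpuint(lo, ==, 2);                       /* quotient, low half  */
        g_assert_cmpuint(hi, ==, 0);                       /* quotient, high half */
        g_assert_cmpuint(rem, ==, 0x2000000000000000ULL);  /* remainder */
    }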
Deleted patch
1
Prepare for tracking different masks by renaming this one.
2
1
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
9
1 file changed, 72 insertions(+), 70 deletions(-)
10
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
12
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
14
+++ b/tcg/optimize.c
15
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
16
TCGTemp *prev_copy;
17
TCGTemp *next_copy;
18
uint64_t val;
19
- uint64_t mask;
20
+ uint64_t z_mask; /* mask bit is 0 if and only if value bit is 0 */
21
} TempOptInfo;
22
23
static inline TempOptInfo *ts_info(TCGTemp *ts)
24
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
25
ti->next_copy = ts;
26
ti->prev_copy = ts;
27
ti->is_const = false;
28
- ti->mask = -1;
29
+ ti->z_mask = -1;
30
}
31
32
static void reset_temp(TCGArg arg)
33
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
34
if (ts->kind == TEMP_CONST) {
35
ti->is_const = true;
36
ti->val = ts->val;
37
- ti->mask = ts->val;
38
+ ti->z_mask = ts->val;
39
if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
40
/* High bits of a 32-bit quantity are garbage. */
41
- ti->mask |= ~0xffffffffull;
42
+ ti->z_mask |= ~0xffffffffull;
43
}
44
} else {
45
ti->is_const = false;
46
- ti->mask = -1;
47
+ ti->z_mask = -1;
48
}
49
}
50
51
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
52
const TCGOpDef *def;
53
TempOptInfo *di;
54
TempOptInfo *si;
55
- uint64_t mask;
56
+ uint64_t z_mask;
57
TCGOpcode new_op;
58
59
if (ts_are_copies(dst_ts, src_ts)) {
60
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
61
op->args[0] = dst;
62
op->args[1] = src;
63
64
- mask = si->mask;
65
+ z_mask = si->z_mask;
66
if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
67
/* High bits of the destination are now garbage. */
68
- mask |= ~0xffffffffull;
69
+ z_mask |= ~0xffffffffull;
70
}
71
- di->mask = mask;
72
+ di->z_mask = z_mask;
73
74
if (src_ts->type == dst_ts->type) {
75
TempOptInfo *ni = ts_info(si->next_copy);
76
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
77
}
78
79
QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
80
- uint64_t mask, partmask, affected, tmp;
81
+ uint64_t z_mask, partmask, affected, tmp;
82
int nb_oargs, nb_iargs;
83
TCGOpcode opc = op->opc;
84
const TCGOpDef *def = &tcg_op_defs[opc];
85
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
86
87
/* Simplify using known-zero bits. Currently only ops with a single
88
output argument is supported. */
89
- mask = -1;
90
+ z_mask = -1;
91
affected = -1;
92
switch (opc) {
93
CASE_OP_32_64(ext8s):
94
- if ((arg_info(op->args[1])->mask & 0x80) != 0) {
95
+ if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
96
break;
97
}
98
QEMU_FALLTHROUGH;
99
CASE_OP_32_64(ext8u):
100
- mask = 0xff;
101
+ z_mask = 0xff;
102
goto and_const;
103
CASE_OP_32_64(ext16s):
104
- if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
105
+ if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
106
break;
107
}
108
QEMU_FALLTHROUGH;
109
CASE_OP_32_64(ext16u):
110
- mask = 0xffff;
111
+ z_mask = 0xffff;
112
goto and_const;
113
case INDEX_op_ext32s_i64:
114
- if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
115
+ if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
116
break;
117
}
118
QEMU_FALLTHROUGH;
119
case INDEX_op_ext32u_i64:
120
- mask = 0xffffffffU;
121
+ z_mask = 0xffffffffU;
122
goto and_const;
123
124
CASE_OP_32_64(and):
125
- mask = arg_info(op->args[2])->mask;
126
+ z_mask = arg_info(op->args[2])->z_mask;
127
if (arg_is_const(op->args[2])) {
128
and_const:
129
- affected = arg_info(op->args[1])->mask & ~mask;
130
+ affected = arg_info(op->args[1])->z_mask & ~z_mask;
131
}
132
- mask = arg_info(op->args[1])->mask & mask;
133
+ z_mask = arg_info(op->args[1])->z_mask & z_mask;
134
break;
135
136
case INDEX_op_ext_i32_i64:
137
- if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
138
+ if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
139
break;
140
}
141
QEMU_FALLTHROUGH;
142
case INDEX_op_extu_i32_i64:
143
/* We do not compute affected as it is a size changing op. */
144
- mask = (uint32_t)arg_info(op->args[1])->mask;
145
+ z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
146
break;
147
148
CASE_OP_32_64(andc):
149
/* Known-zeros does not imply known-ones. Therefore unless
150
op->args[2] is constant, we can't infer anything from it. */
151
if (arg_is_const(op->args[2])) {
152
- mask = ~arg_info(op->args[2])->mask;
153
+ z_mask = ~arg_info(op->args[2])->z_mask;
154
goto and_const;
155
}
156
/* But we certainly know nothing outside args[1] may be set. */
157
- mask = arg_info(op->args[1])->mask;
158
+ z_mask = arg_info(op->args[1])->z_mask;
159
break;
160
161
case INDEX_op_sar_i32:
162
if (arg_is_const(op->args[2])) {
163
tmp = arg_info(op->args[2])->val & 31;
164
- mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
165
+ z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
166
}
167
break;
168
case INDEX_op_sar_i64:
169
if (arg_is_const(op->args[2])) {
170
tmp = arg_info(op->args[2])->val & 63;
171
- mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
172
+ z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
173
}
174
break;
175
176
case INDEX_op_shr_i32:
177
if (arg_is_const(op->args[2])) {
178
tmp = arg_info(op->args[2])->val & 31;
179
- mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
180
+ z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
181
}
182
break;
183
case INDEX_op_shr_i64:
184
if (arg_is_const(op->args[2])) {
185
tmp = arg_info(op->args[2])->val & 63;
186
- mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
187
+ z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
188
}
189
break;
190
191
case INDEX_op_extrl_i64_i32:
192
- mask = (uint32_t)arg_info(op->args[1])->mask;
193
+ z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
194
break;
195
case INDEX_op_extrh_i64_i32:
196
- mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
197
+ z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
198
break;
199
200
CASE_OP_32_64(shl):
201
if (arg_is_const(op->args[2])) {
202
tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
203
- mask = arg_info(op->args[1])->mask << tmp;
204
+ z_mask = arg_info(op->args[1])->z_mask << tmp;
205
}
206
break;
207
208
CASE_OP_32_64(neg):
209
/* Set to 1 all bits to the left of the rightmost. */
210
- mask = -(arg_info(op->args[1])->mask
211
- & -arg_info(op->args[1])->mask);
212
+ z_mask = -(arg_info(op->args[1])->z_mask
213
+ & -arg_info(op->args[1])->z_mask);
214
break;
215
216
CASE_OP_32_64(deposit):
217
- mask = deposit64(arg_info(op->args[1])->mask,
218
- op->args[3], op->args[4],
219
- arg_info(op->args[2])->mask);
220
+ z_mask = deposit64(arg_info(op->args[1])->z_mask,
221
+ op->args[3], op->args[4],
222
+ arg_info(op->args[2])->z_mask);
223
break;
224
225
CASE_OP_32_64(extract):
226
- mask = extract64(arg_info(op->args[1])->mask,
227
- op->args[2], op->args[3]);
228
+ z_mask = extract64(arg_info(op->args[1])->z_mask,
229
+ op->args[2], op->args[3]);
230
if (op->args[2] == 0) {
231
- affected = arg_info(op->args[1])->mask & ~mask;
232
+ affected = arg_info(op->args[1])->z_mask & ~z_mask;
233
}
234
break;
235
CASE_OP_32_64(sextract):
236
- mask = sextract64(arg_info(op->args[1])->mask,
237
- op->args[2], op->args[3]);
238
- if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
239
- affected = arg_info(op->args[1])->mask & ~mask;
240
+ z_mask = sextract64(arg_info(op->args[1])->z_mask,
241
+ op->args[2], op->args[3]);
242
+ if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
243
+ affected = arg_info(op->args[1])->z_mask & ~z_mask;
244
}
245
break;
246
247
CASE_OP_32_64(or):
248
CASE_OP_32_64(xor):
249
- mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
250
+ z_mask = arg_info(op->args[1])->z_mask
251
+ | arg_info(op->args[2])->z_mask;
252
break;
253
254
case INDEX_op_clz_i32:
255
case INDEX_op_ctz_i32:
256
- mask = arg_info(op->args[2])->mask | 31;
257
+ z_mask = arg_info(op->args[2])->z_mask | 31;
258
break;
259
260
case INDEX_op_clz_i64:
261
case INDEX_op_ctz_i64:
262
- mask = arg_info(op->args[2])->mask | 63;
263
+ z_mask = arg_info(op->args[2])->z_mask | 63;
264
break;
265
266
case INDEX_op_ctpop_i32:
267
- mask = 32 | 31;
268
+ z_mask = 32 | 31;
269
break;
270
case INDEX_op_ctpop_i64:
271
- mask = 64 | 63;
272
+ z_mask = 64 | 63;
273
break;
274
275
CASE_OP_32_64(setcond):
276
case INDEX_op_setcond2_i32:
277
- mask = 1;
278
+ z_mask = 1;
279
break;
280
281
CASE_OP_32_64(movcond):
282
- mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
283
+ z_mask = arg_info(op->args[3])->z_mask
284
+ | arg_info(op->args[4])->z_mask;
285
break;
286
287
CASE_OP_32_64(ld8u):
288
- mask = 0xff;
289
+ z_mask = 0xff;
290
break;
291
CASE_OP_32_64(ld16u):
292
- mask = 0xffff;
293
+ z_mask = 0xffff;
294
break;
295
case INDEX_op_ld32u_i64:
296
- mask = 0xffffffffu;
297
+ z_mask = 0xffffffffu;
298
break;
299
300
CASE_OP_32_64(qemu_ld):
301
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
302
MemOpIdx oi = op->args[nb_oargs + nb_iargs];
303
MemOp mop = get_memop(oi);
304
if (!(mop & MO_SIGN)) {
305
- mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
306
+ z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
307
}
308
}
309
break;
310
311
CASE_OP_32_64(bswap16):
312
- mask = arg_info(op->args[1])->mask;
313
- if (mask <= 0xffff) {
314
+ z_mask = arg_info(op->args[1])->z_mask;
315
+ if (z_mask <= 0xffff) {
316
op->args[2] |= TCG_BSWAP_IZ;
317
}
318
- mask = bswap16(mask);
319
+ z_mask = bswap16(z_mask);
320
switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
321
case TCG_BSWAP_OZ:
322
break;
323
case TCG_BSWAP_OS:
324
- mask = (int16_t)mask;
325
+ z_mask = (int16_t)z_mask;
326
break;
327
default: /* undefined high bits */
328
- mask |= MAKE_64BIT_MASK(16, 48);
329
+ z_mask |= MAKE_64BIT_MASK(16, 48);
330
break;
331
}
332
break;
333
334
case INDEX_op_bswap32_i64:
335
- mask = arg_info(op->args[1])->mask;
336
- if (mask <= 0xffffffffu) {
337
+ z_mask = arg_info(op->args[1])->z_mask;
338
+ if (z_mask <= 0xffffffffu) {
339
op->args[2] |= TCG_BSWAP_IZ;
340
}
341
- mask = bswap32(mask);
342
+ z_mask = bswap32(z_mask);
343
switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
344
case TCG_BSWAP_OZ:
345
break;
346
case TCG_BSWAP_OS:
347
- mask = (int32_t)mask;
348
+ z_mask = (int32_t)z_mask;
349
break;
350
default: /* undefined high bits */
351
- mask |= MAKE_64BIT_MASK(32, 32);
352
+ z_mask |= MAKE_64BIT_MASK(32, 32);
353
break;
354
}
355
break;
356
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
357
/* 32-bit ops generate 32-bit results. For the result is zero test
358
below, we can ignore high bits, but for further optimizations we
359
need to record that the high bits contain garbage. */
360
- partmask = mask;
361
+ partmask = z_mask;
362
if (!(def->flags & TCG_OPF_64BIT)) {
363
- mask |= ~(tcg_target_ulong)0xffffffffu;
364
+ z_mask |= ~(tcg_target_ulong)0xffffffffu;
365
partmask &= 0xffffffffu;
366
affected &= 0xffffffffu;
367
}
368
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
369
vs the high word of the input. */
370
do_setcond_high:
371
reset_temp(op->args[0]);
372
- arg_info(op->args[0])->mask = 1;
373
+ arg_info(op->args[0])->z_mask = 1;
374
op->opc = INDEX_op_setcond_i32;
375
op->args[1] = op->args[2];
376
op->args[2] = op->args[4];
377
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
378
}
379
do_setcond_low:
380
reset_temp(op->args[0]);
381
- arg_info(op->args[0])->mask = 1;
382
+ arg_info(op->args[0])->z_mask = 1;
383
op->opc = INDEX_op_setcond_i32;
384
op->args[2] = op->args[3];
385
op->args[3] = op->args[5];
386
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
387
/* Default case: we know nothing about operation (or were unable
388
to compute the operation result) so no propagation is done.
389
We trash everything if the operation is the end of a basic
390
- block, otherwise we only trash the output args. "mask" is
391
+ block, otherwise we only trash the output args. "z_mask" is
392
the non-zero bits mask for the first output arg. */
393
if (def->flags & TCG_OPF_BB_END) {
394
memset(&temps_used, 0, sizeof(temps_used));
395
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
396
/* Save the corresponding known-zero bits mask for the
397
first output argument (only one supported so far). */
398
if (i == 0) {
399
- arg_info(op->args[i])->mask = mask;
400
+ arg_info(op->args[i])->z_mask = z_mask;
401
}
402
}
403
}
404
--
405
2.25.1
406
407
diff view generated by jsdifflib
Deleted patch
1
Provide what will become a larger context for splitting
2
the very large tcg_optimize function.
3
1
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
6
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
9
tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
10
1 file changed, 40 insertions(+), 37 deletions(-)
11
12
diff --git a/tcg/optimize.c b/tcg/optimize.c
13
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/optimize.c
15
+++ b/tcg/optimize.c
16
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
17
uint64_t z_mask; /* mask bit is 0 if and only if value bit is 0 */
18
} TempOptInfo;
19
20
+typedef struct OptContext {
21
+ TCGTempSet temps_used;
22
+} OptContext;
23
+
24
static inline TempOptInfo *ts_info(TCGTemp *ts)
25
{
26
return ts->state_ptr;
27
@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
28
}
29
30
/* Initialize and activate a temporary. */
31
-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
32
+static void init_ts_info(OptContext *ctx, TCGTemp *ts)
33
{
34
size_t idx = temp_idx(ts);
35
TempOptInfo *ti;
36
37
- if (test_bit(idx, temps_used->l)) {
38
+ if (test_bit(idx, ctx->temps_used.l)) {
39
return;
40
}
41
- set_bit(idx, temps_used->l);
42
+ set_bit(idx, ctx->temps_used.l);
43
44
ti = ts->state_ptr;
45
if (ti == NULL) {
46
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
47
}
48
}
49
50
-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
51
+static void init_arg_info(OptContext *ctx, TCGArg arg)
52
{
53
- init_ts_info(temps_used, arg_temp(arg));
54
+ init_ts_info(ctx, arg_temp(arg));
55
}
56
57
static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
58
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
59
}
60
}
61
62
-static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
63
+static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
64
TCGOp *op, TCGArg dst, uint64_t val)
65
{
66
const TCGOpDef *def = &tcg_op_defs[op->opc];
67
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
68
69
/* Convert movi to mov with constant temp. */
70
tv = tcg_constant_internal(type, val);
71
- init_ts_info(temps_used, tv);
72
+ init_ts_info(ctx, tv);
73
tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
74
}
75
76
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
77
{
78
int nb_temps, nb_globals, i;
79
TCGOp *op, *op_next, *prev_mb = NULL;
80
- TCGTempSet temps_used;
81
+ OptContext ctx = {};
82
83
/* Array VALS has an element for each temp.
84
If this temp holds a constant then its value is kept in VALS' element.
85
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
86
nb_temps = s->nb_temps;
87
nb_globals = s->nb_globals;
88
89
- memset(&temps_used, 0, sizeof(temps_used));
90
for (i = 0; i < nb_temps; ++i) {
91
s->temps[i].state_ptr = NULL;
92
}
93
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
94
for (i = 0; i < nb_oargs + nb_iargs; i++) {
95
TCGTemp *ts = arg_temp(op->args[i]);
96
if (ts) {
97
- init_ts_info(&temps_used, ts);
98
+ init_ts_info(&ctx, ts);
99
}
100
}
101
} else {
102
nb_oargs = def->nb_oargs;
103
nb_iargs = def->nb_iargs;
104
for (i = 0; i < nb_oargs + nb_iargs; i++) {
105
- init_arg_info(&temps_used, op->args[i]);
106
+ init_arg_info(&ctx, op->args[i]);
107
}
108
}
109
110
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
111
CASE_OP_32_64(rotr):
112
if (arg_is_const(op->args[1])
113
&& arg_info(op->args[1])->val == 0) {
114
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
115
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
116
continue;
117
}
118
break;
119
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
120
121
if (partmask == 0) {
122
tcg_debug_assert(nb_oargs == 1);
123
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
124
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
125
continue;
126
}
127
if (affected == 0) {
128
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
129
CASE_OP_32_64(mulsh):
130
if (arg_is_const(op->args[2])
131
&& arg_info(op->args[2])->val == 0) {
132
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
133
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
134
continue;
135
}
136
break;
137
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
138
CASE_OP_32_64_VEC(sub):
139
CASE_OP_32_64_VEC(xor):
140
if (args_are_copies(op->args[1], op->args[2])) {
141
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
142
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
143
continue;
144
}
145
break;
146
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
147
if (arg_is_const(op->args[1])) {
148
tmp = arg_info(op->args[1])->val;
149
tmp = dup_const(TCGOP_VECE(op), tmp);
150
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
151
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
152
break;
153
}
154
goto do_default;
155
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
156
case INDEX_op_dup2_vec:
157
assert(TCG_TARGET_REG_BITS == 32);
158
if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
159
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
160
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0],
161
deposit64(arg_info(op->args[1])->val, 32, 32,
162
arg_info(op->args[2])->val));
163
break;
164
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
165
case INDEX_op_extrh_i64_i32:
166
if (arg_is_const(op->args[1])) {
167
tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
168
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
169
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
170
break;
171
}
172
goto do_default;
173
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
174
if (arg_is_const(op->args[1])) {
175
tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
176
op->args[2]);
177
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
178
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
179
break;
180
}
181
goto do_default;
182
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
183
if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
184
tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
185
arg_info(op->args[2])->val);
186
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
187
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
188
break;
189
}
190
goto do_default;
191
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
192
TCGArg v = arg_info(op->args[1])->val;
193
if (v != 0) {
194
tmp = do_constant_folding(opc, v, 0);
195
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
196
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
197
} else {
198
tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
199
}
200
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
201
tmp = deposit64(arg_info(op->args[1])->val,
202
op->args[3], op->args[4],
203
arg_info(op->args[2])->val);
204
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
205
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
206
break;
207
}
208
goto do_default;
209
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
210
if (arg_is_const(op->args[1])) {
211
tmp = extract64(arg_info(op->args[1])->val,
212
op->args[2], op->args[3]);
213
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
214
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
215
break;
216
}
217
goto do_default;
218
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
219
if (arg_is_const(op->args[1])) {
220
tmp = sextract64(arg_info(op->args[1])->val,
221
op->args[2], op->args[3]);
222
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
223
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
224
break;
225
}
226
goto do_default;
227
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
228
tmp = (int32_t)(((uint32_t)v1 >> shr) |
229
((uint32_t)v2 << (32 - shr)));
230
}
231
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
232
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
233
break;
234
}
235
goto do_default;
236
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
237
tmp = do_constant_folding_cond(opc, op->args[1],
238
op->args[2], op->args[3]);
239
if (tmp != 2) {
240
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
241
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
242
break;
243
}
244
goto do_default;
245
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
246
op->args[1], op->args[2]);
247
if (tmp != 2) {
248
if (tmp) {
249
- memset(&temps_used, 0, sizeof(temps_used));
250
+ memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
251
op->opc = INDEX_op_br;
252
op->args[0] = op->args[3];
253
} else {
254
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
255
256
rl = op->args[0];
257
rh = op->args[1];
258
- tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
259
- tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
260
+ tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
261
+ tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
262
break;
263
}
264
goto do_default;
265
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
266
267
rl = op->args[0];
268
rh = op->args[1];
269
- tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
270
- tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
271
+ tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
272
+ tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
273
break;
274
}
275
goto do_default;
276
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
277
if (tmp != 2) {
278
if (tmp) {
279
do_brcond_true:
280
- memset(&temps_used, 0, sizeof(temps_used));
281
+ memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
282
op->opc = INDEX_op_br;
283
op->args[0] = op->args[5];
284
} else {
285
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
286
/* Simplify LT/GE comparisons vs zero to a single compare
287
vs the high word of the input. */
288
do_brcond_high:
289
- memset(&temps_used, 0, sizeof(temps_used));
290
+ memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
291
op->opc = INDEX_op_brcond_i32;
292
op->args[0] = op->args[1];
293
op->args[1] = op->args[3];
294
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
295
goto do_default;
296
}
297
do_brcond_low:
298
- memset(&temps_used, 0, sizeof(temps_used));
299
+ memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
300
op->opc = INDEX_op_brcond_i32;
301
op->args[1] = op->args[2];
302
op->args[2] = op->args[4];
303
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
304
op->args[5]);
305
if (tmp != 2) {
306
do_setcond_const:
307
- tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
308
+ tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
309
} else if ((op->args[5] == TCG_COND_LT
310
|| op->args[5] == TCG_COND_GE)
311
&& arg_is_const(op->args[3])
312
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
313
if (!(tcg_call_flags(op)
314
& (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
315
for (i = 0; i < nb_globals; i++) {
316
- if (test_bit(i, temps_used.l)) {
317
+ if (test_bit(i, ctx.temps_used.l)) {
318
reset_ts(&s->temps[i]);
319
}
320
}
321
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
322
block, otherwise we only trash the output args. "z_mask" is
323
the non-zero bits mask for the first output arg. */
324
if (def->flags & TCG_OPF_BB_END) {
325
- memset(&temps_used, 0, sizeof(temps_used));
326
+ memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
327
} else {
328
do_reset_output:
329
for (i = 0; i < nb_oargs; i++) {
330
--
331
2.25.1
332
333
Deleted patch
1
Break the final cleanup clause out of the main switch
2
statement. When fully folding an opcode to mov/movi,
3
use "continue" to process the next opcode, else break
4
to fall into the final cleanup.
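In sketch form, the resulting loop shape is roughly the following (abridged from the diff below; only the mov case and the shared cleanup are shown):

    QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
        switch (op->opc) {
        CASE_OP_32_64_VEC(mov):
            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
            continue;                /* fully folded: go to the next opcode */
        default:
            break;                   /* fall into the final cleanup */
        }
        /* final cleanup, formerly the "do_default" label inside the switch */
        for (i = 0; i < nb_oargs; i++) {
            reset_temp(op->args[i]);
        }
    }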
5
1
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
8
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
9
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
---
11
tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
12
1 file changed, 94 insertions(+), 96 deletions(-)
13
14
diff --git a/tcg/optimize.c b/tcg/optimize.c
15
index XXXXXXX..XXXXXXX 100644
16
--- a/tcg/optimize.c
17
+++ b/tcg/optimize.c
18
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
19
switch (opc) {
20
CASE_OP_32_64_VEC(mov):
21
tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
22
- break;
23
+ continue;
24
25
case INDEX_op_dup_vec:
26
if (arg_is_const(op->args[1])) {
27
tmp = arg_info(op->args[1])->val;
28
tmp = dup_const(TCGOP_VECE(op), tmp);
29
tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
30
- break;
31
+ continue;
32
}
33
- goto do_default;
34
+ break;
35
36
case INDEX_op_dup2_vec:
37
assert(TCG_TARGET_REG_BITS == 32);
38
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
39
tcg_opt_gen_movi(s, &ctx, op, op->args[0],
40
deposit64(arg_info(op->args[1])->val, 32, 32,
41
arg_info(op->args[2])->val));
42
- break;
43
+ continue;
44
} else if (args_are_copies(op->args[1], op->args[2])) {
45
op->opc = INDEX_op_dup_vec;
46
TCGOP_VECE(op) = MO_32;
47
nb_iargs = 1;
48
}
49
- goto do_default;
50
+ break;
51
52
CASE_OP_32_64(not):
53
CASE_OP_32_64(neg):
54
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
55
if (arg_is_const(op->args[1])) {
56
tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
57
tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
58
- break;
59
+ continue;
60
}
61
- goto do_default;
62
+ break;
63
64
CASE_OP_32_64(bswap16):
65
CASE_OP_32_64(bswap32):
66
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
67
tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
68
op->args[2]);
69
tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
70
- break;
71
+ continue;
72
}
73
- goto do_default;
74
+ break;
75
76
CASE_OP_32_64(add):
77
CASE_OP_32_64(sub):
78
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
79
tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
80
arg_info(op->args[2])->val);
81
tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
82
- break;
83
+ continue;
84
}
85
- goto do_default;
86
+ break;
87
88
CASE_OP_32_64(clz):
89
CASE_OP_32_64(ctz):
90
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
91
} else {
92
tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
93
}
94
- break;
95
+ continue;
96
}
97
- goto do_default;
98
+ break;
99
100
CASE_OP_32_64(deposit):
101
if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
102
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
103
op->args[3], op->args[4],
104
arg_info(op->args[2])->val);
105
tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
106
- break;
107
+ continue;
108
}
109
- goto do_default;
110
+ break;
111
112
CASE_OP_32_64(extract):
113
if (arg_is_const(op->args[1])) {
114
tmp = extract64(arg_info(op->args[1])->val,
115
op->args[2], op->args[3]);
116
tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
117
- break;
118
+ continue;
119
}
120
- goto do_default;
121
+ break;
122
123
CASE_OP_32_64(sextract):
124
if (arg_is_const(op->args[1])) {
125
tmp = sextract64(arg_info(op->args[1])->val,
126
op->args[2], op->args[3]);
127
tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
128
- break;
129
+ continue;
130
}
131
- goto do_default;
132
+ break;
133
134
CASE_OP_32_64(extract2):
135
if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
136
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
137
((uint32_t)v2 << (32 - shr)));
138
}
139
tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
140
- break;
141
+ continue;
142
}
143
- goto do_default;
144
+ break;
145
146
CASE_OP_32_64(setcond):
147
tmp = do_constant_folding_cond(opc, op->args[1],
148
op->args[2], op->args[3]);
149
if (tmp != 2) {
150
tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
151
- break;
152
+ continue;
153
}
154
- goto do_default;
155
+ break;
156
157
CASE_OP_32_64(brcond):
158
tmp = do_constant_folding_cond(opc, op->args[0],
159
op->args[1], op->args[2]);
160
- if (tmp != 2) {
161
- if (tmp) {
162
- memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
163
- op->opc = INDEX_op_br;
164
- op->args[0] = op->args[3];
165
- } else {
166
- tcg_op_remove(s, op);
167
- }
168
+ switch (tmp) {
169
+ case 0:
170
+ tcg_op_remove(s, op);
171
+ continue;
172
+ case 1:
173
+ memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
174
+ op->opc = opc = INDEX_op_br;
175
+ op->args[0] = op->args[3];
176
break;
177
}
178
- goto do_default;
179
+ break;
180
181
CASE_OP_32_64(movcond):
182
tmp = do_constant_folding_cond(opc, op->args[1],
183
op->args[2], op->args[5]);
184
if (tmp != 2) {
185
tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
186
- break;
187
+ continue;
188
}
189
if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
190
uint64_t tv = arg_info(op->args[3])->val;
191
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
192
if (fv == 1 && tv == 0) {
193
cond = tcg_invert_cond(cond);
194
} else if (!(tv == 1 && fv == 0)) {
195
- goto do_default;
196
+ break;
197
}
198
op->args[3] = cond;
199
op->opc = opc = (opc == INDEX_op_movcond_i32
200
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
201
: INDEX_op_setcond_i64);
202
nb_iargs = 2;
203
}
204
- goto do_default;
205
+ break;
206
207
case INDEX_op_add2_i32:
208
case INDEX_op_sub2_i32:
209
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
210
rh = op->args[1];
211
tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
212
tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
213
- break;
214
+ continue;
215
}
216
- goto do_default;
217
+ break;
218
219
case INDEX_op_mulu2_i32:
220
if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
221
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
222
rh = op->args[1];
223
tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
224
tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
225
- break;
226
+ continue;
227
}
228
- goto do_default;
229
+ break;
230
231
case INDEX_op_brcond2_i32:
232
tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
233
op->args[4]);
234
- if (tmp != 2) {
235
- if (tmp) {
236
- do_brcond_true:
237
- memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
238
- op->opc = INDEX_op_br;
239
- op->args[0] = op->args[5];
240
- } else {
241
+ if (tmp == 0) {
242
do_brcond_false:
243
- tcg_op_remove(s, op);
244
- }
245
- } else if ((op->args[4] == TCG_COND_LT
246
- || op->args[4] == TCG_COND_GE)
247
- && arg_is_const(op->args[2])
248
- && arg_info(op->args[2])->val == 0
249
- && arg_is_const(op->args[3])
250
- && arg_info(op->args[3])->val == 0) {
251
+ tcg_op_remove(s, op);
252
+ continue;
253
+ }
254
+ if (tmp == 1) {
255
+ do_brcond_true:
256
+ op->opc = opc = INDEX_op_br;
257
+ op->args[0] = op->args[5];
258
+ break;
259
+ }
260
+ if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
261
+ && arg_is_const(op->args[2])
262
+ && arg_info(op->args[2])->val == 0
263
+ && arg_is_const(op->args[3])
264
+ && arg_info(op->args[3])->val == 0) {
265
/* Simplify LT/GE comparisons vs zero to a single compare
266
vs the high word of the input. */
267
do_brcond_high:
268
- memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
269
- op->opc = INDEX_op_brcond_i32;
270
+ op->opc = opc = INDEX_op_brcond_i32;
271
op->args[0] = op->args[1];
272
op->args[1] = op->args[3];
273
op->args[2] = op->args[4];
274
op->args[3] = op->args[5];
275
- } else if (op->args[4] == TCG_COND_EQ) {
276
+ break;
277
+ }
278
+ if (op->args[4] == TCG_COND_EQ) {
279
/* Simplify EQ comparisons where one of the pairs
280
can be simplified. */
281
tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
282
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
283
if (tmp == 0) {
284
goto do_brcond_false;
285
} else if (tmp != 1) {
286
- goto do_default;
287
+ break;
288
}
289
do_brcond_low:
290
memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
291
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
292
op->args[1] = op->args[2];
293
op->args[2] = op->args[4];
294
op->args[3] = op->args[5];
295
- } else if (op->args[4] == TCG_COND_NE) {
296
+ break;
297
+ }
298
+ if (op->args[4] == TCG_COND_NE) {
299
/* Simplify NE comparisons where one of the pairs
300
can be simplified. */
301
tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
302
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
303
} else if (tmp == 1) {
304
goto do_brcond_true;
305
}
306
- goto do_default;
307
- } else {
308
- goto do_default;
309
}
310
break;
311
312
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
313
if (tmp != 2) {
314
do_setcond_const:
315
tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
316
- } else if ((op->args[5] == TCG_COND_LT
317
- || op->args[5] == TCG_COND_GE)
318
- && arg_is_const(op->args[3])
319
- && arg_info(op->args[3])->val == 0
320
- && arg_is_const(op->args[4])
321
- && arg_info(op->args[4])->val == 0) {
322
+ continue;
323
+ }
324
+ if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
325
+ && arg_is_const(op->args[3])
326
+ && arg_info(op->args[3])->val == 0
327
+ && arg_is_const(op->args[4])
328
+ && arg_info(op->args[4])->val == 0) {
329
/* Simplify LT/GE comparisons vs zero to a single compare
330
vs the high word of the input. */
331
do_setcond_high:
332
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
333
op->args[1] = op->args[2];
334
op->args[2] = op->args[4];
335
op->args[3] = op->args[5];
336
- } else if (op->args[5] == TCG_COND_EQ) {
337
+ break;
338
+ }
339
+ if (op->args[5] == TCG_COND_EQ) {
340
/* Simplify EQ comparisons where one of the pairs
341
can be simplified. */
342
tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
343
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
344
if (tmp == 0) {
345
goto do_setcond_high;
346
} else if (tmp != 1) {
347
- goto do_default;
348
+ break;
349
}
350
do_setcond_low:
351
reset_temp(op->args[0]);
352
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
353
op->opc = INDEX_op_setcond_i32;
354
op->args[2] = op->args[3];
355
op->args[3] = op->args[5];
356
- } else if (op->args[5] == TCG_COND_NE) {
357
+ break;
358
+ }
359
+ if (op->args[5] == TCG_COND_NE) {
360
/* Simplify NE comparisons where one of the pairs
361
can be simplified. */
362
tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
363
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
364
} else if (tmp == 1) {
365
goto do_setcond_const;
366
}
367
- goto do_default;
368
- } else {
369
- goto do_default;
370
}
371
break;
372
373
- case INDEX_op_call:
374
- if (!(tcg_call_flags(op)
375
+ default:
376
+ break;
377
+ }
378
+
379
+ /* Some of the folding above can change opc. */
380
+ opc = op->opc;
381
+ def = &tcg_op_defs[opc];
382
+ if (def->flags & TCG_OPF_BB_END) {
383
+ memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
384
+ } else {
385
+ if (opc == INDEX_op_call &&
386
+ !(tcg_call_flags(op)
387
& (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
388
for (i = 0; i < nb_globals; i++) {
389
if (test_bit(i, ctx.temps_used.l)) {
390
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
391
}
392
}
393
}
394
- goto do_reset_output;
395
396
- default:
397
- do_default:
398
- /* Default case: we know nothing about operation (or were unable
399
- to compute the operation result) so no propagation is done.
400
- We trash everything if the operation is the end of a basic
401
- block, otherwise we only trash the output args. "z_mask" is
402
- the non-zero bits mask for the first output arg. */
403
- if (def->flags & TCG_OPF_BB_END) {
404
- memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
405
- } else {
406
- do_reset_output:
407
- for (i = 0; i < nb_oargs; i++) {
408
- reset_temp(op->args[i]);
409
- /* Save the corresponding known-zero bits mask for the
410
- first output argument (only one supported so far). */
411
- if (i == 0) {
412
- arg_info(op->args[i])->z_mask = z_mask;
413
- }
414
+ for (i = 0; i < nb_oargs; i++) {
415
+ reset_temp(op->args[i]);
416
+ /* Save the corresponding known-zero bits mask for the
417
+ first output argument (only one supported so far). */
418
+ if (i == 0) {
419
+ arg_info(op->args[i])->z_mask = z_mask;
420
}
421
}
422
- break;
423
}
424
425
/* Eliminate duplicate and redundant fence instructions. */
426
--
427
2.25.1
428
429
Deleted patch
1
Adjust the interface to take the OptContext parameter instead
2
of TCGContext or both.
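Concretely, the helper signatures change as shown below (abridged), with the TCGContext still reachable through ctx->tcg where an op must be removed:

    /* before */
    static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src);
    static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
                                 TCGOp *op, TCGArg dst, uint64_t val);
    /* after */
    static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src);
    static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                                 TCGArg dst, uint64_t val);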
3
1
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
9
1 file changed, 34 insertions(+), 33 deletions(-)
10
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
12
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
14
+++ b/tcg/optimize.c
15
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
16
} TempOptInfo;
17
18
typedef struct OptContext {
19
+ TCGContext *tcg;
20
TCGTempSet temps_used;
21
} OptContext;
22
23
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
24
return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
25
}
26
27
-static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
28
+static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
29
{
30
TCGTemp *dst_ts = arg_temp(dst);
31
TCGTemp *src_ts = arg_temp(src);
32
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
33
TCGOpcode new_op;
34
35
if (ts_are_copies(dst_ts, src_ts)) {
36
- tcg_op_remove(s, op);
37
+ tcg_op_remove(ctx->tcg, op);
38
return;
39
}
40
41
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
42
}
43
}
44
45
-static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
46
- TCGOp *op, TCGArg dst, uint64_t val)
47
+static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
48
+ TCGArg dst, uint64_t val)
49
{
50
const TCGOpDef *def = &tcg_op_defs[op->opc];
51
TCGType type;
52
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
53
/* Convert movi to mov with constant temp. */
54
tv = tcg_constant_internal(type, val);
55
init_ts_info(ctx, tv);
56
- tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
57
+ tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
58
}
59
60
static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
61
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
62
{
63
int nb_temps, nb_globals, i;
64
TCGOp *op, *op_next, *prev_mb = NULL;
65
- OptContext ctx = {};
66
+ OptContext ctx = { .tcg = s };
67
68
/* Array VALS has an element for each temp.
69
If this temp holds a constant then its value is kept in VALS' element.
70
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
71
CASE_OP_32_64(rotr):
72
if (arg_is_const(op->args[1])
73
&& arg_info(op->args[1])->val == 0) {
74
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
75
+ tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
76
continue;
77
}
78
break;
79
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
80
if (!arg_is_const(op->args[1])
81
&& arg_is_const(op->args[2])
82
&& arg_info(op->args[2])->val == 0) {
83
- tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
84
+ tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
85
continue;
86
}
87
break;
88
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
89
if (!arg_is_const(op->args[1])
90
&& arg_is_const(op->args[2])
91
&& arg_info(op->args[2])->val == -1) {
92
- tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
93
+ tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
94
continue;
95
}
96
break;
97
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
98
99
if (partmask == 0) {
100
tcg_debug_assert(nb_oargs == 1);
101
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
102
+ tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
103
continue;
104
}
105
if (affected == 0) {
106
tcg_debug_assert(nb_oargs == 1);
107
- tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
108
+ tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
109
continue;
110
}
111
112
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
113
CASE_OP_32_64(mulsh):
114
if (arg_is_const(op->args[2])
115
&& arg_info(op->args[2])->val == 0) {
116
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
117
+ tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
118
continue;
119
}
120
break;
121
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
122
CASE_OP_32_64_VEC(or):
123
CASE_OP_32_64_VEC(and):
124
if (args_are_copies(op->args[1], op->args[2])) {
125
- tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
126
+ tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
127
continue;
128
}
129
break;
130
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
131
CASE_OP_32_64_VEC(sub):
132
CASE_OP_32_64_VEC(xor):
133
if (args_are_copies(op->args[1], op->args[2])) {
134
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
135
+ tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
136
continue;
137
}
138
break;
139
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
140
allocator where needed and possible. Also detect copies. */
141
switch (opc) {
142
CASE_OP_32_64_VEC(mov):
143
- tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
144
+ tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
145
continue;
146
147
case INDEX_op_dup_vec:
148
if (arg_is_const(op->args[1])) {
149
tmp = arg_info(op->args[1])->val;
150
tmp = dup_const(TCGOP_VECE(op), tmp);
151
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
152
+ tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
153
continue;
154
}
155
break;
156
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
157
case INDEX_op_dup2_vec:
158
assert(TCG_TARGET_REG_BITS == 32);
159
if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
160
- tcg_opt_gen_movi(s, &ctx, op, op->args[0],
161
+ tcg_opt_gen_movi(&ctx, op, op->args[0],
162
deposit64(arg_info(op->args[1])->val, 32, 32,
163
arg_info(op->args[2])->val));
164
continue;
165
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
166
case INDEX_op_extrh_i64_i32:
167
if (arg_is_const(op->args[1])) {
168
tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
169
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
170
+ tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
171
continue;
172
}
173
break;
174
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
175
if (arg_is_const(op->args[1])) {
176
tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
177
op->args[2]);
178
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
179
+ tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
180
continue;
181
}
182
break;
183
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
184
if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
185
tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
186
arg_info(op->args[2])->val);
187
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
188
+ tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
189
continue;
190
}
191
break;
192
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
193
TCGArg v = arg_info(op->args[1])->val;
194
if (v != 0) {
195
tmp = do_constant_folding(opc, v, 0);
196
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
197
+ tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
198
} else {
199
- tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
200
+ tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
201
}
202
continue;
203
}
204
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
205
tmp = deposit64(arg_info(op->args[1])->val,
206
op->args[3], op->args[4],
207
arg_info(op->args[2])->val);
208
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
209
+ tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
210
continue;
211
}
212
break;
213
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
214
if (arg_is_const(op->args[1])) {
215
tmp = extract64(arg_info(op->args[1])->val,
216
op->args[2], op->args[3]);
217
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
218
+ tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
219
continue;
220
}
221
break;
222
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
223
if (arg_is_const(op->args[1])) {
224
tmp = sextract64(arg_info(op->args[1])->val,
225
op->args[2], op->args[3]);
226
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
227
+ tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
228
continue;
229
}
230
break;
231
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
232
tmp = (int32_t)(((uint32_t)v1 >> shr) |
233
((uint32_t)v2 << (32 - shr)));
234
}
235
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
236
+ tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
237
continue;
238
}
239
break;
240
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
241
tmp = do_constant_folding_cond(opc, op->args[1],
242
op->args[2], op->args[3]);
243
if (tmp != 2) {
244
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
245
+ tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
246
continue;
247
}
248
break;
249
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
250
tmp = do_constant_folding_cond(opc, op->args[1],
251
op->args[2], op->args[5]);
252
if (tmp != 2) {
253
- tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
254
+ tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
255
continue;
256
}
257
if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
258
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
259
260
rl = op->args[0];
261
rh = op->args[1];
262
- tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
263
- tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
264
+ tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
265
+ tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
266
continue;
267
}
268
break;
269
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
270
271
rl = op->args[0];
272
rh = op->args[1];
273
- tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
274
- tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
275
+ tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
276
+ tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
277
continue;
278
}
279
break;
280
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
281
op->args[5]);
282
if (tmp != 2) {
283
do_setcond_const:
284
- tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
285
+ tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
286
continue;
287
}
288
if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
289
--
290
2.25.1
291
292
Deleted patch
1
This will expose the variable to subroutines that
2
will be broken out of tcg_optimize.
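Abridged, the barrier-tracking pointer simply moves into the shared context:

    typedef struct OptContext {
        TCGContext *tcg;
        TCGOp *prev_mb;       /* last INDEX_op_mb seen; cleared by opcodes
                                 that touch guest memory */
        TCGTempSet temps_used;
    } OptContext;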
3
1
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
6
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
9
tcg/optimize.c | 11 ++++++-----
10
1 file changed, 6 insertions(+), 5 deletions(-)
11
12
diff --git a/tcg/optimize.c b/tcg/optimize.c
13
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/optimize.c
15
+++ b/tcg/optimize.c
16
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
17
18
typedef struct OptContext {
19
TCGContext *tcg;
20
+ TCGOp *prev_mb;
21
TCGTempSet temps_used;
22
} OptContext;
23
24
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
25
void tcg_optimize(TCGContext *s)
26
{
27
int nb_temps, nb_globals, i;
28
- TCGOp *op, *op_next, *prev_mb = NULL;
29
+ TCGOp *op, *op_next;
30
OptContext ctx = { .tcg = s };
31
32
/* Array VALS has an element for each temp.
33
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
34
}
35
36
/* Eliminate duplicate and redundant fence instructions. */
37
- if (prev_mb) {
38
+ if (ctx.prev_mb) {
39
switch (opc) {
40
case INDEX_op_mb:
41
/* Merge two barriers of the same type into one,
42
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
43
* barrier. This is stricter than specified but for
44
* the purposes of TCG is better than not optimizing.
45
*/
46
- prev_mb->args[0] |= op->args[0];
47
+ ctx.prev_mb->args[0] |= op->args[0];
48
tcg_op_remove(s, op);
49
break;
50
51
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
52
case INDEX_op_qemu_st_i64:
53
case INDEX_op_call:
54
/* Opcodes that touch guest memory stop the optimization. */
55
- prev_mb = NULL;
56
+ ctx.prev_mb = NULL;
57
break;
58
}
59
} else if (opc == INDEX_op_mb) {
60
- prev_mb = op;
61
+ ctx.prev_mb = op;
62
}
63
}
64
}
65
--
66
2.25.1
67
68
1
Split out the conditional conversion from a more complex logical
1
Create a new function to combine a CPUTLBEntry addend
2
operation to a simple NOT. Create a couple more helpers to make
2
with the guest address to form a host address.
3
this easy for the outer-most logical operations.
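The identities behind these conversions can be summarized as follows (a sketch; fold_to_not() performs the rewrite only when the host supports a NOT opcode):

    /*
     *   xor(x, -1)  -> not(x)        nand(x, -1) -> not(x)
     *   nor(x, 0)   -> not(x)        eqv(x, 0)   -> not(x)
     *   andc(-1, x) -> not(x)        orc(0, x)   -> not(x)
     */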
4
3
5
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
4
Reviewed-by: WANG Xuerui <git@xen0n.name>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
7
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
9
---
8
tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
10
accel/tcg/cputlb.c | 24 ++++++++++++++----------
9
1 file changed, 86 insertions(+), 72 deletions(-)
11
1 file changed, 14 insertions(+), 10 deletions(-)
10
12
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
13
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
12
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
15
--- a/accel/tcg/cputlb.c
14
+++ b/tcg/optimize.c
16
+++ b/accel/tcg/cputlb.c
15
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
17
@@ -XXX,XX +XXX,XX @@ static inline size_t sizeof_tlb(CPUTLBDescFast *fast)
16
return false;
18
return fast->mask + (1 << CPU_TLB_ENTRY_BITS);
17
}
19
}
18
20
19
+/*
21
+static inline uintptr_t g2h_tlbe(const CPUTLBEntry *tlb, target_ulong gaddr)
20
+ * Convert @op to NOT, if NOT is supported by the host.
21
+ * Return true if the conversion is successful, which will still
22
+ * indicate that the processing is complete.
23
+ */
24
+static bool fold_not(OptContext *ctx, TCGOp *op);
25
+static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
26
+{
22
+{
27
+ TCGOpcode not_op;
23
+ return tlb->addend + (uintptr_t)gaddr;
28
+ bool have_not;
29
+
30
+ switch (ctx->type) {
31
+ case TCG_TYPE_I32:
32
+ not_op = INDEX_op_not_i32;
33
+ have_not = TCG_TARGET_HAS_not_i32;
34
+ break;
35
+ case TCG_TYPE_I64:
36
+ not_op = INDEX_op_not_i64;
37
+ have_not = TCG_TARGET_HAS_not_i64;
38
+ break;
39
+ case TCG_TYPE_V64:
40
+ case TCG_TYPE_V128:
41
+ case TCG_TYPE_V256:
42
+ not_op = INDEX_op_not_vec;
43
+ have_not = TCG_TARGET_HAS_not_vec;
44
+ break;
45
+ default:
46
+ g_assert_not_reached();
47
+ }
48
+ if (have_not) {
49
+ op->opc = not_op;
50
+ op->args[1] = op->args[idx];
51
+ return fold_not(ctx, op);
52
+ }
53
+ return false;
54
+}
24
+}
55
+
25
+
56
+/* If the binary operation has first argument @i, fold to NOT. */
26
static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns,
57
+static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
27
size_t max_entries)
58
+{
59
+ if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
60
+ return fold_to_not(ctx, op, 2);
61
+ }
62
+ return false;
63
+}
64
+
65
/* If the binary operation has second argument @i, fold to @i. */
66
static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
67
{
28
{
68
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
29
@@ -XXX,XX +XXX,XX @@ static void tlb_reset_dirty_range_locked(CPUTLBEntry *tlb_entry,
69
return false;
30
31
if ((addr & (TLB_INVALID_MASK | TLB_MMIO |
32
TLB_DISCARD_WRITE | TLB_NOTDIRTY)) == 0) {
33
- addr &= TARGET_PAGE_MASK;
34
- addr += tlb_entry->addend;
35
+ addr = g2h_tlbe(tlb_entry, addr & TARGET_PAGE_MASK);
36
if ((addr - start) < length) {
37
#if TCG_OVERSIZED_GUEST
38
tlb_entry->addr_write |= TLB_NOTDIRTY;
39
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
40
return -1;
41
}
42
43
- p = (void *)((uintptr_t)addr + entry->addend);
44
+ p = (void *)g2h_tlbe(entry, addr);
45
if (hostp) {
46
*hostp = p;
47
}
48
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
49
}
50
51
/* Everything else is RAM. */
52
- *phost = (void *)((uintptr_t)addr + entry->addend);
53
+ *phost = (void *)g2h_tlbe(entry, addr);
54
return flags;
70
}
55
}
71
56
72
+/* If the binary operation has second argument @i, fold to NOT. */
57
@@ -XXX,XX +XXX,XX @@ bool tlb_plugin_lookup(CPUState *cpu, target_ulong addr, int mmu_idx,
73
+static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
58
data->v.io.offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
74
+{
59
} else {
75
+ if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
60
data->is_io = false;
76
+ return fold_to_not(ctx, op, 1);
61
- data->v.ram.hostaddr = (void *)((uintptr_t)addr + tlbe->addend);
77
+ }
62
+ data->v.ram.hostaddr = (void *)g2h_tlbe(tlbe, addr);
78
+ return false;
63
}
79
+}
80
+
81
/* If the binary operation has both arguments equal, fold to @i. */
82
static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
83
{
84
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
85
static bool fold_andc(OptContext *ctx, TCGOp *op)
86
{
87
if (fold_const2(ctx, op) ||
88
- fold_xx_to_i(ctx, op, 0)) {
89
+ fold_xx_to_i(ctx, op, 0) ||
90
+ fold_ix_to_not(ctx, op, -1)) {
91
return true;
64
return true;
65
} else {
66
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
67
goto stop_the_world;
92
}
68
}
93
return false;
69
94
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
70
- hostaddr = (void *)((uintptr_t)addr + tlbe->addend);
95
71
+ hostaddr = (void *)g2h_tlbe(tlbe, addr);
96
static bool fold_eqv(OptContext *ctx, TCGOp *op)
72
97
{
73
if (unlikely(tlb_addr & TLB_NOTDIRTY)) {
98
- return fold_const2(ctx, op);
74
notdirty_write(env_cpu(env), addr, size,
99
+ if (fold_const2(ctx, op) ||
75
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, MemOpIdx oi,
100
+ fold_xi_to_not(ctx, op, 0)) {
76
access_type, op ^ (need_swap * MO_BSWAP));
101
+ return true;
77
}
102
+ }
78
103
+ return false;
79
- haddr = (void *)((uintptr_t)addr + entry->addend);
80
+ haddr = (void *)g2h_tlbe(entry, addr);
81
82
/*
83
* Keep these two load_memop separate to ensure that the compiler
84
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, MemOpIdx oi,
85
return res & MAKE_64BIT_MASK(0, size * 8);
86
}
87
88
- haddr = (void *)((uintptr_t)addr + entry->addend);
89
+ haddr = (void *)g2h_tlbe(entry, addr);
90
return load_memop(haddr, op);
104
}
91
}
105
92
106
static bool fold_extract(OptContext *ctx, TCGOp *op)
93
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
107
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
94
notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr);
108
95
}
109
static bool fold_nand(OptContext *ctx, TCGOp *op)
96
110
{
97
- haddr = (void *)((uintptr_t)addr + entry->addend);
111
- return fold_const2(ctx, op);
98
+ haddr = (void *)g2h_tlbe(entry, addr);
112
+ if (fold_const2(ctx, op) ||
99
113
+ fold_xi_to_not(ctx, op, -1)) {
100
/*
114
+ return true;
101
* Keep these two store_memop separate to ensure that the compiler
115
+ }
102
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
116
+ return false;
103
return;
104
}
105
106
- haddr = (void *)((uintptr_t)addr + entry->addend);
107
+ haddr = (void *)g2h_tlbe(entry, addr);
108
store_memop(haddr, val, op);
117
}
109
}
118
110
119
static bool fold_neg(OptContext *ctx, TCGOp *op)
120
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
121
122
static bool fold_nor(OptContext *ctx, TCGOp *op)
123
{
124
- return fold_const2(ctx, op);
125
+ if (fold_const2(ctx, op) ||
126
+ fold_xi_to_not(ctx, op, 0)) {
127
+ return true;
128
+ }
129
+ return false;
130
}
131
132
static bool fold_not(OptContext *ctx, TCGOp *op)
133
{
134
- return fold_const1(ctx, op);
135
+ if (fold_const1(ctx, op)) {
136
+ return true;
137
+ }
138
+
139
+ /* Because of fold_to_not, we want to always return true, via finish. */
140
+ finish_folding(ctx, op);
141
+ return true;
142
}
143
144
static bool fold_or(OptContext *ctx, TCGOp *op)
145
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
146
147
static bool fold_orc(OptContext *ctx, TCGOp *op)
148
{
149
- return fold_const2(ctx, op);
150
+ if (fold_const2(ctx, op) ||
151
+ fold_ix_to_not(ctx, op, 0)) {
152
+ return true;
153
+ }
154
+ return false;
155
}
156
157
static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
158
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
159
static bool fold_xor(OptContext *ctx, TCGOp *op)
160
{
161
if (fold_const2(ctx, op) ||
162
- fold_xx_to_i(ctx, op, 0)) {
163
+ fold_xx_to_i(ctx, op, 0) ||
164
+ fold_xi_to_not(ctx, op, -1)) {
165
return true;
166
}
167
return false;
168
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
169
}
170
}
171
break;
172
- CASE_OP_32_64_VEC(xor):
173
- CASE_OP_32_64(nand):
174
- if (!arg_is_const(op->args[1])
175
- && arg_is_const(op->args[2])
176
- && arg_info(op->args[2])->val == -1) {
177
- i = 1;
178
- goto try_not;
179
- }
180
- break;
181
- CASE_OP_32_64(nor):
182
- if (!arg_is_const(op->args[1])
183
- && arg_is_const(op->args[2])
184
- && arg_info(op->args[2])->val == 0) {
185
- i = 1;
186
- goto try_not;
187
- }
188
- break;
189
- CASE_OP_32_64_VEC(andc):
190
- if (!arg_is_const(op->args[2])
191
- && arg_is_const(op->args[1])
192
- && arg_info(op->args[1])->val == -1) {
193
- i = 2;
194
- goto try_not;
195
- }
196
- break;
197
- CASE_OP_32_64_VEC(orc):
198
- CASE_OP_32_64(eqv):
199
- if (!arg_is_const(op->args[2])
200
- && arg_is_const(op->args[1])
201
- && arg_info(op->args[1])->val == 0) {
202
- i = 2;
203
- goto try_not;
204
- }
205
- break;
206
- try_not:
207
- {
208
- TCGOpcode not_op;
209
- bool have_not;
210
-
211
- switch (ctx.type) {
212
- case TCG_TYPE_I32:
213
- not_op = INDEX_op_not_i32;
214
- have_not = TCG_TARGET_HAS_not_i32;
215
- break;
216
- case TCG_TYPE_I64:
217
- not_op = INDEX_op_not_i64;
218
- have_not = TCG_TARGET_HAS_not_i64;
219
- break;
220
- case TCG_TYPE_V64:
221
- case TCG_TYPE_V128:
222
- case TCG_TYPE_V256:
223
- not_op = INDEX_op_not_vec;
224
- have_not = TCG_TARGET_HAS_not_vec;
225
- break;
226
- default:
227
- g_assert_not_reached();
228
- }
229
- if (!have_not) {
230
- break;
231
- }
232
- op->opc = not_op;
233
- reset_temp(op->args[0]);
234
- op->args[1] = op->args[i];
235
- continue;
236
- }
237
default:
238
break;
239
}
240
--
111
--
241
2.25.1
112
2.25.1
242
113
243
114
1
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
1
When TCG_TARGET_SIGNED_ADDR32 is set, adjust the tlb addend to
2
allow the 32-bit guest address to be sign extended within the
3
64-bit host register instead of zero extended.
4
5
This will simplify tcg hosts like MIPS, RISC-V, and LoongArch,
6
which naturally sign-extend 32-bit values, in contrast to x86_64
7
and AArch64 which zero-extend them.
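A sketch of the difference inside g2h_tlbe(), with an illustrative address (the addend stored in the TLB entry is adjusted to match when the entry is filled):

    /* default: zero-extend the 32-bit guest address */
    host = tlb->addend + (uint32_t)gaddr;
    /* TCG_TARGET_SIGNED_ADDR32: sign-extend it instead */
    host = tlb->addend + (int32_t)gaddr;

    /*
     * e.g. gaddr = 0x80001000:
     *   zero-extended: addend + 0x0000_0000_8000_1000
     *   sign-extended: addend + 0xffff_ffff_8000_1000 (= addend - 0x7fff_f000)
     */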
8
9
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
2
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
10
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
12
---
5
tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
13
accel/tcg/cputlb.c | 12 +++++++++++-
6
1 file changed, 31 insertions(+), 22 deletions(-)
14
1 file changed, 11 insertions(+), 1 deletion(-)
7
15
8
diff --git a/tcg/optimize.c b/tcg/optimize.c
16
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
9
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
10
--- a/tcg/optimize.c
18
--- a/accel/tcg/cputlb.c
11
+++ b/tcg/optimize.c
19
+++ b/accel/tcg/cputlb.c
12
@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
20
@@ -XXX,XX +XXX,XX @@
13
return fold_const2(ctx, op);
21
#include "qemu/plugin-memory.h"
22
#endif
23
#include "tcg/tcg-ldst.h"
24
+#include "tcg-target-sa32.h"
25
26
/* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
27
/* #define DEBUG_TLB */
28
@@ -XXX,XX +XXX,XX @@ static inline size_t sizeof_tlb(CPUTLBDescFast *fast)
29
30
static inline uintptr_t g2h_tlbe(const CPUTLBEntry *tlb, target_ulong gaddr)
31
{
32
+ if (TCG_TARGET_SIGNED_ADDR32 && TARGET_LONG_BITS == 32) {
33
+ return tlb->addend + (int32_t)gaddr;
34
+ }
35
return tlb->addend + (uintptr_t)gaddr;
14
}
36
}
15
37
16
+static bool fold_dup(OptContext *ctx, TCGOp *op)
38
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
17
+{
39
desc->iotlb[index].attrs = attrs;
18
+ if (arg_is_const(op->args[1])) {
40
19
+ uint64_t t = arg_info(op->args[1])->val;
41
/* Now calculate the new entry */
20
+ t = dup_const(TCGOP_VECE(op), t);
42
- tn.addend = addend - vaddr_page;
21
+ return tcg_opt_gen_movi(ctx, op, op->args[0], t);
22
+ }
23
+ return false;
24
+}
25
+
43
+
26
+static bool fold_dup2(OptContext *ctx, TCGOp *op)
44
+ if (TCG_TARGET_SIGNED_ADDR32 && TARGET_LONG_BITS == 32) {
27
+{
45
+ tn.addend = addend - (int32_t)vaddr_page;
28
+ if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
46
+ } else {
29
+ uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
47
+ tn.addend = addend - vaddr_page;
30
+ arg_info(op->args[2])->val);
31
+ return tcg_opt_gen_movi(ctx, op, op->args[0], t);
32
+ }
48
+ }
33
+
49
+
34
+ if (args_are_copies(op->args[1], op->args[2])) {
50
if (prot & PAGE_READ) {
35
+ op->opc = INDEX_op_dup_vec;
51
tn.addr_read = address;
36
+ TCGOP_VECE(op) = MO_32;
52
if (wp_flags & BP_MEM_READ) {
37
+ }
38
+ return false;
39
+}
40
+
41
static bool fold_eqv(OptContext *ctx, TCGOp *op)
42
{
43
return fold_const2(ctx, op);
44
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
45
done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
46
break;
47
48
- case INDEX_op_dup_vec:
49
- if (arg_is_const(op->args[1])) {
50
- tmp = arg_info(op->args[1])->val;
51
- tmp = dup_const(TCGOP_VECE(op), tmp);
52
- tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
53
- continue;
54
- }
55
- break;
56
-
57
- case INDEX_op_dup2_vec:
58
- assert(TCG_TARGET_REG_BITS == 32);
59
- if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
60
- tcg_opt_gen_movi(&ctx, op, op->args[0],
61
- deposit64(arg_info(op->args[1])->val, 32, 32,
62
- arg_info(op->args[2])->val));
63
- continue;
64
- } else if (args_are_copies(op->args[1], op->args[2])) {
65
- op->opc = INDEX_op_dup_vec;
66
- TCGOP_VECE(op) = MO_32;
67
- }
68
- break;
69
-
70
default:
71
break;
72
73
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
74
CASE_OP_32_64(divu):
75
done = fold_divide(&ctx, op);
76
break;
77
+ case INDEX_op_dup_vec:
78
+ done = fold_dup(&ctx, op);
79
+ break;
80
+ case INDEX_op_dup2_vec:
81
+ done = fold_dup2(&ctx, op);
82
+ break;
83
CASE_OP_32_64(eqv):
84
done = fold_eqv(&ctx, op);
85
break;
86
--
53
--
87
2.25.1
54
2.25.1
88
55
89
56
1
Pull the "op r, a, a => mov r, a" optimization into a function,
1
While the host may prefer to treat 32-bit addresses as signed,
2
and use it in the outer opcode fold functions.
2
there are edge cases of guests that cannot be implemented with
3
addresses 0x7fff_ffff and 0x8000_0000 being non-consecutive.
3
4
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
5
Therefore, default to guest_base_signed_addr32 false, and allow
6
probe_guest_base to determine whether it is possible to set it
7
to true. A tcg backend which sets TCG_TARGET_SIGNED_ADDR32 will
8
have to cope with either setting for user-only.
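The edge case mentioned above, spelled out (a sketch of the user-only g2h() arithmetic, not code from the patch):

    /*
     * With guest addresses treated as signed 32-bit values:
     *   g2h(0x7fff_ffff) = guest_base + 0x7fff_ffff
     *   g2h(0x8000_0000) = guest_base + (int32_t)0x8000_0000
     *                    = guest_base - 0x8000_0000
     * so two consecutive guest addresses end up 4 GiB apart in host memory,
     * which cannot work for a guest mapping that crosses the sign boundary.
     */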
9
10
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
11
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
12
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
13
---
8
tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
14
include/exec/cpu-all.h | 16 ++++++++++++++++
9
1 file changed, 24 insertions(+), 15 deletions(-)
15
include/exec/cpu_ldst.h | 3 ++-
16
bsd-user/main.c | 4 ++++
17
linux-user/main.c | 3 +++
18
4 files changed, 25 insertions(+), 1 deletion(-)
10
19
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
20
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
12
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
22
--- a/include/exec/cpu-all.h
14
+++ b/tcg/optimize.c
23
+++ b/include/exec/cpu-all.h
15
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
24
@@ -XXX,XX +XXX,XX @@ static inline void tswap64s(uint64_t *s)
16
return false;
25
17
}
26
#if defined(CONFIG_USER_ONLY)
18
27
#include "exec/user/abitypes.h"
19
+/* If the binary operation has both arguments equal, fold to identity. */
28
+#include "tcg-target-sa32.h"
20
+static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
29
30
/* On some host systems the guest address space is reserved on the host.
31
* This allows the guest address space to be offset to a convenient location.
32
@@ -XXX,XX +XXX,XX @@ extern uintptr_t guest_base;
33
extern bool have_guest_base;
34
extern unsigned long reserved_va;
35
36
+#if TCG_TARGET_SIGNED_ADDR32 && TARGET_LONG_BITS == 32
37
+extern bool guest_base_signed_addr32;
38
+#else
39
+#define guest_base_signed_addr32 false
40
+#endif
41
+
42
+static inline void set_guest_base_signed_addr32(void)
21
+{
43
+{
22
+ if (args_are_copies(op->args[1], op->args[2])) {
44
+#ifdef guest_base_signed_addr32
23
+ return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
45
+ qemu_build_not_reached();
24
+ }
46
+#else
25
+ return false;
47
+ guest_base_signed_addr32 = true;
48
+#endif
26
+}
49
+}
27
+
50
+
28
/*
51
/*
29
* These outermost fold_<op> functions are sorted alphabetically.
52
* Limit the guest addresses as best we can.
30
+ *
53
*
31
+ * The ordering of the transformations should be:
54
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
32
+ * 1) those that produce a constant
55
index XXXXXXX..XXXXXXX 100644
33
+ * 2) those that produce a copy
56
--- a/include/exec/cpu_ldst.h
34
+ * 3) those that produce information about the result value.
57
+++ b/include/exec/cpu_ldst.h
35
*/
58
@@ -XXX,XX +XXX,XX @@ static inline abi_ptr cpu_untagged_addr(CPUState *cs, abi_ptr x)
36
59
/* All direct uses of g2h and h2g need to go away for usermode softmmu. */
37
static bool fold_add(OptContext *ctx, TCGOp *op)
60
static inline void *g2h_untagged(abi_ptr x)
38
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
39
40
static bool fold_and(OptContext *ctx, TCGOp *op)
41
{
61
{
42
- return fold_const2(ctx, op);
62
- return (void *)((uintptr_t)(x) + guest_base);
43
+ if (fold_const2(ctx, op) ||
63
+ uintptr_t hx = guest_base_signed_addr32 ? (int32_t)x : (uintptr_t)x;
44
+ fold_xx_to_x(ctx, op)) {
64
+ return (void *)(guest_base + hx);
45
+ return true;
46
+ }
47
+ return false;
48
}
65
}
49
66
50
static bool fold_andc(OptContext *ctx, TCGOp *op)
67
static inline void *g2h(CPUState *cs, abi_ptr x)
51
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
68
diff --git a/bsd-user/main.c b/bsd-user/main.c
52
69
index XXXXXXX..XXXXXXX 100644
53
static bool fold_or(OptContext *ctx, TCGOp *op)
70
--- a/bsd-user/main.c
54
{
71
+++ b/bsd-user/main.c
55
- return fold_const2(ctx, op);
72
@@ -XXX,XX +XXX,XX @@
56
+ if (fold_const2(ctx, op) ||
73
int singlestep;
57
+ fold_xx_to_x(ctx, op)) {
74
uintptr_t guest_base;
58
+ return true;
75
bool have_guest_base;
59
+ }
76
+#ifndef guest_base_signed_addr32
60
+ return false;
77
+bool guest_base_signed_addr32;
61
}
78
+#endif
62
79
+
63
static bool fold_orc(OptContext *ctx, TCGOp *op)
80
/*
64
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
81
* When running 32-on-64 we should make sure we can fit all of the possible
65
break;
82
* guest address space into a contiguous chunk of virtual host memory.
66
}
83
diff --git a/linux-user/main.c b/linux-user/main.c
67
84
index XXXXXXX..XXXXXXX 100644
68
- /* Simplify expression for "op r, a, a => mov r, a" cases */
85
--- a/linux-user/main.c
69
- switch (opc) {
86
+++ b/linux-user/main.c
70
- CASE_OP_32_64_VEC(or):
87
@@ -XXX,XX +XXX,XX @@ static const char *seed_optarg;
71
- CASE_OP_32_64_VEC(and):
88
unsigned long mmap_min_addr;
72
- if (args_are_copies(op->args[1], op->args[2])) {
89
uintptr_t guest_base;
73
- tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
90
bool have_guest_base;
74
- continue;
91
+#ifndef guest_base_signed_addr32
75
- }
92
+bool guest_base_signed_addr32;
76
- break;
93
+#endif
77
- default:
94
78
- break;
95
/*
79
- }
96
* Used to implement backwards-compatibility for the `-strace`, and
80
-
81
/*
82
* Process each opcode.
83
* Sorted alphabetically by opcode as much as possible.
84
--
97
--
85
2.25.1
98
2.25.1
86
99
87
100
1
Calls are special in that they have a variable number
1
When using reserved_va, which is the default for a 64-bit host
2
of arguments, and need to be able to clobber globals.
2
and a 32-bit guest, set guest_base_signed_addr32 if requested
3
by TCG_TARGET_SIGNED_ADDR32, and the executable layout allows.
3
4
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
---
8
tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
9
include/exec/cpu-all.h | 4 ---
9
1 file changed, 41 insertions(+), 22 deletions(-)
10
linux-user/elfload.c | 62 ++++++++++++++++++++++++++++++++++--------
11
2 files changed, 50 insertions(+), 16 deletions(-)
10
12
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
13
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
12
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
15
--- a/include/exec/cpu-all.h
14
+++ b/tcg/optimize.c
16
+++ b/include/exec/cpu-all.h
15
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
17
@@ -XXX,XX +XXX,XX @@ extern const TargetPageBits target_page;
18
#define PAGE_RESET 0x0040
19
/* For linux-user, indicates that the page is MAP_ANON. */
20
#define PAGE_ANON 0x0080
21
-
22
-#if defined(CONFIG_BSD) && defined(CONFIG_USER_ONLY)
23
-/* FIXME: Code that sets/uses this is broken and needs to go away. */
24
#define PAGE_RESERVED 0x0100
25
-#endif
26
/* Target-specific bits that will be used via page_get_flags(). */
27
#define PAGE_TARGET_1 0x0200
28
#define PAGE_TARGET_2 0x0400
29
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
30
index XXXXXXX..XXXXXXX 100644
31
--- a/linux-user/elfload.c
32
+++ b/linux-user/elfload.c
33
@@ -XXX,XX +XXX,XX @@ static void pgb_dynamic(const char *image_name, long align)
34
static void pgb_reserved_va(const char *image_name, abi_ulong guest_loaddr,
35
abi_ulong guest_hiaddr, long align)
36
{
37
- int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE;
38
+ int flags = (MAP_ANONYMOUS | MAP_PRIVATE |
39
+ MAP_NORESERVE | MAP_FIXED_NOREPLACE);
40
+ unsigned long local_rva = reserved_va;
41
+ bool protect_wrap = false;
42
void *addr, *test;
43
44
- if (guest_hiaddr > reserved_va) {
45
+ if (guest_hiaddr > local_rva) {
46
error_report("%s: requires more than reserved virtual "
47
"address space (0x%" PRIx64 " > 0x%lx)",
48
- image_name, (uint64_t)guest_hiaddr, reserved_va);
49
+ image_name, (uint64_t)guest_hiaddr, local_rva);
50
exit(EXIT_FAILURE);
16
}
51
}
17
}
52
18
53
- /* Widen the "image" to the entire reserved address space. */
19
+static bool fold_call(OptContext *ctx, TCGOp *op)
54
- pgb_static(image_name, 0, reserved_va, align);
20
+{
55
+ if (TCG_TARGET_SIGNED_ADDR32 && TARGET_LONG_BITS == 32) {
21
+ TCGContext *s = ctx->tcg;
56
+ if (guest_loaddr < 0x80000000u && guest_hiaddr > 0x80000000u) {
22
+ int nb_oargs = TCGOP_CALLO(op);
57
+ /*
23
+ int nb_iargs = TCGOP_CALLI(op);
58
+ * The executable itself wraps on signed addresses.
24
+ int flags, i;
59
+ * Without per-page translation, we must keep the
25
+
60
+ * guest address 0x7fff_ffff adjacent to 0x8000_0000
26
+ init_arguments(ctx, op, nb_oargs + nb_iargs);
61
+ * consecutive in host memory: unsigned addresses.
27
+ copy_propagate(ctx, op, nb_oargs, nb_iargs);
62
+ */
28
+
63
+ } else {
29
+ /* If the function reads or writes globals, reset temp data. */
64
+ set_guest_base_signed_addr32();
30
+ flags = tcg_call_flags(op);
65
+ if (local_rva <= 0x80000000u) {
31
+ if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
66
+ /* No guest addresses are "negative": win! */
32
+ int nb_globals = s->nb_globals;
67
+ } else {
33
+
68
+ /* Begin by allocating the entire address space. */
34
+ for (i = 0; i < nb_globals; i++) {
69
+ local_rva = 0xfffffffful + 1;
35
+ if (test_bit(i, ctx->temps_used.l)) {
70
+ protect_wrap = true;
36
+ reset_ts(&ctx->tcg->temps[i]);
37
+ }
71
+ }
38
+ }
72
+ }
39
+ }
73
+ }
40
+
74
41
+ /* Reset temp data for outputs. */
75
- /* osdep.h defines this as 0 if it's missing */
42
+ for (i = 0; i < nb_oargs; i++) {
76
- flags |= MAP_FIXED_NOREPLACE;
43
+ reset_temp(op->args[i]);
77
+ /* Widen the "image" to the entire reserved address space. */
78
+ pgb_static(image_name, 0, local_rva, align);
79
+ assert(guest_base != 0);
80
81
/* Reserve the memory on the host. */
82
- assert(guest_base != 0);
83
test = g2h_untagged(0);
84
- addr = mmap(test, reserved_va, PROT_NONE, flags, -1, 0);
85
+ addr = mmap(test, local_rva, PROT_NONE, flags, -1, 0);
86
if (addr == MAP_FAILED || addr != test) {
87
+ /*
88
+ * If protect_wrap, we could try again with the original reserved_va
89
+ * setting, but the edge case of low ulimit vm setting on a 64-bit
90
+ * host is probably useless.
91
+ */
92
error_report("Unable to reserve 0x%lx bytes of virtual address "
93
- "space at %p (%s) for use as guest address space (check your"
94
- "virtual memory ulimit setting, min_mmap_addr or reserve less "
95
- "using -R option)", reserved_va, test, strerror(errno));
96
+ "space at %p (%s) for use as guest address space "
97
+ "(check your virtual memory ulimit setting, "
98
+ "min_mmap_addr or reserve less using -R option)",
99
+ local_rva, test, strerror(errno));
100
exit(EXIT_FAILURE);
101
}
102
103
+ if (protect_wrap) {
104
+ /*
105
+ * Prevent the page just before 0x80000000 from being allocated.
106
+ * This prevents a single guest object/allocation from crossing
107
+ * the signed wrap, and thus being discontiguous in host memory.
108
+ */
109
+ page_set_flags(0x7fffffff & TARGET_PAGE_MASK, 0x80000000u,
110
+ PAGE_RESERVED);
111
+ /* Adjust guest_base so that 0 is in the middle of the reservation. */
112
+ guest_base += 0x80000000ul;
44
+ }
113
+ }
45
+
114
+
46
+ /* Stop optimizing MB across calls. */
115
qemu_log_mask(CPU_LOG_PAGE, "%s: base @ %p for %lu bytes\n",
47
+ ctx->prev_mb = NULL;
116
__func__, addr, reserved_va);
48
+ return true;
117
}
49
+}
50
+
51
/* Propagate constants and copies, fold constant expressions. */
52
void tcg_optimize(TCGContext *s)
53
{
54
- int nb_temps, nb_globals, i;
55
+ int nb_temps, i;
56
TCGOp *op, *op_next;
57
OptContext ctx = { .tcg = s };
58
59
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
60
available through the doubly linked circular list. */
61
62
nb_temps = s->nb_temps;
63
- nb_globals = s->nb_globals;
64
-
65
for (i = 0; i < nb_temps; ++i) {
66
s->temps[i].state_ptr = NULL;
67
}
68
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
69
uint64_t z_mask, partmask, affected, tmp;
70
int nb_oargs, nb_iargs;
71
TCGOpcode opc = op->opc;
72
- const TCGOpDef *def = &tcg_op_defs[opc];
73
+ const TCGOpDef *def;
74
75
- /* Count the arguments, and initialize the temps that are
76
- going to be used */
77
+ /* Calls are special. */
78
if (opc == INDEX_op_call) {
79
- nb_oargs = TCGOP_CALLO(op);
80
- nb_iargs = TCGOP_CALLI(op);
81
- } else {
82
- nb_oargs = def->nb_oargs;
83
- nb_iargs = def->nb_iargs;
84
+ fold_call(&ctx, op);
85
+ continue;
86
}
87
+
88
+ def = &tcg_op_defs[opc];
89
+ nb_oargs = def->nb_oargs;
90
+ nb_iargs = def->nb_iargs;
91
init_arguments(&ctx, op, nb_oargs + nb_iargs);
92
copy_propagate(&ctx, op, nb_oargs, nb_iargs);
93
94
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
95
if (def->flags & TCG_OPF_BB_END) {
96
memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
97
} else {
98
- if (opc == INDEX_op_call &&
99
- !(tcg_call_flags(op)
100
- & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
101
- for (i = 0; i < nb_globals; i++) {
102
- if (test_bit(i, ctx.temps_used.l)) {
103
- reset_ts(&s->temps[i]);
104
- }
105
- }
106
- }
107
-
108
for (i = 0; i < nb_oargs; i++) {
109
reset_temp(op->args[i]);
110
/* Save the corresponding known-zero bits mask for the
111
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
112
case INDEX_op_qemu_st_i32:
113
case INDEX_op_qemu_st8_i32:
114
case INDEX_op_qemu_st_i64:
115
- case INDEX_op_call:
116
/* Opcodes that touch guest memory stop the optimization. */
117
ctx.prev_mb = NULL;
118
break;
119
--
118
--
120
2.25.1
119
2.25.1
121
120
122
121
1
Continue splitting tcg_optimize.
1
AArch64 has both sign and zero-extending addressing modes, which
2
means that either treatment of guest addresses is equally efficient.
3
Enabling this for AArch64 gives us testing of the feature in CI.
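For reference, the two register-offset forms this relies on (a sketch; the "option" field of the load/store encoding selects the extension):

    /*
     *   ldr w0, [x1, w2, uxtw]    ; x1 + zero_extend(w2) -- unsigned addresses
     *   ldr w0, [x1, w2, sxtw]    ; x1 + sign_extend(w2) -- signed addresses
     * Both forms cost the same, so the choice of TCG_TARGET_SIGNED_ADDR32
     * is free on this host.
     */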
2
4
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
7
---
8
tcg/optimize.c | 22 ++++++++++++++--------
8
tcg/aarch64/tcg-target-sa32.h | 8 +++-
9
1 file changed, 14 insertions(+), 8 deletions(-)
9
tcg/aarch64/tcg-target.c.inc | 81 ++++++++++++++++++++++++-----------
10
2 files changed, 64 insertions(+), 25 deletions(-)
10
11
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
12
diff --git a/tcg/aarch64/tcg-target-sa32.h b/tcg/aarch64/tcg-target-sa32.h
12
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
14
--- a/tcg/aarch64/tcg-target-sa32.h
14
+++ b/tcg/optimize.c
15
+++ b/tcg/aarch64/tcg-target-sa32.h
15
@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
16
@@ -1 +1,7 @@
16
}
17
-#define TCG_TARGET_SIGNED_ADDR32 0
17
}
18
+/*
18
19
+ * AArch64 has both SXTW and UXTW addressing modes, which means that
19
+static void copy_propagate(OptContext *ctx, TCGOp *op,
20
+ * it is agnostic to how guest addresses should be represented.
20
+ int nb_oargs, int nb_iargs)
21
+ * Because aarch64 is more common than the other hosts that will
22
+ * want to use this feature, enable it for continuous testing.
23
+ */
24
+#define TCG_TARGET_SIGNED_ADDR32 1
25
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
26
index XXXXXXX..XXXXXXX 100644
27
--- a/tcg/aarch64/tcg-target.c.inc
28
+++ b/tcg/aarch64/tcg-target.c.inc
29
@@ -XXX,XX +XXX,XX @@ typedef enum {
30
LDST_LD_S_W = 3, /* load and sign-extend into Wt */
31
} AArch64LdstType;
32
33
+/*
34
+ * See aarch64/instrs/extendreg/DecodeRegExtend
35
+ * But note that option<1> == 0 is UNDEFINED for LDR/STR.
36
+ */
37
+typedef enum {
38
+ LDST_EXT_UXTW = 2, /* zero-extend from uint32_t */
39
+ LDST_EXT_UXTX = 3, /* zero-extend from uint64_t (i.e. no extension) */
40
+ LDST_EXT_SXTW = 6, /* sign-extend from int32_t */
41
+} AArch64LdstExt;
42
+
43
/* We encode the format of the insn into the beginning of the name, so that
44
we can have the preprocessor help "typecheck" the insn vs the output
45
function. Arm didn't provide us with nice names for the formats, so we
46
@@ -XXX,XX +XXX,XX @@ static void tcg_out_insn_3617(TCGContext *s, AArch64Insn insn, bool q,
47
}
48
49
static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn,
50
- TCGReg rd, TCGReg base, TCGType ext,
51
+ TCGReg rd, TCGReg base, AArch64LdstExt option,
52
TCGReg regoff)
53
{
54
/* Note the AArch64Insn constants above are for C3.3.12. Adjust. */
55
tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 |
56
- 0x4000 | ext << 13 | base << 5 | (rd & 0x1f));
57
+ option << 13 | base << 5 | (rd & 0x1f));
58
}
59
60
static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn,
61
@@ -XXX,XX +XXX,XX @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd,
62
63
/* Worst-case scenario, move offset to temp register, use reg offset. */
64
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, offset);
65
- tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP);
66
+ tcg_out_ldst_r(s, insn, rd, rn, LDST_EXT_UXTX, TCG_REG_TMP);
67
}
68
69
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
70
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
71
72
static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp memop, TCGType ext,
73
TCGReg data_r, TCGReg addr_r,
74
- TCGType otype, TCGReg off_r)
75
+ AArch64LdstExt option, TCGReg off_r)
76
{
77
switch (memop & MO_SSIZE) {
78
case MO_UB:
79
- tcg_out_ldst_r(s, I3312_LDRB, data_r, addr_r, otype, off_r);
80
+ tcg_out_ldst_r(s, I3312_LDRB, data_r, addr_r, option, off_r);
81
break;
82
case MO_SB:
83
tcg_out_ldst_r(s, ext ? I3312_LDRSBX : I3312_LDRSBW,
84
- data_r, addr_r, otype, off_r);
85
+ data_r, addr_r, option, off_r);
86
break;
87
case MO_UW:
88
- tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, otype, off_r);
89
+ tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, option, off_r);
90
break;
91
case MO_SW:
92
tcg_out_ldst_r(s, (ext ? I3312_LDRSHX : I3312_LDRSHW),
93
- data_r, addr_r, otype, off_r);
94
+ data_r, addr_r, option, off_r);
95
break;
96
case MO_UL:
97
- tcg_out_ldst_r(s, I3312_LDRW, data_r, addr_r, otype, off_r);
98
+ tcg_out_ldst_r(s, I3312_LDRW, data_r, addr_r, option, off_r);
99
break;
100
case MO_SL:
101
- tcg_out_ldst_r(s, I3312_LDRSWX, data_r, addr_r, otype, off_r);
102
+ tcg_out_ldst_r(s, I3312_LDRSWX, data_r, addr_r, option, off_r);
103
break;
104
case MO_UQ:
105
- tcg_out_ldst_r(s, I3312_LDRX, data_r, addr_r, otype, off_r);
106
+ tcg_out_ldst_r(s, I3312_LDRX, data_r, addr_r, option, off_r);
107
break;
108
default:
109
tcg_abort();
110
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp memop, TCGType ext,
111
112
static void tcg_out_qemu_st_direct(TCGContext *s, MemOp memop,
113
TCGReg data_r, TCGReg addr_r,
114
- TCGType otype, TCGReg off_r)
115
+ AArch64LdstExt option, TCGReg off_r)
116
{
117
switch (memop & MO_SIZE) {
118
case MO_8:
119
- tcg_out_ldst_r(s, I3312_STRB, data_r, addr_r, otype, off_r);
120
+ tcg_out_ldst_r(s, I3312_STRB, data_r, addr_r, option, off_r);
121
break;
122
case MO_16:
123
- tcg_out_ldst_r(s, I3312_STRH, data_r, addr_r, otype, off_r);
124
+ tcg_out_ldst_r(s, I3312_STRH, data_r, addr_r, option, off_r);
125
break;
126
case MO_32:
127
- tcg_out_ldst_r(s, I3312_STRW, data_r, addr_r, otype, off_r);
128
+ tcg_out_ldst_r(s, I3312_STRW, data_r, addr_r, option, off_r);
129
break;
130
case MO_64:
131
- tcg_out_ldst_r(s, I3312_STRX, data_r, addr_r, otype, off_r);
132
+ tcg_out_ldst_r(s, I3312_STRX, data_r, addr_r, option, off_r);
133
break;
134
default:
135
tcg_abort();
136
}
137
}
138
139
+/*
140
+ * Bits for the option field of LDR/STR (register),
141
+ * for application to a guest address.
142
+ */
143
+static AArch64LdstExt ldst_ext_option(void)
21
+{
144
+{
22
+ TCGContext *s = ctx->tcg;
145
+#ifdef CONFIG_USER_ONLY
146
+ bool signed_addr32 = guest_base_signed_addr32;
147
+#else
148
+ bool signed_addr32 = TCG_TARGET_SIGNED_ADDR32;
149
+#endif
23
+
150
+
24
+ for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
151
+ if (TARGET_LONG_BITS == 64) {
25
+ TCGTemp *ts = arg_temp(op->args[i]);
152
+ return LDST_EXT_UXTX;
26
+ if (ts && ts_is_copy(ts)) {
153
+ } else if (signed_addr32) {
27
+ op->args[i] = temp_arg(find_better_copy(s, ts));
154
+ return LDST_EXT_SXTW;
28
+ }
155
+ } else {
156
+ return LDST_EXT_UXTW;
29
+ }
157
+ }
30
+}
158
+}
31
+
159
+
32
/* Propagate constants and copies, fold constant expressions. */
160
static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
33
void tcg_optimize(TCGContext *s)
161
MemOpIdx oi, TCGType ext)
34
{
162
{
35
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
163
MemOp memop = get_memop(oi);
36
nb_iargs = def->nb_iargs;
164
- const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
37
}
165
+ AArch64LdstExt option = ldst_ext_option();
38
init_arguments(&ctx, op, nb_oargs + nb_iargs);
166
39
-
167
/* Byte swapping is left to middle-end expansion. */
40
- /* Do copy propagation */
168
tcg_debug_assert((memop & MO_BSWAP) == 0);
41
- for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
169
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
42
- TCGTemp *ts = arg_temp(op->args[i]);
170
43
- if (ts && ts_is_copy(ts)) {
171
tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, mem_index, 1);
44
- op->args[i] = temp_arg(find_better_copy(s, ts));
172
tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
45
- }
173
- TCG_REG_X1, otype, addr_reg);
46
- }
174
+ TCG_REG_X1, option, addr_reg);
47
+ copy_propagate(&ctx, op, nb_oargs, nb_iargs);
175
add_qemu_ldst_label(s, true, oi, ext, data_reg, addr_reg,
48
176
s->code_ptr, label_ptr);
49
/* For commutative operations make constant second argument */
177
#else /* !CONFIG_SOFTMMU */
50
switch (opc) {
178
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
179
}
180
if (USE_GUEST_BASE) {
181
tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
182
- TCG_REG_GUEST_BASE, otype, addr_reg);
183
+ TCG_REG_GUEST_BASE, option, addr_reg);
184
} else {
185
+ /* This case is always a 64-bit guest with no extension. */
186
tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
187
- addr_reg, TCG_TYPE_I64, TCG_REG_XZR);
188
+ addr_reg, LDST_EXT_UXTX, TCG_REG_XZR);
189
}
190
#endif /* CONFIG_SOFTMMU */
191
}
192
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
193
MemOpIdx oi)
194
{
195
MemOp memop = get_memop(oi);
196
- const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
197
+ AArch64LdstExt option = ldst_ext_option();
198
199
/* Byte swapping is left to middle-end expansion. */
200
tcg_debug_assert((memop & MO_BSWAP) == 0);
201
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
202
203
tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, mem_index, 0);
204
tcg_out_qemu_st_direct(s, memop, data_reg,
205
- TCG_REG_X1, otype, addr_reg);
206
+ TCG_REG_X1, option, addr_reg);
207
add_qemu_ldst_label(s, false, oi, (memop & MO_SIZE)== MO_64,
208
data_reg, addr_reg, s->code_ptr, label_ptr);
209
#else /* !CONFIG_SOFTMMU */
210
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
211
}
212
if (USE_GUEST_BASE) {
213
tcg_out_qemu_st_direct(s, memop, data_reg,
214
- TCG_REG_GUEST_BASE, otype, addr_reg);
215
+ TCG_REG_GUEST_BASE, option, addr_reg);
216
} else {
217
+ /* This case is always a 64-bit guest with no extension. */
218
tcg_out_qemu_st_direct(s, memop, data_reg,
219
- addr_reg, TCG_TYPE_I64, TCG_REG_XZR);
220
+ addr_reg, LDST_EXT_UXTX, TCG_REG_XZR);
221
}
222
#endif /* CONFIG_SOFTMMU */
223
}
51
--
2.25.1

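For reference, the UXTW/SXTW distinction used in the aarch64 patch above comes down to how a 32-bit guest address is widened before the base register is added. A minimal standalone C sketch, not part of the series, with an arbitrary address value:

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone sketch only: the two ways a 32-bit guest address can be
 * widened before adding the base register, corresponding to the UXTW
 * and SXTW option values selected by ldst_ext_option() above.
 */
int main(void)
{
    uint32_t guest = 0x90000000u;                       /* high bit set */
    uint64_t uxtw = (uint64_t)guest;                    /* zero-extend  */
    uint64_t sxtw = (uint64_t)(int64_t)(int32_t)guest;  /* sign-extend  */

    printf("UXTW: %#018llx\n", (unsigned long long)uxtw);
    printf("SXTW: %#018llx\n", (unsigned long long)sxtw);
    return 0;
}
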
Deleted patch
1
Rather than try to keep these up-to-date across folding,
2
re-read nb_oargs at the end, after re-reading the opcode.
3
1
4
A couple of asserts need dropping, but that will take care
5
of itself as we split the function further.
6
7
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
8
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
9
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
---
11
tcg/optimize.c | 14 ++++----------
12
1 file changed, 4 insertions(+), 10 deletions(-)
13
14
diff --git a/tcg/optimize.c b/tcg/optimize.c
15
index XXXXXXX..XXXXXXX 100644
16
--- a/tcg/optimize.c
17
+++ b/tcg/optimize.c
18
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
19
20
QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
21
uint64_t z_mask, partmask, affected, tmp;
22
- int nb_oargs, nb_iargs;
23
TCGOpcode opc = op->opc;
24
const TCGOpDef *def;
25
26
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
27
}
28
29
def = &tcg_op_defs[opc];
30
- nb_oargs = def->nb_oargs;
31
- nb_iargs = def->nb_iargs;
32
- init_arguments(&ctx, op, nb_oargs + nb_iargs);
33
- copy_propagate(&ctx, op, nb_oargs, nb_iargs);
34
+ init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
35
+ copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
36
37
/* For commutative operations make constant second argument */
38
switch (opc) {
39
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
40
41
CASE_OP_32_64(qemu_ld):
42
{
43
- MemOpIdx oi = op->args[nb_oargs + nb_iargs];
44
+ MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
45
MemOp mop = get_memop(oi);
46
if (!(mop & MO_SIGN)) {
47
z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
48
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
49
}
50
51
if (partmask == 0) {
52
- tcg_debug_assert(nb_oargs == 1);
53
tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
54
continue;
55
}
56
if (affected == 0) {
57
- tcg_debug_assert(nb_oargs == 1);
58
tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
59
continue;
60
}
61
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
62
} else if (args_are_copies(op->args[1], op->args[2])) {
63
op->opc = INDEX_op_dup_vec;
64
TCGOP_VECE(op) = MO_32;
65
- nb_iargs = 1;
66
}
67
break;
68
69
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
70
op->opc = opc = (opc == INDEX_op_movcond_i32
71
? INDEX_op_setcond_i32
72
: INDEX_op_setcond_i64);
73
- nb_iargs = 2;
74
}
75
break;
76
77
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
78
if (def->flags & TCG_OPF_BB_END) {
79
memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
80
} else {
81
+ int nb_oargs = def->nb_oargs;
82
for (i = 0; i < nb_oargs; i++) {
83
reset_temp(op->args[i]);
84
/* Save the corresponding known-zero bits mask for the
85
--
86
2.25.1
87
88
1
This will allow callers to tail call to these functions
1
All 32-bit mips operations sign-extend the output, so we are easily
2
and return true indicating processing complete.
2
able to keep TCG_TYPE_I32 values sign-extended in host registers.
3
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
6
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
4
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
6
---
9
tcg/optimize.c | 9 +++++----
7
tcg/mips/tcg-target-sa32.h | 8 ++++++++
10
1 file changed, 5 insertions(+), 4 deletions(-)
8
tcg/mips/tcg-target.c.inc | 10 ++--------
9
2 files changed, 10 insertions(+), 8 deletions(-)
11
10
12
diff --git a/tcg/optimize.c b/tcg/optimize.c
11
diff --git a/tcg/mips/tcg-target-sa32.h b/tcg/mips/tcg-target-sa32.h
13
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/optimize.c
13
--- a/tcg/mips/tcg-target-sa32.h
15
+++ b/tcg/optimize.c
14
+++ b/tcg/mips/tcg-target-sa32.h
16
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
15
@@ -1 +1,9 @@
17
return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
16
+/*
18
}
17
+ * Do not set TCG_TARGET_SIGNED_ADDR32 for mips32;
19
18
+ * TCG expects this to only be set for 64-bit hosts.
20
-static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
19
+ */
21
+static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
20
+#ifdef __mips64
22
{
21
+#define TCG_TARGET_SIGNED_ADDR32 1
23
TCGTemp *dst_ts = arg_temp(dst);
22
+#else
24
TCGTemp *src_ts = arg_temp(src);
23
#define TCG_TARGET_SIGNED_ADDR32 0
25
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
24
+#endif
26
25
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
27
if (ts_are_copies(dst_ts, src_ts)) {
26
index XXXXXXX..XXXXXXX 100644
28
tcg_op_remove(ctx->tcg, op);
27
--- a/tcg/mips/tcg-target.c.inc
29
- return;
28
+++ b/tcg/mips/tcg-target.c.inc
30
+ return true;
29
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
30
TCG_TMP0, TCG_TMP3, cmp_off);
31
}
31
}
32
32
33
reset_ts(dst_ts);
33
- /* Zero extend a 32-bit guest address for a 64-bit host. */
34
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
34
- if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
35
di->is_const = si->is_const;
35
- tcg_out_ext32u(s, base, addrl);
36
di->val = si->val;
36
- addrl = base;
37
- }
38
-
39
/*
40
* Mask the page bits, keeping the alignment bits to compare against.
41
* For unaligned accesses, compare against the end of the access to
42
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
43
data_regl, data_regh, addr_regl, addr_regh,
44
s->code_ptr, label_ptr);
45
#else
46
- if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
47
+ if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS && !guest_base_signed_addr32) {
48
tcg_out_ext32u(s, base, addr_regl);
49
addr_regl = base;
37
}
50
}
38
+ return true;
51
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
39
}
52
data_regl, data_regh, addr_regl, addr_regh,
40
53
s->code_ptr, label_ptr);
41
-static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
54
#else
42
+static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
55
- if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
43
TCGArg dst, uint64_t val)
56
+ if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS && !guest_base_signed_addr32) {
44
{
57
tcg_out_ext32u(s, base, addr_regl);
45
const TCGOpDef *def = &tcg_op_defs[op->opc];
58
addr_regl = base;
46
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
59
}
47
/* Convert movi to mov with constant temp. */
48
tv = tcg_constant_internal(type, val);
49
init_ts_info(ctx, tv);
50
- tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
51
+ return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
52
}
53
54
static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
55
--
60
--
56
2.25.1
61
2.25.1
57
62
58
63
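A small standalone illustration of the point made in the mips commit message above: a 32-bit value kept sign-extended in a 64-bit register is already the representation the signed-address convention wants, so the explicit ext32u can be skipped. Plain C, independent of the QEMU sources:

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone sketch only: on a 64-bit MIPS host a 32-bit result is kept
 * sign-extended in its register, which matches the signed-address
 * convention, so the ext32u in the hunks above becomes unnecessary.
 */
int main(void)
{
    int32_t addr32 = (int32_t)0xdeadbeef;        /* 32-bit guest address  */
    int64_t in_reg = addr32;                     /* canonical mips64 form */
    uint64_t ext32u = (uint32_t)addr32;          /* what ext32u produces  */

    printf("sign-extended: %#018llx\n", (unsigned long long)(uint64_t)in_reg);
    printf("zero-extended: %#018llx\n", (unsigned long long)ext32u);
    return 0;
}
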
Deleted patch
1
Copy z_mask into OptContext, for writeback to the
2
first output within the new function.
3
1
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
9
1 file changed, 33 insertions(+), 16 deletions(-)
10
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
12
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
14
+++ b/tcg/optimize.c
15
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
16
TCGContext *tcg;
17
TCGOp *prev_mb;
18
TCGTempSet temps_used;
19
+
20
+ /* In flight values from optimization. */
21
+ uint64_t z_mask;
22
} OptContext;
23
24
static inline TempOptInfo *ts_info(TCGTemp *ts)
25
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
26
}
27
}
28
29
+static void finish_folding(OptContext *ctx, TCGOp *op)
30
+{
31
+ const TCGOpDef *def = &tcg_op_defs[op->opc];
32
+ int i, nb_oargs;
33
+
34
+ /*
35
+ * For an opcode that ends a BB, reset all temp data.
36
+ * We do no cross-BB optimization.
37
+ */
38
+ if (def->flags & TCG_OPF_BB_END) {
39
+ memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
40
+ ctx->prev_mb = NULL;
41
+ return;
42
+ }
43
+
44
+ nb_oargs = def->nb_oargs;
45
+ for (i = 0; i < nb_oargs; i++) {
46
+ reset_temp(op->args[i]);
47
+ /*
48
+ * Save the corresponding known-zero bits mask for the
49
+ * first output argument (only one supported so far).
50
+ */
51
+ if (i == 0) {
52
+ arg_info(op->args[i])->z_mask = ctx->z_mask;
53
+ }
54
+ }
55
+}
56
+
57
static bool fold_call(OptContext *ctx, TCGOp *op)
58
{
59
TCGContext *s = ctx->tcg;
60
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
61
partmask &= 0xffffffffu;
62
affected &= 0xffffffffu;
63
}
64
+ ctx.z_mask = z_mask;
65
66
if (partmask == 0) {
67
tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
68
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
69
break;
70
}
71
72
- /* Some of the folding above can change opc. */
73
- opc = op->opc;
74
- def = &tcg_op_defs[opc];
75
- if (def->flags & TCG_OPF_BB_END) {
76
- memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
77
- } else {
78
- int nb_oargs = def->nb_oargs;
79
- for (i = 0; i < nb_oargs; i++) {
80
- reset_temp(op->args[i]);
81
- /* Save the corresponding known-zero bits mask for the
82
- first output argument (only one supported so far). */
83
- if (i == 0) {
84
- arg_info(op->args[i])->z_mask = z_mask;
85
- }
86
- }
87
- }
88
+ finish_folding(&ctx, op);
89
90
/* Eliminate duplicate and redundant fence instructions. */
91
if (ctx.prev_mb) {
92
--
93
2.25.1
94
95
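For readers new to this code, the z_mask copied into OptContext above is the per-temp mask described in the comment as the known-zero bits mask: a cleared bit is known to be zero, a set bit may be non-zero. The following tiny C sketch is a simplification, not the QEMU implementation, and only shows how narrowing operations shrink the mask so that later folds can exploit it.

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone sketch only: set bits in z_mask may be non-zero, cleared
 * bits are known zero.  A 16-bit zero-extend clears everything above
 * bit 15; AND-ing with a constant narrows the mask further.
 */
static uint64_t z_mask_ext16u(uint64_t z)            { return z & 0xffff; }
static uint64_t z_mask_and(uint64_t za, uint64_t zb) { return za & zb; }

int main(void)
{
    uint64_t z = ~UINT64_C(0);              /* nothing known about the input */

    z = z_mask_ext16u(z);                   /* after ext16u: 0xffff */
    printf("%#llx\n", (unsigned long long)z);
    printf("%#llx\n", (unsigned long long)z_mask_and(z, 0xff));  /* 0xff */
    return 0;
}
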
Deleted patch
1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
3
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
tcg/optimize.c | 9 ++++++---
7
1 file changed, 6 insertions(+), 3 deletions(-)
8
1
9
diff --git a/tcg/optimize.c b/tcg/optimize.c
10
index XXXXXXX..XXXXXXX 100644
11
--- a/tcg/optimize.c
12
+++ b/tcg/optimize.c
13
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
14
uint64_t z_mask, partmask, affected, tmp;
15
TCGOpcode opc = op->opc;
16
const TCGOpDef *def;
17
+ bool done = false;
18
19
/* Calls are special. */
20
if (opc == INDEX_op_call) {
21
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
22
allocator where needed and possible. Also detect copies. */
23
switch (opc) {
24
CASE_OP_32_64_VEC(mov):
25
- tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
26
- continue;
27
+ done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
28
+ break;
29
30
case INDEX_op_dup_vec:
31
if (arg_is_const(op->args[1])) {
32
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
33
break;
34
}
35
36
- finish_folding(&ctx, op);
37
+ if (!done) {
38
+ finish_folding(&ctx, op);
39
+ }
40
41
/* Eliminate duplicate and redundant fence instructions. */
42
if (ctx.prev_mb) {
43
--
44
2.25.1
45
46
Deleted patch
1
This puts the separate mb optimization into the same framework
2
as the others. While fold_qemu_{ld,st} are currently identical,
3
that won't last as more code gets moved.
4
1
5
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
6
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
9
tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
10
1 file changed, 51 insertions(+), 38 deletions(-)
11
12
diff --git a/tcg/optimize.c b/tcg/optimize.c
13
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/optimize.c
15
+++ b/tcg/optimize.c
16
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
17
return true;
18
}
19
20
+static bool fold_mb(OptContext *ctx, TCGOp *op)
21
+{
22
+ /* Eliminate duplicate and redundant fence instructions. */
23
+ if (ctx->prev_mb) {
24
+ /*
25
+ * Merge two barriers of the same type into one,
26
+ * or a weaker barrier into a stronger one,
27
+ * or two weaker barriers into a stronger one.
28
+ * mb X; mb Y => mb X|Y
29
+ * mb; strl => mb; st
30
+ * ldaq; mb => ld; mb
31
+ * ldaq; strl => ld; mb; st
32
+ * Other combinations are also merged into a strong
33
+ * barrier. This is stricter than specified but for
34
+ * the purposes of TCG is better than not optimizing.
35
+ */
36
+ ctx->prev_mb->args[0] |= op->args[0];
37
+ tcg_op_remove(ctx->tcg, op);
38
+ } else {
39
+ ctx->prev_mb = op;
40
+ }
41
+ return true;
42
+}
43
+
44
+static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
45
+{
46
+ /* Opcodes that touch guest memory stop the mb optimization. */
47
+ ctx->prev_mb = NULL;
48
+ return false;
49
+}
50
+
51
+static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
52
+{
53
+ /* Opcodes that touch guest memory stop the mb optimization. */
54
+ ctx->prev_mb = NULL;
55
+ return false;
56
+}
57
+
58
/* Propagate constants and copies, fold constant expressions. */
59
void tcg_optimize(TCGContext *s)
60
{
61
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
62
}
63
break;
64
65
+ case INDEX_op_mb:
66
+ done = fold_mb(&ctx, op);
67
+ break;
68
+ case INDEX_op_qemu_ld_i32:
69
+ case INDEX_op_qemu_ld_i64:
70
+ done = fold_qemu_ld(&ctx, op);
71
+ break;
72
+ case INDEX_op_qemu_st_i32:
73
+ case INDEX_op_qemu_st8_i32:
74
+ case INDEX_op_qemu_st_i64:
75
+ done = fold_qemu_st(&ctx, op);
76
+ break;
77
+
78
default:
79
break;
80
}
81
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
82
if (!done) {
83
finish_folding(&ctx, op);
84
}
85
-
86
- /* Eliminate duplicate and redundant fence instructions. */
87
- if (ctx.prev_mb) {
88
- switch (opc) {
89
- case INDEX_op_mb:
90
- /* Merge two barriers of the same type into one,
91
- * or a weaker barrier into a stronger one,
92
- * or two weaker barriers into a stronger one.
93
- * mb X; mb Y => mb X|Y
94
- * mb; strl => mb; st
95
- * ldaq; mb => ld; mb
96
- * ldaq; strl => ld; mb; st
97
- * Other combinations are also merged into a strong
98
- * barrier. This is stricter than specified but for
99
- * the purposes of TCG is better than not optimizing.
100
- */
101
- ctx.prev_mb->args[0] |= op->args[0];
102
- tcg_op_remove(s, op);
103
- break;
104
-
105
- default:
106
- /* Opcodes that end the block stop the optimization. */
107
- if ((def->flags & TCG_OPF_BB_END) == 0) {
108
- break;
109
- }
110
- /* fallthru */
111
- case INDEX_op_qemu_ld_i32:
112
- case INDEX_op_qemu_ld_i64:
113
- case INDEX_op_qemu_st_i32:
114
- case INDEX_op_qemu_st8_i32:
115
- case INDEX_op_qemu_st_i64:
116
- /* Opcodes that touch guest memory stop the optimization. */
117
- ctx.prev_mb = NULL;
118
- break;
119
- }
120
- } else if (opc == INDEX_op_mb) {
121
- ctx.prev_mb = op;
122
- }
123
}
124
}
125
--
126
2.25.1
127
128
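The barrier-merging rule that fold_mb() above carries over ("mb X; mb Y => mb X|Y") amounts to OR-ing the argument masks of two adjacent barriers. A standalone C sketch; the flag values are placeholders, not the real TCG_MO_*/TCG_BAR_* encoding:

#include <stdio.h>

/*
 * Standalone sketch only: merging two adjacent barriers by OR-ing their
 * argument masks, as fold_mb() does with prev_mb->args[0] |= op->args[0].
 */
enum { BAR_LD_LD = 1, BAR_LD_ST = 2, BAR_ST_LD = 4, BAR_ST_ST = 8 };

int main(void)
{
    unsigned prev = BAR_LD_LD | BAR_LD_ST;    /* earlier barrier          */
    unsigned next = BAR_ST_ST;                /* weaker following barrier */

    prev |= next;        /* mb X; mb Y  =>  mb X|Y (at least as strong)   */
    printf("merged barrier mask: %#x\n", prev);
    return 0;
}
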
Deleted patch
1
Reduce some code duplication by folding the NE and EQ cases.
2
1
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
8
1 file changed, 72 insertions(+), 73 deletions(-)
9
10
diff --git a/tcg/optimize.c b/tcg/optimize.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/optimize.c
13
+++ b/tcg/optimize.c
14
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
15
return fold_const2(ctx, op);
16
}
17
18
+static bool fold_setcond2(OptContext *ctx, TCGOp *op)
19
+{
20
+ TCGCond cond = op->args[5];
21
+ int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
22
+ int inv = 0;
23
+
24
+ if (i >= 0) {
25
+ goto do_setcond_const;
26
+ }
27
+
28
+ switch (cond) {
29
+ case TCG_COND_LT:
30
+ case TCG_COND_GE:
31
+ /*
32
+ * Simplify LT/GE comparisons vs zero to a single compare
33
+ * vs the high word of the input.
34
+ */
35
+ if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
36
+ arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
37
+ goto do_setcond_high;
38
+ }
39
+ break;
40
+
41
+ case TCG_COND_NE:
42
+ inv = 1;
43
+ QEMU_FALLTHROUGH;
44
+ case TCG_COND_EQ:
45
+ /*
46
+ * Simplify EQ/NE comparisons where one of the pairs
47
+ * can be simplified.
48
+ */
49
+ i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
50
+ op->args[3], cond);
51
+ switch (i ^ inv) {
52
+ case 0:
53
+ goto do_setcond_const;
54
+ case 1:
55
+ goto do_setcond_high;
56
+ }
57
+
58
+ i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
59
+ op->args[4], cond);
60
+ switch (i ^ inv) {
61
+ case 0:
62
+ goto do_setcond_const;
63
+ case 1:
64
+ op->args[2] = op->args[3];
65
+ op->args[3] = cond;
66
+ op->opc = INDEX_op_setcond_i32;
67
+ break;
68
+ }
69
+ break;
70
+
71
+ default:
72
+ break;
73
+
74
+ do_setcond_high:
75
+ op->args[1] = op->args[2];
76
+ op->args[2] = op->args[4];
77
+ op->args[3] = cond;
78
+ op->opc = INDEX_op_setcond_i32;
79
+ break;
80
+ }
81
+ return false;
82
+
83
+ do_setcond_const:
84
+ return tcg_opt_gen_movi(ctx, op, op->args[0], i);
85
+}
86
+
87
static bool fold_shift(OptContext *ctx, TCGOp *op)
88
{
89
return fold_const2(ctx, op);
90
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
91
}
92
break;
93
94
- case INDEX_op_setcond2_i32:
95
- i = do_constant_folding_cond2(&op->args[1], &op->args[3],
96
- op->args[5]);
97
- if (i >= 0) {
98
- do_setcond_const:
99
- tcg_opt_gen_movi(&ctx, op, op->args[0], i);
100
- continue;
101
- }
102
- if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
103
- && arg_is_const(op->args[3])
104
- && arg_info(op->args[3])->val == 0
105
- && arg_is_const(op->args[4])
106
- && arg_info(op->args[4])->val == 0) {
107
- /* Simplify LT/GE comparisons vs zero to a single compare
108
- vs the high word of the input. */
109
- do_setcond_high:
110
- reset_temp(op->args[0]);
111
- arg_info(op->args[0])->z_mask = 1;
112
- op->opc = INDEX_op_setcond_i32;
113
- op->args[1] = op->args[2];
114
- op->args[2] = op->args[4];
115
- op->args[3] = op->args[5];
116
- break;
117
- }
118
- if (op->args[5] == TCG_COND_EQ) {
119
- /* Simplify EQ comparisons where one of the pairs
120
- can be simplified. */
121
- i = do_constant_folding_cond(INDEX_op_setcond_i32,
122
- op->args[1], op->args[3],
123
- TCG_COND_EQ);
124
- if (i == 0) {
125
- goto do_setcond_const;
126
- } else if (i > 0) {
127
- goto do_setcond_high;
128
- }
129
- i = do_constant_folding_cond(INDEX_op_setcond_i32,
130
- op->args[2], op->args[4],
131
- TCG_COND_EQ);
132
- if (i == 0) {
133
- goto do_setcond_high;
134
- } else if (i < 0) {
135
- break;
136
- }
137
- do_setcond_low:
138
- reset_temp(op->args[0]);
139
- arg_info(op->args[0])->z_mask = 1;
140
- op->opc = INDEX_op_setcond_i32;
141
- op->args[2] = op->args[3];
142
- op->args[3] = op->args[5];
143
- break;
144
- }
145
- if (op->args[5] == TCG_COND_NE) {
146
- /* Simplify NE comparisons where one of the pairs
147
- can be simplified. */
148
- i = do_constant_folding_cond(INDEX_op_setcond_i32,
149
- op->args[1], op->args[3],
150
- TCG_COND_NE);
151
- if (i == 0) {
152
- goto do_setcond_high;
153
- } else if (i > 0) {
154
- goto do_setcond_const;
155
- }
156
- i = do_constant_folding_cond(INDEX_op_setcond_i32,
157
- op->args[2], op->args[4],
158
- TCG_COND_NE);
159
- if (i == 0) {
160
- goto do_setcond_low;
161
- } else if (i > 0) {
162
- goto do_setcond_const;
163
- }
164
- }
165
- break;
166
-
167
default:
168
break;
169
170
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
171
CASE_OP_32_64(shr):
172
done = fold_shift(&ctx, op);
173
break;
174
+ case INDEX_op_setcond2_i32:
175
+ done = fold_setcond2(&ctx, op);
176
+ break;
177
CASE_OP_32_64_VEC(sub):
178
done = fold_sub(&ctx, op);
179
break;
180
--
181
2.25.1
182
183
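The "i ^ inv" idiom introduced by fold_setcond2() above (and reused by fold_brcond2() in the next patch) is worth a side note. A partial constant comparison of one word pair yields 1, 0, or -1 (unknown); XOR-ing the known outcomes with inv==1 for TCG_COND_NE lets the EQ and NE cases share one switch. A standalone sketch of that routing, not the QEMU code:

#include <stdio.h>

/*
 * Standalone sketch only: classify a partial comparison result the way
 * fold_setcond2()/fold_brcond2() do, so EQ and NE share one code path.
 */
static const char *route(int partial, int inv)
{
    if (partial < 0) {
        return "not constant, keep both words";
    }
    switch (partial ^ inv) {
    case 0:  return "whole result is a constant";
    case 1:  return "reduce to a compare of the high word";
    default: return "unreachable";
    }
}

int main(void)
{
    printf("EQ, low words known equal:   %s\n", route(1, 0));
    printf("NE, low words known unequal: %s\n", route(1, 1));
    printf("NE, low words known equal:   %s\n", route(0, 1));
    return 0;
}
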
Deleted patch
1
Reduce some code duplication by folding the NE and EQ cases.
2
1
3
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
7
1 file changed, 81 insertions(+), 78 deletions(-)
8
9
diff --git a/tcg/optimize.c b/tcg/optimize.c
10
index XXXXXXX..XXXXXXX 100644
11
--- a/tcg/optimize.c
12
+++ b/tcg/optimize.c
13
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
14
return fold_const2(ctx, op);
15
}
16
17
+static bool fold_brcond2(OptContext *ctx, TCGOp *op)
18
+{
19
+ TCGCond cond = op->args[4];
20
+ int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
21
+ TCGArg label = op->args[5];
22
+ int inv = 0;
23
+
24
+ if (i >= 0) {
25
+ goto do_brcond_const;
26
+ }
27
+
28
+ switch (cond) {
29
+ case TCG_COND_LT:
30
+ case TCG_COND_GE:
31
+ /*
32
+ * Simplify LT/GE comparisons vs zero to a single compare
33
+ * vs the high word of the input.
34
+ */
35
+ if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
36
+ arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
37
+ goto do_brcond_high;
38
+ }
39
+ break;
40
+
41
+ case TCG_COND_NE:
42
+ inv = 1;
43
+ QEMU_FALLTHROUGH;
44
+ case TCG_COND_EQ:
45
+ /*
46
+ * Simplify EQ/NE comparisons where one of the pairs
47
+ * can be simplified.
48
+ */
49
+ i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
50
+ op->args[2], cond);
51
+ switch (i ^ inv) {
52
+ case 0:
53
+ goto do_brcond_const;
54
+ case 1:
55
+ goto do_brcond_high;
56
+ }
57
+
58
+ i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
59
+ op->args[3], cond);
60
+ switch (i ^ inv) {
61
+ case 0:
62
+ goto do_brcond_const;
63
+ case 1:
64
+ op->opc = INDEX_op_brcond_i32;
65
+ op->args[1] = op->args[2];
66
+ op->args[2] = cond;
67
+ op->args[3] = label;
68
+ break;
69
+ }
70
+ break;
71
+
72
+ default:
73
+ break;
74
+
75
+ do_brcond_high:
76
+ op->opc = INDEX_op_brcond_i32;
77
+ op->args[0] = op->args[1];
78
+ op->args[1] = op->args[3];
79
+ op->args[2] = cond;
80
+ op->args[3] = label;
81
+ break;
82
+
83
+ do_brcond_const:
84
+ if (i == 0) {
85
+ tcg_op_remove(ctx->tcg, op);
86
+ return true;
87
+ }
88
+ op->opc = INDEX_op_br;
89
+ op->args[0] = label;
90
+ break;
91
+ }
92
+ return false;
93
+}
94
+
95
static bool fold_call(OptContext *ctx, TCGOp *op)
96
{
97
TCGContext *s = ctx->tcg;
98
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
99
}
100
break;
101
102
- case INDEX_op_brcond2_i32:
103
- i = do_constant_folding_cond2(&op->args[0], &op->args[2],
104
- op->args[4]);
105
- if (i == 0) {
106
- do_brcond_false:
107
- tcg_op_remove(s, op);
108
- continue;
109
- }
110
- if (i > 0) {
111
- do_brcond_true:
112
- op->opc = opc = INDEX_op_br;
113
- op->args[0] = op->args[5];
114
- break;
115
- }
116
- if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
117
- && arg_is_const(op->args[2])
118
- && arg_info(op->args[2])->val == 0
119
- && arg_is_const(op->args[3])
120
- && arg_info(op->args[3])->val == 0) {
121
- /* Simplify LT/GE comparisons vs zero to a single compare
122
- vs the high word of the input. */
123
- do_brcond_high:
124
- op->opc = opc = INDEX_op_brcond_i32;
125
- op->args[0] = op->args[1];
126
- op->args[1] = op->args[3];
127
- op->args[2] = op->args[4];
128
- op->args[3] = op->args[5];
129
- break;
130
- }
131
- if (op->args[4] == TCG_COND_EQ) {
132
- /* Simplify EQ comparisons where one of the pairs
133
- can be simplified. */
134
- i = do_constant_folding_cond(INDEX_op_brcond_i32,
135
- op->args[0], op->args[2],
136
- TCG_COND_EQ);
137
- if (i == 0) {
138
- goto do_brcond_false;
139
- } else if (i > 0) {
140
- goto do_brcond_high;
141
- }
142
- i = do_constant_folding_cond(INDEX_op_brcond_i32,
143
- op->args[1], op->args[3],
144
- TCG_COND_EQ);
145
- if (i == 0) {
146
- goto do_brcond_false;
147
- } else if (i < 0) {
148
- break;
149
- }
150
- do_brcond_low:
151
- memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
152
- op->opc = INDEX_op_brcond_i32;
153
- op->args[1] = op->args[2];
154
- op->args[2] = op->args[4];
155
- op->args[3] = op->args[5];
156
- break;
157
- }
158
- if (op->args[4] == TCG_COND_NE) {
159
- /* Simplify NE comparisons where one of the pairs
160
- can be simplified. */
161
- i = do_constant_folding_cond(INDEX_op_brcond_i32,
162
- op->args[0], op->args[2],
163
- TCG_COND_NE);
164
- if (i == 0) {
165
- goto do_brcond_high;
166
- } else if (i > 0) {
167
- goto do_brcond_true;
168
- }
169
- i = do_constant_folding_cond(INDEX_op_brcond_i32,
170
- op->args[1], op->args[3],
171
- TCG_COND_NE);
172
- if (i == 0) {
173
- goto do_brcond_low;
174
- } else if (i > 0) {
175
- goto do_brcond_true;
176
- }
177
- }
178
- break;
179
-
180
default:
181
break;
182
183
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
184
CASE_OP_32_64_VEC(andc):
185
done = fold_andc(&ctx, op);
186
break;
187
+ case INDEX_op_brcond2_i32:
188
+ done = fold_brcond2(&ctx, op);
189
+ break;
190
CASE_OP_32_64(ctpop):
191
done = fold_ctpop(&ctx, op);
192
break;
193
--
194
2.25.1
195
196
Deleted patch
1
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
2
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
tcg/optimize.c | 33 +++++++++++++++++++--------------
6
1 file changed, 19 insertions(+), 14 deletions(-)
7
1
8
diff --git a/tcg/optimize.c b/tcg/optimize.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/tcg/optimize.c
11
+++ b/tcg/optimize.c
12
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
13
return fold_const2(ctx, op);
14
}
15
16
+static bool fold_brcond(OptContext *ctx, TCGOp *op)
17
+{
18
+ TCGCond cond = op->args[2];
19
+ int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
20
+
21
+ if (i == 0) {
22
+ tcg_op_remove(ctx->tcg, op);
23
+ return true;
24
+ }
25
+ if (i > 0) {
26
+ op->opc = INDEX_op_br;
27
+ op->args[0] = op->args[3];
28
+ }
29
+ return false;
30
+}
31
+
32
static bool fold_brcond2(OptContext *ctx, TCGOp *op)
33
{
34
TCGCond cond = op->args[4];
35
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
36
}
37
break;
38
39
- CASE_OP_32_64(brcond):
40
- i = do_constant_folding_cond(opc, op->args[0],
41
- op->args[1], op->args[2]);
42
- if (i == 0) {
43
- tcg_op_remove(s, op);
44
- continue;
45
- } else if (i > 0) {
46
- memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
47
- op->opc = opc = INDEX_op_br;
48
- op->args[0] = op->args[3];
49
- break;
50
- }
51
- break;
52
-
53
CASE_OP_32_64(movcond):
54
i = do_constant_folding_cond(opc, op->args[1],
55
op->args[2], op->args[5]);
56
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
57
CASE_OP_32_64_VEC(andc):
58
done = fold_andc(&ctx, op);
59
break;
60
+ CASE_OP_32_64(brcond):
61
+ done = fold_brcond(&ctx, op);
62
+ break;
63
case INDEX_op_brcond2_i32:
64
done = fold_brcond2(&ctx, op);
65
break;
66
--
67
2.25.1
68
69
Deleted patch
1
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
2
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
tcg/optimize.c | 23 ++++++++++++++---------
6
1 file changed, 14 insertions(+), 9 deletions(-)
7
1
8
diff --git a/tcg/optimize.c b/tcg/optimize.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/tcg/optimize.c
11
+++ b/tcg/optimize.c
12
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
13
return fold_const2(ctx, op);
14
}
15
16
+static bool fold_setcond(OptContext *ctx, TCGOp *op)
17
+{
18
+ TCGCond cond = op->args[3];
19
+ int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
20
+
21
+ if (i >= 0) {
22
+ return tcg_opt_gen_movi(ctx, op, op->args[0], i);
23
+ }
24
+ return false;
25
+}
26
+
27
static bool fold_setcond2(OptContext *ctx, TCGOp *op)
28
{
29
TCGCond cond = op->args[5];
30
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
31
}
32
break;
33
34
- CASE_OP_32_64(setcond):
35
- i = do_constant_folding_cond(opc, op->args[1],
36
- op->args[2], op->args[3]);
37
- if (i >= 0) {
38
- tcg_opt_gen_movi(&ctx, op, op->args[0], i);
39
- continue;
40
- }
41
- break;
42
-
43
CASE_OP_32_64(movcond):
44
i = do_constant_folding_cond(opc, op->args[1],
45
op->args[2], op->args[5]);
46
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
47
CASE_OP_32_64(shr):
48
done = fold_shift(&ctx, op);
49
break;
50
+ CASE_OP_32_64(setcond):
51
+ done = fold_setcond(&ctx, op);
52
+ break;
53
case INDEX_op_setcond2_i32:
54
done = fold_setcond2(&ctx, op);
55
break;
56
--
57
2.25.1
58
59
Deleted patch
1
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
2
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
tcg/optimize.c | 37 +++++++++++++++++++++----------------
6
1 file changed, 21 insertions(+), 16 deletions(-)
7
1
8
diff --git a/tcg/optimize.c b/tcg/optimize.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/tcg/optimize.c
11
+++ b/tcg/optimize.c
12
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
13
return fold_const2(ctx, op);
14
}
15
16
+static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
17
+{
18
+ if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
19
+ uint32_t a = arg_info(op->args[2])->val;
20
+ uint32_t b = arg_info(op->args[3])->val;
21
+ uint64_t r = (uint64_t)a * b;
22
+ TCGArg rl, rh;
23
+ TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
24
+
25
+ rl = op->args[0];
26
+ rh = op->args[1];
27
+ tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
28
+ tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
29
+ return true;
30
+ }
31
+ return false;
32
+}
33
+
34
static bool fold_nand(OptContext *ctx, TCGOp *op)
35
{
36
return fold_const2(ctx, op);
37
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
38
}
39
break;
40
41
- case INDEX_op_mulu2_i32:
42
- if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
43
- uint32_t a = arg_info(op->args[2])->val;
44
- uint32_t b = arg_info(op->args[3])->val;
45
- uint64_t r = (uint64_t)a * b;
46
- TCGArg rl, rh;
47
- TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
48
-
49
- rl = op->args[0];
50
- rh = op->args[1];
51
- tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
52
- tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
53
- continue;
54
- }
55
- break;
56
-
57
default:
58
break;
59
60
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
61
CASE_OP_32_64(muluh):
62
done = fold_mul_highpart(&ctx, op);
63
break;
64
+ case INDEX_op_mulu2_i32:
65
+ done = fold_mulu2_i32(&ctx, op);
66
+ break;
67
CASE_OP_32_64(nand):
68
done = fold_nand(&ctx, op);
69
break;
70
--
71
2.25.1
72
73
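The arithmetic behind the mulu2_i32 fold above is a plain 32x32->64 unsigned multiply whose result is split back into 32-bit halves. A standalone C sketch with arbitrary constants, not part of the series:

#include <stdint.h>
#include <stdio.h>

/* Standalone sketch only: the constant fold performed by fold_mulu2_i32(). */
int main(void)
{
    uint32_t a = 0xdeadbeefu, b = 0xcafebabeu;
    uint64_t r = (uint64_t)a * b;

    uint32_t rl = (uint32_t)r;          /* low output of mulu2  */
    uint32_t rh = (uint32_t)(r >> 32);  /* high output of mulu2 */
    printf("rl=%#010x rh=%#010x\n", rl, rh);
    return 0;
}
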
Deleted patch
1
Add two additional helpers, fold_add2_i32 and fold_sub2_i32
2
which will not be simple wrappers forever.
3
1
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
9
1 file changed, 44 insertions(+), 26 deletions(-)
10
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
12
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
14
+++ b/tcg/optimize.c
15
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
16
return fold_const2(ctx, op);
17
}
18
19
+static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
20
+{
21
+ if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
22
+ arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
23
+ uint32_t al = arg_info(op->args[2])->val;
24
+ uint32_t ah = arg_info(op->args[3])->val;
25
+ uint32_t bl = arg_info(op->args[4])->val;
26
+ uint32_t bh = arg_info(op->args[5])->val;
27
+ uint64_t a = ((uint64_t)ah << 32) | al;
28
+ uint64_t b = ((uint64_t)bh << 32) | bl;
29
+ TCGArg rl, rh;
30
+ TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
31
+
32
+ if (add) {
33
+ a += b;
34
+ } else {
35
+ a -= b;
36
+ }
37
+
38
+ rl = op->args[0];
39
+ rh = op->args[1];
40
+ tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
41
+ tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
42
+ return true;
43
+ }
44
+ return false;
45
+}
46
+
47
+static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
48
+{
49
+ return fold_addsub2_i32(ctx, op, true);
50
+}
51
+
52
static bool fold_and(OptContext *ctx, TCGOp *op)
53
{
54
return fold_const2(ctx, op);
55
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
56
return fold_const2(ctx, op);
57
}
58
59
+static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
60
+{
61
+ return fold_addsub2_i32(ctx, op, false);
62
+}
63
+
64
static bool fold_xor(OptContext *ctx, TCGOp *op)
65
{
66
return fold_const2(ctx, op);
67
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
68
}
69
break;
70
71
- case INDEX_op_add2_i32:
72
- case INDEX_op_sub2_i32:
73
- if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
74
- && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
75
- uint32_t al = arg_info(op->args[2])->val;
76
- uint32_t ah = arg_info(op->args[3])->val;
77
- uint32_t bl = arg_info(op->args[4])->val;
78
- uint32_t bh = arg_info(op->args[5])->val;
79
- uint64_t a = ((uint64_t)ah << 32) | al;
80
- uint64_t b = ((uint64_t)bh << 32) | bl;
81
- TCGArg rl, rh;
82
- TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
83
-
84
- if (opc == INDEX_op_add2_i32) {
85
- a += b;
86
- } else {
87
- a -= b;
88
- }
89
-
90
- rl = op->args[0];
91
- rh = op->args[1];
92
- tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
93
- tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
94
- continue;
95
- }
96
- break;
97
98
default:
99
break;
100
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
101
CASE_OP_32_64_VEC(add):
102
done = fold_add(&ctx, op);
103
break;
104
+ case INDEX_op_add2_i32:
105
+ done = fold_add2_i32(&ctx, op);
106
+ break;
107
CASE_OP_32_64_VEC(and):
108
done = fold_and(&ctx, op);
109
break;
110
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
111
CASE_OP_32_64_VEC(sub):
112
done = fold_sub(&ctx, op);
113
break;
114
+ case INDEX_op_sub2_i32:
115
+ done = fold_sub2_i32(&ctx, op);
116
+ break;
117
CASE_OP_32_64_VEC(xor):
118
done = fold_xor(&ctx, op);
119
break;
120
--
121
2.25.1
122
123
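The double-word fold in fold_addsub2_i32() above reassembles each operand from its 32-bit halves, does one 64-bit add or subtract, and splits the result again, so the carry or borrow between the halves is handled by the host arithmetic. A standalone C sketch of the same computation:

#include <stdint.h>
#include <stdio.h>

/* Standalone sketch only: the arithmetic folded by fold_addsub2_i32(). */
static void addsub2(uint32_t *rl, uint32_t *rh,
                    uint32_t al, uint32_t ah,
                    uint32_t bl, uint32_t bh, int add)
{
    uint64_t a = ((uint64_t)ah << 32) | al;
    uint64_t b = ((uint64_t)bh << 32) | bl;
    uint64_t r = add ? a + b : a - b;

    *rl = (uint32_t)r;
    *rh = (uint32_t)(r >> 32);
}

int main(void)
{
    uint32_t rl, rh;

    addsub2(&rl, &rh, 0xffffffffu, 0, 1, 0, 1);   /* carry into the high word */
    printf("rl=%#010x rh=%#010x\n", rl, rh);
    return 0;
}
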
Deleted patch
1
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
2
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
6
1 file changed, 31 insertions(+), 25 deletions(-)
7
1
8
diff --git a/tcg/optimize.c b/tcg/optimize.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/tcg/optimize.c
11
+++ b/tcg/optimize.c
12
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
13
return true;
14
}
15
16
+static bool fold_movcond(OptContext *ctx, TCGOp *op)
17
+{
18
+ TCGOpcode opc = op->opc;
19
+ TCGCond cond = op->args[5];
20
+ int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
21
+
22
+ if (i >= 0) {
23
+ return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
24
+ }
25
+
26
+ if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
27
+ uint64_t tv = arg_info(op->args[3])->val;
28
+ uint64_t fv = arg_info(op->args[4])->val;
29
+
30
+ opc = (opc == INDEX_op_movcond_i32
31
+ ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
32
+
33
+ if (tv == 1 && fv == 0) {
34
+ op->opc = opc;
35
+ op->args[3] = cond;
36
+ } else if (fv == 1 && tv == 0) {
37
+ op->opc = opc;
38
+ op->args[3] = tcg_invert_cond(cond);
39
+ }
40
+ }
41
+ return false;
42
+}
43
+
44
static bool fold_mul(OptContext *ctx, TCGOp *op)
45
{
46
return fold_const2(ctx, op);
47
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
48
}
49
break;
50
51
- CASE_OP_32_64(movcond):
52
- i = do_constant_folding_cond(opc, op->args[1],
53
- op->args[2], op->args[5]);
54
- if (i >= 0) {
55
- tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
56
- continue;
57
- }
58
- if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
59
- uint64_t tv = arg_info(op->args[3])->val;
60
- uint64_t fv = arg_info(op->args[4])->val;
61
- TCGCond cond = op->args[5];
62
-
63
- if (fv == 1 && tv == 0) {
64
- cond = tcg_invert_cond(cond);
65
- } else if (!(tv == 1 && fv == 0)) {
66
- break;
67
- }
68
- op->args[3] = cond;
69
- op->opc = opc = (opc == INDEX_op_movcond_i32
70
- ? INDEX_op_setcond_i32
71
- : INDEX_op_setcond_i64);
72
- }
73
- break;
74
-
75
-
76
default:
77
break;
78
79
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
80
case INDEX_op_mb:
81
done = fold_mb(&ctx, op);
82
break;
83
+ CASE_OP_32_64(movcond):
84
+ done = fold_movcond(&ctx, op);
85
+ break;
86
CASE_OP_32_64(mul):
87
done = fold_mul(&ctx, op);
88
break;
89
--
90
2.25.1
91
92
diff view generated by jsdifflib
Deleted patch
1
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
2
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
6
1 file changed, 22 insertions(+), 17 deletions(-)
7
1
8
diff --git a/tcg/optimize.c b/tcg/optimize.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/tcg/optimize.c
11
+++ b/tcg/optimize.c
12
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
13
return fold_const2(ctx, op);
14
}
15
16
+static bool fold_extract2(OptContext *ctx, TCGOp *op)
17
+{
18
+ if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
19
+ uint64_t v1 = arg_info(op->args[1])->val;
20
+ uint64_t v2 = arg_info(op->args[2])->val;
21
+ int shr = op->args[3];
22
+
23
+ if (op->opc == INDEX_op_extract2_i64) {
24
+ v1 >>= shr;
25
+ v2 <<= 64 - shr;
26
+ } else {
27
+ v1 = (uint32_t)v1 >> shr;
28
+ v2 = (int32_t)v2 << (32 - shr);
29
+ }
30
+ return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
31
+ }
32
+ return false;
33
+}
34
+
35
static bool fold_exts(OptContext *ctx, TCGOp *op)
36
{
37
return fold_const1(ctx, op);
38
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
39
}
40
break;
41
42
- CASE_OP_32_64(extract2):
43
- if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
44
- uint64_t v1 = arg_info(op->args[1])->val;
45
- uint64_t v2 = arg_info(op->args[2])->val;
46
- int shr = op->args[3];
47
-
48
- if (opc == INDEX_op_extract2_i64) {
49
- tmp = (v1 >> shr) | (v2 << (64 - shr));
50
- } else {
51
- tmp = (int32_t)(((uint32_t)v1 >> shr) |
52
- ((uint32_t)v2 << (32 - shr)));
53
- }
54
- tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
55
- continue;
56
- }
57
- break;
58
-
59
default:
60
break;
61
62
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
63
CASE_OP_32_64(eqv):
64
done = fold_eqv(&ctx, op);
65
break;
66
+ CASE_OP_32_64(extract2):
67
+ done = fold_extract2(&ctx, op);
68
+ break;
69
CASE_OP_32_64(ext8s):
70
CASE_OP_32_64(ext16s):
71
case INDEX_op_ext32s_i64:
72
--
73
2.25.1
74
75
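Conceptually, the extract2 fold above returns 64 contiguous bits of the concatenation v2:v1 starting at bit position shr. A standalone C sketch of the 64-bit case, shown for a non-zero shift amount (arbitrary input values):

#include <stdint.h>
#include <stdio.h>

/* Standalone sketch only: the 64-bit arm of the extract2 constant fold. */
static uint64_t extract2_i64(uint64_t v1, uint64_t v2, int shr)
{
    return (v1 >> shr) | (v2 << (64 - shr));
}

int main(void)
{
    uint64_t lo = 0x1122334455667788ull;
    uint64_t hi = 0x99aabbccddeeff00ull;

    printf("%#018llx\n", (unsigned long long)extract2_i64(lo, hi, 16));
    return 0;
}
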
Deleted patch
1
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
2
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
6
1 file changed, 30 insertions(+), 18 deletions(-)
7
1
8
diff --git a/tcg/optimize.c b/tcg/optimize.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/tcg/optimize.c
11
+++ b/tcg/optimize.c
12
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
13
return fold_const2(ctx, op);
14
}
15
16
+static bool fold_extract(OptContext *ctx, TCGOp *op)
17
+{
18
+ if (arg_is_const(op->args[1])) {
19
+ uint64_t t;
20
+
21
+ t = arg_info(op->args[1])->val;
22
+ t = extract64(t, op->args[2], op->args[3]);
23
+ return tcg_opt_gen_movi(ctx, op, op->args[0], t);
24
+ }
25
+ return false;
26
+}
27
+
28
static bool fold_extract2(OptContext *ctx, TCGOp *op)
29
{
30
if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
31
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
32
return tcg_opt_gen_movi(ctx, op, op->args[0], i);
33
}
34
35
+static bool fold_sextract(OptContext *ctx, TCGOp *op)
36
+{
37
+ if (arg_is_const(op->args[1])) {
38
+ uint64_t t;
39
+
40
+ t = arg_info(op->args[1])->val;
41
+ t = sextract64(t, op->args[2], op->args[3]);
42
+ return tcg_opt_gen_movi(ctx, op, op->args[0], t);
43
+ }
44
+ return false;
45
+}
46
+
47
static bool fold_shift(OptContext *ctx, TCGOp *op)
48
{
49
return fold_const2(ctx, op);
50
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
51
}
52
break;
53
54
- CASE_OP_32_64(extract):
55
- if (arg_is_const(op->args[1])) {
56
- tmp = extract64(arg_info(op->args[1])->val,
57
- op->args[2], op->args[3]);
58
- tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
59
- continue;
60
- }
61
- break;
62
-
63
- CASE_OP_32_64(sextract):
64
- if (arg_is_const(op->args[1])) {
65
- tmp = sextract64(arg_info(op->args[1])->val,
66
- op->args[2], op->args[3]);
67
- tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
68
- continue;
69
- }
70
- break;
71
-
72
default:
73
break;
74
75
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
76
CASE_OP_32_64(eqv):
77
done = fold_eqv(&ctx, op);
78
break;
79
+ CASE_OP_32_64(extract):
80
+ done = fold_extract(&ctx, op);
81
+ break;
82
CASE_OP_32_64(extract2):
83
done = fold_extract2(&ctx, op);
84
break;
85
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
86
case INDEX_op_setcond2_i32:
87
done = fold_setcond2(&ctx, op);
88
break;
89
+ CASE_OP_32_64(sextract):
90
+ done = fold_sextract(&ctx, op);
91
+ break;
92
CASE_OP_32_64_VEC(sub):
93
done = fold_sub(&ctx, op);
94
break;
95
--
96
2.25.1
97
98
Deleted patch
1
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
2
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
tcg/optimize.c | 25 +++++++++++++++----------
6
1 file changed, 15 insertions(+), 10 deletions(-)
7
1
8
diff --git a/tcg/optimize.c b/tcg/optimize.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/tcg/optimize.c
11
+++ b/tcg/optimize.c
12
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
13
return fold_const1(ctx, op);
14
}
15
16
+static bool fold_deposit(OptContext *ctx, TCGOp *op)
17
+{
18
+ if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
19
+ uint64_t t1 = arg_info(op->args[1])->val;
20
+ uint64_t t2 = arg_info(op->args[2])->val;
21
+
22
+ t1 = deposit64(t1, op->args[3], op->args[4], t2);
23
+ return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
24
+ }
25
+ return false;
26
+}
27
+
28
static bool fold_divide(OptContext *ctx, TCGOp *op)
29
{
30
return fold_const2(ctx, op);
31
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
32
}
33
break;
34
35
- CASE_OP_32_64(deposit):
36
- if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
37
- tmp = deposit64(arg_info(op->args[1])->val,
38
- op->args[3], op->args[4],
39
- arg_info(op->args[2])->val);
40
- tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
41
- continue;
42
- }
43
- break;
44
-
45
default:
46
break;
47
48
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
49
CASE_OP_32_64(ctpop):
50
done = fold_ctpop(&ctx, op);
51
break;
52
+ CASE_OP_32_64(deposit):
53
+ done = fold_deposit(&ctx, op);
54
+ break;
55
CASE_OP_32_64(div):
56
CASE_OP_32_64(divu):
57
done = fold_divide(&ctx, op);
58
--
59
2.25.1
60
61
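The deposit64() call folded above inserts the low 'len' bits of one value into another at bit position 'pos'. A simplified standalone re-statement of that semantics in C (assumes 0 < len < 64 and pos + len <= 64; not the QEMU bitops implementation):

#include <stdint.h>
#include <stdio.h>

/* Standalone sketch only: insert the low 'len' bits of 'val' at 'pos'. */
static uint64_t deposit(uint64_t base, int pos, int len, uint64_t val)
{
    uint64_t mask = ((UINT64_C(1) << len) - 1) << pos;

    return (base & ~mask) | ((val << pos) & mask);
}

int main(void)
{
    printf("%#018llx\n",
           (unsigned long long)deposit(UINT64_MAX, 8, 16, 0x1234));
    return 0;
}
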
Deleted patch
1
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
2
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
tcg/optimize.c | 32 ++++++++++++++++++--------------
6
1 file changed, 18 insertions(+), 14 deletions(-)
7
1
8
diff --git a/tcg/optimize.c b/tcg/optimize.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/tcg/optimize.c
11
+++ b/tcg/optimize.c
12
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
13
return true;
14
}
15
16
+static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
17
+{
18
+ if (arg_is_const(op->args[1])) {
19
+ uint64_t t = arg_info(op->args[1])->val;
20
+
21
+ if (t != 0) {
22
+ t = do_constant_folding(op->opc, t, 0);
23
+ return tcg_opt_gen_movi(ctx, op, op->args[0], t);
24
+ }
25
+ return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
26
+ }
27
+ return false;
28
+}
29
+
30
static bool fold_ctpop(OptContext *ctx, TCGOp *op)
31
{
32
return fold_const1(ctx, op);
33
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
34
}
35
break;
36
37
- CASE_OP_32_64(clz):
38
- CASE_OP_32_64(ctz):
39
- if (arg_is_const(op->args[1])) {
40
- TCGArg v = arg_info(op->args[1])->val;
41
- if (v != 0) {
42
- tmp = do_constant_folding(opc, v, 0);
43
- tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
44
- } else {
45
- tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
46
- }
47
- continue;
48
- }
49
- break;
50
-
51
default:
52
break;
53
54
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
55
case INDEX_op_brcond2_i32:
56
done = fold_brcond2(&ctx, op);
57
break;
58
+ CASE_OP_32_64(clz):
59
+ CASE_OP_32_64(ctz):
60
+ done = fold_count_zeros(&ctx, op);
61
+ break;
62
CASE_OP_32_64(ctpop):
63
done = fold_ctpop(&ctx, op);
64
break;
65
--
66
2.25.1
67
68
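The clz/ctz fold above relies on the fact that TCG's count-zero operations take a second operand that is returned when the first is zero, which is why a known-zero input turns into a copy of args[2]. A standalone C sketch of that contract (uses a GCC/Clang builtin for the count):

#include <stdint.h>
#include <stdio.h>

/* Standalone sketch only: ctz with an explicit value for a zero input. */
static uint64_t ctz_or(uint64_t x, uint64_t val_if_zero)
{
    return x ? (uint64_t)__builtin_ctzll(x) : val_if_zero;
}

int main(void)
{
    printf("%llu\n", (unsigned long long)ctz_or(0x1000, 64));  /* 12 */
    printf("%llu\n", (unsigned long long)ctz_or(0, 64));       /* 64 */
    return 0;
}
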
Deleted patch
1
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
2
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
tcg/optimize.c | 27 ++++++++++++++++-----------
6
1 file changed, 16 insertions(+), 11 deletions(-)
7
1
8
diff --git a/tcg/optimize.c b/tcg/optimize.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/tcg/optimize.c
11
+++ b/tcg/optimize.c
12
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
13
return false;
14
}
15
16
+static bool fold_bswap(OptContext *ctx, TCGOp *op)
17
+{
18
+ if (arg_is_const(op->args[1])) {
19
+ uint64_t t = arg_info(op->args[1])->val;
20
+
21
+ t = do_constant_folding(op->opc, t, op->args[2]);
22
+ return tcg_opt_gen_movi(ctx, op, op->args[0], t);
23
+ }
24
+ return false;
25
+}
26
+
27
static bool fold_call(OptContext *ctx, TCGOp *op)
28
{
29
TCGContext *s = ctx->tcg;
30
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
31
}
32
break;
33
34
- CASE_OP_32_64(bswap16):
35
- CASE_OP_32_64(bswap32):
36
- case INDEX_op_bswap64_i64:
37
- if (arg_is_const(op->args[1])) {
38
- tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
39
- op->args[2]);
40
- tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
41
- continue;
42
- }
43
- break;
44
-
45
default:
46
break;
47
48
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
49
case INDEX_op_brcond2_i32:
50
done = fold_brcond2(&ctx, op);
51
break;
52
+ CASE_OP_32_64(bswap16):
53
+ CASE_OP_32_64(bswap32):
54
+ case INDEX_op_bswap64_i64:
55
+ done = fold_bswap(&ctx, op);
56
+ break;
57
CASE_OP_32_64(clz):
58
CASE_OP_32_64(ctz):
59
done = fold_count_zeros(&ctx, op);
60
--
61
2.25.1
62
63
Deleted patch
1
This is the final entry in the main switch that was in a
2
different form. After this, we have the option to convert
3
the switch into a function dispatch table.
4
1
5
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
6
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
9
tcg/optimize.c | 27 ++++++++++++++-------------
10
1 file changed, 14 insertions(+), 13 deletions(-)
11
12
diff --git a/tcg/optimize.c b/tcg/optimize.c
13
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/optimize.c
15
+++ b/tcg/optimize.c
16
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
17
return true;
18
}
19
20
+static bool fold_mov(OptContext *ctx, TCGOp *op)
21
+{
22
+ return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
23
+}
24
+
25
static bool fold_movcond(OptContext *ctx, TCGOp *op)
26
{
27
TCGOpcode opc = op->opc;
28
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
29
break;
30
}
31
32
- /* Propagate constants through copy operations and do constant
33
- folding. Constants will be substituted to arguments by register
34
- allocator where needed and possible. Also detect copies. */
35
+ /*
36
+ * Process each opcode.
37
+ * Sorted alphabetically by opcode as much as possible.
38
+ */
39
switch (opc) {
40
- CASE_OP_32_64_VEC(mov):
41
- done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
42
- break;
43
-
44
- default:
45
- break;
46
-
47
- /* ---------------------------------------------------------- */
48
- /* Sorted alphabetically by opcode as much as possible. */
49
-
50
CASE_OP_32_64_VEC(add):
51
done = fold_add(&ctx, op);
52
break;
53
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
54
case INDEX_op_mb:
55
done = fold_mb(&ctx, op);
56
break;
57
+ CASE_OP_32_64_VEC(mov):
58
+ done = fold_mov(&ctx, op);
59
+ break;
60
CASE_OP_32_64(movcond):
61
done = fold_movcond(&ctx, op);
62
break;
63
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
64
CASE_OP_32_64_VEC(xor):
65
done = fold_xor(&ctx, op);
66
break;
67
+ default:
68
+ break;
69
}
70
71
if (!done) {
72
--
73
2.25.1
74
75
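The function dispatch table mentioned in the patch description above could look roughly like the following, as it would sit inside tcg/optimize.c (a sketch under the assumption of a per-opcode function-pointer array; the table name and the idea of indexing by TCGOpcode are illustrative, not part of this series):

    typedef bool (*FoldFn)(OptContext *ctx, TCGOp *op);

    /* Hypothetical per-opcode dispatch, replacing the big switch. */
    static const FoldFn fold_table[NB_OPS] = {
        [INDEX_op_mov_i32] = fold_mov,
        [INDEX_op_mov_i64] = fold_mov,
        [INDEX_op_mb]      = fold_mb,
        /* ... */
    };

    /* In tcg_optimize(), instead of the switch:
       FoldFn fn = fold_table[opc];
       done = fn ? fn(&ctx, op) : false;  */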
Deleted patch
1
Pull the "op r, a, a => movi r, 0" optimization into a function,
2
and use it in the outer opcode fold functions.
3
1
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
9
1 file changed, 24 insertions(+), 17 deletions(-)
10
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
12
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
14
+++ b/tcg/optimize.c
15
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
16
return false;
17
}
18
19
+/* If the binary operation has both arguments equal, fold to @i. */
20
+static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
21
+{
22
+ if (args_are_copies(op->args[1], op->args[2])) {
23
+ return tcg_opt_gen_movi(ctx, op, op->args[0], i);
24
+ }
25
+ return false;
26
+}
27
+
28
/*
29
* These outermost fold_<op> functions are sorted alphabetically.
30
*/
31
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
32
33
static bool fold_andc(OptContext *ctx, TCGOp *op)
34
{
35
- return fold_const2(ctx, op);
36
+ if (fold_const2(ctx, op) ||
37
+ fold_xx_to_i(ctx, op, 0)) {
38
+ return true;
39
+ }
40
+ return false;
41
}
42
43
static bool fold_brcond(OptContext *ctx, TCGOp *op)
44
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
45
46
static bool fold_sub(OptContext *ctx, TCGOp *op)
47
{
48
- return fold_const2(ctx, op);
49
+ if (fold_const2(ctx, op) ||
50
+ fold_xx_to_i(ctx, op, 0)) {
51
+ return true;
52
+ }
53
+ return false;
54
}
55
56
static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
57
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
58
59
static bool fold_xor(OptContext *ctx, TCGOp *op)
60
{
61
- return fold_const2(ctx, op);
62
+ if (fold_const2(ctx, op) ||
63
+ fold_xx_to_i(ctx, op, 0)) {
64
+ return true;
65
+ }
66
+ return false;
67
}
68
69
/* Propagate constants and copies, fold constant expressions. */
70
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
71
break;
72
}
73
74
- /* Simplify expression for "op r, a, a => movi r, 0" cases */
75
- switch (opc) {
76
- CASE_OP_32_64_VEC(andc):
77
- CASE_OP_32_64_VEC(sub):
78
- CASE_OP_32_64_VEC(xor):
79
- if (args_are_copies(op->args[1], op->args[2])) {
80
- tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
81
- continue;
82
- }
83
- break;
84
- default:
85
- break;
86
- }
87
-
88
/*
89
* Process each opcode.
90
* Sorted alphabetically by opcode as much as possible.
91
--
92
2.25.1
93
94
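The identities exploited by fold_xx_to_i above, written as a standalone check (illustrative only):

    #include <assert.h>
    #include <stdint.h>

    /* For any a: andc(a, a), sub(a, a) and xor(a, a) are all zero,
       so the optimizer can emit movi 0 when both inputs are copies. */
    static void check_xx_to_i(uint64_t a)
    {
        assert((a & ~a) == 0);
        assert((a - a) == 0);
        assert((a ^ a) == 0);
    }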
1
Even though there is only one user, place this more complex
1
All RV64 32-bit operations sign-extend the output, so we are easily
2
conversion into its own helper.
2
able to keep TCG_TYPE_I32 values sign-extended in host registers.
3
3
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
4
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
5
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
---
7
tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
8
tcg/riscv/tcg-target-sa32.h | 6 +++++-
8
1 file changed, 47 insertions(+), 42 deletions(-)
9
tcg/riscv/tcg-target.c.inc | 8 ++------
10
2 files changed, 7 insertions(+), 7 deletions(-)
9
11
10
diff --git a/tcg/optimize.c b/tcg/optimize.c
12
diff --git a/tcg/riscv/tcg-target-sa32.h b/tcg/riscv/tcg-target-sa32.h
11
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/optimize.c
14
--- a/tcg/riscv/tcg-target-sa32.h
13
+++ b/tcg/optimize.c
15
+++ b/tcg/riscv/tcg-target-sa32.h
14
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
16
@@ -1 +1,5 @@
15
17
-#define TCG_TARGET_SIGNED_ADDR32 0
16
static bool fold_neg(OptContext *ctx, TCGOp *op)
18
+/*
17
{
19
+ * Do not set TCG_TARGET_SIGNED_ADDR32 for RV32;
18
- return fold_const1(ctx, op);
20
+ * TCG expects this to only be set for 64-bit hosts.
19
+ if (fold_const1(ctx, op)) {
21
+ */
20
+ return true;
22
+#define TCG_TARGET_SIGNED_ADDR32 (__riscv_xlen == 64)
21
+ }
23
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
22
+ /*
24
index XXXXXXX..XXXXXXX 100644
23
+ * Because of fold_sub_to_neg, we want to always return true,
25
--- a/tcg/riscv/tcg-target.c.inc
24
+ * via finish_folding.
26
+++ b/tcg/riscv/tcg-target.c.inc
25
+ */
27
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg addrl,
26
+ finish_folding(ctx, op);
28
tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP0, TCG_REG_TMP1, 0);
27
+ return true;
29
30
/* TLB Hit - translate address using addend. */
31
- if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
32
- tcg_out_ext32u(s, TCG_REG_TMP0, addrl);
33
- addrl = TCG_REG_TMP0;
34
- }
35
tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, TCG_REG_TMP2, addrl);
28
}
36
}
29
37
30
static bool fold_nor(OptContext *ctx, TCGOp *op)
38
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
31
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
39
data_regl, data_regh, addr_regl, addr_regh,
32
return fold_const2(ctx, op);
40
s->code_ptr, label_ptr);
33
}
41
#else
34
42
- if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
35
+static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
43
+ if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS && !guest_base_signed_addr32) {
36
+{
44
tcg_out_ext32u(s, base, addr_regl);
37
+ TCGOpcode neg_op;
45
addr_regl = base;
38
+ bool have_neg;
39
+
40
+ if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
41
+ return false;
42
+ }
43
+
44
+ switch (ctx->type) {
45
+ case TCG_TYPE_I32:
46
+ neg_op = INDEX_op_neg_i32;
47
+ have_neg = TCG_TARGET_HAS_neg_i32;
48
+ break;
49
+ case TCG_TYPE_I64:
50
+ neg_op = INDEX_op_neg_i64;
51
+ have_neg = TCG_TARGET_HAS_neg_i64;
52
+ break;
53
+ case TCG_TYPE_V64:
54
+ case TCG_TYPE_V128:
55
+ case TCG_TYPE_V256:
56
+ neg_op = INDEX_op_neg_vec;
57
+ have_neg = (TCG_TARGET_HAS_neg_vec &&
58
+ tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
59
+ break;
60
+ default:
61
+ g_assert_not_reached();
62
+ }
63
+ if (have_neg) {
64
+ op->opc = neg_op;
65
+ op->args[1] = op->args[2];
66
+ return fold_neg(ctx, op);
67
+ }
68
+ return false;
69
+}
70
+
71
static bool fold_sub(OptContext *ctx, TCGOp *op)
72
{
73
if (fold_const2(ctx, op) ||
74
- fold_xx_to_i(ctx, op, 0)) {
75
+ fold_xx_to_i(ctx, op, 0) ||
76
+ fold_sub_to_neg(ctx, op)) {
77
return true;
78
}
46
}
79
return false;
47
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
80
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
48
data_regl, data_regh, addr_regl, addr_regh,
81
continue;
49
s->code_ptr, label_ptr);
82
}
50
#else
83
break;
51
- if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
84
- CASE_OP_32_64_VEC(sub):
52
+ if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS && !guest_base_signed_addr32) {
85
- {
53
tcg_out_ext32u(s, base, addr_regl);
86
- TCGOpcode neg_op;
54
addr_regl = base;
87
- bool have_neg;
55
}
88
-
89
- if (arg_is_const(op->args[2])) {
90
- /* Proceed with possible constant folding. */
91
- break;
92
- }
93
- switch (ctx.type) {
94
- case TCG_TYPE_I32:
95
- neg_op = INDEX_op_neg_i32;
96
- have_neg = TCG_TARGET_HAS_neg_i32;
97
- break;
98
- case TCG_TYPE_I64:
99
- neg_op = INDEX_op_neg_i64;
100
- have_neg = TCG_TARGET_HAS_neg_i64;
101
- break;
102
- case TCG_TYPE_V64:
103
- case TCG_TYPE_V128:
104
- case TCG_TYPE_V256:
105
- neg_op = INDEX_op_neg_vec;
106
- have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
107
- TCGOP_VECE(op)) > 0;
108
- break;
109
- default:
110
- g_assert_not_reached();
111
- }
112
- if (!have_neg) {
113
- break;
114
- }
115
- if (arg_is_const(op->args[1])
116
- && arg_info(op->args[1])->val == 0) {
117
- op->opc = neg_op;
118
- reset_temp(op->args[0]);
119
- op->args[1] = op->args[2];
120
- continue;
121
- }
122
- }
123
- break;
124
default:
125
break;
126
}
127
--
56
--
128
2.25.1
57
2.25.1
129
58
130
59
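What "sign-extended in host registers" buys here, in plain C (an illustrative sketch of the address representation, not target code):

    #include <stdint.h>

    /* With TCG_TARGET_SIGNED_ADDR32, a 32-bit guest address is kept
       as its sign-extended 64-bit value, which is exactly what RV64
       32-bit ALU results already are, so no explicit ext32u is
       needed before adding the TLB addend. */
    static int64_t guest_addr_as_signed(uint32_t guest_addr)
    {
        return (int32_t)guest_addr;  /* 0x80001000 -> 0xffffffff80001000 */
    }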
1
Pull the "op r, a, 0 => movi r, 0" optimization into a function,
1
All 32-bit LoongArch operations sign-extend the output, so we are easily
2
and use it in the outer opcode fold functions.
2
able to keep TCG_TYPE_I32 values sign-extended in host registers.
3
3
4
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
4
Cc: WANG Xuerui <git@xen0n.name>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
5
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
7
---
8
tcg/optimize.c | 38 ++++++++++++++++++++------------------
8
tcg/loongarch64/tcg-target-sa32.h | 2 +-
9
1 file changed, 20 insertions(+), 18 deletions(-)
9
tcg/loongarch64/tcg-target.c.inc | 15 ++++++---------
10
2 files changed, 7 insertions(+), 10 deletions(-)
10
11
11
diff --git a/tcg/optimize.c b/tcg/optimize.c
12
diff --git a/tcg/loongarch64/tcg-target-sa32.h b/tcg/loongarch64/tcg-target-sa32.h
12
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/optimize.c
14
--- a/tcg/loongarch64/tcg-target-sa32.h
14
+++ b/tcg/optimize.c
15
+++ b/tcg/loongarch64/tcg-target-sa32.h
15
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
16
@@ -1 +1 @@
16
return false;
17
-#define TCG_TARGET_SIGNED_ADDR32 0
18
+#define TCG_TARGET_SIGNED_ADDR32 1
19
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
20
index XXXXXXX..XXXXXXX 100644
21
--- a/tcg/loongarch64/tcg-target.c.inc
22
+++ b/tcg/loongarch64/tcg-target.c.inc
23
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
24
return tcg_out_fail_alignment(s, l);
17
}
25
}
18
26
19
+/* If the binary operation has second argument @i, fold to @i. */
27
-#endif /* CONFIG_SOFTMMU */
20
+static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
28
-
21
+{
29
/*
22
+ if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
30
* `ext32u` the address register into the temp register given,
23
+ return tcg_opt_gen_movi(ctx, op, op->args[0], i);
31
* if target is 32-bit, no-op otherwise.
24
+ }
32
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
25
+ return false;
33
static TCGReg tcg_out_zext_addr_if_32_bit(TCGContext *s,
26
+}
34
TCGReg addr, TCGReg tmp)
27
+
28
/* If the binary operation has both arguments equal, fold to @i. */
29
static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
30
{
35
{
31
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
36
- if (TARGET_LONG_BITS == 32) {
32
static bool fold_and(OptContext *ctx, TCGOp *op)
37
+ if (TARGET_LONG_BITS == 32 && !guest_base_signed_addr32) {
33
{
38
tcg_out_ext32u(s, tmp, addr);
34
if (fold_const2(ctx, op) ||
39
return tmp;
35
+ fold_xi_to_i(ctx, op, 0) ||
36
fold_xx_to_x(ctx, op)) {
37
return true;
38
}
40
}
39
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
41
return addr;
40
41
static bool fold_mul(OptContext *ctx, TCGOp *op)
42
{
43
- return fold_const2(ctx, op);
44
+ if (fold_const2(ctx, op) ||
45
+ fold_xi_to_i(ctx, op, 0)) {
46
+ return true;
47
+ }
48
+ return false;
49
}
42
}
50
43
+#endif /* CONFIG_SOFTMMU */
51
static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
44
52
{
45
static void tcg_out_qemu_ld_indexed(TCGContext *s, TCGReg rd, TCGReg rj,
53
- return fold_const2(ctx, op);
46
TCGReg rk, MemOp opc, TCGType type)
54
+ if (fold_const2(ctx, op) ||
47
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, TCGType type)
55
+ fold_xi_to_i(ctx, op, 0)) {
48
tcg_insn_unit *label_ptr[1];
56
+ return true;
49
#else
57
+ }
50
unsigned a_bits;
58
+ return false;
51
-#endif
59
}
52
TCGReg base;
60
53
+#endif
61
static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
54
62
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
55
data_regl = *args++;
63
continue;
56
addr_regl = *args++;
64
}
57
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, TCGType type)
65
58
66
- /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
59
#if defined(CONFIG_SOFTMMU)
67
- switch (opc) {
60
tcg_out_tlb_load(s, addr_regl, oi, label_ptr, 1);
68
- CASE_OP_32_64_VEC(and):
61
- base = tcg_out_zext_addr_if_32_bit(s, addr_regl, TCG_REG_TMP0);
69
- CASE_OP_32_64_VEC(mul):
62
- tcg_out_qemu_ld_indexed(s, data_regl, base, TCG_REG_TMP2, opc, type);
70
- CASE_OP_32_64(muluh):
63
+ tcg_out_qemu_ld_indexed(s, data_regl, addr_regl, TCG_REG_TMP2, opc, type);
71
- CASE_OP_32_64(mulsh):
64
add_qemu_ldst_label(s, 1, oi, type,
72
- if (arg_is_const(op->args[2])
65
data_regl, addr_regl,
73
- && arg_info(op->args[2])->val == 0) {
66
s->code_ptr, label_ptr);
74
- tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
67
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
75
- continue;
68
tcg_insn_unit *label_ptr[1];
76
- }
69
#else
77
- break;
70
unsigned a_bits;
78
- default:
71
-#endif
79
- break;
72
TCGReg base;
80
- }
73
+#endif
81
-
74
82
/*
75
data_regl = *args++;
83
* Process each opcode.
76
addr_regl = *args++;
84
* Sorted alphabetically by opcode as much as possible.
77
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
78
79
#if defined(CONFIG_SOFTMMU)
80
tcg_out_tlb_load(s, addr_regl, oi, label_ptr, 0);
81
- base = tcg_out_zext_addr_if_32_bit(s, addr_regl, TCG_REG_TMP0);
82
- tcg_out_qemu_st_indexed(s, data_regl, base, TCG_REG_TMP2, opc);
83
+ tcg_out_qemu_st_indexed(s, data_regl, addr_regl, TCG_REG_TMP2, opc);
84
add_qemu_ldst_label(s, 0, oi,
85
0, /* type param is unused for stores */
86
data_regl, addr_regl,
85
--
87
--
86
2.25.1
88
2.25.1
87
89
88
90
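And the corresponding identities behind fold_xi_to_i in the dropped optimize patch above (standalone, illustrative only; the same reasoning covers muluh/mulsh, whose high half of a product with zero is also zero):

    #include <assert.h>
    #include <stdint.h>

    /* For any a: and/mul with a zero operand yields zero, so the
       optimizer can emit movi 0 when args[2] is the constant 0. */
    static void check_xi_to_i(uint64_t a)
    {
        assert((a & 0) == 0);
        assert((a * 0) == 0);
    }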