With the use of a suitable temporary, we can use the same
algorithm when src overlaps dst. The result is the same
number of instructions either way.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 26 +++++++++++---------------
1 file changed, 11 insertions(+), 15 deletions(-)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index b49204f707..64c24382a8 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -788,6 +788,16 @@ static inline void tcg_out_sari64(TCGContext *s, TCGReg dst, TCGReg src, int c)
tcg_out32(s, SRADI | RA(dst) | RS(src) | SH(c & 0x1f) | ((c >> 4) & 2));
}
+static void tcg_out_bswap16(TCGContext *s, TCGReg dst, TCGReg src)
+{
+ TCGReg tmp = dst == src ? TCG_REG_R0 : dst;
+
+ /* src = abcd */
+ tcg_out_rlw(s, RLWINM, tmp, src, 24, 24, 31); /* tmp = 000c */
+ tcg_out_rlw(s, RLWIMI, tmp, src, 8, 16, 23); /* tmp = 00dc */
+ tcg_out_mov(s, TCG_TYPE_REG, dst, tmp);
+}
+
/* Emit a move into ret of arg, if it can be done in one insn. */
static bool tcg_out_movi_one(TCGContext *s, TCGReg ret, tcg_target_long arg)
{
@@ -2779,21 +2789,7 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_bswap16_i32:
case INDEX_op_bswap16_i64:
- a0 = args[0], a1 = args[1];
- /* a1 = abcd */
- if (a0 != a1) {
- /* a0 = (a1 r<< 24) & 0xff # 000c */
- tcg_out_rlw(s, RLWINM, a0, a1, 24, 24, 31);
- /* a0 = (a0 & ~0xff00) | (a1 r<< 8) & 0xff00 # 00dc */
- tcg_out_rlw(s, RLWIMI, a0, a1, 8, 16, 23);
- } else {
- /* r0 = (a1 r<< 8) & 0xff00 # 00d0 */
- tcg_out_rlw(s, RLWINM, TCG_REG_R0, a1, 8, 16, 23);
- /* a0 = (a1 r<< 24) & 0xff # 000c */
- tcg_out_rlw(s, RLWINM, a0, a1, 24, 24, 31);
- /* a0 = a0 | r0 # 00dc */
- tcg_out32(s, OR | SAB(TCG_REG_R0, a0, a0));
- }
+ tcg_out_bswap16(s, args[0], args[1]);
break;
case INDEX_op_bswap32_i32:
--
2.25.1