[PATCH v4] tcg/arm: Use register pair allocation for qemu_{ld,st}_i64

Richard Henderson posted 1 patch 1 year, 4 months ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20230107010637.1285036-1-richard.henderson@linaro.org
Maintainers: Richard Henderson <richard.henderson@linaro.org>
tcg/arm/tcg-target-con-set.h |  7 ++++---
tcg/arm/tcg-target-con-str.h |  2 ++
tcg/arm/tcg-target.c.inc     | 28 ++++++++++++++++++----------
3 files changed, 24 insertions(+), 13 deletions(-)
[PATCH v4] tcg/arm: Use register pair allocation for qemu_{ld,st}_i64
Posted by Richard Henderson 1 year, 4 months ago
Although we still can't use ldrd and strd for all operations,
increase the chances of using them by getting the register allocation correct.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---

v3 was patch 5 in a larger patch set:

  https://lore.kernel.org/qemu-devel/20221111074101.2069454-6-richard.henderson@linaro.org/

most of which has been merged.

r~

---
 tcg/arm/tcg-target-con-set.h |  7 ++++---
 tcg/arm/tcg-target-con-str.h |  2 ++
 tcg/arm/tcg-target.c.inc     | 28 ++++++++++++++++++----------
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
index 3685e1786a..b8849b2478 100644
--- a/tcg/arm/tcg-target-con-set.h
+++ b/tcg/arm/tcg-target-con-set.h
@@ -15,8 +15,9 @@ C_O0_I2(r, rIN)
 C_O0_I2(s, s)
 C_O0_I2(w, r)
 C_O0_I3(s, s, s)
+C_O0_I3(S, p, s)
 C_O0_I4(r, r, rI, rI)
-C_O0_I4(s, s, s, s)
+C_O0_I4(S, p, s, s)
 C_O1_I1(r, l)
 C_O1_I1(r, r)
 C_O1_I1(w, r)
@@ -38,8 +39,8 @@ C_O1_I2(w, w, wZ)
 C_O1_I3(w, w, w, w)
 C_O1_I4(r, r, r, rI, rI)
 C_O1_I4(r, r, rIN, rIK, 0)
-C_O2_I1(r, r, l)
-C_O2_I2(r, r, l, l)
+C_O2_I1(e, p, l)
+C_O2_I2(e, p, l, l)
 C_O2_I2(r, r, r, r)
 C_O2_I4(r, r, r, r, rIN, rIK)
 C_O2_I4(r, r, rI, rI, rIN, rIK)
diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h
index 8f501149e1..24b4b59feb 100644
--- a/tcg/arm/tcg-target-con-str.h
+++ b/tcg/arm/tcg-target-con-str.h
@@ -8,9 +8,11 @@
  * Define constraint letters for register sets:
  * REGS(letter, register_mask)
  */
+REGS('e', ALL_GENERAL_REGS & 0x5555) /* even regs */
 REGS('r', ALL_GENERAL_REGS)
 REGS('l', ALL_QLOAD_REGS)
 REGS('s', ALL_QSTORE_REGS)
+REGS('S', ALL_QSTORE_REGS & 0x5555)  /* even qstore */
 REGS('w', ALL_VECTOR_REGS)
 
 /*
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index 9245ea86d0..e82749a602 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -1692,9 +1692,11 @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
         tcg_out_ld32_r(s, COND_AL, datalo, addrlo, addend);
         break;
     case MO_UQ:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* LDRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             /*
              * Rm (the second address op) must not overlap Rt or Rt + 1.
              * Since datalo is aligned, we can simplify the test via alignment.
@@ -1748,9 +1750,11 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
         tcg_out_ld32_12(s, COND_AL, datalo, addrlo, 0);
         break;
     case MO_UQ:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* LDRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             tcg_out_ldrd_8(s, COND_AL, datalo, addrlo, 0);
         } else if (datalo == addrlo) {
             tcg_out_ld32_12(s, COND_AL, datahi, addrlo, 4);
@@ -1832,9 +1836,11 @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
         tcg_out_st32_r(s, cond, datalo, addrlo, addend);
         break;
     case MO_64:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* STRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             tcg_out_strd_r(s, cond, datalo, addrlo, addend);
         } else if (scratch_addend) {
             tcg_out_st32_rwb(s, cond, datalo, addend, addrlo);
@@ -1869,9 +1875,11 @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg datalo,
         tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
         break;
     case MO_64:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* STRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             tcg_out_strd_8(s, COND_AL, datalo, addrlo, 0);
         } else {
             tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
@@ -2339,11 +2347,11 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_qemu_ld_i32:
         return TARGET_LONG_BITS == 32 ? C_O1_I1(r, l) : C_O1_I2(r, l, l);
     case INDEX_op_qemu_ld_i64:
-        return TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, l) : C_O2_I2(r, r, l, l);
+        return TARGET_LONG_BITS == 32 ? C_O2_I1(e, p, l) : C_O2_I2(e, p, l, l);
     case INDEX_op_qemu_st_i32:
         return TARGET_LONG_BITS == 32 ? C_O0_I2(s, s) : C_O0_I3(s, s, s);
     case INDEX_op_qemu_st_i64:
-        return TARGET_LONG_BITS == 32 ? C_O0_I3(s, s, s) : C_O0_I4(s, s, s, s);
+        return TARGET_LONG_BITS == 32 ? C_O0_I3(S, p, s) : C_O0_I4(S, p, s, s);
 
     case INDEX_op_st_vec:
         return C_O0_I2(w, r);
-- 
2.34.1
Re: [PATCH v4] tcg/arm: Use register pair allocation for qemu_{ld,st}_i64
Posted by Richard Henderson 1 year, 4 months ago
Ping.

r~

On 1/6/23 15:06, Richard Henderson wrote:
> Although we still can't use ldrd and strd for all operations,
> increase the chances by getting the register allocation correct.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> 
> v3 was patch 5 in a larger patch set:
> 
>    https://lore.kernel.org/qemu-devel/20221111074101.2069454-6-richard.henderson@linaro.org/
> 
> most of which has been merged.
> 
> r~
> 
> ---
>   tcg/arm/tcg-target-con-set.h |  7 ++++---
>   tcg/arm/tcg-target-con-str.h |  2 ++
>   tcg/arm/tcg-target.c.inc     | 28 ++++++++++++++++++----------
>   3 files changed, 24 insertions(+), 13 deletions(-)
> 
> diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
> index 3685e1786a..b8849b2478 100644
> --- a/tcg/arm/tcg-target-con-set.h
> +++ b/tcg/arm/tcg-target-con-set.h
> @@ -15,8 +15,9 @@ C_O0_I2(r, rIN)
>   C_O0_I2(s, s)
>   C_O0_I2(w, r)
>   C_O0_I3(s, s, s)
> +C_O0_I3(S, p, s)
>   C_O0_I4(r, r, rI, rI)
> -C_O0_I4(s, s, s, s)
> +C_O0_I4(S, p, s, s)
>   C_O1_I1(r, l)
>   C_O1_I1(r, r)
>   C_O1_I1(w, r)
> @@ -38,8 +39,8 @@ C_O1_I2(w, w, wZ)
>   C_O1_I3(w, w, w, w)
>   C_O1_I4(r, r, r, rI, rI)
>   C_O1_I4(r, r, rIN, rIK, 0)
> -C_O2_I1(r, r, l)
> -C_O2_I2(r, r, l, l)
> +C_O2_I1(e, p, l)
> +C_O2_I2(e, p, l, l)
>   C_O2_I2(r, r, r, r)
>   C_O2_I4(r, r, r, r, rIN, rIK)
>   C_O2_I4(r, r, rI, rI, rIN, rIK)
> diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h
> index 8f501149e1..24b4b59feb 100644
> --- a/tcg/arm/tcg-target-con-str.h
> +++ b/tcg/arm/tcg-target-con-str.h
> @@ -8,9 +8,11 @@
>    * Define constraint letters for register sets:
>    * REGS(letter, register_mask)
>    */
> +REGS('e', ALL_GENERAL_REGS & 0x5555) /* even regs */
>   REGS('r', ALL_GENERAL_REGS)
>   REGS('l', ALL_QLOAD_REGS)
>   REGS('s', ALL_QSTORE_REGS)
> +REGS('S', ALL_QSTORE_REGS & 0x5555)  /* even qstore */
>   REGS('w', ALL_VECTOR_REGS)
>   
>   /*
> diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
> index 9245ea86d0..e82749a602 100644
> --- a/tcg/arm/tcg-target.c.inc
> +++ b/tcg/arm/tcg-target.c.inc
> @@ -1692,9 +1692,11 @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
>           tcg_out_ld32_r(s, COND_AL, datalo, addrlo, addend);
>           break;
>       case MO_UQ:
> +        /* We used pair allocation for datalo, so already should be aligned. */
> +        tcg_debug_assert((datalo & 1) == 0);
> +        tcg_debug_assert(datahi == datalo + 1);
>           /* LDRD requires alignment; double-check that. */
> -        if (get_alignment_bits(opc) >= MO_64
> -            && (datalo & 1) == 0 && datahi == datalo + 1) {
> +        if (get_alignment_bits(opc) >= MO_64) {
>               /*
>                * Rm (the second address op) must not overlap Rt or Rt + 1.
>                * Since datalo is aligned, we can simplify the test via alignment.
> @@ -1748,9 +1750,11 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
>           tcg_out_ld32_12(s, COND_AL, datalo, addrlo, 0);
>           break;
>       case MO_UQ:
> +        /* We used pair allocation for datalo, so already should be aligned. */
> +        tcg_debug_assert((datalo & 1) == 0);
> +        tcg_debug_assert(datahi == datalo + 1);
>           /* LDRD requires alignment; double-check that. */
> -        if (get_alignment_bits(opc) >= MO_64
> -            && (datalo & 1) == 0 && datahi == datalo + 1) {
> +        if (get_alignment_bits(opc) >= MO_64) {
>               tcg_out_ldrd_8(s, COND_AL, datalo, addrlo, 0);
>           } else if (datalo == addrlo) {
>               tcg_out_ld32_12(s, COND_AL, datahi, addrlo, 4);
> @@ -1832,9 +1836,11 @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
>           tcg_out_st32_r(s, cond, datalo, addrlo, addend);
>           break;
>       case MO_64:
> +        /* We used pair allocation for datalo, so already should be aligned. */
> +        tcg_debug_assert((datalo & 1) == 0);
> +        tcg_debug_assert(datahi == datalo + 1);
>           /* STRD requires alignment; double-check that. */
> -        if (get_alignment_bits(opc) >= MO_64
> -            && (datalo & 1) == 0 && datahi == datalo + 1) {
> +        if (get_alignment_bits(opc) >= MO_64) {
>               tcg_out_strd_r(s, cond, datalo, addrlo, addend);
>           } else if (scratch_addend) {
>               tcg_out_st32_rwb(s, cond, datalo, addend, addrlo);
> @@ -1869,9 +1875,11 @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg datalo,
>           tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
>           break;
>       case MO_64:
> +        /* We used pair allocation for datalo, so already should be aligned. */
> +        tcg_debug_assert((datalo & 1) == 0);
> +        tcg_debug_assert(datahi == datalo + 1);
>           /* STRD requires alignment; double-check that. */
> -        if (get_alignment_bits(opc) >= MO_64
> -            && (datalo & 1) == 0 && datahi == datalo + 1) {
> +        if (get_alignment_bits(opc) >= MO_64) {
>               tcg_out_strd_8(s, COND_AL, datalo, addrlo, 0);
>           } else {
>               tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
> @@ -2339,11 +2347,11 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
>       case INDEX_op_qemu_ld_i32:
>           return TARGET_LONG_BITS == 32 ? C_O1_I1(r, l) : C_O1_I2(r, l, l);
>       case INDEX_op_qemu_ld_i64:
> -        return TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, l) : C_O2_I2(r, r, l, l);
> +        return TARGET_LONG_BITS == 32 ? C_O2_I1(e, p, l) : C_O2_I2(e, p, l, l);
>       case INDEX_op_qemu_st_i32:
>           return TARGET_LONG_BITS == 32 ? C_O0_I2(s, s) : C_O0_I3(s, s, s);
>       case INDEX_op_qemu_st_i64:
> -        return TARGET_LONG_BITS == 32 ? C_O0_I3(s, s, s) : C_O0_I4(s, s, s, s);
> +        return TARGET_LONG_BITS == 32 ? C_O0_I3(S, p, s) : C_O0_I4(S, p, s, s);
>   
>       case INDEX_op_st_vec:
>           return C_O0_I2(w, r);