[PATCH v2 24/24] target/arm: Enforce alignment for sve unpredicated LDR/STR

Richard Henderson posted 24 patches 5 years, 2 months ago
Maintainers: Peter Maydell <peter.maydell@linaro.org>
There is a newer version of this series
[PATCH v2 24/24] target/arm: Enforce alignment for sve unpredicated LDR/STR
Posted by Richard Henderson 5 years, 2 months ago
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/translate-sve.c | 58 +++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 6125e734af..b481e97428 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -4263,7 +4263,8 @@ static bool trans_UCVTF_dd(DisasContext *s, arg_rpr_esz *a)
  * The load should begin at the address Rn + IMM.
  */
 
-static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
+static void do_ldr(DisasContext *s, uint32_t vofs, int len,
+                   MemOp align, int rn, int imm)
 {
     int len_align = QEMU_ALIGN_DOWN(len, 8);
     int len_remain = len % 8;
@@ -4276,6 +4277,10 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
     clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8);
     tcg_temp_free_i64(dirty_addr);
 
+    if (!s->align_mem) {
+        align = 0;
+    }
+
     /*
      * Note that unpredicated load/store of vector/predicate registers
      * are defined as a stream of bytes, which equates to little-endian
@@ -4288,7 +4293,8 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
 
         t0 = tcg_temp_new_i64();
         for (i = 0; i < len_align; i += 8) {
-            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ);
+            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);
+            align = 0;
             tcg_gen_st_i64(t0, cpu_env, vofs + i);
             tcg_gen_addi_i64(clean_addr, clean_addr, 8);
         }
@@ -4302,6 +4308,16 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         clean_addr = new_tmp_a64_local(s);
         tcg_gen_mov_i64(clean_addr, t0);
 
+        if (align > MO_ALIGN_8) {
+            t0 = tcg_temp_new_i64();
+            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);
+            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+            tcg_gen_addi_ptr(i, i, 8);
+            tcg_gen_st_i64(t0, cpu_env, vofs);
+            tcg_temp_free_i64(t0);
+            align = 0;
+        }
+
         gen_set_label(loop);
 
         t0 = tcg_temp_new_i64();
@@ -4330,12 +4346,12 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         case 4:
         case 8:
             tcg_gen_qemu_ld_i64(t0, clean_addr, midx,
-                                MO_LE | ctz32(len_remain));
+                                MO_LE | ctz32(len_remain) | align);
             break;
 
         case 6:
             t1 = tcg_temp_new_i64();
-            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL);
+            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL | align);
             tcg_gen_addi_i64(clean_addr, clean_addr, 4);
             tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW);
             tcg_gen_deposit_i64(t0, t0, t1, 32, 32);
@@ -4351,7 +4367,8 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
 }
 
 /* Similarly for stores.  */
-static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
+static void do_str(DisasContext *s, uint32_t vofs, int len, MemOp align,
+                   int rn, int imm)
 {
     int len_align = QEMU_ALIGN_DOWN(len, 8);
     int len_remain = len % 8;
@@ -4364,6 +4381,10 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
     clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8);
     tcg_temp_free_i64(dirty_addr);
 
+    if (!s->align_mem) {
+        align = 0;
+    }
+
     /* Note that unpredicated load/store of vector/predicate registers
      * are defined as a stream of bytes, which equates to little-endian
      * operations on larger quantities.  There is no nice way to force
@@ -4378,7 +4399,8 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         t0 = tcg_temp_new_i64();
         for (i = 0; i < len_align; i += 8) {
             tcg_gen_ld_i64(t0, cpu_env, vofs + i);
-            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ);
+            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ | align);
+            align = 0;
             tcg_gen_addi_i64(clean_addr, clean_addr, 8);
         }
         tcg_temp_free_i64(t0);
@@ -4391,6 +4413,16 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         clean_addr = new_tmp_a64_local(s);
         tcg_gen_mov_i64(clean_addr, t0);
 
+        if (align > MO_ALIGN_8) {
+            t0 = tcg_temp_new_i64();
+            tcg_gen_ld_i64(t0, cpu_env, vofs);
+            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ | align);
+            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+            tcg_gen_addi_ptr(i, i, 8);
+            tcg_temp_free_i64(t0);
+            align = 0;
+        }
+
         gen_set_label(loop);
 
         t0 = tcg_temp_new_i64();
@@ -4400,7 +4432,7 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         tcg_gen_addi_ptr(i, i, 8);
         tcg_temp_free_ptr(tp);
 
-        tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ);
+        tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ | align);
         tcg_gen_addi_i64(clean_addr, clean_addr, 8);
         tcg_temp_free_i64(t0);
 
@@ -4418,11 +4450,11 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         case 4:
         case 8:
             tcg_gen_qemu_st_i64(t0, clean_addr, midx,
-                                MO_LE | ctz32(len_remain));
+                                MO_LE | ctz32(len_remain) | align);
             break;
 
         case 6:
-            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL);
+            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL | align);
             tcg_gen_addi_i64(clean_addr, clean_addr, 4);
             tcg_gen_shri_i64(t0, t0, 32);
             tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW);
@@ -4440,7 +4472,7 @@ static bool trans_LDR_zri(DisasContext *s, arg_rri *a)
     if (sve_access_check(s)) {
         int size = vec_full_reg_size(s);
         int off = vec_full_reg_offset(s, a->rd);
-        do_ldr(s, off, size, a->rn, a->imm * size);
+        do_ldr(s, off, size, MO_ALIGN_16, a->rn, a->imm * size);
     }
     return true;
 }
@@ -4450,7 +4482,7 @@ static bool trans_LDR_pri(DisasContext *s, arg_rri *a)
     if (sve_access_check(s)) {
         int size = pred_full_reg_size(s);
         int off = pred_full_reg_offset(s, a->rd);
-        do_ldr(s, off, size, a->rn, a->imm * size);
+        do_ldr(s, off, size, MO_ALIGN_2, a->rn, a->imm * size);
     }
     return true;
 }
@@ -4460,7 +4492,7 @@ static bool trans_STR_zri(DisasContext *s, arg_rri *a)
     if (sve_access_check(s)) {
         int size = vec_full_reg_size(s);
         int off = vec_full_reg_offset(s, a->rd);
-        do_str(s, off, size, a->rn, a->imm * size);
+        do_str(s, off, size, MO_ALIGN_16, a->rn, a->imm * size);
     }
     return true;
 }
@@ -4470,7 +4502,7 @@ static bool trans_STR_pri(DisasContext *s, arg_rri *a)
     if (sve_access_check(s)) {
         int size = pred_full_reg_size(s);
         int off = pred_full_reg_offset(s, a->rd);
-        do_str(s, off, size, a->rn, a->imm * size);
+        do_str(s, off, size, MO_ALIGN_2, a->rn, a->imm * size);
     }
     return true;
 }
-- 
2.25.1


Re: [PATCH v2 24/24] target/arm: Enforce alignment for sve unpredicated LDR/STR
Posted by Peter Maydell 5 years, 1 month ago
On Tue, 8 Dec 2020 at 18:02, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/arm/translate-sve.c | 58 +++++++++++++++++++++++++++++---------
>  1 file changed, 45 insertions(+), 13 deletions(-)
>
> diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
> index 6125e734af..b481e97428 100644
> --- a/target/arm/translate-sve.c
> +++ b/target/arm/translate-sve.c
> @@ -4263,7 +4263,8 @@ static bool trans_UCVTF_dd(DisasContext *s, arg_rpr_esz *a)
>   * The load should begin at the address Rn + IMM.
>   */
>
> -static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
> +static void do_ldr(DisasContext *s, uint32_t vofs, int len,
> +                   MemOp align, int rn, int imm)
>  {
>      int len_align = QEMU_ALIGN_DOWN(len, 8);
>      int len_remain = len % 8;
> @@ -4276,6 +4277,10 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
>      clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8);
>      tcg_temp_free_i64(dirty_addr);
>
> +    if (!s->align_mem) {
> +        align = 0;
> +    }
> +
>      /*
>       * Note that unpredicated load/store of vector/predicate registers
>       * are defined as a stream of bytes, which equates to little-endian
> @@ -4288,7 +4293,8 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
>
>          t0 = tcg_temp_new_i64();
>          for (i = 0; i < len_align; i += 8) {
> -            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ);
> +            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);
> +            align = 0;
>              tcg_gen_st_i64(t0, cpu_env, vofs + i);
>              tcg_gen_addi_i64(clean_addr, clean_addr, 8);
>          }
> @@ -4302,6 +4308,16 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
>          clean_addr = new_tmp_a64_local(s);
>          tcg_gen_mov_i64(clean_addr, t0);
>
> +        if (align > MO_ALIGN_8) {
> +            t0 = tcg_temp_new_i64();
> +            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);
> +            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
> +            tcg_gen_addi_ptr(i, i, 8);
> +            tcg_gen_st_i64(t0, cpu_env, vofs);
> +            tcg_temp_free_i64(t0);
> +            align = 0;
> +        }
> +

Why do we need to do this (and the similar thing in do_str())?
Most of the rest of the patch is fairly clear in that it is just
passing the alignment requirement through to the load/store fns,
but this is a bit more opaque to me...

thanks
-- PMM

Re: [PATCH v2 24/24] target/arm: Enforce alignment for sve unpredicated LDR/STR
Posted by Richard Henderson 5 years, 1 month ago
On 1/7/21 7:39 AM, Peter Maydell wrote:
>> +        if (align > MO_ALIGN_8) {
>> +            t0 = tcg_temp_new_i64();
>> +            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);
>> +            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
>> +            tcg_gen_addi_ptr(i, i, 8);
>> +            tcg_gen_st_i64(t0, cpu_env, vofs);
>> +            tcg_temp_free_i64(t0);
>> +            align = 0;
>> +        }
>> +
> 
> Why do we need to do this (and the similar thing in do_str()) ?
> Most of the rest of the patch is fairly clear in that it is just
> passing the alignment requirement through to the load/store fns,
> but this is a bit more opaque to me...

What follows this context is a single memory access within a tcg loop.

When align is <= the size of the access, every access can use the same
alignment mop.  But for MO_ALIGN_16, since we're emitting 8-byte accesses, the
second access will not be 16-byte aligned.  So I peel off one loop iteration at
the beginning to perform the alignment check.


r~

Re: [PATCH v2 24/24] target/arm: Enforce alignment for sve unpredicated LDR/STR
Posted by Peter Maydell 5 years, 1 month ago
On Thu, 7 Jan 2021 at 22:02, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 1/7/21 7:39 AM, Peter Maydell wrote:
> >> +        if (align > MO_ALIGN_8) {
> >> +            t0 = tcg_temp_new_i64();
> >> +            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);
> >> +            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
> >> +            tcg_gen_addi_ptr(i, i, 8);
> >> +            tcg_gen_st_i64(t0, cpu_env, vofs);
> >> +            tcg_temp_free_i64(t0);
> >> +            align = 0;
> >> +        }
> >> +
> >
> > Why do we need to do this (and the similar thing in do_str()) ?
> > Most of the rest of the patch is fairly clear in that it is just
> > passing the alignment requirement through to the load/store fns,
> > but this is a bit more opaque to me...
>
> What follows this context is a single memory access within a tcg loop.
>
> When align is <= the size of the access, every access can use the same
> alignment mop.  But for MO_ALIGN_16, since we're emitting 8-byte accesses, the
> second access will not be 16-byte aligned.  So I peel off one loop iteration at
> the beginning to perform the alignment check.

OK. Could you add comments to that effect, please?

Otherwise
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

thanks
-- PMM