diff mbox series

[v2,24/24] target/arm: Enforce alignment for sve unpredicated LDR/STR

Message ID 20201208180118.157911-25-richard.henderson@linaro.org
State New
Headers show
Series target/arm: enforce alignment | expand

Commit Message

Richard Henderson Dec. 8, 2020, 6:01 p.m. UTC
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 target/arm/translate-sve.c | 58 +++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 13 deletions(-)

-- 
2.25.1

Comments

Peter Maydell Jan. 7, 2021, 5:39 p.m. UTC | #1
On Tue, 8 Dec 2020 at 18:02, Richard Henderson
<richard.henderson@linaro.org> wrote:
>

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

>  target/arm/translate-sve.c | 58 +++++++++++++++++++++++++++++---------

>  1 file changed, 45 insertions(+), 13 deletions(-)

>

> diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c

> index 6125e734af..b481e97428 100644

> --- a/target/arm/translate-sve.c

> +++ b/target/arm/translate-sve.c

> @@ -4263,7 +4263,8 @@ static bool trans_UCVTF_dd(DisasContext *s, arg_rpr_esz *a)

>   * The load should begin at the address Rn + IMM.

>   */

>

> -static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)

> +static void do_ldr(DisasContext *s, uint32_t vofs, int len,

> +                   MemOp align, int rn, int imm)

>  {

>      int len_align = QEMU_ALIGN_DOWN(len, 8);

>      int len_remain = len % 8;

> @@ -4276,6 +4277,10 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)

>      clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8);

>      tcg_temp_free_i64(dirty_addr);

>

> +    if (!s->align_mem) {

> +        align = 0;

> +    }

> +

>      /*

>       * Note that unpredicated load/store of vector/predicate registers

>       * are defined as a stream of bytes, which equates to little-endian

> @@ -4288,7 +4293,8 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)

>

>          t0 = tcg_temp_new_i64();

>          for (i = 0; i < len_align; i += 8) {

> -            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ);

> +            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);

> +            align = 0;

>              tcg_gen_st_i64(t0, cpu_env, vofs + i);

>              tcg_gen_addi_i64(clean_addr, clean_addr, 8);

>          }

> @@ -4302,6 +4308,16 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)

>          clean_addr = new_tmp_a64_local(s);

>          tcg_gen_mov_i64(clean_addr, t0);

>

> +        if (align > MO_ALIGN_8) {

> +            t0 = tcg_temp_new_i64();

> +            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);

> +            tcg_gen_addi_i64(clean_addr, clean_addr, 8);

> +            tcg_gen_addi_ptr(i, i, 8);

> +            tcg_gen_st_i64(t0, cpu_env, vofs);

> +            tcg_temp_free_i64(t0);

> +            align = 0;

> +        }

> +


Why do we need to do this (and the similar thing in do_str())?
Most of the rest of the patch is fairly clear in that it is just
passing the alignment requirement through to the load/store fns,
but this is a bit more opaque to me...

thanks
-- PMM
Richard Henderson Jan. 7, 2021, 10:02 p.m. UTC | #2
On 1/7/21 7:39 AM, Peter Maydell wrote:
>> +        if (align > MO_ALIGN_8) {

>> +            t0 = tcg_temp_new_i64();

>> +            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);

>> +            tcg_gen_addi_i64(clean_addr, clean_addr, 8);

>> +            tcg_gen_addi_ptr(i, i, 8);

>> +            tcg_gen_st_i64(t0, cpu_env, vofs);

>> +            tcg_temp_free_i64(t0);

>> +            align = 0;

>> +        }

>> +

> 

> Why do we need to do this (and the similar thing in do_str()) ?

> Most of the rest of the patch is fairly clear in that it is just

> passing the alignment requirement through to the load/store fns,

> but this is a bit more opaque to me...


What follows this context is a single memory access within a tcg loop.

When align is <= the size of the access, every access can use the same
alignment MemOp.  But for MO_ALIGN_16, since we're emitting 8-byte accesses, the
second access will not be 16-byte aligned.  So I peel off one loop iteration at
the beginning to perform the alignment check.


r~
Peter Maydell Jan. 8, 2021, 5:22 p.m. UTC | #3
On Thu, 7 Jan 2021 at 22:02, Richard Henderson
<richard.henderson@linaro.org> wrote:
>

> On 1/7/21 7:39 AM, Peter Maydell wrote:

> >> +        if (align > MO_ALIGN_8) {

> >> +            t0 = tcg_temp_new_i64();

> >> +            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);

> >> +            tcg_gen_addi_i64(clean_addr, clean_addr, 8);

> >> +            tcg_gen_addi_ptr(i, i, 8);

> >> +            tcg_gen_st_i64(t0, cpu_env, vofs);

> >> +            tcg_temp_free_i64(t0);

> >> +            align = 0;

> >> +        }

> >> +

> >

> > Why do we need to do this (and the similar thing in do_str()) ?

> > Most of the rest of the patch is fairly clear in that it is just

> > passing the alignment requirement through to the load/store fns,

> > but this is a bit more opaque to me...

>

> What follows this context is a single memory access within a tcg loop.

>

> When align is <= the size of the access, every access can use the same

> alignment mop.  But for MO_ALIGN_16, since we're emitting 8-byte accesses, the

> second access will not be 16-byte aligned.  So I peel off one loop iteration at

> the beginning to perform the alignment check.


OK. Could you add comments to that effect, please?

Otherwise
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>


thanks
-- PMM
diff mbox series

Patch

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 6125e734af..b481e97428 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -4263,7 +4263,8 @@  static bool trans_UCVTF_dd(DisasContext *s, arg_rpr_esz *a)
  * The load should begin at the address Rn + IMM.
  */
 
-static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
+static void do_ldr(DisasContext *s, uint32_t vofs, int len,
+                   MemOp align, int rn, int imm)
 {
     int len_align = QEMU_ALIGN_DOWN(len, 8);
     int len_remain = len % 8;
@@ -4276,6 +4277,10 @@  static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
     clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8);
     tcg_temp_free_i64(dirty_addr);
 
+    if (!s->align_mem) {
+        align = 0;
+    }
+
     /*
      * Note that unpredicated load/store of vector/predicate registers
      * are defined as a stream of bytes, which equates to little-endian
@@ -4288,7 +4293,8 @@  static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
 
         t0 = tcg_temp_new_i64();
         for (i = 0; i < len_align; i += 8) {
-            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ);
+            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);
+            align = 0;
             tcg_gen_st_i64(t0, cpu_env, vofs + i);
             tcg_gen_addi_i64(clean_addr, clean_addr, 8);
         }
@@ -4302,6 +4308,16 @@  static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         clean_addr = new_tmp_a64_local(s);
         tcg_gen_mov_i64(clean_addr, t0);
 
+        if (align > MO_ALIGN_8) {
+            t0 = tcg_temp_new_i64();
+            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ | align);
+            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+            tcg_gen_addi_ptr(i, i, 8);
+            tcg_gen_st_i64(t0, cpu_env, vofs);
+            tcg_temp_free_i64(t0);
+            align = 0;
+        }
+
         gen_set_label(loop);
 
         t0 = tcg_temp_new_i64();
@@ -4330,12 +4346,12 @@  static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         case 4:
         case 8:
             tcg_gen_qemu_ld_i64(t0, clean_addr, midx,
-                                MO_LE | ctz32(len_remain));
+                                MO_LE | ctz32(len_remain) | align);
             break;
 
         case 6:
             t1 = tcg_temp_new_i64();
-            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL);
+            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL | align);
             tcg_gen_addi_i64(clean_addr, clean_addr, 4);
             tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW);
             tcg_gen_deposit_i64(t0, t0, t1, 32, 32);
@@ -4351,7 +4367,8 @@  static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
 }
 
 /* Similarly for stores.  */
-static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
+static void do_str(DisasContext *s, uint32_t vofs, int len, MemOp align,
+                   int rn, int imm)
 {
     int len_align = QEMU_ALIGN_DOWN(len, 8);
     int len_remain = len % 8;
@@ -4364,6 +4381,10 @@  static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
     clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8);
     tcg_temp_free_i64(dirty_addr);
 
+    if (!s->align_mem) {
+        align = 0;
+    }
+
     /* Note that unpredicated load/store of vector/predicate registers
      * are defined as a stream of bytes, which equates to little-endian
      * operations on larger quantities.  There is no nice way to force
@@ -4378,7 +4399,8 @@  static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         t0 = tcg_temp_new_i64();
         for (i = 0; i < len_align; i += 8) {
             tcg_gen_ld_i64(t0, cpu_env, vofs + i);
-            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ);
+            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ | align);
+            align = 0;
             tcg_gen_addi_i64(clean_addr, clean_addr, 8);
         }
         tcg_temp_free_i64(t0);
@@ -4391,6 +4413,16 @@  static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         clean_addr = new_tmp_a64_local(s);
         tcg_gen_mov_i64(clean_addr, t0);
 
+        if (align > MO_ALIGN_8) {
+            t0 = tcg_temp_new_i64();
+            tcg_gen_ld_i64(t0, cpu_env, vofs);
+            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ | align);
+            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+            tcg_gen_addi_ptr(i, i, 8);
+            tcg_temp_free_i64(t0);
+            align = 0;
+        }
+
         gen_set_label(loop);
 
         t0 = tcg_temp_new_i64();
@@ -4400,7 +4432,7 @@  static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         tcg_gen_addi_ptr(i, i, 8);
         tcg_temp_free_ptr(tp);
 
-        tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ);
+        tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ | align);
         tcg_gen_addi_i64(clean_addr, clean_addr, 8);
         tcg_temp_free_i64(t0);
 
@@ -4418,11 +4450,11 @@  static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
         case 4:
         case 8:
             tcg_gen_qemu_st_i64(t0, clean_addr, midx,
-                                MO_LE | ctz32(len_remain));
+                                MO_LE | ctz32(len_remain) | align);
             break;
 
         case 6:
-            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL);
+            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL | align);
             tcg_gen_addi_i64(clean_addr, clean_addr, 4);
             tcg_gen_shri_i64(t0, t0, 32);
             tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW);
@@ -4440,7 +4472,7 @@  static bool trans_LDR_zri(DisasContext *s, arg_rri *a)
     if (sve_access_check(s)) {
         int size = vec_full_reg_size(s);
         int off = vec_full_reg_offset(s, a->rd);
-        do_ldr(s, off, size, a->rn, a->imm * size);
+        do_ldr(s, off, size, MO_ALIGN_16, a->rn, a->imm * size);
     }
     return true;
 }
@@ -4450,7 +4482,7 @@  static bool trans_LDR_pri(DisasContext *s, arg_rri *a)
     if (sve_access_check(s)) {
         int size = pred_full_reg_size(s);
         int off = pred_full_reg_offset(s, a->rd);
-        do_ldr(s, off, size, a->rn, a->imm * size);
+        do_ldr(s, off, size, MO_ALIGN_2, a->rn, a->imm * size);
     }
     return true;
 }
@@ -4460,7 +4492,7 @@  static bool trans_STR_zri(DisasContext *s, arg_rri *a)
     if (sve_access_check(s)) {
         int size = vec_full_reg_size(s);
         int off = vec_full_reg_offset(s, a->rd);
-        do_str(s, off, size, a->rn, a->imm * size);
+        do_str(s, off, size, MO_ALIGN_16, a->rn, a->imm * size);
     }
     return true;
 }
@@ -4470,7 +4502,7 @@  static bool trans_STR_pri(DisasContext *s, arg_rri *a)
     if (sve_access_check(s)) {
         int size = pred_full_reg_size(s);
         int off = pred_full_reg_offset(s, a->rd);
-        do_str(s, off, size, a->rn, a->imm * size);
+        do_str(s, off, size, MO_ALIGN_2, a->rn, a->imm * size);
     }
     return true;
 }