[04/20] target/arm: Promote consecutive memory ops for aa64

Message ID	20181011205206.3552-5-richard.henderson@linaro.org
State	New
Headers	show Delivered-To: patch@linaro.org Received-SPF: pass (google.com: domain of qemu-devel-bounces+patch=linaro.org@nongnu.org designates 2001:4830:134:3::11 as permitted sender) client-ip=2001:4830:134:3::11; From: Richard Henderson <richard.henderson@linaro.org> To: qemu-devel@nongnu.org Date: Thu, 11 Oct 2018 13:51:50 -0700 Message-Id: <20181011205206.3552-5-richard.henderson@linaro.org> In-Reply-To: <20181011205206.3552-1-richard.henderson@linaro.org> References: <20181011205206.3552-1-richard.henderson@linaro.org> Subject: [Qemu-devel] [PATCH 04/20] target/arm: Promote consecutive memory ops for aa64 Precedence: list Cc: peter.maydell@linaro.org Errors-To: qemu-devel-bounces+patch=linaro.org@nongnu.org Sender: "Qemu-devel" <qemu-devel-bounces+patch=linaro.org@nongnu.org>
Series	target/arm: Convert some neon insns to gvec \| expand [00/20] target/arm: Convert some neon insns to gvec [01/20] target/arm: Hoist address increment for vector memory ops [02/20] target/arm: Don't call tcg_clear_temp_count [03/20] target/arm: Use tcg_gen_gvec_dup_i64 for LD[1-4]R [04/20] target/arm: Promote consecutive memory ops for aa64 [05/20] target/arm: Mark some arrays const [06/20] target/arm: Use gvec for NEON VDUP [07/20] target/arm: Use gvec for NEON VMOV, VMVN, VBIC & VORR (immediate) [08/20] target/arm: Use gvec for NEON_3R_LOGIC insns [09/20] target/arm: Use gvec for NEON_3R_VADD_VSUB insns [10/20] target/arm: Use gvec for NEON_2RM_VMN, NEON_2RM_VNEG [11/20] target/arm: Use gvec for NEON_3R_VMUL [12/20] target/arm: Use gvec for VSHR, VSHL [13/20] target/arm: Use gvec for VSRA [14/20] target/arm: Use gvec for VSRI, VSLI [15/20] target/arm: Use gvec for NEON_3R_VML [16/20] target/arm: Use gvec for NEON_3R_VTST_VCEQ, NEON_3R_VCGT, NEON_3R_VCGE [17/20] target/arm: Use gvec for NEON VLD all lanes [18/20] target/arm: Reorg NEON VLD/VST all elements [19/20] target/arm: Promote consecutive memory ops for aa32 [20/20] target/arm: Reorg NEON VLD/VST single element to one lane

Message ID

20181011205206.3552-5-richard.henderson@linaro.org

State

New

Headers

Received-SPF: pass (google.com: domain of
	qemu-devel-bounces+patch=linaro.org@nongnu.org designates
	2001:4830:134:3::11 as permitted sender)
	client-ip=2001:4830:134:3::11; 
From: Richard Henderson <richard.henderson@linaro.org>
To: qemu-devel@nongnu.org
Date: Thu, 11 Oct 2018 13:51:50 -0700
Message-Id: <20181011205206.3552-5-richard.henderson@linaro.org>
In-Reply-To: <20181011205206.3552-1-richard.henderson@linaro.org>
References: <20181011205206.3552-1-richard.henderson@linaro.org>
Subject: [Qemu-devel] [PATCH 04/20] target/arm: Promote consecutive memory
	ops for aa64
Precedence: list
Cc: peter.maydell@linaro.org
Errors-To: qemu-devel-bounces+patch=linaro.org@nongnu.org
Sender: "Qemu-devel" <qemu-devel-bounces+patch=linaro.org@nongnu.org>

Series

target/arm: Convert some neon insns to gvec | expand

Commit Message

Richard Henderson Oct. 11, 2018, 8:51 p.m. UTC

For a sequence of loads or stores from a single register,
consecutive little-endian elements can be promoted to a single
8-byte operation.  This reduces the number of memory operations
by up to a factor of 8 (the byte-element case).

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 target/arm/translate-a64.c | 66 +++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 26 deletions(-)

-- 
2.17.1

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index fff99ca303..2f4041462e 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -1200,25 +1200,23 @@  static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src,
 
 /* Store from vector register to memory */
 static void do_vec_st(DisasContext *s, int srcidx, int element,
-                      TCGv_i64 tcg_addr, int size)
+                      TCGv_i64 tcg_addr, int size, TCGMemOp endian)
 {
-    TCGMemOp memop = s->be_data + size;
     TCGv_i64 tcg_tmp = tcg_temp_new_i64();
 
     read_vec_element(s, tcg_tmp, srcidx, element, size);
-    tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
+    tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
 
     tcg_temp_free_i64(tcg_tmp);
 }
 
 /* Load from memory to vector register */
 static void do_vec_ld(DisasContext *s, int destidx, int element,
-                      TCGv_i64 tcg_addr, int size)
+                      TCGv_i64 tcg_addr, int size, TCGMemOp endian)
 {
-    TCGMemOp memop = s->be_data + size;
     TCGv_i64 tcg_tmp = tcg_temp_new_i64();
 
-    tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
+    tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
     write_vec_element(s, tcg_tmp, destidx, element, size);
 
     tcg_temp_free_i64(tcg_tmp);
@@ -3013,9 +3011,10 @@  static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
     bool is_postidx = extract32(insn, 23, 1);
     bool is_q = extract32(insn, 30, 1);
     TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
+    TCGMemOp endian = s->be_data;
 
-    int ebytes = 1 << size;
-    int elements = (is_q ? 128 : 64) / (8 << size);
+    int ebytes;   /* bytes per element */
+    int elements; /* elements per vector */
     int rpt;    /* num iterations */
     int selem;  /* structure elements */
     int r;
@@ -3074,6 +3073,20 @@  static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
         gen_check_sp_alignment(s);
     }
 
+    /* For our purposes, bytes are always little-endian.  */
+    if (size == 0) {
+        endian = MO_LE;
+    }
+
+    /* Consecutive little-endian elements from a single register
+     * can be promoted to a larger little-endian operation.
+     */
+    if (selem == 1 && endian == MO_LE) {
+        size = 3;
+    }
+    ebytes = 1 << size;
+    elements = (is_q ? 16 : 8) / ebytes;
+
     tcg_rn = cpu_reg_sp(s, rn);
     tcg_addr = tcg_temp_new_i64();
     tcg_gen_mov_i64(tcg_addr, tcg_rn);
@@ -3082,32 +3095,33 @@  static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
     for (r = 0; r < rpt; r++) {
         int e;
         for (e = 0; e < elements; e++) {
-            int tt = (rt + r) % 32;
             int xs;
             for (xs = 0; xs < selem; xs++) {
+                int tt = (rt + r + xs) % 32;
                 if (is_store) {
-                    do_vec_st(s, tt, e, tcg_addr, size);
+                    do_vec_st(s, tt, e, tcg_addr, size, endian);
                 } else {
-                    do_vec_ld(s, tt, e, tcg_addr, size);
-
-                    /* For non-quad operations, setting a slice of the low
-                     * 64 bits of the register clears the high 64 bits (in
-                     * the ARM ARM pseudocode this is implicit in the fact
-                     * that 'rval' is a 64 bit wide variable).
-                     * For quad operations, we might still need to zero the
-                     * high bits of SVE.  We optimize by noticing that we only
-                     * need to do this the first time we touch a register.
-                     */
-                    if (e == 0 && (r == 0 || xs == selem - 1)) {
-                        clear_vec_high(s, is_q, tt);
-                    }
+                    do_vec_ld(s, tt, e, tcg_addr, size, endian);
                 }
                 tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
-                tt = (tt + 1) % 32;
             }
         }
     }
 
+    if (!is_store) {
+        /* For non-quad operations, setting a slice of the low
+         * 64 bits of the register clears the high 64 bits (in
+         * the ARM ARM pseudocode this is implicit in the fact
+         * that 'rval' is a 64 bit wide variable).
+         * For quad operations, we might still need to zero the
+         * high bits of SVE.
+         */
+        for (r = 0; r < rpt * selem; r++) {
+            int tt = (rt + r) % 32;
+            clear_vec_high(s, is_q, tt);
+        }
+    }
+
     if (is_postidx) {
         int rm = extract32(insn, 16, 5);
         if (rm == 31) {
@@ -3228,9 +3242,9 @@  static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
         } else {
             /* Load/store one element per register */
             if (is_load) {
-                do_vec_ld(s, rt, index, tcg_addr, scale);
+                do_vec_ld(s, rt, index, tcg_addr, scale, s->be_data);
             } else {
-                do_vec_st(s, rt, index, tcg_addr, scale);
+                do_vec_st(s, rt, index, tcg_addr, scale, s->be_data);
             }
         }
         tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);

[04/20] target/arm: Promote consecutive memory ops for aa64

Commit Message

Patch