@@ -1200,25 +1200,23 @@ static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src,
/* Store from vector register to memory */
static void do_vec_st(DisasContext *s, int srcidx, int element,
- TCGv_i64 tcg_addr, int size)
+ TCGv_i64 tcg_addr, int size, TCGMemOp endian)
{
- TCGMemOp memop = s->be_data + size;
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
read_vec_element(s, tcg_tmp, srcidx, element, size);
- tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
+ tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
tcg_temp_free_i64(tcg_tmp);
}
/* Load from memory to vector register */
static void do_vec_ld(DisasContext *s, int destidx, int element,
- TCGv_i64 tcg_addr, int size)
+ TCGv_i64 tcg_addr, int size, TCGMemOp endian)
{
- TCGMemOp memop = s->be_data + size;
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
- tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
+ tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
write_vec_element(s, tcg_tmp, destidx, element, size);
tcg_temp_free_i64(tcg_tmp);
@@ -3013,9 +3011,10 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
bool is_postidx = extract32(insn, 23, 1);
bool is_q = extract32(insn, 30, 1);
TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
+ TCGMemOp endian = s->be_data;
- int ebytes = 1 << size;
- int elements = (is_q ? 128 : 64) / (8 << size);
+ int ebytes; /* bytes per element */
+ int elements; /* elements per vector */
int rpt; /* num iterations */
int selem; /* structure elements */
int r;
@@ -3074,6 +3073,20 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
gen_check_sp_alignment(s);
}
+ /* For our purposes, bytes are always little-endian. */
+ if (size == 0) {
+ endian = MO_LE;
+ }
+
+ /* Consecutive little-endian elements from a single register
+ * can be promoted to a larger little-endian operation.
+ */
+ if (selem == 1 && endian == MO_LE) {
+ size = 3;
+ }
+ ebytes = 1 << size;
+ elements = (is_q ? 16 : 8) / ebytes;
+
tcg_rn = cpu_reg_sp(s, rn);
tcg_addr = tcg_temp_new_i64();
tcg_gen_mov_i64(tcg_addr, tcg_rn);
@@ -3082,32 +3095,33 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
for (r = 0; r < rpt; r++) {
int e;
for (e = 0; e < elements; e++) {
- int tt = (rt + r) % 32;
int xs;
for (xs = 0; xs < selem; xs++) {
+ int tt = (rt + r + xs) % 32;
if (is_store) {
- do_vec_st(s, tt, e, tcg_addr, size);
+ do_vec_st(s, tt, e, tcg_addr, size, endian);
} else {
- do_vec_ld(s, tt, e, tcg_addr, size);
-
- /* For non-quad operations, setting a slice of the low
- * 64 bits of the register clears the high 64 bits (in
- * the ARM ARM pseudocode this is implicit in the fact
- * that 'rval' is a 64 bit wide variable).
- * For quad operations, we might still need to zero the
- * high bits of SVE. We optimize by noticing that we only
- * need to do this the first time we touch a register.
- */
- if (e == 0 && (r == 0 || xs == selem - 1)) {
- clear_vec_high(s, is_q, tt);
- }
+ do_vec_ld(s, tt, e, tcg_addr, size, endian);
}
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
- tt = (tt + 1) % 32;
}
}
}
+ if (!is_store) {
+ /* For non-quad operations, setting a slice of the low
+ * 64 bits of the register clears the high 64 bits (in
+ * the ARM ARM pseudocode this is implicit in the fact
+ * that 'rval' is a 64 bit wide variable).
+ * For quad operations, we might still need to zero the
+ * high bits of SVE.
+ */
+ for (r = 0; r < rpt * selem; r++) {
+ int tt = (rt + r) % 32;
+ clear_vec_high(s, is_q, tt);
+ }
+ }
+
if (is_postidx) {
int rm = extract32(insn, 16, 5);
if (rm == 31) {
@@ -3228,9 +3242,9 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
} else {
/* Load/store one element per register */
if (is_load) {
- do_vec_ld(s, rt, index, tcg_addr, scale);
+ do_vec_ld(s, rt, index, tcg_addr, scale, s->be_data);
} else {
- do_vec_st(s, rt, index, tcg_addr, scale);
+ do_vec_st(s, rt, index, tcg_addr, scale, s->be_data);
}
}
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
For a sequence of loads or stores from a single register, little-endian operations can be promoted to an 8-byte op. This can reduce the number of operations by a factor of 8. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/translate-a64.c | 66 +++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 26 deletions(-) -- 2.17.1