aarch64 sim load/store multiple instruction fixes

Message ID	CABXYE2W6U6LL4rbM8Hc6oq2eqHsgVJM00NbqE+RC_N0JNHZQqA@mail.gmail.com
State	New
Headers	show Delivered-To: patch@linaro.org Received-SPF: pass (google.com: domain of gdb-patches-return-138545-patch=linaro.org@sourceware.org designates 209.132.180.131 as permitted sender) client-ip=209.132.180.131; DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:mime-version:from:date:message-id:subject:to :content-type; q=dns; s=default; b=t9b0E+gwg4bQ7S8bA22nqIWgTdm5y Cq1/qKVKpZpeChPDWeV3UBY0y7n69uFbzzvOwLyOpK0Rngv0f3eQB+4yzHbdmg5U trmNFLmc0uWVW+zr3GdUWPXliNnsNYOaWYgiS23Dh4QXcPWUrROmTb5xxryPEjss hsxL0aQDIZ1rdo= Mailing-List: contact gdb-patches-help@sourceware.org; run by ezmlm Precedence: bulk Sender: gdb-patches-owner@sourceware.org MIME-Version: 1.0 From: Jim Wilson <jim.wilson@linaro.org> Date: Sat, 22 Apr 2017 16:55:53 -0700 Message-ID: <CABXYE2W6U6LL4rbM8Hc6oq2eqHsgVJM00NbqE+RC_N0JNHZQqA@mail.gmail.com> Subject: [PATCH] aarch64 sim load/store multiple instruction fixes To: gdb-patches@sourceware.org Content-Type: multipart/mixed; boundary=001a113fbc885784e1054dca1d07

2017-04-22 Jim Wilson <jim.wilson@linaro.org>

sim/aarch64/
* simulator.c (vec_load): Add M argument. Rewrite to iterate over registers based on structure size.
(LD4, LD3, LD2, LD1_2, LD1_3, LD1_4): Pass new arg to vec_load.
(LD1_1): Replace with call to vec_load.
(vec_store): Add new M argument. Rewrite to iterate over registers based on structure size.
(ST4, ST3, ST2, ST1_2, ST1_3, ST1_4): Pass new arg to vec_store.
(ST1_1): Replace with call to vec_store.

sim/testsuite/sim/aarch64/
* fcvtz.s, fstur.s, ldn_single.s, ldnr.s, mla.s, mls.s, uzp.s: Align data.
* sumulh.s: Delete unnecessary data alignment.
* stn_single.s: Align data. Fix unaligned ldr insns. Adjust cmp arguments to match change.
* ldn_multiple.s, stn_multiple.s: New.

diff --git a/sim/aarch64/simulator.c b/sim/aarch64/simulator.c index 16d8d8d..18f7944 100644 --- a/sim/aarch64/simulator.c +++ b/sim/aarch64/simulator.c @@ -11524,310 +11524,224 @@ vec_reg (unsigned v, unsigned o) return (v + o) & 0x3F; } -/* Load multiple N-element structures to N consecutive registers. */ +/* Load multiple N-element structures to M consecutive registers. */ static void -vec_load (sim_cpu *cpu, uint64_t address, unsigned N) +vec_load (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M) { int all = INSTR (30, 30); unsigned size = INSTR (11, 10); unsigned vd = INSTR (4, 0); - unsigned i; + unsigned rpt = (N == M) ? 1 : M; + unsigned selem = N; + unsigned i, j, k; switch (size) { case 0: /* 8-bit operations.
*/ - if (all) - for (i = 0; i < (16 * N); i++) - aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15, - aarch64_get_mem_u8 (cpu, address + i)); - else - for (i = 0; i < (8 * N); i++) - aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7, - aarch64_get_mem_u8 (cpu, address + i)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (8 + (8 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_vec_u8 (cpu, vec_reg (vd, i + k), j, + aarch64_get_mem_u8 (cpu, address)); + address += 1; + } return; case 1: /* 16-bit operations. */ - if (all) - for (i = 0; i < (8 * N); i++) - aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7, - aarch64_get_mem_u16 (cpu, address + i * 2)); - else - for (i = 0; i < (4 * N); i++) - aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3, - aarch64_get_mem_u16 (cpu, address + i * 2)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (4 + (4 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_vec_u16 (cpu, vec_reg (vd, i + k), j, + aarch64_get_mem_u16 (cpu, address)); + address += 2; + } return; case 2: /* 32-bit operations. */ - if (all) - for (i = 0; i < (4 * N); i++) - aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3, - aarch64_get_mem_u32 (cpu, address + i * 4)); - else - for (i = 0; i < (2 * N); i++) - aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1, - aarch64_get_mem_u32 (cpu, address + i * 4)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (2 + (2 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_vec_u32 (cpu, vec_reg (vd, i + k), j, + aarch64_get_mem_u32 (cpu, address)); + address += 4; + } return; case 3: /* 64-bit operations. 
*/ - if (all) - for (i = 0; i < (2 * N); i++) - aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1, - aarch64_get_mem_u64 (cpu, address + i * 8)); - else - for (i = 0; i < N; i++) - aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0, - aarch64_get_mem_u64 (cpu, address + i * 8)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (1 + all); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_vec_u64 (cpu, vec_reg (vd, i + k), j, + aarch64_get_mem_u64 (cpu, address)); + address += 8; + } return; } } -/* LD4: load multiple 4-element to four consecutive registers. */ +/* Load multiple 4-element structures into four consecutive registers. */ static void LD4 (sim_cpu *cpu, uint64_t address) { - vec_load (cpu, address, 4); + vec_load (cpu, address, 4, 4); } -/* LD3: load multiple 3-element structures to three consecutive registers. */ +/* Load multiple 3-element structures into three consecutive registers. */ static void LD3 (sim_cpu *cpu, uint64_t address) { - vec_load (cpu, address, 3); + vec_load (cpu, address, 3, 3); } -/* LD2: load multiple 2-element structures to two consecutive registers. */ +/* Load multiple 2-element structures into two consecutive registers. */ static void LD2 (sim_cpu *cpu, uint64_t address) { - vec_load (cpu, address, 2); + vec_load (cpu, address, 2, 2); } /* Load multiple 1-element structures into one register. */ static void LD1_1 (sim_cpu *cpu, uint64_t address) { - int all = INSTR (30, 30); - unsigned size = INSTR (11, 10); - unsigned vd = INSTR (4, 0); - unsigned i; - - switch (size) - { - case 0: - /* LD1 {Vd.16b}, addr, #16 */ - /* LD1 {Vd.8b}, addr, #8 */ - for (i = 0; i < (all ? 16 : 8); i++) - aarch64_set_vec_u8 (cpu, vd, i, - aarch64_get_mem_u8 (cpu, address + i)); - return; - - case 1: - /* LD1 {Vd.8h}, addr, #16 */ - /* LD1 {Vd.4h}, addr, #8 */ - for (i = 0; i < (all ? 
8 : 4); i++) - aarch64_set_vec_u16 (cpu, vd, i, - aarch64_get_mem_u16 (cpu, address + i * 2)); - return; - - case 2: - /* LD1 {Vd.4s}, addr, #16 */ - /* LD1 {Vd.2s}, addr, #8 */ - for (i = 0; i < (all ? 4 : 2); i++) - aarch64_set_vec_u32 (cpu, vd, i, - aarch64_get_mem_u32 (cpu, address + i * 4)); - return; - - case 3: - /* LD1 {Vd.2d}, addr, #16 */ - /* LD1 {Vd.1d}, addr, #8 */ - for (i = 0; i < (all ? 2 : 1); i++) - aarch64_set_vec_u64 (cpu, vd, i, - aarch64_get_mem_u64 (cpu, address + i * 8)); - return; - } + vec_load (cpu, address, 1, 1); } /* Load multiple 1-element structures into two registers. */ static void LD1_2 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the LD2 version. - So why have two different instructions ? There must be something - wrong somewhere. */ - vec_load (cpu, address, 2); + vec_load (cpu, address, 1, 2); } /* Load multiple 1-element structures into three registers. */ static void LD1_3 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the LD3 version. - So why have two different instructions ? There must be something - wrong somewhere. */ - vec_load (cpu, address, 3); + vec_load (cpu, address, 1, 3); } /* Load multiple 1-element structures into four registers. */ static void LD1_4 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the LD4 version. - So why have two different instructions ? There must be something - wrong somewhere. */ - vec_load (cpu, address, 4); + vec_load (cpu, address, 1, 4); } -/* Store multiple N-element structures to N consecutive registers. */ +/* Store multiple N-element structures from M consecutive registers. */ static void -vec_store (sim_cpu *cpu, uint64_t address, unsigned N) +vec_store (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M) { int all = INSTR (30, 30); unsigned size = INSTR (11, 10); unsigned vd = INSTR (4, 0); - unsigned i; + unsigned rpt = (N == M) ? 
1 : M; + unsigned selem = N; + unsigned i, j, k; switch (size) { case 0: /* 8-bit operations. */ - if (all) - for (i = 0; i < (16 * N); i++) - aarch64_set_mem_u8 - (cpu, address + i, - aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15)); - else - for (i = 0; i < (8 * N); i++) - aarch64_set_mem_u8 - (cpu, address + i, - aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (8 + (8 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_mem_u8 + (cpu, address, + aarch64_get_vec_u8 (cpu, vec_reg (vd, i + k), j)); + address += 1; + } return; case 1: /* 16-bit operations. */ - if (all) - for (i = 0; i < (8 * N); i++) - aarch64_set_mem_u16 - (cpu, address + i * 2, - aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7)); - else - for (i = 0; i < (4 * N); i++) - aarch64_set_mem_u16 - (cpu, address + i * 2, - aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (4 + (4 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_mem_u16 + (cpu, address, + aarch64_get_vec_u16 (cpu, vec_reg (vd, i + k), j)); + address += 2; + } return; case 2: /* 32-bit operations. */ - if (all) - for (i = 0; i < (4 * N); i++) - aarch64_set_mem_u32 - (cpu, address + i * 4, - aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3)); - else - for (i = 0; i < (2 * N); i++) - aarch64_set_mem_u32 - (cpu, address + i * 4, - aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (2 + (2 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_mem_u32 + (cpu, address, + aarch64_get_vec_u32 (cpu, vec_reg (vd, i + k), j)); + address += 4; + } return; case 3: /* 64-bit operations. 
*/ - if (all) - for (i = 0; i < (2 * N); i++) - aarch64_set_mem_u64 - (cpu, address + i * 8, - aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1)); - else - for (i = 0; i < N; i++) - aarch64_set_mem_u64 - (cpu, address + i * 8, - aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (1 + all); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_mem_u64 + (cpu, address, + aarch64_get_vec_u64 (cpu, vec_reg (vd, i + k), j)); + address += 8; + } return; } } -/* Store multiple 4-element structure to four consecutive registers. */ +/* Store multiple 4-element structure from four consecutive registers. */ static void ST4 (sim_cpu *cpu, uint64_t address) { - vec_store (cpu, address, 4); + vec_store (cpu, address, 4, 4); } -/* Store multiple 3-element structures to three consecutive registers. */ +/* Store multiple 3-element structures from three consecutive registers. */ static void ST3 (sim_cpu *cpu, uint64_t address) { - vec_store (cpu, address, 3); + vec_store (cpu, address, 3, 3); } -/* Store multiple 2-element structures to two consecutive registers. */ +/* Store multiple 2-element structures from two consecutive registers. */ static void ST2 (sim_cpu *cpu, uint64_t address) { - vec_store (cpu, address, 2); + vec_store (cpu, address, 2, 2); } -/* Store multiple 1-element structures into one register. */ +/* Store multiple 1-element structures from one register. */ static void ST1_1 (sim_cpu *cpu, uint64_t address) { - int all = INSTR (30, 30); - unsigned size = INSTR (11, 10); - unsigned vd = INSTR (4, 0); - unsigned i; - - switch (size) - { - case 0: - for (i = 0; i < (all ? 16 : 8); i++) - aarch64_set_mem_u8 (cpu, address + i, - aarch64_get_vec_u8 (cpu, vd, i)); - return; - - case 1: - for (i = 0; i < (all ? 8 : 4); i++) - aarch64_set_mem_u16 (cpu, address + i * 2, - aarch64_get_vec_u16 (cpu, vd, i)); - return; - - case 2: - for (i = 0; i < (all ? 
4 : 2); i++) - aarch64_set_mem_u32 (cpu, address + i * 4, - aarch64_get_vec_u32 (cpu, vd, i)); - return; - - case 3: - for (i = 0; i < (all ? 2 : 1); i++) - aarch64_set_mem_u64 (cpu, address + i * 8, - aarch64_get_vec_u64 (cpu, vd, i)); - return; - } + vec_store (cpu, address, 1, 1); } -/* Store multiple 1-element structures into two registers. */ +/* Store multiple 1-element structures from two registers. */ static void ST1_2 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the ST2 version. - So why have two different instructions ? There must be - something wrong somewhere. */ - vec_store (cpu, address, 2); + vec_store (cpu, address, 1, 2); } -/* Store multiple 1-element structures into three registers. */ +/* Store multiple 1-element structures from three registers. */ static void ST1_3 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the ST3 version. - So why have two different instructions ? There must be - something wrong somewhere. */ - vec_store (cpu, address, 3); + vec_store (cpu, address, 1, 3); } -/* Store multiple 1-element structures into four registers. */ +/* Store multiple 1-element structures from four registers. */ static void ST1_4 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the ST4 version. - So why have two different instructions ? There must be - something wrong somewhere. */ - vec_store (cpu, address, 4); + vec_store (cpu, address, 1, 4); } #define LDn_STn_SINGLE_LANE_AND_SIZE() \ diff --git a/sim/testsuite/sim/aarch64/fcvtz.s b/sim/testsuite/sim/aarch64/fcvtz.s index 9bb6f9b..311fc2e 100644 --- a/sim/testsuite/sim/aarch64/fcvtz.s +++ b/sim/testsuite/sim/aarch64/fcvtz.s @@ -8,6 +8,7 @@ # For 64-bit unsigned convert, test values 1.5, LONG_MAX, and ULONG_MAX. 
.data + .align 4 fm1p5: .word 3217031168 fimax: diff --git a/sim/testsuite/sim/aarch64/fstur.s b/sim/testsuite/sim/aarch64/fstur.s index 2206ae5..80e5c67 100644 --- a/sim/testsuite/sim/aarch64/fstur.s +++ b/sim/testsuite/sim/aarch64/fstur.s @@ -8,6 +8,7 @@ .include "testutils.inc" .data + .align 4 fm1: .word 3212836864 fmax: diff --git a/sim/testsuite/sim/aarch64/ldn_multiple.s b/sim/testsuite/sim/aarch64/ldn_multiple.s new file mode 100644 index 0000000..285ef7e --- /dev/null +++ b/sim/testsuite/sim/aarch64/ldn_multiple.s @@ -0,0 +1,136 @@ +# mach: aarch64 + +# Check the load multiple structure instructions: ld1, ld2, ld3, ld4. +# Check the addressing modes: no offset, post-index immediate offset, +# post-index register offset. + +.include "testutils.inc" + + .data + .align 4 +input: + .word 0x04030201 + .word 0x08070605 + .word 0x0c0b0a09 + .word 0x100f0e0d + .word 0xfcfdfeff + .word 0xf8f9fafb + .word 0xf4f5f6f7 + .word 0xf0f1f2f3 + + start + adrp x0, input + add x0, x0, :lo12:input + + mov x2, x0 + mov x3, #16 + ld1 {v0.16b}, [x2], 16 + ld1 {v1.8h}, [x2], x3 + addv b4, v0.16b + addv b5, v1.16b + mov x4, v4.d[0] + cmp x4, #136 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #120 + bne .Lfailure + + mov x2, x0 + mov x3, #16 + ld2 {v0.8b, v1.8b}, [x2], x3 + ld2 {v2.4h, v3.4h}, [x2], 16 + addv b4, v0.8b + addv b5, v1.8b + addv b6, v2.8b + addv b7, v3.8b + mov x4, v4.d[0] + cmp x4, #64 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #72 + bne .Lfailure + mov x6, v6.d[0] + cmp x6, #196 + bne .Lfailure + mov x7, v7.d[0] + cmp x7, #180 + bne .Lfailure + + mov x2, x0 + ld3 {v0.2s, v1.2s, v2.2s}, [x2] + addv b4, v0.8b + addv b5, v1.8b + addv b6, v2.8b + mov x4, v4.d[0] + cmp x4, #68 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #16 + bne .Lfailure + mov x6, v6.d[0] + cmp x6, #16 + bne .Lfailure + + mov x2, x0 + ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x2] + addv b4, v0.8b + addv b5, v1.8b + addv b6, v2.8b + addv b7, v3.8b + mov x4, v4.d[0] + cmp x4, #0 + bne .Lfailure + mov x5, v5.d[0] 
+ cmp x5, #0 + bne .Lfailure + mov x6, v6.d[0] + cmp x6, #0 + bne .Lfailure + mov x7, v7.d[0] + cmp x7, #0 + bne .Lfailure + + mov x2, x0 + ld1 {v0.4s, v1.4s}, [x2] + addv b4, v0.16b + addv b5, v1.16b + mov x4, v4.d[0] + cmp x4, #136 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #120 + bne .Lfailure + + mov x2, x0 + ld1 {v0.1d, v1.1d, v2.1d}, [x2] + addv b4, v0.8b + addv b5, v1.8b + addv b6, v2.8b + mov x4, v4.d[0] + cmp x4, #36 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #100 + bne .Lfailure + mov x6, v6.d[0] + cmp x6, #220 + bne .Lfailure + + mov x2, x0 + ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x2] + addv b4, v0.8b + addv b5, v1.8b + addv b6, v2.8b + mov x4, v4.d[0] + cmp x4, #36 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #100 + bne .Lfailure + mov x6, v6.d[0] + cmp x6, #220 + bne .Lfailure + + pass +.Lfailure: + fail diff --git a/sim/testsuite/sim/aarch64/ldn_single.s b/sim/testsuite/sim/aarch64/ldn_single.s index 4c460fb..9681520 100644 --- a/sim/testsuite/sim/aarch64/ldn_single.s +++ b/sim/testsuite/sim/aarch64/ldn_single.s @@ -7,6 +7,8 @@ .include "testutils.inc" + .data + .align 4 input: .word 0x04030201 .word 0x08070605 diff --git a/sim/testsuite/sim/aarch64/ldnr.s b/sim/testsuite/sim/aarch64/ldnr.s index a4bfffa..7126c46 100644 --- a/sim/testsuite/sim/aarch64/ldnr.s +++ b/sim/testsuite/sim/aarch64/ldnr.s @@ -7,6 +7,8 @@ .include "testutils.inc" + .data + .align 4 input: .word 0x04030201 .word 0x08070605 diff --git a/sim/testsuite/sim/aarch64/mla.s b/sim/testsuite/sim/aarch64/mla.s index e0065e7..e3ea836 100644 --- a/sim/testsuite/sim/aarch64/mla.s +++ b/sim/testsuite/sim/aarch64/mla.s @@ -4,6 +4,8 @@ .include "testutils.inc" + .data + .align 4 input: .word 0x04030201 .word 0x08070605 diff --git a/sim/testsuite/sim/aarch64/mls.s b/sim/testsuite/sim/aarch64/mls.s index a34a1aa..5c9e225 100644 --- a/sim/testsuite/sim/aarch64/mls.s +++ b/sim/testsuite/sim/aarch64/mls.s @@ -4,6 +4,8 @@ .include "testutils.inc" + .data + .align 4 input: .word 0x04030201 .word 
0x08070605 diff --git a/sim/testsuite/sim/aarch64/stn_multiple.s b/sim/testsuite/sim/aarch64/stn_multiple.s new file mode 100644 index 0000000..1a3f24d --- /dev/null +++ b/sim/testsuite/sim/aarch64/stn_multiple.s @@ -0,0 +1,171 @@ +# mach: aarch64 + +# Check the store multiple structure instructions: st1, st2, st3, st4. +# Check the addressing modes: no offset, post-index immediate offset, +# post-index register offset. + +.include "testutils.inc" + + .data + .align 4 +input: + .word 0x04030201 + .word 0x08070605 + .word 0x0c0b0a09 + .word 0x100f0e0d + .word 0xfcfdfeff + .word 0xf8f9fafb + .word 0xf4f5f6f7 + .word 0xf0f1f2f3 +output: + .zero 64 + + start + adrp x0, input + add x0, x0, :lo12:input + adrp x1, output + add x1, x1, :lo12:output + + mov x2, x0 + ldr q0, [x2], 16 + ldr q1, [x2] + mov x2, x0 + ldr q2, [x2], 16 + ldr q3, [x2] + + mov x2, x1 + mov x3, #16 + st1 {v0.16b}, [x2], 16 + st1 {v1.8h}, [x2], x3 + mov x2, x1 + ldr q4, [x2], 16 + ldr q5, [x2] + addv b4, v4.16b + addv b5, v5.16b + mov x4, v4.d[0] + cmp x4, #136 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #120 + bne .Lfailure + + mov x2, x1 + mov x3, #16 + st2 {v0.8b, v1.8b}, [x2], 16 + st2 {v2.4h, v3.4h}, [x2], x3 + mov x2, x1 + ldr q4, [x2], 16 + ldr q5, [x2] + addv b4, v4.16b + addv b5, v5.16b + mov x4, v4.d[0] + cmp x4, #0 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #0 + bne .Lfailure + + mov x2, x1 + st3 {v0.4s, v1.4s, v2.4s}, [x2] + ldr q4, [x2], 16 + ldr q5, [x2], 16 + ldr q6, [x2] + addv b4, v4.16b + addv b5, v5.16b + addv b6, v6.16b + mov x4, v4.d[0] + cmp x4, #36 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #0 + bne .Lfailure + mov x6, v6.d[0] + cmp x6, #100 + bne .Lfailure + + mov x2, x1 + st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2] + ldr q4, [x2], 16 + ldr q5, [x2], 16 + ldr q6, [x2], 16 + ldr q7, [x2] + addv b4, v4.16b + addv b5, v5.16b + addv b6, v6.16b + addv b7, v7.16b + mov x4, v4.d[0] + cmp x4, #0 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #0 + bne .Lfailure + mov x6, v6.d[0] + cmp x6, #0 
+ bne .Lfailure + mov x7, v7.d[0] + cmp x7, #0 + bne .Lfailure + + pass + + mov x2, x1 + st1 {v0.2s, v1.2s}, [x2], 16 + st1 {v2.1d, v3.1d}, [x2] + mov x2, x1 + ldr q4, [x2], 16 + ldr q5, [x2] + addv b4, v4.16b + addv b5, v5.16b + mov x4, v4.d[0] + cmp x4, #0 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #0 + bne .Lfailure + + mov x2, x1 + st1 {v0.2d, v1.2d, v2.2d}, [x2] + mov x2, x1 + ldr q4, [x2], 16 + ldr q5, [x2], 16 + ldr q6, [x2] + addv b4, v4.16b + addv b5, v5.16b + addv b6, v6.16b + mov x4, v4.d[0] + cmp x4, #136 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #120 + bne .Lfailure + mov x6, v6.d[0] + cmp x6, #136 + bne .Lfailure + + mov x2, x1 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2] + mov x2, x1 + ldr q4, [x2], 16 + ldr q5, [x2], 16 + ldr q6, [x2], 16 + ldr q7, [x2] + addv b4, v4.16b + addv b5, v5.16b + addv b6, v6.16b + addv b7, v7.16b + mov x4, v4.d[0] + cmp x4, #136 + bne .Lfailure + mov x5, v5.d[0] + cmp x5, #120 + bne .Lfailure + mov x6, v6.d[0] + cmp x6, #136 + bne .Lfailure + mov x7, v7.d[0] + cmp x7, #120 + bne .Lfailure + + pass +.Lfailure: + fail diff --git a/sim/testsuite/sim/aarch64/stn_single.s b/sim/testsuite/sim/aarch64/stn_single.s index 2bd19cf..a24b084 100644 --- a/sim/testsuite/sim/aarch64/stn_single.s +++ b/sim/testsuite/sim/aarch64/stn_single.s @@ -7,6 +7,8 @@ .include "testutils.inc" + .data + .align 4 input: .word 0x04030201 .word 0x08070605 @@ -26,10 +28,10 @@ output: add x1, x1, :lo12:output mov x2, x0 - ldr q0, [x2], 8 + ldr q0, [x2], 16 ldr q1, [x2] mov x2, x0 - ldr q2, [x2], 8 + ldr q2, [x2], 16 ldr q3, [x2] mov x2, x1 @@ -61,9 +63,9 @@ output: addv b5, v5.16b mov x5, v4.d[0] mov x6, v5.d[0] - cmp x5, #136 + cmp x5, #200 bne .Lfailure - cmp x6, #8 + cmp x6, #72 bne .Lfailure mov x2, x1 @@ -82,11 +84,11 @@ output: mov x4, v4.d[0] mov x5, v5.d[0] mov x6, v6.d[0] - cmp x4, #88 + cmp x4, #120 bne .Lfailure - cmp x5, #200 + cmp x5, #8 bne .Lfailure - cmp x6, #248 + cmp x6, #24 bne .Lfailure mov x2, x1 @@ -108,13 +110,13 @@ output: mov x5, 
v5.d[0] mov x6, v6.d[0] mov x7, v7.d[0] - cmp x4, #104 + cmp x4, #168 bne .Lfailure - cmp x5, #168 + cmp x5, #232 bne .Lfailure - cmp x6, #232 + cmp x6, #40 bne .Lfailure - cmp x7, #40 + cmp x7, #104 bne .Lfailure pass diff --git a/sim/testsuite/sim/aarch64/sumulh.s b/sim/testsuite/sim/aarch64/sumulh.s index 17f1ecd..d75e0c6 100644 --- a/sim/testsuite/sim/aarch64/sumulh.s +++ b/sim/testsuite/sim/aarch64/sumulh.s @@ -6,9 +6,6 @@ .include "testutils.inc" - .data - .align 4 - start mov x0, #-2 diff --git a/sim/testsuite/sim/aarch64/uzp.s b/sim/testsuite/sim/aarch64/uzp.s index 55e2cd7..851005e 100644 --- a/sim/testsuite/sim/aarch64/uzp.s +++ b/sim/testsuite/sim/aarch64/uzp.s @@ -4,6 +4,8 @@ .include "testutils.inc" + .data + .align 4 input1: .word 0x04030201 .word 0x08070605

aarch64 sim load/store multiple instruction fixes

Commit Message

Patch