aarch64 sim load/store multiple instruction fixes

Message ID CABXYE2W6U6LL4rbM8Hc6oq2eqHsgVJM00NbqE+RC_N0JNHZQqA@mail.gmail.com
State New
Headers show

Commit Message

Jim Wilson April 22, 2017, 11:55 p.m.
This is my second big load/store patch, and hopefully the last one I
need.  This makes the ldn/stn multiple instructions work correctly.
ld1 with 2 registers loads data into the first reg then the second
reg.  ld2 loads data into one lane at a time in the register pair,
alternating between the regs until the lanes are filled.  The
simulator got this wrong, and this patch fixes it, which requires a
large change to the vector load and store code.

The new testcases fail without the patch, and work with the patch.
The GCC C testsuite unexpected failures drop from 1427 to 1295 (-132).

While writing the new testcases, I noticed I had been sloppy about
aligning data in the testcases, and tried to clean that up a bit.  I
may need some further fixes here, but at least every testcase with
data has an alignment directive now.  Also, I noticed bad ldr
instructions in the stn_single.s testcase, where I was loading 128-bit
values and then adding 8 to the address instead of 16.  This fix
changes the input data, and hence requires adjusting the compare
values to make the testcase work again.  There is also one testcase
where I was aligning data but had no data to align, so I removed the
alignment.

Jim

Patch hide | download patch | download mbox

2017-04-22  Jim Wilson  <jim.wilson@linaro.org>

	sim/aarch64/
	* simulator.c (vec_load): Add M argument.  Rewrite to iterate over
	registers based on structure size.
	(LD4, LD3, LD2, LD1_2, LD1_3, LD1_4): Pass new arg to vec_load.
	(LD1_1): Replace with call to vec_load.
	(vec_store): Add new M argument.  Rewrite to iterate over registers
	based on structure size.
	(ST4, ST3, ST2, ST1_2, ST1_3, ST1_4): Pass new arg to vec_store.
	(ST1_1): Replace with call to vec_store.

	sim/testsuite/sim/aarch64/
	* fcvtz.s, fstur.s, ldn_single.s, ldnr.s, mla.s, mls.s, uzp.s: Align
	data.
	* sumulh.s: Delete unnecessary data alignment.
	* stn_single.s: Align data.  Fix unaligned ldr insns.  Adjust cmp
	arguments to match change.
	* ldn_multiple.s, stn_multiple.s: New.

diff --git a/sim/aarch64/simulator.c b/sim/aarch64/simulator.c
index 16d8d8d..18f7944 100644
--- a/sim/aarch64/simulator.c
+++ b/sim/aarch64/simulator.c
@@ -11524,310 +11524,224 @@  vec_reg (unsigned v, unsigned o)
   return (v + o) & 0x3F;
 }
 
-/* Load multiple N-element structures to N consecutive registers.  */
+/* Load multiple N-element structures to M consecutive registers.  */
 static void
-vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
+vec_load (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
 {
   int      all  = INSTR (30, 30);
   unsigned size = INSTR (11, 10);
   unsigned vd   = INSTR (4, 0);
-  unsigned i;
+  unsigned rpt = (N == M) ? 1 : M;
+  unsigned selem = N;
+  unsigned i, j, k;
 
   switch (size)
     {
     case 0: /* 8-bit operations.  */
-      if (all)
-	for (i = 0; i < (16 * N); i++)
-	  aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
-			      aarch64_get_mem_u8 (cpu, address + i));
-      else
-	for (i = 0; i < (8 * N); i++)
-	  aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
-			      aarch64_get_mem_u8 (cpu, address + i));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (8 + (8 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_vec_u8 (cpu, vec_reg (vd, i + k), j,
+				  aarch64_get_mem_u8 (cpu, address));
+	      address += 1;
+	    }
       return;
 
     case 1: /* 16-bit operations.  */
-      if (all)
-	for (i = 0; i < (8 * N); i++)
-	  aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
-			       aarch64_get_mem_u16 (cpu, address + i * 2));
-      else
-	for (i = 0; i < (4 * N); i++)
-	  aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
-			       aarch64_get_mem_u16 (cpu, address + i * 2));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (4 + (4 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_vec_u16 (cpu, vec_reg (vd, i + k), j,
+				   aarch64_get_mem_u16 (cpu, address));
+	      address += 2;
+	    }
       return;
 
     case 2: /* 32-bit operations.  */
-      if (all)
-	for (i = 0; i < (4 * N); i++)
-	  aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
-			       aarch64_get_mem_u32 (cpu, address + i * 4));
-      else
-	for (i = 0; i < (2 * N); i++)
-	  aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
-			       aarch64_get_mem_u32 (cpu, address + i * 4));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (2 + (2 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_vec_u32 (cpu, vec_reg (vd, i + k), j,
+				   aarch64_get_mem_u32 (cpu, address));
+	      address += 4;
+	    }
       return;
 
     case 3: /* 64-bit operations.  */
-      if (all)
-	for (i = 0; i < (2 * N); i++)
-	  aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
-			       aarch64_get_mem_u64 (cpu, address + i * 8));
-      else
-	for (i = 0; i < N; i++)
-	  aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
-			       aarch64_get_mem_u64 (cpu, address + i * 8));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (1 + all); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_vec_u64 (cpu, vec_reg (vd, i + k), j,
+				   aarch64_get_mem_u64 (cpu, address));
+	      address += 8;
+	    }
       return;
     }
 }
 
-/* LD4: load multiple 4-element to four consecutive registers.  */
+/* Load multiple 4-element structures into four consecutive registers.  */
 static void
 LD4 (sim_cpu *cpu, uint64_t address)
 {
-  vec_load (cpu, address, 4);
+  vec_load (cpu, address, 4, 4);
 }
 
-/* LD3: load multiple 3-element structures to three consecutive registers.  */
+/* Load multiple 3-element structures into three consecutive registers.  */
 static void
 LD3 (sim_cpu *cpu, uint64_t address)
 {
-  vec_load (cpu, address, 3);
+  vec_load (cpu, address, 3, 3);
 }
 
-/* LD2: load multiple 2-element structures to two consecutive registers.  */
+/* Load multiple 2-element structures into two consecutive registers.  */
 static void
 LD2 (sim_cpu *cpu, uint64_t address)
 {
-  vec_load (cpu, address, 2);
+  vec_load (cpu, address, 2, 2);
 }
 
 /* Load multiple 1-element structures into one register.  */
 static void
 LD1_1 (sim_cpu *cpu, uint64_t address)
 {
-  int      all  = INSTR (30, 30);
-  unsigned size = INSTR (11, 10);
-  unsigned vd   = INSTR (4, 0);
-  unsigned i;
-
-  switch (size)
-    {
-    case 0:
-      /* LD1 {Vd.16b}, addr, #16 */
-      /* LD1 {Vd.8b}, addr, #8 */
-      for (i = 0; i < (all ? 16 : 8); i++)
-	aarch64_set_vec_u8 (cpu, vd, i,
-			    aarch64_get_mem_u8 (cpu, address + i));
-      return;
-
-    case 1:
-      /* LD1 {Vd.8h}, addr, #16 */
-      /* LD1 {Vd.4h}, addr, #8 */
-      for (i = 0; i < (all ? 8 : 4); i++)
-	aarch64_set_vec_u16 (cpu, vd, i,
-			     aarch64_get_mem_u16 (cpu, address + i * 2));
-      return;
-
-    case 2:
-      /* LD1 {Vd.4s}, addr, #16 */
-      /* LD1 {Vd.2s}, addr, #8 */
-      for (i = 0; i < (all ? 4 : 2); i++)
-	aarch64_set_vec_u32 (cpu, vd, i,
-			     aarch64_get_mem_u32 (cpu, address + i * 4));
-      return;
-
-    case 3:
-      /* LD1 {Vd.2d}, addr, #16 */
-      /* LD1 {Vd.1d}, addr, #8 */
-      for (i = 0; i < (all ? 2 : 1); i++)
-	aarch64_set_vec_u64 (cpu, vd, i,
-			     aarch64_get_mem_u64 (cpu, address + i * 8));
-      return;
-    }
+  vec_load (cpu, address, 1, 1);
 }
 
 /* Load multiple 1-element structures into two registers.  */
 static void
 LD1_2 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the LD2 version.
-     So why have two different instructions ?  There must be something
-     wrong somewhere.  */
-  vec_load (cpu, address, 2);
+  vec_load (cpu, address, 1, 2);
 }
 
 /* Load multiple 1-element structures into three registers.  */
 static void
 LD1_3 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the LD3 version.
-     So why have two different instructions ?  There must be something
-     wrong somewhere.  */
-  vec_load (cpu, address, 3);
+  vec_load (cpu, address, 1, 3);
 }
 
 /* Load multiple 1-element structures into four registers.  */
 static void
 LD1_4 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the LD4 version.
-     So why have two different instructions ?  There must be something
-     wrong somewhere.  */
-  vec_load (cpu, address, 4);
+  vec_load (cpu, address, 1, 4);
 }
 
-/* Store multiple N-element structures to N consecutive registers.  */
+/* Store multiple N-element structures from M consecutive registers.  */
 static void
-vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
+vec_store (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
 {
   int      all  = INSTR (30, 30);
   unsigned size = INSTR (11, 10);
   unsigned vd   = INSTR (4, 0);
-  unsigned i;
+  unsigned rpt = (N == M) ? 1 : M;
+  unsigned selem = N;
+  unsigned i, j, k;
 
   switch (size)
     {
     case 0: /* 8-bit operations.  */
-      if (all)
-	for (i = 0; i < (16 * N); i++)
-	  aarch64_set_mem_u8
-	    (cpu, address + i,
-	     aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
-      else
-	for (i = 0; i < (8 * N); i++)
-	  aarch64_set_mem_u8
-	    (cpu, address + i,
-	     aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (8 + (8 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_mem_u8
+		(cpu, address,
+		 aarch64_get_vec_u8 (cpu, vec_reg (vd, i + k), j));
+	      address += 1;
+	    }
       return;
 
     case 1: /* 16-bit operations.  */
-      if (all)
-	for (i = 0; i < (8 * N); i++)
-	  aarch64_set_mem_u16
-	    (cpu, address + i * 2,
-	     aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
-      else
-	for (i = 0; i < (4 * N); i++)
-	  aarch64_set_mem_u16
-	    (cpu, address + i * 2,
-	     aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (4 + (4 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_mem_u16
+		(cpu, address,
+		 aarch64_get_vec_u16 (cpu, vec_reg (vd, i + k), j));
+	      address += 2;
+	    }
       return;
 
     case 2: /* 32-bit operations.  */
-      if (all)
-	for (i = 0; i < (4 * N); i++)
-	  aarch64_set_mem_u32
-	    (cpu, address + i * 4,
-	     aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
-      else
-	for (i = 0; i < (2 * N); i++)
-	  aarch64_set_mem_u32
-	    (cpu, address + i * 4,
-	     aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (2 + (2 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_mem_u32
+		(cpu, address,
+		 aarch64_get_vec_u32 (cpu, vec_reg (vd, i + k), j));
+	      address += 4;
+	    }
       return;
 
     case 3: /* 64-bit operations.  */
-      if (all)
-	for (i = 0; i < (2 * N); i++)
-	  aarch64_set_mem_u64
-	    (cpu, address + i * 8,
-	     aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
-      else
-	for (i = 0; i < N; i++)
-	  aarch64_set_mem_u64
-	    (cpu, address + i * 8,
-	     aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (1 + all); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_mem_u64
+		(cpu, address,
+		 aarch64_get_vec_u64 (cpu, vec_reg (vd, i + k), j));
+	      address += 8;
+	    }
       return;
     }
 }
 
-/* Store multiple 4-element structure to four consecutive registers.  */
+/* Store multiple 4-element structure from four consecutive registers.  */
 static void
 ST4 (sim_cpu *cpu, uint64_t address)
 {
-  vec_store (cpu, address, 4);
+  vec_store (cpu, address, 4, 4);
 }
 
-/* Store multiple 3-element structures to three consecutive registers.  */
+/* Store multiple 3-element structures from three consecutive registers.  */
 static void
 ST3 (sim_cpu *cpu, uint64_t address)
 {
-  vec_store (cpu, address, 3);
+  vec_store (cpu, address, 3, 3);
 }
 
-/* Store multiple 2-element structures to two consecutive registers.  */
+/* Store multiple 2-element structures from two consecutive registers.  */
 static void
 ST2 (sim_cpu *cpu, uint64_t address)
 {
-  vec_store (cpu, address, 2);
+  vec_store (cpu, address, 2, 2);
 }
 
-/* Store multiple 1-element structures into one register.  */
+/* Store multiple 1-element structures from one register.  */
 static void
 ST1_1 (sim_cpu *cpu, uint64_t address)
 {
-  int      all  = INSTR (30, 30);
-  unsigned size = INSTR (11, 10);
-  unsigned vd   = INSTR (4, 0);
-  unsigned i;
-
-  switch (size)
-    {
-    case 0:
-      for (i = 0; i < (all ? 16 : 8); i++)
-	aarch64_set_mem_u8 (cpu, address + i,
-			    aarch64_get_vec_u8 (cpu, vd, i));
-      return;
-
-    case 1:
-      for (i = 0; i < (all ? 8 : 4); i++)
-	aarch64_set_mem_u16 (cpu, address + i * 2,
-			     aarch64_get_vec_u16 (cpu, vd, i));
-      return;
-
-    case 2:
-      for (i = 0; i < (all ? 4 : 2); i++)
-	aarch64_set_mem_u32 (cpu, address + i * 4,
-			     aarch64_get_vec_u32 (cpu, vd, i));
-      return;
-
-    case 3:
-      for (i = 0; i < (all ? 2 : 1); i++)
-	aarch64_set_mem_u64 (cpu, address + i * 8,
-			     aarch64_get_vec_u64 (cpu, vd, i));
-      return;
-    }
+  vec_store (cpu, address, 1, 1);
 }
 
-/* Store multiple 1-element structures into two registers.  */
+/* Store multiple 1-element structures from two registers.  */
 static void
 ST1_2 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the ST2 version.
-     So why have two different instructions ?  There must be
-     something wrong somewhere.  */
-  vec_store (cpu, address, 2);
+  vec_store (cpu, address, 1, 2);
 }
 
-/* Store multiple 1-element structures into three registers.  */
+/* Store multiple 1-element structures from three registers.  */
 static void
 ST1_3 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the ST3 version.
-     So why have two different instructions ?  There must be
-     something wrong somewhere.  */
-  vec_store (cpu, address, 3);
+  vec_store (cpu, address, 1, 3);
 }
 
-/* Store multiple 1-element structures into four registers.  */
+/* Store multiple 1-element structures from four registers.  */
 static void
 ST1_4 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the ST4 version.
-     So why have two different instructions ?  There must be
-     something wrong somewhere.  */
-  vec_store (cpu, address, 4);
+  vec_store (cpu, address, 1, 4);
 }
 
 #define LDn_STn_SINGLE_LANE_AND_SIZE()				\
diff --git a/sim/testsuite/sim/aarch64/fcvtz.s b/sim/testsuite/sim/aarch64/fcvtz.s
index 9bb6f9b..311fc2e 100644
--- a/sim/testsuite/sim/aarch64/fcvtz.s
+++ b/sim/testsuite/sim/aarch64/fcvtz.s
@@ -8,6 +8,7 @@ 
 # For 64-bit unsigned convert, test values 1.5, LONG_MAX, and ULONG_MAX.
 
 	.data
+	.align 4
 fm1p5:
 	.word	3217031168
 fimax:
diff --git a/sim/testsuite/sim/aarch64/fstur.s b/sim/testsuite/sim/aarch64/fstur.s
index 2206ae5..80e5c67 100644
--- a/sim/testsuite/sim/aarch64/fstur.s
+++ b/sim/testsuite/sim/aarch64/fstur.s
@@ -8,6 +8,7 @@ 
 .include "testutils.inc"
 
 	.data
+	.align 4
 fm1:
 	.word 3212836864
 fmax:
diff --git a/sim/testsuite/sim/aarch64/ldn_multiple.s b/sim/testsuite/sim/aarch64/ldn_multiple.s
new file mode 100644
index 0000000..285ef7e
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/ldn_multiple.s
@@ -0,0 +1,136 @@ 
+# mach: aarch64
+
+# Check the load multiple structure instructions: ld1, ld2, ld3, ld4.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+	.data
+	.align 4
+input:
+	.word 0x04030201
+	.word 0x08070605
+	.word 0x0c0b0a09
+	.word 0x100f0e0d
+	.word 0xfcfdfeff
+	.word 0xf8f9fafb
+	.word 0xf4f5f6f7
+	.word 0xf0f1f2f3
+
+	start
+	adrp x0, input
+	add x0, x0, :lo12:input
+
+	mov x2, x0
+	mov x3, #16
+	ld1 {v0.16b}, [x2], 16
+	ld1 {v1.8h}, [x2], x3
+	addv b4, v0.16b
+	addv b5, v1.16b
+	mov x4, v4.d[0]
+	cmp x4, #136
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #120
+	bne .Lfailure
+
+	mov x2, x0
+	mov x3, #16
+	ld2 {v0.8b, v1.8b}, [x2], x3
+	ld2 {v2.4h, v3.4h}, [x2], 16
+	addv b4, v0.8b
+	addv b5, v1.8b
+	addv b6, v2.8b
+	addv b7, v3.8b
+	mov x4, v4.d[0]
+	cmp x4, #64
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #72
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #196
+	bne .Lfailure
+	mov x7, v7.d[0]
+	cmp x7, #180
+	bne .Lfailure
+
+	mov x2, x0
+	ld3 {v0.2s, v1.2s, v2.2s}, [x2]
+	addv b4, v0.8b
+	addv b5, v1.8b
+	addv b6, v2.8b
+	mov x4, v4.d[0]
+	cmp x4, #68
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #16
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #16
+	bne .Lfailure
+
+	mov x2, x0
+	ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x2]
+	addv b4, v0.8b
+	addv b5, v1.8b
+	addv b6, v2.8b
+	addv b7, v3.8b
+	mov x4, v4.d[0]
+	cmp x4, #0
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #0
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #0
+	bne .Lfailure
+	mov x7, v7.d[0]
+	cmp x7, #0
+	bne .Lfailure
+
+	mov x2, x0
+	ld1 {v0.4s, v1.4s}, [x2]
+	addv b4, v0.16b
+	addv b5, v1.16b
+	mov x4, v4.d[0]
+	cmp x4, #136
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #120
+	bne .Lfailure
+
+	mov x2, x0
+	ld1 {v0.1d, v1.1d, v2.1d}, [x2]
+	addv b4, v0.8b
+	addv b5, v1.8b
+	addv b6, v2.8b
+	mov x4, v4.d[0]
+	cmp x4, #36
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #100
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #220
+	bne .Lfailure
+
+	mov x2, x0
+	ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x2]
+	addv b4, v0.8b
+	addv b5, v1.8b
+	addv b6, v2.8b
+	mov x4, v4.d[0]
+	cmp x4, #36
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #100
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #220
+	bne .Lfailure
+
+	pass
+.Lfailure:
+	fail
diff --git a/sim/testsuite/sim/aarch64/ldn_single.s b/sim/testsuite/sim/aarch64/ldn_single.s
index 4c460fb..9681520 100644
--- a/sim/testsuite/sim/aarch64/ldn_single.s
+++ b/sim/testsuite/sim/aarch64/ldn_single.s
@@ -7,6 +7,8 @@ 
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input:
 	.word 0x04030201
 	.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/ldnr.s b/sim/testsuite/sim/aarch64/ldnr.s
index a4bfffa..7126c46 100644
--- a/sim/testsuite/sim/aarch64/ldnr.s
+++ b/sim/testsuite/sim/aarch64/ldnr.s
@@ -7,6 +7,8 @@ 
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input:
 	.word 0x04030201
 	.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/mla.s b/sim/testsuite/sim/aarch64/mla.s
index e0065e7..e3ea836 100644
--- a/sim/testsuite/sim/aarch64/mla.s
+++ b/sim/testsuite/sim/aarch64/mla.s
@@ -4,6 +4,8 @@ 
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input:
 	.word 0x04030201
 	.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/mls.s b/sim/testsuite/sim/aarch64/mls.s
index a34a1aa..5c9e225 100644
--- a/sim/testsuite/sim/aarch64/mls.s
+++ b/sim/testsuite/sim/aarch64/mls.s
@@ -4,6 +4,8 @@ 
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input:
 	.word 0x04030201
 	.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/stn_multiple.s b/sim/testsuite/sim/aarch64/stn_multiple.s
new file mode 100644
index 0000000..1a3f24d
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/stn_multiple.s
@@ -0,0 +1,171 @@ 
+# mach: aarch64
+
+# Check the store multiple structure instructions: st1, st2, st3, st4.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+	.data
+	.align 4
+input:
+	.word 0x04030201
+	.word 0x08070605
+	.word 0x0c0b0a09
+	.word 0x100f0e0d
+	.word 0xfcfdfeff
+	.word 0xf8f9fafb
+	.word 0xf4f5f6f7
+	.word 0xf0f1f2f3
+output:
+	.zero 64
+
+	start
+	adrp x0, input
+	add x0, x0, :lo12:input
+	adrp x1, output
+	add x1, x1, :lo12:output
+
+	mov x2, x0
+	ldr q0, [x2], 16
+	ldr q1, [x2]
+	mov x2, x0
+	ldr q2, [x2], 16
+	ldr q3, [x2]
+
+	mov x2, x1
+	mov x3, #16
+	st1 {v0.16b}, [x2], 16
+	st1 {v1.8h}, [x2], x3
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	mov x4, v4.d[0]
+	cmp x4, #136
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #120
+	bne .Lfailure
+
+	mov x2, x1
+	mov x3, #16
+	st2 {v0.8b, v1.8b}, [x2], 16
+	st2 {v2.4h, v3.4h}, [x2], x3
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	mov x4, v4.d[0]
+	cmp x4, #0
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #0
+	bne .Lfailure
+
+	mov x2, x1
+	st3 {v0.4s, v1.4s, v2.4s}, [x2]
+	ldr q4, [x2], 16
+	ldr q5, [x2], 16
+	ldr q6, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	addv b6, v6.16b
+	mov x4, v4.d[0]
+	cmp x4, #36
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #0
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #100
+	bne .Lfailure
+
+	mov x2, x1
+	st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2]
+	ldr q4, [x2], 16
+	ldr q5, [x2], 16
+	ldr q6, [x2], 16
+	ldr q7, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	addv b6, v6.16b
+	addv b7, v7.16b
+	mov x4, v4.d[0]
+	cmp x4, #0
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #0
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #0
+	bne .Lfailure
+	mov x7, v7.d[0]
+	cmp x7, #0
+	bne .Lfailure
+
+	pass
+
+	mov x2, x1
+	st1 {v0.2s, v1.2s}, [x2], 16
+	st1 {v2.1d, v3.1d}, [x2]
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	mov x4, v4.d[0]
+	cmp x4, #0
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #0
+	bne .Lfailure
+
+	mov x2, x1
+	st1 {v0.2d, v1.2d, v2.2d}, [x2]
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2], 16
+	ldr q6, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	addv b6, v6.16b
+	mov x4, v4.d[0]
+	cmp x4, #136
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #120
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #136
+	bne .Lfailure
+
+	mov x2, x1
+	st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2]
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2], 16
+	ldr q6, [x2], 16
+	ldr q7, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	addv b6, v6.16b
+	addv b7, v7.16b
+	mov x4, v4.d[0]
+	cmp x4, #136
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #120
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #136
+	bne .Lfailure
+	mov x7, v7.d[0]
+	cmp x7, #120
+	bne .Lfailure
+
+	pass
+.Lfailure:
+	fail
diff --git a/sim/testsuite/sim/aarch64/stn_single.s b/sim/testsuite/sim/aarch64/stn_single.s
index 2bd19cf..a24b084 100644
--- a/sim/testsuite/sim/aarch64/stn_single.s
+++ b/sim/testsuite/sim/aarch64/stn_single.s
@@ -7,6 +7,8 @@ 
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input:
 	.word 0x04030201
 	.word 0x08070605
@@ -26,10 +28,10 @@  output:
 	add x1, x1, :lo12:output
 
 	mov x2, x0
-	ldr q0, [x2], 8
+	ldr q0, [x2], 16
 	ldr q1, [x2]
 	mov x2, x0
-	ldr q2, [x2], 8
+	ldr q2, [x2], 16
 	ldr q3, [x2]
 
 	mov x2, x1
@@ -61,9 +63,9 @@  output:
 	addv b5, v5.16b
 	mov x5, v4.d[0]
 	mov x6, v5.d[0]
-	cmp x5, #136
+	cmp x5, #200
 	bne .Lfailure
-	cmp x6, #8
+	cmp x6, #72
 	bne .Lfailure
 
 	mov x2, x1
@@ -82,11 +84,11 @@  output:
 	mov x4, v4.d[0]
 	mov x5, v5.d[0]
 	mov x6, v6.d[0]
-	cmp x4, #88
+	cmp x4, #120
 	bne .Lfailure
-	cmp x5, #200
+	cmp x5, #8
 	bne .Lfailure
-	cmp x6, #248
+	cmp x6, #24
 	bne .Lfailure
 
 	mov x2, x1
@@ -108,13 +110,13 @@  output:
 	mov x5, v5.d[0]
 	mov x6, v6.d[0]
 	mov x7, v7.d[0]
-	cmp x4, #104
+	cmp x4, #168
 	bne .Lfailure
-	cmp x5, #168
+	cmp x5, #232
 	bne .Lfailure
-	cmp x6, #232
+	cmp x6, #40
 	bne .Lfailure
-	cmp x7, #40
+	cmp x7, #104
 	bne .Lfailure
 
 	pass
diff --git a/sim/testsuite/sim/aarch64/sumulh.s b/sim/testsuite/sim/aarch64/sumulh.s
index 17f1ecd..d75e0c6 100644
--- a/sim/testsuite/sim/aarch64/sumulh.s
+++ b/sim/testsuite/sim/aarch64/sumulh.s
@@ -6,9 +6,6 @@ 
 
 .include "testutils.inc"
 
-	.data
-	.align 4
-
 	start
 
 	mov x0, #-2
diff --git a/sim/testsuite/sim/aarch64/uzp.s b/sim/testsuite/sim/aarch64/uzp.s
index 55e2cd7..851005e 100644
--- a/sim/testsuite/sim/aarch64/uzp.s
+++ b/sim/testsuite/sim/aarch64/uzp.s
@@ -4,6 +4,8 @@ 
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input1:
 	.word 0x04030201
 	.word 0x08070605