From patchwork Sat Apr 22 23:55:53 2017
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Jim Wilson <jim.wilson@linaro.org>
X-Patchwork-Id: 97956
Delivered-To: patch@linaro.org
Received: by 10.140.109.52 with SMTP id k49csp900690qgf;
 Sat, 22 Apr 2017 16:56:10 -0700 (PDT)
X-Received: by 10.99.8.193 with SMTP id 184mr18211348pgi.189.1492905370461; 
 Sat, 22 Apr 2017 16:56:10 -0700 (PDT)
Return-Path: <gdb-patches-return-138545-patch=linaro.org@sourceware.org>
Received: from sourceware.org (server1.sourceware.org. [209.132.180.131])
 by mx.google.com with ESMTPS id
 b83si14430808pfe.221.2017.04.22.16.56.10 for <patch@linaro.org>
 (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128);
 Sat, 22 Apr 2017 16:56:10 -0700 (PDT)
Received-SPF: pass (google.com: domain of
 gdb-patches-return-138545-patch=linaro.org@sourceware.org
 designates 209.132.180.131 as permitted sender)
 client-ip=209.132.180.131; 
Authentication-Results: mx.google.com; dkim=pass header.i=@sourceware.org;
 spf=pass (google.com: domain of
 gdb-patches-return-138545-patch=linaro.org@sourceware.org
 designates 209.132.180.131 as permitted sender)
 smtp.mailfrom=gdb-patches-return-138545-patch=linaro.org@sourceware.org;
 dmarc=fail (p=NONE sp=NONE dis=NONE) header.from=linaro.org
DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id
 :list-unsubscribe:list-subscribe:list-archive:list-post
 :list-help:sender:mime-version:from:date:message-id:subject:to
 :content-type; q=dns; s=default; b=t9b0E+gwg4bQ7S8bA22nqIWgTdm5y
 Cq1/qKVKpZpeChPDWeV3UBY0y7n69uFbzzvOwLyOpK0Rngv0f3eQB+4yzHbdmg5U
 trmNFLmc0uWVW+zr3GdUWPXliNnsNYOaWYgiS23Dh4QXcPWUrROmTb5xxryPEjss
 hsxL0aQDIZ1rdo=
DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=sourceware.org; h=list-id
 :list-unsubscribe:list-subscribe:list-archive:list-post
 :list-help:sender:mime-version:from:date:message-id:subject:to
 :content-type; s=default; bh=D2eFeMN7+7be7yA7dNUL2nGC7yg=; b=ADk
 NN/HVz1e0F7JVmhyHHiQ9DgQB2j4jkNy+fscpbnmsZkmjwzsRD0DcPwM2MhfaUTY
 CjEaOzyUqah3+KWerHTdKmw90t6nrGMmaMiGIn4Ab/E1gtU0lG+HfF/g615yMXHl
 V2YHwx771lPkWR+m2ME21GhJ5ABG9J0CX4jG8FD8=
Received: (qmail 46153 invoked by alias); 22 Apr 2017 23:56:00 -0000
Mailing-List: contact gdb-patches-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <gdb-patches.sourceware.org>
List-Unsubscribe: <mailto:gdb-patches-unsubscribe-patch=linaro.org@sourceware.org>
List-Subscribe: <mailto:gdb-patches-subscribe@sourceware.org>
List-Archive: <http://sourceware.org/ml/gdb-patches/>
List-Post: <mailto:gdb-patches@sourceware.org>
List-Help: <mailto:gdb-patches-help@sourceware.org>,
 <http://sourceware.org/ml/#faqs>
Sender: gdb-patches-owner@sourceware.org
Delivered-To: mailing list gdb-patches@sourceware.org
Received: (qmail 46115 invoked by uid 89); 22 Apr 2017 23:55:59 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-23.2 required=5.0 tests=AWL, BAYES_00,
 GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3,
 RCVD_IN_DNSWL_NONE, RCVD_IN_SORBS_SPAM, SPF_PASS,
 UNSUBSCRIBE_BODY autolearn=ham version=3.3.2 spammy=639, 1295,
 78, simulator
X-HELO: mail-lf0-f48.google.com
Received: from mail-lf0-f48.google.com (HELO mail-lf0-f48.google.com)
 (209.85.215.48) by sourceware.org
 (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP;
 Sat, 22 Apr 2017 23:55:55 +0000
Received: by mail-lf0-f48.google.com with SMTP id t144so59156634lff.1 for
 <gdb-patches@sourceware.org>; Sat, 22 Apr 2017 16:55:56 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net;
 s=20161025;
 h=x-gm-message-state:mime-version:from:date:message-id:subject:to;
 bh=BLxgTp2ZRCF2lCSy5f8vSkm0aGvf4ot2S2II+IGS8rc=;
 b=OdnYsvAuT7vhTfLkWzjSGHM9M0f437x8uuaL9zNobgi5pRlIqm53xegJoM9sKWcGQp
 xGr2Y+ZrS/ZcnS1xz52O+RBC0KRi4QL6hr9xqoxpPOu5kjS5mRSkELGzddqMkRNCFjBn
 /teroITDsUXHLTS8qxkL/6rJPzfZ9vs5k99K9H0j3C/SxnSILCduW+oBCzoVIClWjWsW
 CkId2e5795OCOmfVD25DBVdEiBvj6Ir16Xf25exI7qvNVFEsJzGJ5PLu1JdWAqPP1Hs/
 v8PeuzVWwNG1ga7JRPRF1AL/ar+t49a+ONuGrJMbHPdLQ0+RsV9+3QVdDsRCtNJsy1af
 Nd4g==
X-Gm-Message-State: AN3rC/4ZYJ81c/Vv8rZ7U9hyd3pQWy0ShvnOLHfAHA/zxw/J0ngkNRdM	HRRYbFTr/pcP5OHi+wPgb0hKWA/qZa5ZCL8=
X-Received: by 10.25.18.100 with SMTP id h97mr6608070lfi.82.1492905354547;
 Sat, 22 Apr 2017 16:55:54 -0700 (PDT)
MIME-Version: 1.0
Received: by 10.25.207.9 with HTTP; Sat, 22 Apr 2017 16:55:53 -0700 (PDT)
From: Jim Wilson <jim.wilson@linaro.org>
Date: Sat, 22 Apr 2017 16:55:53 -0700
Message-ID: <CABXYE2W6U6LL4rbM8Hc6oq2eqHsgVJM00NbqE+RC_N0JNHZQqA@mail.gmail.com>
Subject: [PATCH] aarch64 sim load/store multiple instruction fixes
To: gdb-patches@sourceware.org

This is my second big load/store patch, and hopefully the last one I
need.  This makes the ldn/stn multiple instructions work correctly.
ld1 with 2 registers loads data into the first reg then the second
reg.  ld2 loads data into one lane at a time in the register pair,
alternating between the regs until the lanes are filled.  The
large change to the vector load and store code.

The new testcases fail without the patch, and work with the patch.
The GCC C testsuite unexpected failures drop from 1427 to 1295 (-132).

While writing the new testcases, I noticed I had been sloppy about
aligning data in the testcases, and tried to clean that up a bit.  I
may need some further fixes here, but at least every testcase with
data has an alignment directive now.  Also, I noticed bad ldr
instructions in the stn_single.s testcase, where I was loading 128-bit
values and then adding 8 to the address instead of 16.  This fix
changes the input data, and hence requires adjusting the compare
values to make the testcase work again.  There is also one testcase
where I was aligning data but had no data to align, so I removed the
alignment.

Jim

2017-04-22  Jim Wilson  <jim.wilson@linaro.org>

	sim/aarch64/
	* simulator.c (vec_load): Add M argument.  Rewrite to iterate over
	registers based on structure size.
	(LD4, LD3, LD2, LD1_2, LD1_3, LD1_4): Pass new arg to vec_load.
	(LD1_1): Replace with call to vec_load.
	(vec_store): Add new M argument.  Rewrite to iterate over registers
	based on structure size.
	(ST4, ST3, ST2, ST1_2, ST1_3, ST1_4): Pass new arg to vec_store.
	(ST1_1): Replace with call to vec_store.

	sim/testsuite/sim/aarch64/
	* fcvtz.s, fstur.s, ldn_single.s, ldnr.s, mla.s, mls.s, uzp.s: Align
	data.
	* sumulh.s: Delete unnecessary data alignment.
	* stn_single.s: Align data.  Fix unaligned ldr insns.  Adjust cmp
	arguments to match change.
	* ldn_multiple.s, stn_multiple.s: New.

diff --git a/sim/aarch64/simulator.c b/sim/aarch64/simulator.c
index 16d8d8d..18f7944 100644
--- a/sim/aarch64/simulator.c
+++ b/sim/aarch64/simulator.c
@@ -11524,310 +11524,224 @@ vec_reg (unsigned v, unsigned o)
   return (v + o) & 0x3F;
 }
 
-/* Load multiple N-element structures to N consecutive registers.  */
+/* Load multiple N-element structures to M consecutive registers.  */
 static void
-vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
+vec_load (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
 {
   int      all  = INSTR (30, 30);
   unsigned size = INSTR (11, 10);
   unsigned vd   = INSTR (4, 0);
-  unsigned i;
+  unsigned rpt = (N == M) ? 1 : M;
+  unsigned selem = N;
+  unsigned i, j, k;
 
   switch (size)
     {
     case 0: /* 8-bit operations.  */
-      if (all)
-	for (i = 0; i < (16 * N); i++)
-	  aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
-			      aarch64_get_mem_u8 (cpu, address + i));
-      else
-	for (i = 0; i < (8 * N); i++)
-	  aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
-			      aarch64_get_mem_u8 (cpu, address + i));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (8 + (8 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_vec_u8 (cpu, vec_reg (vd, i + k), j,
+				  aarch64_get_mem_u8 (cpu, address));
+	      address += 1;
+	    }
       return;
 
     case 1: /* 16-bit operations.  */
-      if (all)
-	for (i = 0; i < (8 * N); i++)
-	  aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
-			       aarch64_get_mem_u16 (cpu, address + i * 2));
-      else
-	for (i = 0; i < (4 * N); i++)
-	  aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
-			       aarch64_get_mem_u16 (cpu, address + i * 2));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (4 + (4 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_vec_u16 (cpu, vec_reg (vd, i + k), j,
+				   aarch64_get_mem_u16 (cpu, address));
+	      address += 2;
+	    }
       return;
 
     case 2: /* 32-bit operations.  */
-      if (all)
-	for (i = 0; i < (4 * N); i++)
-	  aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
-			       aarch64_get_mem_u32 (cpu, address + i * 4));
-      else
-	for (i = 0; i < (2 * N); i++)
-	  aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
-			       aarch64_get_mem_u32 (cpu, address + i * 4));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (2 + (2 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_vec_u32 (cpu, vec_reg (vd, i + k), j,
+				   aarch64_get_mem_u32 (cpu, address));
+	      address += 4;
+	    }
       return;
 
     case 3: /* 64-bit operations.  */
-      if (all)
-	for (i = 0; i < (2 * N); i++)
-	  aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
-			       aarch64_get_mem_u64 (cpu, address + i * 8));
-      else
-	for (i = 0; i < N; i++)
-	  aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
-			       aarch64_get_mem_u64 (cpu, address + i * 8));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (1 + all); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_vec_u64 (cpu, vec_reg (vd, i + k), j,
+				   aarch64_get_mem_u64 (cpu, address));
+	      address += 8;
+	    }
       return;
     }
 }
 
-/* LD4: load multiple 4-element to four consecutive registers.  */
+/* Load multiple 4-element structures into four consecutive registers.  */
 static void
 LD4 (sim_cpu *cpu, uint64_t address)
 {
-  vec_load (cpu, address, 4);
+  vec_load (cpu, address, 4, 4);
 }
 
-/* LD3: load multiple 3-element structures to three consecutive registers.  */
+/* Load multiple 3-element structures into three consecutive registers.  */
 static void
 LD3 (sim_cpu *cpu, uint64_t address)
 {
-  vec_load (cpu, address, 3);
+  vec_load (cpu, address, 3, 3);
 }
 
-/* LD2: load multiple 2-element structures to two consecutive registers.  */
+/* Load multiple 2-element structures into two consecutive registers.  */
 static void
 LD2 (sim_cpu *cpu, uint64_t address)
 {
-  vec_load (cpu, address, 2);
+  vec_load (cpu, address, 2, 2);
 }
 
 /* Load multiple 1-element structures into one register.  */
 static void
 LD1_1 (sim_cpu *cpu, uint64_t address)
 {
-  int      all  = INSTR (30, 30);
-  unsigned size = INSTR (11, 10);
-  unsigned vd   = INSTR (4, 0);
-  unsigned i;
-
-  switch (size)
-    {
-    case 0:
-      /* LD1 {Vd.16b}, addr, #16 */
-      /* LD1 {Vd.8b}, addr, #8 */
-      for (i = 0; i < (all ? 16 : 8); i++)
-	aarch64_set_vec_u8 (cpu, vd, i,
-			    aarch64_get_mem_u8 (cpu, address + i));
-      return;
-
-    case 1:
-      /* LD1 {Vd.8h}, addr, #16 */
-      /* LD1 {Vd.4h}, addr, #8 */
-      for (i = 0; i < (all ? 8 : 4); i++)
-	aarch64_set_vec_u16 (cpu, vd, i,
-			     aarch64_get_mem_u16 (cpu, address + i * 2));
-      return;
-
-    case 2:
-      /* LD1 {Vd.4s}, addr, #16 */
-      /* LD1 {Vd.2s}, addr, #8 */
-      for (i = 0; i < (all ? 4 : 2); i++)
-	aarch64_set_vec_u32 (cpu, vd, i,
-			     aarch64_get_mem_u32 (cpu, address + i * 4));
-      return;
-
-    case 3:
-      /* LD1 {Vd.2d}, addr, #16 */
-      /* LD1 {Vd.1d}, addr, #8 */
-      for (i = 0; i < (all ? 2 : 1); i++)
-	aarch64_set_vec_u64 (cpu, vd, i,
-			     aarch64_get_mem_u64 (cpu, address + i * 8));
-      return;
-    }
+  vec_load (cpu, address, 1, 1);
 }
 
 /* Load multiple 1-element structures into two registers.  */
 static void
 LD1_2 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the LD2 version.
-     So why have two different instructions ?  There must be something
-     wrong somewhere.  */
-  vec_load (cpu, address, 2);
+  vec_load (cpu, address, 1, 2);
 }
 
 /* Load multiple 1-element structures into three registers.  */
 static void
 LD1_3 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the LD3 version.
-     So why have two different instructions ?  There must be something
-     wrong somewhere.  */
-  vec_load (cpu, address, 3);
+  vec_load (cpu, address, 1, 3);
 }
 
 /* Load multiple 1-element structures into four registers.  */
 static void
 LD1_4 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the LD4 version.
-     So why have two different instructions ?  There must be something
-     wrong somewhere.  */
-  vec_load (cpu, address, 4);
+  vec_load (cpu, address, 1, 4);
 }
 
-/* Store multiple N-element structures to N consecutive registers.  */
+/* Store multiple N-element structures from M consecutive registers.  */
 static void
-vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
+vec_store (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
 {
   int      all  = INSTR (30, 30);
   unsigned size = INSTR (11, 10);
   unsigned vd   = INSTR (4, 0);
-  unsigned i;
+  unsigned rpt = (N == M) ? 1 : M;
+  unsigned selem = N;
+  unsigned i, j, k;
 
   switch (size)
     {
     case 0: /* 8-bit operations.  */
-      if (all)
-	for (i = 0; i < (16 * N); i++)
-	  aarch64_set_mem_u8
-	    (cpu, address + i,
-	     aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
-      else
-	for (i = 0; i < (8 * N); i++)
-	  aarch64_set_mem_u8
-	    (cpu, address + i,
-	     aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (8 + (8 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_mem_u8
+		(cpu, address,
+		 aarch64_get_vec_u8 (cpu, vec_reg (vd, i + k), j));
+	      address += 1;
+	    }
       return;
 
     case 1: /* 16-bit operations.  */
-      if (all)
-	for (i = 0; i < (8 * N); i++)
-	  aarch64_set_mem_u16
-	    (cpu, address + i * 2,
-	     aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
-      else
-	for (i = 0; i < (4 * N); i++)
-	  aarch64_set_mem_u16
-	    (cpu, address + i * 2,
-	     aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (4 + (4 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_mem_u16
+		(cpu, address,
+		 aarch64_get_vec_u16 (cpu, vec_reg (vd, i + k), j));
+	      address += 2;
+	    }
       return;
 
     case 2: /* 32-bit operations.  */
-      if (all)
-	for (i = 0; i < (4 * N); i++)
-	  aarch64_set_mem_u32
-	    (cpu, address + i * 4,
-	     aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
-      else
-	for (i = 0; i < (2 * N); i++)
-	  aarch64_set_mem_u32
-	    (cpu, address + i * 4,
-	     aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (2 + (2 * all)); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_mem_u32
+		(cpu, address,
+		 aarch64_get_vec_u32 (cpu, vec_reg (vd, i + k), j));
+	      address += 4;
+	    }
       return;
 
     case 3: /* 64-bit operations.  */
-      if (all)
-	for (i = 0; i < (2 * N); i++)
-	  aarch64_set_mem_u64
-	    (cpu, address + i * 8,
-	     aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
-      else
-	for (i = 0; i < N; i++)
-	  aarch64_set_mem_u64
-	    (cpu, address + i * 8,
-	     aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
+      for (i = 0; i < rpt; i++)
+	for (j = 0; j < (1 + all); j++)
+	  for (k = 0; k < selem; k++)
+	    {
+	      aarch64_set_mem_u64
+		(cpu, address,
+		 aarch64_get_vec_u64 (cpu, vec_reg (vd, i + k), j));
+	      address += 8;
+	    }
       return;
     }
 }
 
-/* Store multiple 4-element structure to four consecutive registers.  */
+/* Store multiple 4-element structures from four consecutive registers.  */
 static void
 ST4 (sim_cpu *cpu, uint64_t address)
 {
-  vec_store (cpu, address, 4);
+  vec_store (cpu, address, 4, 4);
 }
 
-/* Store multiple 3-element structures to three consecutive registers.  */
+/* Store multiple 3-element structures from three consecutive registers.  */
 static void
 ST3 (sim_cpu *cpu, uint64_t address)
 {
-  vec_store (cpu, address, 3);
+  vec_store (cpu, address, 3, 3);
 }
 
-/* Store multiple 2-element structures to two consecutive registers.  */
+/* Store multiple 2-element structures from two consecutive registers.  */
 static void
 ST2 (sim_cpu *cpu, uint64_t address)
 {
-  vec_store (cpu, address, 2);
+  vec_store (cpu, address, 2, 2);
 }
 
-/* Store multiple 1-element structures into one register.  */
+/* Store multiple 1-element structures from one register.  */
 static void
 ST1_1 (sim_cpu *cpu, uint64_t address)
 {
-  int      all  = INSTR (30, 30);
-  unsigned size = INSTR (11, 10);
-  unsigned vd   = INSTR (4, 0);
-  unsigned i;
-
-  switch (size)
-    {
-    case 0:
-      for (i = 0; i < (all ? 16 : 8); i++)
-	aarch64_set_mem_u8 (cpu, address + i,
-			    aarch64_get_vec_u8 (cpu, vd, i));
-      return;
-
-    case 1:
-      for (i = 0; i < (all ? 8 : 4); i++)
-	aarch64_set_mem_u16 (cpu, address + i * 2,
-			     aarch64_get_vec_u16 (cpu, vd, i));
-      return;
-
-    case 2:
-      for (i = 0; i < (all ? 4 : 2); i++)
-	aarch64_set_mem_u32 (cpu, address + i * 4,
-			     aarch64_get_vec_u32 (cpu, vd, i));
-      return;
-
-    case 3:
-      for (i = 0; i < (all ? 2 : 1); i++)
-	aarch64_set_mem_u64 (cpu, address + i * 8,
-			     aarch64_get_vec_u64 (cpu, vd, i));
-      return;
-    }
+  vec_store (cpu, address, 1, 1);
 }
 
-/* Store multiple 1-element structures into two registers.  */
+/* Store multiple 1-element structures from two registers.  */
 static void
 ST1_2 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the ST2 version.
-     So why have two different instructions ?  There must be
-     something wrong somewhere.  */
-  vec_store (cpu, address, 2);
+  vec_store (cpu, address, 1, 2);
 }
 
-/* Store multiple 1-element structures into three registers.  */
+/* Store multiple 1-element structures from three registers.  */
 static void
 ST1_3 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the ST3 version.
-     So why have two different instructions ?  There must be
-     something wrong somewhere.  */
-  vec_store (cpu, address, 3);
+  vec_store (cpu, address, 1, 3);
 }
 
-/* Store multiple 1-element structures into four registers.  */
+/* Store multiple 1-element structures from four registers.  */
 static void
 ST1_4 (sim_cpu *cpu, uint64_t address)
 {
-  /* FIXME: This algorithm is *exactly* the same as the ST4 version.
-     So why have two different instructions ?  There must be
-     something wrong somewhere.  */
-  vec_store (cpu, address, 4);
+  vec_store (cpu, address, 1, 4);
 }
 
 #define LDn_STn_SINGLE_LANE_AND_SIZE()				\
diff --git a/sim/testsuite/sim/aarch64/fcvtz.s b/sim/testsuite/sim/aarch64/fcvtz.s
index 9bb6f9b..311fc2e 100644
--- a/sim/testsuite/sim/aarch64/fcvtz.s
+++ b/sim/testsuite/sim/aarch64/fcvtz.s
@@ -8,6 +8,7 @@
 # For 64-bit unsigned convert, test values 1.5, LONG_MAX, and ULONG_MAX.
 
 	.data
+	.align 4
 fm1p5:
 	.word	3217031168
 fimax:
diff --git a/sim/testsuite/sim/aarch64/fstur.s b/sim/testsuite/sim/aarch64/fstur.s
index 2206ae5..80e5c67 100644
--- a/sim/testsuite/sim/aarch64/fstur.s
+++ b/sim/testsuite/sim/aarch64/fstur.s
@@ -8,6 +8,7 @@
 .include "testutils.inc"
 
 	.data
+	.align 4
 fm1:
 	.word 3212836864
 fmax:
diff --git a/sim/testsuite/sim/aarch64/ldn_multiple.s b/sim/testsuite/sim/aarch64/ldn_multiple.s
new file mode 100644
index 0000000..285ef7e
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/ldn_multiple.s
@@ -0,0 +1,136 @@
+# mach: aarch64
+
+# Check the load multiple structure instructions: ld1, ld2, ld3, ld4.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+	.data
+	.align 4
+input:
+	.word 0x04030201
+	.word 0x08070605
+	.word 0x0c0b0a09
+	.word 0x100f0e0d
+	.word 0xfcfdfeff
+	.word 0xf8f9fafb
+	.word 0xf4f5f6f7
+	.word 0xf0f1f2f3
+
+	start
+	adrp x0, input
+	add x0, x0, :lo12:input
+
+	mov x2, x0
+	mov x3, #16
+	ld1 {v0.16b}, [x2], 16
+	ld1 {v1.8h}, [x2], x3
+	addv b4, v0.16b
+	addv b5, v1.16b
+	mov x4, v4.d[0]
+	cmp x4, #136
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #120
+	bne .Lfailure
+
+	mov x2, x0
+	mov x3, #16
+	ld2 {v0.8b, v1.8b}, [x2], x3
+	ld2 {v2.4h, v3.4h}, [x2], 16
+	addv b4, v0.8b
+	addv b5, v1.8b
+	addv b6, v2.8b
+	addv b7, v3.8b
+	mov x4, v4.d[0]
+	cmp x4, #64
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #72
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #196
+	bne .Lfailure
+	mov x7, v7.d[0]
+	cmp x7, #180
+	bne .Lfailure
+
+	mov x2, x0
+	ld3 {v0.2s, v1.2s, v2.2s}, [x2]
+	addv b4, v0.8b
+	addv b5, v1.8b
+	addv b6, v2.8b
+	mov x4, v4.d[0]
+	cmp x4, #68
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #16
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #16
+	bne .Lfailure
+
+	mov x2, x0
+	ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x2]
+	addv b4, v0.8b
+	addv b5, v1.8b
+	addv b6, v2.8b
+	addv b7, v3.8b
+	mov x4, v4.d[0]
+	cmp x4, #0
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #0
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #0
+	bne .Lfailure
+	mov x7, v7.d[0]
+	cmp x7, #0
+	bne .Lfailure
+
+	mov x2, x0
+	ld1 {v0.4s, v1.4s}, [x2]
+	addv b4, v0.16b
+	addv b5, v1.16b
+	mov x4, v4.d[0]
+	cmp x4, #136
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #120
+	bne .Lfailure
+
+	mov x2, x0
+	ld1 {v0.1d, v1.1d, v2.1d}, [x2]
+	addv b4, v0.8b
+	addv b5, v1.8b
+	addv b6, v2.8b
+	mov x4, v4.d[0]
+	cmp x4, #36
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #100
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #220
+	bne .Lfailure
+
+	mov x2, x0
+	ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x2]
+	addv b4, v0.8b
+	addv b5, v1.8b
+	addv b6, v2.8b
+	mov x4, v4.d[0]
+	cmp x4, #36
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #100
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #220
+	bne .Lfailure
+
+	pass
+.Lfailure:
+	fail
diff --git a/sim/testsuite/sim/aarch64/ldn_single.s b/sim/testsuite/sim/aarch64/ldn_single.s
index 4c460fb..9681520 100644
--- a/sim/testsuite/sim/aarch64/ldn_single.s
+++ b/sim/testsuite/sim/aarch64/ldn_single.s
@@ -7,6 +7,8 @@
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input:
 	.word 0x04030201
 	.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/ldnr.s b/sim/testsuite/sim/aarch64/ldnr.s
index a4bfffa..7126c46 100644
--- a/sim/testsuite/sim/aarch64/ldnr.s
+++ b/sim/testsuite/sim/aarch64/ldnr.s
@@ -7,6 +7,8 @@
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input:
 	.word 0x04030201
 	.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/mla.s b/sim/testsuite/sim/aarch64/mla.s
index e0065e7..e3ea836 100644
--- a/sim/testsuite/sim/aarch64/mla.s
+++ b/sim/testsuite/sim/aarch64/mla.s
@@ -4,6 +4,8 @@
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input:
 	.word 0x04030201
 	.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/mls.s b/sim/testsuite/sim/aarch64/mls.s
index a34a1aa..5c9e225 100644
--- a/sim/testsuite/sim/aarch64/mls.s
+++ b/sim/testsuite/sim/aarch64/mls.s
@@ -4,6 +4,8 @@
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input:
 	.word 0x04030201
 	.word 0x08070605
diff --git a/sim/testsuite/sim/aarch64/stn_multiple.s b/sim/testsuite/sim/aarch64/stn_multiple.s
new file mode 100644
index 0000000..1a3f24d
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/stn_multiple.s
@@ -0,0 +1,169 @@
+# mach: aarch64
+
+# Check the store multiple structure instructions: st1, st2, st3, st4.
+# Check the addressing modes: no offset, post-index immediate offset,
+# post-index register offset.
+
+.include "testutils.inc"
+
+	.data
+	.align 4
+input:
+	.word 0x04030201
+	.word 0x08070605
+	.word 0x0c0b0a09
+	.word 0x100f0e0d
+	.word 0xfcfdfeff
+	.word 0xf8f9fafb
+	.word 0xf4f5f6f7
+	.word 0xf0f1f2f3
+output:
+	.zero 64
+
+	start
+	adrp x0, input
+	add x0, x0, :lo12:input
+	adrp x1, output
+	add x1, x1, :lo12:output
+
+	mov x2, x0
+	ldr q0, [x2], 16
+	ldr q1, [x2]
+	mov x2, x0
+	ldr q2, [x2], 16
+	ldr q3, [x2]
+
+	mov x2, x1
+	mov x3, #16
+	st1 {v0.16b}, [x2], 16
+	st1 {v1.8h}, [x2], x3
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	mov x4, v4.d[0]
+	cmp x4, #136
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #120
+	bne .Lfailure
+
+	mov x2, x1
+	mov x3, #16
+	st2 {v0.8b, v1.8b}, [x2], 16
+	st2 {v2.4h, v3.4h}, [x2], x3
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	mov x4, v4.d[0]
+	cmp x4, #0
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #0
+	bne .Lfailure
+
+	mov x2, x1
+	st3 {v0.4s, v1.4s, v2.4s}, [x2]
+	ldr q4, [x2], 16
+	ldr q5, [x2], 16
+	ldr q6, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	addv b6, v6.16b
+	mov x4, v4.d[0]
+	cmp x4, #36
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #0
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #100
+	bne .Lfailure
+
+	mov x2, x1
+	st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2]
+	ldr q4, [x2], 16
+	ldr q5, [x2], 16
+	ldr q6, [x2], 16
+	ldr q7, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	addv b6, v6.16b
+	addv b7, v7.16b
+	mov x4, v4.d[0]
+	cmp x4, #0
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #0
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #0
+	bne .Lfailure
+	mov x7, v7.d[0]
+	cmp x7, #0
+	bne .Lfailure
+
+	mov x2, x1
+	st1 {v0.2s, v1.2s}, [x2], 16
+	st1 {v2.1d, v3.1d}, [x2]
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	mov x4, v4.d[0]
+	cmp x4, #0
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #0
+	bne .Lfailure
+
+	mov x2, x1
+	st1 {v0.2d, v1.2d, v2.2d}, [x2]
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2], 16
+	ldr q6, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	addv b6, v6.16b
+	mov x4, v4.d[0]
+	cmp x4, #136
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #120
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #136
+	bne .Lfailure
+
+	mov x2, x1
+	st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2]
+	mov x2, x1
+	ldr q4, [x2], 16
+	ldr q5, [x2], 16
+	ldr q6, [x2], 16
+	ldr q7, [x2]
+	addv b4, v4.16b
+	addv b5, v5.16b
+	addv b6, v6.16b
+	addv b7, v7.16b
+	mov x4, v4.d[0]
+	cmp x4, #136
+	bne .Lfailure
+	mov x5, v5.d[0]
+	cmp x5, #120
+	bne .Lfailure
+	mov x6, v6.d[0]
+	cmp x6, #136
+	bne .Lfailure
+	mov x7, v7.d[0]
+	cmp x7, #120
+	bne .Lfailure
+
+	pass
+.Lfailure:
+	fail
diff --git a/sim/testsuite/sim/aarch64/stn_single.s b/sim/testsuite/sim/aarch64/stn_single.s
index 2bd19cf..a24b084 100644
--- a/sim/testsuite/sim/aarch64/stn_single.s
+++ b/sim/testsuite/sim/aarch64/stn_single.s
@@ -7,6 +7,8 @@
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input:
 	.word 0x04030201
 	.word 0x08070605
@@ -26,10 +28,10 @@ output:
 	add x1, x1, :lo12:output
 
 	mov x2, x0
-	ldr q0, [x2], 8
+	ldr q0, [x2], 16
 	ldr q1, [x2]
 	mov x2, x0
-	ldr q2, [x2], 8
+	ldr q2, [x2], 16
 	ldr q3, [x2]
 
 	mov x2, x1
@@ -61,9 +63,9 @@ output:
 	addv b5, v5.16b
 	mov x5, v4.d[0]
 	mov x6, v5.d[0]
-	cmp x5, #136
+	cmp x5, #200
 	bne .Lfailure
-	cmp x6, #8
+	cmp x6, #72
 	bne .Lfailure
 
 	mov x2, x1
@@ -82,11 +84,11 @@ output:
 	mov x4, v4.d[0]
 	mov x5, v5.d[0]
 	mov x6, v6.d[0]
-	cmp x4, #88
+	cmp x4, #120
 	bne .Lfailure
-	cmp x5, #200
+	cmp x5, #8
 	bne .Lfailure
-	cmp x6, #248
+	cmp x6, #24
 	bne .Lfailure
 
 	mov x2, x1
@@ -108,13 +110,13 @@ output:
 	mov x5, v5.d[0]
 	mov x6, v6.d[0]
 	mov x7, v7.d[0]
-	cmp x4, #104
+	cmp x4, #168
 	bne .Lfailure
-	cmp x5, #168
+	cmp x5, #232
 	bne .Lfailure
-	cmp x6, #232
+	cmp x6, #40
 	bne .Lfailure
-	cmp x7, #40
+	cmp x7, #104
 	bne .Lfailure
 
 	pass
diff --git a/sim/testsuite/sim/aarch64/sumulh.s b/sim/testsuite/sim/aarch64/sumulh.s
index 17f1ecd..d75e0c6 100644
--- a/sim/testsuite/sim/aarch64/sumulh.s
+++ b/sim/testsuite/sim/aarch64/sumulh.s
@@ -6,9 +6,6 @@
 
 .include "testutils.inc"
 
-	.data
-	.align 4
-
 	start
 
 	mov x0, #-2
diff --git a/sim/testsuite/sim/aarch64/uzp.s b/sim/testsuite/sim/aarch64/uzp.s
index 55e2cd7..851005e 100644
--- a/sim/testsuite/sim/aarch64/uzp.s
+++ b/sim/testsuite/sim/aarch64/uzp.s
@@ -4,6 +4,8 @@
 
 .include "testutils.inc"
 
+	.data
+	.align 4
 input1:
 	.word 0x04030201
 	.word 0x08070605