[ARM] Add ARMv8.3 VCMLA and VCADD instructions

Message ID 584564BF.7050507@arm.com
State New
Headers show

Commit Message

Szabolcs Nagy Dec. 5, 2016, 12:59 p.m.
Add support for VCMLA and VCADD advanced SIMD complex number instructions.

Use the command line option -march=armv8.3-a+fp16+simd to enable all of
these instructions.

In arm-dis.c the formatting syntax was abused a bit to select between
0 vs 90, 180 vs 270, or 90 vs 270 based on a single bit value, instead of
duplicating entries in the opcode table.

gas/
2016-12-05  Szabolcs Nagy  <szabolcs.nagy@arm.com>

	* config/tc-arm.c (do_vcmla, do_vcadd): Define.
	(neon_scalar_for_vcmla): Define.
	(enum operand_parse_code): Add OP_IROT1 and OP_IROT2.
	(NEON_ENC_TAB): Add DDSI and QQSI variants.
	(insns): Add vcmla and vcadd.

opcodes/
2016-12-05  Szabolcs Nagy  <szabolcs.nagy@arm.com>

	* arm-dis.c (coprocessor_opcodes): Add vcmla and vcadd.
	(print_insn_coprocessor): Add 'V' format for neon D or Q regs.

gas/testsuite/
2016-12-05  Szabolcs Nagy  <szabolcs.nagy@arm.com>

	* gas/arm/armv8_3-a-simd.d: New.
	* gas/arm/armv8_3-a-simd.s: New.
	* gas/arm/armv8_3-a-simd-bad.d: New.
	* gas/arm/armv8_3-a-simd-bad.l: New.
	* gas/arm/armv8_3-a-simd-bad.s: New.

Comments

Nick Clifton Dec. 5, 2016, 1:18 p.m. | #1
Hi Szabolcs,

> gas/

> 2016-12-05  Szabolcs Nagy  <szabolcs.nagy@arm.com>

> 

> 	* config/tc-arm.c (do_vcmla, do_vcadd): Define.

> 	(neon_scalar_for_vcmla): Define.

> 	(enum operand_parse_code): Add OP_IROT1 and OP_IROT2.

> 	(NEON_ENC_TAB): Add DDSI and QQSI variants.

> 	(insns): Add vcmla and vcadd.

> 

> opcodes/

> 2016-12-05  Szabolcs Nagy  <szabolcs.nagy@arm.com>

> 

> 	* arm-dis.c (coprocessor_opcodes): Add vcmla and vcadd.

> 	(print_insn_coprocessor): Add 'V' format for neon D or Q regs.

> 

> gas/testsuite/

> 2016-12-05  Szabolcs Nagy  <szabolcs.nagy@arm.com>

> 

> 	* gas/arm/armv8_3-a-simd.d: New.

> 	* gas/arm/armv8_3-a-simd.s: New.

> 	* gas/arm/armv8_3-a-simd-bad.d: New.

> 	* gas/arm/armv8_3-a-simd-bad.l: New.

> 	* gas/arm/armv8_3-a-simd-bad.s: New.


Approved - please apply.

Cheers
  Nick

Jan Beulich Jan. 12, 2017, 9:49 a.m. | #2
>>> On 05.12.16 at 13:59, <szabolcs.nagy@arm.com> wrote:

> opcodes/

> 2016-12-05  Szabolcs Nagy  <szabolcs.nagy@arm.com>

> 

> 	* arm-dis.c (coprocessor_opcodes): Add vcmla and vcadd.

> 	(print_insn_coprocessor): Add 'V' format for neon D or Q regs.


This set of patterns

+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfe000800, 0xfea00f10, "vcmla%c.f16\t%12-15,22V, %16-19,7V, %0-3D[%5?10], #%20'90"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfe200800, 0xfea00f10, "vcmla%c.f16\t%12-15,22V, %16-19,7V, %0-3D[%5?10], #%20?21%23?780"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfe800800, 0xfea00f10, "vcmla%c.f32\t%12-15,22V, %16-19,7V, %0-3,5D[0], #%20'90"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfea00800, 0xfea00f10, "vcmla%c.f32\t%12-15,22V, %16-19,7V, %0-3,5D[0], #%20?21%23?780"},

suggests that bit 24 is not being decoded as part of the opcode,
yet I also can't seem to figure out which operand it affects. Should the
masks perhaps have bit 24 set, or is it really ignored?

Jan

Szabolcs Nagy Jan. 12, 2017, noon | #3
On 12/01/17 09:49, Jan Beulich wrote:
>>>> On 05.12.16 at 13:59, <szabolcs.nagy@arm.com> wrote:

>> opcodes/

>> 2016-12-05  Szabolcs Nagy  <szabolcs.nagy@arm.com>

>>

>> 	* arm-dis.c (coprocessor_opcodes): Add vcmla and vcadd.

>> 	(print_insn_coprocessor): Add 'V' format for neon D or Q regs.

> 

> This set of patterns

> 

> +  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),

> +    0xfe000800, 0xfea00f10, "vcmla%c.f16\t%12-15,22V, %16-19,7V, %0-3D[%5?10], #%20'90"},

> +  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),

> +    0xfe200800, 0xfea00f10, "vcmla%c.f16\t%12-15,22V, %16-19,7V, %0-3D[%5?10], #%20?21%23?780"},

> +  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),

> +    0xfe800800, 0xfea00f10, "vcmla%c.f32\t%12-15,22V, %16-19,7V, %0-3,5D[0], #%20'90"},

> +  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),

> +    0xfea00800, 0xfea00f10, "vcmla%c.f32\t%12-15,22V, %16-19,7V, %0-3,5D[0], #%20?21%23?780"},

> 

> suggests that bit 24 is not being decoded as part of the opcode,

> yet I also can't seem to figure an operand it affects. Should the

> masks perhaps have bit 24 set, or is it really ignored?


indeed the mask should be 0xffa00f10 for these instructions.

i'll prepare a patch.

Patch

diff --git a/gas/config/tc-arm.c b/gas/config/tc-arm.c
index 00da9e0..dbd5c96 100644
--- a/gas/config/tc-arm.c
+++ b/gas/config/tc-arm.c
@@ -6535,6 +6535,8 @@  enum operand_parse_code
   OP_EXPi,	/* same, with optional immediate prefix */
   OP_EXPr,	/* same, with optional relocation suffix */
   OP_HALF,	/* 0 .. 65535 or low/high reloc.  */
+  OP_IROT1,	/* VCADD rotate immediate: 90, 270.  */
+  OP_IROT2,	/* VCMLA rotate immediate: 0, 90, 180, 270.  */
 
   OP_CPSF,	/* CPS flags */
   OP_ENDI,	/* Endianness specifier */
@@ -13345,6 +13347,8 @@  NEON_ENC_TAB
   X(3, (D, Q, S), MIXED),		\
   X(4, (D, D, D, I), DOUBLE),		\
   X(4, (Q, Q, Q, I), QUAD),		\
+  X(4, (D, D, S, I), DOUBLE),		\
+  X(4, (Q, Q, S, I), QUAD),		\
   X(2, (F, F), SINGLE),			\
   X(3, (F, F, F), SINGLE),		\
   X(2, (F, I), SINGLE),			\
@@ -17258,6 +17262,80 @@  do_vrintm (void)
   do_vrint_1 (neon_cvt_mode_m);
 }
 
+static unsigned
+neon_scalar_for_vcmla (unsigned opnd, unsigned elsize)
+{
+  unsigned regno = NEON_SCALAR_REG (opnd);
+  unsigned elno = NEON_SCALAR_INDEX (opnd);
+
+  if (elsize == 16 && elno < 2 && regno < 16)
+    return regno | (elno << 4);
+  else if (elsize == 32 && elno == 0)
+    return regno;
+
+  first_error (_("scalar out of range"));
+  return 0;
+}
+
+static void
+do_vcmla (void)
+{
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
+	      _(BAD_FPU));
+  constraint (inst.reloc.exp.X_op != O_constant, _("expression too complex"));
+  unsigned rot = inst.reloc.exp.X_add_number;
+  constraint (rot != 0 && rot != 90 && rot != 180 && rot != 270,
+	      _("immediate out of range"));
+  rot /= 90;
+  if (inst.operands[2].isscalar)
+    {
+      enum neon_shape rs = neon_select_shape (NS_DDSI, NS_QQSI, NS_NULL);
+      unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
+				       N_KEY | N_F16 | N_F32).size;
+      unsigned m = neon_scalar_for_vcmla (inst.operands[2].reg, size);
+      inst.is_neon = 1;
+      inst.instruction = 0xfe000800;
+      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+      inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+      inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+      inst.instruction |= LOW4 (m);
+      inst.instruction |= HI1 (m) << 5;
+      inst.instruction |= neon_quad (rs) << 6;
+      inst.instruction |= rot << 20;
+      inst.instruction |= (size == 32) << 23;
+    }
+  else
+    {
+      enum neon_shape rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
+      unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
+				       N_KEY | N_F16 | N_F32).size;
+      neon_three_same (neon_quad (rs), 0, -1);
+      inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
+      inst.instruction |= 0xfc200800;
+      inst.instruction |= rot << 23;
+      inst.instruction |= (size == 32) << 20;
+    }
+}
+
+static void
+do_vcadd (void)
+{
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
+	      _(BAD_FPU));
+  constraint (inst.reloc.exp.X_op != O_constant, _("expression too complex"));
+  unsigned rot = inst.reloc.exp.X_add_number;
+  constraint (rot != 90 && rot != 270, _("immediate out of range"));
+  enum neon_shape rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
+  unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
+				   N_KEY | N_F16 | N_F32).size;
+  neon_three_same (neon_quad (rs), 0, -1);
+  inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
+  inst.instruction |= 0xfc800800;
+  inst.instruction |= (rot == 270) << 24;
+  inst.instruction |= (size == 32) << 20;
+}
+
 /* Crypto v1 instructions.  */
 static void
 do_crypto_2op_1 (unsigned elttype, int op)
@@ -19796,6 +19874,8 @@  static const struct asm_opcode insns[] =
 #undef  THUMB_VARIANT
 #define THUMB_VARIANT & arm_ext_v8_3
  NCE (vjcvt, eb90bc0, 2, (RVS, RVD), vjcvt),
+ NUF (vcmla, 0, 4, (RNDQ, RNDQ, RNDQ_RNSC, EXPi), vcmla),
+ NUF (vcadd, 0, 4, (RNDQ, RNDQ, RNDQ, EXPi), vcadd),
 
 #undef  ARM_VARIANT
 #define ARM_VARIANT  & fpu_fpa_ext_v1  /* Core FPA instruction set (V1).  */
diff --git a/gas/testsuite/gas/arm/armv8_3-a-simd-bad.d b/gas/testsuite/gas/arm/armv8_3-a-simd-bad.d
new file mode 100644
index 0000000..b2060cd
--- /dev/null
+++ b/gas/testsuite/gas/arm/armv8_3-a-simd-bad.d
@@ -0,0 +1,2 @@ 
+#as: -march=armv8.3-a+fp16+simd
+#error-output: armv8_3-a-simd-bad.l
diff --git a/gas/testsuite/gas/arm/armv8_3-a-simd-bad.l b/gas/testsuite/gas/arm/armv8_3-a-simd-bad.l
new file mode 100644
index 0000000..2a3ea9b
--- /dev/null
+++ b/gas/testsuite/gas/arm/armv8_3-a-simd-bad.l
@@ -0,0 +1,39 @@ 
+[^:]+: Assembler messages:
+[^:]+:6: Error: operand types can't be inferred -- `vcadd d0,d1,d2,#90'
+[^:]+:7: Error: immediate out of range -- `vcadd\.f32 q0,q1,q2,#0'
+[^:]+:8: Error: immediate out of range -- `vcadd\.f32 q0,q1,q2,#180'
+[^:]+:9: Error: Neon double or quad precision register expected -- `vcadd\.f16 s0,s1,s2,#90'
+[^:]+:10: Error: bad type in Neon instruction -- `vcadd\.f64 d0,d1,d2,#90'
+[^:]+:11: Error: bad type in Neon instruction -- `vcadd\.f64 q0,q1,q2,#90'
+[^:]+:13: Error: operand types can't be inferred -- `vcmla d0,d1,d2,#90'
+[^:]+:14: Error: immediate out of range -- `vcmla\.f32 q0,q1,q2,#-90'
+[^:]+:15: Error: immediate out of range -- `vcmla\.f32 q0,q1,q2,#120'
+[^:]+:16: Error: immediate out of range -- `vcmla\.f32 q0,q1,q2,#360'
+[^:]+:17: Error: Neon double or quad precision register expected -- `vcmla\.f16 s0,s1,s2,#90'
+[^:]+:18: Error: bad type in Neon instruction -- `vcmla\.f64 d0,d1,d2,#90'
+[^:]+:19: Error: bad type in Neon instruction -- `vcmla\.f64 q0,q1,q2,#90'
+[^:]+:21: Error: only D registers may be indexed -- `vcmla\.f16 q0,q1,q2\[0\],#90'
+[^:]+:22: Error: only D registers may be indexed -- `vcmla\.f32 q0,q1,q2\[0\],#90'
+[^:]+:23: Error: scalar out of range -- `vcmla\.f16 d0,d1,d2\[2\],#90'
+[^:]+:24: Error: scalar out of range -- `vcmla\.f16 q0,q1,d2\[2\],#90'
+[^:]+:25: Error: scalar out of range -- `vcmla\.f16 q0,q1,d16\[1\],#90'
+[^:]+:26: Error: scalar out of range -- `vcmla\.f32 q0,q1,d2\[1\],#90'
+[^:]+:31: Error: operand types can't be inferred -- `vcadd d0,d1,d2,#90'
+[^:]+:32: Error: immediate out of range -- `vcadd\.f32 q0,q1,q2,#0'
+[^:]+:33: Error: immediate out of range -- `vcadd\.f32 q0,q1,q2,#180'
+[^:]+:34: Error: Neon double or quad precision register expected -- `vcadd\.f16 s0,s1,s2,#90'
+[^:]+:35: Error: bad type in Neon instruction -- `vcadd\.f64 d0,d1,d2,#90'
+[^:]+:36: Error: bad type in Neon instruction -- `vcadd\.f64 q0,q1,q2,#90'
+[^:]+:38: Error: operand types can't be inferred -- `vcmla d0,d1,d2,#90'
+[^:]+:39: Error: immediate out of range -- `vcmla\.f32 q0,q1,q2,#-90'
+[^:]+:40: Error: immediate out of range -- `vcmla\.f32 q0,q1,q2,#120'
+[^:]+:41: Error: immediate out of range -- `vcmla\.f32 q0,q1,q2,#360'
+[^:]+:42: Error: Neon double or quad precision register expected -- `vcmla\.f16 s0,s1,s2,#90'
+[^:]+:43: Error: bad type in Neon instruction -- `vcmla\.f64 d0,d1,d2,#90'
+[^:]+:44: Error: bad type in Neon instruction -- `vcmla\.f64 q0,q1,q2,#90'
+[^:]+:46: Error: only D registers may be indexed -- `vcmla\.f16 q0,q1,q2\[0\],#90'
+[^:]+:47: Error: only D registers may be indexed -- `vcmla\.f32 q0,q1,q2\[0\],#90'
+[^:]+:48: Error: scalar out of range -- `vcmla\.f16 d0,d1,d2\[2\],#90'
+[^:]+:49: Error: scalar out of range -- `vcmla\.f16 q0,q1,d2\[2\],#90'
+[^:]+:50: Error: scalar out of range -- `vcmla\.f16 q0,q1,d16\[1\],#90'
+[^:]+:51: Error: scalar out of range -- `vcmla\.f32 q0,q1,d2\[1\],#90'
diff --git a/gas/testsuite/gas/arm/armv8_3-a-simd-bad.s b/gas/testsuite/gas/arm/armv8_3-a-simd-bad.s
new file mode 100644
index 0000000..9f6934f
--- /dev/null
+++ b/gas/testsuite/gas/arm/armv8_3-a-simd-bad.s
@@ -0,0 +1,51 @@ 
+	.text
+
+A1:
+	.arm
+
+	vcadd d0,d1,d2,#90
+	vcadd.f32 q0,q1,q2,#0
+	vcadd.f32 q0,q1,q2,#180
+	vcadd.f16 s0,s1,s2,#90
+	vcadd.f64 d0,d1,d2,#90
+	vcadd.f64 q0,q1,q2,#90
+
+	vcmla d0,d1,d2,#90
+	vcmla.f32 q0,q1,q2,#-90
+	vcmla.f32 q0,q1,q2,#120
+	vcmla.f32 q0,q1,q2,#360
+	vcmla.f16 s0,s1,s2,#90
+	vcmla.f64 d0,d1,d2,#90
+	vcmla.f64 q0,q1,q2,#90
+
+	vcmla.f16 q0,q1,q2[0],#90
+	vcmla.f32 q0,q1,q2[0],#90
+	vcmla.f16 d0,d1,d2[2],#90
+	vcmla.f16 q0,q1,d2[2],#90
+	vcmla.f16 q0,q1,d16[1],#90
+	vcmla.f32 q0,q1,d2[1],#90
+
+T1:
+	.thumb
+
+	vcadd d0,d1,d2,#90
+	vcadd.f32 q0,q1,q2,#0
+	vcadd.f32 q0,q1,q2,#180
+	vcadd.f16 s0,s1,s2,#90
+	vcadd.f64 d0,d1,d2,#90
+	vcadd.f64 q0,q1,q2,#90
+
+	vcmla d0,d1,d2,#90
+	vcmla.f32 q0,q1,q2,#-90
+	vcmla.f32 q0,q1,q2,#120
+	vcmla.f32 q0,q1,q2,#360
+	vcmla.f16 s0,s1,s2,#90
+	vcmla.f64 d0,d1,d2,#90
+	vcmla.f64 q0,q1,q2,#90
+
+	vcmla.f16 q0,q1,q2[0],#90
+	vcmla.f32 q0,q1,q2[0],#90
+	vcmla.f16 d0,d1,d2[2],#90
+	vcmla.f16 q0,q1,d2[2],#90
+	vcmla.f16 q0,q1,d16[1],#90
+	vcmla.f32 q0,q1,d2[1],#90
diff --git a/gas/testsuite/gas/arm/armv8_3-a-simd.d b/gas/testsuite/gas/arm/armv8_3-a-simd.d
new file mode 100644
index 0000000..c420cff
--- /dev/null
+++ b/gas/testsuite/gas/arm/armv8_3-a-simd.d
@@ -0,0 +1,47 @@ 
+#as: -march=armv8.3-a+fp16+simd
+#objdump: -dr
+#skip: *-*-pe *-wince-* *-*-coff
+
+.*: +file format .*arm.*
+
+Disassembly of section .text:
+
+[0-9a-f]+ <.*>:
+ +[0-9a-f]+:	fc942846 	vcadd.f32	q1, q2, q3, #90
+ +[0-9a-f]+:	fd942846 	vcadd.f32	q1, q2, q3, #270
+ +[0-9a-f]+:	fcc658a7 	vcadd.f16	d21, d22, d23, #90
+ +[0-9a-f]+:	fc842846 	vcadd.f16	q1, q2, q3, #90
+ +[0-9a-f]+:	fcd658a7 	vcadd.f32	d21, d22, d23, #90
+ +[0-9a-f]+:	fc342846 	vcmla.f32	q1, q2, q3, #0
+ +[0-9a-f]+:	fcb42846 	vcmla.f32	q1, q2, q3, #90
+ +[0-9a-f]+:	fd342846 	vcmla.f32	q1, q2, q3, #180
+ +[0-9a-f]+:	fdb42846 	vcmla.f32	q1, q2, q3, #270
+ +[0-9a-f]+:	fce658a7 	vcmla.f16	d21, d22, d23, #90
+ +[0-9a-f]+:	fca42846 	vcmla.f16	q1, q2, q3, #90
+ +[0-9a-f]+:	fcf658a7 	vcmla.f32	d21, d22, d23, #90
+ +[0-9a-f]+:	fe565883 	vcmla.f16	d21, d22, d3\[0\], #90
+ +[0-9a-f]+:	fe5658a3 	vcmla.f16	d21, d22, d3\[1\], #90
+ +[0-9a-f]+:	fe142843 	vcmla.f16	q1, q2, d3\[0\], #90
+ +[0-9a-f]+:	fe142863 	vcmla.f16	q1, q2, d3\[1\], #90
+ +[0-9a-f]+:	fed658a7 	vcmla.f32	d21, d22, d23\[0\], #90
+ +[0-9a-f]+:	fe942867 	vcmla.f32	q1, q2, d23\[0\], #90
+
+[0-9a-f]+ <.*>:
+ +[0-9a-f]+:	fc94 2846 	vcadd.f32	q1, q2, q3, #90
+ +[0-9a-f]+:	fd94 2846 	vcadd.f32	q1, q2, q3, #270
+ +[0-9a-f]+:	fcc6 58a7 	vcadd.f16	d21, d22, d23, #90
+ +[0-9a-f]+:	fc84 2846 	vcadd.f16	q1, q2, q3, #90
+ +[0-9a-f]+:	fcd6 58a7 	vcadd.f32	d21, d22, d23, #90
+ +[0-9a-f]+:	fc34 2846 	vcmla.f32	q1, q2, q3, #0
+ +[0-9a-f]+:	fcb4 2846 	vcmla.f32	q1, q2, q3, #90
+ +[0-9a-f]+:	fd34 2846 	vcmla.f32	q1, q2, q3, #180
+ +[0-9a-f]+:	fdb4 2846 	vcmla.f32	q1, q2, q3, #270
+ +[0-9a-f]+:	fce6 58a7 	vcmla.f16	d21, d22, d23, #90
+ +[0-9a-f]+:	fca4 2846 	vcmla.f16	q1, q2, q3, #90
+ +[0-9a-f]+:	fcf6 58a7 	vcmla.f32	d21, d22, d23, #90
+ +[0-9a-f]+:	fe56 5883 	vcmla.f16	d21, d22, d3\[0\], #90
+ +[0-9a-f]+:	fe56 58a3 	vcmla.f16	d21, d22, d3\[1\], #90
+ +[0-9a-f]+:	fe14 2843 	vcmla.f16	q1, q2, d3\[0\], #90
+ +[0-9a-f]+:	fe14 2863 	vcmla.f16	q1, q2, d3\[1\], #90
+ +[0-9a-f]+:	fed6 58a7 	vcmla.f32	d21, d22, d23\[0\], #90
+ +[0-9a-f]+:	fe94 2867 	vcmla.f32	q1, q2, d23\[0\], #90
diff --git a/gas/testsuite/gas/arm/armv8_3-a-simd.s b/gas/testsuite/gas/arm/armv8_3-a-simd.s
new file mode 100644
index 0000000..fde2f76
--- /dev/null
+++ b/gas/testsuite/gas/arm/armv8_3-a-simd.s
@@ -0,0 +1,49 @@ 
+	.text
+
+A1:
+	.arm
+
+	vcadd.f32 q1,q2,q3,#90
+	vcadd.f32 q1,q2,q3,#270
+	vcadd.f16 d21,d22,d23,#90
+	vcadd.f16 q1,q2,q3,#90
+	vcadd.f32 d21,d22,d23,#90
+
+	vcmla.f32 q1,q2,q3,#0
+	vcmla.f32 q1,q2,q3,#90
+	vcmla.f32 q1,q2,q3,#180
+	vcmla.f32 q1,q2,q3,#270
+	vcmla.f16 d21,d22,d23,#90
+	vcmla.f16 q1,q2,q3,#90
+	vcmla.f32 d21,d22,d23,#90
+
+	vcmla.f16 d21,d22,d3[0],#90
+	vcmla.f16 d21,d22,d3[1],#90
+	vcmla.f16 q1,q2,d3[0],#90
+	vcmla.f16 q1,q2,d3[1],#90
+	vcmla.f32 d21,d22,d23[0],#90
+	vcmla.f32 q1,q2,d23[0],#90
+
+T1:
+	.thumb
+
+	vcadd.f32 q1,q2,q3,#90
+	vcadd.f32 q1,q2,q3,#270
+	vcadd.f16 d21,d22,d23,#90
+	vcadd.f16 q1,q2,q3,#90
+	vcadd.f32 d21,d22,d23,#90
+
+	vcmla.f32 q1,q2,q3,#0
+	vcmla.f32 q1,q2,q3,#90
+	vcmla.f32 q1,q2,q3,#180
+	vcmla.f32 q1,q2,q3,#270
+	vcmla.f16 d21,d22,d23,#90
+	vcmla.f16 q1,q2,q3,#90
+	vcmla.f32 d21,d22,d23,#90
+
+	vcmla.f16 d21,d22,d3[0],#90
+	vcmla.f16 d21,d22,d3[1],#90
+	vcmla.f16 q1,q2,d3[0],#90
+	vcmla.f16 q1,q2,d3[1],#90
+	vcmla.f32 d21,d22,d23[0],#90
+	vcmla.f32 q1,q2,d23[0],#90
diff --git a/opcodes/arm-dis.c b/opcodes/arm-dis.c
index 0380d37..791d124 100644
--- a/opcodes/arm-dis.c
+++ b/opcodes/arm-dis.c
@@ -116,6 +116,7 @@  struct opcode16
    %<bitfield>G         print as an iWMMXt general purpose or control register
    %<bitfield>D		print as a NEON D register
    %<bitfield>Q		print as a NEON Q register
+   %<bitfield>V		print as a NEON D or Q register
    %<bitfield>E		print a quarter-float immediate value
 
    %y<code>		print a single precision VFP reg.
@@ -882,6 +883,28 @@  static const struct opcode32 coprocessor_opcodes[] =
     0xfc400000, 0xfff00000,
     "mcrr2%c\t%8-11d, %4-7d, %12-15R, %16-19R, cr%0-3d"},
 
+  /* ARMv8.3 AdvSIMD instructions in the space of coprocessor 8.  */
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfc800800, 0xfeb00f10, "vcadd%c.f16\t%12-15,22V, %16-19,7V, %0-3,5V, #%24?29%24'70"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfc900800, 0xfeb00f10, "vcadd%c.f32\t%12-15,22V, %16-19,7V, %0-3,5V, #%24?29%24'70"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfc200800, 0xff300f10, "vcmla%c.f16\t%12-15,22V, %16-19,7V, %0-3,5V, #%23'90"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfd200800, 0xff300f10, "vcmla%c.f16\t%12-15,22V, %16-19,7V, %0-3,5V, #%23?21%23?780"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfc300800, 0xff300f10, "vcmla%c.f32\t%12-15,22V, %16-19,7V, %0-3,5V, #%23'90"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfd300800, 0xff300f10, "vcmla%c.f32\t%12-15,22V, %16-19,7V, %0-3,5V, #%23?21%23?780"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfe000800, 0xfea00f10, "vcmla%c.f16\t%12-15,22V, %16-19,7V, %0-3D[%5?10], #%20'90"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfe200800, 0xfea00f10, "vcmla%c.f16\t%12-15,22V, %16-19,7V, %0-3D[%5?10], #%20?21%23?780"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfe800800, 0xfea00f10, "vcmla%c.f32\t%12-15,22V, %16-19,7V, %0-3,5D[0], #%20'90"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8_3A),
+    0xfea00800, 0xfea00f10, "vcmla%c.f32\t%12-15,22V, %16-19,7V, %0-3,5D[0], #%20?21%23?780"},
+
   /* V5 coprocessor instructions.  */
   {ARM_FEATURE_CORE_LOW (ARM_EXT_V5),
     0xfc100000, 0xfe100000, "ldc2%22'l%c\t%8-11d, cr%12-15d, %A"},
@@ -3673,10 +3696,15 @@  print_insn_coprocessor (bfd_vma pc,
 			  }
 			func (stream, "%s", arm_regnames[value]);
 			break;
+		      case 'V':
+			if (given & (1 << 6))
+			  goto Q;
+			/* FALLTHROUGH */
 		      case 'D':
 			func (stream, "d%ld", value);
 			break;
 		      case 'Q':
+		      Q:
 			if (value & 1)
 			  func (stream, "<illegal reg q%ld.5>", value >> 1);
 			else