RFR: JDK8: Add support for Neon implementation of CRC32

Message ID: 1402569257.22470.5.camel@localhost.localdomain
State: New

Commit Message

Edward Nevill June 12, 2014, 10:34 a.m. UTC
Hi,

The following patch adds support for Neon implementation of CRC32.

This uses the basic PMULL and PMULL2 instructions (i.e. not the 64-bit 1Q variants, which are part of the crypto extensions).
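
For reference, PMULL is a carry-less (polynomial) multiply: partial
products are combined with XOR rather than ADD, i.e. multiplication in
GF(2)[x]. As a rough illustration, not part of the patch, one 8-bit lane
of pmull Vd.8H, Vn.8B, Vm.8B computes:

  #include <stdint.h>

  // Carry-less multiply of two 8-bit polynomials -> 16-bit product.
  static inline uint16_t clmul8(uint8_t a, uint8_t b) {
    uint16_t r = 0;
    for (int i = 0; i < 8; i++)
      if (b & (1 << i))
        r ^= (uint16_t)a << i;   // XOR, not +: no carries in GF(2)
    return r;
  }

PMULL applies this to the low eight byte lanes of its sources, PMULL2 to
the high eight.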

I have added a -XX:+/-UseNeon switch to enable/disable the Neon CRC. This is off by default.
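
(On a product build it can be enabled for testing by running java with
-XX:+UseNeon.)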

All the best,
Ed.


--- CUT HERE ---
# HG changeset patch
# User Edward Nevill edward.nevill@linaro.org
# Date 1402568896 -3600
#      Thu Jun 12 11:28:16 2014 +0100
# Node ID 55084fca52d279e90686b5cc53bf87aa853a3c75
# Parent  1b3757e98d39e89faa65c719951d4b273908433c
Add support for Neon implementation of CRC32

Comments

Andrew Haley June 12, 2014, 11:51 a.m. UTC | #1
On 06/12/2014 11:34 AM, Edward Nevill wrote:
> I have added a -XX:+/-UseNeon switch to enable/disable the Neon CRC. This is off by default.

Shouldn't we probe for Neon?

Andrew.
Edward Nevill June 12, 2014, 12:12 p.m. UTC | #2
On Thu, 2014-06-12 at 12:51 +0100, Andrew Haley wrote:
> On 06/12/2014 11:34 AM, Edward Nevill wrote:
> > I have added a -XX:+/-UseNeon switch to enable/disable the Neon CRC. This is off by default.
> 
> Shouldn't we probe for Neon?

AFAIUI Neon is always available on aarch64.

It is only the 1Q extensions to PMULL/PMULL2 which may not be available because they are part of the crypto extensions.

I have turned it off by default for the moment until we have the opportunity to test it fully.

Regards,
Ed.
Andrew Haley June 12, 2014, 12:19 p.m. UTC | #3
On 06/12/2014 01:12 PM, Edward Nevill wrote:
> On Thu, 2014-06-12 at 12:51 +0100, Andrew Haley wrote:
>> On 06/12/2014 11:34 AM, Edward Nevill wrote:
>>> I have added a -XX:+/-UseNeon switch to enable/disable the Neon CRC. This is off by default.
>>
>> Shouldn't we probe for Neon?
> 
> AFAIUI Neon is always available on aarch64.
> 
> It is only the 1Q extensions to PMULL/PMULL2 which may not be available because they are part of the crypto extensions.
> 
> I have turned it off by default for the moment until we have the opportunity to test it fully.

OK.

Andrew.
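
For reference: only the crypto-extension 1Q form would need a probe; on
Linux that could be a hwcaps check along the following lines (a
hypothetical sketch, not part of this patch):

  #include <sys/auxv.h>    // getauxval
  #include <asm/hwcap.h>   // AArch64 HWCAP_* bits

  // True if the 64x64->128-bit PMULL (crypto extension, 1Q variant) is
  // present. The 8-bit-lane PMULL/PMULL2 used by this patch are part of
  // baseline AArch64 SIMD and need no probe.
  static bool has_pmull_1q() {
    return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
  }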

Patch

diff -r 1b3757e98d39 -r 55084fca52d2 src/cpu/aarch64/vm/assembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Wed Jun 11 13:51:03 2014 +0100
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Jun 12 11:28:16 2014 +0100
@@ -1842,6 +1842,225 @@ 
       fmovd(Vn, zr);
   }
 
+/* SIMD extensions
+ *
+ * We just use FloatRegister in the following. They are exactly the same
+ * as SIMD registers.
+ */
+public:
+
+  enum SIMD_Arrangement {
+       T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D
+  };
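+  // In SIMD_Arrangement, bit 0 is the Q bit (set selects the full
+  // 128-bit register) and the remaining bits give the element size,
+  // hence the T & 1 and T >> 1 terms in the encodings below.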
+
+  enum SIMD_RegVariant {
+       S32, D64, Q128
+  };
+
+  void v_shl(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement T, int shift){
+    starti;
+    /* The encodings for the immh:immb fields (bits 22:16) are
+     *   0001 xxx	8B/16B, shift = xxx
+     *   001x xxx	4H/8H,  shift = xxxx
+     *   01xx xxx	2S/4S,  shift = xxxxx
+     *   1xxx xxx	1D/2D,  shift = xxxxxx (1D is RESERVED)
+     */
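+    // Example: for T4S (T >> 1 == 2) and shift == 5 the field is
+    //   (1 << 5) | 5 == 0b0100101, i.e. 01xx xxx with xxxxx == 5.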
+    assert((1 << ((T>>1)+3)) > shift, "Invalid Shift value");
+    f(0, 31), f(T & 1, 30), f(0b0011110, 29, 23), f((1 << ((T>>1)+3))|shift, 22, 16);
+    f(0b010101, 15, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+
+  void v_ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+    starti;
+    /* The encodings for the immh:immb fields (bits 22:16) are
+     *   0001 xxx	8H, 8B/16B, shift = xxx
+     *   001x xxx	4S, 4H/8H,  shift = xxxx
+     *   01xx xxx	2D, 2S/4S,  shift = xxxxx
+     *   1xxx xxx	RESERVED
+     */
+    assert((Tb >> 1) + 1 == (Ta >> 1), "Incompatible arrangement");
+    assert((1 << ((Tb>>1)+3)) > shift, "Invalid shift value");
+    f(0, 31), f(Tb & 1, 30), f(0b1011110, 29, 23), f((1 << ((Tb>>1)+3))|shift, 22, 16);
+    f(0b101001, 15, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+  void v_ushll2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,  SIMD_Arrangement Tb, int shift) {
+    v_ushll(Vd, Ta, Vn, Tb, shift);
+  }
+
+  void v_uzp1(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm,  SIMD_Arrangement T, int op = 0){
+    starti;
+    f(0, 31), f((T & 0x1), 30), f(0b001110, 29, 24), f((T >> 1), 23, 22), f(0, 21);
+    rf(Vm, 16), f(0, 15), f(op, 14), f(0b0110, 13, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+  void v_uzp2(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm,  SIMD_Arrangement T){
+    v_uzp1(Vd, Vn, Vm, T, 1);
+  }
+ 
+  // Move from general purpose register
+  //   mov  Vd.T[index], Rn
+  void v_mov(FloatRegister Vd, SIMD_Arrangement T, int index, Register Xn) {
+    starti;
+    f(0b01001110000, 31, 21), f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16); 
+    f(0b000111, 15, 10), rf(Xn, 5), rf(Vd, 0);
+  }
+
+  // Move to general purpose register
+  //   mov  Rd, Vn.T[index]
+  void v_mov(Register Xd, FloatRegister Vn, SIMD_Arrangement T, int index) {
+    starti;
+    f(0, 31), f((T >= T1D) ? 1:0, 30), f(0b001110000, 29, 21);
+    f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16);
+    f(0b001111, 15, 10), rf(Vn, 5), rf(Xd, 0);
+  }
+
+  // We do not handle the 1Q arrangement.
+  void v_pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+    starti;
+    assert(Ta == T8H && (Tb == T8B || Tb == T16B), "Invalid Size specifier");
+    f(0, 31), f(Tb & 1, 30), f(0b001110001, 29, 21), rf(Vm, 16), f(0b111000, 15, 10);
+    rf(Vn, 5), rf(Vd, 0);
+  }
+  void v_pmull2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+    v_pmull(Vd, Ta, Vn, Vm, Tb);
+  }
+
+  void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
+    starti;
+    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0111, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn) {
+    starti;
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b1010, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn) {
+    starti;
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0110, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn) {
+    starti;
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0010, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+
+  void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int imm) {
+    starti;
+    assert((8 << ((int)T & 1)) == imm, "size/imm mismatch");      
+    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0111, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn, Register Xm) {
+    starti;
+    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0111, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn, int imm) {
+    starti;
+    assert((16 << ((int)T & 1)) == imm, "size/imm mismatch");     
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b1010, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn, Register Xm) {
+    starti;
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b1010, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn, int imm) {
+    starti;
+    assert((24 << ((int)T & 1)) == imm, "size/imm mismatch");
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0110, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn, Register Xm) {
+    starti;
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0110, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn, int imm) {
+    starti;
+    assert((32 << ((int)T & 1)) == imm, "size/imm mismatch");
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0010, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn, Register Xm) {
+    starti;
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0010, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+
+  void v_st1(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
+    starti;
+    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0111, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_st1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn) {
+    starti;
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b1010, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }  
+  void v_st1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn) {
+    starti;
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0110, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_st1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn) {
+    starti;
+    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0010, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+
+  void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
+    starti;
+    f(0, 31), f((int)T & 1, 30), f(0b001101010000001100, 29, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn, Register Xm) {
+    starti;
+    f(0, 31), f((int)T & 1, 30), f(0b001101110, 29, 21), rf(Xm, 16);
+    f(0b1100, 15, 12), f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int imm) {
+    starti;
+    assert((1 << ((int)T & 3)) == imm, "size/imm mismatch");
+    f(0, 31), f((int)T & 1, 30), f(0b001101110111111100, 29, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+
+  void v_eor(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) {
+    starti;
+    assert(T == T8B || T == T16B, "must be T8B or T16B");
+    f(0, 31), f((int)T & 1, 30), f(0b101110001, 29, 21);
+    rf(Vm, 16), f(0b000111, 15, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+
+
+
 /* Simulator extensions to the ISA
 
    haltsim
diff -r 1b3757e98d39 -r 55084fca52d2 src/cpu/aarch64/vm/globals_aarch64.hpp
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp	Wed Jun 11 13:51:03 2014 +0100
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Jun 12 11:28:16 2014 +0100
@@ -100,6 +100,8 @@ 
   notproduct(bool, UseAcqRelForVolatileFields, false,			\
 	     "Use acquire and release insns for volatile fields")
 
+// Don't attempt to use Neon on builtin sim until builtin sim supports it
+#define UseNeon false
 
 #else
 #define UseBuiltinSim		false
@@ -115,7 +117,9 @@ 
           "doptimize instead of patching instructions")			\
 									\
   notproduct(bool, UseAcqRelForVolatileFields, false,			\
-	     "Use acquire and release insns for volatile fields")
+	     "Use acquire and release insns for volatile fields")       \
+  product(bool, UseNeon, false,                                         \
+          "Use Neon for CRC32 computation")
 
 #endif
 
diff -r 1b3757e98d39 -r 55084fca52d2 src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Wed Jun 11 13:51:03 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Jun 12 11:28:16 2014 +0100
@@ -2152,14 +2152,151 @@ 
 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
         Register table0, Register table1, Register table2, Register table3,
         Register tmp, Register tmp2, Register tmp3) {
-  Label L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
+  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
   unsigned long offset;
+
     ornw(crc, zr, crc);
     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
     if (offset) add(table0, table0, offset);
     add(table1, table0, 1*256*sizeof(juint));
     add(table2, table0, 2*256*sizeof(juint));
     add(table3, table0, 3*256*sizeof(juint));
+
+  if (UseNeon) {
+      cmp(len, 64);
+      br(Assembler::LT, L_by16);
+      v_eor(v16, T16B, v16, v16);
+
+    Label L_fold;
+
+      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
+
+      v_ld1(v0, v1, T2D, buf, 32);
+      v_ld1r(v4, T2D, tmp, 8);
+      v_ld1r(v5, T2D, tmp, 8);
+      v_ld1r(v6, T2D, tmp, 8);
+      v_ld1r(v7, T2D, tmp, 8);
+      v_mov(v16, T4S, 0, crc);
+
+      v_eor(v0, T16B, v0, v16);
+      sub(len, len, 64);
+
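+      // Each iteration folds 32 bytes: v0/v1 are carry-lessly
+      // multiplied by the constants in v4..v7 (derived from x^288 and
+      // x^256 mod the CRC polynomial), the 8x8-bit partial products
+      // are recombined with uzp/ushll/eor, and the result is XORed
+      // into the next 32 bytes of input.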
+    BIND(L_fold);
+      v_pmull(v22, T8H, v0, v5, T8B);
+      v_pmull(v20, T8H, v0, v7, T8B);
+      v_pmull(v23, T8H, v0, v4, T8B);
+      v_pmull(v21, T8H, v0, v6, T8B);
+    
+      v_pmull2(v18, T8H, v0, v5, T16B);
+      v_pmull2(v16, T8H, v0, v7, T16B);
+      v_pmull2(v19, T8H, v0, v4, T16B);
+      v_pmull2(v17, T8H, v0, v6, T16B);
+    
+      v_uzp1(v24, v20, v22, T8H);
+      v_uzp2(v25, v20, v22, T8H);
+      v_eor(v20, T16B, v24, v25);
+    
+      v_uzp1(v26, v16, v18, T8H);
+      v_uzp2(v27, v16, v18, T8H);
+      v_eor(v16, T16B, v26, v27);
+    
+      v_ushll2(v22, T4S, v20, T8H, 8);
+      v_ushll(v20, T4S, v20, T4H, 8);
+    
+      v_ushll2(v18, T4S, v16, T8H, 8);
+      v_ushll(v16, T4S, v16, T4H, 8);
+    
+      v_eor(v22, T16B, v23, v22);
+      v_eor(v18, T16B, v19, v18);
+      v_eor(v20, T16B, v21, v20);
+      v_eor(v16, T16B, v17, v16);
+    
+      v_uzp1(v17, v16, v20, T2D);
+      v_uzp2(v21, v16, v20, T2D);
+      v_eor(v17, T16B, v17, v21);
+    
+      v_ushll2(v20, T2D, v17, T4S, 16);
+      v_ushll(v16, T2D, v17, T2S, 16);
+    
+      v_eor(v20, T16B, v20, v22);
+      v_eor(v16, T16B, v16, v18);
+    
+      v_uzp1(v17, v20, v16, T2D);
+      v_uzp2(v21, v20, v16, T2D);
+      v_eor(v28, T16B, v17, v21);
+    
+      v_pmull(v22, T8H, v1, v5, T8B);
+      v_pmull(v20, T8H, v1, v7, T8B);
+      v_pmull(v23, T8H, v1, v4, T8B);
+      v_pmull(v21, T8H, v1, v6, T8B);
+    
+      v_pmull2(v18, T8H, v1, v5, T16B);
+      v_pmull2(v16, T8H, v1, v7, T16B);
+      v_pmull2(v19, T8H, v1, v4, T16B);
+      v_pmull2(v17, T8H, v1, v6, T16B);
+    
+      v_ld1(v0, v1, T2D, buf, 32);
+    
+      v_uzp1(v24, v20, v22, T8H);
+      v_uzp2(v25, v20, v22, T8H);
+      v_eor(v20, T16B, v24, v25);
+    
+      v_uzp1(v26, v16, v18, T8H);
+      v_uzp2(v27, v16, v18, T8H);
+      v_eor(v16, T16B, v26, v27);
+    
+      v_ushll2(v22, T4S, v20, T8H, 8);
+      v_ushll(v20, T4S, v20, T4H, 8);
+    
+      v_ushll2(v18, T4S, v16, T8H, 8);
+      v_ushll(v16, T4S, v16, T4H, 8);
+    
+      v_eor(v22, T16B, v23, v22);
+      v_eor(v18, T16B, v19, v18);
+      v_eor(v20, T16B, v21, v20);
+      v_eor(v16, T16B, v17, v16);
+    
+      v_uzp1(v17, v16, v20, T2D);
+      v_uzp2(v21, v16, v20, T2D);
+      v_eor(v16, T16B, v17, v21);
+    
+      v_ushll2(v20, T2D, v16, T4S, 16);
+      v_ushll(v16, T2D, v16, T2S, 16);
+    
+      v_eor(v20, T16B, v22, v20);
+      v_eor(v16, T16B, v16, v18);
+    
+      v_uzp1(v17, v20, v16, T2D);
+      v_uzp2(v21, v20, v16, T2D);
+      v_eor(v20, T16B, v17, v21);
+    
+      v_shl(v16, v28, T2D, 1);
+      v_shl(v17, v20, T2D, 1);
+    
+      v_eor(v0, T16B, v0, v16);
+      v_eor(v1, T16B, v1, v17);
+
+      subs(len, len, 32);
+      br(Assembler::GE, L_fold);
+
+      mov(crc, 0);
+      v_mov(tmp, v0, T1D, 0);
+      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+      v_mov(tmp, v0, T1D, 1);
+      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+      v_mov(tmp, v1, T1D, 0);
+      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+      v_mov(tmp, v1, T1D, 1);
+      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+
+      add(len, len, 32);
+  }
+
+  BIND(L_by16);
     subs(len, len, 16);
     br(Assembler::GE, L_by16_loop);
     adds(len, len, 16-4);
diff -r 1b3757e98d39 -r 55084fca52d2 src/cpu/aarch64/vm/stubRoutines_aarch64.cpp
--- a/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp	Wed Jun 11 13:51:03 2014 +0100
+++ b/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp	Thu Jun 12 11:28:16 2014 +0100
@@ -265,5 +265,12 @@ 
     0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
     0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
     0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
-    0xde0506f1UL
+    0xde0506f1UL,
+    // Constants for Neon CRC32 implementation
+    // k3 = 0x78ED02D5 = x^288 mod poly - bit reversed
+    // k4 = 0xED627DAE = x^256 mod poly - bit reversed
+    0x78ED02D5UL, 0xED627DAEUL,		// k4:k3
+    0xED78D502UL, 0x62EDAE7DUL,		// byte swap
+    0x02D578EDUL, 0x7DAEED62UL,		// word swap
+    0xD502ED78UL, 0xAE7D62EDUL,		// byte swap of word swap
 };
--- CUT HERE ---
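
Aside: the k3/k4 constants above can be reproduced with a small GF(2)
remainder computation. A sketch, assuming the comment's convention is a
plain x^n mod P followed by a 32-bit reversal; if so, n = 288 and n = 256
should print 0x78ED02D5 and 0xED627DAE:

  #include <stdint.h>
  #include <stdio.h>

  // x^n mod P(x) over GF(2), with P the CRC-32 polynomial 0x104C11DB7.
  static uint32_t xn_mod_poly(unsigned n) {
    uint64_t r = 1;                      // x^0
    while (n--) {
      r <<= 1;                           // multiply by x
      if (r & (1ULL << 32))
        r ^= 0x104C11DB7ULL;             // reduce mod P(x)
    }
    return (uint32_t)r;
  }

  static uint32_t bitrev32(uint32_t v) {
    uint32_t r = 0;
    for (int i = 0; i < 32; i++)
      r |= ((v >> i) & 1u) << (31 - i);
    return r;
  }

  int main() {
    printf("0x%08X\n", bitrev32(xn_mod_poly(32)));       // 0xEDB88320 (sanity check)
    printf("k3 = 0x%08X\n", bitrev32(xn_mod_poly(288)));
    printf("k4 = 0x%08X\n", bitrev32(xn_mod_poly(256)));
    return 0;
  }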