@@ -1842,6 +1842,225 @@
fmovd(Vn, zr);
}
+/* SIMD extensions
+ *
+ * We use FloatRegister in the following: on AArch64 the SIMD registers
+ * are the same physical registers as the floating-point registers.
+ */
+public:
+
+ enum SIMD_Arrangement {
+ T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D
+ };
+
+ enum SIMD_RegVariant {
+ S32, D64, Q128
+ };
+
+ void v_shl(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement T, int shift) {
+ starti;
+ /* The encodings for the immh:immb fields (bits 22:16) are
+ * 0001 xxx 8B/16B, shift = xxx
+ * 001x xxx 4H/8H, shift = xxxx
+ * 01xx xxx 2S/4S, shift = xxxxx
+ * 1xxx xxx 1D/2D, shift = xxxxxx (1D is RESERVED)
+ */
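+ // E.g. T4S (== 5) with shift == 5: (1 << ((5 >> 1) + 3)) | 5 == 0b0100101,
+ // i.e. immh == 0100 (2S/4S) and immb == 101, encoding a shift of 5.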
+ assert((1 << ((T>>1)+3)) > shift, "Invalid shift value");
+ f(0, 31), f(T & 1, 30), f(0b0011110, 29, 23), f((1 << ((T>>1)+3))|shift, 22, 16);
+ f(0b010101, 15, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+
+ void v_ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+ starti;
+ /* The encodings for the immh:immb fields (bits 22:16) are
+ * 0001 xxx 8H, 8B/16B, shift = xxx
+ * 001x xxx 4S, 4H/8H shift = xxxx
+ * 01xx xxx 2D, 2S/4S shift = xxxxx
+ * 1xxx xxx RESERVED
+ */
+ assert((Tb >> 1) + 1 == (Ta >> 1), "Incompatible arrangement");
+ assert((1 << ((Tb>>1)+3)) > shift, "Invalid shift value");
+ f(0, 31), f(Tb & 1, 30), f(0b1011110, 29, 23), f((1 << ((Tb>>1)+3))|shift, 22, 16);
+ f(0b101001, 15, 10), rf(Vn, 5), rf(Vd, 0);
+ }
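+ // ushll2 is encoded identically: Tb (T16B/T8H/T4S) sets Q (bit 30),
+ // which selects the upper half of Vn as the source.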
+ void v_ushll2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+ v_ushll(Vd, Ta, Vn, Tb, shift);
+ }
+
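+ // uzp1/uzp2 concatenate the even-/odd-numbered elements of Vn and Vm;
+ // op (bit 14) selects uzp2.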
+ void v_uzp1(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement T, int op = 0) {
+ starti;
+ f(0, 31), f((T & 0x1), 30), f(0b001110, 29, 24), f((T >> 1), 23, 22), f(0, 21);
+ rf(Vm, 16), f(0, 15), f(op, 14), f(0b0110, 13, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+ void v_uzp2(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement T){
+ v_uzp1(Vd, Vn, Vm, T, 1);
+ }
+
+ // Move from general purpose register
+ // mov Vd.T[index], Rn
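+ // This is the INS (general) encoding: imm5 (bits 20:16) holds both the
+ // element size and the index, e.g. T4S index 2 gives imm5 == 0b10100.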
+ void v_mov(FloatRegister Vd, SIMD_Arrangement T, int index, Register Xn) {
+ starti;
+ f(0b01001110000, 31, 21), f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16);
+ f(0b000111, 15, 10), rf(Xn, 5), rf(Vd, 0);
+ }
+
+ // Move to general purpose register
+ // mov Rd, Vn.T[index]
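+ // For a D element Q (bit 30) must be 1, hence the T >= T1D test below;
+ // e.g. v_mov(tmp, v0, T1D, 0) assembles as umov Xtmp, v0.d[0].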
+ void v_mov(Register Xd, FloatRegister Vn, SIMD_Arrangement T, int index) {
+ starti;
+ f(0, 31), f((T >= T1D) ? 1:0, 30), f(0b001110000, 29, 21);
+ f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16);
+ f(0b001111, 15, 10), rf(Vn, 5), rf(Xd, 0);
+ }
+
+ // We do not handle the 1Q arrangement.
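+ // (The 1Q form is a 64x64->128 carry-less multiply and requires the
+ // crypto extension; the byte forms used here are base Advanced SIMD.)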
+ void v_pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+ starti;
+ assert(Ta == T8H && (Tb == T8B || Tb == T16B), "Invalid size specifier");
+ f(0, 31), f(Tb & 1, 30), f(0b001110001, 29, 21), rf(Vm, 16), f(0b111000, 15, 10);
+ rf(Vn, 5), rf(Vd, 0);
+ }
+ void v_pmull2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+ v_pmull(Vd, Ta, Vn, Vm, Tb);
+ }
+
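+ // Multi-register ld1/st1 forms require a consecutive register list,
+ // wrapping modulo 32 (v31 is followed by v0), as the asserts check.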
+ void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
+ starti;
+ f(0, 31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0111, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ f(0, 31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b1010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ f(0, 31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0110, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+ f(0, 31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+
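+ // Post-indexed forms: the immediate must equal the number of bytes
+ // transferred, i.e. 8 or 16 per register depending on Q (T & 1).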
+ void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int imm) {
+ starti;
+ assert((8 << ((int)T & 1)) == imm, "size/imm mismatch");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0111, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn, Register Xm) {
+ starti;
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0111, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn, int imm) {
+ starti;
+ assert((16 << ((int)T & 1)) == imm, "size/imm mismatch");
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b1010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn, Register Xm) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b1010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn, int imm) {
+ starti;
+ assert((24 << ((int)T & 1)) == imm, "size/imm mismatch");
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0110, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn, Register Xm) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0110, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn, int imm) {
+ starti;
+ assert((32 << ((int)T & 1)) == imm, "size/imm mismatch");
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn, Register Xm) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+ f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+
+ void v_st1(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
+ starti;
+ f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0111, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_st1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b1010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_st1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0110, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_st1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn) {
+ starti;
+ assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
+ assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
+ assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
+ f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0010, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+
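+ // ld1r: load one element and replicate it to all lanes of Vt. The
+ // post-indexed immediate must equal the element size in bytes.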
+ void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
+ starti;
+ f(0, 31), f((int)T & 1, 30), f(0b001101010000001100, 29, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn, Register Xm) {
+ starti;
+ f(0, 31), f((int)T & 1, 30), f(0b001101110, 29, 21), rf(Xm, 16);
+ f(0b1100, 15, 12), f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int imm) {
+ starti;
+ assert((1 << ((int)T >> 1)) == imm, "size/imm mismatch");
+ f(0, 31), f((int)T & 1, 30), f(0b001101110111111100, 29, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+
+ void v_eor(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) {
+ starti;
+ assert(T == T8B || T == T16B, "must be T8B or T16B");
+ f(0, 31), f((int)T & 1, 30), f(0b101110001, 29, 21);
+ rf(Vm, 16), f(0b000111, 15, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+
/* Simulator extensions to the ISA
haltsim
@@ -100,6 +100,8 @@
notproduct(bool, UseAcqRelForVolatileFields, false, \
"Use acquire and release insns for volatile fields")
+// Don't attempt to use Neon on the builtin sim until it supports Neon
+#define UseNeon false
#else
#define UseBuiltinSim false
@@ -115,7 +117,9 @@
"doptimize instead of patching instructions") \
\
notproduct(bool, UseAcqRelForVolatileFields, false, \
- "Use acquire and release insns for volatile fields")
+ "Use acquire and release insns for volatile fields") \
+ product(bool, UseNeon, false, \
+ "Use Neon for CRC32 computation")
#endif
@@ -2152,14 +2152,151 @@
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
Register table0, Register table1, Register table2, Register table3,
Register tmp, Register tmp2, Register tmp3) {
- Label L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
+ Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
unsigned long offset;
+
ornw(crc, zr, crc);
adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
if (offset) add(table0, table0, offset);
add(table1, table0, 1*256*sizeof(juint));
add(table2, table0, 2*256*sizeof(juint));
add(table3, table0, 3*256*sizeof(juint));
+
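+ // In outline: the Neon path folds 32 bytes per iteration. The running
+ // remainder in v0:v1 is carry-less multiplied by x^288 and x^256 mod P
+ // (the k3/k4 constants stored after the tables) and xor-ed into the
+ // next 32 bytes of input. A 64-bit pmull needs the crypto extension,
+ // so the multiply is built from byte-wise pmulls against permuted
+ // copies of the constants. Needs at least 64 bytes of input.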
+ if (UseNeon) {
+ cmp(len, 64);
+ br(Assembler::LT, L_by16);
+ v_eor(v16, T16B, v16, v16);
+
+ Label L_fold;
+
+ add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
+
+ v_ld1(v0, v1, T2D, buf, 32);
+ v_ld1r(v4, T2D, tmp, 8);
+ v_ld1r(v5, T2D, tmp, 8);
+ v_ld1r(v6, T2D, tmp, 8);
+ v_ld1r(v7, T2D, tmp, 8);
+ v_mov(v16, T4S, 0, crc);
+
+ v_eor(v0, T16B, v0, v16);
+ sub(len, len, 64);
+
+ BIND(L_fold);
+ v_pmull(v22, T8H, v0, v5, T8B);
+ v_pmull(v20, T8H, v0, v7, T8B);
+ v_pmull(v23, T8H, v0, v4, T8B);
+ v_pmull(v21, T8H, v0, v6, T8B);
+
+ v_pmull2(v18, T8H, v0, v5, T16B);
+ v_pmull2(v16, T8H, v0, v7, T16B);
+ v_pmull2(v19, T8H, v0, v4, T16B);
+ v_pmull2(v17, T8H, v0, v6, T16B);
+
+ v_uzp1(v24, v20, v22, T8H);
+ v_uzp2(v25, v20, v22, T8H);
+ v_eor(v20, T16B, v24, v25);
+
+ v_uzp1(v26, v16, v18, T8H);
+ v_uzp2(v27, v16, v18, T8H);
+ v_eor(v16, T16B, v26, v27);
+
+ v_ushll2(v22, T4S, v20, T8H, 8);
+ v_ushll(v20, T4S, v20, T4H, 8);
+
+ v_ushll2(v18, T4S, v16, T8H, 8);
+ v_ushll(v16, T4S, v16, T4H, 8);
+
+ v_eor(v22, T16B, v23, v22);
+ v_eor(v18, T16B, v19, v18);
+ v_eor(v20, T16B, v21, v20);
+ v_eor(v16, T16B, v17, v16);
+
+ v_uzp1(v17, v16, v20, T2D);
+ v_uzp2(v21, v16, v20, T2D);
+ v_eor(v17, T16B, v17, v21);
+
+ v_ushll2(v20, T2D, v17, T4S, 16);
+ v_ushll(v16, T2D, v17, T2S, 16);
+
+ v_eor(v20, T16B, v20, v22);
+ v_eor(v16, T16B, v16, v18);
+
+ v_uzp1(v17, v20, v16, T2D);
+ v_uzp2(v21, v20, v16, T2D);
+ v_eor(v28, T16B, v17, v21);
+
+ v_pmull(v22, T8H, v1, v5, T8B);
+ v_pmull(v20, T8H, v1, v7, T8B);
+ v_pmull(v23, T8H, v1, v4, T8B);
+ v_pmull(v21, T8H, v1, v6, T8B);
+
+ v_pmull2(v18, T8H, v1, v5, T16B);
+ v_pmull2(v16, T8H, v1, v7, T16B);
+ v_pmull2(v19, T8H, v1, v4, T16B);
+ v_pmull2(v17, T8H, v1, v6, T16B);
+
+ v_ld1(v0, v1, T2D, buf, 32);
+
+ v_uzp1(v24, v20, v22, T8H);
+ v_uzp2(v25, v20, v22, T8H);
+ v_eor(v20, T16B, v24, v25);
+
+ v_uzp1(v26, v16, v18, T8H);
+ v_uzp2(v27, v16, v18, T8H);
+ v_eor(v16, T16B, v26, v27);
+
+ v_ushll2(v22, T4S, v20, T8H, 8);
+ v_ushll(v20, T4S, v20, T4H, 8);
+
+ v_ushll2(v18, T4S, v16, T8H, 8);
+ v_ushll(v16, T4S, v16, T4H, 8);
+
+ v_eor(v22, T16B, v23, v22);
+ v_eor(v18, T16B, v19, v18);
+ v_eor(v20, T16B, v21, v20);
+ v_eor(v16, T16B, v17, v16);
+
+ v_uzp1(v17, v16, v20, T2D);
+ v_uzp2(v21, v16, v20, T2D);
+ v_eor(v16, T16B, v17, v21);
+
+ v_ushll2(v20, T2D, v16, T4S, 16);
+ v_ushll(v16, T2D, v16, T2S, 16);
+
+ v_eor(v20, T16B, v22, v20);
+ v_eor(v16, T16B, v16, v18);
+
+ v_uzp1(v17, v20, v16, T2D);
+ v_uzp2(v21, v20, v16, T2D);
+ v_eor(v20, T16B, v17, v21);
+
+ v_shl(v16, v28, T2D, 1);
+ v_shl(v17, v20, T2D, 1);
+
+ v_eor(v0, T16B, v0, v16);
+ v_eor(v1, T16B, v1, v17);
+
+ subs(len, len, 32);
+ br(Assembler::GE, L_fold);
+
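+ // The fold loop leaves the final 32-byte remainder in v0:v1; push
+ // those four 64-bit words through the table-driven routine.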
+ mov(crc, 0);
+ v_mov(tmp, v0, T1D, 0);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+ v_mov(tmp, v0, T1D, 1);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+ v_mov(tmp, v1, T1D, 0);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+ v_mov(tmp, v1, T1D, 1);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+
+ add(len, len, 32);
+ }
+
+ BIND(L_by16);
subs(len, len, 16);
br(Assembler::GE, L_by16_loop);
adds(len, len, 16-4);
@@ -265,5 +265,12 @@
0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
- 0xde0506f1UL
+ 0xde0506f1UL,
+ // Constants for the Neon CRC32 implementation
+ // k3 = 0x78ED02D5 = x^288 mod poly (bit reversed)
+ // k4 = 0xED627DAE = x^256 mod poly (bit reversed)
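+ // The byte- and word-swapped copies below supply the same constants in
+ // the lane orders consumed by the byte-wise pmull decomposition.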
+ 0x78ED02D5UL, 0xED627DAEUL, // k4:k3
+ 0xED78D502UL, 0x62EDAE7DUL, // byte swap
+ 0x02D578EDUL, 0x7DAEED62UL, // word swap
+ 0xD502ED78UL, 0xAE7D62EDUL, // byte swap of word swap
};