Message ID: 1412789236-26461-3-git-send-email-charles.baylis@linaro.org
State: New
On 08/10/14 18:27, charles.baylis@linaro.org wrote:
> From: Charles Baylis <charles.baylis@linaro.org>
>
> This patch replaces the inline assembler implementations of the
> vld[234](q?)_lane_* intrinsics with new versions which exploit the new
> builtin functions added in patch 1.
>
> Tested (with the rest of the patch series) with make check on
> aarch64-oe-linux with qemu, and also causes no regressions in clyon's
> NEON intrinsics tests.
>
> <DATE>  Charles Baylis  <charles.baylis@linaro.org>
>
> 	* config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
> 	update uses to use new macro arguments.
> 	(__LD3_LANE_FUNC): Likewise.
> 	(__LD4_LANE_FUNC): Likewise.
>
> Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a
> ---
>  gcc/config/aarch64/arm_neon.h | 377 ++++++++++++++++++++++++++++--------------
>  1 file changed, 255 insertions(+), 122 deletions(-)
LGTM (but I can't approve it). Thanks for this patch.

Tejas.
On 8 October 2014 18:27, <charles.baylis@linaro.org> wrote:
> +#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, \

Just above the insertion point in arm_neon.h is the comment:

  /* Start of temporary inline asm for vldn, vstn and friends. */

This patch removes the "temporary inline asm vldn" implementation; the
replacement implementation should be inserted in the section below. Look
for the comment that starts: "Start of optimal implementations...."

> + mode, ptrmode, funcsuffix, signedtype) \
> +__extension__ static __inline intype __attribute__ ((__always_inline__)) \
> +vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{ \
> +  __builtin_aarch64_simd_oi __o; \
> +  largetype __temp; \
> +  __temp.val[0] = \

There is something odd about the whitespace here: space before tab? This
is repeated in various places through the rest of the patch.

Otherwise, this and the associated 1/2 patch look good. Can you respin
with these tidy-ups?

Thanks
/Marcus
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 9b1873f..19ce261 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -11805,47 +11805,83 @@ __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q)
 __LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q)
 __LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q)
 
-#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix, \
-                        lnsuffix, funcsuffix, Q) \
-  __extension__ static __inline rettype \
-  __attribute__ ((__always_inline__)) \
-  vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
-                                     rettype b, const int c) \
-  { \
-    rettype result; \
-    __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \
-             "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t" \
-             "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t" \
-             : "=Q"(result) \
-             : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
-             : "memory", "v16", "v17"); \
-    return result; \
-  }
-
-__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
-__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
-__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
-__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
-__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
-__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
-__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
-__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
-__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
-__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
-__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
-__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
-__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
-__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
-__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
-__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
-__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
-__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
+#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+                        mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_oi __o; \
+  largetype __temp; \
+  __temp.val[0] = \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+  __temp.val[1] = \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+  __o = __builtin_aarch64_set_qregoi##mode (__o, \
+                                            (signedtype) __temp.val[0], \
+                                            0); \
+  __o = __builtin_aarch64_set_qregoi##mode (__o, \
+                                            (signedtype) __temp.val[1], \
+                                            1); \
+  __o = __builtin_aarch64_ld2_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \
+  return __b; \
+}
+
+__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
+                 sf, f32, float32x4_t)
+__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
+                 df, f64, float64x2_t)
+__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
+                 int8x16_t)
+__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
+                 p16, int16x8_t)
+__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
+                 int8x16_t)
+__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
+                 int16x8_t)
+__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
+                 int32x4_t)
+__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
+                 int64x2_t)
+__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
+                 int8x16_t)
+__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
+                 u16, int16x8_t)
+__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
+                 u32, int32x4_t)
+__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
+                 u64, int64x2_t)
+
+#undef __LD2_LANE_FUNC
+#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_oi __o; \
+  intype ret; \
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_ld2_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \
+  return ret; \
+}
+
+__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
+__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
+__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
+__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
 
 #define __LD3R_FUNC(rettype, structtype, ptrtype, \
                     regsuffix, funcsuffix, Q) \
@@ -11887,47 +11923,91 @@ __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q)
 __LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q)
 __LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q)
 
-#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix, \
-                        lnsuffix, funcsuffix, Q) \
-  __extension__ static __inline rettype \
-  __attribute__ ((__always_inline__)) \
-  vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
-                                     rettype b, const int c) \
-  { \
-    rettype result; \
-    __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \
-             "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t" \
-             "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t" \
-             : "=Q"(result) \
-             : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
-             : "memory", "v16", "v17", "v18"); \
-    return result; \
-  }
-
-__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
-__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
-__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
-__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
-__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
-__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
-__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
-__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
-__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
-__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
-__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
-__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
-__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
-__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
-__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
-__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
-__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
-__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
-__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
-__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
-__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
-__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
+#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+                        mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_ci __o; \
+  largetype __temp; \
+  __temp.val[0] = \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+  __temp.val[1] = \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+  __temp.val[2] = \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
+  __o = __builtin_aarch64_set_qregci##mode (__o, \
+                                            (signedtype) __temp.val[0], \
+                                            0); \
+  __o = __builtin_aarch64_set_qregci##mode (__o, \
+                                            (signedtype) __temp.val[1], \
+                                            1); \
+  __o = __builtin_aarch64_set_qregci##mode (__o, \
+                                            (signedtype) __temp.val[2], \
+                                            2); \
+  __o = __builtin_aarch64_ld3_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \
+  return __b; \
+}
+
+__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
+                 sf, f32, float32x4_t)
+__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
+                 df, f64, float64x2_t)
+__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
+                 int8x16_t)
+__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
+                 p16, int16x8_t)
+__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
+                 int8x16_t)
+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
+                 int16x8_t)
+__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
+                 int32x4_t)
+__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
+                 int64x2_t)
+__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
+                 int8x16_t)
+__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
+                 u16, int16x8_t)
+__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
+                 u32, int32x4_t)
+__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
+                 u64, int64x2_t)
+
+#undef __LD3_LANE_FUNC
+#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_ci __o; \
+  intype ret; \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \
+  __o = __builtin_aarch64_ld3_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \
+  ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \
+  return ret; \
+}
+
+__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
+__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
+__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
+__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
 
 #define __LD4R_FUNC(rettype, structtype, ptrtype, \
                     regsuffix, funcsuffix, Q) \
@@ -11969,47 +12049,100 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q)
 __LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q)
 __LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q)
 
-#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix, \
-                        lnsuffix, funcsuffix, Q) \
-  __extension__ static __inline rettype \
-  __attribute__ ((__always_inline__)) \
-  vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
-                                     rettype b, const int c) \
-  { \
-    rettype result; \
-    __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \
-             "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t" \
-             "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t" \
-             : "=Q"(result) \
-             : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
-             : "memory", "v16", "v17", "v18", "v19"); \
-    return result; \
-  }

-__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
-__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
-__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
-__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
-__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
-__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
-__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
-__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
-__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
-__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
-__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
-__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
-__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
-__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
-__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
-__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
-__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
-__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
-__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
-__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
-__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
-__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
+#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+                        mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_xi __o; \
+  largetype __temp; \
+  __temp.val[0] = \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+  __temp.val[1] = \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+  __temp.val[2] = \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
+  __temp.val[3] = \
+    vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \
+  __o = __builtin_aarch64_set_qregxi##mode (__o, \
+                                            (signedtype) __temp.val[0], \
+                                            0); \
+  __o = __builtin_aarch64_set_qregxi##mode (__o, \
+                                            (signedtype) __temp.val[1], \
+                                            1); \
+  __o = __builtin_aarch64_set_qregxi##mode (__o, \
+                                            (signedtype) __temp.val[2], \
+                                            2); \
+  __o = __builtin_aarch64_set_qregxi##mode (__o, \
+                                            (signedtype) __temp.val[3], \
+                                            3); \
+  __o = __builtin_aarch64_ld4_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \
+  __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \
+  return __b; \
+}
+
+__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
+                 sf, f32, float32x4_t)
+__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
+                 df, f64, float64x2_t)
+__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
+                 int8x16_t)
+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
+                 p16, int16x8_t)
+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
+                 int8x16_t)
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
+                 int16x8_t)
+__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
+                 int32x4_t)
+__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
+                 int64x2_t)
+__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
+                 int8x16_t)
+__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
+                 u16, int16x8_t)
+__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
+                 u32, int32x4_t)
+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
+                 u64, int64x2_t)
+
+#undef __LD4_LANE_FUNC
+#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_xi __o; \
+  intype ret; \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \
+  __o = __builtin_aarch64_ld4_lane##mode ( \
+          (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \
+  ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \
+  ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \
+  return ret; \
+}
+
+__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
+__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
+__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
+__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
 
 #define __ST2_LANE_FUNC(intype, largetype, ptrtype, \
                         mode, ptr_mode, funcsuffix, signedtype) \
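To make the macro mechanics concrete, hand-expanding the new d-register
__LD2_LANE_FUNC for its f32 instantiation gives roughly the following
(a sketch: reindented, with explanatory comments added; not literal
patch text):

__extension__ static __inline float32x2x2_t
__attribute__ ((__always_inline__))
vld2_lane_f32 (const float32_t * __ptr, float32x2x2_t __b, const int __c)
{
  __builtin_aarch64_simd_oi __o;
  float32x4x2_t __temp;
  /* Widen each 64-bit input vector to a full q-register; the upper
     half is a don't-care, filled with zeros via vcreate_f32 (0).  */
  __temp.val[0] = vcombine_f32 (__b.val[0], vcreate_f32 (0));
  __temp.val[1] = vcombine_f32 (__b.val[1], vcreate_f32 (0));
  /* Pack the two q-registers into a single OI register-pair value.  */
  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[0], 0);
  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[1], 1);
  /* One LD2 (single structure to one lane) via the builtin from patch 1.  */
  __o = __builtin_aarch64_ld2_lanev4sf ((__builtin_aarch64_simd_sf *) __ptr,
                                        __o, __c);
  /* Extract the low 64 bits of each register of the pair.  */
  __b.val[0] = (float32x2_t) __builtin_aarch64_get_dregoidi (__o, 0);
  __b.val[1] = (float32x2_t) __builtin_aarch64_get_dregoidi (__o, 1);
  return __b;
}

Unlike the old inline asm, which pinned the operation to v16-v17 and
round-tripped through memory, this form leaves register allocation and
scheduling to the compiler.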
From: Charles Baylis <charles.baylis@linaro.org>

This patch replaces the inline assembler implementations of the
vld[234](q?)_lane_* intrinsics with new versions which exploit the new
builtin functions added in patch 1.

Tested (with the rest of the patch series) with make check on
aarch64-oe-linux with qemu, and also causes no regressions in clyon's
NEON intrinsics tests.

<DATE>  Charles Baylis  <charles.baylis@linaro.org>

	* config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
	update uses to use new macro arguments.
	(__LD3_LANE_FUNC): Likewise.
	(__LD4_LANE_FUNC): Likewise.

Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a
---
 gcc/config/aarch64/arm_neon.h | 377 ++++++++++++++++++++++++++++--------------
 1 file changed, 255 insertions(+), 122 deletions(-)
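For readers unfamiliar with these intrinsics, a minimal caller
illustrating the semantics the rewrite must preserve (an illustrative
sketch, not part of the patch; the function name is made up):

#include <arm_neon.h>

/* vld2_lane_f32 reads two consecutive floats and inserts them into the
   same lane of two vectors: xy[0] into lane 1 of acc.val[0] and xy[1]
   into lane 1 of acc.val[1].  All other lanes pass through unchanged.
   The lane index must be a compile-time constant.  */
float32x2x2_t
load_pair_into_lane1 (const float32_t *xy, float32x2x2_t acc)
{
  return vld2_lane_f32 (xy, acc, 1);
}

The vld3 and vld4 variants behave the same way, with three and four
vectors respectively.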