diff mbox

[3/4,AARCH64,NEON] Fix unnecessary moves in vld[234]q_* intrinsics

Message ID 1411069109-31425-4-git-send-email-charles.baylis@linaro.org
State New
Headers show

Commit Message

Charles Baylis Sept. 18, 2014, 7:38 p.m. UTC
This patch improves code generation of vld[234]q_* intrinsics by avoiding use
of the __builtin_aarch64_get_qreg_* builtins to generate a temporary result
variable. Instead, a union is used for type-punning, which avoids generation of
some unnecessary move instructions. This idiom is already used in several other
intrinsics.

This patch is independent of the previous patches in the series.

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu, and also causes no regressions in clyon's NEON intrinsics tests.

<DATE>  Charles Baylis  <charles.baylis@linaro.org>

	* config/aarch64/arm_neon.h (vld2q_s8, vld2q_p8, vld2q_s16, vld2q_p16,
	vld2q_s32, vld2q_s64, vld2q_u8, vld2q_u16, vld2q_u32, vld2q_u64,
	vld2q_f32, vld2q_f64, vld3q_s8, vld3q_p8, vld3q_s16, vld3q_p16,
	vld3q_s32, vld3q_s64, vld3q_u8, vld3q_u16, vld3q_u32, vld3q_u64,
	vld3q_f32, vld3q_f64, vld4q_s8, vld4q_p8, vld4q_s16, vld4q_p16,
	vld4q_s32, vld4q_s64, vld4q_u8, vld4q_u16, vld4q_u32, vld4q_u64,
	vld4q_f32, vld4q_f64): Use type-punning to convert between NEON
	intrinsic types and __builtin_aarch64_simd* types.

Change-Id: I61efa29138b13c7a83679885343211d604a73b15
---
 gcc/config/aarch64/arm_neon.h | 396 +++++++++++++++---------------------------
 1 file changed, 144 insertions(+), 252 deletions(-)
diff mbox

Patch

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c1fcb47..87e3baf 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -16969,133 +16969,109 @@  vld2_f32 (const float32_t * __a)
 __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
 vld2q_s8 (const int8_t * __a)
 {
-  int8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  union { int8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
 vld2q_p8 (const poly8_t * __a)
 {
-  poly8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  union { poly8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
 vld2q_s16 (const int16_t * __a)
 {
-  int16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  union { int16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
 vld2q_p16 (const poly16_t * __a)
 {
-  poly16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  union { poly16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
 vld2q_s32 (const int32_t * __a)
 {
-  int32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-  return ret;
+  union { int32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x2x2_t __attribute__ ((__always_inline__))
 vld2q_s64 (const int64_t * __a)
 {
-  int64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-  return ret;
+  union { int64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
 vld2q_u8 (const uint8_t * __a)
 {
-  uint8x16x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-  return ret;
+  union { uint8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
 vld2q_u16 (const uint16_t * __a)
 {
-  uint16x8x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-  return ret;
+  union { uint16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
 vld2q_u32 (const uint32_t * __a)
 {
-  uint32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-  return ret;
+  union { uint32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint64x2x2_t __attribute__ ((__always_inline__))
 vld2q_u64 (const uint64_t * __a)
 {
-  uint64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-  return ret;
+  union { uint64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vld2q_f32 (const float32_t * __a)
 {
-  float32x4x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
-  return ret;
+  union { float32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float64x2x2_t __attribute__ ((__always_inline__))
 vld2q_f64 (const float64_t * __a)
 {
-  float64x2x2_t ret;
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
-  return ret;
+  union { float64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
@@ -17245,145 +17221,109 @@  vld3_f32 (const float32_t * __a)
 __extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
 vld3q_s8 (const int8_t * __a)
 {
-  int8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  union { int8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
 vld3q_p8 (const poly8_t * __a)
 {
-  poly8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  union { poly8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
 vld3q_s16 (const int16_t * __a)
 {
-  int16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  union { int16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
 vld3q_p16 (const poly16_t * __a)
 {
-  poly16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  union { poly16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
 vld3q_s32 (const int32_t * __a)
 {
-  int32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-  return ret;
+  union { int32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x2x3_t __attribute__ ((__always_inline__))
 vld3q_s64 (const int64_t * __a)
 {
-  int64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-  return ret;
+  union { int64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
 vld3q_u8 (const uint8_t * __a)
 {
-  uint8x16x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-  return ret;
+  union { uint8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
 vld3q_u16 (const uint16_t * __a)
 {
-  uint16x8x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-  return ret;
+  union { uint16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
 vld3q_u32 (const uint32_t * __a)
 {
-  uint32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-  return ret;
+  union { uint32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint64x2x3_t __attribute__ ((__always_inline__))
 vld3q_u64 (const uint64_t * __a)
 {
-  uint64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-  return ret;
+  union { uint64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
 vld3q_f32 (const float32_t * __a)
 {
-  float32x4x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
-  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
-  return ret;
+  union { float32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float64x2x3_t __attribute__ ((__always_inline__))
 vld3q_f64 (const float64_t * __a)
 {
-  float64x2x3_t ret;
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
-  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
-  return ret;
+  union { float64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
@@ -17545,157 +17485,109 @@  vld4_f32 (const float32_t * __a)
 __extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
 vld4q_s8 (const int8_t * __a)
 {
-  int8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  union { int8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
 vld4q_p8 (const poly8_t * __a)
 {
-  poly8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  union { poly8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
 vld4q_s16 (const int16_t * __a)
 {
-  int16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  union { int16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
 vld4q_p16 (const poly16_t * __a)
 {
-  poly16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  union { poly16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
 vld4q_s32 (const int32_t * __a)
 {
-  int32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-  ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-  ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-  ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-  return ret;
+  union { int32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline int64x2x4_t __attribute__ ((__always_inline__))
 vld4q_s64 (const int64_t * __a)
 {
-  int64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-  ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-  ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-  ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-  return ret;
+  union { int64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
 vld4q_u8 (const uint8_t * __a)
 {
-  uint8x16x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-  ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-  ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-  ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-  ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-  return ret;
+  union { uint8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
 vld4q_u16 (const uint16_t * __a)
 {
-  uint16x8x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-  ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-  ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-  ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-  ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-  return ret;
+  union { uint16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
 vld4q_u32 (const uint32_t * __a)
 {
-  uint32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-  ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-  return ret;
+  union { uint32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline uint64x2x4_t __attribute__ ((__always_inline__))
 vld4q_u64 (const uint64_t * __a)
 {
-  uint64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-  ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-  return ret;
+  union { uint64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
 vld4q_f32 (const float32_t * __a)
 {
-  float32x4x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a);
-  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0);
-  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1);
-  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
-  ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
-  return ret;
+  union { float32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  return __temp.__i;
 }
 
 __extension__ static __inline float64x2x4_t __attribute__ ((__always_inline__))
 vld4q_f64 (const float64_t * __a)
 {
-  float64x2x4_t ret;
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
-  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
-  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
-  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
-  ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
-  return ret;
+  union { float64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp;
+  __temp.__o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
+  return __temp.__i;
 }
 
 /* vmax */