diff mbox

[4/4,AARCH64,NEON] Fix unnecessary moves in vst[234]q_* intrinsics

Message ID 1411069109-31425-5-git-send-email-charles.baylis@linaro.org
State New
Headers show

Commit Message

Charles Baylis Sept. 18, 2014, 7:38 p.m. UTC
This patch improves code generation of vst[234]q_* intrinsics by avoiding use
of the __builtin_aarch64_set_qreg_* builtins to generate a temporary
__builtin_aarch64_simd_XX variable. Instead, a union is used for type-punning,
which avoids generation of some unnecessary move instructions. This idiom is
already used in several other intrinsics.

This patch is independent of the previous patches in the series.

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu, and also causes no regressions in clyon's NEON intrinsics tests.

<DATE>  Charles Baylis  <charles.baylis@linaro.org>

	* config/aarch64/arm_neon.h (vst2q_s8, vst2q_p8, vst2q_s16, vst2q_p16,
	vst2q_s32, vst2q_s64, vst2q_u8, vst2q_u16, vst2q_u32, vst2q_u64,
	vst2q_f32, vst2q_f64, vst3q_s8, vst3q_p8, vst3q_s16, vst3q_p16,
	vst3q_s32, vst3q_s64, vst3q_u8, vst3q_u16, vst3q_u32, vst3q_u64,
	vst3q_f32, vst3q_f64, vst4q_s8, vst4q_p8, vst4q_s16, vst4q_p16,
	vst4q_s32, vst4q_s64, vst4q_u8, vst4q_u16, vst4q_u32, vst4q_u64,
	vst4q_f32, vst4q_f64): Use type-punning to convert between NEON
	intrinsic types and __builtin_aarch64_simd* types.

Change-Id: I789c68fc8d9458638eb00a15ffa28073bdc969a8
---
 gcc/config/aarch64/arm_neon.h | 288 ++++++++++++++++--------------------------
 1 file changed, 108 insertions(+), 180 deletions(-)
diff mbox

Patch

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 87e3baf..3292ce0 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -22493,109 +22493,97 @@  vst2_f32 (float32_t * __a, float32x2x2_t val)
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_s8 (int8_t * __a, int8x16x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { int8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_p8 (poly8_t * __a, poly8x16x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { poly8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_s16 (int16_t * __a, int16x8x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { int16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_p16 (poly16_t * __a, poly16x8x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { poly16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_s32 (int32_t * __a, int32x4x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
-  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { int32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_s64 (int64_t * __a, int64x2x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
-  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { int64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_u8 (uint8_t * __a, uint8x16x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { uint8x16x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_u16 (uint16_t * __a, uint16x8x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { uint16x8x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_u32 (uint32_t * __a, uint32x4x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
-  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { uint32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
-  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { uint64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_f32 (float32_t * __a, float32x4x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
-  __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+  union { float32x4x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_f64 (float64_t * __a, float64x2x2_t val)
 {
-  __builtin_aarch64_simd_oi __o;
-  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
-  __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+  union { float64x2x2_t __i;
+	  __builtin_aarch64_simd_oi __o; } __temp = { val };
+  __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __temp.__o);
 }
 
 __extension__ static __inline void
@@ -22769,121 +22757,97 @@  vst3_f32 (float32_t * __a, float32x2x3_t val)
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_s8 (int8_t * __a, int8x16x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { int8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_p8 (poly8_t * __a, poly8x16x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { poly8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_s16 (int16_t * __a, int16x8x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { int16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_p16 (poly16_t * __a, poly16x8x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { poly16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_s32 (int32_t * __a, int32x4x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
-  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { int32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_s64 (int64_t * __a, int64x2x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
-  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { int64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_u8 (uint8_t * __a, uint8x16x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { uint8x16x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_u16 (uint16_t * __a, uint16x8x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { uint16x8x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_u32 (uint32_t * __a, uint32x4x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
-  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { uint32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_u64 (uint64_t * __a, uint64x2x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
-  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { uint64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_f32 (float32_t * __a, float32x4x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
-  __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+  union { float32x4x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_f64 (float64_t * __a, float64x2x3_t val)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
-  __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+  union { float64x2x3_t __i;
+	  __builtin_aarch64_simd_ci __o; } __temp = { val };
+  __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __temp.__o);
 }
 
 __extension__ static __inline void
@@ -23081,133 +23045,97 @@  vst4_f32 (float32_t * __a, float32x2x4_t val)
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_s8 (int8_t * __a, int8x16x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { int8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_p8 (poly8_t * __a, poly8x16x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { poly8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_s16 (int16_t * __a, int16x8x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { int16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_p16 (poly16_t * __a, poly16x8x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { poly16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_s32 (int32_t * __a, int32x4x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
-  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { int32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_s64 (int64_t * __a, int64x2x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
-  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { int64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_u8 (uint8_t * __a, uint8x16x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  union { uint8x16x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_u16 (uint16_t * __a, uint16x8x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  union { uint16x8x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_u32 (uint32_t * __a, uint32x4x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
-  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  union { uint32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_u64 (uint64_t * __a, uint64x2x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
-  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  union { uint64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_f32 (float32_t * __a, float32x4x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3);
-  __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+  union { float32x4x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __temp.__o);
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_f64 (float64_t * __a, float64x2x4_t val)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3);
-  __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o);
+  union { float64x2x4_t __i;
+	  __builtin_aarch64_simd_xi __o; } __temp = { val };
+  __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __temp.__o);
 }
 
 /* vsub */