Message ID | 1465835500-11451-1-git-send-email-james.greenhalgh@arm.com |
---|---|
State | New |
Headers | show |
On Mon, Jun 13, 2016 at 05:31:40PM +0100, James Greenhalgh wrote: > > Hi, > > Inspired by Jiong's recent work, here are some more missing intrinsics, > and a smoke test for each of them. > > This patch covers: > > vcvt_n_f64_s64 > vcvt_n_f64_u64 > vcvt_n_s64_f64 > vcvt_n_u64_f64 > vcvt_f64_s64 > vrecpe_f64 > vcvt_f64_u64 > vrecps_f64 > > Tested on aarch64-none-elf, and on an internal testsuite for Neon > intrinsics. > > Note that the new tests will ICE without the fixups in > https://gcc.gnu.org/ml/gcc-patches/2016-06/msg00805.html > > OK? *ping* https://gcc.gnu.org/ml/gcc-patches/2016-06/msg00977.html Thanks, James > gcc/ChangeLog > > 2016-06-10 James Greenhalgh <james.greenhalgh@arm.com> > > * config/aarch64/arm_neon.h (vcvt_n_f64_s64): New. > (vcvt_n_f64_u64): Likewise. > (vcvt_n_s64_f64): Likewise. > (vcvt_n_u64_f64): Likewise. > (vcvt_f64_s64): Likewise. > (vrecpe_f64): Likewise. > (vcvt_f64_u64): Likewise. > (vrecps_f64): Likewise. > > gcc/testsuite/ChangeLog > > 2016-06-10 James Greenhalgh <james.greenhalgh@arm.com> > > * gcc.target/aarch64/vcvt_f64_1.c: New. > * gcc.target/aarch64/vcvt_n_f64_1.c: New. > * gcc.target/aarch64/vrecp_f64_1.c: New.
On 13/06/16 17:31, James Greenhalgh wrote: > > Hi, > > Inspired by Jiong's recent work, here are some more missing intrinsics, > and a smoke test for each of them. > > This patch covers: > > vcvt_n_f64_s64 > vcvt_n_f64_u64 > vcvt_n_s64_f64 > vcvt_n_u64_f64 > vcvt_f64_s64 > vrecpe_f64 > vcvt_f64_u64 > vrecps_f64 > > Tested on aarch64-none-elf, and on an internal testsuite for Neon > intrinsics. > > Note that the new tests will ICE without the fixups in > https://gcc.gnu.org/ml/gcc-patches/2016-06/msg00805.html > > OK? > OK, but please fix the nit that Kyrill highlighted. R. > Thanks, > James > > --- > gcc/ChangeLog > > 2016-06-10 James Greenhalgh <james.greenhalgh@arm.com> > > * config/aarch64/arm_neon.h (vcvt_n_f64_s64): New. > (vcvt_n_f64_u64): Likewise. > (vcvt_n_s64_f64): Likewise. > (vcvt_n_u64_f64): Likewise. > (vcvt_f64_s64): Likewise. > (vrecpe_f64): Likewise. > (vcvt_f64_u64): Likewise. > (vrecps_f64): Likewise. > > gcc/testsuite/ChangeLog > > 2016-06-10 James Greenhalgh <james.greenhalgh@arm.com> > > * gcc.target/aarch64/vcvt_f64_1.c: New. > * gcc.target/aarch64/vcvt_n_f64_1.c: New. > * gcc.target/aarch64/vrecp_f64_1.c: New. > > > 0001-Patch-AArch64-Add-some-more-missing-intrinsics.patch > > > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index f70b6d3..2f90938 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -12447,6 +12447,20 @@ vcvt_n_f32_u32 (uint32x2_t __a, const int __b) > return __builtin_aarch64_ucvtfv2si_sus (__a, __b); > } > > +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) > +vcvt_n_f64_s64 (int64x1_t __a, const int __b) > +{ > + return (float64x1_t) > + { __builtin_aarch64_scvtfdi (vget_lane_s64 (__a, 0), __b) }; > +} > + > +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) > +vcvt_n_f64_u64 (uint64x1_t __a, const int __b) > +{ > + return (float64x1_t) > + { __builtin_aarch64_ucvtfdi_sus (vget_lane_u64 (__a, 0), __b) }; > +} > + > __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) > vcvtq_n_f32_s32 (int32x4_t __a, const int __b) > { > @@ -12509,6 +12523,20 @@ vcvt_n_u32_f32 (float32x2_t __a, const int __b) > return __builtin_aarch64_fcvtzuv2sf_uss (__a, __b); > } > > +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) > +vcvt_n_s64_f64 (float64x1_t __a, const int __b) > +{ > + return (int64x1_t) > + { __builtin_aarch64_fcvtzsdf (vget_lane_f64 (__a, 0), __b) }; > +} > + > +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) > +vcvt_n_u64_f64 (float64x1_t __a, const int __b) > +{ > + return (uint64x1_t) > + { __builtin_aarch64_fcvtzudf_uss (vget_lane_f64 (__a, 0), __b) }; > +} > + > __extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > vcvtq_n_s32_f32 (float32x4_t __a, const int __b) > { > @@ -12571,6 +12599,18 @@ vcvt_f32_u32 (uint32x2_t __a) > return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a); > } > > +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) > +vcvt_f64_s64 (int64x1_t __a) > +{ > + return (float64x1_t) { vget_lane_s64 (__a, 0) }; > +} > + > +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) > +vcvt_f64_u64 (uint64x1_t __a) > +{ > + return (float64x1_t) { vget_lane_u64 (__a, 0) }; > +} > + > __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) > vcvtq_f32_s32 (int32x4_t __a) > { > @@ -20659,6 +20699,12 @@ vrecpe_f32 (float32x2_t __a) > return __builtin_aarch64_frecpev2sf (__a); > } > > +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) > +vrecpe_f64 (float64x1_t __a) > +{ > + return (float64x1_t) { vrecped_f64 (vget_lane_f64 (__a, 0)) }; > +} > + > __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) > vrecpeq_f32 (float32x4_t __a) > { > @@ -20691,6 +20737,13 @@ vrecps_f32 (float32x2_t __a, float32x2_t __b) > return __builtin_aarch64_frecpsv2sf (__a, __b); > } > > +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) > +vrecps_f64 (float64x1_t __a, float64x1_t __b) > +{ > + return (float64x1_t) { vrecpsd_f64 (vget_lane_f64 (__a, 0), > + vget_lane_f64 (__b, 0)) }; > +} > + > __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) > vrecpsq_f32 (float32x4_t __a, float32x4_t __b) > { > diff --git a/gcc/testsuite/gcc.target/aarch64/vcvt_f64_1.c b/gcc/testsuite/gcc.target/aarch64/vcvt_f64_1.c > new file mode 100644 > index 0000000..b7ee7af > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/vcvt_f64_1.c > @@ -0,0 +1,48 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > + > +#include "arm_neon.h" > + > +/* For each of these intrinsics, we're mapping to a simple C cast. > + While the compiler has some freedom in terms of choice of instruction, > + we'd hope that for this simple case it would always pick the single > + instruction form given in these tests. Anything else is likely a > + regression, so check for an exact instruction pattern and > + register allocation decision. */ > + > +/* Test that if we have a value already in Advanced-SIMD registers, we use > + the scalar register forms. */ > + > +float64x1_t > +test_vcvt_f64_s64_fpr (int64x1_t a) > +{ > + /* { dg-final { scan-assembler-times "scvtf\td0, d0" 1 } } */ > + return vcvt_f64_s64 (a); > +} > + > +float64x1_t > +test_vcvt_f64_u64_fpr (uint64x1_t a) > +{ > + /* { dg-final { scan-assembler-times "ucvtf\td0, d0" 1 } } */ > + return vcvt_f64_u64 (a); > +} > + > +/* Test that if we have an integer constructor, we use the general-purpose > + register forms for scvtf and ucvtf. */ > + > +float64x1_t > +test_vcvt_f64_s64_gpr (int64_t a) > +{ > + /* { dg-final { scan-assembler-times "scvtf\td0, x0" 1 } } */ > + int64x1_t b = (int64x1_t) { a }; > + return vcvt_f64_s64 (b); > +} > + > +float64x1_t > +test_vcvt_f64_u64_gpr (uint64_t a) > +{ > + /* { dg-final { scan-assembler-times "ucvtf\td0, x0" 1 } } */ > + uint64x1_t b = (uint64x1_t) { a }; > + return vcvt_f64_u64 (b); > +} > + > diff --git a/gcc/testsuite/gcc.target/aarch64/vcvt_n_f64_1.c b/gcc/testsuite/gcc.target/aarch64/vcvt_n_f64_1.c > new file mode 100644 > index 0000000..6fe16de > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/vcvt_n_f64_1.c > @@ -0,0 +1,80 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > + > +#include "arm_neon.h" > + > +/* For each of these intrinsics, we map directly to an unspec in RTL. > + We're just using the argument directly and returning the result, so we > + can precisely specify the exact instruction pattern and register > + allocations we expect. */ > + > +/* Test that if we have a value already in Advanced-SIMD registers, we use > + the scalar register forms. */ > + > +float64x1_t > +test_vcvt_n_f64_s64_fpr (int64x1_t a) > +{ > + /* { dg-final { scan-assembler-times "scvtf\td0, d0, #3" 1 } } */ > + return vcvt_n_f64_s64 (a, 3); > +} > + > +float64x1_t > +test_vcvt_n_f64_u64_fpr (uint64x1_t a) > +{ > + /* { dg-final { scan-assembler-times "ucvtf\td0, d0, #3" 1 } } */ > + return vcvt_n_f64_u64 (a, 3); > +} > + > +/* Test that if we have an integer constructor, we use the general-purpose > + register forms for scvtf and ucvtf. */ > + > +float64x1_t > +test_vcvt_n_f64_s64_gpr (int64_t a) > +{ > + /* { dg-final { scan-assembler-times "scvtf\td0, x0, #3" 1 } } */ > + int64x1_t b = (int64x1_t) { a }; > + return vcvt_n_f64_s64 (b, 3); > +} > + > +float64x1_t > +test_vcvt_n_f64_u64_gpr (uint64_t a) > +{ > + /* { dg-final { scan-assembler-times "ucvtf\td0, x0, #3" 1 } } */ > + uint64x1_t b = (uint64x1_t) { a }; > + return vcvt_n_f64_u64 (b, 3); > +} > + > +/* Test that a normal return through the Advanced-SIMD registers uses > + the scalar register form. */ > + > +int64x1_t > +test_vcvt_n_s64_f64_fpr (float64x1_t a) > +{ > + /* { dg-final { scan-assembler-times "fcvtzs\td0, d0, #3" 1 } } */ > + return vcvt_n_s64_f64 (a, 3); > +} > + > +uint64x1_t > +test_vcvt_n_u64_f64_fpr (float64x1_t a) > +{ > + /* { dg-final { scan-assembler-times "fcvtzu\td0, d0, #3" 1 } } */ > + return vcvt_n_u64_f64 (a, 3); > +} > + > +/* Test that a lane extracted return as a plain [u]int64_t uses > + the general-register forms of fcvtzs and fcvtzu. */ > + > +int64_t > +test_vcvt_n_s64_f64_gpr (float64x1_t a) > +{ > + /* { dg-final { scan-assembler-times "fcvtzs\tx0, d0, #3" 1 } } */ > + return vget_lane_s64 (vcvt_n_s64_f64 (a, 3), 0); > +} > + > +uint64_t > +test_vcvt_n_u64_f64_gpr (float64x1_t a) > +{ > + /* { dg-final { scan-assembler-times "fcvtzu\tx0, d0, #3" 1 } } */ > + return vget_lane_u64 (vcvt_n_u64_f64 (a, 3), 0); > +} > + > diff --git a/gcc/testsuite/gcc.target/aarch64/vrecp_f64_1.c b/gcc/testsuite/gcc.target/aarch64/vrecp_f64_1.c > new file mode 100644 > index 0000000..c61b2f1 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/vrecp_f64_1.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > + > +#include "arm_neon.h" > + > +/* For each of these intrinsics, we're mapping to an unspec in RTL. > + We therefore know the expected instruction choice and register pattern, > + so we can look for it exactly. */ > + > +float64x1_t > +test_vrecpe_f64 (float64x1_t a) > +{ > + /* { dg-final { scan-assembler-times "frecpe\td0, d0" 1 } } */ > + return vrecpe_f64 (a); > +} > + > +float64x1_t > +test_vrecps_f64 (float64x1_t a, float64x1_t b) > +{ > + /* { dg-final { scan-assembler-times "frecps\td0, d0, d1" 1 } } */ > + return vrecps_f64 (a, b); > +} > + >
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index f70b6d3..2f90938 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -12447,6 +12447,20 @@ vcvt_n_f32_u32 (uint32x2_t __a, const int __b) return __builtin_aarch64_ucvtfv2si_sus (__a, __b); } +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vcvt_n_f64_s64 (int64x1_t __a, const int __b) +{ + return (float64x1_t) + { __builtin_aarch64_scvtfdi (vget_lane_s64 (__a, 0), __b) }; +} + +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vcvt_n_f64_u64 (uint64x1_t __a, const int __b) +{ + return (float64x1_t) + { __builtin_aarch64_ucvtfdi_sus (vget_lane_u64 (__a, 0), __b) }; +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vcvtq_n_f32_s32 (int32x4_t __a, const int __b) { @@ -12509,6 +12523,20 @@ vcvt_n_u32_f32 (float32x2_t __a, const int __b) return __builtin_aarch64_fcvtzuv2sf_uss (__a, __b); } +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vcvt_n_s64_f64 (float64x1_t __a, const int __b) +{ + return (int64x1_t) + { __builtin_aarch64_fcvtzsdf (vget_lane_f64 (__a, 0), __b) }; +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcvt_n_u64_f64 (float64x1_t __a, const int __b) +{ + return (uint64x1_t) + { __builtin_aarch64_fcvtzudf_uss (vget_lane_f64 (__a, 0), __b) }; +} + __extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) vcvtq_n_s32_f32 (float32x4_t __a, const int __b) { @@ -12571,6 +12599,18 @@ vcvt_f32_u32 (uint32x2_t __a) return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a); } +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vcvt_f64_s64 (int64x1_t __a) +{ + return (float64x1_t) { vget_lane_s64 (__a, 0) }; +} + +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vcvt_f64_u64 (uint64x1_t __a) +{ + return (float64x1_t) { vget_lane_u64 (__a, 0) }; +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vcvtq_f32_s32 (int32x4_t __a) { @@ -20659,6 +20699,12 @@ vrecpe_f32 (float32x2_t __a) return __builtin_aarch64_frecpev2sf (__a); } +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vrecpe_f64 (float64x1_t __a) +{ + return (float64x1_t) { vrecped_f64 (vget_lane_f64 (__a, 0)) }; +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vrecpeq_f32 (float32x4_t __a) { @@ -20691,6 +20737,13 @@ vrecps_f32 (float32x2_t __a, float32x2_t __b) return __builtin_aarch64_frecpsv2sf (__a, __b); } +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vrecps_f64 (float64x1_t __a, float64x1_t __b) +{ + return (float64x1_t) { vrecpsd_f64 (vget_lane_f64 (__a, 0), + vget_lane_f64 (__b, 0)) }; +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vrecpsq_f32 (float32x4_t __a, float32x4_t __b) { diff --git a/gcc/testsuite/gcc.target/aarch64/vcvt_f64_1.c b/gcc/testsuite/gcc.target/aarch64/vcvt_f64_1.c new file mode 100644 index 0000000..b7ee7af --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vcvt_f64_1.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_neon.h" + +/* For each of these intrinsics, we're mapping to a simple C cast. + While the compiler has some freedom in terms of choice of instruction, + we'd hope that for this simple case it would always pick the single + instruction form given in these tests. Anything else is likely a + regression, so check for an exact instruction pattern and + register allocation decision. */ + +/* Test that if we have a value already in Advanced-SIMD registers, we use + the scalar register forms. */ + +float64x1_t +test_vcvt_f64_s64_fpr (int64x1_t a) +{ + /* { dg-final { scan-assembler-times "scvtf\td0, d0" 1 } } */ + return vcvt_f64_s64 (a); +} + +float64x1_t +test_vcvt_f64_u64_fpr (uint64x1_t a) +{ + /* { dg-final { scan-assembler-times "ucvtf\td0, d0" 1 } } */ + return vcvt_f64_u64 (a); +} + +/* Test that if we have an integer constructor, we use the general-purpose + register forms for scvtf and ucvtf. */ + +float64x1_t +test_vcvt_f64_s64_gpr (int64_t a) +{ + /* { dg-final { scan-assembler-times "scvtf\td0, x0" 1 } } */ + int64x1_t b = (int64x1_t) { a }; + return vcvt_f64_s64 (b); +} + +float64x1_t +test_vcvt_f64_u64_gpr (uint64_t a) +{ + /* { dg-final { scan-assembler-times "ucvtf\td0, x0" 1 } } */ + uint64x1_t b = (uint64x1_t) { a }; + return vcvt_f64_u64 (b); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/vcvt_n_f64_1.c b/gcc/testsuite/gcc.target/aarch64/vcvt_n_f64_1.c new file mode 100644 index 0000000..6fe16de --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vcvt_n_f64_1.c @@ -0,0 +1,80 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_neon.h" + +/* For each of these intrinsics, we map directly to an unspec in RTL. + We're just using the argument directly and returning the result, so we + can precisely specify the exact instruction pattern and register + allocations we expect. */ + +/* Test that if we have a value already in Advanced-SIMD registers, we use + the scalar register forms. */ + +float64x1_t +test_vcvt_n_f64_s64_fpr (int64x1_t a) +{ + /* { dg-final { scan-assembler-times "scvtf\td0, d0, #3" 1 } } */ + return vcvt_n_f64_s64 (a, 3); +} + +float64x1_t +test_vcvt_n_f64_u64_fpr (uint64x1_t a) +{ + /* { dg-final { scan-assembler-times "ucvtf\td0, d0, #3" 1 } } */ + return vcvt_n_f64_u64 (a, 3); +} + +/* Test that if we have an integer constructor, we use the general-purpose + register forms for scvtf and ucvtf. */ + +float64x1_t +test_vcvt_n_f64_s64_gpr (int64_t a) +{ + /* { dg-final { scan-assembler-times "scvtf\td0, x0, #3" 1 } } */ + int64x1_t b = (int64x1_t) { a }; + return vcvt_n_f64_s64 (b, 3); +} + +float64x1_t +test_vcvt_n_f64_u64_gpr (uint64_t a) +{ + /* { dg-final { scan-assembler-times "ucvtf\td0, x0, #3" 1 } } */ + uint64x1_t b = (uint64x1_t) { a }; + return vcvt_n_f64_u64 (b, 3); +} + +/* Test that a normal return through the Advanced-SIMD registers uses + the scalar register form. */ + +int64x1_t +test_vcvt_n_s64_f64_fpr (float64x1_t a) +{ + /* { dg-final { scan-assembler-times "fcvtzs\td0, d0, #3" 1 } } */ + return vcvt_n_s64_f64 (a, 3); +} + +uint64x1_t +test_vcvt_n_u64_f64_fpr (float64x1_t a) +{ + /* { dg-final { scan-assembler-times "fcvtzu\td0, d0, #3" 1 } } */ + return vcvt_n_u64_f64 (a, 3); +} + +/* Test that a lane extracted return as a plain [u]int64_t uses + the general-register forms of fcvtzs and fcvtzu. */ + +int64_t +test_vcvt_n_s64_f64_gpr (float64x1_t a) +{ + /* { dg-final { scan-assembler-times "fcvtzs\tx0, d0, #3" 1 } } */ + return vget_lane_s64 (vcvt_n_s64_f64 (a, 3), 0); +} + +uint64_t +test_vcvt_n_u64_f64_gpr (float64x1_t a) +{ + /* { dg-final { scan-assembler-times "fcvtzu\tx0, d0, #3" 1 } } */ + return vget_lane_u64 (vcvt_n_u64_f64 (a, 3), 0); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/vrecp_f64_1.c b/gcc/testsuite/gcc.target/aarch64/vrecp_f64_1.c new file mode 100644 index 0000000..c61b2f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vrecp_f64_1.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_neon.h" + +/* For each of these intrinsics, we're mapping to an unspec in RTL. + We therefore know the expected instruction choice and register pattern, + so we can look for it exactly. */ + +float64x1_t +test_vrecpe_f64 (float64x1_t a) +{ + /* { dg-final { scan-assembler-times "frecpe\td0, d0" 1 } } */ + return vrecpe_f64 (a); +} + +float64x1_t +test_vrecps_f64 (float64x1_t a, float64x1_t b) +{ + /* { dg-final { scan-assembler-times "frecps\td0, d0, d1" 1 } } */ + return vrecps_f64 (a, b); +} +