| Message ID | CAKdteObt_dP63aqn3eH6mHiK5zXP+Y_rL+DfN55D=WfK_4cVGw@mail.gmail.com |
| --- | --- |
| State | Superseded |
Ping?

On 15 September 2015 at 18:25, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
> existing builtins, and fixes the behaviour on aarch64_be.
>
> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>
> OK?
>
> Christophe.
Ping?
https://gcc.gnu.org/ml/gcc-patches/2015-09/msg01096.html

On 29 September 2015 at 22:57, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> Ping?
>
>
> On 15 September 2015 at 18:25, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
>> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
>> existing builtins, and fixes the behaviour on aarch64_be.
>>
>> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>>
>> OK?
>>
>> Christophe.
On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
> existing builtins, and fixes the behaviour on aarch64_be.
>
> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>
> OK?

Hi Christophe,

Sorry for the delay getting back to you, comments below.

> 2015-09-15  Christophe Lyon  <christophe.lyon@linaro.org>
>
> 	* config/aarch64/aarch64-builtins.c
> 	(aarch64_types_tbl_qualifiers): New static data.
> 	(TYPES_TBL): Define.
> 	* config/aarch64/aarch64-simd-builtins.def: Update builtins
> 	tables.
> 	* config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
> 	* config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
> 	(vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions.
> 	(vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other
> 	intrinsics.
> 	* config/aarch64/iterators.md (V8Q): New.

> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
> index 0f4f2b9..7ca3917 100644
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -253,6 +253,11 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>        qualifier_none, qualifier_struct_load_store_lane_index };
>  #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
>
> +static enum aarch64_type_qualifiers
> +aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS]
> +  = { qualifier_none, qualifier_none, qualifier_none };
> +#define TYPES_TBL (aarch64_types_tbl_qualifiers)
> +

Do we need these? This looks like TYPES_BINOP (the predicate on the
instruction pattern will prevent the "qualifier_maybe_immediate" from
becoming a problem).

>  #define CF0(N, X) CODE_FOR_aarch64_##N##X
>  #define CF1(N, X) CODE_FOR_##N##X##1
>  #define CF2(N, X) CODE_FOR_##N##X##2
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index d0f298a..62f1b13 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -405,3 +405,5 @@
>    VAR1 (BINOPP, crypto_pmull, 0, di)
>    VAR1 (BINOPP, crypto_pmull, 0, v2di)
>
> +  /* Implemented by aarch64_tbl3v8qi.  */
> +  BUILTIN_V8Q (TBL, tbl3, 0)

This can be:

  VAR1 (BINOP, tbl3, 0, v8qi)

It would be good if we could eliminate the casts in arm_neon.h by also
defining a "BINOPU" version of this, but I imagine that gets stuck on the
types accepted by __builtin_aarch64_set_qregoiv16qi - so don't worry about
making that change.

> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 9777418..84a61d5 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4716,6 +4714,16 @@
>    [(set_attr "type" "neon_tbl2_q")]
>  )
>
> +(define_insn "aarch64_tbl3v8qi"
> +  [(set (match_operand:V8QI 0 "register_operand" "=w")
> +	(unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
> +		      (match_operand:V8QI 2 "register_operand" "w")]
> +		      UNSPEC_TBL))]
> +  "TARGET_SIMD"
> +  "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
> +  [(set_attr "type" "neon_tbl3")]
> +)
> +
>  (define_insn_and_split "aarch64_combinev16qi"
>    [(set (match_operand:OI 0 "register_operand" "=w")
>  	(unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 87bbf6e..91704de 100644
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 6dfebe7..e8ee318 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
>  /* End of temporary inline asm. */
>
>  /* Start of optimal implementations in approved order. */
> @@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
>    return vbsl_p8 (__mask, __tbl, __r);
>  }
>
> +/* vtbx4 */
> +
> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
> +{
> +  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
> +			      vmov_n_u8 (32));
> +  int8x8_t __tbl = vtbl4_s8 (__tab, __idx);
> +
> +  return vbsl_s8 (__mask, __tbl, __r);
> +}
> +
> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
> +{
> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
> +  uint8x8_t __tbl = vtbl4_u8 (__tab, __idx);
> +
> +  return vbsl_u8 (__mask, __tbl, __r);
> +}
> +
> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
> +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
> +{
> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
> +  poly8x8_t __tbl = vtbl4_p8 (__tab, __idx);
> +
> +  return vbsl_p8 (__mask, __tbl, __r);
> +}
> +

Why do we want this for vtbx4 rather than putting out a VTBX instruction
directly (as in the inline asm versions you replace)?

This sequence does make sense for vtbx3.

>  /* vtrn */
>
>  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index b8a45d1..dfbd9cd 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -100,6 +100,8 @@
>  ;; All modes.
>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
>
> +(define_mode_iterator V8Q [V8QI])
> +

This can be dropped if you use VAR1 in aarch64-builtins.c.

Thanks for working on this, with your patch applied, the only
remaining intrinsics I see failing for aarch64_be are:

  vqtbl2_*8
  vqtbl2q_*8
  vqtbl3_*8
  vqtbl3q_*8
  vqtbl4_*8
  vqtbl4q_*8

  vqtbx2_*8
  vqtbx2q_*8
  vqtbx3_*8
  vqtbx3q_*8
  vqtbx4_*8
  vqtbx4q_*8

Thanks,
James
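For reference, the sequence James is querying reads more easily outside the diff. This is a minimal standalone sketch of the masked-TBL emulation the patch uses for vtbx4_u8 (the s8/p8 variants only differ by reinterpret casts); the function name is invented here for illustration:

  #include <arm_neon.h>

  /* Restatement of the patch's vtbx4_u8 body: a lane keeps the TBL result
     when its index is valid for a 4x8-byte table, and falls back to the
     destination value otherwise, which is what a hardware TBX does.  */
  uint8x8_t
  vtbx4_u8_sketch (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
  {
    /* All-ones in each lane whose index is below 32 (i.e. in range).  */
    uint8x8_t mask = vclt_u8 (idx, vmov_n_u8 (32));
    /* vtbl4 already returns 0 for out-of-range lanes (TBL semantics).  */
    uint8x8_t tbl = vtbl4_u8 (tab, idx);
    /* Bit-select: table result where the index was valid, else keep r.  */
    return vbsl_u8 (mask, tbl, r);
  }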
On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote: > On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote: >> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using >> existing builtins, and fixes the behaviour on aarch64_be. >> >> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model. >> >> OK? > > Hi Christophe, > > Sorry for the delay getting back to you, comments below. > >> 2015-09-15 Christophe Lyon <christophe.lyon@linaro.org> >> >> * config/aarch64/aarch64-builtins.c >> (aarch64_types_tbl_qualifiers): New static data. >> (TYPES_TBL): Define. >> * config/aarch64/aarch64-simd-builtins.def: Update builtins >> tables. >> * config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New. >> * config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8) >> (vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions. >> (vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other >> intrinsics. >> * config/aarch64/iterators.md (V8Q): New. > >> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c >> index 0f4f2b9..7ca3917 100644 >> --- a/gcc/config/aarch64/aarch64-builtins.c >> +++ b/gcc/config/aarch64/aarch64-builtins.c >> @@ -253,6 +253,11 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] >> qualifier_none, qualifier_struct_load_store_lane_index }; >> #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers) >> >> +static enum aarch64_type_qualifiers >> +aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS] >> + = { qualifier_none, qualifier_none, qualifier_none }; >> +#define TYPES_TBL (aarch64_types_tbl_qualifiers) >> + > > Do we need these? This looks like TYPES_BINOP (the predicate on the > instruction pattern will prevent the "qualifier_maybe_immediate" from > becoming a problem). > I'll give it a try, indeed I feared "qualifier_maybe_immediate" would cause problems. >> #define CF0(N, X) CODE_FOR_aarch64_##N##X >> #define CF1(N, X) CODE_FOR_##N##X##1 >> #define CF2(N, X) CODE_FOR_##N##X##2 >> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def >> index d0f298a..62f1b13 100644 >> --- a/gcc/config/aarch64/aarch64-simd-builtins.def >> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def >> @@ -405,3 +405,5 @@ >> VAR1 (BINOPP, crypto_pmull, 0, di) >> VAR1 (BINOPP, crypto_pmull, 0, v2di) >> >> + /* Implemented by aarch64_tbl3v8qi. */ >> + BUILTIN_V8Q (TBL, tbl3, 0) > > This can be: > > VAR1 (BINOP, tbl3, 0, v8qi) > > It would be good if we could eliminate the casts in arm_neon.h by also > defining a "BINOPU" version of this, but I imagine that gets stuck on the > types accepted by __builtin_aarch64_set_qregoiv16qi - so don't worry about > making that change. 
OK > >> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md >> index 9777418..84a61d5 100644 >> --- a/gcc/config/aarch64/aarch64-simd.md >> +++ b/gcc/config/aarch64/aarch64-simd.md >> @@ -4716,6 +4714,16 @@ >> [(set_attr "type" "neon_tbl2_q")] >> ) >> >> +(define_insn "aarch64_tbl3v8qi" >> + [(set (match_operand:V8QI 0 "register_operand" "=w") >> + (unspec:V8QI [(match_operand:OI 1 "register_operand" "w") >> + (match_operand:V8QI 2 "register_operand" "w")] >> + UNSPEC_TBL))] >> + "TARGET_SIMD" >> + "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b" >> + [(set_attr "type" "neon_tbl3")] >> +) >> + >> (define_insn_and_split "aarch64_combinev16qi" >> [(set (match_operand:OI 0 "register_operand" "=w") >> (unspec:OI [(match_operand:V16QI 1 "register_operand" "w") >> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c >> index 87bbf6e..91704de 100644 >> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h >> index 6dfebe7..e8ee318 100644 >> --- a/gcc/config/aarch64/arm_neon.h >> +++ b/gcc/config/aarch64/arm_neon.h >> /* End of temporary inline asm. */ >> >> /* Start of optimal implementations in approved order. */ >> @@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) >> return vbsl_p8 (__mask, __tbl, __r); >> } >> >> +/* vtbx4 */ >> + >> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) >> +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) >> +{ >> + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), >> + vmov_n_u8 (32)); >> + int8x8_t __tbl = vtbl4_s8 (__tab, __idx); >> + >> + return vbsl_s8 (__mask, __tbl, __r); >> +} >> + >> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) >> +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) >> +{ >> + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32)); >> + uint8x8_t __tbl = vtbl4_u8 (__tab, __idx); >> + >> + return vbsl_u8 (__mask, __tbl, __r); >> +} >> + >> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) >> +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) >> +{ >> + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32)); >> + poly8x8_t __tbl = vtbl4_p8 (__tab, __idx); >> + >> + return vbsl_p8 (__mask, __tbl, __r); >> +} >> + > > Why do we want this for vtbx4 rather than putting out a VTBX instruction > directly (as in the inline asm versions you replace)? > I just followed the pattern used for vtbx3. > This sequence does make sense for vtbx3. In fact, I don't see why vtbx3 and vtbx4 should be different? >> /* vtrn */ >> >> __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) >> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md >> index b8a45d1..dfbd9cd 100644 >> --- a/gcc/config/aarch64/iterators.md >> +++ b/gcc/config/aarch64/iterators.md >> @@ -100,6 +100,8 @@ >> ;; All modes. >> (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF]) >> >> +(define_mode_iterator V8Q [V8QI]) >> + > > This can be dropped if you use VAR1 in aarch64-builtins.c. > > Thanks for working on this, with your patch applied, the only > remaining intrinsics I see failing for aarch64_be are: > > vqtbl2_*8 > vqtbl2q_*8 > vqtbl3_*8 > vqtbl3q_*8 > vqtbl4_*8 > vqtbl4q_*8 > > vqtbx2_*8 > vqtbx2q_*8 > vqtbx3_*8 > vqtbx3q_*8 > vqtbx4_*8 > vqtbx4q_*8 > Quite possibly. Which tests are you looking at? Since these are aarch64-specific, they are not part of the tests I added (advsimd-intrinsics). 
Do you mean gcc.target/aarch64/table-intrinsics.c? > Thanks, > James >
On Wed, Oct 07, 2015 at 09:07:30PM +0100, Christophe Lyon wrote:
> On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> > On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
> >
> > Why do we want this for vtbx4 rather than putting out a VTBX instruction
> > directly (as in the inline asm versions you replace)?
> >
> I just followed the pattern used for vtbx3.
>
> > This sequence does make sense for vtbx3.
> In fact, I don't see why vtbx3 and vtbx4 should be different?

The difference between TBL and TBX is in their handling of a request to
select an out-of-range value. For TBL this returns zero, for TBX this
returns the value which was already in the destination register.

Because the byte-vectors used by the TBX instruction in aarch64 are 128-bit
(so two of them together allow selecting elements in the range 0-31), and
vtbx3 needs to emulate the AArch32 behaviour of picking elements from
3x64-bit vectors (allowing elements in the range 0-23), we need to manually
check for values which would have been out-of-range on AArch32, but are not
out of range for AArch64, and handle them appropriately.

For vtbx4, on the other hand, 2x128-bit registers give the range 0..31 and
4x64-bit registers give the range 0..31, so we don't need the special
masked handling.

You can find the suggested instruction sequences for the Neon intrinsics
in this document:

http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf

> >> /* vtrn */
> >>
> >> __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> >> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> >> index b8a45d1..dfbd9cd 100644
> >> --- a/gcc/config/aarch64/iterators.md
> >> +++ b/gcc/config/aarch64/iterators.md
> >> @@ -100,6 +100,8 @@
> >>  ;; All modes.
> >>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
> >>
> >> +(define_mode_iterator V8Q [V8QI])
> >> +
> >
> > This can be dropped if you use VAR1 in aarch64-builtins.c.
> >
> > Thanks for working on this, with your patch applied, the only
> > remaining intrinsics I see failing for aarch64_be are:
> >
> >   vqtbl2_*8
> >   vqtbl2q_*8
> >   vqtbl3_*8
> >   vqtbl3q_*8
> >   vqtbl4_*8
> >   vqtbl4q_*8
> >
> >   vqtbx2_*8
> >   vqtbx2q_*8
> >   vqtbx3_*8
> >   vqtbx3q_*8
> >   vqtbx4_*8
> >   vqtbx4q_*8
> >
> Quite possibly. Which tests are you looking at? Since these are
> aarch64-specific, they are not part of the
> tests I added (advsimd-intrinsics). Do you mean
> gcc.target/aarch64/table-intrinsics.c?

Sorry, yes I should have given a reference. I'm running with a variant of
a testcase from the LLVM test-suite repository:

  SingleSource/UnitTests/Vector/AArch64/aarch64_neon_intrinsics.c

This has an execute test for most of the intrinsics specified for AArch64.
It needs some modification to cover the intrinsics we don't implement yet.

Thanks,
James
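To make the TBL/TBX distinction above concrete, here is a small illustrative function (not taken from the thread; the function and variable names are invented) that contrasts the two out-of-range behaviours using the two-register forms. This is also why vtbx3 needs the explicit mask: indices 24..31 are out of range for the AArch32-style 3x8-byte table, so they must leave the destination lane untouched, yet they are in range for the zero-padded AArch64 table and would otherwise read the padding.

  #include <arm_neon.h>

  /* Illustration only: TBL vs TBX handling of an out-of-range index for a
     2x8-byte table (valid indices are 0..15).  */
  uint8x8_t
  tbl_vs_tbx_demo (uint8x8x2_t tab, uint8x8_t r)
  {
    uint8x8_t idx = vmov_n_u8 (200);        /* every lane out of range */
    uint8x8_t a = vtbl2_u8 (tab, idx);      /* TBL: out-of-range lanes read as 0 */
    uint8x8_t b = vtbx2_u8 (r, tab, idx);   /* TBX: out-of-range lanes keep r    */
    /* Here a is all zeroes and b equals r, so a ^ b == r.  */
    return veor_u8 (a, b);
  }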
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index 0f4f2b9..7ca3917 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -253,6 +253,11 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] qualifier_none, qualifier_struct_load_store_lane_index }; #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_none }; +#define TYPES_TBL (aarch64_types_tbl_qualifiers) + #define CF0(N, X) CODE_FOR_aarch64_##N##X #define CF1(N, X) CODE_FOR_##N##X##1 #define CF2(N, X) CODE_FOR_##N##X##2 diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index d0f298a..62f1b13 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -405,3 +405,5 @@ VAR1 (BINOPP, crypto_pmull, 0, di) VAR1 (BINOPP, crypto_pmull, 0, v2di) + /* Implemented by aarch64_tbl3v8qi. */ + BUILTIN_V8Q (TBL, tbl3, 0) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 9777418..84a61d5 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4716,6 +4714,16 @@ [(set_attr "type" "neon_tbl2_q")] ) +(define_insn "aarch64_tbl3v8qi" + [(set (match_operand:V8QI 0 "register_operand" "=w") + (unspec:V8QI [(match_operand:OI 1 "register_operand" "w") + (match_operand:V8QI 2 "register_operand" "w")] + UNSPEC_TBL))] + "TARGET_SIMD" + "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b" + [(set_attr "type" "neon_tbl3")] +) + (define_insn_and_split "aarch64_combinev16qi" [(set (match_operand:OI 0 "register_operand" "=w") (unspec:OI [(match_operand:V16QI 1 "register_operand" "w") diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 87bbf6e..91704de 100644 diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 6dfebe7..e8ee318 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -10902,13 +10902,14 @@ vtbl3_s8 (int8x8x3_t tab, int8x8_t idx) { int8x8_t result; int8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = __builtin_aarch64_tbl3v8qi (__o, idx); return result; } @@ -10917,13 +10918,14 @@ vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx) { uint8x8_t result; uint8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); return result; } @@ -10932,13 +10934,14 @@ vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx) { 
poly8x8_t result; poly8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); return result; } @@ -10947,13 +10950,14 @@ vtbl4_s8 (int8x8x4_t tab, int8x8_t idx) { int8x8_t result; int8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = __builtin_aarch64_tbl3v8qi (__o, idx); return result; } @@ -10962,13 +10966,14 @@ vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx) { uint8x8_t result; uint8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); return result; } @@ -10977,13 +10982,14 @@ vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx) { poly8x8_t result; poly8x16x2_t temp; + __builtin_aarch64_simd_oi __o; temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) temp.val[1], 1); + result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); return result; } @@ -11023,51 +11029,6 @@ vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx) -{ - int8x8_t result = r; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx) -{ - uint8x8_t result = r; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); 
- return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx) -{ - poly8x8_t result = r; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; -} - /* End of temporary inline asm. */ /* Start of optimal implementations in approved order. */ @@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) return vbsl_p8 (__mask, __tbl, __r); } +/* vtbx4 */ + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), + vmov_n_u8 (32)); + int8x8_t __tbl = vtbl4_s8 (__tab, __idx); + + return vbsl_s8 (__mask, __tbl, __r); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32)); + uint8x8_t __tbl = vtbl4_u8 (__tab, __idx); + + return vbsl_u8 (__mask, __tbl, __r); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32)); + poly8x8_t __tbl = vtbl4_p8 (__tab, __idx); + + return vbsl_p8 (__mask, __tbl, __r); +} + /* vtrn */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index b8a45d1..dfbd9cd 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -100,6 +100,8 @@ ;; All modes. (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF]) +(define_mode_iterator V8Q [V8QI]) + ;; All vector modes and DI. (define_mode_iterator VALLDI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF DI])
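As a quick sanity check of the reworked paths in the patch above, a hypothetical standalone program along these lines (not part of the patch or the GCC testsuite) can be built for both aarch64 and aarch64_be; with the patch applied both runs should print the same bytes, with out-of-range lanes showing 0x00 for vtbl4 and the 0xaa fallback for vtbx4.

  #include <arm_neon.h>
  #include <stdio.h>

  int
  main (void)
  {
    /* Table holds bytes 0..31, split across four 8-byte vectors.  */
    uint8_t tab_bytes[32];
    for (int i = 0; i < 32; i++)
      tab_bytes[i] = (uint8_t) i;

    uint8x8x4_t tab;
    tab.val[0] = vld1_u8 (tab_bytes + 0);
    tab.val[1] = vld1_u8 (tab_bytes + 8);
    tab.val[2] = vld1_u8 (tab_bytes + 16);
    tab.val[3] = vld1_u8 (tab_bytes + 24);

    /* Indices 0..31 are valid; 32 and 255 are out of range.  */
    static const uint8_t idx_bytes[8] = { 0, 7, 8, 16, 24, 31, 32, 255 };
    uint8x8_t idx = vld1_u8 (idx_bytes);
    uint8x8_t r = vmov_n_u8 (0xaa);

    uint8_t out_tbl[8], out_tbx[8];
    vst1_u8 (out_tbl, vtbl4_u8 (tab, idx));     /* expect 00 07 08 10 18 1f 00 00 */
    vst1_u8 (out_tbx, vtbx4_u8 (r, tab, idx));  /* expect 00 07 08 10 18 1f aa aa */

    for (int i = 0; i < 8; i++)
      printf ("%02x %02x\n", out_tbl[i], out_tbx[i]);
    return 0;
  }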