[AArch64_be] Fix vtbl[34] and vtbx4

Message ID CAKdteObt_dP63aqn3eH6mHiK5zXP+Y_rL+DfN55D=WfK_4cVGw@mail.gmail.com
State Superseded

Commit Message

Christophe Lyon Sept. 15, 2015, 4:25 p.m. UTC
This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
existing builtins, and fixes the behaviour on aarch64_be.

Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.

OK?

Christophe.
2015-09-15  Christophe Lyon  <christophe.lyon@linaro.org>

	* config/aarch64/aarch64-builtins.c
	(aarch64_types_tbl_qualifiers): New static data.
	(TYPES_TBL): Define.
	* config/aarch64/aarch64-simd-builtins.def: Update builtins
	tables.
	* config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
	* config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
	(vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions.
	(vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other
	intrinsics.
	* config/aarch64/iterators.md (V8Q): New.

Comments

Christophe Lyon Sept. 29, 2015, 8:57 p.m. UTC | #1
Ping?


On 15 September 2015 at 18:25, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
> existing builtins, and fixes the behaviour on aarch64_be.
>
> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>
> OK?
>
> Christophe.
Christophe Lyon Oct. 7, 2015, 9:23 a.m. UTC | #2
Ping?
https://gcc.gnu.org/ml/gcc-patches/2015-09/msg01096.html

On 29 September 2015 at 22:57, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
> Ping?
>
>
> On 15 September 2015 at 18:25, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
>> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
>> existing builtins, and fixes the behaviour on aarch64_be.
>>
>> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>>
>> OK?
>>
>> Christophe.
James Greenhalgh Oct. 7, 2015, 3:09 p.m. UTC | #3
On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
> existing builtins, and fixes the behaviour on aarch64_be.
> 
> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
> 
> OK?

Hi Christophe,

Sorry for the delay getting back to you, comments below.

> 2015-09-15  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	* config/aarch64/aarch64-builtins.c
> 	(aarch64_types_tbl_qualifiers): New static data.
> 	(TYPES_TBL): Define.
> 	* config/aarch64/aarch64-simd-builtins.def: Update builtins
> 	tables.
> 	* config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
> 	* config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
> 	(vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions.
> 	(vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other
> 	intrinsics.
> 	* config/aarch64/iterators.md (V8Q): New.

> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
> index 0f4f2b9..7ca3917 100644
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -253,6 +253,11 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>        qualifier_none, qualifier_struct_load_store_lane_index };
>  #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
>  
> +static enum aarch64_type_qualifiers
> +aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS]
> +  = { qualifier_none, qualifier_none, qualifier_none };
> +#define TYPES_TBL (aarch64_types_tbl_qualifiers)
> +

Do we need these? This looks like TYPES_BINOP (the predicate on the
instruction pattern will prevent the "qualifier_maybe_immediate" from
becoming a problem).
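
For reference (quoting roughly from aarch64-builtins.c), the existing binop
qualifier set only differs in its last slot:

  static enum aarch64_type_qualifiers
  aarch64_types_binop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
    = { qualifier_none, qualifier_none, qualifier_maybe_immediate };
  #define TYPES_BINOP (aarch64_types_binop_qualifiers)

and the register_operand predicate on the index operand of aarch64_tbl3v8qi
means the "maybe immediate" qualifier can never bite.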

>  #define CF0(N, X) CODE_FOR_aarch64_##N##X
>  #define CF1(N, X) CODE_FOR_##N##X##1
>  #define CF2(N, X) CODE_FOR_##N##X##2
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index d0f298a..62f1b13 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -405,3 +405,5 @@
>    VAR1 (BINOPP, crypto_pmull, 0, di)
>    VAR1 (BINOPP, crypto_pmull, 0, v2di)
>  
> +  /* Implemented by aarch64_tbl3v8qi.  */
> +  BUILTIN_V8Q (TBL, tbl3, 0)

This can be:

  VAR1 (BINOP, tbl3, 0, v8qi)

It would be good if we could eliminate the casts in arm_neon.h by also
defining a  "BINOPU" version of this, but I imagine that gets stuck on the
types accepted by __builtin_aarch64_set_qregoiv16qi - so don't worry about
making that change.
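
Concretely, the casts in question are the ones like:

  result = (uint8x8_t) __builtin_aarch64_tbl3v8qi (__o, (int8x8_t) idx);

in the new vtbl3_u8/vtbl4_u8 bodies below; an unsigned-qualified builtin
would take and return the unsigned vector types directly, but the OI value
is still assembled from int8x16_t via __builtin_aarch64_set_qregoiv16qi.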


> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 9777418..84a61d5 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4716,6 +4714,16 @@
>    [(set_attr "type" "neon_tbl2_q")]
>  )
>  
> +(define_insn "aarch64_tbl3v8qi"
> +  [(set (match_operand:V8QI 0 "register_operand" "=w")
> +	(unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
> +		      (match_operand:V8QI 2 "register_operand" "w")]
> +		      UNSPEC_TBL))]
> +  "TARGET_SIMD"
> +  "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
> +  [(set_attr "type" "neon_tbl3")]
> +)
> +
>  (define_insn_and_split "aarch64_combinev16qi"
>    [(set (match_operand:OI 0 "register_operand" "=w")
>  	(unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 87bbf6e..91704de 100644
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 6dfebe7..e8ee318 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
>  /* End of temporary inline asm.  */
>  
>  /* Start of optimal implementations in approved order.  */
> @@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
>    return vbsl_p8 (__mask, __tbl, __r);
>  }
>  
> +/* vtbx4  */
> +
> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
> +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
> +{
> +  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
> +			      vmov_n_u8 (32));
> +  int8x8_t __tbl = vtbl4_s8 (__tab, __idx);
> +
> +  return vbsl_s8 (__mask, __tbl, __r);
> +}
> +
> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
> +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
> +{
> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
> +  uint8x8_t __tbl = vtbl4_u8 (__tab, __idx);
> +
> +  return vbsl_u8 (__mask, __tbl, __r);
> +}
> +
> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
> +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
> +{
> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
> +  poly8x8_t __tbl = vtbl4_p8 (__tab, __idx);
> +
> +  return vbsl_p8 (__mask, __tbl, __r);
> +}
> +

Why do we want this for vtbx4 rather than putting out a VTBX instruction
directly (as in the inline asm versions you replace)?

This sequence does make sense for vtbx3.

>  /* vtrn */
>  
>  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index b8a45d1..dfbd9cd 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -100,6 +100,8 @@
>  ;; All modes.
>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
>  
> +(define_mode_iterator V8Q [V8QI])
> +

This can be dropped if you use VAR1 in aarch64-builtins.c.

Thanks for working on this, with your patch applied, the only
remaining intrinsics I see failing for aarch64_be are:

  vqtbl2_*8
  vqtbl2q_*8
  vqtbl3_*8
  vqtbl3q_*8
  vqtbl4_*8
  vqtbl4q_*8

  vqtbx2_*8
  vqtbx2q_*8
  vqtbx3_*8
  vqtbx3q_*8
  vqtbx4_*8
  vqtbx4q_*8

Thanks,
James
Christophe Lyon Oct. 7, 2015, 8:07 p.m. UTC | #4
On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
>> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
>> existing builtins, and fixes the behaviour on aarch64_be.
>>
>> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation Model.
>>
>> OK?
>
> Hi Christophe,
>
> Sorry for the delay getting back to you, comments below.
>
>> 2015-09-15  Christophe Lyon  <christophe.lyon@linaro.org>
>>
>>       * config/aarch64/aarch64-builtins.c
>>       (aarch64_types_tbl_qualifiers): New static data.
>>       (TYPES_TBL): Define.
>>       * config/aarch64/aarch64-simd-builtins.def: Update builtins
>>       tables.
>>       * config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
>>       * config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
>>       (vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions.
>>       (vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other
>>       intrinsics.
>>       * config/aarch64/iterators.md (V8Q): New.
>
>> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
>> index 0f4f2b9..7ca3917 100644
>> --- a/gcc/config/aarch64/aarch64-builtins.c
>> +++ b/gcc/config/aarch64/aarch64-builtins.c
>> @@ -253,6 +253,11 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>>        qualifier_none, qualifier_struct_load_store_lane_index };
>>  #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
>>
>> +static enum aarch64_type_qualifiers
>> +aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>> +  = { qualifier_none, qualifier_none, qualifier_none };
>> +#define TYPES_TBL (aarch64_types_tbl_qualifiers)
>> +
>
> Do we need these? This looks like TYPES_BINOP (the predicate on the
> instruction pattern will prevent the "qualifier_maybe_immediate" from
> becoming a problem).
>
I'll give it a try; indeed, I feared "qualifier_maybe_immediate" would
cause problems.

>>  #define CF0(N, X) CODE_FOR_aarch64_##N##X
>>  #define CF1(N, X) CODE_FOR_##N##X##1
>>  #define CF2(N, X) CODE_FOR_##N##X##2
>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
>> index d0f298a..62f1b13 100644
>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>> @@ -405,3 +405,5 @@
>>    VAR1 (BINOPP, crypto_pmull, 0, di)
>>    VAR1 (BINOPP, crypto_pmull, 0, v2di)
>>
>> +  /* Implemented by aarch64_tbl3v8qi.  */
>> +  BUILTIN_V8Q (TBL, tbl3, 0)
>
> This can be:
>
>   VAR1 (BINOP, tbl3, 0, v8qi)
>
> It would be good if we could eliminate the casts in arm_neon.h by also
> defining a  "BINOPU" version of this, but I imagine that gets stuck on the
> types accepted by __builtin_aarch64_set_qregoiv16qi - so don't worry about
> making that change.
OK

>
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index 9777418..84a61d5 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -4716,6 +4714,16 @@
>>    [(set_attr "type" "neon_tbl2_q")]
>>  )
>>
>> +(define_insn "aarch64_tbl3v8qi"
>> +  [(set (match_operand:V8QI 0 "register_operand" "=w")
>> +     (unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
>> +                   (match_operand:V8QI 2 "register_operand" "w")]
>> +                   UNSPEC_TBL))]
>> +  "TARGET_SIMD"
>> +  "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
>> +  [(set_attr "type" "neon_tbl3")]
>> +)
>> +
>>  (define_insn_and_split "aarch64_combinev16qi"
>>    [(set (match_operand:OI 0 "register_operand" "=w")
>>       (unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index 87bbf6e..91704de 100644
>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
>> index 6dfebe7..e8ee318 100644
>> --- a/gcc/config/aarch64/arm_neon.h
>> +++ b/gcc/config/aarch64/arm_neon.h
>>  /* End of temporary inline asm.  */
>>
>>  /* Start of optimal implementations in approved order.  */
>> @@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
>>    return vbsl_p8 (__mask, __tbl, __r);
>>  }
>>
>> +/* vtbx4  */
>> +
>> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
>> +{
>> +  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
>> +                           vmov_n_u8 (32));
>> +  int8x8_t __tbl = vtbl4_s8 (__tab, __idx);
>> +
>> +  return vbsl_s8 (__mask, __tbl, __r);
>> +}
>> +
>> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
>> +{
>> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
>> +  uint8x8_t __tbl = vtbl4_u8 (__tab, __idx);
>> +
>> +  return vbsl_u8 (__mask, __tbl, __r);
>> +}
>> +
>> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
>> +{
>> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
>> +  poly8x8_t __tbl = vtbl4_p8 (__tab, __idx);
>> +
>> +  return vbsl_p8 (__mask, __tbl, __r);
>> +}
>> +
>
> Why do we want this for vtbx4 rather than putting out a VTBX instruction
> directly (as in the inline asm versions you replace)?
>
I just followed the pattern used for vtbx3.

> This sequence does make sense for vtbx3.
In fact, I don't see why vtbx3 and vtbx4 should be different?

>>  /* vtrn */
>>
>>  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
>> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>> index b8a45d1..dfbd9cd 100644
>> --- a/gcc/config/aarch64/iterators.md
>> +++ b/gcc/config/aarch64/iterators.md
>> @@ -100,6 +100,8 @@
>>  ;; All modes.
>>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
>>
>> +(define_mode_iterator V8Q [V8QI])
>> +
>
> This can be dropped if you use VAR1 in aarch64-builtins.c.
>
> Thanks for working on this, with your patch applied, the only
> remaining intrinsics I see failing for aarch64_be are:
>
>   vqtbl2_*8
>   vqtbl2q_*8
>   vqtbl3_*8
>   vqtbl3q_*8
>   vqtbl4_*8
>   vqtbl4q_*8
>
>   vqtbx2_*8
>   vqtbx2q_*8
>   vqtbx3_*8
>   vqtbx3q_*8
>   vqtbx4_*8
>   vqtbx4q_*8
>
Quite possibly. Which tests are you looking at? Since these are
aarch64-specific, they are not part of the
tests I added (advsimd-intrinsics). Do you mean
gcc.target/aarch64/table-intrinsics.c?


> Thanks,
> James
>
James Greenhalgh Oct. 8, 2015, 9:12 a.m. UTC | #5
On Wed, Oct 07, 2015 at 09:07:30PM +0100, Christophe Lyon wrote:
> On 7 October 2015 at 17:09, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> > On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
> >
> > Why do we want this for vtbx4 rather than putting out a VTBX instruction
> > directly (as in the inline asm versions you replace)?
> >
> I just followed the pattern used for vtbx3.
> 
> > This sequence does make sense for vtbx3.
> In fact, I don't see why vtbx3 and vtbx4 should be different?

The difference between TBL and TBX is in their handling of a request to
select an out-of-range value. For TBL this returns zero, for TBX this
returns the value which was already in the destination register.

Because the byte-vectors used by the TBX instruction in AArch64 are 128-bit
(so two of them together allow selecting elements in the range 0-31), and
vtbx3 needs to emulate the AArch32 behaviour of picking elements from 3x64-bit
vectors (allowing elements in the range 0-23), we need to manually check for
values which would have been out-of-range on AArch32, but are not out
of range for AArch64 and handle them appropriately. For vtbx4 on the other
hand, 2x128-bit registers give the range 0..31 and 4x64-bit registers give
the range 0..31, so we don't need the special masked handling.
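
To make that concrete, a rough per-byte model of the AArch32 semantics being
emulated (plain C sketch, not the arm_neon.h code) looks like:

  #include <stdint.h>

  /* AArch32 VTBX3: the table is 3x8 = 24 bytes, and indices >= 24 keep the
     destination byte.  The AArch64 two-register TBL/TBX works on a 32-byte
     (zero-padded) table, so indices 24..31 would wrongly select the padding
     instead of keeping the destination -- hence the explicit vclt/vbsl mask
     in vtbx3.  */
  static uint8_t vtbx3_byte (uint8_t r, const uint8_t tab[24], uint8_t idx)
  {
    return idx < 24 ? tab[idx] : r;
  }

  /* AArch32 VTBX4: the table is 4x8 = 32 bytes, exactly what two 128-bit
     registers hold, so a single two-register TBX already gives the right
     answer for every index and no masking is required.  */
  static uint8_t vtbx4_byte (uint8_t r, const uint8_t tab[32], uint8_t idx)
  {
    return idx < 32 ? tab[idx] : r;
  }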

You can find the suggested instruction sequences for the Neon intrinsics
in this document:

  http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf

> >>  /* vtrn */
> >>
> >>  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> >> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> >> index b8a45d1..dfbd9cd 100644
> >> --- a/gcc/config/aarch64/iterators.md
> >> +++ b/gcc/config/aarch64/iterators.md
> >> @@ -100,6 +100,8 @@
> >>  ;; All modes.
> >>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
> >>
> >> +(define_mode_iterator V8Q [V8QI])
> >> +
> >
> > This can be dropped if you use VAR1 in aarch64-builtins.c.
> >
> > Thanks for working on this, with your patch applied, the only
> > remaining intrinsics I see failing for aarch64_be are:
> >
> >   vqtbl2_*8
> >   vqtbl2q_*8
> >   vqtbl3_*8
> >   vqtbl3q_*8
> >   vqtbl4_*8
> >   vqtbl4q_*8
> >
> >   vqtbx2_*8
> >   vqtbx2q_*8
> >   vqtbx3_*8
> >   vqtbx3q_*8
> >   vqtbx4_*8
> >   vqtbx4q_*8
> >
> Quite possibly. Which tests are you looking at? Since these are
> aarch64-specific, they are not part of the
> tests I added (advsimd-intrinsics). Do you mean
> gcc.target/aarch64/table-intrinsics.c?

Sorry, yes I should have given a reference. I'm running with a variant of
a testcase from the LLVM test-suite repository:

  SingleSource/UnitTests/Vector/AArch64/aarch64_neon_intrinsics.c

This has an execute test for most of the intrinsics specified for AArch64.
It needs some modification to cover the intrinsics we don't implement yet.

Thanks,
James

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 0f4f2b9..7ca3917 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -253,6 +253,11 @@  aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
       qualifier_none, qualifier_struct_load_store_lane_index };
 #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
 
+static enum aarch64_type_qualifiers
+aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_none };
+#define TYPES_TBL (aarch64_types_tbl_qualifiers)
+
 #define CF0(N, X) CODE_FOR_aarch64_##N##X
 #define CF1(N, X) CODE_FOR_##N##X##1
 #define CF2(N, X) CODE_FOR_##N##X##2
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index d0f298a..62f1b13 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -405,3 +405,5 @@ 
   VAR1 (BINOPP, crypto_pmull, 0, di)
   VAR1 (BINOPP, crypto_pmull, 0, v2di)
 
+  /* Implemented by aarch64_tbl3v8qi.  */
+  BUILTIN_V8Q (TBL, tbl3, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9777418..84a61d5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4716,6 +4714,16 @@ 
   [(set_attr "type" "neon_tbl2_q")]
 )
 
+(define_insn "aarch64_tbl3v8qi"
+  [(set (match_operand:V8QI 0 "register_operand" "=w")
+	(unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
+		      (match_operand:V8QI 2 "register_operand" "w")]
+		      UNSPEC_TBL))]
+  "TARGET_SIMD"
+  "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
+  [(set_attr "type" "neon_tbl3")]
+)
+
 (define_insn_and_split "aarch64_combinev16qi"
   [(set (match_operand:OI 0 "register_operand" "=w")
 	(unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 87bbf6e..91704de 100644
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6dfebe7..e8ee318 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -10902,13 +10902,14 @@  vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
 {
   int8x8_t result;
   int8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = __builtin_aarch64_tbl3v8qi (__o, idx);
   return result;
 }
 
@@ -10917,13 +10918,14 @@  vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
 {
   uint8x8_t result;
   uint8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -10932,13 +10934,14 @@  vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
 {
   poly8x8_t result;
   poly8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -10947,13 +10950,14 @@  vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
 {
   int8x8_t result;
   int8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = __builtin_aarch64_tbl3v8qi (__o, idx);
   return result;
 }
 
@@ -10962,13 +10966,14 @@  vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx)
 {
   uint8x8_t result;
   uint8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -10977,13 +10982,14 @@  vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
 {
   poly8x8_t result;
   poly8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+					   (int8x16_t) temp.val[1], 1);
+  result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
 
@@ -11023,51 +11029,6 @@  vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
   return result;
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx)
-{
-  int8x8_t result = r;
-  int8x16x2_t temp;
-  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
-{
-  uint8x8_t result = r;
-  uint8x16x2_t temp;
-  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx)
-{
-  poly8x8_t result = r;
-  poly8x16x2_t temp;
-  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-	   "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
 /* End of temporary inline asm.  */
 
 /* Start of optimal implementations in approved order.  */
@@ -23221,6 +23182,36 @@  vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
   return vbsl_p8 (__mask, __tbl, __r);
 }
 
+/* vtbx4  */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
+			      vmov_n_u8 (32));
+  int8x8_t __tbl = vtbl4_s8 (__tab, __idx);
+
+  return vbsl_s8 (__mask, __tbl, __r);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
+  uint8x8_t __tbl = vtbl4_u8 (__tab, __idx);
+
+  return vbsl_u8 (__mask, __tbl, __r);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
+  poly8x8_t __tbl = vtbl4_p8 (__tab, __idx);
+
+  return vbsl_p8 (__mask, __tbl, __r);
+}
+
 /* vtrn */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index b8a45d1..dfbd9cd 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -100,6 +100,8 @@ 
 ;; All modes.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
+(define_mode_iterator V8Q [V8QI])
+
 ;; All vector modes and DI.
 (define_mode_iterator VALLDI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF DI])