Message ID | CAAgBjM=-CJO=OkkRArPUrDy5JpVAQyALLC0Dq909QJVL6GGDGQ@mail.gmail.com |
---|---|
State | Accepted |
Commit | 3a0afad0d212b3ff213b393728e018caf2daa526 |
Headers | show |
Series | [AArch64,SVE] PR88837 - Poor vector construction code in VL-specific mode | expand |
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > Hi, > The attached patch tries to improve initialization for fixed-length > SVE vector and it's algorithm is described in comments for > aarch64_sve_expand_vector_init() in the patch, with help from Richard > Sandiford. I verified tests added in the patch pass with qemu and am > trying to run bootstrap+test on patch in qemu. > Does the patch look OK ? > > Thanks, > Prathamesh > > 2019-05-27 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> > Richard Sandiford <richard.sandiford@arm.com> Although we iterated on ideas for the patch a bit, I didn't write any of it, so the changelog should just have your name. > [...] > @@ -3207,3 +3207,15 @@ > DONE; > } > ) > + > +;; Standard pattern name vec_init<mode><Vel>. > + > +(define_expand "vec_init<mode><Vel>" The rest of the file doesn't have blank lines after the comment. > +/* Subroutine of aarch64_sve_expand_vector_init for handling > + trailing constants. > + This function works as follows: > + (a) Create a new vector consisting of trailing constants. > + (b) Initialize TARGET with the constant vector using emit_move_insn. > + (c) Insert remaining elements in TARGET using insr. > + NELTS is the total number of elements in original vector while > + truncated sentence, guess the rest would have been: NELTS_REQD is the number of elements that are actually significant. or something. > + ??? The heuristic used is to do above only if number of constants > + is at least half the total number of elements. May need fine tuning. 
*/ > + > +static bool > +aarch64_sve_expand_vector_init_handle_trailing_constants > + (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd) > +{ > + machine_mode mode = GET_MODE (target); > + scalar_mode elem_mode = GET_MODE_INNER (mode); > + int n_trailing_constants = 0; > + > + for (int i = nelts_reqd - 1; > + i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i)); > + i--) > + n_trailing_constants++; > + > + if (n_trailing_constants >= nelts_reqd / 2) > + { > + rtx_vector_builder v (mode, 1, nelts); > + for (int i = 0; i < nelts; i++) > + v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants)); > + rtx const_vec = v.build (); > + emit_move_insn (target, const_vec); > + > + for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--) > + emit_insr (target, builder.elt (i)); > + > + return true; > + } > + > + return false; > +} > + > +/* Subroutine of aarch64_sve_expand_vector_init. > + Works as follows: > + (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. > + (b) Skip trailing elements from BUILDER, which are same as s/are same/are the same/ > + element NELTS_REQD - 1. > + (c) Insert earlier elements in reverse order in TARGET using insr. 
*/ > + > +static void > +aarch64_sve_expand_vector_init_insert_elems (rtx target, > + const rtx_vector_builder &builder, > + int nelts_reqd) > +{ > + machine_mode mode = GET_MODE (target); > + scalar_mode elem_mode = GET_MODE_INNER (mode); > + > + struct expand_operand ops[2]; > + enum insn_code icode = optab_handler (vec_duplicate_optab, mode); > + gcc_assert (icode != CODE_FOR_nothing); > + > + create_output_operand (&ops[0], target, mode); > + create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode); > + expand_insn (icode, 2, ops); > + > + int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); > + for (int i = nelts_reqd - ndups - 1; i >= 0; i--) > + emit_insr (target, builder.elt (i)); > +} > + > +/* Subroutine of aarch64_sve_expand_vector_init to handle case > + when all trailing elements of builder are same. > + This works as follows: > + (a) Using expand_insn interface to broadcast last vector element in TARGET. s/Using/Use/ > + (b) Insert remaining elements in TARGET using insr. > + > + ??? The heuristic used is to do above if number of same trailing elements > + is at least 3/4 of total number of elements, loosely based on > + heuristic from mostly_zeros_p. May need fine-tuning. */ Should be two spaces before "May". > [...] > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > new file mode 100644 > index 00000000000..c51876947fb > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > @@ -0,0 +1,27 @@ > +/* { dg-do compile { target aarch64_asm_sve_ok } } */ > +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ These tests shouldn't require -ftree-vectorize. > [...] 
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c > new file mode 100644 > index 00000000000..d9640e42ddd > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c > @@ -0,0 +1,21 @@ > +/* { dg-do run { target aarch64_sve256_hw } } */ > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ No need for --save-temps in the run tests. > [...] > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c > new file mode 100644 > index 00000000000..b90895df436 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c > @@ -0,0 +1,27 @@ > +/* { dg-do compile { target aarch64_asm_sve_ok } } */ > +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > + > +/* Case 5.5: Interleaved repeating elements and trailing same elements. */ > + > +#include <stdint.h> > + > +typedef int32_t vnx4si __attribute__((vector_size (32))); > + > +vnx4si foo(int a, int b, int f) > +{ > + return (vnx4si) { a, f, b, f, b, f, b, f }; > +} > + This is missing __attribute__ ((noipa)). Same for some other tests. Thanks, Richard
On Wed, 29 May 2019 at 18:10, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > Hi, > > The attached patch tries to improve initialization for fixed-length > > SVE vector and it's algorithm is described in comments for > > aarch64_sve_expand_vector_init() in the patch, with help from Richard > > Sandiford. I verified tests added in the patch pass with qemu and am > > trying to run bootstrap+test on patch in qemu. > > Does the patch look OK ? > > > > Thanks, > > Prathamesh > > > > 2019-05-27 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> > > Richard Sandiford <richard.sandiford@arm.com> > > Although we iterated on ideas for the patch a bit, I didn't write > any of it, so the changelog should just have your name. > > > [...] > > @@ -3207,3 +3207,15 @@ > > DONE; > > } > > ) > > + > > +;; Standard pattern name vec_init<mode><Vel>. > > + > > +(define_expand "vec_init<mode><Vel>" > > The rest of the file doesn't have blank lines after the comment. > > > +/* Subroutine of aarch64_sve_expand_vector_init for handling > > + trailing constants. > > + This function works as follows: > > + (a) Create a new vector consisting of trailing constants. > > + (b) Initialize TARGET with the constant vector using emit_move_insn. > > + (c) Insert remaining elements in TARGET using insr. > > + NELTS is the total number of elements in original vector while > > + > > truncated sentence, guess the rest would have been: > > NELTS_REQD is the number of elements that are actually significant. > > or something. > > > + ??? The heuristic used is to do above only if number of constants > > + is at least half the total number of elements. May need fine tuning. 
*/ > > + > > +static bool > > +aarch64_sve_expand_vector_init_handle_trailing_constants > > + (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd) > > +{ > > + machine_mode mode = GET_MODE (target); > > + scalar_mode elem_mode = GET_MODE_INNER (mode); > > + int n_trailing_constants = 0; > > + > > + for (int i = nelts_reqd - 1; > > + i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i)); > > + i--) > > + n_trailing_constants++; > > + > > + if (n_trailing_constants >= nelts_reqd / 2) > > + { > > + rtx_vector_builder v (mode, 1, nelts); > > + for (int i = 0; i < nelts; i++) > > + v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants)); > > + rtx const_vec = v.build (); > > + emit_move_insn (target, const_vec); > > + > > + for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--) > > + emit_insr (target, builder.elt (i)); > > + > > + return true; > > + } > > + > > + return false; > > +} > > + > > +/* Subroutine of aarch64_sve_expand_vector_init. > > + Works as follows: > > + (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. > > + (b) Skip trailing elements from BUILDER, which are same as > > s/are same/are the same/ > > > + element NELTS_REQD - 1. > > + (c) Insert earlier elements in reverse order in TARGET using insr. 
*/ > > + > > +static void > > +aarch64_sve_expand_vector_init_insert_elems (rtx target, > > + const rtx_vector_builder &builder, > > + int nelts_reqd) > > +{ > > + machine_mode mode = GET_MODE (target); > > + scalar_mode elem_mode = GET_MODE_INNER (mode); > > + > > + struct expand_operand ops[2]; > > + enum insn_code icode = optab_handler (vec_duplicate_optab, mode); > > + gcc_assert (icode != CODE_FOR_nothing); > > + > > + create_output_operand (&ops[0], target, mode); > > + create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode); > > + expand_insn (icode, 2, ops); > > + > > + int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); > > + for (int i = nelts_reqd - ndups - 1; i >= 0; i--) > > + emit_insr (target, builder.elt (i)); > > +} > > + > > +/* Subroutine of aarch64_sve_expand_vector_init to handle case > > + when all trailing elements of builder are same. > > + This works as follows: > > + (a) Using expand_insn interface to broadcast last vector element in TARGET. > > s/Using/Use/ > > > + (b) Insert remaining elements in TARGET using insr. > > + > > + ??? The heuristic used is to do above if number of same trailing elements > > + is at least 3/4 of total number of elements, loosely based on > > + heuristic from mostly_zeros_p. May need fine-tuning. */ > > Should be two spaces before "May". > > > [...] > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > > new file mode 100644 > > index 00000000000..c51876947fb > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > > @@ -0,0 +1,27 @@ > > +/* { dg-do compile { target aarch64_asm_sve_ok } } */ > > +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > > These tests shouldn't require -ftree-vectorize. > > > [...] 
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c > > new file mode 100644 > > index 00000000000..d9640e42ddd > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c > > @@ -0,0 +1,21 @@ > > +/* { dg-do run { target aarch64_sve256_hw } } */ > > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ > > No need for --save-temps in the run tests. > > > [...] > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c > > new file mode 100644 > > index 00000000000..b90895df436 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c > > @@ -0,0 +1,27 @@ > > +/* { dg-do compile { target aarch64_asm_sve_ok } } */ > > +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > > + > > +/* Case 5.5: Interleaved repeating elements and trailing same elements. */ > > + > > +#include <stdint.h> > > + > > +typedef int32_t vnx4si __attribute__((vector_size (32))); > > + > > +vnx4si foo(int a, int b, int f) > > +{ > > + return (vnx4si) { a, f, b, f, b, f, b, f }; > > +} > > + > > This is missing __attribute__ ((noipa)). Same for some other tests. Hi Richard, Thanks for the suggestions. Is the attached version OK ? Thanks, Prathamesh > > Thanks, > Richard 2019-05-30 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> PR target/88837 * vector-builder.h (vector_builder::count_dups): New method. * config/aarch64/aarch64-protos.h (aarch64_sve_expand_vector_init): Declare prototype. * config/aarch64/aarch64-sve.md (aarch64_sve_rev<mode>): Use @. (vec_init<mode><Vel>): New pattern. * config/aarch64/aarch64.c (emit_insr): New function. (aarch64_sve_expand_vector_init_handle_trailing_constants): Likewise. (aarch64_sve_expand_vector_init_insert_elems): Likewise. (aarch64_sve_expand_vector_init_handle_trailing_same_elem): Likewise. 
(aarch64_sve_expand_vector_init): Define two overloaded functions. testsuite/ * gcc.target/aarch64/sve/init_1.c: New test. * gcc.target/aarch64/sve/init_1_run.c: Likewise. * gcc.target/aarch64/sve/init_2.c: Likewise. * gcc.target/aarch64/sve/init_2_run.c: Likewise. * gcc.target/aarch64/sve/init_3.c: Likewise. * gcc.target/aarch64/sve/init_3_run.c: Likewise. * gcc.target/aarch64/sve/init_4.c: Likewise. * gcc.target/aarch64/sve/init_4_run.c: Likewise. * gcc.target/aarch64/sve/init_5.c: Likewise. * gcc.target/aarch64/sve/init_5_run.c: Likewise. * gcc.target/aarch64/sve/init_6.c: Likewise. * gcc.target/aarch64/sve/init_6_run.c: Likewise. * gcc.target/aarch64/sve/init_7.c: Likewise. * gcc.target/aarch64/sve/init_7_run.c: Likewise. * gcc.target/aarch64/sve/init_8.c: Likewise. * gcc.target/aarch64/sve/init_8_run.c: Likewise. * gcc.target/aarch64/sve/init_9.c: Likewise. * gcc.target/aarch64/sve/init_9_run.c: Likewise. * gcc.target/aarch64/sve/init_10.c: Likewise. * gcc.target/aarch64/sve/init_10_run.c: Likewise. * gcc.target/aarch64/sve/init_11.c: Likewise. * gcc.target/aarch64/sve/init_11_run.c: Likewise. * gcc.target/aarch64/sve/init_12.c: Likewise. * gcc.target/aarch64/sve/init_12_run.c: Likewise. 
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index b6c0d0a8eb6..f82728ed2d3 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -515,6 +515,7 @@ bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx); void aarch64_split_sve_subreg_move (rtx, rtx, rtx); void aarch64_expand_prologue (void); void aarch64_expand_vector_init (rtx, rtx); +void aarch64_sve_expand_vector_init (rtx, rtx); void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx, const_tree, unsigned); void aarch64_init_expanders (void); diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index b9cb1fae98c..981a9be74c2 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -863,7 +863,7 @@ "revb\t%0.h, %1/m, %2.h" ) -(define_insn "*aarch64_sve_rev<mode>" +(define_insn "@aarch64_sve_rev<mode>" [(set (match_operand:SVE_ALL 0 "register_operand" "=w") (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "w")] UNSPEC_REV))] @@ -3207,3 +3207,14 @@ DONE; } ) + +;; Standard pattern name vec_init<mode><Vel>. +(define_expand "vec_init<mode><Vel>" + [(match_operand:SVE_ALL 0 "register_operand" "") + (match_operand 1 "" "")] + "TARGET_SVE" + { + aarch64_sve_expand_vector_init (operands[0], operands[1]); + DONE; + } +) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 83453d03095..3c7e9dd2d9b 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -15244,6 +15244,263 @@ aarch64_expand_vector_init (rtx target, rtx vals) } } +/* Emit RTL corresponding to: + insr TARGET, ELEM. 
*/ + +static void +emit_insr (rtx target, rtx elem) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + elem = force_reg (elem_mode, elem); + + insn_code icode = optab_handler (vec_shl_insert_optab, mode); + gcc_assert (icode != CODE_FOR_nothing); + emit_insn (GEN_FCN (icode) (target, target, elem)); +} + +/* Subroutine of aarch64_sve_expand_vector_init for handling + trailing constants. + This function works as follows: + (a) Create a new vector consisting of trailing constants. + (b) Initialize TARGET with the constant vector using emit_move_insn. + (c) Insert remaining elements in TARGET using insr. + NELTS is the total number of elements in original vector while + NELTS_REQD is the number of elements that are actually + significant. + + ??? The heuristic used is to do above only if number of constants + is at least half the total number of elements. May need fine tuning. */ + +static bool +aarch64_sve_expand_vector_init_handle_trailing_constants + (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + int n_trailing_constants = 0; + + for (int i = nelts_reqd - 1; + i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i)); + i--) + n_trailing_constants++; + + if (n_trailing_constants >= nelts_reqd / 2) + { + rtx_vector_builder v (mode, 1, nelts); + for (int i = 0; i < nelts; i++) + v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants)); + rtx const_vec = v.build (); + emit_move_insn (target, const_vec); + + for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--) + emit_insr (target, builder.elt (i)); + + return true; + } + + return false; +} + +/* Subroutine of aarch64_sve_expand_vector_init. + Works as follows: + (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. 
+ (b) Skip trailing elements from BUILDER, which are the same as + element NELTS_REQD - 1. + (c) Insert earlier elements in reverse order in TARGET using insr. */ + +static void +aarch64_sve_expand_vector_init_insert_elems (rtx target, + const rtx_vector_builder &builder, + int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + + struct expand_operand ops[2]; + enum insn_code icode = optab_handler (vec_duplicate_optab, mode); + gcc_assert (icode != CODE_FOR_nothing); + + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode); + expand_insn (icode, 2, ops); + + int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); + for (int i = nelts_reqd - ndups - 1; i >= 0; i--) + emit_insr (target, builder.elt (i)); +} + +/* Subroutine of aarch64_sve_expand_vector_init to handle case + when all trailing elements of builder are same. + This works as follows: + (a) Use expand_insn interface to broadcast last vector element in TARGET. + (b) Insert remaining elements in TARGET using insr. + + ??? The heuristic used is to do above if number of same trailing elements + is at least 3/4 of total number of elements, loosely based on + heuristic from mostly_zeros_p. May need fine-tuning. */ + +static bool +aarch64_sve_expand_vector_init_handle_trailing_same_elem + (rtx target, const rtx_vector_builder &builder, int nelts_reqd) +{ + int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); + if (ndups >= (3 * nelts_reqd) / 4) + { + aarch64_sve_expand_vector_init_insert_elems (target, builder, + nelts_reqd - ndups + 1); + return true; + } + + return false; +} + +/* Initialize register TARGET from BUILDER. NELTS is the constant number + of elements in BUILDER. + + The function tries to initialize TARGET from BUILDER if it fits one + of the special cases outlined below. 
+ + Failing that, the function divides BUILDER into two sub-vectors: + v_even = even elements of BUILDER; + v_odd = odd elements of BUILDER; + + and recursively calls itself with v_even and v_odd. + + if (recursive call succeeded for v_even or v_odd) + TARGET = zip (v_even, v_odd) + + The function returns true if it managed to build TARGET from BUILDER + with one of the special cases, false otherwise. + + Example: {a, 1, b, 2, c, 3, d, 4} + + The vector gets divided into: + v_even = {a, b, c, d} + v_odd = {1, 2, 3, 4} + + aarch64_sve_expand_vector_init(v_odd) hits case 1 and + initialize tmp2 from constant vector v_odd using emit_move_insn. + + aarch64_sve_expand_vector_init(v_even) fails since v_even contains + 4 elements, so we construct tmp1 from v_even using insr: + tmp1 = dup(d) + insr tmp1, c + insr tmp1, b + insr tmp1, a + + And finally: + TARGET = zip (tmp1, tmp2) + which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */ + +static bool +aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder, + int nelts, int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + + /* Case 1: Vector contains trailing constants. */ + + if (aarch64_sve_expand_vector_init_handle_trailing_constants + (target, builder, nelts, nelts_reqd)) + return true; + + /* Case 2: Vector contains leading constants. */ + + rtx_vector_builder rev_builder (mode, 1, nelts_reqd); + for (int i = 0; i < nelts_reqd; i++) + rev_builder.quick_push (builder.elt (nelts_reqd - i - 1)); + rev_builder.finalize (); + + if (aarch64_sve_expand_vector_init_handle_trailing_constants + (target, rev_builder, nelts, nelts_reqd)) + { + emit_insn (gen_aarch64_sve_rev (mode, target, target)); + return true; + } + + /* Case 3: Vector contains trailing same element. */ + + if (aarch64_sve_expand_vector_init_handle_trailing_same_elem + (target, builder, nelts_reqd)) + return true; + + /* Case 4: Vector contains leading same element. 
*/ + + if (aarch64_sve_expand_vector_init_handle_trailing_same_elem + (target, rev_builder, nelts_reqd) && nelts_reqd == nelts) + { + emit_insn (gen_aarch64_sve_rev (mode, target, target)); + return true; + } + + /* Avoid recursing below 4-elements. + ??? The threshold 4 may need fine-tuning. */ + + if (nelts_reqd <= 4) + return false; + + rtx_vector_builder v_even (mode, 1, nelts); + rtx_vector_builder v_odd (mode, 1, nelts); + + for (int i = 0; i < nelts * 2; i += 2) + { + v_even.quick_push (builder.elt (i)); + v_odd.quick_push (builder.elt (i + 1)); + } + + v_even.finalize (); + v_odd.finalize (); + + rtx tmp1 = gen_reg_rtx (mode); + bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even, + nelts, nelts_reqd / 2); + + rtx tmp2 = gen_reg_rtx (mode); + bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd, + nelts, nelts_reqd / 2); + + if (!did_even_p && !did_odd_p) + return false; + + /* Initialize v_even and v_odd using INSR if it didn't match any of the + special cases and zip v_even, v_odd. */ + + if (!did_even_p) + aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2); + + if (!did_odd_p) + aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2); + + rtvec v = gen_rtvec (2, tmp1, tmp2); + emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); + return true; +} + +/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */ + +void +aarch64_sve_expand_vector_init (rtx target, rtx vals) +{ + machine_mode mode = GET_MODE (target); + int nelts = XVECLEN (vals, 0); + + rtx_vector_builder v (mode, 1, nelts); + for (int i = 0; i < nelts; i++) + v.quick_push (XVECEXP (vals, 0, i)); + v.finalize (); + + /* If neither sub-vectors of v could be initialized specially, + then use INSR to insert all elements from v into TARGET. + ??? This might not be optimal for vectors with large + initializers like 16-element or above. + For nelts < 4, it probably isn't useful to handle specially. 
*/ + + if (nelts < 4 + || !aarch64_sve_expand_vector_init (target, v, nelts, nelts)) + aarch64_sve_expand_vector_init_insert_elems (target, v, nelts); +} + static unsigned HOST_WIDE_INT aarch64_shift_truncation_mask (machine_mode mode) { diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c new file mode 100644 index 00000000000..cbfeff4a59c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c @@ -0,0 +1,27 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 1.1: Trailing constants with stepped sequence. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, 1, 2, 3, 4, 5, 6 }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + index z0.s, #1, #1 + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tindex\t(z[0-9]+\.s), #1, #1\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c new file mode 100644 index 00000000000..239cde54cdc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c @@ -0,0 +1,29 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.4: Interleaved repeating elements and non-repeating elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int f) +{ + return (vnx4si) { a, f, b, f, c, f, c, f }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w2 + mov z1.s, w3 + insr z0.s, w1 + ptrue p0.s, vl8 + insr z0.s, w0 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c new file mode 100644 index 00000000000..9a6d8650eea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c @@ -0,0 +1,21 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_10.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int f = 13; + + vnx4si v = foo (a, b, c, f); + int expected[] = { a, f, b, f, c, f, c, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c new file mode 100644 index 00000000000..bc646d707b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c @@ -0,0 +1,28 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.5: Interleaved repeating elements and trailing same elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int f) +{ + return (vnx4si) { a, f, b, f, b, f, b, f }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w1 + mov z1.s, w2 + insr z0.s, w0 + ptrue p0.s, vl8 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w1\n\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\1, w0\n.*\tzip1\t\1, \1, \2} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c new file mode 100644 index 00000000000..4371555818c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_11.c" + +int main() +{ + int a = 10; + int b = 11; + int f = 12; + + vnx4si v = foo (a, b, f); + int expected[] = { a, f, b, f, b, f, b, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c new file mode 100644 index 00000000000..c3fc1cae4a7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c @@ -0,0 +1,30 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.5: Interleaved repeating elements and trailing same elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int f) +{ + return (vnx4si) { b, f, b, f, b, f, a, f }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w0 + mov z1.s, w2 + insr z0.s, w1 + ptrue p0.s, vl8 + insr z0.s, w1 + insr z0.s, w1 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tmov\t(z[0-9]+\.s), w0\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c new file mode 100644 index 00000000000..5ce7edb1e68 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_12.c" + +int main() +{ + int a = 10; + int b = 11; + int f = 12; + + vnx4si v = foo (a, b, f); + int expected[] = { b, f, b, f, b, f, a, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c new file mode 100644 index 00000000000..824a5cbea79 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_1.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, 1, 2, 3, 4, 5, 6 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_2.c b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c new file mode 100644 index 00000000000..e6807a20222 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c @@ -0,0 +1,29 @@ +/* { dg-do compile 
{ target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 1.2: Trailing constants with repeating sequence. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, 2, 3, 2, 3, 2, 3 }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + adrp x2, .LANCHOR0 + add x2, x2, :lo12:.LANCHOR0 + ld1w z0.s, p0/z, [x2] + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c new file mode 100644 index 00000000000..86c191c7771 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_2.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, 2, 3, 2, 3, 2, 3 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_3.c b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c new file mode 100644 index 00000000000..c59e8ac55e2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c @@ -0,0 +1,28 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 2.1: Leading constants with stepped sequence. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { 1, 2, 3, 4, 5, 6, a, b }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + index z0.s, #6, #-1 + insr z0.s, w0 + insr z0.s, w1 + rev z0.s, z0.s + ret +*/ + +/* { dg-final { scan-assembler {\tindex\t(z[0-9]+\.s), #6, #-1\n\tinsr\t\1, w0\n\tinsr\t\1, w1\n\trev\t\1, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c new file mode 100644 index 00000000000..ce4de69505f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_3.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { 1, 2, 3, 4, 5, 6, a, b }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_4.c b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c new file mode 100644 index 00000000000..f069149792e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c @@ -0,0 +1,30 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 2.2: Leading constants with stepped sequence. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { 3, 2, 3, 2, 3, 2, b, a }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + adrp x2, .LANCHOR0 + add x2, x2, :lo12:.LANCHOR0 + ld1w z0.s, p0/z, [x2] + insr z0.s, w1 + insr z0.s, w0 + rev z0.s, z0.s + ret +*/ + +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c new file mode 100644 index 00000000000..defee421f9f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_4.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { 3, 2, 3, 2, 3, 2, b, a }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c new file mode 100644 index 00000000000..0ba7a2adf66 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c @@ -0,0 +1,27 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 3: Trailing same element. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c) +{ + return (vnx4si) { a, b, c, c, c, c, c, c }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w2 + ptrue p0.s, vl8 + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c new file mode 100644 index 00000000000..ba91d6fec09 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_5.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + + vnx4si v = foo (a, b, c); + int expected[] = { a, b, c, c, c, c, c, c }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c new file mode 100644 index 00000000000..9abf29cb60e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c @@ -0,0 +1,28 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 3: Trailing same element. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c) +{ + return (vnx4si) { c, c, c, c, c, c, b, a }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w2 + ptrue p0.s, vl8 + insr z0.s, w1 + insr z0.s, w0 + rev z0.s, z0.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c new file mode 100644 index 00000000000..802b28f98e0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_6.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + + vnx4si v = foo (a, b, c); + int expected[] = { c, c, c, c, c, c, b, a }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c new file mode 100644 index 00000000000..0b1a2d02cae --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c @@ -0,0 +1,32 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.1: All elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int d, int e, int f, int g, int h) +{ + return (vnx4si) { a, b, c, d, e, f, g, h }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w7 + ptrue p0.s, vl8 + insr z0.s, w6 + insr z0.s, w5 + insr z0.s, w4 + insr z0.s, w3 + insr z0.s, w2 + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w7\n.*\tinsr\t\1, w6\n\tinsr\t\1, w5\n\tinsr\t\1, w4\n\tinsr\t\1, w3\n\tinsr\t\1, w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c new file mode 100644 index 00000000000..61fe2850831 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c @@ -0,0 +1,25 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_7.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int d = 13; + int e = 14; + int f = 15; + int g = 16; + int h = 17; + + vnx4si v = foo (a, b, c, d, e, f, g, h); + int expected[] = { a, b, c, d, e, f, g, h }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c new file mode 100644 index 00000000000..916ce8aa831 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c @@ -0,0 +1,32 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.2: Interleaved elements and constants. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int d) +{ + return (vnx4si) { a, 1, b, 2, c, 3, d, 4 }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + mov z0.s, w3 + adrp x3, .LANCHOR0 + insr z0.s, w2 + add x3, x3, :lo12:.LANCHOR0 + insr z0.s, w1 + ld1w z1.s, p0/z, [x3] + insr z0.s, w0 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c new file mode 100644 index 00000000000..24a0a6e0673 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c @@ -0,0 +1,21 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_8.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int d = 13; + + vnx4si v = foo (a, b, c, d); + int expected[] = { a, 1, b, 2, c, 3, d, 4 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c new file mode 100644 index 00000000000..c555fd23120 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c @@ -0,0 +1,27 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.3: Repeated elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, a, b, a, b, a, b }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w0 + mov z1.s, w1 + ptrue p0.s, vl8 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w0\n\tmov\t(z[0-9]+\.s), w1\n.*\tzip1\t\1, \1, \2} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c new file mode 100644 index 00000000000..636ae3b8b48 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_9.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, a, b, a, b, a, b }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/vector-builder.h b/gcc/vector-builder.h index 9967daa6e4c..9f95b01bc3b 100644 --- a/gcc/vector-builder.h +++ b/gcc/vector-builder.h @@ -96,6 +96,7 @@ public: unsigned int encoded_nelts () const; bool encoded_full_vector_p () const; T elt (unsigned int) const; + unsigned int count_dups (int, int, int) const; bool operator == (const Derived &) const; bool operator != (const Derived &x) const { return !operator == (x); } @@ -223,6 +224,23 @@ vector_builder<T, Derived>::elt (unsigned int i) const derived ()->step (prev, final)); } +/* Return the number of leading duplicate elements in the range + [START:END:STEP]. The value is always at least 1. 
*/ + +template<typename T, typename Derived> +unsigned int +vector_builder<T, Derived>::count_dups (int start, int end, int step) const +{ + gcc_assert ((end - start) % step == 0); + + unsigned int ndups = 1; + for (int i = start + step; + i != end && derived ()->equal_p (elt (i), elt (start)); + i += step) + ndups++; + return ndups; +} + /* Change the encoding to NPATTERNS patterns of NELTS_PER_PATTERN each, but without changing the underlying vector. */
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > new file mode 100644 > index 00000000000..cbfeff4a59c > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > @@ -0,0 +1,27 @@ > +/* { dg-do compile { target aarch64_asm_sve_ok } } */ > +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ Sorry for not noticing last time, but the combination of aarch64_asm_sve_ok and --save-temps only makes sense for assemble tests, not compile tests. So these should either be: /* { dg-do assemble { target aarch64_asm_sve_ok } } */ /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ or: /* { dg-do compile } */ /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256" } */ Might as well go for the first, I guess. Same for the other non-run tests. OK with that change, thanks. Richard
On Thu, 30 May 2019 at 15:10, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > > new file mode 100644 > > index 00000000000..cbfeff4a59c > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > > @@ -0,0 +1,27 @@ > > +/* { dg-do compile { target aarch64_asm_sve_ok } } */ > > +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > > Sorry for not noticing last time, but the combination of aarch64_asm_sve_ok > and --save-temps only makes sense for assemble tests, not compile tests. > So these should either be: > > /* { dg-do assemble { target aarch64_asm_sve_ok } } */ > /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > > or: > > /* { dg-do compile } */ > /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256" } */ > > Might as well as go for the first I guess. Same for the other > non-run tests. > > OK with that change, thanks. Thanks for pointing that out; I have updated the patch to use dg-do assemble. Sorry for the silly question: what configure option should be passed to GCC to generate code with -msve-vector-bits=256 by default? I suppose that would be necessary for correctness testing, i.e. to test the patch against run tests that contain initializers and don't explicitly pass -msve-vector-bits=256? Thanks, Prathamesh > > Richard
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > On Thu, 30 May 2019 at 15:10, Richard Sandiford > <richard.sandiford@arm.com> wrote: >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c >> > new file mode 100644 >> > index 00000000000..cbfeff4a59c >> > --- /dev/null >> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c >> > @@ -0,0 +1,27 @@ >> > +/* { dg-do compile { target aarch64_asm_sve_ok } } */ >> > +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ >> >> Sorry for not noticing last time, but the combination of aarch64_asm_sve_ok >> and --save-temps only makes sense for assemble tests, not compile tests. >> So these should either be: >> >> /* { dg-do assemble { target aarch64_asm_sve_ok } } */ >> /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ >> >> or: >> >> /* { dg-do compile } */ >> /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256" } */ >> >> Might as well as go for the first I guess. Same for the other >> non-run tests. >> >> OK with that change, thanks. > Thanks for pointing out, updated the patch with dg-do assemble. > Sorry for silly ques - What configure option should be passed to gcc > to generate code with -msve-vector-bits=256 by default ? > I suppose that'd be necessary for correctness testing, to test patch > with run tests that contain initializers and don't explicitly pass > -msve-vector-bits=256 ? There's no configure option, but you can test with things like --target_board unix/-msve-vector-bits=256 or --target_board unix{,/-msve-vector-bits=256} (to test both with and without -msve-vector-bits=256). Richard
On Thu, 30 May 2019 at 21:19, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > On Thu, 30 May 2019 at 15:10, Richard Sandiford > > <richard.sandiford@arm.com> wrote: > >> > >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > >> > new file mode 100644 > >> > index 00000000000..cbfeff4a59c > >> > --- /dev/null > >> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > >> > @@ -0,0 +1,27 @@ > >> > +/* { dg-do compile { target aarch64_asm_sve_ok } } */ > >> > +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > >> > >> Sorry for not noticing last time, but the combination of aarch64_asm_sve_ok > >> and --save-temps only makes sense for assemble tests, not compile tests. > >> So these should either be: > >> > >> /* { dg-do assemble { target aarch64_asm_sve_ok } } */ > >> /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > >> > >> or: > >> > >> /* { dg-do compile } */ > >> /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256" } */ > >> > >> Might as well as go for the first I guess. Same for the other > >> non-run tests. > >> > >> OK with that change, thanks. > > Thanks for pointing out, updated the patch with dg-do assemble. > > Sorry for silly ques - What configure option should be passed to gcc > > to generate code with -msve-vector-bits=256 by default ? > > I suppose that'd be necessary for correctness testing, to test patch > > with run tests that contain initializers and don't explicitly pass > > -msve-vector-bits=256 ? > > There's no configure option, but you can test with things like > --target_board unix/-msve-vector-bits=256 or > --target_board unix{,/-msve-vector-bits=256} (to test both with > and without -msve-vector-bits=256). Hi, Sorry for late response. 
I managed to cross-test using qemu and am seeing some fallout: 1. Testing pristine trunk with --target_board=arm-qemu/-march=armv8.2-a+sve results in the following ICEs: FAIL: gcc.c-torture/compile/pr82096.c -O0 (internal compiler error) FAIL: gcc.c-torture/compile/pr82096.c -O1 (internal compiler error) FAIL: gcc.c-torture/compile/pr82096.c -O2 (internal compiler error) FAIL: gcc.c-torture/compile/pr82096.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (internal compiler error) FAIL: gcc.c-torture/compile/pr82096.c -O3 -g (internal compiler error) FAIL: gcc.c-torture/compile/pr82096.c -Os (internal compiler error) FAIL: gcc.dg/di-longlong64-sync-1.c (internal compiler error) FAIL: gcc.dg/di-sync-multithread.c (internal compiler error) FAIL: gcc.target/aarch64/pr87839.c (internal compiler error) 2. Passing -msve-vector-bits=256 results in the following additional ICEs with trunk: FAIL: gcc.dg/pr88598-2.c (internal compiler error) FAIL: gcc.dg/pr88598-3.c (internal compiler error) FAIL: gcc.dg/pr88598-5.c (internal compiler error) FAIL: c-c++-common/torture/builtin-convertvector-1.c -O1 (internal compiler error) FAIL: c-c++-common/torture/builtin-convertvector-1.c -O2 (internal compiler error) FAIL: c-c++-common/torture/builtin-convertvector-1.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (internal compiler error) FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error) FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 -g (internal compiler error) FAIL: c-c++-common/torture/builtin-convertvector-1.c -Os (internal compiler error) FAIL: gcc.target/aarch64/pr87839.c (internal compiler error) FAIL: gfortran.dg/vect/vect-8-epilogue.F90 -O (internal compiler error) FAIL: c-c++-common/torture/builtin-convertvector-1.c -O1 (internal compiler error) FAIL: c-c++-common/torture/builtin-convertvector-1.c -O2 (internal compiler error) FAIL: 
c-c++-common/torture/builtin-convertvector-1.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (internal compiler error) FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error) FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 -g (internal compiler error) Applying the patch doesn't result in any additional fallout relative to trunk with -march=armv8.2-a+sve -msve-vector-bits=256. Is it OK to apply? PS: Initially, I got UNSUPPORTED for the SVE tests, because the assembler was rejecting the test "ptrue p0.b" in the selector check_effective_target_aarch64_sve_hw and would accept it only if -march=armv8.2-a+sve was passed explicitly on the command line. I worked around that by patching lib/target-supports.exp to explicitly pass the option. Not sure if that's the right approach, though? Thanks, Prathamesh > > Richard 2019-06-03 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> PR target/88837 * vector-builder.h (vector_builder::count_dups): New method. * config/aarch64/aarch64-protos.h (aarch64_sve_expand_vector_init): Declare prototype. * config/aarch64/aarch64-sve.md (aarch64_sve_rev<mode>): Use @. (vec_init<mode><Vel>): New pattern. * config/aarch64/aarch64.c (emit_insr): New function. (aarch64_sve_expand_vector_init_handle_trailing_constants): Likewise. (aarch64_sve_expand_vector_init_insert_elems): Likewise. (aarch64_sve_expand_vector_init_handle_trailing_same_elem): Likewise. (aarch64_sve_expand_vector_init): Define two overloaded functions. testsuite/ * gcc.target/aarch64/sve/init_1.c: New test. * gcc.target/aarch64/sve/init_1_run.c: Likewise. * gcc.target/aarch64/sve/init_2.c: Likewise. * gcc.target/aarch64/sve/init_2_run.c: Likewise. * gcc.target/aarch64/sve/init_3.c: Likewise. * gcc.target/aarch64/sve/init_3_run.c: Likewise. * gcc.target/aarch64/sve/init_4.c: Likewise. * gcc.target/aarch64/sve/init_4_run.c: Likewise. * gcc.target/aarch64/sve/init_5.c: Likewise. 
* gcc.target/aarch64/sve/init_5_run.c: Likewise. * gcc.target/aarch64/sve/init_6.c: Likewise. * gcc.target/aarch64/sve/init_6_run.c: Likewise. * gcc.target/aarch64/sve/init_7.c: Likewise. * gcc.target/aarch64/sve/init_7_run.c: Likewise. * gcc.target/aarch64/sve/init_8.c: Likewise. * gcc.target/aarch64/sve/init_8_run.c: Likewise. * gcc.target/aarch64/sve/init_9.c: Likewise. * gcc.target/aarch64/sve/init_9_run.c: Likewise. * gcc.target/aarch64/sve/init_10.c: Likewise. * gcc.target/aarch64/sve/init_10_run.c: Likewise. * gcc.target/aarch64/sve/init_11.c: Likewise. * gcc.target/aarch64/sve/init_11_run.c: Likewise. * gcc.target/aarch64/sve/init_12.c: Likewise. * gcc.target/aarch64/sve/init_12_run.c: Likewise. diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index b6c0d0a8eb6..f82728ed2d3 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -515,6 +515,7 @@ bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx); void aarch64_split_sve_subreg_move (rtx, rtx, rtx); void aarch64_expand_prologue (void); void aarch64_expand_vector_init (rtx, rtx); +void aarch64_sve_expand_vector_init (rtx, rtx); void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx, const_tree, unsigned); void aarch64_init_expanders (void); diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index b9cb1fae98c..981a9be74c2 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -863,7 +863,7 @@ "revb\t%0.h, %1/m, %2.h" ) -(define_insn "*aarch64_sve_rev<mode>" +(define_insn "@aarch64_sve_rev<mode>" [(set (match_operand:SVE_ALL 0 "register_operand" "=w") (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "w")] UNSPEC_REV))] @@ -3207,3 +3207,14 @@ DONE; } ) + +;; Standard pattern name vec_init<mode><Vel>. 
+(define_expand "vec_init<mode><Vel>" + [(match_operand:SVE_ALL 0 "register_operand" "") + (match_operand 1 "" "")] + "TARGET_SVE" + { + aarch64_sve_expand_vector_init (operands[0], operands[1]); + DONE; + } +) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 83453d03095..3c7e9dd2d9b 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -15244,6 +15244,263 @@ aarch64_expand_vector_init (rtx target, rtx vals) } } +/* Emit RTL corresponding to: + insr TARGET, ELEM. */ + +static void +emit_insr (rtx target, rtx elem) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + elem = force_reg (elem_mode, elem); + + insn_code icode = optab_handler (vec_shl_insert_optab, mode); + gcc_assert (icode != CODE_FOR_nothing); + emit_insn (GEN_FCN (icode) (target, target, elem)); +} + +/* Subroutine of aarch64_sve_expand_vector_init for handling + trailing constants. + This function works as follows: + (a) Create a new vector consisting of trailing constants. + (b) Initialize TARGET with the constant vector using emit_move_insn. + (c) Insert remaining elements in TARGET using insr. + NELTS is the total number of elements in original vector while + while NELTS_REQD is the number of elements that are actually + significant. + + ??? The heuristic used is to do above only if number of constants + is at least half the total number of elements. May need fine tuning. 
*/ + +static bool +aarch64_sve_expand_vector_init_handle_trailing_constants + (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + int n_trailing_constants = 0; + + for (int i = nelts_reqd - 1; + i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i)); + i--) + n_trailing_constants++; + + if (n_trailing_constants >= nelts_reqd / 2) + { + rtx_vector_builder v (mode, 1, nelts); + for (int i = 0; i < nelts; i++) + v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants)); + rtx const_vec = v.build (); + emit_move_insn (target, const_vec); + + for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--) + emit_insr (target, builder.elt (i)); + + return true; + } + + return false; +} + +/* Subroutine of aarch64_sve_expand_vector_init. + Works as follows: + (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. + (b) Skip trailing elements from BUILDER, which are the same as + element NELTS_REQD - 1. + (c) Insert earlier elements in reverse order in TARGET using insr. */ + +static void +aarch64_sve_expand_vector_init_insert_elems (rtx target, + const rtx_vector_builder &builder, + int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + + struct expand_operand ops[2]; + enum insn_code icode = optab_handler (vec_duplicate_optab, mode); + gcc_assert (icode != CODE_FOR_nothing); + + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode); + expand_insn (icode, 2, ops); + + int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); + for (int i = nelts_reqd - ndups - 1; i >= 0; i--) + emit_insr (target, builder.elt (i)); +} + +/* Subroutine of aarch64_sve_expand_vector_init to handle case + when all trailing elements of builder are same. 
+ This works as follows: + (a) Use expand_insn interface to broadcast last vector element in TARGET. + (b) Insert remaining elements in TARGET using insr. + + ??? The heuristic used is to do above if number of same trailing elements + is at least 3/4 of total number of elements, loosely based on + heuristic from mostly_zeros_p. May need fine-tuning. */ + +static bool +aarch64_sve_expand_vector_init_handle_trailing_same_elem + (rtx target, const rtx_vector_builder &builder, int nelts_reqd) +{ + int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); + if (ndups >= (3 * nelts_reqd) / 4) + { + aarch64_sve_expand_vector_init_insert_elems (target, builder, + nelts_reqd - ndups + 1); + return true; + } + + return false; +} + +/* Initialize register TARGET from BUILDER. NELTS is the constant number + of elements in BUILDER. + + The function tries to initialize TARGET from BUILDER if it fits one + of the special cases outlined below. + + Failing that, the function divides BUILDER into two sub-vectors: + v_even = even elements of BUILDER; + v_odd = odd elements of BUILDER; + + and recursively calls itself with v_even and v_odd. + + if (recursive call succeeded for v_even or v_odd) + TARGET = zip (v_even, v_odd) + + The function returns true if it managed to build TARGET from BUILDER + with one of the special cases, false otherwise. + + Example: {a, 1, b, 2, c, 3, d, 4} + + The vector gets divided into: + v_even = {a, b, c, d} + v_odd = {1, 2, 3, 4} + + aarch64_sve_expand_vector_init(v_odd) hits case 1 and + initialize tmp2 from constant vector v_odd using emit_move_insn. + + aarch64_sve_expand_vector_init(v_even) fails since v_even contains + 4 elements, so we construct tmp1 from v_even using insr: + tmp1 = dup(d) + insr tmp1, c + insr tmp1, b + insr tmp1, a + + And finally: + TARGET = zip (tmp1, tmp2) + which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. 
*/ + +static bool +aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder, + int nelts, int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + + /* Case 1: Vector contains trailing constants. */ + + if (aarch64_sve_expand_vector_init_handle_trailing_constants + (target, builder, nelts, nelts_reqd)) + return true; + + /* Case 2: Vector contains leading constants. */ + + rtx_vector_builder rev_builder (mode, 1, nelts_reqd); + for (int i = 0; i < nelts_reqd; i++) + rev_builder.quick_push (builder.elt (nelts_reqd - i - 1)); + rev_builder.finalize (); + + if (aarch64_sve_expand_vector_init_handle_trailing_constants + (target, rev_builder, nelts, nelts_reqd)) + { + emit_insn (gen_aarch64_sve_rev (mode, target, target)); + return true; + } + + /* Case 3: Vector contains trailing same element. */ + + if (aarch64_sve_expand_vector_init_handle_trailing_same_elem + (target, builder, nelts_reqd)) + return true; + + /* Case 4: Vector contains leading same element. */ + + if (aarch64_sve_expand_vector_init_handle_trailing_same_elem + (target, rev_builder, nelts_reqd) && nelts_reqd == nelts) + { + emit_insn (gen_aarch64_sve_rev (mode, target, target)); + return true; + } + + /* Avoid recursing below 4-elements. + ??? The threshold 4 may need fine-tuning. 
*/ + + if (nelts_reqd <= 4) + return false; + + rtx_vector_builder v_even (mode, 1, nelts); + rtx_vector_builder v_odd (mode, 1, nelts); + + for (int i = 0; i < nelts * 2; i += 2) + { + v_even.quick_push (builder.elt (i)); + v_odd.quick_push (builder.elt (i + 1)); + } + + v_even.finalize (); + v_odd.finalize (); + + rtx tmp1 = gen_reg_rtx (mode); + bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even, + nelts, nelts_reqd / 2); + + rtx tmp2 = gen_reg_rtx (mode); + bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd, + nelts, nelts_reqd / 2); + + if (!did_even_p && !did_odd_p) + return false; + + /* Initialize v_even and v_odd using INSR if it didn't match any of the + special cases and zip v_even, v_odd. */ + + if (!did_even_p) + aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2); + + if (!did_odd_p) + aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2); + + rtvec v = gen_rtvec (2, tmp1, tmp2); + emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); + return true; +} + +/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */ + +void +aarch64_sve_expand_vector_init (rtx target, rtx vals) +{ + machine_mode mode = GET_MODE (target); + int nelts = XVECLEN (vals, 0); + + rtx_vector_builder v (mode, 1, nelts); + for (int i = 0; i < nelts; i++) + v.quick_push (XVECEXP (vals, 0, i)); + v.finalize (); + + /* If neither sub-vectors of v could be initialized specially, + then use INSR to insert all elements from v into TARGET. + ??? This might not be optimal for vectors with large + initializers like 16-element or above. + For nelts < 4, it probably isn't useful to handle specially. 
*/ + + if (nelts < 4 + || !aarch64_sve_expand_vector_init (target, v, nelts, nelts)) + aarch64_sve_expand_vector_init_insert_elems (target, v, nelts); +} + static unsigned HOST_WIDE_INT aarch64_shift_truncation_mask (machine_mode mode) { diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c new file mode 100644 index 00000000000..5c14b603f46 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c @@ -0,0 +1,27 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 1.1: Trailing constants with stepped sequence. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, 1, 2, 3, 4, 5, 6 }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + index z0.s, #1, #1 + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tindex\t(z[0-9]+\.s), #1, #1\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c new file mode 100644 index 00000000000..9d6e2dfc876 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c @@ -0,0 +1,29 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.4: Interleaved repeating elements and non-repeating elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int f) +{ + return (vnx4si) { a, f, b, f, c, f, c, f }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w2 + mov z1.s, w3 + insr z0.s, w1 + ptrue p0.s, vl8 + insr z0.s, w0 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c new file mode 100644 index 00000000000..9a6d8650eea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c @@ -0,0 +1,21 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_10.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int f = 13; + + vnx4si v = foo (a, b, c, f); + int expected[] = { a, f, b, f, c, f, c, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c new file mode 100644 index 00000000000..e50cd54ef13 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c @@ -0,0 +1,28 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.5: Interleaved repeating elements and trailing same elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int f) +{ + return (vnx4si) { a, f, b, f, b, f, b, f }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w1 + mov z1.s, w2 + insr z0.s, w0 + ptrue p0.s, vl8 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w1\n\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\1, w0\n.*\tzip1\t\1, \1, \2} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c new file mode 100644 index 00000000000..4371555818c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_11.c" + +int main() +{ + int a = 10; + int b = 11; + int f = 12; + + vnx4si v = foo (a, b, f); + int expected[] = { a, f, b, f, b, f, b, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c new file mode 100644 index 00000000000..21d9e764360 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c @@ -0,0 +1,30 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.5: Interleaved repeating elements and trailing same elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int f) +{ + return (vnx4si) { b, f, b, f, b, f, a, f }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w0 + mov z1.s, w2 + insr z0.s, w1 + ptrue p0.s, vl8 + insr z0.s, w1 + insr z0.s, w1 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tmov\t(z[0-9]+\.s), w0\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c new file mode 100644 index 00000000000..5ce7edb1e68 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_12.c" + +int main() +{ + int a = 10; + int b = 11; + int f = 12; + + vnx4si v = foo (a, b, f); + int expected[] = { b, f, b, f, b, f, a, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c new file mode 100644 index 00000000000..824a5cbea79 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_1.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, 1, 2, 3, 4, 5, 6 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_2.c b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c new file mode 100644 index 00000000000..a8b2a25b325 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c @@ -0,0 +1,29 @@ +/* { dg-do 
assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 1.2: Trailing constants with repeating sequence. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, 2, 3, 2, 3, 2, 3 }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + adrp x2, .LANCHOR0 + add x2, x2, :lo12:.LANCHOR0 + ld1w z0.s, p0/z, [x2] + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c new file mode 100644 index 00000000000..86c191c7771 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_2.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, 2, 3, 2, 3, 2, 3 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_3.c b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c new file mode 100644 index 00000000000..6b000b887ba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c @@ -0,0 +1,28 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 2.1: Leading constants with stepped sequence. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { 1, 2, 3, 4, 5, 6, a, b }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + index z0.s, #6, #-1 + insr z0.s, w0 + insr z0.s, w1 + rev z0.s, z0.s + ret +*/ + +/* { dg-final { scan-assembler {\tindex\t(z[0-9]+\.s), #6, #-1\n\tinsr\t\1, w0\n\tinsr\t\1, w1\n\trev\t\1, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c new file mode 100644 index 00000000000..ce4de69505f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_3.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { 1, 2, 3, 4, 5, 6, a, b }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_4.c b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c new file mode 100644 index 00000000000..619274928e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c @@ -0,0 +1,30 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 2.2: Leading constants with stepped sequence. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { 3, 2, 3, 2, 3, 2, b, a }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + adrp x2, .LANCHOR0 + add x2, x2, :lo12:.LANCHOR0 + ld1w z0.s, p0/z, [x2] + insr z0.s, w1 + insr z0.s, w0 + rev z0.s, z0.s + ret +*/ + +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c new file mode 100644 index 00000000000..defee421f9f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_4.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { 3, 2, 3, 2, 3, 2, b, a }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c new file mode 100644 index 00000000000..e7fbdd1a2aa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c @@ -0,0 +1,27 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 3: Trailing same element. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c) +{ + return (vnx4si) { a, b, c, c, c, c, c, c }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w2 + ptrue p0.s, vl8 + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c new file mode 100644 index 00000000000..ba91d6fec09 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_5.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + + vnx4si v = foo (a, b, c); + int expected[] = { a, b, c, c, c, c, c, c }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c new file mode 100644 index 00000000000..f6f3da5958d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c @@ -0,0 +1,28 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 3: Trailing same element. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c) +{ + return (vnx4si) { c, c, c, c, c, c, b, a }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w2 + ptrue p0.s, vl8 + insr z0.s, w1 + insr z0.s, w0 + rev z0.s, z0.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c new file mode 100644 index 00000000000..802b28f98e0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_6.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + + vnx4si v = foo (a, b, c); + int expected[] = { c, c, c, c, c, c, b, a }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c new file mode 100644 index 00000000000..e3104a35f13 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c @@ -0,0 +1,32 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.1: All elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int d, int e, int f, int g, int h) +{ + return (vnx4si) { a, b, c, d, e, f, g, h }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w7 + ptrue p0.s, vl8 + insr z0.s, w6 + insr z0.s, w5 + insr z0.s, w4 + insr z0.s, w3 + insr z0.s, w2 + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w7\n.*\tinsr\t\1, w6\n\tinsr\t\1, w5\n\tinsr\t\1, w4\n\tinsr\t\1, w3\n\tinsr\t\1, w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c new file mode 100644 index 00000000000..61fe2850831 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c @@ -0,0 +1,25 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_7.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int d = 13; + int e = 14; + int f = 15; + int g = 16; + int h = 17; + + vnx4si v = foo (a, b, c, d, e, f, g, h); + int expected[] = { a, b, c, d, e, f, g, h }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c new file mode 100644 index 00000000000..7ff3e0849cc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c @@ -0,0 +1,32 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.2: Interleaved elements and constants. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int d) +{ + return (vnx4si) { a, 1, b, 2, c, 3, d, 4 }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + mov z0.s, w3 + adrp x3, .LANCHOR0 + insr z0.s, w2 + add x3, x3, :lo12:.LANCHOR0 + insr z0.s, w1 + ld1w z1.s, p0/z, [x3] + insr z0.s, w0 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c new file mode 100644 index 00000000000..24a0a6e0673 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c @@ -0,0 +1,21 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_8.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int d = 13; + + vnx4si v = foo (a, b, c, d); + int expected[] = { a, 1, b, 2, c, 3, d, 4 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c new file mode 100644 index 00000000000..4d3c59b3bf8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c @@ -0,0 +1,27 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.3: Repeated elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, a, b, a, b, a, b }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w0 + mov z1.s, w1 + ptrue p0.s, vl8 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w0\n\tmov\t(z[0-9]+\.s), w1\n.*\tzip1\t\1, \1, \2} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c new file mode 100644 index 00000000000..636ae3b8b48 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_9.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, a, b, a, b, a, b }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/vector-builder.h b/gcc/vector-builder.h index 9967daa6e4c..9f95b01bc3b 100644 --- a/gcc/vector-builder.h +++ b/gcc/vector-builder.h @@ -96,6 +96,7 @@ public: unsigned int encoded_nelts () const; bool encoded_full_vector_p () const; T elt (unsigned int) const; + unsigned int count_dups (int, int, int) const; bool operator == (const Derived &) const; bool operator != (const Derived &x) const { return !operator == (x); } @@ -223,6 +224,23 @@ vector_builder<T, Derived>::elt (unsigned int i) const derived ()->step (prev, final)); } +/* Return the number of leading duplicate elements in the range + [START:END:STEP]. The value is always at least 1. 
*/ + +template<typename T, typename Derived> +unsigned int +vector_builder<T, Derived>::count_dups (int start, int end, int step) const +{ + gcc_assert ((end - start) % step == 0); + + unsigned int ndups = 1; + for (int i = start + step; + i != end && derived ()->equal_p (elt (i), elt (start)); + i += step) + ndups++; + return ndups; +} + /* Change the encoding to NPATTERNS patterns of NELTS_PER_PATTERN each, but without changing the underlying vector. */ diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 3bd6e815715..0ff0d8fb757 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -3846,6 +3846,10 @@ proc add_options_for_arm_neon_softfp_fp16 { flags } { return "$flags $et_arm_neon_softfp_fp16_flags" } +proc add_options_for_arm_sve { flags } { + return "$flags -march=armv8.2-a+sve" +} + # Return 1 if this is an ARM target supporting the FP16 alternative # format. Some multilibs may be incompatible with the options needed. Also # set et_arm_neon_fp16_flags to the best options to add. @@ -4323,7 +4327,7 @@ proc check_effective_target_aarch64_sve_hw { } { asm volatile ("ptrue p0.b"); return 0; } - }] + } [ add_options_for_arm_sve "" ]] } # Return true if this is an AArch64 target that can run SVE code and @@ -4343,7 +4347,7 @@ proc aarch64_sve_hw_bits { bits } { __builtin_abort (); return 0; } - }]] + }] [add_options_for_arm_sve ""] ] } # Return true if this is an AArch64 target that can run SVE code and
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > On Thu, 30 May 2019 at 21:19, Richard Sandiford > <richard.sandiford@arm.com> wrote: >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> > On Thu, 30 May 2019 at 15:10, Richard Sandiford >> > <richard.sandiford@arm.com> wrote: >> >> >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c >> >> > new file mode 100644 >> >> > index 00000000000..cbfeff4a59c >> >> > --- /dev/null >> >> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c >> >> > @@ -0,0 +1,27 @@ >> >> > +/* { dg-do compile { target aarch64_asm_sve_ok } } */ >> >> > +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ >> >> >> >> Sorry for not noticing last time, but the combination of aarch64_asm_sve_ok >> >> and --save-temps only makes sense for assemble tests, not compile tests. >> >> So these should either be: >> >> >> >> /* { dg-do assemble { target aarch64_asm_sve_ok } } */ >> >> /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ >> >> >> >> or: >> >> >> >> /* { dg-do compile } */ >> >> /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256" } */ >> >> >> >> Might as well as go for the first I guess. Same for the other >> >> non-run tests. >> >> >> >> OK with that change, thanks. >> > Thanks for pointing out, updated the patch with dg-do assemble. >> > Sorry for silly ques - What configure option should be passed to gcc >> > to generate code with -msve-vector-bits=256 by default ? >> > I suppose that'd be necessary for correctness testing, to test patch >> > with run tests that contain initializers and don't explicitly pass >> > -msve-vector-bits=256 ? 
>> >> There's no configure option, but you can test with things like >> --target_board unix/-msve-vector-bits=256 or >> --target_board unix{,/-msve-vector-bits=256} (to test both with >> and without -msve-vector-bits=256). > Hi, > Sorry for late response. I managed to cross-test using qemu and seeing some > fallout: > > 1. Testing pristine trunk with --target_board=arm-qemu/-march=armv8.2-a+sve > results in following ICE's: > FAIL: gcc.c-torture/compile/pr82096.c -O0 (internal compiler error) > FAIL: gcc.c-torture/compile/pr82096.c -O1 (internal compiler error) > FAIL: gcc.c-torture/compile/pr82096.c -O2 (internal compiler error) > FAIL: gcc.c-torture/compile/pr82096.c -O2 -flto > -fno-use-linker-plugin -flto-partition=none (internal compiler error) > FAIL: gcc.c-torture/compile/pr82096.c -O3 -g (internal compiler error) > FAIL: gcc.c-torture/compile/pr82096.c -Os (internal compiler error) > FAIL: gcc.dg/di-longlong64-sync-1.c (internal compiler error) > FAIL: gcc.dg/di-sync-multithread.c (internal compiler error) > FAIL: gcc.target/aarch64/pr87839.c (internal compiler error) > > 2. 
Passing -msve-vector-bits=256 results in following additional > ICE's with trunk: > FAIL: gcc.dg/pr88598-2.c (internal compiler error) > FAIL: gcc.dg/pr88598-3.c (internal compiler error) > FAIL: gcc.dg/pr88598-5.c (internal compiler error) > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O1 (internal > compiler error) > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O2 (internal > compiler error) > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O2 -flto > -fno-use-linker-plugin -flto-partition=none (internal compiler error) > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 > -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer > -finline-functions (internal compiler error) > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 -g > (internal compiler error) > FAIL: c-c++-common/torture/builtin-convertvector-1.c -Os (internal > compiler error) > FAIL: gcc.target/aarch64/pr87839.c (internal compiler error) > FAIL: gfortran.dg/vect/vect-8-epilogue.F90 -O (internal compiler error) > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O1 (internal > compiler error) > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O2 (internal > compiler error) > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O2 -flto > -fno-use-linker-plugin -flto-partition=none (internal compiler error) > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 > -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer > -finline-functions (internal compiler error) > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 -g > (internal compiler error) I've been concentrating on the ACLE branch recently and haven't been tracking trunk, so can well believe it. :-) > Applying patch doesn't results in additional fallout relative to trunk > with -march=armv8.2-a+sve -msve-vector-bits=256. > Is it OK to apply ? Yes, thanks. 
> PS: Initially, I got UNSUPPORTED for SVE tests, because assembler > was rejecting the test "ptrue p0.b" in selector > check_effective_target_aarch64_sve_hw and would accept it only if > passed -march=armv8.2-a+sve explicitly on command line. I worked > around that by patching lib/target-supports.exp to explicitly pass the > option. Not sure if that's the right approach tho ? Ah, OK. I always test SVE with a toolchain configured for SVE by default (to get extra coverage building the target libraries), so I never hit this. For: > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > index 3bd6e815715..0ff0d8fb757 100644 > --- a/gcc/testsuite/lib/target-supports.exp > +++ b/gcc/testsuite/lib/target-supports.exp > @@ -3846,6 +3846,10 @@ proc add_options_for_arm_neon_softfp_fp16 { flags } { > return "$flags $et_arm_neon_softfp_fp16_flags" > } > > +proc add_options_for_arm_sve { flags } { > + return "$flags -march=armv8.2-a+sve" ...this I think we should avoid overriding the flags if they already select SVE, so probably: if { ![istarget aarch64*-*-*] || [check_effective_target_aarch64_sve] } { return "$flags" } Should be "aarch64_sve" rather than "arm_sve". > +} > + > # Return 1 if this is an ARM target supporting the FP16 alternative > # format. Some multilibs may be incompatible with the options needed. Also > # set et_arm_neon_fp16_flags to the best options to add. > @@ -4323,7 +4327,7 @@ proc check_effective_target_aarch64_sve_hw { } { > asm volatile ("ptrue p0.b"); > return 0; > } > - }] > + } [ add_options_for_arm_sve "" ]] > } > > # Return true if this is an AArch64 target that can run SVE code and > @@ -4343,7 +4347,7 @@ proc aarch64_sve_hw_bits { bits } { > __builtin_abort (); > return 0; > } > - }]] > + }] [add_options_for_arm_sve ""] ] > } > > # Return true if this is an AArch64 target that can run SVE code and Think the formatting in the second is preferred over the first (i.e. no spaces inside the [...]). 
Thanks, Richard
On Mon, 3 Jun 2019 at 14:53, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > On Thu, 30 May 2019 at 21:19, Richard Sandiford > > <richard.sandiford@arm.com> wrote: > >> > >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> > On Thu, 30 May 2019 at 15:10, Richard Sandiford > >> > <richard.sandiford@arm.com> wrote: > >> >> > >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > >> >> > new file mode 100644 > >> >> > index 00000000000..cbfeff4a59c > >> >> > --- /dev/null > >> >> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > >> >> > @@ -0,0 +1,27 @@ > >> >> > +/* { dg-do compile { target aarch64_asm_sve_ok } } */ > >> >> > +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > >> >> > >> >> Sorry for not noticing last time, but the combination of aarch64_asm_sve_ok > >> >> and --save-temps only makes sense for assemble tests, not compile tests. > >> >> So these should either be: > >> >> > >> >> /* { dg-do assemble { target aarch64_asm_sve_ok } } */ > >> >> /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > >> >> > >> >> or: > >> >> > >> >> /* { dg-do compile } */ > >> >> /* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256" } */ > >> >> > >> >> Might as well as go for the first I guess. Same for the other > >> >> non-run tests. > >> >> > >> >> OK with that change, thanks. > >> > Thanks for pointing out, updated the patch with dg-do assemble. > >> > Sorry for silly ques - What configure option should be passed to gcc > >> > to generate code with -msve-vector-bits=256 by default ? > >> > I suppose that'd be necessary for correctness testing, to test patch > >> > with run tests that contain initializers and don't explicitly pass > >> > -msve-vector-bits=256 ? 
> >> > >> There's no configure option, but you can test with things like > >> --target_board unix/-msve-vector-bits=256 or > >> --target_board unix{,/-msve-vector-bits=256} (to test both with > >> and without -msve-vector-bits=256). > > Hi, > > Sorry for late response. I managed to cross-test using qemu and seeing some > > fallout: > > > > 1. Testing pristine trunk with --target_board=arm-qemu/-march=armv8.2-a+sve > > results in following ICE's: > > FAIL: gcc.c-torture/compile/pr82096.c -O0 (internal compiler error) > > FAIL: gcc.c-torture/compile/pr82096.c -O1 (internal compiler error) > > FAIL: gcc.c-torture/compile/pr82096.c -O2 (internal compiler error) > > FAIL: gcc.c-torture/compile/pr82096.c -O2 -flto > > -fno-use-linker-plugin -flto-partition=none (internal compiler error) > > FAIL: gcc.c-torture/compile/pr82096.c -O3 -g (internal compiler error) > > FAIL: gcc.c-torture/compile/pr82096.c -Os (internal compiler error) > > FAIL: gcc.dg/di-longlong64-sync-1.c (internal compiler error) > > FAIL: gcc.dg/di-sync-multithread.c (internal compiler error) > > FAIL: gcc.target/aarch64/pr87839.c (internal compiler error) > > > > 2. 
Passing -msve-vector-bits=256 results in following additional > > ICE's with trunk: > > FAIL: gcc.dg/pr88598-2.c (internal compiler error) > > FAIL: gcc.dg/pr88598-3.c (internal compiler error) > > FAIL: gcc.dg/pr88598-5.c (internal compiler error) > > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O1 (internal > > compiler error) > > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O2 (internal > > compiler error) > > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O2 -flto > > -fno-use-linker-plugin -flto-partition=none (internal compiler error) > > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 > > -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer > > -finline-functions (internal compiler error) > > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 -g > > (internal compiler error) > > FAIL: c-c++-common/torture/builtin-convertvector-1.c -Os (internal > > compiler error) > > FAIL: gcc.target/aarch64/pr87839.c (internal compiler error) > > FAIL: gfortran.dg/vect/vect-8-epilogue.F90 -O (internal compiler error) > > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O1 (internal > > compiler error) > > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O2 (internal > > compiler error) > > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O2 -flto > > -fno-use-linker-plugin -flto-partition=none (internal compiler error) > > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 > > -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer > > -finline-functions (internal compiler error) > > FAIL: c-c++-common/torture/builtin-convertvector-1.c -O3 -g > > (internal compiler error) > > I've been concentrating on the ACLE branch recently and haven't been > tracking trunk, so can well believe it. :-) > > > Applying patch doesn't results in additional fallout relative to trunk > > with -march=armv8.2-a+sve -msve-vector-bits=256. > > Is it OK to apply ? > > Yes, thanks. Thanks, committed as r271857. 
> > > PS: Initially, I got UNSUPPORTED for SVE tests, because assembler > > was rejecting the test "ptrue p0.b" in selector > > check_effective_target_aarch64_sve_hw and would accept it only if > > passed -march=armv8.2-a+sve explicitly on command line. I worked > > around that by patching lib/target-supports.exp to explicitly pass the > > option. Not sure if that's the right approach tho ? > > Ah, OK. I always test SVE with a toolchain configured for SVE by default > (to get extra coverage building the target libraries), so I never hit this. > For: > > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > > index 3bd6e815715..0ff0d8fb757 100644 > > --- a/gcc/testsuite/lib/target-supports.exp > > +++ b/gcc/testsuite/lib/target-supports.exp > > @@ -3846,6 +3846,10 @@ proc add_options_for_arm_neon_softfp_fp16 { flags } { > > return "$flags $et_arm_neon_softfp_fp16_flags" > > } > > > > +proc add_options_for_arm_sve { flags } { > > + return "$flags -march=armv8.2-a+sve" > > ...this I think we should avoid overriding the flags if they already > select SVE, so probably: > > if { ![istarget aarch64*-*-*] || [check_effective_target_aarch64_sve] } { > return "$flags" > } > > Should be "aarch64_sve" rather than "arm_sve". > > > +} > > + > > # Return 1 if this is an ARM target supporting the FP16 alternative > > # format. Some multilibs may be incompatible with the options needed. Also > > # set et_arm_neon_fp16_flags to the best options to add. 
> > @@ -4323,7 +4327,7 @@ proc check_effective_target_aarch64_sve_hw { } { > > asm volatile ("ptrue p0.b"); > > return 0; > > } > > - }] > > + } [ add_options_for_arm_sve "" ]] > > } > > > > # Return true if this is an AArch64 target that can run SVE code and > > @@ -4343,7 +4347,7 @@ proc aarch64_sve_hw_bits { bits } { > > __builtin_abort (); > > return 0; > > } > > - }]] > > + }] [add_options_for_arm_sve ""] ] > > } > > > > # Return true if this is an AArch64 target that can run SVE code and > > Think the formatting in the second is preferred over the first (i.e. > no spaces inside the [...]). Does the attached patch look OK ? Thanks, Prathamesh > > Thanks, > Richard diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 3bd6e815715..2b3e5d26004 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -3846,6 +3846,13 @@ proc add_options_for_arm_neon_softfp_fp16 { flags } { return "$flags $et_arm_neon_softfp_fp16_flags" } +proc add_options_for_aarch64_sve { flags } { + if { ![istarget aarch64*-*-*] || [check_effective_target_aarch64_sve] } { + return "$flags" + } + return "$flags -march=armv8.2-a+sve" +} + # Return 1 if this is an ARM target supporting the FP16 alternative # format. Some multilibs may be incompatible with the options needed. Also # set et_arm_neon_fp16_flags to the best options to add. @@ -4323,7 +4330,7 @@ proc check_effective_target_aarch64_sve_hw { } { asm volatile ("ptrue p0.b"); return 0; } - }] + } [add_options_for_aarch64_sve ""]] } # Return true if this is an AArch64 target that can run SVE code and @@ -4343,7 +4350,7 @@ proc aarch64_sve_hw_bits { bits } { __builtin_abort (); return 0; } - }]] + }] [add_options_for_aarch64_sve ""]] } # Return true if this is an AArch64 target that can run SVE code and
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp >> > index 3bd6e815715..0ff0d8fb757 100644 >> > --- a/gcc/testsuite/lib/target-supports.exp >> > +++ b/gcc/testsuite/lib/target-supports.exp >> > @@ -3846,6 +3846,10 @@ proc add_options_for_arm_neon_softfp_fp16 { flags } { >> > return "$flags $et_arm_neon_softfp_fp16_flags" >> > } >> > >> > +proc add_options_for_arm_sve { flags } { >> > + return "$flags -march=armv8.2-a+sve" >> >> ...this I think we should avoid overriding the flags if they already >> select SVE, so probably: >> >> if { ![istarget aarch64*-*-*] || [check_effective_target_aarch64_sve] } { >> return "$flags" >> } >> >> Should be "aarch64_sve" rather than "arm_sve". >> >> > +} >> > + >> > # Return 1 if this is an ARM target supporting the FP16 alternative >> > # format. Some multilibs may be incompatible with the options needed. Also >> > # set et_arm_neon_fp16_flags to the best options to add. >> > @@ -4323,7 +4327,7 @@ proc check_effective_target_aarch64_sve_hw { } { >> > asm volatile ("ptrue p0.b"); >> > return 0; >> > } >> > - }] >> > + } [ add_options_for_arm_sve "" ]] >> > } >> > >> > # Return true if this is an AArch64 target that can run SVE code and >> > @@ -4343,7 +4347,7 @@ proc aarch64_sve_hw_bits { bits } { >> > __builtin_abort (); >> > return 0; >> > } >> > - }]] >> > + }] [add_options_for_arm_sve ""] ] >> > } >> > >> > # Return true if this is an AArch64 target that can run SVE code and >> >> Think the formatting in the second is preferred over the first (i.e. >> no spaces inside the [...]). > Does the attached patch look OK ? Yes, thanks. 
(With a suitable changelog of course :-)) Richard > > Thanks, > Prathamesh >> >> Thanks, >> Richard > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > index 3bd6e815715..2b3e5d26004 100644 > --- a/gcc/testsuite/lib/target-supports.exp > +++ b/gcc/testsuite/lib/target-supports.exp > @@ -3846,6 +3846,13 @@ proc add_options_for_arm_neon_softfp_fp16 { flags } { > return "$flags $et_arm_neon_softfp_fp16_flags" > } > > +proc add_options_for_aarch64_sve { flags } { > + if { ![istarget aarch64*-*-*] || [check_effective_target_aarch64_sve] } { > + return "$flags" > + } > + return "$flags -march=armv8.2-a+sve" > +} > + > # Return 1 if this is an ARM target supporting the FP16 alternative > # format. Some multilibs may be incompatible with the options needed. Also > # set et_arm_neon_fp16_flags to the best options to add. > @@ -4323,7 +4330,7 @@ proc check_effective_target_aarch64_sve_hw { } { > asm volatile ("ptrue p0.b"); > return 0; > } > - }] > + } [add_options_for_aarch64_sve ""]] > } > > # Return true if this is an AArch64 target that can run SVE code and > @@ -4343,7 +4350,7 @@ proc aarch64_sve_hw_bits { bits } { > __builtin_abort (); > return 0; > } > - }]] > + }] [add_options_for_aarch64_sve ""]] > } > > # Return true if this is an AArch64 target that can run SVE code and
On Mon, 3 Jun 2019 at 16:25, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > >> > index 3bd6e815715..0ff0d8fb757 100644 > >> > --- a/gcc/testsuite/lib/target-supports.exp > >> > +++ b/gcc/testsuite/lib/target-supports.exp > >> > @@ -3846,6 +3846,10 @@ proc add_options_for_arm_neon_softfp_fp16 { flags } { > >> > return "$flags $et_arm_neon_softfp_fp16_flags" > >> > } > >> > > >> > +proc add_options_for_arm_sve { flags } { > >> > + return "$flags -march=armv8.2-a+sve" > >> > >> ...this I think we should avoid overriding the flags if they already > >> select SVE, so probably: > >> > >> if { ![istarget aarch64*-*-*] || [check_effective_target_aarch64_sve] } { > >> return "$flags" > >> } > >> > >> Should be "aarch64_sve" rather than "arm_sve". > >> > >> > +} > >> > + > >> > # Return 1 if this is an ARM target supporting the FP16 alternative > >> > # format. Some multilibs may be incompatible with the options needed. Also > >> > # set et_arm_neon_fp16_flags to the best options to add. > >> > @@ -4323,7 +4327,7 @@ proc check_effective_target_aarch64_sve_hw { } { > >> > asm volatile ("ptrue p0.b"); > >> > return 0; > >> > } > >> > - }] > >> > + } [ add_options_for_arm_sve "" ]] > >> > } > >> > > >> > # Return true if this is an AArch64 target that can run SVE code and > >> > @@ -4343,7 +4347,7 @@ proc aarch64_sve_hw_bits { bits } { > >> > __builtin_abort (); > >> > return 0; > >> > } > >> > - }]] > >> > + }] [add_options_for_arm_sve ""] ] > >> > } > >> > > >> > # Return true if this is an AArch64 target that can run SVE code and > >> > >> Think the formatting in the second is preferred over the first (i.e. > >> no spaces inside the [...]). > > Does the attached patch look OK ? > > Yes, thanks. (With a suitable changelog of course :-)) Thanks, committed to trunk. 
I filed the following PRs based on the testsuite fallout: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90722 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90723 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90724 Thanks, Prathamesh > > Richard > > > > > Thanks, > > Prathamesh > >> > >> Thanks, > >> Richard > > > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > > index 3bd6e815715..2b3e5d26004 100644 > > --- a/gcc/testsuite/lib/target-supports.exp > > +++ b/gcc/testsuite/lib/target-supports.exp > > @@ -3846,6 +3846,13 @@ proc add_options_for_arm_neon_softfp_fp16 { flags } { > > return "$flags $et_arm_neon_softfp_fp16_flags" > > } > > > > +proc add_options_for_aarch64_sve { flags } { > > + if { ![istarget aarch64*-*-*] || [check_effective_target_aarch64_sve] } { > > + return "$flags" > > + } > > + return "$flags -march=armv8.2-a+sve" > > +} > > + > > # Return 1 if this is an ARM target supporting the FP16 alternative > > # format. Some multilibs may be incompatible with the options needed. Also > > # set et_arm_neon_fp16_flags to the best options to add. > > @@ -4323,7 +4330,7 @@ proc check_effective_target_aarch64_sve_hw { } { > > asm volatile ("ptrue p0.b"); > > return 0; > > } > > - }] > > + } [add_options_for_aarch64_sve ""]] > > } > > > > # Return true if this is an AArch64 target that can run SVE code and > > @@ -4343,7 +4350,7 @@ proc aarch64_sve_hw_bits { bits } { > > __builtin_abort (); > > return 0; > > } > > - }]] > > + }] [add_options_for_aarch64_sve ""]] > > } > > > > # Return true if this is an AArch64 target that can run SVE code and
On 03/06/2019 08:26, Prathamesh Kulkarni wrote: > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c > @@ -0,0 +1,32 @@ > +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ > +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > + > +/* Case 5.2: Interleaved elements and constants. */ > + > +#include <stdint.h> > + > +typedef int32_t vnx4si __attribute__((vector_size (32))); > + > +__attribute__((noipa)) > +vnx4si foo(int a, int b, int c, int d) > +{ > + return (vnx4si) { a, 1, b, 2, c, 3, d, 4 }; > +} > + > +/* > +foo: > +.LFB0: > + .cfi_startproc > + ptrue p0.s, vl8 > + mov z0.s, w3 > + adrp x3, .LANCHOR0 > + insr z0.s, w2 > + add x3, x3, :lo12:.LANCHOR0 > + insr z0.s, w1 > + ld1w z1.s, p0/z, [x3] > + insr z0.s, w0 > + zip1 z0.s, z0.s, z1.s > + ret > +*/ > + > +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */ This fails with the tiny code model when I'm testing aarch64-none-elf: $ make check-c 'RUNTESTFLAGS=--target_board=aarch64-elf-qemu{-mcmodel=tiny} aarch64-sve.exp=init_8.c' ... FAIL: gcc.target/aarch64/sve/init_8.c -march=armv8.2-a+sve scan-assembler \\tmov\\t(z[0-9]+\\.s), w3\\n\\tadrp\\t(x[0-9]+), \\.LANCHOR0\\n\\tinsr\\t\\1, w2\\n\\tadd\\t\\2, \\2, :lo12:\\.LANCHOR0\\n\\tinsr\\t\\1, w1\\n\\tld1w\\t(z[0-9]+\\.s), p[0-9]+/z, \\[\\2\\]\\n\\tinsr\\t\\1, w0\\n\\tzip1\\t\\1, \\1, \\3 I think you need a conditional scan-assembler for { target aarch64_small } and { target aarch64_tiny }, or just skip the test for tiny, but even then matching exact register names and instruction scheduling seems fragile.
tiny code: .arch armv8.2-a+crc+sve .file "init_8.c" .text .align 2 .p2align 3,,7 .global foo .type foo, %function foo: ptrue p0.s, vl8 adr x4, .LC0 mov z0.s, w3 ld1w z1.s, p0/z, [x4] insr z0.s, w2 insr z0.s, w1 insr z0.s, w0 zip1 z0.s, z0.s, z1.s st1w z0.s, p0, [x8] ret .size foo, .-foo .align 4 .LC0: .word 1 .word 2 .word 3 .word 4 .word 3 .word 4 .word 3 .word 4
Szabolcs Nagy <Szabolcs.Nagy@arm.com> writes: > On 03/06/2019 08:26, Prathamesh Kulkarni wrote: >> +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c >> @@ -0,0 +1,32 @@ >> +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ >> +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ >> + >> +/* Case 5.2: Interleaved elements and constants. */ >> + >> +#include <stdint.h> >> + >> +typedef int32_t vnx4si __attribute__((vector_size (32))); >> + >> +__attribute__((noipa)) >> +vnx4si foo(int a, int b, int c, int d) >> +{ >> + return (vnx4si) { a, 1, b, 2, c, 3, d, 4 }; >> +} >> + >> +/* >> +foo: >> +.LFB0: >> + .cfi_startproc >> + ptrue p0.s, vl8 >> + mov z0.s, w3 >> + adrp x3, .LANCHOR0 >> + insr z0.s, w2 >> + add x3, x3, :lo12:.LANCHOR0 >> + insr z0.s, w1 >> + ld1w z1.s, p0/z, [x3] >> + insr z0.s, w0 >> + zip1 z0.s, z0.s, z1.s >> + ret >> +*/ >> + >> +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */ > > this fails with tiny model when i'm testing aarch64-none-elf > > $ make check-c 'RUNTESTFLAGS=--target_board=aarch64-elf-qemu{-mcmodel=tiny} aarch64-sve.exp=init_8.c' > ... > FAIL: gcc.target/aarch64/sve/init_8.c -march=armv8.2-a+sve scan-assembler \\tmov\\t(z[0-9]+\\.s), w3\\n\\tadrp\\t(x[0-9]+), > \\.LANCHOR0\\n\\tinsr\\t\\1, w2\\n\\tadd\\t\\2, \\2, :lo12:\\.LANCHOR0\\n\\tinsr\\t\\1, w1\\n\\tld1w\\t(z[0-9]+\\.s), p[0-9]+/z, > \\[\\2\\]\\n\\tinsr\\t\\1, w0\\n\\tzip1\\t\\1, \\1, \\3 > > i think you need conditional scan asm for { target aarch64_small } > and { target aarch64_tiny } or just skip the test for tiny, Maybe we should remove the address calculation and replace the ld1w address with \[[^]]*\]. All that really matters for this test is that the vector is loaded from memory. 
> but even then matching exact register name and instruction scheduling > seems fragile. The only hard-coded register names are the parameters, which are guaranteed by the ABI. Testing for those should be fine. The dg-options pass -fno-schedule-insns, but I guess they should also pass -fno-schedule-insns2. Or maybe just use -O instead. We can always revisit this later if even that isn't enough to make the order stable. Richard
On Thu, 6 Jun 2019 at 16:54, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Szabolcs Nagy <Szabolcs.Nagy@arm.com> writes: > > On 03/06/2019 08:26, Prathamesh Kulkarni wrote: > >> +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c > >> @@ -0,0 +1,32 @@ > >> +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ > >> +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > >> + > >> +/* Case 5.2: Interleaved elements and constants. */ > >> + > >> +#include <stdint.h> > >> + > >> +typedef int32_t vnx4si __attribute__((vector_size (32))); > >> + > >> +__attribute__((noipa)) > >> +vnx4si foo(int a, int b, int c, int d) > >> +{ > >> + return (vnx4si) { a, 1, b, 2, c, 3, d, 4 }; > >> +} > >> + > >> +/* > >> +foo: > >> +.LFB0: > >> + .cfi_startproc > >> + ptrue p0.s, vl8 > >> + mov z0.s, w3 > >> + adrp x3, .LANCHOR0 > >> + insr z0.s, w2 > >> + add x3, x3, :lo12:.LANCHOR0 > >> + insr z0.s, w1 > >> + ld1w z1.s, p0/z, [x3] > >> + insr z0.s, w0 > >> + zip1 z0.s, z0.s, z1.s > >> + ret > >> +*/ > >> + > >> +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */ > > > > this fails with tiny model when i'm testing aarch64-none-elf > > > > $ make check-c 'RUNTESTFLAGS=--target_board=aarch64-elf-qemu{-mcmodel=tiny} aarch64-sve.exp=init_8.c' > > ... 
> > FAIL: gcc.target/aarch64/sve/init_8.c -march=armv8.2-a+sve scan-assembler \\tmov\\t(z[0-9]+\\.s), w3\\n\\tadrp\\t(x[0-9]+), > > \\.LANCHOR0\\n\\tinsr\\t\\1, w2\\n\\tadd\\t\\2, \\2, :lo12:\\.LANCHOR0\\n\\tinsr\\t\\1, w1\\n\\tld1w\\t(z[0-9]+\\.s), p[0-9]+/z, > > \\[\\2\\]\\n\\tinsr\\t\\1, w0\\n\\tzip1\\t\\1, \\1, \\3 > > > > i think you need conditional scan asm for { target aarch64_small } > > and { target aarch64_tiny } or just skip the test for tiny, > > Maybe we should remove the address calculation and replace the ld1w > address with \[[^]]*\]. All that really matters for this test is that > the vector is loaded from memory. > > > but even then matching exact register name and instruction scheduling > > seems fragile. > > The only hard-coded register names are the parameters, which are > guaranteed by the ABI. Testing for those should be fine. > > The dg-options pass -fno-schedule-insns, but I guess they should > also pass -fno-schedule-insns2. Or maybe just use -O instead. > We can always revisit this later if even that isn't enough to make > the order stable. Thanks for the suggestions. Passing -fno-schedule-insns2 does seem to make the order stable. For init_1.c to init_4.c there were no intervening instructions, and for the remaining tests, the patch passes -fno-schedule-insns2 and adjusts dg-scan accordingly. I verified the tests pass with -mcmodel=tiny. OK to commit? Thanks, Prathamesh > > Richard 2019-06-07 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> * gcc.target/aarch64/sve/init_5.c: Pass -fno-schedule-insns2. Update assembly in comments and adjust dg-scan. * gcc.target/aarch64/sve/init_6.c: Likewise. * gcc.target/aarch64/sve/init_7.c: Likewise. * gcc.target/aarch64/sve/init_8.c: Likewise. * gcc.target/aarch64/sve/init_9.c: Likewise. * gcc.target/aarch64/sve/init_10.c: Likewise. * gcc.target/aarch64/sve/init_11.c: Likewise. * gcc.target/aarch64/sve/init_12.c: Likewise.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c index 9d6e2dfc876..08437e5d8f1 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -msve-vector-bits=256 --save-temps" } */ /* Case 5.4: Interleaved repeating elements and non-repeating elements. */ @@ -17,13 +17,14 @@ vnx4si foo(int a, int b, int c, int f) foo: .LFB0: .cfi_startproc - mov z0.s, w2 mov z1.s, w3 + mov z0.s, w2 insr z0.s, w1 - ptrue p0.s, vl8 insr z0.s, w0 zip1 z0.s, z0.s, z1.s + ptrue p0.s, vl8 + st1w z0.s, p0, [x8] ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c index e50cd54ef13..786765dbfb7 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_11.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -msve-vector-bits=256 --save-temps" } */ /* Case 5.5: Interleaved repeating elements and trailing same elements. 
*/ @@ -18,11 +18,12 @@ foo: .LFB0: .cfi_startproc mov z0.s, w1 - mov z1.s, w2 insr z0.s, w0 - ptrue p0.s, vl8 + mov z1.s, w2 zip1 z0.s, z0.s, z1.s + ptrue p0.s, vl8 + st1w z0.s, p0, [x8] ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w1\n\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\1, w0\n.*\tzip1\t\1, \1, \2} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w1\n\tinsr\t\1, w0\n\tmov\t(z[0-9]+\.s), w2\n\tzip1\t\1, \1, \2} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c index 21d9e764360..e65b1af475c 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_12.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -msve-vector-bits=256 --save-temps" } */ /* Case 5.5: Interleaved repeating elements and trailing same elements. 
*/ @@ -17,14 +17,15 @@ vnx4si foo(int a, int b, int f) foo: .LFB0: .cfi_startproc - mov z0.s, w0 mov z1.s, w2 + mov z0.s, w0 insr z0.s, w1 - ptrue p0.s, vl8 insr z0.s, w1 insr z0.s, w1 zip1 z0.s, z0.s, z1.s + ptrue p0.s, vl8 + st1w z0.s, p0, [x8] ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tmov\t(z[0-9]+\.s), w0\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tzip1\t\2, \2, \1} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tmov\t(z[0-9]+\.s), w0\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c index e7fbdd1a2aa..acab6d2d405 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_5.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -msve-vector-bits=256 --save-temps" } */ /* Case 3: Trailing same element. 
*/ @@ -18,10 +18,11 @@ foo: .LFB0: .cfi_startproc mov z0.s, w2 - ptrue p0.s, vl8 insr z0.s, w1 insr z0.s, w0 + ptrue p0.s, vl8 + st1w z0.s, p0, [x8] ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c index f6f3da5958d..fd6d4b9b85a 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_6.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -msve-vector-bits=256 --save-temps" } */ /* Case 3: Trailing same element. */ @@ -18,11 +18,12 @@ foo: .LFB0: .cfi_startproc mov z0.s, w2 - ptrue p0.s, vl8 insr z0.s, w1 insr z0.s, w0 rev z0.s, z0.s + ptrue p0.s, vl8 + st1w z0.s, p0, [x8] ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c index e3104a35f13..cf6926d3a73 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_7.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -msve-vector-bits=256 --save-temps" } */ /* Case 5.1: All elements. 
*/ @@ -18,7 +18,6 @@ foo: .LFB0: .cfi_startproc mov z0.s, w7 - ptrue p0.s, vl8 insr z0.s, w6 insr z0.s, w5 insr z0.s, w4 @@ -26,7 +25,9 @@ foo: insr z0.s, w2 insr z0.s, w1 insr z0.s, w0 + ptrue p0.s, vl8 + st1w z0.s, p0, [x8] ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w7\n.*\tinsr\t\1, w6\n\tinsr\t\1, w5\n\tinsr\t\1, w4\n\tinsr\t\1, w3\n\tinsr\t\1, w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w7\n\tinsr\t\1, w6\n\tinsr\t\1, w5\n\tinsr\t\1, w4\n\tinsr\t\1, w3\n\tinsr\t\1, w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c index 7ff3e0849cc..b3ed32e4c82 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_8.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -msve-vector-bits=256 --save-temps" } */ /* Case 5.2: Interleaved elements and constants. 
*/ @@ -18,15 +18,16 @@ foo: .LFB0: .cfi_startproc ptrue p0.s, vl8 + adrp x4, .LANCHOR0 + add x4, x4, :lo12:.LANCHOR0 + ld1w z1.s, p0/z, [x4] mov z0.s, w3 - adrp x3, .LANCHOR0 insr z0.s, w2 - add x3, x3, :lo12:.LANCHOR0 insr z0.s, w1 - ld1w z1.s, p0/z, [x3] insr z0.s, w0 zip1 z0.s, z0.s, z1.s + st1w z0.s, p0, [x8] ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */ +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tmov\t(z[0-9]+\.s), w3\n\tinsr\t\2, w2\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c index 4d3c59b3bf8..333bd4f2852 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_9.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -msve-vector-bits=256 --save-temps" } */ /* Case 5.3: Repeated elements. */ @@ -19,9 +19,10 @@ foo: .cfi_startproc mov z0.s, w0 mov z1.s, w1 - ptrue p0.s, vl8 zip1 z0.s, z0.s, z1.s + ptrue p0.s, vl8 + st1w z0.s, p0, [x8] ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w0\n\tmov\t(z[0-9]+\.s), w1\n.*\tzip1\t\1, \1, \2} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w0\n\tmov\t(z[0-9]+\.s), w1\n\tzip1\t\1, \1, \2} } } */
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > On Thu, 6 Jun 2019 at 16:54, Richard Sandiford > <richard.sandiford@arm.com> wrote: >> >> Szabolcs Nagy <Szabolcs.Nagy@arm.com> writes: >> > On 03/06/2019 08:26, Prathamesh Kulkarni wrote: >> >> +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c >> >> @@ -0,0 +1,32 @@ >> >> +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ >> >> +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ >> >> + >> >> +/* Case 5.2: Interleaved elements and constants. */ >> >> + >> >> +#include <stdint.h> >> >> + >> >> +typedef int32_t vnx4si __attribute__((vector_size (32))); >> >> + >> >> +__attribute__((noipa)) >> >> +vnx4si foo(int a, int b, int c, int d) >> >> +{ >> >> + return (vnx4si) { a, 1, b, 2, c, 3, d, 4 }; >> >> +} >> >> + >> >> +/* >> >> +foo: >> >> +.LFB0: >> >> + .cfi_startproc >> >> + ptrue p0.s, vl8 >> >> + mov z0.s, w3 >> >> + adrp x3, .LANCHOR0 >> >> + insr z0.s, w2 >> >> + add x3, x3, :lo12:.LANCHOR0 >> >> + insr z0.s, w1 >> >> + ld1w z1.s, p0/z, [x3] >> >> + insr z0.s, w0 >> >> + zip1 z0.s, z0.s, z1.s >> >> + ret >> >> +*/ >> >> + >> >> +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */ >> > >> > this fails with tiny model when i'm testing aarch64-none-elf >> > >> > $ make check-c 'RUNTESTFLAGS=--target_board=aarch64-elf-qemu{-mcmodel=tiny} aarch64-sve.exp=init_8.c' >> > ... 
>> > FAIL: gcc.target/aarch64/sve/init_8.c -march=armv8.2-a+sve scan-assembler \\tmov\\t(z[0-9]+\\.s), w3\\n\\tadrp\\t(x[0-9]+), >> > \\.LANCHOR0\\n\\tinsr\\t\\1, w2\\n\\tadd\\t\\2, \\2, :lo12:\\.LANCHOR0\\n\\tinsr\\t\\1, w1\\n\\tld1w\\t(z[0-9]+\\.s), p[0-9]+/z, >> > \\[\\2\\]\\n\\tinsr\\t\\1, w0\\n\\tzip1\\t\\1, \\1, \\3 >> > >> > i think you need conditional scan asm for { target aarch64_small } >> > and { target aarch64_tiny } or just skip the test for tiny, >> >> Maybe we should remove the address calculation and replace the ld1w >> address with \[[^]]*\]. All that really matters for this test is that >> the vector is loaded from memory. >> >> > but even then matching exact register name and instruction scheduling >> > seems fragile. >> >> The only hard-coded register names are the parameters, which are >> guaranteed by the ABI. Testing for those should be fine. >> >> The dg-options pass -fno-schedule-insns, but I guess they should >> also pass -fno-schedule-insns2. Or maybe just use -O instead. >> We can always revisit this later if even that isn't enough to make >> the order stable. > Thanks for the suggestions. Passing -fno-schedule-insns2 does seem to > make the order stable. > For init_1.c to init_4.c there were no intervening instructions, and > for remaining tests, the patch passes -fno-schedule-insns2 > and adjusts dg-scan accordingly. I verified the tests pass with -mcmodel=tiny. I think we should use consistent options for all the tests, though. So either we should add -fno-schedule-insns2 to all of them, or we should switch to -O. TBH -O seems easier :-) (I checked that all tests do still pass with -O.)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c > index 9d6e2dfc876..08437e5d8f1 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c > @@ -1,5 +1,5 @@ > /* { dg-do assemble { target aarch64_asm_sve_ok } } */ > -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -msve-vector-bits=256 --save-temps" } */ > > /* Case 5.4: Interleaved repeating elements and non-repeating elements. */ > > @@ -17,13 +17,14 @@ vnx4si foo(int a, int b, int c, int f) > foo: > .LFB0: > .cfi_startproc > - mov z0.s, w2 > mov z1.s, w3 > + mov z0.s, w2 > insr z0.s, w1 > - ptrue p0.s, vl8 > insr z0.s, w0 > zip1 z0.s, z0.s, z1.s > + ptrue p0.s, vl8 > + st1w z0.s, p0, [x8] > ret > */ > > -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ > +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ You're reintroducing the st1w as part of the asms. We should either do that for all the tests or leave it out. Thanks, Richard
On Fri, 7 Jun 2019 at 18:26, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > On Thu, 6 Jun 2019 at 16:54, Richard Sandiford > > <richard.sandiford@arm.com> wrote: > >> > >> Szabolcs Nagy <Szabolcs.Nagy@arm.com> writes: > >> > On 03/06/2019 08:26, Prathamesh Kulkarni wrote: > >> >> +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c > >> >> @@ -0,0 +1,32 @@ > >> >> +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ > >> >> +/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > >> >> + > >> >> +/* Case 5.2: Interleaved elements and constants. */ > >> >> + > >> >> +#include <stdint.h> > >> >> + > >> >> +typedef int32_t vnx4si __attribute__((vector_size (32))); > >> >> + > >> >> +__attribute__((noipa)) > >> >> +vnx4si foo(int a, int b, int c, int d) > >> >> +{ > >> >> + return (vnx4si) { a, 1, b, 2, c, 3, d, 4 }; > >> >> +} > >> >> + > >> >> +/* > >> >> +foo: > >> >> +.LFB0: > >> >> + .cfi_startproc > >> >> + ptrue p0.s, vl8 > >> >> + mov z0.s, w3 > >> >> + adrp x3, .LANCHOR0 > >> >> + insr z0.s, w2 > >> >> + add x3, x3, :lo12:.LANCHOR0 > >> >> + insr z0.s, w1 > >> >> + ld1w z1.s, p0/z, [x3] > >> >> + insr z0.s, w0 > >> >> + zip1 z0.s, z0.s, z1.s > >> >> + ret > >> >> +*/ > >> >> + > >> >> +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */ > >> > > >> > this fails with tiny model when i'm testing aarch64-none-elf > >> > > >> > $ make check-c 'RUNTESTFLAGS=--target_board=aarch64-elf-qemu{-mcmodel=tiny} aarch64-sve.exp=init_8.c' > >> > ... 
> >> > FAIL: gcc.target/aarch64/sve/init_8.c -march=armv8.2-a+sve scan-assembler \\tmov\\t(z[0-9]+\\.s), w3\\n\\tadrp\\t(x[0-9]+), > >> > \\.LANCHOR0\\n\\tinsr\\t\\1, w2\\n\\tadd\\t\\2, \\2, :lo12:\\.LANCHOR0\\n\\tinsr\\t\\1, w1\\n\\tld1w\\t(z[0-9]+\\.s), p[0-9]+/z, > >> > \\[\\2\\]\\n\\tinsr\\t\\1, w0\\n\\tzip1\\t\\1, \\1, \\3 > >> > > >> > i think you need conditional scan asm for { target aarch64_small } > >> > and { target aarch64_tiny } or just skip the test for tiny, > >> > >> Maybe we should remove the address calculation and replace the ld1w > >> address with \[[^]]*\]. All that really matters for this test is that > >> the vector is loaded from memory. > >> > >> > but even then matching exact register name and instruction scheduling > >> > seems fragile. > >> > >> The only hard-coded register names are the parameters, which are > >> guaranteed by the ABI. Testing for those should be fine. > >> > >> The dg-options pass -fno-schedule-insns, but I guess they should > >> also pass -fno-schedule-insns2. Or maybe just use -O instead. > >> We can always revisit this later if even that isn't enough to make > >> the order stable. > > Thanks for the suggestions. Passing -fno-schedule-insns2 does seem to > > make the order stable. > > For init_1.c to init_4.c there were no intervening instructions, and > > for remaining tests, the patch passes -fno-schedule-insns2 > > and adjusts dg-scan accordingly. I verified the tests pass with -mcmodel=tiny. > > I think we should use consistent options for all the test though. > So either we should add -fno-schedule-insns2 to all of them, > or we should switch to -O. TBH -O seems easier :-) (I checked > that all tests do still pass with -O.) 
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c > > index 9d6e2dfc876..08437e5d8f1 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c > > @@ -1,5 +1,5 @@ > > /* { dg-do assemble { target aarch64_asm_sve_ok } } */ > > -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > > +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -msve-vector-bits=256 --save-temps" } */ > > > > /* Case 5.4: Interleaved repeating elements and non-repeating elements. */ > > > > @@ -17,13 +17,14 @@ vnx4si foo(int a, int b, int c, int f) > > foo: > > .LFB0: > > .cfi_startproc > > - mov z0.s, w2 > > mov z1.s, w3 > > + mov z0.s, w2 > > insr z0.s, w1 > > - ptrue p0.s, vl8 > > insr z0.s, w0 > > zip1 z0.s, z0.s, z1.s > > + ptrue p0.s, vl8 > > + st1w z0.s, p0, [x8] > > ret > > */ > > > > -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ > > +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ > > You're reintroducing the st1w as part of the asms. We should either > do that for all the tests or leave it out. Oops, sorry about that. The attached patch removes st1w and passes -O for all tests. OK to commit? Thanks, Prathamesh > > Thanks, > Richard 2019-06-07 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> * gcc.target/aarch64/sve/init_1.c: Remove options -O2 -fno-schedule-insns and instead pass -O. Update assembly in comments. * gcc.target/aarch64/sve/init_2.c: Likewise. * gcc.target/aarch64/sve/init_3.c: Likewise. * gcc.target/aarch64/sve/init_4.c: Likewise. * gcc.target/aarch64/sve/init_5.c: Likewise and additionally adjust dg-scan. * gcc.target/aarch64/sve/init_6.c: Likewise.
* gcc.target/aarch64/sve/init_7.c: Likewise. * gcc.target/aarch64/sve/init_8.c: Likewise. * gcc.target/aarch64/sve/init_9.c: Likewise. * gcc.target/aarch64/sve/init_10.c: Likewise. * gcc.target/aarch64/sve/init_11.c: Likewise. * gcc.target/aarch64/sve/init_12.c: Likewise. diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c index 5c14b603f46..4f18088f3b0 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 1.1: Trailing constants with stepped sequence. */ @@ -17,10 +17,10 @@ vnx4si foo(int a, int b) foo: .LFB0: .cfi_startproc - ptrue p0.s, vl8 index z0.s, #1, #1 insr z0.s, w1 insr z0.s, w0 + ptrue p0.s, vl8 ret */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c index 9d6e2dfc876..1ee1db723e6 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 5.4: Interleaved repeating elements and non-repeating elements. 
*/ @@ -17,13 +17,13 @@ vnx4si foo(int a, int b, int c, int f) foo: .LFB0: .cfi_startproc - mov z0.s, w2 mov z1.s, w3 + mov z0.s, w2 insr z0.s, w1 - ptrue p0.s, vl8 insr z0.s, w0 zip1 z0.s, z0.s, z1.s + ptrue p0.s, vl8 ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c index e50cd54ef13..0b3c4a51198 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_11.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 5.5: Interleaved repeating elements and trailing same elements. 
*/ @@ -18,11 +18,11 @@ foo: .LFB0: .cfi_startproc mov z0.s, w1 - mov z1.s, w2 insr z0.s, w0 - ptrue p0.s, vl8 + mov z1.s, w2 zip1 z0.s, z0.s, z1.s + ptrue p0.s, vl8 ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w1\n\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\1, w0\n.*\tzip1\t\1, \1, \2} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w1\n\tinsr\t\1, w0\n\tmov\t(z[0-9]+\.s), w2\n\tzip1\t\1, \1, \2} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c index 21d9e764360..2473a5ecb80 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_12.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 5.5: Interleaved repeating elements and trailing same elements. */ @@ -17,14 +17,14 @@ vnx4si foo(int a, int b, int f) foo: .LFB0: .cfi_startproc - mov z0.s, w0 mov z1.s, w2 + mov z0.s, w0 insr z0.s, w1 - ptrue p0.s, vl8 insr z0.s, w1 insr z0.s, w1 zip1 z0.s, z0.s, z1.s + ptrue p0.s, vl8 ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tmov\t(z[0-9]+\.s), w0\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tzip1\t\2, \2, \1} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tmov\t(z[0-9]+\.s), w0\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_2.c b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c index a8b2a25b325..5b4ba105af2 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 1.2: 
Trailing constants with repeating sequence. */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_3.c b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c index 6b000b887ba..62f31b75efd 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_3.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 2.1: Leading constants with stepped sequence. */ @@ -17,11 +17,11 @@ vnx4si foo(int a, int b) foo: .LFB0: .cfi_startproc - ptrue p0.s, vl8 index z0.s, #6, #-1 insr z0.s, w0 insr z0.s, w1 rev z0.s, z0.s + ptrue p0.s, vl8 ret */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_4.c b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c index 619274928e4..94484b1a4e6 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 2.2: Leading constants with stepped sequence. */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c index e7fbdd1a2aa..0a0e8ebd1fe 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_5.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 3: Trailing same element. 
*/ @@ -18,10 +18,10 @@ foo: .LFB0: .cfi_startproc mov z0.s, w2 - ptrue p0.s, vl8 insr z0.s, w1 insr z0.s, w0 + ptrue p0.s, vl8 ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c index f6f3da5958d..10eca3a9001 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_6.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 3: Trailing same element. */ @@ -18,11 +18,11 @@ foo: .LFB0: .cfi_startproc mov z0.s, w2 - ptrue p0.s, vl8 insr z0.s, w1 insr z0.s, w0 rev z0.s, z0.s + ptrue p0.s, vl8 ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c index e3104a35f13..d83fa9c08f2 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_7.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 5.1: All elements. 
*/ @@ -18,7 +18,6 @@ foo: .LFB0: .cfi_startproc mov z0.s, w7 - ptrue p0.s, vl8 insr z0.s, w6 insr z0.s, w5 insr z0.s, w4 @@ -26,7 +25,8 @@ foo: insr z0.s, w2 insr z0.s, w1 insr z0.s, w0 + ptrue p0.s, vl8 ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w7\n.*\tinsr\t\1, w6\n\tinsr\t\1, w5\n\tinsr\t\1, w4\n\tinsr\t\1, w3\n\tinsr\t\1, w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w7\n\tinsr\t\1, w6\n\tinsr\t\1, w5\n\tinsr\t\1, w4\n\tinsr\t\1, w3\n\tinsr\t\1, w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c index 7ff3e0849cc..73f7aba3df3 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_8.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 5.2: Interleaved elements and constants. 
*/ @@ -18,15 +18,15 @@ foo: .LFB0: .cfi_startproc ptrue p0.s, vl8 + adrp x4, .LANCHOR0 + add x4, x4, :lo12:.LANCHOR0 + ld1w z1.s, p0/z, [x4] mov z0.s, w3 - adrp x3, .LANCHOR0 insr z0.s, w2 - add x3, x3, :lo12:.LANCHOR0 insr z0.s, w1 - ld1w z1.s, p0/z, [x3] insr z0.s, w0 zip1 z0.s, z0.s, z1.s ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */ +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tmov\t(z[0-9]+\.s), w3\n\tinsr\t\2, w2\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c index 4d3c59b3bf8..668b4efdbc5 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/init_9.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ /* Case 5.3: Repeated elements. */ @@ -19,9 +19,9 @@ foo: .cfi_startproc mov z0.s, w0 mov z1.s, w1 - ptrue p0.s, vl8 zip1 z0.s, z0.s, z1.s + ptrue p0.s, vl8 ret */ -/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w0\n\tmov\t(z[0-9]+\.s), w1\n.*\tzip1\t\1, \1, \2} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w0\n\tmov\t(z[0-9]+\.s), w1\n\tzip1\t\1, \1, \2} } } */
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > 2019-06-07 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> > > * gcc.target/aarch64/sve/init_1.c: Remove options > -O2 -fno-schedule-insns and instead pass -O. > Update assembly in comments. > * gcc.target/aarch64/sve/init_2.c: Likewise. > * gcc.target/aarch64/sve/init_3.c: Likewise. > * gcc.target/aarch64/sve/init_4.c: Likewise. > * gcc.target/aarch64/sve/init_5.c: Likewise and additionally > adjust dg-scan. > * gcc.target/aarch64/sve/init_6.c: Likewise. > * gcc.target/aarch64/sve/init_7.c: Likewise. > * gcc.target/aarch64/sve/init_8.c: Likewise. > * gcc.target/aarch64/sve/init_9.c: Likewise. > * gcc.target/aarch64/sve/init_10.c: Likewise. > * gcc.target/aarch64/sve/init_11.c: Likewise. > * gcc.target/aarch64/sve/init_12.c: Likewise. > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > index 5c14b603f46..4f18088f3b0 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > @@ -1,5 +1,5 @@ > /* { dg-do assemble { target aarch64_asm_sve_ok } } */ > -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ > > /* Case 1.1: Trailing constants with stepped sequence. */ > > @@ -17,10 +17,10 @@ vnx4si foo(int a, int b) > foo: > .LFB0: > .cfi_startproc > - ptrue p0.s, vl8 > index z0.s, #1, #1 > insr z0.s, w1 > insr z0.s, w0 > + ptrue p0.s, vl8 > ret > */ Let's drop the ptrues as well, since they only exist to feed the st1ws and are rightly not part of the matched code. (Same for all tests that have a ptrue at the end.) OK with that change, thanks. Richard
On Fri, 7 Jun 2019 at 22:47, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > 2019-06-07 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> > > > > * gcc.target/aarch64/sve/init_1.c: Remove options > > -O2 -fno-schedule-insns and instead pass -O. > > Update assembly in comments. > > * gcc.target/aarch64/sve/init_2.c: Likewise. > > * gcc.target/aarch64/sve/init_3.c: Likewise. > > * gcc.target/aarch64/sve/init_4.c: Likewise. > > * gcc.target/aarch64/sve/init_5.c: Likewise and additionally > > adjust dg-scan. > > * gcc.target/aarch64/sve/init_6.c: Likewise. > > * gcc.target/aarch64/sve/init_7.c: Likewise. > > * gcc.target/aarch64/sve/init_8.c: Likewise. > > * gcc.target/aarch64/sve/init_9.c: Likewise. > > * gcc.target/aarch64/sve/init_10.c: Likewise. > > * gcc.target/aarch64/sve/init_11.c: Likewise. > > * gcc.target/aarch64/sve/init_12.c: Likewise. > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > > index 5c14b603f46..4f18088f3b0 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c > > @@ -1,5 +1,5 @@ > > /* { dg-do assemble { target aarch64_asm_sve_ok } } */ > > -/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ > > +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ > > > > /* Case 1.1: Trailing constants with stepped sequence. */ > > > > @@ -17,10 +17,10 @@ vnx4si foo(int a, int b) > > foo: > > .LFB0: > > .cfi_startproc > > - ptrue p0.s, vl8 > > index z0.s, #1, #1 > > insr z0.s, w1 > > insr z0.s, w0 > > + ptrue p0.s, vl8 > > ret > > */ > > Let's drop the ptrues as well, since they only exist to feed the > st1ws and are rightly not part of the matched code. (Same for all > tests that have a ptrue at the end.) > > OK with that change, thanks. 
Thanks for the suggestions. I removed the dead ptrues and committed the patch as r272073. PS: I am on vacation next week and will start working on PR88833 after that. Thanks, Prathamesh > > Richard
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index b6c0d0a8eb6..f82728ed2d3 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -515,6 +515,7 @@ bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx); void aarch64_split_sve_subreg_move (rtx, rtx, rtx); void aarch64_expand_prologue (void); void aarch64_expand_vector_init (rtx, rtx); +void aarch64_sve_expand_vector_init (rtx, rtx); void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx, const_tree, unsigned); void aarch64_init_expanders (void); diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index b9cb1fae98c..a4e0014eb3d 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -863,7 +863,7 @@ "revb\t%0.h, %1/m, %2.h" ) -(define_insn "*aarch64_sve_rev<mode>" +(define_insn "@aarch64_sve_rev<mode>" [(set (match_operand:SVE_ALL 0 "register_operand" "=w") (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "w")] UNSPEC_REV))] @@ -3207,3 +3207,15 @@ DONE; } ) + +;; Standard pattern name vec_init<mode><Vel>. + +(define_expand "vec_init<mode><Vel>" + [(match_operand:SVE_ALL 0 "register_operand" "") + (match_operand 1 "" "")] + "TARGET_SVE" + { + aarch64_sve_expand_vector_init (operands[0], operands[1]); + DONE; + } +) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 83453d03095..8967e02524e 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -15244,6 +15244,261 @@ aarch64_expand_vector_init (rtx target, rtx vals) } } +/* Emit RTL corresponding to: + insr TARGET, ELEM. 
*/ + +static void +emit_insr (rtx target, rtx elem) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + elem = force_reg (elem_mode, elem); + + insn_code icode = optab_handler (vec_shl_insert_optab, mode); + gcc_assert (icode != CODE_FOR_nothing); + emit_insn (GEN_FCN (icode) (target, target, elem)); +} + +/* Subroutine of aarch64_sve_expand_vector_init for handling + trailing constants. + This function works as follows: + (a) Create a new vector consisting of trailing constants. + (b) Initialize TARGET with the constant vector using emit_move_insn. + (c) Insert remaining elements in TARGET using insr. + NELTS is the total number of elements in original vector while NELTS_REQD is the number of elements that are actually significant. + + ??? The heuristic used is to do above only if number of constants + is at least half the total number of elements. May need fine tuning. */ + +static bool +aarch64_sve_expand_vector_init_handle_trailing_constants + (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + int n_trailing_constants = 0; + + for (int i = nelts_reqd - 1; + i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i)); + i--) + n_trailing_constants++; + + if (n_trailing_constants >= nelts_reqd / 2) + { + rtx_vector_builder v (mode, 1, nelts); + for (int i = 0; i < nelts; i++) + v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants)); + rtx const_vec = v.build (); + emit_move_insn (target, const_vec); + + for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--) + emit_insr (target, builder.elt (i)); + + return true; + } + + return false; +} + +/* Subroutine of aarch64_sve_expand_vector_init. + Works as follows: + (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. + (b) Skip trailing elements from BUILDER, which are the same as + element NELTS_REQD - 1. + (c) Insert earlier elements in reverse order in TARGET using insr.
*/ + +static void +aarch64_sve_expand_vector_init_insert_elems (rtx target, + const rtx_vector_builder &builder, + int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + + struct expand_operand ops[2]; + enum insn_code icode = optab_handler (vec_duplicate_optab, mode); + gcc_assert (icode != CODE_FOR_nothing); + + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode); + expand_insn (icode, 2, ops); + + int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); + for (int i = nelts_reqd - ndups - 1; i >= 0; i--) + emit_insr (target, builder.elt (i)); +} + +/* Subroutine of aarch64_sve_expand_vector_init to handle case + when all trailing elements of builder are same. + This works as follows: + (a) Using expand_insn interface to broadcast last vector element in TARGET. + (b) Insert remaining elements in TARGET using insr. + + ??? The heuristic used is to do above if number of same trailing elements + is at least 3/4 of total number of elements, loosely based on + heuristic from mostly_zeros_p. May need fine-tuning. */ + +static bool +aarch64_sve_expand_vector_init_handle_trailing_same_elem + (rtx target, const rtx_vector_builder &builder, int nelts_reqd) +{ + int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); + if (ndups >= (3 * nelts_reqd) / 4) + { + aarch64_sve_expand_vector_init_insert_elems (target, builder, + nelts_reqd - ndups + 1); + return true; + } + + return false; +} + +/* Initialize register TARGET from BUILDER. NELTS is the constant number + of elements in BUILDER. + + The function tries to initialize TARGET from BUILDER if it fits one + of the special cases outlined below. + + Failing that, the function divides BUILDER into two sub-vectors: + v_even = even elements of BUILDER; + v_odd = odd elements of BUILDER; + + and recursively calls itself with v_even and v_odd. 
+ + if (recursive call succeeded for v_even or v_odd) + TARGET = zip (v_even, v_odd) + + The function returns true if it managed to build TARGET from BUILDER + with one of the special cases, false otherwise. + + Example: {a, 1, b, 2, c, 3, d, 4} + + The vector gets divided into: + v_even = {a, b, c, d} + v_odd = {1, 2, 3, 4} + + aarch64_sve_expand_vector_init(v_odd) hits case 1 and + initialize tmp2 from constant vector v_odd using emit_move_insn. + + aarch64_sve_expand_vector_init(v_even) fails since v_even contains + 4 elements, so we construct tmp1 from v_even using insr: + tmp1 = dup(d) + insr tmp1, c + insr tmp1, b + insr tmp1, a + + And finally: + TARGET = zip (tmp1, tmp2) + which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */ + +static bool +aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder, + int nelts, int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + + /* Case 1: Vector contains trailing constants. */ + + if (aarch64_sve_expand_vector_init_handle_trailing_constants + (target, builder, nelts, nelts_reqd)) + return true; + + /* Case 2: Vector contains leading constants. */ + + rtx_vector_builder rev_builder (mode, 1, nelts_reqd); + for (int i = 0; i < nelts_reqd; i++) + rev_builder.quick_push (builder.elt (nelts_reqd - i - 1)); + rev_builder.finalize (); + + if (aarch64_sve_expand_vector_init_handle_trailing_constants + (target, rev_builder, nelts, nelts_reqd)) + { + emit_insn (gen_aarch64_sve_rev (mode, target, target)); + return true; + } + + /* Case 3: Vector contains trailing same element. */ + + if (aarch64_sve_expand_vector_init_handle_trailing_same_elem + (target, builder, nelts_reqd)) + return true; + + /* Case 4: Vector contains leading same element. */ + + if (aarch64_sve_expand_vector_init_handle_trailing_same_elem + (target, rev_builder, nelts_reqd) && nelts_reqd == nelts) + { + emit_insn (gen_aarch64_sve_rev (mode, target, target)); + return true; + } + + /* Avoid recursing below 4-elements. + ??? 
The threshold 4 may need fine-tuning. */ + + if (nelts_reqd <= 4) + return false; + + rtx_vector_builder v_even (mode, 1, nelts); + rtx_vector_builder v_odd (mode, 1, nelts); + + for (int i = 0; i < nelts * 2; i += 2) + { + v_even.quick_push (builder.elt (i)); + v_odd.quick_push (builder.elt (i + 1)); + } + + v_even.finalize (); + v_odd.finalize (); + + rtx tmp1 = gen_reg_rtx (mode); + bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even, + nelts, nelts_reqd / 2); + + rtx tmp2 = gen_reg_rtx (mode); + bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd, + nelts, nelts_reqd / 2); + + if (!did_even_p && !did_odd_p) + return false; + + /* Initialize v_even and v_odd using INSR if it didn't match any of the + special cases and zip v_even, v_odd. */ + + if (!did_even_p) + aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2); + + if (!did_odd_p) + aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2); + + rtvec v = gen_rtvec (2, tmp1, tmp2); + emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); + return true; +} + +/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */ + +void +aarch64_sve_expand_vector_init (rtx target, rtx vals) +{ + machine_mode mode = GET_MODE (target); + int nelts = XVECLEN (vals, 0); + + rtx_vector_builder v (mode, 1, nelts); + for (int i = 0; i < nelts; i++) + v.quick_push (XVECEXP (vals, 0, i)); + v.finalize (); + + /* If neither sub-vectors of v could be initialized specially, + then use INSR to insert all elements from v into TARGET. + ??? This might not be optimal for vectors with large + initializers like 16-element or above. + For nelts < 4, it probably isn't useful to handle specially. 
*/ + + if (nelts < 4 + || !aarch64_sve_expand_vector_init (target, v, nelts, nelts)) + aarch64_sve_expand_vector_init_insert_elems (target, v, nelts); +} + static unsigned HOST_WIDE_INT aarch64_shift_truncation_mask (machine_mode mode) { diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c new file mode 100644 index 00000000000..c51876947fb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c @@ -0,0 +1,27 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 1.1: Trailing constants with stepped sequence. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, 1, 2, 3, 4, 5, 6 }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + index z0.s, #1, #1 + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tindex\t(z[0-9]+\.s), #1, #1\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c new file mode 100644 index 00000000000..7bca3f0ecc9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c @@ -0,0 +1,29 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.4: Interleaved repeating elements and non-repeating elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int f) +{ + return (vnx4si) { a, f, b, f, c, f, c, f }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w2 + mov z1.s, w3 + insr z0.s, w1 + ptrue p0.s, vl8 + insr z0.s, w0 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c new file mode 100644 index 00000000000..d9640e42ddd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c @@ -0,0 +1,21 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_10.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int f = 13; + + vnx4si v = foo (a, b, c, f); + int expected[] = { a, f, b, f, c, f, c, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c new file mode 100644 index 00000000000..b90895df436 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c @@ -0,0 +1,27 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.5: Interleaved repeating elements and trailing same elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +vnx4si foo(int a, int b, int f) +{ + return (vnx4si) { a, f, b, f, b, f, b, f }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w1 + mov z1.s, w2 + insr z0.s, w0 + ptrue p0.s, vl8 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w1\n\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\1, w0\n.*\tzip1\t\1, \1, \2} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c new file mode 100644 index 00000000000..8a99da45433 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_11.c" + +int main() +{ + int a = 10; + int b = 11; + int f = 12; + + vnx4si v = foo (a, b, f); + int expected[] = { a, f, b, f, b, f, b, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c new file mode 100644 index 00000000000..b36967d6d59 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c @@ -0,0 +1,30 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.5: Interleaved repeating elements and trailing same elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int f) +{ + return (vnx4si) { b, f, b, f, b, f, a, f }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w0 + mov z1.s, w2 + insr z0.s, w1 + ptrue p0.s, vl8 + insr z0.s, w1 + insr z0.s, w1 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tmov\t(z[0-9]+\.s), w0\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tzip1\t\2, \2, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c new file mode 100644 index 00000000000..b77464c6b3c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_12.c" + +int main() +{ + int a = 10; + int b = 11; + int f = 12; + + vnx4si v = foo (a, b, f); + int expected[] = { b, f, b, f, b, f, a, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c new file mode 100644 index 00000000000..c0cc5235da4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_1.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, 1, 2, 3, 4, 5, 6 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_2.c b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c new file mode 100644 index 00000000000..1ab7c4300e6 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c @@ -0,0 +1,29 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 1.2: Trailing constants with repeating sequence. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, 2, 3, 2, 3, 2, 3 }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + adrp x2, .LANCHOR0 + add x2, x2, :lo12:.LANCHOR0 + ld1w z0.s, p0/z, [x2] + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c new file mode 100644 index 00000000000..0f3705d145b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_2.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, 2, 3, 2, 3, 2, 3 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_3.c b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c new file mode 100644 index 00000000000..ccf3fa85292 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c @@ -0,0 +1,28 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 2.1: Leading constants with stepped sequence. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { 1, 2, 3, 4, 5, 6, a, b }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + index z0.s, #6, #-1 + insr z0.s, w0 + insr z0.s, w1 + rev z0.s, z0.s + ret +*/ + +/* { dg-final { scan-assembler {\tindex\t(z[0-9]+\.s), #6, #-1\n\tinsr\t\1, w0\n\tinsr\t\1, w1\n\trev\t\1, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c new file mode 100644 index 00000000000..5df711dfc79 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_3.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { 1, 2, 3, 4, 5, 6, a, b }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_4.c b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c new file mode 100644 index 00000000000..b817dc5d9f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c @@ -0,0 +1,30 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 2.2: Leading constants with repeating sequence. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { 3, 2, 3, 2, 3, 2, b, a }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + adrp x2, .LANCHOR0 + add x2, x2, :lo12:.LANCHOR0 + ld1w z0.s, p0/z, [x2] + insr z0.s, w1 + insr z0.s, w0 + rev z0.s, z0.s + ret +*/ + +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c new file mode 100644 index 00000000000..563353fe673 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_4.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { 3, 2, 3, 2, 3, 2, b, a }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c new file mode 100644 index 00000000000..d662dfba8b5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c @@ -0,0 +1,27 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 3: Trailing same element. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c) +{ + return (vnx4si) { a, b, c, c, c, c, c, c }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w2 + ptrue p0.s, vl8 + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c new file mode 100644 index 00000000000..ae444a17688 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_5.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + + vnx4si v = foo (a, b, c); + int expected[] = { a, b, c, c, c, c, c, c }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c new file mode 100644 index 00000000000..fd0e21dcb85 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c @@ -0,0 +1,28 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 4: Leading same element. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c) +{ + return (vnx4si) { c, c, c, c, c, c, b, a }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w2 + ptrue p0.s, vl8 + insr z0.s, w1 + insr z0.s, w0 + rev z0.s, z0.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c new file mode 100644 index 00000000000..d919f0ce0ba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_6.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + + vnx4si v = foo (a, b, c); + int expected[] = { c, c, c, c, c, c, b, a }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c new file mode 100644 index 00000000000..5f3d82242d7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c @@ -0,0 +1,32 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.1: All elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int d, int e, int f, int g, int h) +{ + return (vnx4si) { a, b, c, d, e, f, g, h }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w7 + ptrue p0.s, vl8 + insr z0.s, w6 + insr z0.s, w5 + insr z0.s, w4 + insr z0.s, w3 + insr z0.s, w2 + insr z0.s, w1 + insr z0.s, w0 + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w7\n.*\tinsr\t\1, w6\n\tinsr\t\1, w5\n\tinsr\t\1, w4\n\tinsr\t\1, w3\n\tinsr\t\1, w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c new file mode 100644 index 00000000000..c9f040c6d4d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c @@ -0,0 +1,25 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_7.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int d = 13; + int e = 14; + int f = 15; + int g = 16; + int h = 17; + + vnx4si v = foo (a, b, c, d, e, f, g, h); + int expected[] = { a, b, c, d, e, f, g, h }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c new file mode 100644 index 00000000000..9a1869a2765 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c @@ -0,0 +1,32 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.2: Interleaved elements and constants. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int d) +{ + return (vnx4si) { a, 1, b, 2, c, 3, d, 4 }; +} + +/* +foo: +.LFB0: + .cfi_startproc + ptrue p0.s, vl8 + mov z0.s, w3 + adrp x3, .LANCHOR0 + insr z0.s, w2 + add x3, x3, :lo12:.LANCHOR0 + insr z0.s, w1 + ld1w z1.s, p0/z, [x3] + insr z0.s, w0 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c new file mode 100644 index 00000000000..14a8ad44145 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c @@ -0,0 +1,21 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_8.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int d = 13; + + vnx4si v = foo (a, b, c, d); + int expected[] = { a, 1, b, 2, c, 3, d, 4 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c new file mode 100644 index 00000000000..0ecbce848ef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c @@ -0,0 +1,27 @@ +/* { dg-do compile { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */ + +/* Case 5.3: Repeated elements. 
*/ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, a, b, a, b, a, b }; +} + +/* +foo: +.LFB0: + .cfi_startproc + mov z0.s, w0 + mov z1.s, w1 + ptrue p0.s, vl8 + zip1 z0.s, z0.s, z1.s + ret +*/ + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w0\n\tmov\t(z[0-9]+\.s), w1\n.*\tzip1\t\1, \1, \2} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c new file mode 100644 index 00000000000..6c67025c585 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include "init_9.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, a, b, a, b, a, b }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/vector-builder.h b/gcc/vector-builder.h index 9967daa6e4c..9f95b01bc3b 100644 --- a/gcc/vector-builder.h +++ b/gcc/vector-builder.h @@ -96,6 +96,7 @@ public: unsigned int encoded_nelts () const; bool encoded_full_vector_p () const; T elt (unsigned int) const; + unsigned int count_dups (int, int, int) const; bool operator == (const Derived &) const; bool operator != (const Derived &x) const { return !operator == (x); } @@ -223,6 +224,23 @@ vector_builder<T, Derived>::elt (unsigned int i) const derived ()->step (prev, final)); } +/* Return the number of leading duplicate elements in the range + [START:END:STEP]. The value is always at least 1. 
*/ + +template<typename T, typename Derived> +unsigned int +vector_builder<T, Derived>::count_dups (int start, int end, int step) const +{ + gcc_assert ((end - start) % step == 0); + + unsigned int ndups = 1; + for (int i = start + step; + i != end && derived ()->equal_p (elt (i), elt (start)); + i += step) + ndups++; + return ndups; +} + /* Change the encoding to NPATTERNS patterns of NELTS_PER_PATTERN each, but without changing the underlying vector. */