Message ID: 1453303331-14492-1-git-send-email-james.greenhalgh@arm.com
State: New
On Wed, Jan 20, 2016 at 03:22:11PM +0000, James Greenhalgh wrote:
>
> Hi,
>
> In a number of cases where we try to create vectors we end up spilling to
> the stack and then filling.  This is one example distilled from a couple of
> micro-benchmarks where the issue shows up.  The reason for the extra cost
> in this case is the unnecessary use of the stack.  The patch attempts to
> finesse this by using lane loads or vector inserts to produce the right
> results.
>
> This patch is mostly Ramana's work, I've just cleaned it up a little.
>
> This has been in a number of our trees lately, and we haven't seen any
> regressions.  I've also bootstrapped and tested it, and run a set of
> benchmarks to show no regressions on Cortex-A57 or Cortex-A53.
>
> The patch fixes some regressions caused by the more aggressive
> vectorization in GCC 6, so I'd like to propose it to go in even though
> we are in Stage 4.
>
> OK?

*Ping*

I just ran into this while investigating another performance regression.
It would be nice to get it fixed.

Thanks,
James

> Thanks,
> James
>
> ---
> gcc/
>
> 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
>             Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
>
>         * config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
>         always use lane loads to construct non-constant vectors.
>
> gcc/testsuite/
>
> 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
>             Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
>
>         * gcc.target/aarch64/vector_initialization_nostack.c: New.
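To make the problem concrete, here is a minimal sketch (not taken from the patch or the benchmarks mentioned above; the function name is made up for illustration) of the kind of initialisation affected: a vector whose lanes come from scalar values that are not compile-time constants.  Before this change such a vector could be built by storing the scalars to a stack slot and reloading the whole vector; the patch expands it with lane inserts instead.

#include <arm_neon.h>

/* Illustrative only: four variable lanes, no constant lanes.  An
   initialisation like this reaches aarch64_expand_vector_init via the
   vec_init expander.  */
float32x4_t
make_vec (float a, float b, float c, float d)
{
  return (float32x4_t) { a, b, c, d };
}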
On Tue, Feb 02, 2016 at 10:29:29AM +0000, James Greenhalgh wrote:
> On Wed, Jan 20, 2016 at 03:22:11PM +0000, James Greenhalgh wrote:
> >
> > The patch fixes some regressions caused by the more aggressive
> > vectorization in GCC 6, so I'd like to propose it to go in even though
> > we are in Stage 4.
> >
> > OK?
>
> *Ping*

*ping^2*

Cheers,
James
On Mon, Feb 08, 2016 at 10:56:29AM +0000, James Greenhalgh wrote:
> On Tue, Feb 02, 2016 at 10:29:29AM +0000, James Greenhalgh wrote:
> > On Wed, Jan 20, 2016 at 03:22:11PM +0000, James Greenhalgh wrote:
> > >
> > > OK?
> >
> > *Ping*
>
> *ping^2*

*ping^3*

Thanks,
James
On 20 January 2016 at 15:22, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> gcc/
>
> 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
>             Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
>
>         * config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
>         always use lane loads to construct non-constant vectors.
>
> gcc/testsuite/
>
> 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
>             Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
>
>         * gcc.target/aarch64/vector_initialization_nostack.c: New.

OK

/Marcus
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 03bc1b9..3787b38 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -10985,28 +10985,37 @@ aarch64_simd_make_constant (rtx vals)
   return NULL_RTX;
 }
 
+/* Expand a vector initialisation sequence, such that TARGET is
+   initialised to contain VALS.  */
+
 void
 aarch64_expand_vector_init (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
+  /* The number of vector elements.  */
   int n_elts = GET_MODE_NUNITS (mode);
+  /* The number of vector elements which are not constant.  */
   int n_var = 0;
   rtx any_const = NULL_RTX;
+  /* The first element of vals.  */
+  rtx v0 = XVECEXP (vals, 0, 0);
   bool all_same = true;
 
+  /* Count the number of variable elements to initialise.  */
   for (int i = 0; i < n_elts; ++i)
     {
       rtx x = XVECEXP (vals, 0, i);
-      if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
+      if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
        ++n_var;
       else
        any_const = x;
 
-      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
-       all_same = false;
+      all_same &= rtx_equal_p (x, v0);
     }
 
+  /* No variable elements, hand off to aarch64_simd_make_constant which knows
+     how best to handle this.  */
   if (n_var == 0)
     {
       rtx constant = aarch64_simd_make_constant (vals);
@@ -11020,14 +11029,15 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   /* Splat a single non-constant element if we can.  */
   if (all_same)
     {
-      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+      rtx x = copy_to_mode_reg (inner_mode, v0);
       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
       return;
     }
 
-  /* Half the fields (or less) are non-constant.  Load constant then overwrite
-     varying fields.  Hope that this is more efficient than using the stack.  */
-  if (n_var <= n_elts/2)
+  /* Initialise a vector which is part-variable.  We want to first try
+     to build those lanes which are constant in the most efficient way we
+     can.  */
+  if (n_var != n_elts)
     {
       rtx copy = copy_rtx (vals);
 
@@ -11054,31 +11064,21 @@
          XVECEXP (copy, 0, i) = subst;
        }
       aarch64_expand_vector_init (target, copy);
+    }
 
-      /* Insert variables.  */
-      enum insn_code icode = optab_handler (vec_set_optab, mode);
-      gcc_assert (icode != CODE_FOR_nothing);
+  /* Insert the variable lanes directly.  */
 
-      for (int i = 0; i < n_elts; i++)
-       {
-         rtx x = XVECEXP (vals, 0, i);
-         if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
-           continue;
-         x = copy_to_mode_reg (inner_mode, x);
-         emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
-       }
-      return;
-    }
+  enum insn_code icode = optab_handler (vec_set_optab, mode);
+  gcc_assert (icode != CODE_FOR_nothing);
 
-  /* Construct the vector in memory one field at a time
-     and load the whole vector.  */
-  rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
   for (int i = 0; i < n_elts; i++)
-    emit_move_insn (adjust_address_nv (mem, inner_mode,
-                                      i * GET_MODE_SIZE (inner_mode)),
-                   XVECEXP (vals, 0, i));
-  emit_move_insn (target, mem);
-
+    {
+      rtx x = XVECEXP (vals, 0, i);
+      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
+       continue;
+      x = copy_to_mode_reg (inner_mode, x);
+      emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+    }
 }
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
new file mode 100644
index 0000000..bbad04d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */
+float arr_f[100][100];
+float
+f9 (void)
+{
+
+  int i;
+  float sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_f[i][0] * arr_f[0][i];
+  return sum;
+
+}
+
+
+int arr[100][100];
+int
+f10 (void)
+{
+
+  int i;
+  int sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr[i][0] * arr[0][i];
+  return sum;
+
+}
+
+double arr_d[100][100];
+double
+f11 (void)
+{
+  int i;
+  double sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_d[i][0] * arr_d[0][i];
+  return sum;
+}
+
+char arr_c[100][100];
+char
+f12 (void)
+{
+  int i;
+  char sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_c[i][0] * arr_c[0][i];
+  return sum;
+}
+
+
+/* { dg-final { scan-assembler-not "sp" } } */
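For readers who want the shape of code the rewritten expansion aims at, here is a rough intrinsics sketch (my own example, not part of the patch; the function name and lane layout are assumptions) of initialising a partly-constant vector such as { x, 0.0f, y, 0.0f }: the constant lanes are built first, then each variable lane is inserted directly, with no stack temporary involved.

#include <arm_neon.h>

/* Sketch only: a V4SF vector with two constant and two variable lanes.
   The dup materialises the constant lanes; the two lane sets correspond
   to the vec_set insertions in the rewritten aarch64_expand_vector_init.  */
float32x4_t
build_part_constant (float x, float y)
{
  float32x4_t v = vdupq_n_f32 (0.0f);  /* constant lanes */
  v = vsetq_lane_f32 (x, v, 0);        /* insert variable lane 0 */
  v = vsetq_lane_f32 (y, v, 2);        /* insert variable lane 2 */
  return v;
}

The new test case checks the same property indirectly: with no stack temporary in the expansion, the assembly generated for f9, f10, f11 and f12 should not reference the stack pointer at all, which is what the scan-assembler-not "sp" directive asserts.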