[AArch64] GCC 6 regression in vector performance. - Fix vector initialization to happen with lane load instructions.

Message ID 1453303331-14492-1-git-send-email-james.greenhalgh@arm.com
State New

Commit Message

James Greenhalgh Jan. 20, 2016, 3:22 p.m. UTC
Hi,

In a number of cases where we try to create vectors we end up spilling to the
stack and then filling. This is one example distilled from a couple of
micro-benchmarks where the issue shows up. The reason for the extra cost
in this case is the unnecessary use of the stack. The patch attempts to
finesse this by using lane loads or vector inserts to produce the right
results.
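
For illustration, here is a minimal sketch of the kind of construction that
hits this path (the v4sf type and make_vec function are invented for this
example and use the GNU vector extensions; this is not one of the
micro-benchmarks). Under the old heuristic, an initialiser in which more
than half of the lanes are variable was expanded through a stack temporary;
with this patch the constant lanes are built first and the variable lanes
are then inserted directly through the vec_set pattern:

  /* Illustrative only: three variable lanes and one constant lane.  */
  typedef float v4sf __attribute__ ((vector_size (16)));

  v4sf
  make_vec (float a, float b, float c)
  {
    return (v4sf) { a, b, c, 1.0f };
  }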

This patch is mostly Ramana's work, I've just cleaned it up a little.

This has been in a number of our trees lately, and we haven't seen any
regressions. I've also bootstrapped and tested it, and run a set of
benchmarks to show no regressions on Cortex-A57 or Cortex-A53.

The patch fixes some regressions caused by the more aggressive vectorization
in GCC 6, so I'd like to propose it to go in even though we are in Stage 4.

OK?

Thanks,
James

---
gcc/

2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>

	* config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
	always use lane loads to construct non-constant vectors.

gcc/testsuite/

2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>

	* gcc.target/aarch64/vector_initialization_nostack.c: New.

Comments

James Greenhalgh Feb. 2, 2016, 10:29 a.m. UTC | #1
On Wed, Jan 20, 2016 at 03:22:11PM +0000, James Greenhalgh wrote:
> 
> Hi,
> 
> In a number of cases where we try to create vectors we end up spilling to the
> stack and then filling. This is one example distilled from a couple of
> micro-benchmarks where the issue shows up. The reason for the extra cost
> in this case is the unnecessary use of the stack. The patch attempts to
> finesse this by using lane loads or vector inserts to produce the right
> results.
> 
> This patch is mostly Ramana's work, I've just cleaned it up a little.
> 
> This has been in a number of our trees lately, and we haven't seen any
> regressions. I've also bootstrapped and tested it, and run a set of
> benchmarks to show no regressions on Cortex-A57 or Cortex-A53.
> 
> The patch fixes some regressions caused by the more aggressive vectorization
> in GCC 6, so I'd like to propose it to go in even though we are in Stage 4.
> 
> OK?

*Ping*

I just ran into this while investigating another performance regression. It
would be nice to get it fixed.

Thanks,
James


> 
> Thanks,
> James
> 
> ---
> gcc/
> 
> 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
> 	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
> 
> 	* config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
> 	always use lane loads to construct non-constant vectors.
> 
> gcc/testsuite/
> 
> 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
> 	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
> 
> 	* gcc.target/aarch64/vector_initialization_nostack.c: New.
> 
> [...]
James Greenhalgh Feb. 8, 2016, 10:56 a.m. UTC | #2
On Tue, Feb 02, 2016 at 10:29:29AM +0000, James Greenhalgh wrote:
> On Wed, Jan 20, 2016 at 03:22:11PM +0000, James Greenhalgh wrote:
> > 
> > Hi,
> > 
> > In a number of cases where we try to create vectors we end up spilling to the
> > stack and then filling. This is one example distilled from a couple of
> > micro-benchmarks where the issue shows up. The reason for the extra cost
> > in this case is the unnecessary use of the stack. The patch attempts to
> > finesse this by using lane loads or vector inserts to produce the right
> > results.
> > 
> > This patch is mostly Ramana's work, I've just cleaned it up a little.
> > 
> > This has been in a number of our trees lately, and we haven't seen any
> > regressions. I've also bootstrapped and tested it, and run a set of
> > benchmarks to show no regressions on Cortex-A57 or Cortex-A53.
> > 
> > The patch fixes some regressions caused by the more aggressive vectorization
> > in GCC 6, so I'd like to propose it to go in even though we are in Stage 4.
> > 
> > OK?
> 
> *Ping*

*ping^2*

Cheers,
James

> > 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
> > 	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
> > 
> > 	* config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
> > 	always use lane loads to construct non-constant vectors.
> > 
> > gcc/testsuite/
> > 
> > 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
> > 	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
> > 
> > 	* gcc.target/aarch64/vector_initialization_nostack.c: New.
> > 
> 
> > [...]
James Greenhalgh Feb. 15, 2016, 10:49 a.m. UTC | #3
On Mon, Feb 08, 2016 at 10:56:29AM +0000, James Greenhalgh wrote:
> On Tue, Feb 02, 2016 at 10:29:29AM +0000, James Greenhalgh wrote:
> > On Wed, Jan 20, 2016 at 03:22:11PM +0000, James Greenhalgh wrote:
> > > 
> > > Hi,
> > > 
> > > In a number of cases where we try to create vectors we end up spilling to the
> > > stack and then filling. This is one example distilled from a couple of
> > > micro-benchmarks where the issue shows up. The reason for the extra cost
> > > in this case is the unnecessary use of the stack. The patch attempts to
> > > finesse this by using lane loads or vector inserts to produce the right
> > > results.
> > > 
> > > This patch is mostly Ramana's work, I've just cleaned it up a little.
> > > 
> > > This has been in a number of our trees lately, and we haven't seen any
> > > regressions. I've also bootstrapped and tested it, and run a set of
> > > benchmarks to show no regressions on Cortex-A57 or Cortex-A53.
> > > 
> > > The patch fixes some regressions caused by the more aggressive vectorization
> > > in GCC 6, so I'd like to propose it to go in even though we are in Stage 4.
> > > 
> > > OK?
> > 
> > *Ping*
> 
> *ping^2*

*ping^3*

Thanks,
James

> > > 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
> > > 	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
> > > 
> > > 	* config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
> > > 	always use lane loads to construct non-constant vectors.
> > > 
> > > gcc/testsuite/
> > > 
> > > 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
> > > 	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
> > > 
> > > 	* gcc.target/aarch64/vector_initialization_nostack.c: New.
> > > 
> > 
> > > [...]
Marcus Shawcroft Feb. 16, 2016, 8:42 a.m. UTC | #4
On 20 January 2016 at 15:22, James Greenhalgh <james.greenhalgh@arm.com> wrote:

> gcc/
>
> 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
>             Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
>
>         * config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
>         always use lane loads to construct non-constant vectors.
>
> gcc/testsuite/
>
> 2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
>             Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
>
>         * gcc.target/aarch64/vector_initialization_nostack.c: New.
>

OK /Marcus

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 03bc1b9..3787b38 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -10985,28 +10985,37 @@  aarch64_simd_make_constant (rtx vals)
     return NULL_RTX;
 }
 
+/* Expand a vector initialisation sequence, such that TARGET is
+   initialised to contain VALS.  */
+
 void
 aarch64_expand_vector_init (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
+  /* The number of vector elements.  */
   int n_elts = GET_MODE_NUNITS (mode);
+  /* The number of vector elements which are not constant.  */
   int n_var = 0;
   rtx any_const = NULL_RTX;
+  /* The first element of vals.  */
+  rtx v0 = XVECEXP (vals, 0, 0);
   bool all_same = true;
 
+  /* Count the number of variable elements to initialise.  */
   for (int i = 0; i < n_elts; ++i)
     {
       rtx x = XVECEXP (vals, 0, i);
-      if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
+      if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
 	++n_var;
       else
 	any_const = x;
 
-      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
-	all_same = false;
+      all_same &= rtx_equal_p (x, v0);
     }
 
+  /* No variable elements, hand off to aarch64_simd_make_constant which knows
+     how best to handle this.  */
   if (n_var == 0)
     {
       rtx constant = aarch64_simd_make_constant (vals);
@@ -11020,14 +11029,15 @@  aarch64_expand_vector_init (rtx target, rtx vals)
   /* Splat a single non-constant element if we can.  */
   if (all_same)
     {
-      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+      rtx x = copy_to_mode_reg (inner_mode, v0);
       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
       return;
     }
 
-  /* Half the fields (or less) are non-constant.  Load constant then overwrite
-     varying fields.  Hope that this is more efficient than using the stack.  */
-  if (n_var <= n_elts/2)
+  /* Initialise a vector which is part-variable.  We want to first try
+     to build those lanes which are constant in the most efficient way we
+     can.  */
+  if (n_var != n_elts)
     {
       rtx copy = copy_rtx (vals);
 
@@ -11054,31 +11064,21 @@  aarch64_expand_vector_init (rtx target, rtx vals)
 	  XVECEXP (copy, 0, i) = subst;
 	}
       aarch64_expand_vector_init (target, copy);
+    }
 
-      /* Insert variables.  */
-      enum insn_code icode = optab_handler (vec_set_optab, mode);
-      gcc_assert (icode != CODE_FOR_nothing);
+  /* Insert the variable lanes directly.  */
 
-      for (int i = 0; i < n_elts; i++)
-	{
-	  rtx x = XVECEXP (vals, 0, i);
-	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
-	    continue;
-	  x = copy_to_mode_reg (inner_mode, x);
-	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
-	}
-      return;
-    }
+  enum insn_code icode = optab_handler (vec_set_optab, mode);
+  gcc_assert (icode != CODE_FOR_nothing);
 
-  /* Construct the vector in memory one field at a time
-     and load the whole vector.  */
-  rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
   for (int i = 0; i < n_elts; i++)
-    emit_move_insn (adjust_address_nv (mem, inner_mode,
-				    i * GET_MODE_SIZE (inner_mode)),
-		    XVECEXP (vals, 0, i));
-  emit_move_insn (target, mem);
-
+    {
+      rtx x = XVECEXP (vals, 0, i);
+      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
+	continue;
+      x = copy_to_mode_reg (inner_mode, x);
+      emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+    }
 }
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
new file mode 100644
index 0000000..bbad04d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
@@ -0,0 +1,53 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */
+float arr_f[100][100];
+float
+f9 (void)
+{
+
+  int i;
+  float sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_f[i][0] * arr_f[0][i];
+  return sum;
+
+}
+
+
+int arr[100][100];
+int
+f10 (void)
+{
+
+  int i;
+  int sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr[i][0] * arr[0][i];
+  return sum;
+
+}
+
+double arr_d[100][100];
+double
+f11 (void)
+{
+  int i;
+  double sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_d[i][0] * arr_d[0][i];
+  return sum;
+}
+
+char arr_c[100][100];
+char
+f12 (void)
+{
+  int i;
+  char sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_c[i][0] * arr_c[0][i];
+  return sum;
+}
+
+
+/* { dg-final { scan-assembler-not "sp" } } */