
[RFC,PR88838,SVE] Use 32-bit WHILELO in LP64 mode

Message ID CAELXzTO4bT6YmbamU5=X36yzPqq0WLCBJtseRHwZyoMEZw1rFw@mail.gmail.com
State New
Series [RFC,PR88838,SVE] Use 32-bit WHILELO in LP64 mode

Commit Message

Kugan Vivekanandarajah May 22, 2019, 2:04 a.m. UTC
Hi,

The attached RFC patch attempts to use 32-bit WHILELO in LP64 mode to fix
the PR.  Bootstrap and regression testing are ongoing.  In earlier testing,
I ran into an issue related to fwprop; I will tackle that based on the
feedback for this patch.

Thanks,
Kugan

Comments

Richard Sandiford May 25, 2019, 9:41 a.m. UTC | #1
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c

> index 77d3dac..d6452a1 100644

> --- a/gcc/tree-vect-loop-manip.c

> +++ b/gcc/tree-vect-loop-manip.c

> @@ -418,7 +418,20 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

>    tree mask_type = rgm->mask_type;

>    unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;

>    poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);

> -

> +  bool convert = false;

> +  tree iv_type = NULL_TREE;

> +

> +  /* If the compare_type is not with Pmode size, we will create an IV with

> +     Pmode size with truncated use (i.e. converted to the correct type).

> +     This is because using Pmode allows ivopts to reuse the IV for indices

> +     (in the loads and store).  */

> +  if (known_lt (GET_MODE_BITSIZE (TYPE_MODE (compare_type)),

> +		GET_MODE_BITSIZE (Pmode)))

> +    {

> +      iv_type = build_nonstandard_integer_type (GET_MODE_BITSIZE (Pmode),

> +						true);

> +      convert = true;

> +    }

>    /* Calculate the maximum number of scalar values that the rgroup

>       handles in total, the number that it handles for each iteration

>       of the vector loop, and the number that it should skip during the

> @@ -444,12 +457,43 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

>       processed.  */

>    tree index_before_incr, index_after_incr;

>    gimple_stmt_iterator incr_gsi;

> +  gimple_stmt_iterator incr_gsi2;

>    bool insert_after;

> -  tree zero_index = build_int_cst (compare_type, 0);

> +  tree zero_index;

>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);

> -  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,

> -	     insert_after, &index_before_incr, &index_after_incr);

>  

> +  if (convert)

> +    {

> +      /* If we are creating IV of Pmode type and converting.  */

> +      zero_index = build_int_cst (iv_type, 0);

> +      tree step = build_int_cst (iv_type,

> +				 LOOP_VINFO_VECT_FACTOR (loop_vinfo));

> +      /* Creating IV of Pmode type.  */

> +      create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,

> +		 insert_after, &index_before_incr, &index_after_incr);

> +      /* Create truncated index_before and after increament.  */

> +      tree index_before_incr_trunc = make_ssa_name (compare_type);

> +      tree index_after_incr_trunc = make_ssa_name (compare_type);

> +      gimple *incr_before_stmt = gimple_build_assign (index_before_incr_trunc,

> +						      NOP_EXPR,

> +						      index_before_incr);

> +      gimple *incr_after_stmt = gimple_build_assign (index_after_incr_trunc,

> +						     NOP_EXPR,

> +						     index_after_incr);

> +      incr_gsi2 = incr_gsi;

> +      gsi_insert_before (&incr_gsi2, incr_before_stmt, GSI_NEW_STMT);

> +      gsi_insert_after (&incr_gsi, incr_after_stmt, GSI_NEW_STMT);

> +      index_before_incr = index_before_incr_trunc;

> +      index_after_incr = index_after_incr_trunc;

> +      zero_index = build_int_cst (compare_type, 0);

> +    }

> +  else

> +    {

> +      /* If the IV is of Pmode compare_type, no convertion needed.  */

> +      zero_index = build_int_cst (compare_type, 0);

> +      create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,

> +		 insert_after, &index_before_incr, &index_after_incr);

> +    }

>    tree test_index, test_limit, first_limit;

>    gimple_stmt_iterator *test_gsi;

>    if (might_wrap_p)


Instead of hard-coding Pmode as a special case here, I think we should
record the IV type in vect_verify_full_masking in addition to the comparison
type.  (With the IV type always being at least as wide as the comparison
type.)
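
For instance (untested sketch; LOOP_VINFO_MASK_IV_TYPE is just a placeholder
name for the new field), vect_verify_full_masking could record something like:

  tree iv_type = cmp_type;
  if (known_lt (GET_MODE_BITSIZE (TYPE_MODE (cmp_type)),
                GET_MODE_BITSIZE (Pmode)))
    iv_type = build_nonstandard_integer_type (GET_MODE_BITSIZE (Pmode), true);
  LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;

and vect_set_loop_masks_directly would then read the recorded IV type back
instead of checking Pmode itself.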

> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

> index bd81193..2769c86 100644

> --- a/gcc/tree-vect-loop.c

> +++ b/gcc/tree-vect-loop.c

> @@ -1035,6 +1035,30 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

>    /* Find a scalar mode for which WHILE_ULT is supported.  */

>    opt_scalar_int_mode cmp_mode_iter;

>    tree cmp_type = NULL_TREE;

> +  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));

> +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

> +  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

> +  widest_int iv_limit;

> +  bool known_max_iters = max_loop_iterations (loop, &iv_limit);

> +  if (known_max_iters)

> +    {

> +      if (niters_skip)

> +	{

> +	  /* Add the maximum number of skipped iterations to the

> +	     maximum iteration count.  */

> +	  if (TREE_CODE (niters_skip) == INTEGER_CST)

> +	    iv_limit += wi::to_widest (niters_skip);

> +	  else

> +	    iv_limit += max_vf - 1;

> +	}

> +      /* IV_LIMIT is the maximum number of latch iterations, which is also

> +	 the maximum in-range IV value.  Round this value down to the previous

> +	 vector alignment boundary and then add an extra full iteration.  */

> +      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

> +      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;

> +    }

> +

> +  /* Get the vectorization factor in tree form.  */


Please split the loop-manip.c bits you need out into a subroutine instead
of cut-&-pasting. :-)
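
I.e. something along these lines (the name is only a placeholder; the body is
just the iv_limit computation quoted above, shared by both callers):

  widest_int
  vect_masked_loop_iv_limit (loop_vec_info loop_vinfo)
  {
    tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
    unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
    widest_int iv_limit;
    if (!max_loop_iterations (LOOP_VINFO_LOOP (loop_vinfo), &iv_limit))
      return -1;
    if (niters_skip)
      {
        /* Add the maximum number of skipped iterations to the
           maximum iteration count.  */
        if (TREE_CODE (niters_skip) == INTEGER_CST)
          iv_limit += wi::to_widest (niters_skip);
        else
          iv_limit += max_vf - 1;
      }
    /* Round down to the previous vector alignment boundary and then
       add an extra full iteration.  */
    poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    return (iv_limit & -(int) known_alignment (vf)) + max_vf;
  }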

>    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)

>      {

>        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());

> @@ -1045,12 +1069,23 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

>  	  if (this_type

>  	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))

>  	    {

> +	      /* See whether zero-based IV would ever generate all-false masks

> +		 before wrapping around.  */

> +	      bool might_wrap_p

> +		= (!known_max_iters

> +		   || (wi::min_precision

> +		       (iv_limit

> +			* vect_get_max_nscalars_per_iter (loop_vinfo),

> +			UNSIGNED) > cmp_bits));


The wi::min_precision is invariant, so there's no need to calculate it in
each iteration of the loop.  Would be good to avoid the duplicated call to
vect_get_max_nscalars_per_iter (also called for min_ni_width).
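
I.e. compute it once before the FOR_EACH_MODE_IN_CLASS loop, something like
(sketch only; max_nscalars_per_iter would also replace the existing call used
for min_ni_width):

  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;
  if (known_max_iters)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
                                      UNSIGNED);

so that the per-mode test reduces to:

  bool might_wrap_p = (iv_precision > cmp_bits);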

>  	      /* Although we could stop as soon as we find a valid mode,

>  		 it's often better to continue until we hit Pmode, since the

>  		 operands to the WHILE are more likely to be reusable in

> -		 address calculations.  */

> +		 address calculations.  Unless the limit is extended from

> +		 this_type.  */


Please rewrite the comment to describe the new code rather than tacking
this kind of thing on the end.

>  	      cmp_type = this_type;

> -	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))

> +	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode)

> +		  || (!might_wrap_p

> +		      && (cmp_bits == TYPE_PRECISION (niters_type))))


The TYPE_PRECISION test looks redundant.  E.g. if a loop only executes
N times, all we care about is whether the IV we pick can handle N
iterations without wrapping.  It doesn't really matter what type the
original source code used to hold the loop count.

Thanks,
Richard
Kugan Vivekanandarajah May 28, 2019, 2:33 a.m. UTC | #2
Hi Richard,

Thanks for the review.

On Sat, 25 May 2019 at 19:41, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

> > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c

> > index 77d3dac..d6452a1 100644

> > --- a/gcc/tree-vect-loop-manip.c

> > +++ b/gcc/tree-vect-loop-manip.c

> > @@ -418,7 +418,20 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

> >    tree mask_type = rgm->mask_type;

> >    unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;

> >    poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);

> > -

> > +  bool convert = false;

> > +  tree iv_type = NULL_TREE;

> > +

> > +  /* If the compare_type is not with Pmode size, we will create an IV with

> > +     Pmode size with truncated use (i.e. converted to the correct type).

> > +     This is because using Pmode allows ivopts to reuse the IV for indices

> > +     (in the loads and store).  */

> > +  if (known_lt (GET_MODE_BITSIZE (TYPE_MODE (compare_type)),

> > +             GET_MODE_BITSIZE (Pmode)))

> > +    {

> > +      iv_type = build_nonstandard_integer_type (GET_MODE_BITSIZE (Pmode),

> > +                                             true);

> > +      convert = true;

> > +    }

> >    /* Calculate the maximum number of scalar values that the rgroup

> >       handles in total, the number that it handles for each iteration

> >       of the vector loop, and the number that it should skip during the

> > @@ -444,12 +457,43 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

> >       processed.  */

> >    tree index_before_incr, index_after_incr;

> >    gimple_stmt_iterator incr_gsi;

> > +  gimple_stmt_iterator incr_gsi2;

> >    bool insert_after;

> > -  tree zero_index = build_int_cst (compare_type, 0);

> > +  tree zero_index;

> >    standard_iv_increment_position (loop, &incr_gsi, &insert_after);

> > -  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,

> > -          insert_after, &index_before_incr, &index_after_incr);

> >

> > +  if (convert)

> > +    {

> > +      /* If we are creating IV of Pmode type and converting.  */

> > +      zero_index = build_int_cst (iv_type, 0);

> > +      tree step = build_int_cst (iv_type,

> > +                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));

> > +      /* Creating IV of Pmode type.  */

> > +      create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,

> > +              insert_after, &index_before_incr, &index_after_incr);

> > +      /* Create truncated index_before and after increament.  */

> > +      tree index_before_incr_trunc = make_ssa_name (compare_type);

> > +      tree index_after_incr_trunc = make_ssa_name (compare_type);

> > +      gimple *incr_before_stmt = gimple_build_assign (index_before_incr_trunc,

> > +                                                   NOP_EXPR,

> > +                                                   index_before_incr);

> > +      gimple *incr_after_stmt = gimple_build_assign (index_after_incr_trunc,

> > +                                                  NOP_EXPR,

> > +                                                  index_after_incr);

> > +      incr_gsi2 = incr_gsi;

> > +      gsi_insert_before (&incr_gsi2, incr_before_stmt, GSI_NEW_STMT);

> > +      gsi_insert_after (&incr_gsi, incr_after_stmt, GSI_NEW_STMT);

> > +      index_before_incr = index_before_incr_trunc;

> > +      index_after_incr = index_after_incr_trunc;

> > +      zero_index = build_int_cst (compare_type, 0);

> > +    }

> > +  else

> > +    {

> > +      /* If the IV is of Pmode compare_type, no convertion needed.  */

> > +      zero_index = build_int_cst (compare_type, 0);

> > +      create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,

> > +              insert_after, &index_before_incr, &index_after_incr);

> > +    }

> >    tree test_index, test_limit, first_limit;

> >    gimple_stmt_iterator *test_gsi;

> >    if (might_wrap_p)

>

> Instead of hard-coding Pmode as a special case here, I think we should

> record the IV type in vect_verify_full_masking in addition to the comparison

> type.  (With the IV type always being at least as wide as the comparison

> type.)

Ok.

>

> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

> > index bd81193..2769c86 100644

> > --- a/gcc/tree-vect-loop.c

> > +++ b/gcc/tree-vect-loop.c

> > @@ -1035,6 +1035,30 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

> >    /* Find a scalar mode for which WHILE_ULT is supported.  */

> >    opt_scalar_int_mode cmp_mode_iter;

> >    tree cmp_type = NULL_TREE;

> > +  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));

> > +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

> > +  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

> > +  widest_int iv_limit;

> > +  bool known_max_iters = max_loop_iterations (loop, &iv_limit);

> > +  if (known_max_iters)

> > +    {

> > +      if (niters_skip)

> > +     {

> > +       /* Add the maximum number of skipped iterations to the

> > +          maximum iteration count.  */

> > +       if (TREE_CODE (niters_skip) == INTEGER_CST)

> > +         iv_limit += wi::to_widest (niters_skip);

> > +       else

> > +         iv_limit += max_vf - 1;

> > +     }

> > +      /* IV_LIMIT is the maximum number of latch iterations, which is also

> > +      the maximum in-range IV value.  Round this value down to the previous

> > +      vector alignment boundary and then add an extra full iteration.  */

> > +      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

> > +      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;

> > +    }

> > +

> > +  /* Get the vectorization factor in tree form.  */

>

> Please split the loop-manip.c bits you need out into a subroutine instead

> of cut-&-pasting. :-)

Ok.

>

> >    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)

> >      {

> >        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());

> > @@ -1045,12 +1069,23 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

> >         if (this_type

> >             && can_produce_all_loop_masks_p (loop_vinfo, this_type))

> >           {

> > +           /* See whether zero-based IV would ever generate all-false masks

> > +              before wrapping around.  */

> > +           bool might_wrap_p

> > +             = (!known_max_iters

> > +                || (wi::min_precision

> > +                    (iv_limit

> > +                     * vect_get_max_nscalars_per_iter (loop_vinfo),

> > +                     UNSIGNED) > cmp_bits));

>

> The wi::min_precision is invariant, so there's no need to calculate it in

> each iteration of the loop.  Would be good to avoid the duplicated call to

> vect_get_max_nscalars_per_iter (also called for min_ni_width).

Ok.

>

> >             /* Although we could stop as soon as we find a valid mode,

> >                it's often better to continue until we hit Pmode, since the

> >                operands to the WHILE are more likely to be reusable in

> > -              address calculations.  */

> > +              address calculations.  Unless the limit is extended from

> > +              this_type.  */

>

> Please rewrite the comment to describe the new code rather than tacking

> this kind of thing on the end.

Ok.

>

> >             cmp_type = this_type;

> > -           if (cmp_bits >= GET_MODE_BITSIZE (Pmode))

> > +           if (cmp_bits >= GET_MODE_BITSIZE (Pmode)

> > +               || (!might_wrap_p

> > +                   && (cmp_bits == TYPE_PRECISION (niters_type))))

>

> The TYPE_PRECISION test looks redundant.  E.g. if a loop only executes

> N times, all we care about is whether the IV we pick can handle N

> iterations without wrapping.  It doesn't really matter what type the

> original source code used to hold the loop count.

The attached patch is revised based on your comments.  Does this look better?

Thanks,
Kugan
>

> Thanks,

> Richard
Richard Sandiford May 28, 2019, 10:44 a.m. UTC | #3
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:
> [...]

> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c

> index b3fae5b..c15b8a2 100644

> --- a/gcc/tree-vect-loop-manip.c

> +++ b/gcc/tree-vect-loop-manip.c

> @@ -415,10 +415,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

>  			      bool might_wrap_p)

>  {

>    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);

> +  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);

>    tree mask_type = rgm->mask_type;

>    unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;

>    poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);

> +  bool convert = false;

>  

> +  /* If the compare_type is not iv_type, we will create an IV with

> +     iv_type with truncated use (i.e. converted to the correct type).  */

> +  if (compare_type != iv_type)

> +    convert = true;

>    /* Calculate the maximum number of scalar values that the rgroup

>       handles in total, the number that it handles for each iteration

>       of the vector loop, and the number that it should skip during the

> @@ -444,12 +450,43 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

>       processed.  */

>    tree index_before_incr, index_after_incr;

>    gimple_stmt_iterator incr_gsi;

> +  gimple_stmt_iterator incr_gsi2;

>    bool insert_after;

> -  tree zero_index = build_int_cst (compare_type, 0);

> +  tree zero_index;

>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);

> -  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,

> -	     insert_after, &index_before_incr, &index_after_incr);

>  

> +  if (convert)

> +    {

> +      /* If we are creating IV of iv_type and then converting.  */

> +      zero_index = build_int_cst (iv_type, 0);

> +      tree step = build_int_cst (iv_type,

> +				 LOOP_VINFO_VECT_FACTOR (loop_vinfo));

> +      /* Creating IV of iv_type.  */

> +      create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,

> +		 insert_after, &index_before_incr, &index_after_incr);

> +      /* Create truncated index_before and after increament.  */

> +      tree index_before_incr_trunc = make_ssa_name (compare_type);

> +      tree index_after_incr_trunc = make_ssa_name (compare_type);

> +      gimple *incr_before_stmt = gimple_build_assign (index_before_incr_trunc,

> +						      NOP_EXPR,

> +						      index_before_incr);

> +      gimple *incr_after_stmt = gimple_build_assign (index_after_incr_trunc,

> +						     NOP_EXPR,

> +						     index_after_incr);

> +      incr_gsi2 = incr_gsi;

> +      gsi_insert_before (&incr_gsi2, incr_before_stmt, GSI_NEW_STMT);

> +      gsi_insert_after (&incr_gsi, incr_after_stmt, GSI_NEW_STMT);

> +      index_before_incr = index_before_incr_trunc;

> +      index_after_incr = index_after_incr_trunc;

> +      zero_index = build_int_cst (compare_type, 0);

> +    }

> +  else

> +    {

> +      /* If the IV is of compare_type, no convertion needed.  */

> +      zero_index = build_int_cst (compare_type, 0);

> +      create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,

> +		 insert_after, &index_before_incr, &index_after_incr);

> +    }

>    tree test_index, test_limit, first_limit;

>    gimple_stmt_iterator *test_gsi;

>    if (might_wrap_p)


Now that we have an explicit iv_type, there shouldn't be any need to
treat this as two special cases.  I think we should just convert the
IV to the comparison type before passing it to the WHILE.
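
I.e. create the IV in IV_TYPE unconditionally:

  tree zero_index = build_int_cst (iv_type, 0);
  tree step = build_int_cst (iv_type, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
             insert_after, &index_before_incr, &index_after_incr);

and then convert the test index once to COMPARE_TYPE before generating the
WHILE_ULTs, rather than having two copies of the IV creation.  (Untested
sketch; this is the shape the later revisions of the patch end up with.)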

> @@ -617,6 +654,41 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

>    return next_mask;

>  }

>  

> +/* Return the iv_limit for fully masked loop LOOP with LOOP_VINFO.

> +   If it is not possible to calcilate iv_limit, return -1.  */


Maybe:

/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
   return the value that the induction variable must be able to hold
   in order to ensure that the loop ends with an all-false mask.
   Return -1 otherwise.  */

I think the function should go on in tree-vect-loop.c instead.

> +widest_int

> +vect_get_loop_iv_limit (struct loop *loop, loop_vec_info loop_vinfo)


Maybe: vect_iv_limit_for_full_masking

Probably worth dropping the "loop" parameter and getting it from
LOOP_VINFO.

> +{

> +  /* Convert skip_niters to the right type.  */


Comment no longer applies.

> +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

> +  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

> +

> +  /* Now calculate the value that the induction variable must be able

> +     to hit in order to ensure that we end the loop with an all-false mask.

> +     This involves adding the maximum number of inactive trailing scalar

> +     iterations.  */

> +  widest_int iv_limit = -1;

> +  bool known_max_iters = max_loop_iterations (loop, &iv_limit);

> +  if (known_max_iters)


No need for this temporary variable.

> +    {

> +      if (niters_skip)

> +	{

> +	  /* Add the maximum number of skipped iterations to the

> +	     maximum iteration count.  */

> +	  if (TREE_CODE (niters_skip) == INTEGER_CST)

> +	    iv_limit += wi::to_widest (niters_skip);

> +	  else

> +	    iv_limit += max_vf - 1;

> +	}


Note that MASK_SKIP_NITERS isn't set at the point you call it
for vect_set_loop_condition_masked.  I think we should have:

    else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
      /* Make a conservatively-correct assumption.  */
      iv_limit += max_vf - 1;
      
> +      /* IV_LIMIT is the maximum number of latch iterations, which is also

> +	 the maximum in-range IV value.  Round this value down to the previous

> +	 vector alignment boundary and then add an extra full iteration.  */

> +      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

> +      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;

> +    }

> +  return iv_limit;

> +}

> +

>  /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.

>     LOOP_VINFO describes the vectorization of LOOP.  NITERS is the

>     number of iterations of the original scalar loop that should be

> [...]

> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

> index e1229a5..431025b 100644

> --- a/gcc/tree-vect-loop.c

> +++ b/gcc/tree-vect-loop.c

> @@ -1056,6 +1056,16 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

>    /* Find a scalar mode for which WHILE_ULT is supported.  */

>    opt_scalar_int_mode cmp_mode_iter;

>    tree cmp_type = NULL_TREE;

> +  tree iv_type = NULL_TREE;

> +  widest_int iv_limit = vect_get_loop_iv_limit (loop, loop_vinfo);

> +  widest_int iv_precision = -1;


iv_precision should be unsigned int.  Setting it to UINT_MAX would
simplify the later code.

> +

> +  if (iv_limit != -1)

> +    iv_precision

> +      = wi::min_precision (iv_limit

> +			   * vect_get_max_nscalars_per_iter (loop_vinfo),

> +			   UNSIGNED);

> +


Would be good to avoid the duplicated call to
vect_get_max_nscalars_per_iter (also called for min_ni_width).

>    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)

>      {

>        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());

> @@ -1066,13 +1076,25 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

>  	  if (this_type

>  	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))

>  	    {

> -	      /* Although we could stop as soon as we find a valid mode,

> -		 it's often better to continue until we hit Pmode, since the

> +	      /* See whether zero-based IV would ever generate all-false masks

> +		 before wrapping around.  */

> +	      bool might_wrap_p = (iv_limit == -1 || (iv_precision > cmp_bits));


With the above change, the iv_limit check would no longer be needed.

> +	      /* Stop as soon as we find a valid mode. If we decided to use

> +		 cmp_type which is less than Pmode precision, it is often better

> +		 to use iv_type corresponding to Pmode, since the

>  		 operands to the WHILE are more likely to be reusable in

> -		 address calculations.  */

> +		 address calculations in this case.  */

>  	      cmp_type = this_type;

> +	      iv_type = this_type;

>  	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))

>  		break;

> +	      if (!might_wrap_p)

> +		{

> +		  iv_type

> +		    = build_nonstandard_integer_type (GET_MODE_BITSIZE (Pmode),

> +						      true);

> +		  break;

> +		}


I think the loop should break in the same place as before, with the
iv_type being what used to be the cmp_type.  The new behaviour is that
(for the new meaning of cmp_type) we keep the current cmp_type if its
precision is already >= iv_precision.
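
I.e. keep the break where it was and only change how cmp_type is chosen,
roughly (sketch only):

  iv_type = this_type;
  if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
    cmp_type = this_type;
  if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
    break;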

Thanks,
Richard
Kugan Vivekanandarajah May 31, 2019, 3:21 a.m. UTC | #4
Hi Richard,

Thanks for the review.

On Tue, 28 May 2019 at 20:44, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

> > [...]

> > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c

> > index b3fae5b..c15b8a2 100644

> > --- a/gcc/tree-vect-loop-manip.c

> > +++ b/gcc/tree-vect-loop-manip.c

> > @@ -415,10 +415,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

> >                             bool might_wrap_p)

> >  {

> >    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);

> > +  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);

> >    tree mask_type = rgm->mask_type;

> >    unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;

> >    poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);

> > +  bool convert = false;

> >

> > +  /* If the compare_type is not iv_type, we will create an IV with

> > +     iv_type with truncated use (i.e. converted to the correct type).  */

> > +  if (compare_type != iv_type)

> > +    convert = true;

> >    /* Calculate the maximum number of scalar values that the rgroup

> >       handles in total, the number that it handles for each iteration

> >       of the vector loop, and the number that it should skip during the

> > @@ -444,12 +450,43 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

> >       processed.  */

> >    tree index_before_incr, index_after_incr;

> >    gimple_stmt_iterator incr_gsi;

> > +  gimple_stmt_iterator incr_gsi2;

> >    bool insert_after;

> > -  tree zero_index = build_int_cst (compare_type, 0);

> > +  tree zero_index;

> >    standard_iv_increment_position (loop, &incr_gsi, &insert_after);

> > -  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,

> > -          insert_after, &index_before_incr, &index_after_incr);

> >

> > +  if (convert)

> > +    {

> > +      /* If we are creating IV of iv_type and then converting.  */

> > +      zero_index = build_int_cst (iv_type, 0);

> > +      tree step = build_int_cst (iv_type,

> > +                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));

> > +      /* Creating IV of iv_type.  */

> > +      create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,

> > +              insert_after, &index_before_incr, &index_after_incr);

> > +      /* Create truncated index_before and after increament.  */

> > +      tree index_before_incr_trunc = make_ssa_name (compare_type);

> > +      tree index_after_incr_trunc = make_ssa_name (compare_type);

> > +      gimple *incr_before_stmt = gimple_build_assign (index_before_incr_trunc,

> > +                                                   NOP_EXPR,

> > +                                                   index_before_incr);

> > +      gimple *incr_after_stmt = gimple_build_assign (index_after_incr_trunc,

> > +                                                  NOP_EXPR,

> > +                                                  index_after_incr);

> > +      incr_gsi2 = incr_gsi;

> > +      gsi_insert_before (&incr_gsi2, incr_before_stmt, GSI_NEW_STMT);

> > +      gsi_insert_after (&incr_gsi, incr_after_stmt, GSI_NEW_STMT);

> > +      index_before_incr = index_before_incr_trunc;

> > +      index_after_incr = index_after_incr_trunc;

> > +      zero_index = build_int_cst (compare_type, 0);

> > +    }

> > +  else

> > +    {

> > +      /* If the IV is of compare_type, no convertion needed.  */

> > +      zero_index = build_int_cst (compare_type, 0);

> > +      create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,

> > +              insert_after, &index_before_incr, &index_after_incr);

> > +    }

> >    tree test_index, test_limit, first_limit;

> >    gimple_stmt_iterator *test_gsi;

> >    if (might_wrap_p)

>

> Now that we have an explicit iv_type, there shouldn't be any need to

> treat this as two special cases.  I think we should just convert the

> IV to the comparison type before passing it to the WHILE.


Changed it.
>

> > @@ -617,6 +654,41 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

> >    return next_mask;

> >  }

> >

> > +/* Return the iv_limit for fully masked loop LOOP with LOOP_VINFO.

> > +   If it is not possible to calcilate iv_limit, return -1.  */

>

> Maybe:

>

> /* Decide whether it is possible to use a zero-based induction variable

>    when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,

>    return the value that the induction variable must be able to hold

>    in order to ensure that the loop ends with an all-false mask.

>    Return -1 otherwise.  */

>

> I think the function should go on in tree-vect-loop.c instead.


OK.
>

> > +widest_int

> > +vect_get_loop_iv_limit (struct loop *loop, loop_vec_info loop_vinfo)

>

> Maybe: vect_iv_limit_for_full_masking

>

> Probably worth dropping the "loop" parameter and getting it from

> LOOP_VINFO.

OK.

>

> > +

> > +  /* Convert skip_niters to the right type.  */

>

> Comment no longer applies.

>

> > +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

> > +  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

> > +

> > +  /* Now calculate the value that the induction variable must be able

> > +     to hit in order to ensure that we end the loop with an all-false mask.

> > +     This involves adding the maximum number of inactive trailing scalar

> > +     iterations.  */

> > +  widest_int iv_limit = -1;

> > +  bool known_max_iters = max_loop_iterations (loop, &iv_limit);

> > +  if (known_max_iters)

>

> No need for this temporary variable.

>

> > +    {

> > +      if (niters_skip)

> > +     {

> > +       /* Add the maximum number of skipped iterations to the

> > +          maximum iteration count.  */

> > +       if (TREE_CODE (niters_skip) == INTEGER_CST)

> > +         iv_limit += wi::to_widest (niters_skip);

> > +       else

> > +         iv_limit += max_vf - 1;

> > +     }

>

> Note that MASK_SKIP_NITERS isn't set at the point you call it

> for vect_set_loop_condition_masked.  I think we should have:

>

>     else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))

>       /* Make a conservatively-correct assumption.  */

>       iv_limit += max_vf - 1;

OK.

>

> > +      /* IV_LIMIT is the maximum number of latch iterations, which is also

> > +      the maximum in-range IV value.  Round this value down to the previous

> > +      vector alignment boundary and then add an extra full iteration.  */

> > +      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

> > +      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;

> > +    }

> > +  return iv_limit;

> > +}

> > +

> >  /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.

> >     LOOP_VINFO describes the vectorization of LOOP.  NITERS is the

> >     number of iterations of the original scalar loop that should be

> > [...]

> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

> > index e1229a5..431025b 100644

> > --- a/gcc/tree-vect-loop.c

> > +++ b/gcc/tree-vect-loop.c

> > @@ -1056,6 +1056,16 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

> >    /* Find a scalar mode for which WHILE_ULT is supported.  */

> >    opt_scalar_int_mode cmp_mode_iter;

> >    tree cmp_type = NULL_TREE;

> > +  tree iv_type = NULL_TREE;

> > +  widest_int iv_limit = vect_get_loop_iv_limit (loop, loop_vinfo);

> > +  widest_int iv_precision = -1;

>

> iv_precision should be unsigned int.  Setting it to UINT_MAX would

> simplify the later code.

OK.

>

> > +

> > +  if (iv_limit != -1)

> > +    iv_precision

> > +      = wi::min_precision (iv_limit

> > +                        * vect_get_max_nscalars_per_iter (loop_vinfo),

> > +                        UNSIGNED);

> > +

>

> Would be good to avoid the duplicated call to

> vect_get_max_nscalars_per_iter (also called for min_ni_width).

OK.

>

> >    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)

> >      {

> >        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());

> > @@ -1066,13 +1076,25 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

> >         if (this_type

> >             && can_produce_all_loop_masks_p (loop_vinfo, this_type))

> >           {

> > -           /* Although we could stop as soon as we find a valid mode,

> > -              it's often better to continue until we hit Pmode, since the

> > +           /* See whether zero-based IV would ever generate all-false masks

> > +              before wrapping around.  */

> > +           bool might_wrap_p = (iv_limit == -1 || (iv_precision > cmp_bits));

>

> With the above change, the iv_limit check would no longer be needed.

>

> > +           /* Stop as soon as we find a valid mode. If we decided to use

> > +              cmp_type which is less than Pmode precision, it is often better

> > +              to use iv_type corresponding to Pmode, since the

> >                operands to the WHILE are more likely to be reusable in

> > -              address calculations.  */

> > +              address calculations in this case.  */

> >             cmp_type = this_type;

> > +           iv_type = this_type;

> >             if (cmp_bits >= GET_MODE_BITSIZE (Pmode))

> >               break;

> > +           if (!might_wrap_p)

> > +             {

> > +               iv_type

> > +                 = build_nonstandard_integer_type (GET_MODE_BITSIZE (Pmode),

> > +                                                   true);

> > +               break;

> > +             }

>

> I think the loop should break in the same place as before, with the

> iv_type being what used to be the cmp_type.  The new behaviour is that

> (for the new meaning of cmp_type) we keep the current cmp_type if its

> precision is already >= iv_precision.


OK.

The attached patch fixes the issues raised.  Does this look better?

Thanks,
Kugan
>

> Thanks,

> Richard
Richard Sandiford May 31, 2019, 9:43 a.m. UTC | #5
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:
> @@ -609,8 +615,14 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

>  

>        /* Get the mask value for the next iteration of the loop.  */

>        next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");

> -      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);

> -      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

> +      tree test_index_cmp_type = make_ssa_name (compare_type);

> +      gimple *conv_stmt = gimple_build_assign (test_index_cmp_type,

> +					       NOP_EXPR,

> +					       test_index);

> +      gsi_insert_before (test_gsi, conv_stmt, GSI_NEW_STMT);


We only need to convert once, so this should happen before the
FOR_EACH_VEC_ELT_REVERSE loop.  Would be better as:

  gimple_seq test_seq = NULL;
  test_index = gimple_convert (&test_seq, compare_type, test_index);
  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);

so that we don't generate unnecessary converts.

> +      gcall *call = vect_gen_while (next_mask, test_index_cmp_type,

> +				    this_test_limit);

> +      gsi_insert_after (test_gsi, call, GSI_SAME_STMT);

>  

>        vect_set_loop_mask (loop, mask, init_mask, next_mask);

>      }

> @@ -637,12 +649,12 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,

>  

>    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);

>    unsigned int compare_precision = TYPE_PRECISION (compare_type);

> -  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

>    tree orig_niters = niters;

>  

>    /* Type of the initial value of NITERS.  */

>    tree ni_actual_type = TREE_TYPE (niters);

>    unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);

> +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

>  

>    /* Convert NITERS to the same size as the compare.  */

>    if (compare_precision > ni_actual_precision

> @@ -661,33 +673,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,

>    else

>      niters = gimple_convert (&preheader_seq, compare_type, niters);

>  

> -  /* Convert skip_niters to the right type.  */

> -  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

> -

> -  /* Now calculate the value that the induction variable must be able

> -     to hit in order to ensure that we end the loop with an all-false mask.

> -     This involves adding the maximum number of inactive trailing scalar

> -     iterations.  */

> -  widest_int iv_limit;

> -  bool known_max_iters = max_loop_iterations (loop, &iv_limit);

> -  if (known_max_iters)

> -    {

> -      if (niters_skip)

> -	{

> -	  /* Add the maximum number of skipped iterations to the

> -	     maximum iteration count.  */

> -	  if (TREE_CODE (niters_skip) == INTEGER_CST)

> -	    iv_limit += wi::to_widest (niters_skip);

> -	  else

> -	    iv_limit += max_vf - 1;

> -	}

> -      /* IV_LIMIT is the maximum number of latch iterations, which is also

> -	 the maximum in-range IV value.  Round this value down to the previous

> -	 vector alignment boundary and then add an extra full iteration.  */

> -      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

> -      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;

> -    }

> -

> +  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);

>    /* Get the vectorization factor in tree form.  */

>    tree vf = build_int_cst (compare_type,

>  			   LOOP_VINFO_VECT_FACTOR (loop_vinfo));

> @@ -717,7 +703,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,

>  	/* See whether zero-based IV would ever generate all-false masks

>  	   before wrapping around.  */

>  	bool might_wrap_p

> -	  = (!known_max_iters

> +	  = (iv_limit == UINT_MAX


Shouldn't this be == -1?

>  	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,

>  				    UNSIGNED)

>  		 > compare_precision));

> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

> index 4942c69..1240037 100644

> --- a/gcc/tree-vect-loop.c

> +++ b/gcc/tree-vect-loop.c

> @@ -1029,7 +1029,10 @@ static bool

>  vect_verify_full_masking (loop_vec_info loop_vinfo)

>  {

>    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

> +  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));

>    unsigned int min_ni_width;

> +  unsigned int max_nscalars_per_iter

> +    = vect_get_max_nscalars_per_iter (loop_vinfo);

>  

>    /* Use a normal loop if there are no statements that need masking.

>       This only happens in rare degenerate cases: it means that the loop

> @@ -1048,7 +1051,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

>      max_ni = wi::smin (max_ni, max_back_edges + 1);

>  

>    /* Account for rgroup masks, in which each bit is replicated N times.  */

> -  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);

> +  max_ni *= max_nscalars_per_iter;

>  

>    /* Work out how many bits we need to represent the limit.  */

>    min_ni_width = wi::min_precision (max_ni, UNSIGNED);

> @@ -1056,6 +1059,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

>    /* Find a scalar mode for which WHILE_ULT is supported.  */

>    opt_scalar_int_mode cmp_mode_iter;

>    tree cmp_type = NULL_TREE;

> +  tree iv_type = NULL_TREE;

> +  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);

> +  widest_int iv_precision = UINT_MAX;

> +

> +  if (iv_limit != UINT_MAX)


Same for != -1 here.

> +    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,

> +				      UNSIGNED);

> +

>    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)

>      {

>        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());

> @@ -1066,11 +1077,18 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

>  	  if (this_type

>  	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))

>  	    {

> -	      /* Although we could stop as soon as we find a valid mode,

> -		 it's often better to continue until we hit Pmode, since the

> +	      /* See whether zero-based IV would ever generate all-false masks

> +		 before wrapping around.  */

> +	      bool might_wrap_p = (iv_precision > cmp_bits);

> +	      /* Stop as soon as we find a valid mode.  If we decided to use

> +		 cmp_type which is less than Pmode precision, it is often better

> +		 to use iv_type corresponding to Pmode, since the

>  		 operands to the WHILE are more likely to be reusable in

> -		 address calculations.  */

> -	      cmp_type = this_type;

> +		 address calculations in this case.  */

> +	      iv_type = this_type;

> +	      if (might_wrap_p

> +		  || (cmp_bits <= TYPE_PRECISION (niters_type)))

> +		cmp_type = this_type;


It's not obvious that this always sets cmp_type when it could be set.
I think instead we should have:

	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
		cmp_type = this_type;

Thanks,
Richard
Kugan Vivekanandarajah June 3, 2019, 3:23 a.m. UTC | #6
Hi Richard,

Thanks for the review,

On Fri, 31 May 2019 at 19:43, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

> > @@ -609,8 +615,14 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

> >

> >        /* Get the mask value for the next iteration of the loop.  */

> >        next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");

> > -      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);

> > -      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

> > +      tree test_index_cmp_type = make_ssa_name (compare_type);

> > +      gimple *conv_stmt = gimple_build_assign (test_index_cmp_type,

> > +                                            NOP_EXPR,

> > +                                            test_index);

> > +      gsi_insert_before (test_gsi, conv_stmt, GSI_NEW_STMT);

>

> We only need to convert once, so this should happen before the

> FOR_EACH_VEC_ELT_REVERSE loop.  Would be better as:

>

>   gimple_seq test_seq = NULL;

>   test_index = gimple_convert (&test_seq, compare_type, test_index);

>   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);


Ok.
>

> so that we don't generate unnecessary converts.

>

> > +      gcall *call = vect_gen_while (next_mask, test_index_cmp_type,

> > +                                 this_test_limit);

> > +      gsi_insert_after (test_gsi, call, GSI_SAME_STMT);

> >

> >        vect_set_loop_mask (loop, mask, init_mask, next_mask);

> >      }

> > @@ -637,12 +649,12 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,

> >

> >    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);

> >    unsigned int compare_precision = TYPE_PRECISION (compare_type);

> > -  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

> >    tree orig_niters = niters;

> >

> >    /* Type of the initial value of NITERS.  */

> >    tree ni_actual_type = TREE_TYPE (niters);

> >    unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);

> > +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

> >

> >    /* Convert NITERS to the same size as the compare.  */

> >    if (compare_precision > ni_actual_precision

> > @@ -661,33 +673,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,

> >    else

> >      niters = gimple_convert (&preheader_seq, compare_type, niters);

> >

> > -  /* Convert skip_niters to the right type.  */

> > -  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

> > -

> > -  /* Now calculate the value that the induction variable must be able

> > -     to hit in order to ensure that we end the loop with an all-false mask.

> > -     This involves adding the maximum number of inactive trailing scalar

> > -     iterations.  */

> > -  widest_int iv_limit;

> > -  bool known_max_iters = max_loop_iterations (loop, &iv_limit);

> > -  if (known_max_iters)

> > -    {

> > -      if (niters_skip)

> > -     {

> > -       /* Add the maximum number of skipped iterations to the

> > -          maximum iteration count.  */

> > -       if (TREE_CODE (niters_skip) == INTEGER_CST)

> > -         iv_limit += wi::to_widest (niters_skip);

> > -       else

> > -         iv_limit += max_vf - 1;

> > -     }

> > -      /* IV_LIMIT is the maximum number of latch iterations, which is also

> > -      the maximum in-range IV value.  Round this value down to the previous

> > -      vector alignment boundary and then add an extra full iteration.  */

> > -      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

> > -      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;

> > -    }

> > -

> > +  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);

> >    /* Get the vectorization factor in tree form.  */

> >    tree vf = build_int_cst (compare_type,

> >                          LOOP_VINFO_VECT_FACTOR (loop_vinfo));

> > @@ -717,7 +703,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,

> >       /* See whether zero-based IV would ever generate all-false masks

> >          before wrapping around.  */

> >       bool might_wrap_p

> > -       = (!known_max_iters

> > +       = (iv_limit == UINT_MAX

>

> Shouldn't this be == -1?

>

> >            || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,

> >                                   UNSIGNED)

> >                > compare_precision));

> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

> > index 4942c69..1240037 100644

> > --- a/gcc/tree-vect-loop.c

> > +++ b/gcc/tree-vect-loop.c

> > @@ -1029,7 +1029,10 @@ static bool

> >  vect_verify_full_masking (loop_vec_info loop_vinfo)

> >  {

> >    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

> > +  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));

> >    unsigned int min_ni_width;

> > +  unsigned int max_nscalars_per_iter

> > +    = vect_get_max_nscalars_per_iter (loop_vinfo);

> >

> >    /* Use a normal loop if there are no statements that need masking.

> >       This only happens in rare degenerate cases: it means that the loop

> > @@ -1048,7 +1051,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

> >      max_ni = wi::smin (max_ni, max_back_edges + 1);

> >

> >    /* Account for rgroup masks, in which each bit is replicated N times.  */

> > -  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);

> > +  max_ni *= max_nscalars_per_iter;

> >

> >    /* Work out how many bits we need to represent the limit.  */

> >    min_ni_width = wi::min_precision (max_ni, UNSIGNED);

> > @@ -1056,6 +1059,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

> >    /* Find a scalar mode for which WHILE_ULT is supported.  */

> >    opt_scalar_int_mode cmp_mode_iter;

> >    tree cmp_type = NULL_TREE;

> > +  tree iv_type = NULL_TREE;

> > +  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);

> > +  widest_int iv_precision = UINT_MAX;

> > +

> > +  if (iv_limit != UINT_MAX)

>

> Same for != -1 here.

Yes. Changed this and above.

>

> > +    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,

> > +                                   UNSIGNED);

> > +

> >    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)

> >      {

> >        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());

> > @@ -1066,11 +1077,18 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

> >         if (this_type

> >             && can_produce_all_loop_masks_p (loop_vinfo, this_type))

> >           {

> > -           /* Although we could stop as soon as we find a valid mode,

> > -              it's often better to continue until we hit Pmode, since the

> > +           /* See whether zero-based IV would ever generate all-false masks

> > +              before wrapping around.  */

> > +           bool might_wrap_p = (iv_precision > cmp_bits);

> > +           /* Stop as soon as we find a valid mode.  If we decided to use

> > +              cmp_type which is less than Pmode precision, it is often better

> > +              to use iv_type corresponding to Pmode, since the

> >                operands to the WHILE are more likely to be reusable in

> > -              address calculations.  */

> > -           cmp_type = this_type;

> > +              address calculations in this case.  */

> > +           iv_type = this_type;

> > +           if (might_wrap_p

> > +               || (cmp_bits <= TYPE_PRECISION (niters_type)))

> > +             cmp_type = this_type;

>

> It's not obvious that this always sets cmp_type when it could be set.

> I think instead we should have:

Ok.

Testing the attached patch now.

Thanks,
Kugan
>

>               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))

>                 cmp_type = this_type;

>

> Thanks,

> Richard
From b01db45dfaae2fd74802c6b36e34a906e6687f81 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
Date: Tue, 28 May 2019 11:57:54 +1000
Subject: [PATCH] PR88838 V5

Change-Id: Ibe3f471eb2d67e177c168a19b1ead7f749880963
---
 gcc/testsuite/gcc.target/aarch64/pr88838.c     | 11 ++++
 gcc/testsuite/gcc.target/aarch64/sve/while_1.c | 16 +++---
 gcc/tree-vect-loop-manip.c                     | 46 ++++++-----------
 gcc/tree-vect-loop.c                           | 70 ++++++++++++++++++++++++--
 gcc/tree-vectorizer.h                          |  6 +++
 5 files changed, 105 insertions(+), 44 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr88838.c

diff --git a/gcc/testsuite/gcc.target/aarch64/pr88838.c b/gcc/testsuite/gcc.target/aarch64/pr88838.c
new file mode 100644
index 0000000..d7db847
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr88838.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-S -O3 -march=armv8.2-a+sve" } */
+
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)
+{
+    for (int i = 0; i < n; i += 1)
+          x[i] = y[i] + z[i];
+}
+
+/* { dg-final { scan-assembler-not "sxtw" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
index a93a04b..05a4860 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
@@ -26,14 +26,14 @@
 TEST_ALL (ADD_LOOP)
 
 /* { dg-final { scan-assembler-not {\tuqdec} } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, x[0-9]+,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, x[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, w[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, w[0-9]+,} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x0, x[0-9]+, lsl 1\]\n} 2 } } */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index b3fae5b..ad838dd 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -415,6 +415,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
 			      bool might_wrap_p)
 {
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
   tree mask_type = rgm->mask_type;
   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
@@ -445,11 +446,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
   tree index_before_incr, index_after_incr;
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
-  tree zero_index = build_int_cst (compare_type, 0);
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
+
+  tree zero_index = build_int_cst (iv_type, 0);
+  tree step = build_int_cst (iv_type,
+			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+  /* Creating IV of iv_type.  */
+  create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
 	     insert_after, &index_before_incr, &index_after_incr);
 
+  zero_index = build_int_cst (compare_type, 0);
   tree test_index, test_limit, first_limit;
   gimple_stmt_iterator *test_gsi;
   if (might_wrap_p)
@@ -529,6 +535,10 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
   tree next_mask = NULL_TREE;
   tree mask;
   unsigned int i;
+  gimple_seq test_seq = NULL;
+  test_index = gimple_convert (&test_seq, compare_type, test_index);
+  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
+
   FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
     {
       /* Previous masks will cover BIAS scalars.  This mask covers the
@@ -637,12 +647,12 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
-  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
   tree orig_niters = niters;
 
   /* Type of the initial value of NITERS.  */
   tree ni_actual_type = TREE_TYPE (niters);
   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 
   /* Convert NITERS to the same size as the compare.  */
   if (compare_precision > ni_actual_precision
@@ -661,33 +671,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
   else
     niters = gimple_convert (&preheader_seq, compare_type, niters);
 
-  /* Convert skip_niters to the right type.  */
-  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-
-  /* Now calculate the value that the induction variable must be able
-     to hit in order to ensure that we end the loop with an all-false mask.
-     This involves adding the maximum number of inactive trailing scalar
-     iterations.  */
-  widest_int iv_limit;
-  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
-  if (known_max_iters)
-    {
-      if (niters_skip)
-	{
-	  /* Add the maximum number of skipped iterations to the
-	     maximum iteration count.  */
-	  if (TREE_CODE (niters_skip) == INTEGER_CST)
-	    iv_limit += wi::to_widest (niters_skip);
-	  else
-	    iv_limit += max_vf - 1;
-	}
-      /* IV_LIMIT is the maximum number of latch iterations, which is also
-	 the maximum in-range IV value.  Round this value down to the previous
-	 vector alignment boundary and then add an extra full iteration.  */
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
-    }
-
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
   /* Get the vectorization factor in tree form.  */
   tree vf = build_int_cst (compare_type,
 			   LOOP_VINFO_VECT_FACTOR (loop_vinfo));
@@ -717,7 +701,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 	/* See whether zero-based IV would ever generate all-false masks
 	   before wrapping around.  */
 	bool might_wrap_p
-	  = (!known_max_iters
+	  = (iv_limit == -1
 	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
 				    UNSIGNED)
 		 > compare_precision));
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 4942c69..5cea1df 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1029,7 +1029,10 @@ static bool
 vect_verify_full_masking (loop_vec_info loop_vinfo)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
   unsigned int min_ni_width;
+  unsigned int max_nscalars_per_iter
+    = vect_get_max_nscalars_per_iter (loop_vinfo);
 
   /* Use a normal loop if there are no statements that need masking.
      This only happens in rare degenerate cases: it means that the loop
@@ -1048,7 +1051,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     max_ni = wi::smin (max_ni, max_back_edges + 1);
 
   /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+  max_ni *= max_nscalars_per_iter;
 
   /* Work out how many bits we need to represent the limit.  */
   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
@@ -1056,6 +1059,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
   tree cmp_type = NULL_TREE;
+  tree iv_type = NULL_TREE;
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+  widest_int iv_precision = UINT_MAX;
+
+  if (iv_limit != -1)
+    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
+				      UNSIGNED);
+
   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
     {
       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1066,11 +1077,17 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
 	  if (this_type
 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
 	    {
-	      /* Although we could stop as soon as we find a valid mode,
-		 it's often better to continue until we hit Pmode, since the
+	      /* See whether zero-based IV would ever generate all-false masks
+		 before wrapping around.  */
+	      bool might_wrap_p = (iv_precision > cmp_bits);
+	      /* Stop as soon as we find a valid mode.  If we decided to use
+		 cmp_type which is less than Pmode precision, it is often better
+		 to use iv_type corresponding to Pmode, since the
 		 operands to the WHILE are more likely to be reusable in
-		 address calculations.  */
-	      cmp_type = this_type;
+		 address calculations in this case.  */
+	      iv_type = this_type;
+	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
+		cmp_type = this_type;
 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
 		break;
 	    }
@@ -1081,6 +1098,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     return false;
 
   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
   return true;
 }
 
@@ -9014,3 +9032,45 @@ optimize_mask_stores (struct loop *loop)
       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
     }
 }
+
+/* Decide whether it is possible to use a zero-based induction variable
+   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
+   return the value that the induction variable must be able to hold
+   in order to ensure that the loop ends with an all-false mask.
+   Return -1 otherwise.  */
+widest_int
+vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+{
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+
+  /* Now calculate the value that the induction variable must be able
+     to hit in order to ensure that we end the loop with an all-false mask.
+     This involves adding the maximum number of inactive trailing scalar
+     iterations.  */
+  widest_int iv_limit = -1;
+  if (max_loop_iterations (loop, &iv_limit))
+    {
+      if (niters_skip)
+	{
+	  /* Add the maximum number of skipped iterations to the
+	     maximum iteration count.  */
+	  if (TREE_CODE (niters_skip) == INTEGER_CST)
+	    iv_limit += wi::to_widest (niters_skip);
+	  else
+	    iv_limit += max_vf - 1;
+	}
+      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+	/* Make a conservatively-correct assumption.  */
+	iv_limit += max_vf - 1;
+
+      /* IV_LIMIT is the maximum number of latch iterations, which is also
+	 the maximum in-range IV value.  Round this value down to the previous
+	 vector alignment boundary and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+  return iv_limit;
+}
+
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4db30cc..eb0f21f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -435,6 +435,10 @@ typedef struct _loop_vec_info : public vec_info {
      is false and vectorized loop otherwise.  */
   tree simd_if_cond;
 
+  /* Type of the IV to use in the WHILE_ULT call for fully-masked
+     loops.  */
+  tree iv_type;
+
   /* Unknown DRs according to which loop was peeled.  */
   struct dr_vec_info *unaligned_dr;
 
@@ -570,6 +574,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_MASKS(L)                (L)->masks
 #define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
 #define LOOP_VINFO_MASK_COMPARE_TYPE(L)    (L)->mask_compare_type
+#define LOOP_VINFO_MASK_IV_TYPE(L)         (L)->iv_type
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->shared->loop_nest
 #define LOOP_VINFO_DATAREFS(L)             (L)->shared->datarefs
@@ -1582,6 +1587,7 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *,
 /* FORNOW: Used in tree-parloops.c.  */
 extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
 						  bool *, bool);
+extern widest_int vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo);
 /* Used in gimple-loop-interchange.c.  */
 extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
 				  enum tree_code);
Richard Sandiford June 3, 2019, 9:08 a.m. UTC | #7
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c

> index b3fae5b..ad838dd 100644

> --- a/gcc/tree-vect-loop-manip.c

> +++ b/gcc/tree-vect-loop-manip.c

> @@ -415,6 +415,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

>  			      bool might_wrap_p)

>  {

>    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);

> +  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);

>    tree mask_type = rgm->mask_type;

>    unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;

>    poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);

> @@ -445,11 +446,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

>    tree index_before_incr, index_after_incr;

>    gimple_stmt_iterator incr_gsi;

>    bool insert_after;

> -  tree zero_index = build_int_cst (compare_type, 0);

>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);

> -  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,

> +

> +  tree zero_index = build_int_cst (iv_type, 0);

> +  tree step = build_int_cst (iv_type,

> +			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));

> +  /* Creating IV of iv_type.  */


s/Creating/Create/

> +  create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,

>  	     insert_after, &index_before_incr, &index_after_incr);

>  

> +  zero_index = build_int_cst (compare_type, 0);

>    tree test_index, test_limit, first_limit;

>    gimple_stmt_iterator *test_gsi;

>    if (might_wrap_p)

> [...]

> @@ -1066,11 +1077,17 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

>  	  if (this_type

>  	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))

>  	    {

> -	      /* Although we could stop as soon as we find a valid mode,

> -		 it's often better to continue until we hit Pmode, since the

> +	      /* See whether zero-based IV would ever generate all-false masks

> +		 before wrapping around.  */

> +	      bool might_wrap_p = (iv_precision > cmp_bits);

> +	      /* Stop as soon as we find a valid mode.  If we decided to use

> +		 cmp_type which is less than Pmode precision, it is often better

> +		 to use iv_type corresponding to Pmode, since the

>  		 operands to the WHILE are more likely to be reusable in

> -		 address calculations.  */

> -	      cmp_type = this_type;

> +		 address calculations in this case.  */


We're not stopping as soon as we find a valid mode though.  Any type
that satisfies the if condition above is valid, but we pick wider
cmp_types and iv_types for optimisation reasons.  How about:

	      /* Although we could stop as soon as we find a valid mode,
		 there are at least two reasons why that's not always the
		 best choice:

		 - An IV that's Pmode or wider is more likely to be reusable
		   in address calculations than an IV that's narrower than
		   Pmode.

		 - Doing the comparison in IV_PRECISION or wider allows
		   a natural 0-based IV, whereas using a narrower comparison
		   type requires mitigations against wrap-around.

		 Conversely, if the IV limit is variable, doing the comparison
		 in a wider type than the original type can introduce
		 unnecessary extensions, so picking the widest valid mode
		 is not always a good choice either.

		 Here we prefer the first IV type that's Pmode or wider,
		 and the first comparison type that's IV_PRECISION or wider.
		 (The comparison type must be no wider than the IV type,
		 to avoid extensions in the vector loop.)

		 ??? We might want to try continuing beyond Pmode for ILP32
		 targets if CMP_BITS < IV_PRECISION.  */

> +	      iv_type = this_type;

> +	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))

> +		cmp_type = this_type;

>  	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))

>  		break;

>  	    }
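
For concreteness, a sketch of the kind of loop this is aiming at,
assuming an LP64 target where Pmode is DImode (essentially the loop
from the PR):

  void
  f (int *restrict x, int *restrict y, int *restrict z, int n)
  {
    for (int i = 0; i < n; i += 1)
      x[i] = y[i] + z[i];
  }

Here the limit is a 32-bit value, so IV_PRECISION is 32 and the
comparison type can stay 32 bits (giving a WHILELO on W registers),
while the IV type keeps widening until it reaches Pmode, so that
ivopts can still reuse the IV for the address calculations.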


> [...]

> @@ -9014,3 +9032,45 @@ optimize_mask_stores (struct loop *loop)

>        add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);

>      }

>  }

> +

> +/* Decide whether it is possible to use a zero-based induction variable

> +   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,

> +   return the value that the induction variable must be able to hold

> +   in order to ensure that the loop ends with an all-false mask.

> +   Return -1 otherwise.  */

> +widest_int

> +vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)

> +{

> +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

> +  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

> +  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

> +

> +  /* Now calculate the value that the induction variable must be able


s/Now calculate/Calculate/

since this comment is no longer following on from earlier code.

OK with those changes, thanks.

Richard
Kugan Vivekanandarajah June 6, 2019, 1:28 a.m. UTC | #8
Hi Richard,

Thanks for the review. Attached is the latest patch.

For a testcase like cond_arith_1.c, gcc ICEs in fwprop with the patch.
I am limiting fwprop in cases like this; is there a better fix?
index cf2c9de..2c99285 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -1358,6 +1358,15 @@ forward_propagate_and_simplify (df_ref use,
rtx_insn *def_insn, rtx def_set)
   else
     mode = GET_MODE (*loc);

+  /* TODO. We can't get the mode for
+     (set (reg:VNx16BI 109)
+          (unspec:VNx16BI [
+        (reg:SI 131)
+        (reg:SI 106)
+           ] UNSPEC_WHILE_LO))
+     Thus, bailout when it is UNSPEC and MODEs are not compatible.  */
+  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg)))
+    return false;
   new_rtx = propagate_rtx (*loc, mode, reg, src,
                  optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)));

Thanks,
Kugan

On Mon, 3 Jun 2019 at 19:08, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

> > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c

> > index b3fae5b..ad838dd 100644

> > --- a/gcc/tree-vect-loop-manip.c

> > +++ b/gcc/tree-vect-loop-manip.c

> > @@ -415,6 +415,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

> >                             bool might_wrap_p)

> >  {

> >    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);

> > +  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);

> >    tree mask_type = rgm->mask_type;

> >    unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;

> >    poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);

> > @@ -445,11 +446,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,

> >    tree index_before_incr, index_after_incr;

> >    gimple_stmt_iterator incr_gsi;

> >    bool insert_after;

> > -  tree zero_index = build_int_cst (compare_type, 0);

> >    standard_iv_increment_position (loop, &incr_gsi, &insert_after);

> > -  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,

> > +

> > +  tree zero_index = build_int_cst (iv_type, 0);

> > +  tree step = build_int_cst (iv_type,

> > +                          LOOP_VINFO_VECT_FACTOR (loop_vinfo));

> > +  /* Creating IV of iv_type.  */

>

> s/Creating/Create/

>

> > +  create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,

> >            insert_after, &index_before_incr, &index_after_incr);

> >

> > +  zero_index = build_int_cst (compare_type, 0);

> >    tree test_index, test_limit, first_limit;

> >    gimple_stmt_iterator *test_gsi;

> >    if (might_wrap_p)

> > [...]

> > @@ -1066,11 +1077,17 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)

> >         if (this_type

> >             && can_produce_all_loop_masks_p (loop_vinfo, this_type))

> >           {

> > -           /* Although we could stop as soon as we find a valid mode,

> > -              it's often better to continue until we hit Pmode, since the

> > +           /* See whether zero-based IV would ever generate all-false masks

> > +              before wrapping around.  */

> > +           bool might_wrap_p = (iv_precision > cmp_bits);

> > +           /* Stop as soon as we find a valid mode.  If we decided to use

> > +              cmp_type which is less than Pmode precision, it is often better

> > +              to use iv_type corresponding to Pmode, since the

> >                operands to the WHILE are more likely to be reusable in

> > -              address calculations.  */

> > -           cmp_type = this_type;

> > +              address calculations in this case.  */

>

> We're not stopping as soon as we find a valid mode though.  Any type

> that satisfies the if condition above is valid, but we pick wider

> cmp_types and iv_types for optimisation reasons.  How about:

>

>               /* Although we could stop as soon as we find a valid mode,

>                  there are at least two reasons why that's not always the

>                  best choice:

>

>                  - An IV that's Pmode or wider is more likely to be reusable

>                    in address calculations than an IV that's narrower than

>                    Pmode.

>

>                  - Doing the comparison in IV_PRECISION or wider allows

>                    a natural 0-based IV, whereas using a narrower comparison

>                    type requires mitigations against wrap-around.

>

>                  Conversely, if the IV limit is variable, doing the comparison

>                  in a wider type than the original type can introduce

>                  unnecessary extensions, so picking the widest valid mode

>                  is not always a good choice either.

>

>                  Here we prefer the first IV type that's Pmode or wider,

>                  and the first comparison type that's IV_PRECISION or wider.

>                  (The comparison type must be no wider than the IV type,

>                  to avoid extensions in the vector loop.)

>

>                  ??? We might want to try continuing beyond Pmode for ILP32

>                  targets if CMP_BITS < IV_PRECISION.  */

>

> > +           iv_type = this_type;

> > +           if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))

> > +             cmp_type = this_type;

> >             if (cmp_bits >= GET_MODE_BITSIZE (Pmode))

> >               break;

> >           }

>

> > [...]

> > @@ -9014,3 +9032,45 @@ optimize_mask_stores (struct loop *loop)

> >        add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);

> >      }

> >  }

> > +

> > +/* Decide whether it is possible to use a zero-based induction variable

> > +   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,

> > +   return the value that the induction variable must be able to hold

> > +   in order to ensure that the loop ends with an all-false mask.

> > +   Return -1 otherwise.  */

> > +widest_int

> > +vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)

> > +{

> > +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

> > +  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

> > +  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

> > +

> > +  /* Now calculate the value that the induction variable must be able

>

> s/Now calculate/Calculate/

>

> since this comment is no longer following on from earlier code.

>

> OK with those changes, thanks.

>

> Richard
From 3195876b6424ce0e079afad9437e96abba3b45c1 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
Date: Tue, 28 May 2019 11:57:54 +1000
Subject: [PATCH] PR88838 V5

Change-Id: Icaa9658359b9b71e0148ff5f9d4a14b6bb8df05a
---
 gcc/fwprop.c                                   |  9 +++
 gcc/testsuite/gcc.target/aarch64/pr88838.c     | 11 ++++
 gcc/testsuite/gcc.target/aarch64/sve/while_1.c | 16 ++---
 gcc/tree-vect-loop-manip.c                     | 46 +++++---------
 gcc/tree-vect-loop.c                           | 85 ++++++++++++++++++++++++--
 gcc/tree-vectorizer.h                          |  6 ++
 6 files changed, 129 insertions(+), 44 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr88838.c

diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index cf2c9de..2c99285 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -1358,6 +1358,15 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set)
   else
     mode = GET_MODE (*loc);
 
+  /* TODO. We can't get the mode for
+     (set (reg:VNx16BI 109)
+          (unspec:VNx16BI [
+	    (reg:SI 131)
+	    (reg:SI 106)
+           ] UNSPEC_WHILE_LO))
+     Thus, bailout when it is UNSPEC and MODEs are not compatible.  */
+  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg)))
+    return false;
   new_rtx = propagate_rtx (*loc, mode, reg, src,
   			   optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)));
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pr88838.c b/gcc/testsuite/gcc.target/aarch64/pr88838.c
new file mode 100644
index 0000000..d7db847
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr88838.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-S -O3 -march=armv8.2-a+sve" } */
+
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)
+{
+    for (int i = 0; i < n; i += 1)
+          x[i] = y[i] + z[i];
+}
+
+/* { dg-final { scan-assembler-not "sxtw" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
index a93a04b..05a4860 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
@@ -26,14 +26,14 @@
 TEST_ALL (ADD_LOOP)
 
 /* { dg-final { scan-assembler-not {\tuqdec} } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, x[0-9]+,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, x[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, w[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, w[0-9]+,} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x0, x[0-9]+, lsl 1\]\n} 2 } } */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index b3fae5b..a0a1bee 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -415,6 +415,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
 			      bool might_wrap_p)
 {
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
   tree mask_type = rgm->mask_type;
   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
@@ -445,11 +446,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
   tree index_before_incr, index_after_incr;
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
-  tree zero_index = build_int_cst (compare_type, 0);
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
+
+  tree zero_index = build_int_cst (iv_type, 0);
+  tree step = build_int_cst (iv_type,
+			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+  /* Create IV of iv_type.  */
+  create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
 	     insert_after, &index_before_incr, &index_after_incr);
 
+  zero_index = build_int_cst (compare_type, 0);
   tree test_index, test_limit, first_limit;
   gimple_stmt_iterator *test_gsi;
   if (might_wrap_p)
@@ -529,6 +535,10 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
   tree next_mask = NULL_TREE;
   tree mask;
   unsigned int i;
+  gimple_seq test_seq = NULL;
+  test_index = gimple_convert (&test_seq, compare_type, test_index);
+  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
+
   FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
     {
       /* Previous masks will cover BIAS scalars.  This mask covers the
@@ -637,12 +647,12 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 
   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
-  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
   tree orig_niters = niters;
 
   /* Type of the initial value of NITERS.  */
   tree ni_actual_type = TREE_TYPE (niters);
   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 
   /* Convert NITERS to the same size as the compare.  */
   if (compare_precision > ni_actual_precision
@@ -661,33 +671,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
   else
     niters = gimple_convert (&preheader_seq, compare_type, niters);
 
-  /* Convert skip_niters to the right type.  */
-  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-
-  /* Now calculate the value that the induction variable must be able
-     to hit in order to ensure that we end the loop with an all-false mask.
-     This involves adding the maximum number of inactive trailing scalar
-     iterations.  */
-  widest_int iv_limit;
-  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
-  if (known_max_iters)
-    {
-      if (niters_skip)
-	{
-	  /* Add the maximum number of skipped iterations to the
-	     maximum iteration count.  */
-	  if (TREE_CODE (niters_skip) == INTEGER_CST)
-	    iv_limit += wi::to_widest (niters_skip);
-	  else
-	    iv_limit += max_vf - 1;
-	}
-      /* IV_LIMIT is the maximum number of latch iterations, which is also
-	 the maximum in-range IV value.  Round this value down to the previous
-	 vector alignment boundary and then add an extra full iteration.  */
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
-    }
-
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
   /* Get the vectorization factor in tree form.  */
   tree vf = build_int_cst (compare_type,
 			   LOOP_VINFO_VECT_FACTOR (loop_vinfo));
@@ -717,7 +701,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 	/* See whether zero-based IV would ever generate all-false masks
 	   before wrapping around.  */
 	bool might_wrap_p
-	  = (!known_max_iters
+	  = (iv_limit == -1
 	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
 				    UNSIGNED)
 		 > compare_precision));
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 4942c69..671ef2f 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1030,6 +1030,8 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int min_ni_width;
+  unsigned int max_nscalars_per_iter
+    = vect_get_max_nscalars_per_iter (loop_vinfo);
 
   /* Use a normal loop if there are no statements that need masking.
      This only happens in rare degenerate cases: it means that the loop
@@ -1048,7 +1050,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     max_ni = wi::smin (max_ni, max_back_edges + 1);
 
   /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+  max_ni *= max_nscalars_per_iter;
 
   /* Work out how many bits we need to represent the limit.  */
   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
@@ -1056,6 +1058,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
   tree cmp_type = NULL_TREE;
+  tree iv_type = NULL_TREE;
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+  widest_int iv_precision = UINT_MAX;
+
+  if (iv_limit != -1)
+    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
+				      UNSIGNED);
+
   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
     {
       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1067,10 +1077,32 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
 	    {
 	      /* Although we could stop as soon as we find a valid mode,
-		 it's often better to continue until we hit Pmode, since the
-		 operands to the WHILE are more likely to be reusable in
-		 address calculations.  */
-	      cmp_type = this_type;
+		 there are at least two reasons why that's not always the
+		 best choice:
+
+		 - An IV that's Pmode or wider is more likely to be reusable
+		 in address calculations than an IV that's narrower than
+		 Pmode.
+
+		 - Doing the comparison in IV_PRECISION or wider allows
+		 a natural 0-based IV, whereas using a narrower comparison
+		 type requires mitigations against wrap-around.
+
+		 Conversely, if the IV limit is variable, doing the comparison
+		 in a wider type than the original type can introduce
+		 unnecessary extensions, so picking the widest valid mode
+		 is not always a good choice either.
+
+		 Here we prefer the first IV type that's Pmode or wider,
+		 and the first comparison type that's IV_PRECISION or wider.
+		 (The comparison type must be no wider than the IV type,
+		 to avoid extensions in the vector loop.)
+
+		 ??? We might want to try continuing beyond Pmode for ILP32
+		 targets if CMP_BITS < IV_PRECISION.  */
+	      iv_type = this_type;
+	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
+		cmp_type = this_type;
 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
 		break;
 	    }
@@ -1081,6 +1113,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     return false;
 
   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
   return true;
 }
 
@@ -9014,3 +9047,45 @@ optimize_mask_stores (struct loop *loop)
       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
     }
 }
+
+/* Decide whether it is possible to use a zero-based induction variable
+   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
+   return the value that the induction variable must be able to hold
+   in order to ensure that the loop ends with an all-false mask.
+   Return -1 otherwise.  */
+widest_int
+vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+{
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+
+  /* Calculate the value that the induction variable must be able
+     to hit in order to ensure that we end the loop with an all-false mask.
+     This involves adding the maximum number of inactive trailing scalar
+     iterations.  */
+  widest_int iv_limit = -1;
+  if (max_loop_iterations (loop, &iv_limit))
+    {
+      if (niters_skip)
+	{
+	  /* Add the maximum number of skipped iterations to the
+	     maximum iteration count.  */
+	  if (TREE_CODE (niters_skip) == INTEGER_CST)
+	    iv_limit += wi::to_widest (niters_skip);
+	  else
+	    iv_limit += max_vf - 1;
+	}
+      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+	/* Make a conservatively-correct assumption.  */
+	iv_limit += max_vf - 1;
+
+      /* IV_LIMIT is the maximum number of latch iterations, which is also
+	 the maximum in-range IV value.  Round this value down to the previous
+	 vector alignment boundary and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+  return iv_limit;
+}
+
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4db30cc..eb0f21f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -435,6 +435,10 @@ typedef struct _loop_vec_info : public vec_info {
      is false and vectorized loop otherwise.  */
   tree simd_if_cond;
 
+  /* Type of the IV to use in the WHILE_ULT call for fully-masked
+     loops.  */
+  tree iv_type;
+
   /* Unknown DRs according to which loop was peeled.  */
   struct dr_vec_info *unaligned_dr;
 
@@ -570,6 +574,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_MASKS(L)                (L)->masks
 #define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
 #define LOOP_VINFO_MASK_COMPARE_TYPE(L)    (L)->mask_compare_type
+#define LOOP_VINFO_MASK_IV_TYPE(L)         (L)->iv_type
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->shared->loop_nest
 #define LOOP_VINFO_DATAREFS(L)             (L)->shared->datarefs
@@ -1582,6 +1587,7 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *,
 /* FORNOW: Used in tree-parloops.c.  */
 extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
 						  bool *, bool);
+extern widest_int vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo);
 /* Used in gimple-loop-interchange.c.  */
 extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
 				  enum tree_code);
Richard Sandiford June 6, 2019, 9:35 a.m. UTC | #9
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:
> Hi Richard,

>

> Thanks for the review. Attached is the latest patch.

>

> For testcase like cond_arith_1.c, with the patch, gcc ICE in fwprop. I

> am limiting fwprop in cases like this. Is there a better fix for this?

> index cf2c9de..2c99285 100644

> --- a/gcc/fwprop.c

> +++ b/gcc/fwprop.c

> @@ -1358,6 +1358,15 @@ forward_propagate_and_simplify (df_ref use,

> rtx_insn *def_insn, rtx def_set)

>    else

>      mode = GET_MODE (*loc);

>

> +  /* TODO. We can't get the mode for

> +     (set (reg:VNx16BI 109)

> +          (unspec:VNx16BI [

> +        (reg:SI 131)

> +        (reg:SI 106)

> +           ] UNSPEC_WHILE_LO))

> +     Thus, bailout when it is UNSPEC and MODEs are not compatible.  */

> +  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg)))

> +    return false;

>    new_rtx = propagate_rtx (*loc, mode, reg, src,

>                   optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)));


What specifically goes wrong?  The unspec above isn't that unusual --
many unspecs have different modes from their inputs.

Thanks,
Richard
Kugan Vivekanandarajah June 6, 2019, 11:39 a.m. UTC | #10
Hi Richard,

On Thu, 6 Jun 2019 at 19:35, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

> > Hi Richard,

> >

> > Thanks for the review. Attached is the latest patch.

> >

> > For testcase like cond_arith_1.c, with the patch, gcc ICE in fwprop. I

> > am limiting fwprop in cases like this. Is there a better fix for this?

> > index cf2c9de..2c99285 100644

> > --- a/gcc/fwprop.c

> > +++ b/gcc/fwprop.c

> > @@ -1358,6 +1358,15 @@ forward_propagate_and_simplify (df_ref use,

> > rtx_insn *def_insn, rtx def_set)

> >    else

> >      mode = GET_MODE (*loc);

> >

> > +  /* TODO. We can't get the mode for

> > +     (set (reg:VNx16BI 109)

> > +          (unspec:VNx16BI [

> > +        (reg:SI 131)

> > +        (reg:SI 106)

> > +           ] UNSPEC_WHILE_LO))

> > +     Thus, bailout when it is UNSPEC and MODEs are not compatible.  */

> > +  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg)))

> > +    return false;

> >    new_rtx = propagate_rtx (*loc, mode, reg, src,

> >                   optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)));

>

> What specifically goes wrong?  The unspec above isn't that unusual --

> many unspecs have different modes from their inputs.


cond_arith_1.c:38:1: internal compiler error: in paradoxical_subreg_p,
at rtl.h:3130
0x135f1d3 paradoxical_subreg_p(machine_mode, machine_mode)
        ../../88838/gcc/rtl.h:3130
0x135f1d3 propagate_rtx
        ../../88838/gcc/fwprop.c:683
0x135f4a3 forward_propagate_and_simplify
        ../../88838/gcc/fwprop.c:1371
0x135f4a3 forward_propagate_into
        ../../88838/gcc/fwprop.c:1430
0x135fdcb fwprop
        ../../88838/gcc/fwprop.c:1519
0x135fdcb execute
        ../../88838/gcc/fwprop.c:1550
Please submit a full bug report,
with preprocessed source if appropriate.


in forward_propagate_and_simplify

use_set:
(set (reg:VNx16BI 96 [ loop_mask_52 ])
    (unspec:VNx16BI [
            (reg:SI 92 [ _3 ])
            (reg:SI 95 [ niters.36 ])
        ] UNSPEC_WHILE_LO))

reg:
(reg:SI 92 [ _3 ])

*loc:
(unspec:VNx16BI [
        (reg:SI 92 [ _3 ])
        (reg:SI 95 [ niters.36 ])
    ] UNSPEC_WHILE_LO)

src:
(subreg:SI (reg:DI 136 [ ivtmp_101 ]) 0)

use_insn:
(insn 87 86 88 4 (parallel [
            (set (reg:VNx16BI 96 [ loop_mask_52 ])
                (unspec:VNx16BI [
                        (reg:SI 92 [ _3 ])
                        (reg:SI 95 [ niters.36 ])
                    ] UNSPEC_WHILE_LO))
            (clobber (reg:CC 66 cc))
        ]) 4255 {while_ultsivnx16bi}
     (expr_list:REG_UNUSED (reg:CC 66 cc)
        (nil)))

I think we calculate the mode to be VNx16BI, which is wrong.  Because
of this, the !paradoxical_subreg_p (mode, GET_MODE (SUBREG_REG (new_rtx)))
check in propagate_rtx ICEs.

Thanks,
Kugan

>

> Thanks,

> Richard
Richard Sandiford June 6, 2019, 12:07 p.m. UTC | #11
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:
> Hi Richard,

>

> On Thu, 6 Jun 2019 at 19:35, Richard Sandiford

> <richard.sandiford@arm.com> wrote:

>>

>> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

>> > Hi Richard,

>> >

>> > Thanks for the review. Attached is the latest patch.

>> >

>> > For testcase like cond_arith_1.c, with the patch, gcc ICE in fwprop. I

>> > am limiting fwprop in cases like this. Is there a better fix for this?

>> > index cf2c9de..2c99285 100644

>> > --- a/gcc/fwprop.c

>> > +++ b/gcc/fwprop.c

>> > @@ -1358,6 +1358,15 @@ forward_propagate_and_simplify (df_ref use,

>> > rtx_insn *def_insn, rtx def_set)

>> >    else

>> >      mode = GET_MODE (*loc);

>> >

>> > +  /* TODO. We can't get the mode for

>> > +     (set (reg:VNx16BI 109)

>> > +          (unspec:VNx16BI [

>> > +        (reg:SI 131)

>> > +        (reg:SI 106)

>> > +           ] UNSPEC_WHILE_LO))

>> > +     Thus, bailout when it is UNSPEC and MODEs are not compatible.  */

>> > +  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg)))

>> > +    return false;

>> >    new_rtx = propagate_rtx (*loc, mode, reg, src,

>> >                   optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)));

>>

>> What specifically goes wrong?  The unspec above isn't that unusual --

>> many unspecs have different modes from their inputs.

>

> cond_arith_1.c:38:1: internal compiler error: in paradoxical_subreg_p,

> at rtl.h:3130

> 0x135f1d3 paradoxical_subreg_p(machine_mode, machine_mode)

>         ../../88838/gcc/rtl.h:3130

> 0x135f1d3 propagate_rtx

>         ../../88838/gcc/fwprop.c:683

> 0x135f4a3 forward_propagate_and_simplify

>         ../../88838/gcc/fwprop.c:1371

> 0x135f4a3 forward_propagate_into

>         ../../88838/gcc/fwprop.c:1430

> 0x135fdcb fwprop

>         ../../88838/gcc/fwprop.c:1519

> 0x135fdcb execute

>         ../../88838/gcc/fwprop.c:1550

> Please submit a full bug report,

> with preprocessed source if appropriate.

>

>

> in forward_propagate_and_simplify

>

> use_set:

> (set (reg:VNx16BI 96 [ loop_mask_52 ])

>     (unspec:VNx16BI [

>             (reg:SI 92 [ _3 ])

>             (reg:SI 95 [ niters.36 ])

>         ] UNSPEC_WHILE_LO))

>

> reg:

> (reg:SI 92 [ _3 ])

>

> *loc:

> (unspec:VNx16BI [

>         (reg:SI 92 [ _3 ])

>         (reg:SI 95 [ niters.36 ])

>     ] UNSPEC_WHILE_LO)

>

> src:

> (subreg:SI (reg:DI 136 [ ivtmp_101 ]) 0)

>

> use_insn:

> (insn 87 86 88 4 (parallel [

>             (set (reg:VNx16BI 96 [ loop_mask_52 ])

>                 (unspec:VNx16BI [

>                         (reg:SI 92 [ _3 ])

>                         (reg:SI 95 [ niters.36 ])

>                     ] UNSPEC_WHILE_LO))

>             (clobber (reg:CC 66 cc))

>         ]) 4255 {while_ultsivnx16bi}

>      (expr_list:REG_UNUSED (reg:CC 66 cc)

>         (nil)))

>

> I think we calculate the mode to be VNx16BI which is wrong?

> because of which in propgate_rtx,   !paradoxical_subreg_p (mode,

> GET_MODE (SUBREG_REG (new_rtx)))))  ICE


Looks like something I hit on the ACLE branch, but didn't have a
non-ACLE reproducer for (see 065881acf0de35ff7818c1fc92769e1c106e1028).

Does the attached work?  The current call is wrong because "mode"
is the mode of "x", not the mode of "new_rtx".

Thanks,
Richard


2019-06-06  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* fwprop.c (propagate_rtx): Fix call to paradoxical_subreg_p.

Index: gcc/fwprop.c
===================================================================
--- gcc/fwprop.c	2019-03-08 18:14:25.333011645 +0000
+++ gcc/fwprop.c	2019-06-06 13:04:34.423476690 +0100
@@ -680,7 +680,7 @@ propagate_rtx (rtx x, machine_mode mode,
       || CONSTANT_P (new_rtx)
       || (GET_CODE (new_rtx) == SUBREG
 	  && REG_P (SUBREG_REG (new_rtx))
-	  && !paradoxical_subreg_p (mode, GET_MODE (SUBREG_REG (new_rtx)))))
+	  && !paradoxical_subreg_p (new_rtx)))
     flags |= PR_CAN_APPEAR;
   if (!varying_mem_p (new_rtx))
     flags |= PR_HANDLE_MEM;
Kugan Vivekanandarajah June 6, 2019, 12:18 p.m. UTC | #12
Hi Richard,

On Thu, 6 Jun 2019 at 22:07, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

> > Hi Richard,

> >

> > On Thu, 6 Jun 2019 at 19:35, Richard Sandiford

> > <richard.sandiford@arm.com> wrote:

> >>

> >> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

> >> > Hi Richard,

> >> >

> >> > Thanks for the review. Attached is the latest patch.

> >> >

> >> > For testcase like cond_arith_1.c, with the patch, gcc ICE in fwprop. I

> >> > am limiting fwprop in cases like this. Is there a better fix for this?

> >> > index cf2c9de..2c99285 100644

> >> > --- a/gcc/fwprop.c

> >> > +++ b/gcc/fwprop.c

> >> > @@ -1358,6 +1358,15 @@ forward_propagate_and_simplify (df_ref use,

> >> > rtx_insn *def_insn, rtx def_set)

> >> >    else

> >> >      mode = GET_MODE (*loc);

> >> >

> >> > +  /* TODO. We can't get the mode for

> >> > +     (set (reg:VNx16BI 109)

> >> > +          (unspec:VNx16BI [

> >> > +        (reg:SI 131)

> >> > +        (reg:SI 106)

> >> > +           ] UNSPEC_WHILE_LO))

> >> > +     Thus, bailout when it is UNSPEC and MODEs are not compatible.  */

> >> > +  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg)))

> >> > +    return false;

> >> >    new_rtx = propagate_rtx (*loc, mode, reg, src,

> >> >                   optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)));

> >>

> >> What specifically goes wrong?  The unspec above isn't that unusual --

> >> many unspecs have different modes from their inputs.

> >

> > cond_arith_1.c:38:1: internal compiler error: in paradoxical_subreg_p,

> > at rtl.h:3130

> > 0x135f1d3 paradoxical_subreg_p(machine_mode, machine_mode)

> >         ../../88838/gcc/rtl.h:3130

> > 0x135f1d3 propagate_rtx

> >         ../../88838/gcc/fwprop.c:683

> > 0x135f4a3 forward_propagate_and_simplify

> >         ../../88838/gcc/fwprop.c:1371

> > 0x135f4a3 forward_propagate_into

> >         ../../88838/gcc/fwprop.c:1430

> > 0x135fdcb fwprop

> >         ../../88838/gcc/fwprop.c:1519

> > 0x135fdcb execute

> >         ../../88838/gcc/fwprop.c:1550

> > Please submit a full bug report,

> > with preprocessed source if appropriate.

> >

> >

> > in forward_propagate_and_simplify

> >

> > use_set:

> > (set (reg:VNx16BI 96 [ loop_mask_52 ])

> >     (unspec:VNx16BI [

> >             (reg:SI 92 [ _3 ])

> >             (reg:SI 95 [ niters.36 ])

> >         ] UNSPEC_WHILE_LO))

> >

> > reg:

> > (reg:SI 92 [ _3 ])

> >

> > *loc:

> > (unspec:VNx16BI [

> >         (reg:SI 92 [ _3 ])

> >         (reg:SI 95 [ niters.36 ])

> >     ] UNSPEC_WHILE_LO)

> >

> > src:

> > (subreg:SI (reg:DI 136 [ ivtmp_101 ]) 0)

> >

> > use_insn:

> > (insn 87 86 88 4 (parallel [

> >             (set (reg:VNx16BI 96 [ loop_mask_52 ])

> >                 (unspec:VNx16BI [

> >                         (reg:SI 92 [ _3 ])

> >                         (reg:SI 95 [ niters.36 ])

> >                     ] UNSPEC_WHILE_LO))

> >             (clobber (reg:CC 66 cc))

> >         ]) 4255 {while_ultsivnx16bi}

> >      (expr_list:REG_UNUSED (reg:CC 66 cc)

> >         (nil)))

> >

> > I think we calculate the mode to be VNx16BI which is wrong?

> > because of which in propgate_rtx,   !paradoxical_subreg_p (mode,

> > GET_MODE (SUBREG_REG (new_rtx)))))  ICE

>

> Looks like something I hit on the ACLE branch, but didn't have a

> non-ACLE reproducer for (see 065881acf0de35ff7818c1fc92769e1c106e1028).

>

> Does the attached work?  The current call is wrong because "mode"

> is the mode of "x", not the mode of "new_rtx".


Yes, the attached patch works for this testcase. Are you planning to
commit it to trunk?  I will wait for that.

Thanks,
Kugan
>

> Thanks,

> Richard

>

>

> 2019-06-06  Richard Sandiford  <richard.sandiford@arm.com>

>

> gcc/

>         * fwprop.c (propagate_rtx): Fix call to paradoxical_subreg_p.

>

> Index: gcc/fwprop.c

> ===================================================================

> --- gcc/fwprop.c        2019-03-08 18:14:25.333011645 +0000

> +++ gcc/fwprop.c        2019-06-06 13:04:34.423476690 +0100

> @@ -680,7 +680,7 @@ propagate_rtx (rtx x, machine_mode mode,

>        || CONSTANT_P (new_rtx)

>        || (GET_CODE (new_rtx) == SUBREG

>           && REG_P (SUBREG_REG (new_rtx))

> -         && !paradoxical_subreg_p (mode, GET_MODE (SUBREG_REG (new_rtx)))))

> +         && !paradoxical_subreg_p (new_rtx)))

>      flags |= PR_CAN_APPEAR;

>    if (!varying_mem_p (new_rtx))

>      flags |= PR_HANDLE_MEM;
Richard Sandiford June 6, 2019, 12:33 p.m. UTC | #13
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:
> Hi Richard,

>

> On Thu, 6 Jun 2019 at 22:07, Richard Sandiford

> <richard.sandiford@arm.com> wrote:

>>

>> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

>> > Hi Richard,

>> >

>> > On Thu, 6 Jun 2019 at 19:35, Richard Sandiford

>> > <richard.sandiford@arm.com> wrote:

>> >>

>> >> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

>> >> > Hi Richard,

>> >> >

>> >> > Thanks for the review. Attached is the latest patch.

>> >> >

>> >> > For testcase like cond_arith_1.c, with the patch, gcc ICE in fwprop. I

>> >> > am limiting fwprop in cases like this. Is there a better fix for this?

>> >> > index cf2c9de..2c99285 100644

>> >> > --- a/gcc/fwprop.c

>> >> > +++ b/gcc/fwprop.c

>> >> > @@ -1358,6 +1358,15 @@ forward_propagate_and_simplify (df_ref use,

>> >> > rtx_insn *def_insn, rtx def_set)

>> >> >    else

>> >> >      mode = GET_MODE (*loc);

>> >> >

>> >> > +  /* TODO. We can't get the mode for

>> >> > +     (set (reg:VNx16BI 109)

>> >> > +          (unspec:VNx16BI [

>> >> > +        (reg:SI 131)

>> >> > +        (reg:SI 106)

>> >> > +           ] UNSPEC_WHILE_LO))

>> >> > +     Thus, bailout when it is UNSPEC and MODEs are not compatible.  */

>> >> > +  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg)))

>> >> > +    return false;

>> >> >    new_rtx = propagate_rtx (*loc, mode, reg, src,

>> >> >                   optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)));

>> >>

>> >> What specifically goes wrong?  The unspec above isn't that unusual --

>> >> many unspecs have different modes from their inputs.

>> >

>> > cond_arith_1.c:38:1: internal compiler error: in paradoxical_subreg_p,

>> > at rtl.h:3130

>> > 0x135f1d3 paradoxical_subreg_p(machine_mode, machine_mode)

>> >         ../../88838/gcc/rtl.h:3130

>> > 0x135f1d3 propagate_rtx

>> >         ../../88838/gcc/fwprop.c:683

>> > 0x135f4a3 forward_propagate_and_simplify

>> >         ../../88838/gcc/fwprop.c:1371

>> > 0x135f4a3 forward_propagate_into

>> >         ../../88838/gcc/fwprop.c:1430

>> > 0x135fdcb fwprop

>> >         ../../88838/gcc/fwprop.c:1519

>> > 0x135fdcb execute

>> >         ../../88838/gcc/fwprop.c:1550

>> > Please submit a full bug report,

>> > with preprocessed source if appropriate.

>> >

>> >

>> > in forward_propagate_and_simplify

>> >

>> > use_set:

>> > (set (reg:VNx16BI 96 [ loop_mask_52 ])

>> >     (unspec:VNx16BI [

>> >             (reg:SI 92 [ _3 ])

>> >             (reg:SI 95 [ niters.36 ])

>> >         ] UNSPEC_WHILE_LO))

>> >

>> > reg:

>> > (reg:SI 92 [ _3 ])

>> >

>> > *loc:

>> > (unspec:VNx16BI [

>> >         (reg:SI 92 [ _3 ])

>> >         (reg:SI 95 [ niters.36 ])

>> >     ] UNSPEC_WHILE_LO)

>> >

>> > src:

>> > (subreg:SI (reg:DI 136 [ ivtmp_101 ]) 0)

>> >

>> > use_insn:

>> > (insn 87 86 88 4 (parallel [

>> >             (set (reg:VNx16BI 96 [ loop_mask_52 ])

>> >                 (unspec:VNx16BI [

>> >                         (reg:SI 92 [ _3 ])

>> >                         (reg:SI 95 [ niters.36 ])

>> >                     ] UNSPEC_WHILE_LO))

>> >             (clobber (reg:CC 66 cc))

>> >         ]) 4255 {while_ultsivnx16bi}

>> >      (expr_list:REG_UNUSED (reg:CC 66 cc)

>> >         (nil)))

>> >

>> > I think we calculate the mode to be VNx16BI which is wrong?

>> > because of which in propgate_rtx,   !paradoxical_subreg_p (mode,

>> > GET_MODE (SUBREG_REG (new_rtx)))))  ICE

>>

>> Looks like something I hit on the ACLE branch, but didn't have a

>> non-ACLE reproducer for (see 065881acf0de35ff7818c1fc92769e1c106e1028).

>>

>> Does the attached work?  The current call is wrong because "mode"

>> is the mode of "x", not the mode of "new_rtx".

>

> Yes, attached patch works for this testcase. Are you planning to

> commit it to trunk. I will wait for this.


Needs approval first. :-)

The patch was originally bootstrapped & regtested on aarch64-linux-gnu,
but I'll repeat that for trunk and test x86_64-linux-gnu too.

Richard

>

> Thanks,

> Kugan

>>

>> Thanks,

>> Richard

>>

>>

>> 2019-06-06  Richard Sandiford  <richard.sandiford@arm.com>

>>

>> gcc/

>>         * fwprop.c (propagate_rtx): Fix call to paradoxical_subreg_p.

>>

>> Index: gcc/fwprop.c

>> ===================================================================

>> --- gcc/fwprop.c        2019-03-08 18:14:25.333011645 +0000

>> +++ gcc/fwprop.c        2019-06-06 13:04:34.423476690 +0100

>> @@ -680,7 +680,7 @@ propagate_rtx (rtx x, machine_mode mode,

>>        || CONSTANT_P (new_rtx)

>>        || (GET_CODE (new_rtx) == SUBREG

>>           && REG_P (SUBREG_REG (new_rtx))

>> -         && !paradoxical_subreg_p (mode, GET_MODE (SUBREG_REG (new_rtx)))))

>> +         && !paradoxical_subreg_p (new_rtx)))

>>      flags |= PR_CAN_APPEAR;

>>    if (!varying_mem_p (new_rtx))

>>      flags |= PR_HANDLE_MEM;
Jeff Law June 7, 2019, 12:01 a.m. UTC | #14
On 6/6/19 6:33 AM, Richard Sandiford wrote:
> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

>> Hi Richard,

>>

>> On Thu, 6 Jun 2019 at 22:07, Richard Sandiford

>> <richard.sandiford@arm.com> wrote:

>>>

>>> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

>>>> Hi Richard,

>>>>

>>>> On Thu, 6 Jun 2019 at 19:35, Richard Sandiford

>>>> <richard.sandiford@arm.com> wrote:

>>>>>

>>>>> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

>>>>>> Hi Richard,

>>>>>>

>>>>>> Thanks for the review. Attached is the latest patch.

>>>>>>

>>>>>> For testcase like cond_arith_1.c, with the patch, gcc ICE in fwprop. I

>>>>>> am limiting fwprop in cases like this. Is there a better fix for this?

>>>>>> index cf2c9de..2c99285 100644

>>>>>> --- a/gcc/fwprop.c

>>>>>> +++ b/gcc/fwprop.c

>>>>>> @@ -1358,6 +1358,15 @@ forward_propagate_and_simplify (df_ref use,

>>>>>> rtx_insn *def_insn, rtx def_set)

>>>>>>    else

>>>>>>      mode = GET_MODE (*loc);

>>>>>>

>>>>>> +  /* TODO. We can't get the mode for

>>>>>> +     (set (reg:VNx16BI 109)

>>>>>> +          (unspec:VNx16BI [

>>>>>> +        (reg:SI 131)

>>>>>> +        (reg:SI 106)

>>>>>> +           ] UNSPEC_WHILE_LO))

>>>>>> +     Thus, bailout when it is UNSPEC and MODEs are not compatible.  */

>>>>>> +  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg)))

>>>>>> +    return false;

>>>>>>    new_rtx = propagate_rtx (*loc, mode, reg, src,

>>>>>>                   optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)));

>>>>>

>>>>> What specifically goes wrong?  The unspec above isn't that unusual --

>>>>> many unspecs have different modes from their inputs.

>>>>

>>>> cond_arith_1.c:38:1: internal compiler error: in paradoxical_subreg_p,

>>>> at rtl.h:3130

>>>> 0x135f1d3 paradoxical_subreg_p(machine_mode, machine_mode)

>>>>         ../../88838/gcc/rtl.h:3130

>>>> 0x135f1d3 propagate_rtx

>>>>         ../../88838/gcc/fwprop.c:683

>>>> 0x135f4a3 forward_propagate_and_simplify

>>>>         ../../88838/gcc/fwprop.c:1371

>>>> 0x135f4a3 forward_propagate_into

>>>>         ../../88838/gcc/fwprop.c:1430

>>>> 0x135fdcb fwprop

>>>>         ../../88838/gcc/fwprop.c:1519

>>>> 0x135fdcb execute

>>>>         ../../88838/gcc/fwprop.c:1550

>>>> Please submit a full bug report,

>>>> with preprocessed source if appropriate.

>>>>

>>>>

>>>> in forward_propagate_and_simplify

>>>>

>>>> use_set:

>>>> (set (reg:VNx16BI 96 [ loop_mask_52 ])

>>>>     (unspec:VNx16BI [

>>>>             (reg:SI 92 [ _3 ])

>>>>             (reg:SI 95 [ niters.36 ])

>>>>         ] UNSPEC_WHILE_LO))

>>>>

>>>> reg:

>>>> (reg:SI 92 [ _3 ])

>>>>

>>>> *loc:

>>>> (unspec:VNx16BI [

>>>>         (reg:SI 92 [ _3 ])

>>>>         (reg:SI 95 [ niters.36 ])

>>>>     ] UNSPEC_WHILE_LO)

>>>>

>>>> src:

>>>> (subreg:SI (reg:DI 136 [ ivtmp_101 ]) 0)

>>>>

>>>> use_insn:

>>>> (insn 87 86 88 4 (parallel [

>>>>             (set (reg:VNx16BI 96 [ loop_mask_52 ])

>>>>                 (unspec:VNx16BI [

>>>>                         (reg:SI 92 [ _3 ])

>>>>                         (reg:SI 95 [ niters.36 ])

>>>>                     ] UNSPEC_WHILE_LO))

>>>>             (clobber (reg:CC 66 cc))

>>>>         ]) 4255 {while_ultsivnx16bi}

>>>>      (expr_list:REG_UNUSED (reg:CC 66 cc)

>>>>         (nil)))

>>>>

>>>> I think we calculate the mode to be VNx16BI which is wrong?

>>>> because of which in propgate_rtx,   !paradoxical_subreg_p (mode,

>>>> GET_MODE (SUBREG_REG (new_rtx)))))  ICE

>>>

>>> Looks like something I hit on the ACLE branch, but didn't have a

>>> non-ACLE reproducer for (see 065881acf0de35ff7818c1fc92769e1c106e1028).

>>>

>>> Does the attached work?  The current call is wrong because "mode"

>>> is the mode of "x", not the mode of "new_rtx".

>>

>> Yes, attached patch works for this testcase. Are you planning to

>> commit it to trunk. I will wait for this.

> 

> Needs approval first. :-)

> 

> The patch was originally bootstrapped & regtested on aarch64-linux-gnu,

> but I'll repeat that for trunk and test x86_64-linux-gnu too.

Assuming that passes, this is fine.
jeff
Richard Sandiford June 7, 2019, 7:41 a.m. UTC | #15
Jeff Law <law@redhat.com> writes:
> On 6/6/19 6:33 AM, Richard Sandiford wrote:

>> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

>>> Hi Richard,

>>>

>>> On Thu, 6 Jun 2019 at 22:07, Richard Sandiford

>>> <richard.sandiford@arm.com> wrote:

>>>>

>>>> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

>>>>> Hi Richard,

>>>>>

>>>>> On Thu, 6 Jun 2019 at 19:35, Richard Sandiford

>>>>> <richard.sandiford@arm.com> wrote:

>>>>>>

>>>>>> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> writes:

>>>>>>> Hi Richard,

>>>>>>>

>>>>>>> Thanks for the review. Attached is the latest patch.

>>>>>>>

>>>>>>> For testcase like cond_arith_1.c, with the patch, gcc ICE in fwprop. I

>>>>>>> am limiting fwprop in cases like this. Is there a better fix for this?

>>>>>>> index cf2c9de..2c99285 100644

>>>>>>> --- a/gcc/fwprop.c

>>>>>>> +++ b/gcc/fwprop.c

>>>>>>> @@ -1358,6 +1358,15 @@ forward_propagate_and_simplify (df_ref use,

>>>>>>> rtx_insn *def_insn, rtx def_set)

>>>>>>>    else

>>>>>>>      mode = GET_MODE (*loc);

>>>>>>>

>>>>>>> +  /* TODO. We can't get the mode for

>>>>>>> +     (set (reg:VNx16BI 109)

>>>>>>> +          (unspec:VNx16BI [

>>>>>>> +        (reg:SI 131)

>>>>>>> +        (reg:SI 106)

>>>>>>> +           ] UNSPEC_WHILE_LO))

>>>>>>> +     Thus, bailout when it is UNSPEC and MODEs are not compatible.  */

>>>>>>> +  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg)))

>>>>>>> +    return false;

>>>>>>>    new_rtx = propagate_rtx (*loc, mode, reg, src,

>>>>>>>                   optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)));

>>>>>>

>>>>>> What specifically goes wrong?  The unspec above isn't that unusual --

>>>>>> many unspecs have different modes from their inputs.

>>>>>

>>>>> cond_arith_1.c:38:1: internal compiler error: in paradoxical_subreg_p,

>>>>> at rtl.h:3130

>>>>> 0x135f1d3 paradoxical_subreg_p(machine_mode, machine_mode)

>>>>>         ../../88838/gcc/rtl.h:3130

>>>>> 0x135f1d3 propagate_rtx

>>>>>         ../../88838/gcc/fwprop.c:683

>>>>> 0x135f4a3 forward_propagate_and_simplify

>>>>>         ../../88838/gcc/fwprop.c:1371

>>>>> 0x135f4a3 forward_propagate_into

>>>>>         ../../88838/gcc/fwprop.c:1430

>>>>> 0x135fdcb fwprop

>>>>>         ../../88838/gcc/fwprop.c:1519

>>>>> 0x135fdcb execute

>>>>>         ../../88838/gcc/fwprop.c:1550

>>>>> Please submit a full bug report,

>>>>> with preprocessed source if appropriate.

>>>>>

>>>>>

>>>>> in forward_propagate_and_simplify

>>>>>

>>>>> use_set:

>>>>> (set (reg:VNx16BI 96 [ loop_mask_52 ])

>>>>>     (unspec:VNx16BI [

>>>>>             (reg:SI 92 [ _3 ])

>>>>>             (reg:SI 95 [ niters.36 ])

>>>>>         ] UNSPEC_WHILE_LO))

>>>>>

>>>>> reg:

>>>>> (reg:SI 92 [ _3 ])

>>>>>

>>>>> *loc:

>>>>> (unspec:VNx16BI [

>>>>>         (reg:SI 92 [ _3 ])

>>>>>         (reg:SI 95 [ niters.36 ])

>>>>>     ] UNSPEC_WHILE_LO)

>>>>>

>>>>> src:

>>>>> (subreg:SI (reg:DI 136 [ ivtmp_101 ]) 0)

>>>>>

>>>>> use_insn:

>>>>> (insn 87 86 88 4 (parallel [

>>>>>             (set (reg:VNx16BI 96 [ loop_mask_52 ])

>>>>>                 (unspec:VNx16BI [

>>>>>                         (reg:SI 92 [ _3 ])

>>>>>                         (reg:SI 95 [ niters.36 ])

>>>>>                     ] UNSPEC_WHILE_LO))

>>>>>             (clobber (reg:CC 66 cc))

>>>>>         ]) 4255 {while_ultsivnx16bi}

>>>>>      (expr_list:REG_UNUSED (reg:CC 66 cc)

>>>>>         (nil)))

>>>>>

>>>>> I think we calculate the mode to be VNx16BI, which is wrong?

>>>>> Because of that, the   !paradoxical_subreg_p (mode,

>>>>> GET_MODE (SUBREG_REG (new_rtx))) check in propagate_rtx ICEs.

>>>>

>>>> Looks like something I hit on the ACLE branch, but didn't have a

>>>> non-ACLE reproducer for (see 065881acf0de35ff7818c1fc92769e1c106e1028).

>>>>

>>>> Does the attached work?  The current call is wrong because "mode"

>>>> is the mode of "x", not the mode of "new_rtx".

>>>

>>> Yes, the attached patch works for this testcase. Are you planning to

>>> commit it to trunk? I will wait for that.

>> 

>> Needs approval first. :-)

>> 

>> The patch was originally bootstrapped & regtested on aarch64-linux-gnu,

>> but I'll repeat that for trunk and test x86_64-linux-gnu too.

> Assuming that passes this is fine.

> jeff


Thanks, applied as r272032.

Richard
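
For context, the propagate_rtx change referred to above presumably amounts
to checking against the mode of new_rtx rather than the mode of the
expression being replaced.  A sketch reconstructed from the discussion (the
surrounding condition and the exact form of the committed r272032 change
are assumptions, not the actual diff):

  /* In gcc/fwprop.c:propagate_rtx, the paradoxical-subreg test should use
     the mode of NEW_RTX (the SUBREG being propagated), not MODE (the mode
     of *LOC, here the VNx16BI result of UNSPEC_WHILE_LO, whose precision
     presumably cannot be ordered against DI and so trips the checking
     assert in paradoxical_subreg_p).  */
  if (GET_CODE (new_rtx) == SUBREG
      && REG_P (SUBREG_REG (new_rtx))
      /* was: !paradoxical_subreg_p (mode, GET_MODE (SUBREG_REG (new_rtx)))  */
      && !paradoxical_subreg_p (GET_MODE (new_rtx),
				GET_MODE (SUBREG_REG (new_rtx))))
    /* ... the SUBREG is considered safe to propagate ...  */;
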
diff mbox series

Patch

From 4e9837ff9c0c080923f342e83574a6fdba2b3d92 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
Date: Tue, 5 Mar 2019 10:01:45 +1100
Subject: [PATCH] pr88838[v2]

As mentioned in PR88838, this patch avoids the SXTW by using WHILELO on W
registers instead of X registers.

As mentioned in the PR, vect_verify_full_masking checks which IV widths are
supported for WHILELO but prefers to go to Pmode width.  This is because
using Pmode allows ivopts to reuse the IV for indices (as in the loads
and stores above).  However, it would be better to use a 32-bit WHILELO
with a truncated 64-bit IV if:

(a) the limit is extended from 32 bits.
(b) the detection loop in vect_verify_full_masking detects that using a
    32-bit IV would be correct.
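
For reference, a minimal loop of the kind affected (essentially the new
gcc.target/aarch64/pr88838.c test added below); the before/after loop
control in the trailing comment is an illustration based on the while_1.c
adjustments, not verified compiler output:

void
f (int *restrict x, int *restrict y, int *restrict z, int n)
{
  /* 'n' is a 32-bit int; with a 64-bit (Pmode) IV its value ends up being
     extended (SXTW) for the 64-bit WHILELO comparison.  */
  for (int i = 0; i < n; i += 1)
    x[i] = y[i] + z[i];
}

/* Loop control, roughly (register numbers made up):
   before:  sxtw  x3, w3  +  whilelo  p0.s, x4, x3
   after:                    whilelo  p0.s, w4, w3  */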

gcc/ChangeLog:

2019-05-22  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

	* tree-vect-loop-manip.c (vect_set_loop_masks_directly): If
	compare_type is narrower than Pmode, create the IV in Pmode and
	truncate it (i.e. convert it to compare_type) at its uses.
	* tree-vect-loop.c (vect_verify_full_masking): Prefer a compare
	type narrower than Pmode when a zero-based IV of that width is
	known not to wrap.

gcc/testsuite/ChangeLog:

2019-05-22  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

	* gcc.target/aarch64/pr88838.c: New test.
	* gcc.target/aarch64/sve/while_1.c: Adjust.

Change-Id: Iff52946c28d468078f2cc0868d53edb05325b8ca
---
 gcc/fwprop.c                                   | 13 +++++++
 gcc/testsuite/gcc.target/aarch64/pr88838.c     | 11 ++++++
 gcc/testsuite/gcc.target/aarch64/sve/while_1.c | 16 ++++----
 gcc/tree-vect-loop-manip.c                     | 52 ++++++++++++++++++++++++--
 gcc/tree-vect-loop.c                           | 39 ++++++++++++++++++-
 5 files changed, 117 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr88838.c

diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index cf2c9de..5275ad3 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -1358,6 +1358,19 @@  forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set)
   else
     mode = GET_MODE (*loc);
 
+  /* TODO. We can't get the mode for
+     (set (reg:VNx16BI 109)
+          (unspec:VNx16BI [
+	    (reg:SI 131)
+	    (reg:SI 106)
+           ] UNSPEC_WHILE_LO))
+     because the mode of the UNSPEC result differs from the mode of the
+     register being propagated.  Thus, bail out when the source is an
+     UNSPEC and the modes are not compatible.  */
+  if (GET_MODE_CLASS (mode) != GET_MODE_CLASS (GET_MODE (reg))
+      && GET_CODE (SET_SRC (use_set)) == UNSPEC)
+    return false;
+
   new_rtx = propagate_rtx (*loc, mode, reg, src,
   			   optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)));
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pr88838.c b/gcc/testsuite/gcc.target/aarch64/pr88838.c
new file mode 100644
index 0000000..9d03c0a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr88838.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-S -O3 -march=armv8.2-a+sve" } */
+
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)
+{
+    for (int i = 0; i < n; i += 1)
+          x[i] = y[i] + z[i];
+}
+
+/* { dg-final { scan-assembler-not "sxtw" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
index a93a04b..05a4860 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
@@ -26,14 +26,14 @@ 
 TEST_ALL (ADD_LOOP)
 
 /* { dg-final { scan-assembler-not {\tuqdec} } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, x[0-9]+,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, x[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, w[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, w[0-9]+,} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0, x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x0, x[0-9]+, lsl 1\]\n} 2 } } */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 77d3dac..d6452a1 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -418,7 +418,20 @@  vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
   tree mask_type = rgm->mask_type;
   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
-
+  bool convert = false;
+  tree iv_type = NULL_TREE;
+
+  /* If compare_type is narrower than Pmode, create the IV in Pmode and
+     truncate it at its uses (i.e. convert it to compare_type).  This is
+     because using Pmode allows ivopts to reuse the IV for indices (in
+     the loads and stores).  */
+  if (known_lt (GET_MODE_BITSIZE (TYPE_MODE (compare_type)),
+		GET_MODE_BITSIZE (Pmode)))
+    {
+      iv_type = build_nonstandard_integer_type (GET_MODE_BITSIZE (Pmode),
+						true);
+      convert = true;
+    }
   /* Calculate the maximum number of scalar values that the rgroup
      handles in total, the number that it handles for each iteration
      of the vector loop, and the number that it should skip during the
@@ -444,12 +457,43 @@  vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
      processed.  */
   tree index_before_incr, index_after_incr;
   gimple_stmt_iterator incr_gsi;
+  gimple_stmt_iterator incr_gsi2;
   bool insert_after;
-  tree zero_index = build_int_cst (compare_type, 0);
+  tree zero_index;
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
-	     insert_after, &index_before_incr, &index_after_incr);
 
+  if (convert)
+    {
+      /* If we are creating IV of Pmode type and converting.  */
+      zero_index = build_int_cst (iv_type, 0);
+      tree step = build_int_cst (iv_type,
+				 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+      /* Creating IV of Pmode type.  */
+      create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
+		 insert_after, &index_before_incr, &index_after_incr);
+      /* Create truncated copies of the IV before and after increment.  */
+      tree index_before_incr_trunc = make_ssa_name (compare_type);
+      tree index_after_incr_trunc = make_ssa_name (compare_type);
+      gimple *incr_before_stmt = gimple_build_assign (index_before_incr_trunc,
+						      NOP_EXPR,
+						      index_before_incr);
+      gimple *incr_after_stmt = gimple_build_assign (index_after_incr_trunc,
+						     NOP_EXPR,
+						     index_after_incr);
+      incr_gsi2 = incr_gsi;
+      gsi_insert_before (&incr_gsi2, incr_before_stmt, GSI_NEW_STMT);
+      gsi_insert_after (&incr_gsi, incr_after_stmt, GSI_NEW_STMT);
+      index_before_incr = index_before_incr_trunc;
+      index_after_incr = index_after_incr_trunc;
+      zero_index = build_int_cst (compare_type, 0);
+    }
+  else
+    {
+      /* If the IV is of Pmode compare_type, no conversion is needed.  */
+      zero_index = build_int_cst (compare_type, 0);
+      create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
+		 insert_after, &index_before_incr, &index_after_incr);
+    }
   tree test_index, test_limit, first_limit;
   gimple_stmt_iterator *test_gsi;
   if (might_wrap_p)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index bd81193..2769c86 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1035,6 +1035,30 @@  vect_verify_full_masking (loop_vec_info loop_vinfo)
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
   tree cmp_type = NULL_TREE;
+  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+  widest_int iv_limit;
+  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
+  if (known_max_iters)
+    {
+      if (niters_skip)
+	{
+	  /* Add the maximum number of skipped iterations to the
+	     maximum iteration count.  */
+	  if (TREE_CODE (niters_skip) == INTEGER_CST)
+	    iv_limit += wi::to_widest (niters_skip);
+	  else
+	    iv_limit += max_vf - 1;
+	}
+      /* IV_LIMIT is the maximum number of latch iterations, which is also
+	 the maximum in-range IV value.  Round this value down to the previous
+	 vector alignment boundary and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+
+  /* Now find a compare type, stopping early if a narrower one suffices.  */
   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
     {
       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1045,12 +1069,23 @@  vect_verify_full_masking (loop_vec_info loop_vinfo)
 	  if (this_type
 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
 	    {
+	      /* See whether zero-based IV would ever generate all-false masks
+		 before wrapping around.  */
+	      bool might_wrap_p
+		= (!known_max_iters
+		   || (wi::min_precision
+		       (iv_limit
+			* vect_get_max_nscalars_per_iter (loop_vinfo),
+			UNSIGNED) > cmp_bits));
 	      /* Although we could stop as soon as we find a valid mode,
 		 it's often better to continue until we hit Pmode, since the
 		 operands to the WHILE are more likely to be reusable in
-		 address calculations.  */
+		 address calculations.  Stop early, though, if this_type is as
+		 wide as the niters type and the IV cannot wrap.  */
 	      cmp_type = this_type;
-	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
+	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode)
+		  || (!might_wrap_p
+		      && (cmp_bits == TYPE_PRECISION (niters_type))))
 		break;
 	    }
 	}
-- 
2.7.4