Use range info in split_constant_offset (PR 81635)

Message ID 874ln25ape.fsf@linaro.org
State New
Headers show
Series
  • Use range info in split_constant_offset (PR 81635)
Related show

Commit Message

Richard Sandiford Jan. 31, 2018, 3:06 p.m.
This patch implements the original suggestion for fixing PR 81635:
use range info in split_constant_offset to see whether a conversion
of a wrapping type can be split.  The range info problem described in:

  https://gcc.gnu.org/ml/gcc-patches/2017-08/msg01002.html

seems to have been fixed.

The patch is part 1.  There needs to be a follow-on patch to handle:

  for (unsigned int i = 0; i < n; i += 4)
    {
      ...[i + 2]...
      ...[i + 3]...

which the old SCEV test handles, but which the range check doesn't.
At the moment we record that the low two bits of "i" are clear,
but we still end up with a maximum range of 0xffffffff rather than
0xfffffffc.

Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64le-linux-gnu.
Also tested by comparing the before and after testsuite assembly output
for at least one target per CPU directory.  Excluding a small number
of register renamings on some targets, there were two differences:

(1) In gcc.c-torture/compile/pr55350.c:

    void
    foo (__INTPTR_TYPE__ x, __INTPTR_TYPE__ y)
    {
      int i;
      void **a = (void *)  (8UL * (x / 8UL));
      for (i = 0; i < x; i++)
        a[i] = (void *) y;
    }

    we previously kept base "a" and offset 0, but now use
    "(void **) _n" (where _n holds the multiplication result).

    This is because the old test had the side-effect of prohibiting
    casts from unsigned ints to pointers of the same size.
    What we do for code like this isn't going to help much though.

(2) In gcc.c-torture/execute/2003016-1.c, we unrolled:

    void f (unsigned int *x)
    {
      unsigned char i;
      int j;

      i = 0x10;
      for (j = 0; j < 0x10; j++)
        {
          i += 0xe8;
          x[i] = 0;
          i -= 0xe7;
        }
    }

    and ended up with an unpropagated degenerate phi:

      # i_38 = PHI <16(3)>
      ...
      i_40 = i_38 + 1;
      ...
      i_48 = i_40 + 1;
      ... etc ...
      i_178 = i_168 + 1;
      i_16 = i_178 + 232;
      _17 = (long unsigned int) i_16;
      _18 = _17 * 4;
      _19 = &x + _18;
      *_19 = 0;

    Calling split_constant_offset on each (long unsigned int) operand
    gives i_38 + 0xe8, i_38 + 0xe9, ..., i_38 + 0xf7, with i_38
    still having the range [0x10, 0x20].  We can therefore tell
    that i_38 + 0xf0 has the range [0x00, 0x10], and similarly
    for +0xf1...+0xf7.

    We should really be folding to constants here though.

OK to install?


2018-01-31  Richard Sandiford  <richard.sandiford@linaro.org>

gcc/
	PR tree-optimization/81635
	* tree-data-ref.c (split_constant_offset_1): For types that
	wrap on overflow, try to use range info to prove that wrapping
	cannot occur.

gcc/testsuite/
	PR tree-optimization/81635
	* gcc.dg/vect/bb-slp-pr81635-1.c: New test.
	* gcc.dg/vect/bb-slp-pr81635-2.c: Likewise.

Comments

Richard Biener Feb. 1, 2018, 1:04 p.m. | #1
On Wed, Jan 31, 2018 at 4:06 PM, Richard Sandiford
<richard.sandiford@linaro.org> wrote:
> This patch implements the original suggestion for fixing PR 81635:

> use range info in split_constant_offset to see whether a conversion

> of a wrapping type can be split.  The range info problem described in:

>

>   https://gcc.gnu.org/ml/gcc-patches/2017-08/msg01002.html

>

> seems to have been fixed.

>

> The patch is part 1.  There needs to be a follow-on patch to handle:

>

>   for (unsigned int i = 0; i < n; i += 4)

>     {

>       ...[i + 2]...

>       ...[i + 3]...

>

> which the old SCEV test handles, but which the range check doesn't.

> At the moment we record that the low two bits of "i" are clear,

> but we still end up with a maximum range of 0xffffffff rather than

> 0xfffffffc.

>

> Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64le-linux-gnu.

> Also tested by comparing the before and after testsuite assembly output

> for at least one target per CPU directory.  Excluding a small number

> of register renamings on some targets, there were two differences:

>

> (1) In gcc.c-torture/compile/pr55350.c:

>

>     void

>     foo (__INTPTR_TYPE__ x, __INTPTR_TYPE__ y)

>     {

>       int i;

>       void **a = (void *)  (8UL * (x / 8UL));

>       for (i = 0; i < x; i++)

>         a[i] = (void *) y;

>     }

>

>     we previously kept base "a" and offset 0, but now use

>     "(void **) _n" (where _n holds the multiplication result).

>

>     This is because the old test had the side-effect of prohibiting

>     casts from unsigned ints to pointers of the same size.

>     What we do for code like this isn't going to help much though.

>

> (2) In gcc.c-torture/execute/2003016-1.c, we unrolled:

>

>     void f (unsigned int *x)

>     {

>       unsigned char i;

>       int j;

>

>       i = 0x10;

>       for (j = 0; j < 0x10; j++)

>         {

>           i += 0xe8;

>           x[i] = 0;

>           i -= 0xe7;

>         }

>     }

>

>     and ended up with an unpropagated degenerate phi:

>

>       # i_38 = PHI <16(3)>

>       ...

>       i_40 = i_38 + 1;

>       ...

>       i_48 = i_40 + 1;

>       ... etc ...

>       i_178 = i_168 + 1;

>       i_16 = i_178 + 232;

>       _17 = (long unsigned int) i_16;

>       _18 = _17 * 4;

>       _19 = &x + _18;

>       *_19 = 0;

>

>     Calling split_constant_offset on each (long unsigned int) operand

>     gives i_38 + 0xe8, i_38 + 0xe9, ..., i_38 + 0xf7, with i_38

>     still having the range [0x10, 0x20].  We can therefore tell

>     that i_38 + 0xf0 has the range [0x00, 0x10], and similarly

>     for +0xf1...+0xf7.

>

>     We should really be folding to constants here though.

>

> OK to install?

>

>

> 2018-01-31  Richard Sandiford  <richard.sandiford@linaro.org>

>

> gcc/

>         PR tree-optimization/81635

>         * tree-data-ref.c (split_constant_offset_1): For types that

>         wrap on overflow, try to use range info to prove that wrapping

>         cannot occur.

>

> gcc/testsuite/

>         PR tree-optimization/81635

>         * gcc.dg/vect/bb-slp-pr81635-1.c: New test.

>         * gcc.dg/vect/bb-slp-pr81635-2.c: Likewise.

>

> Index: gcc/tree-data-ref.c

> ===================================================================

> --- gcc/tree-data-ref.c 2018-01-13 18:02:00.946360352 +0000

> +++ gcc/tree-data-ref.c 2018-01-31 13:26:13.488630604 +0000

> @@ -704,11 +704,46 @@ split_constant_offset_1 (tree type, tree

>            and the outer precision is at least as large as the inner.  */

>         tree itype = TREE_TYPE (op0);

>         if ((POINTER_TYPE_P (itype)

> -            || (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_UNDEFINED (itype)))

> +            || (INTEGRAL_TYPE_P (itype) && !TYPE_OVERFLOW_TRAPS (itype)))

>             && TYPE_PRECISION (type) >= TYPE_PRECISION (itype)

>             && (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type)))

>           {

> -           split_constant_offset (op0, &var0, off);

> +           if (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_WRAPS (itype))

> +             {

> +               /* Split the unconverted operand and try to prove that

> +                  wrapping isn't a problem.  */

> +               tree tmp_var, tmp_off;

> +               split_constant_offset (op0, &tmp_var, &tmp_off);

> +

> +               /* See whether we have an SSA_NAME whose range is known

> +                  to be [A, B].  */

> +               if (TREE_CODE (tmp_var) != SSA_NAME)

> +                 return false;

> +               wide_int var_min, var_max;

> +               if (get_range_info (tmp_var, &var_min, &var_max) != VR_RANGE)

> +                 return false;

> +

> +               /* See whether the range of OP0 (i.e. TMP_VAR + TMP_OFF)

> +                  is known to be [A + TMP_OFF, B + TMP_OFF], with all

> +                  operations done in ITYPE.  The addition must overflow

> +                  at both ends of the range or at neither.  */


Hmm, if they overflow at both ends that's still an issue given we effectively
widen the addition?  Consider char -> short and [255U, 255U] + 1 which
when widened will be 256 but when wrapping in char it's 0...

> +               bool overflow[2];

> +               signop sgn = TYPE_SIGN (itype);

> +               unsigned int prec = TYPE_PRECISION (itype);

> +               wide_int woff = wi::to_wide (tmp_off, prec);

> +               wide_int op0_min = wi::add (var_min, woff, sgn, &overflow[0]);

> +               wi::add (var_max, woff, sgn, &overflow[1]);

> +               if (overflow[0] != overflow[1])

> +                 return false;

> +

> +               /* Calculate (ssizetype) OP0 - (ssizetype) TMP_VAR.  */

> +               widest_int diff = (widest_int::from (op0_min, sgn)

> +                                  - widest_int::from (var_min, sgn));

> +               var0 = tmp_var;

> +               *off = wide_int_to_tree (ssizetype, diff);


... or is this part trying to deal with that case?  Because I don't understand
it - isn't 'off' simply supposed to be 'tmp_off' literally?  I think this needs
more explanation...  if there's no overflow that would be all that is to do, no?

Thanks,
Richard.

> +             }

> +           else

> +             split_constant_offset (op0, &var0, off);

>             *var = fold_convert (type, var0);

>             return true;

>           }

> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c

> ===================================================================

> --- /dev/null   2018-01-30 17:30:22.185477046 +0000

> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c        2018-01-31 13:26:13.487630644 +0000

> @@ -0,0 +1,92 @@

> +/* { dg-do compile } */

> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */

> +/* { dg-require-effective-target vect_double } */

> +/* { dg-require-effective-target lp64 } */

> +

> +void

> +f1 (double *p, double *q)

> +{

> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

> +  for (unsigned int i = 0; i < 1000; i += 4)

> +    {

> +      double a = q[i] + p[i];

> +      double b = q[i + 1] + p[i + 1];

> +      q[i] = a;

> +      q[i + 1] = b;

> +    }

> +}

> +

> +void

> +f2 (double *p, double *q)

> +{

> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

> +  for (unsigned int i = 2; i < ~0U - 4; i += 4)

> +    {

> +      double a = q[i] + p[i];

> +      double b = q[i + 1] + p[i + 1];

> +      q[i] = a;

> +      q[i + 1] = b;

> +    }

> +}

> +

> +void

> +f3 (double *p, double *q)

> +{

> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

> +  for (unsigned int i = 0; i < ~0U - 3; i += 4)

> +    {

> +      double a = q[i + 2] + p[i + 2];

> +      double b = q[i + 3] + p[i + 3];

> +      q[i + 2] = a;

> +      q[i + 3] = b;

> +    }

> +}

> +

> +void

> +f4 (double *p, double *q)

> +{

> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

> +  for (unsigned int i = 0; i < 500; i += 6)

> +    for (unsigned int j = 0; j < 500; j += 4)

> +      {

> +       double a = q[j] + p[i];

> +       double b = q[j + 1] + p[i + 1];

> +       q[i] = a;

> +       q[i + 1] = b;

> +      }

> +}

> +

> +void

> +f5 (double *p, double *q)

> +{

> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

> +  for (unsigned int i = 2; i < 1000; i += 4)

> +    {

> +      double a = q[i - 2] + p[i - 2];

> +      double b = q[i - 1] + p[i - 1];

> +      q[i - 2] = a;

> +      q[i - 1] = b;

> +    }

> +}

> +

> +double p[1000];

> +double q[1000];

> +

> +void

> +f6 (int n)

> +{

> +  for (unsigned int i = 0; i < n; i += 4)

> +    {

> +      double a = q[i] + p[i];

> +      double b = q[i + 1] + p[i + 1];

> +      q[i] = a;

> +      q[i + 1] = b;

> +    }

> +}

> +

> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 6 "slp1" } } */

> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c

> ===================================================================

> --- /dev/null   2018-01-30 17:30:22.185477046 +0000

> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c        2018-01-31 13:26:13.487630644 +0000

> @@ -0,0 +1,64 @@

> +/* { dg-do compile } */

> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */

> +/* { dg-require-effective-target lp64 } */

> +

> +double p[1000];

> +double q[1000];

> +

> +void

> +f1 (double *p, double *q)

> +{

> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

> +  for (unsigned int i = 2; i < ~0U - 4; i += 4)

> +    {

> +      double a = q[i + 2] + p[i + 2];

> +      double b = q[i + 3] + p[i + 3];

> +      q[i + 2] = a;

> +      q[i + 3] = b;

> +    }

> +}

> +

> +void

> +f2 (double *p, double *q)

> +{

> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

> +  for (unsigned int i = 0; i < ~0U - 3; i += 4)

> +    {

> +      double a = q[i + 4] + p[i + 4];

> +      double b = q[i + 5] + p[i + 5];

> +      q[i + 4] = a;

> +      q[i + 5] = b;

> +    }

> +}

> +

> +void

> +f3 (double *p, double *q)

> +{

> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

> +  for (unsigned int i = 0; i < 1000; i += 4)

> +    {

> +      double a = q[i - 2] + p[i - 2];

> +      double b = q[i - 1] + p[i - 1];

> +      q[i - 2] = a;

> +      q[i - 1] = b;

> +    }

> +}

> +

> +void

> +f4 (double *p, double *q)

> +{

> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

> +  for (unsigned int i = 2; i < 1000; i += 4)

> +    {

> +      double a = q[i - 4] + p[i - 4];

> +      double b = q[i - 3] + p[i - 3];

> +      q[i - 4] = a;

> +      q[i - 3] = b;

> +    }

> +}

> +

> +/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp1" } } */
Richard Sandiford Feb. 1, 2018, 1:18 p.m. | #2
Richard Biener <richard.guenther@gmail.com> writes:
> On Wed, Jan 31, 2018 at 4:06 PM, Richard Sandiford

> <richard.sandiford@linaro.org> wrote:

>> This patch implements the original suggestion for fixing PR 81635:

>> use range info in split_constant_offset to see whether a conversion

>> of a wrapping type can be split.  The range info problem described in:

>>

>>   https://gcc.gnu.org/ml/gcc-patches/2017-08/msg01002.html

>>

>> seems to have been fixed.

>>

>> The patch is part 1.  There needs to be a follow-on patch to handle:

>>

>>   for (unsigned int i = 0; i < n; i += 4)

>>     {

>>       ...[i + 2]...

>>       ...[i + 3]...

>>

>> which the old SCEV test handles, but which the range check doesn't.

>> At the moment we record that the low two bits of "i" are clear,

>> but we still end up with a maximum range of 0xffffffff rather than

>> 0xfffffffc.

>>

>> Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64le-linux-gnu.

>> Also tested by comparing the before and after testsuite assembly output

>> for at least one target per CPU directory.  Excluding a small number

>> of register renamings on some targets, there were two differences:

>>

>> (1) In gcc.c-torture/compile/pr55350.c:

>>

>>     void

>>     foo (__INTPTR_TYPE__ x, __INTPTR_TYPE__ y)

>>     {

>>       int i;

>>       void **a = (void *)  (8UL * (x / 8UL));

>>       for (i = 0; i < x; i++)

>>         a[i] = (void *) y;

>>     }

>>

>>     we previously kept base "a" and offset 0, but now use

>>     "(void **) _n" (where _n holds the multiplication result).

>>

>>     This is because the old test had the side-effect of prohibiting

>>     casts from unsigned ints to pointers of the same size.

>>     What we do for code like this isn't going to help much though.

>>

>> (2) In gcc.c-torture/execute/2003016-1.c, we unrolled:

>>

>>     void f (unsigned int *x)

>>     {

>>       unsigned char i;

>>       int j;

>>

>>       i = 0x10;

>>       for (j = 0; j < 0x10; j++)

>>         {

>>           i += 0xe8;

>>           x[i] = 0;

>>           i -= 0xe7;

>>         }

>>     }

>>

>>     and ended up with an unpropagated degenerate phi:

>>

>>       # i_38 = PHI <16(3)>

>>       ...

>>       i_40 = i_38 + 1;

>>       ...

>>       i_48 = i_40 + 1;

>>       ... etc ...

>>       i_178 = i_168 + 1;

>>       i_16 = i_178 + 232;

>>       _17 = (long unsigned int) i_16;

>>       _18 = _17 * 4;

>>       _19 = &x + _18;

>>       *_19 = 0;

>>

>>     Calling split_constant_offset on each (long unsigned int) operand

>>     gives i_38 + 0xe8, i_38 + 0xe9, ..., i_38 + 0xf7, with i_38

>>     still having the range [0x10, 0x20].  We can therefore tell

>>     that i_38 + 0xf0 has the range [0x00, 0x10], and similarly

>>     for +0xf1...+0xf7.

>>

>>     We should really be folding to constants here though.

>>

>> OK to install?

>>

>>

>> 2018-01-31  Richard Sandiford  <richard.sandiford@linaro.org>

>>

>> gcc/

>>         PR tree-optimization/81635

>>         * tree-data-ref.c (split_constant_offset_1): For types that

>>         wrap on overflow, try to use range info to prove that wrapping

>>         cannot occur.

>>

>> gcc/testsuite/

>>         PR tree-optimization/81635

>>         * gcc.dg/vect/bb-slp-pr81635-1.c: New test.

>>         * gcc.dg/vect/bb-slp-pr81635-2.c: Likewise.

>>

>> Index: gcc/tree-data-ref.c

>> ===================================================================

>> --- gcc/tree-data-ref.c 2018-01-13 18:02:00.946360352 +0000

>> +++ gcc/tree-data-ref.c 2018-01-31 13:26:13.488630604 +0000

>> @@ -704,11 +704,46 @@ split_constant_offset_1 (tree type, tree

>>            and the outer precision is at least as large as the inner.  */

>>         tree itype = TREE_TYPE (op0);

>>         if ((POINTER_TYPE_P (itype)

>> -            || (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_UNDEFINED (itype)))

>> +            || (INTEGRAL_TYPE_P (itype) && !TYPE_OVERFLOW_TRAPS (itype)))

>>             && TYPE_PRECISION (type) >= TYPE_PRECISION (itype)

>>             && (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type)))

>>           {

>> -           split_constant_offset (op0, &var0, off);

>> +           if (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_WRAPS (itype))

>> +             {

>> +               /* Split the unconverted operand and try to prove that

>> +                  wrapping isn't a problem.  */

>> +               tree tmp_var, tmp_off;

>> +               split_constant_offset (op0, &tmp_var, &tmp_off);

>> +

>> +               /* See whether we have an SSA_NAME whose range is known

>> +                  to be [A, B].  */

>> +               if (TREE_CODE (tmp_var) != SSA_NAME)

>> +                 return false;

>> +               wide_int var_min, var_max;

>> +               if (get_range_info (tmp_var, &var_min, &var_max) != VR_RANGE)

>> +                 return false;

>> +

>> +               /* See whether the range of OP0 (i.e. TMP_VAR + TMP_OFF)

>> +                  is known to be [A + TMP_OFF, B + TMP_OFF], with all

>> +                  operations done in ITYPE.  The addition must overflow

>> +                  at both ends of the range or at neither.  */

>

> Hmm, if they overflow at both ends that's still an issue given we effectively

> widen the addition?  Consider char -> short and [255U, 255U] + 1 which

> when widened will be 256 but when wrapping in char it's 0...

>

>> +               bool overflow[2];

>> +               signop sgn = TYPE_SIGN (itype);

>> +               unsigned int prec = TYPE_PRECISION (itype);

>> +               wide_int woff = wi::to_wide (tmp_off, prec);

>> +               wide_int op0_min = wi::add (var_min, woff, sgn, &overflow[0]);

>> +               wi::add (var_max, woff, sgn, &overflow[1]);

>> +               if (overflow[0] != overflow[1])

>> +                 return false;

>> +

>> +               /* Calculate (ssizetype) OP0 - (ssizetype) TMP_VAR.  */

>> +               widest_int diff = (widest_int::from (op0_min, sgn)

>> +                                  - widest_int::from (var_min, sgn));

>> +               var0 = tmp_var;

>> +               *off = wide_int_to_tree (ssizetype, diff);

>

> ... or is this part trying to deal with that case?  Because I don't understand

> it - isn't 'off' simply supposed to be 'tmp_off' literally?  I think this needs

> more explanation...  if there's no overflow that would be all that is to do, no?


Yeah, like you say, this is to handle the case in which they overflow
at both ends.  So in your example op0_min is 0 and var_min is 255,
so we end up with the inner char + -255.

The idea is that we don't care whether overflow happens per se.
All we care about is whether values of the unconverted operand (op0)
are displaced from the values of the inner variable by a constant
amount, calculated as diff here.

Thanks,
Richard


>

> Thanks,

> Richard.

>

>> +             }

>> +           else

>> +             split_constant_offset (op0, &var0, off);

>>             *var = fold_convert (type, var0);

>>             return true;

>>           }

>> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c

>> ===================================================================

>> --- /dev/null   2018-01-30 17:30:22.185477046 +0000

>> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c        2018-01-31 13:26:13.487630644 +0000

>> @@ -0,0 +1,92 @@

>> +/* { dg-do compile } */

>> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */

>> +/* { dg-require-effective-target vect_double } */

>> +/* { dg-require-effective-target lp64 } */

>> +

>> +void

>> +f1 (double *p, double *q)

>> +{

>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>> +  for (unsigned int i = 0; i < 1000; i += 4)

>> +    {

>> +      double a = q[i] + p[i];

>> +      double b = q[i + 1] + p[i + 1];

>> +      q[i] = a;

>> +      q[i + 1] = b;

>> +    }

>> +}

>> +

>> +void

>> +f2 (double *p, double *q)

>> +{

>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>> +  for (unsigned int i = 2; i < ~0U - 4; i += 4)

>> +    {

>> +      double a = q[i] + p[i];

>> +      double b = q[i + 1] + p[i + 1];

>> +      q[i] = a;

>> +      q[i + 1] = b;

>> +    }

>> +}

>> +

>> +void

>> +f3 (double *p, double *q)

>> +{

>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>> +  for (unsigned int i = 0; i < ~0U - 3; i += 4)

>> +    {

>> +      double a = q[i + 2] + p[i + 2];

>> +      double b = q[i + 3] + p[i + 3];

>> +      q[i + 2] = a;

>> +      q[i + 3] = b;

>> +    }

>> +}

>> +

>> +void

>> +f4 (double *p, double *q)

>> +{

>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>> +  for (unsigned int i = 0; i < 500; i += 6)

>> +    for (unsigned int j = 0; j < 500; j += 4)

>> +      {

>> +       double a = q[j] + p[i];

>> +       double b = q[j + 1] + p[i + 1];

>> +       q[i] = a;

>> +       q[i + 1] = b;

>> +      }

>> +}

>> +

>> +void

>> +f5 (double *p, double *q)

>> +{

>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>> +  for (unsigned int i = 2; i < 1000; i += 4)

>> +    {

>> +      double a = q[i - 2] + p[i - 2];

>> +      double b = q[i - 1] + p[i - 1];

>> +      q[i - 2] = a;

>> +      q[i - 1] = b;

>> +    }

>> +}

>> +

>> +double p[1000];

>> +double q[1000];

>> +

>> +void

>> +f6 (int n)

>> +{

>> +  for (unsigned int i = 0; i < n; i += 4)

>> +    {

>> +      double a = q[i] + p[i];

>> +      double b = q[i + 1] + p[i + 1];

>> +      q[i] = a;

>> +      q[i + 1] = b;

>> +    }

>> +}

>> +

>> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 6 "slp1" } } */

>> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c

>> ===================================================================

>> --- /dev/null   2018-01-30 17:30:22.185477046 +0000

>> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c        2018-01-31 13:26:13.487630644 +0000

>> @@ -0,0 +1,64 @@

>> +/* { dg-do compile } */

>> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */

>> +/* { dg-require-effective-target lp64 } */

>> +

>> +double p[1000];

>> +double q[1000];

>> +

>> +void

>> +f1 (double *p, double *q)

>> +{

>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>> +  for (unsigned int i = 2; i < ~0U - 4; i += 4)

>> +    {

>> +      double a = q[i + 2] + p[i + 2];

>> +      double b = q[i + 3] + p[i + 3];

>> +      q[i + 2] = a;

>> +      q[i + 3] = b;

>> +    }

>> +}

>> +

>> +void

>> +f2 (double *p, double *q)

>> +{

>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>> +  for (unsigned int i = 0; i < ~0U - 3; i += 4)

>> +    {

>> +      double a = q[i + 4] + p[i + 4];

>> +      double b = q[i + 5] + p[i + 5];

>> +      q[i + 4] = a;

>> +      q[i + 5] = b;

>> +    }

>> +}

>> +

>> +void

>> +f3 (double *p, double *q)

>> +{

>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>> +  for (unsigned int i = 0; i < 1000; i += 4)

>> +    {

>> +      double a = q[i - 2] + p[i - 2];

>> +      double b = q[i - 1] + p[i - 1];

>> +      q[i - 2] = a;

>> +      q[i - 1] = b;

>> +    }

>> +}

>> +

>> +void

>> +f4 (double *p, double *q)

>> +{

>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>> +  for (unsigned int i = 2; i < 1000; i += 4)

>> +    {

>> +      double a = q[i - 4] + p[i - 4];

>> +      double b = q[i - 3] + p[i - 3];

>> +      q[i - 4] = a;

>> +      q[i - 3] = b;

>> +    }

>> +}

>> +

>> +/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp1" } } */
Richard Biener Feb. 1, 2018, 1:21 p.m. | #3
On Thu, Feb 1, 2018 at 2:18 PM, Richard Sandiford
<richard.sandiford@linaro.org> wrote:
> Richard Biener <richard.guenther@gmail.com> writes:

>> On Wed, Jan 31, 2018 at 4:06 PM, Richard Sandiford

>> <richard.sandiford@linaro.org> wrote:

>>> This patch implements the original suggestion for fixing PR 81635:

>>> use range info in split_constant_offset to see whether a conversion

>>> of a wrapping type can be split.  The range info problem described in:

>>>

>>>   https://gcc.gnu.org/ml/gcc-patches/2017-08/msg01002.html

>>>

>>> seems to have been fixed.

>>>

>>> The patch is part 1.  There needs to be a follow-on patch to handle:

>>>

>>>   for (unsigned int i = 0; i < n; i += 4)

>>>     {

>>>       ...[i + 2]...

>>>       ...[i + 3]...

>>>

>>> which the old SCEV test handles, but which the range check doesn't.

>>> At the moment we record that the low two bits of "i" are clear,

>>> but we still end up with a maximum range of 0xffffffff rather than

>>> 0xfffffffc.

>>>

>>> Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64le-linux-gnu.

>>> Also tested by comparing the before and after testsuite assembly output

>>> for at least one target per CPU directory.  Excluding a small number

>>> of register renamings on some targets, there were two differences:

>>>

>>> (1) In gcc.c-torture/compile/pr55350.c:

>>>

>>>     void

>>>     foo (__INTPTR_TYPE__ x, __INTPTR_TYPE__ y)

>>>     {

>>>       int i;

>>>       void **a = (void *)  (8UL * (x / 8UL));

>>>       for (i = 0; i < x; i++)

>>>         a[i] = (void *) y;

>>>     }

>>>

>>>     we previously kept base "a" and offset 0, but now use

>>>     "(void **) _n" (where _n holds the multiplication result).

>>>

>>>     This is because the old test had the side-effect of prohibiting

>>>     casts from unsigned ints to pointers of the same size.

>>>     What we do for code like this isn't going to help much though.

>>>

>>> (2) In gcc.c-torture/execute/2003016-1.c, we unrolled:

>>>

>>>     void f (unsigned int *x)

>>>     {

>>>       unsigned char i;

>>>       int j;

>>>

>>>       i = 0x10;

>>>       for (j = 0; j < 0x10; j++)

>>>         {

>>>           i += 0xe8;

>>>           x[i] = 0;

>>>           i -= 0xe7;

>>>         }

>>>     }

>>>

>>>     and ended up with an unpropagated degenerate phi:

>>>

>>>       # i_38 = PHI <16(3)>

>>>       ...

>>>       i_40 = i_38 + 1;

>>>       ...

>>>       i_48 = i_40 + 1;

>>>       ... etc ...

>>>       i_178 = i_168 + 1;

>>>       i_16 = i_178 + 232;

>>>       _17 = (long unsigned int) i_16;

>>>       _18 = _17 * 4;

>>>       _19 = &x + _18;

>>>       *_19 = 0;

>>>

>>>     Calling split_constant_offset on each (long unsigned int) operand

>>>     gives i_38 + 0xe8, i_38 + 0xe9, ..., i_38 + 0xf7, with i_38

>>>     still having the range [0x10, 0x20].  We can therefore tell

>>>     that i_38 + 0xf0 has the range [0x00, 0x10], and similarly

>>>     for +0xf1...+0xf7.

>>>

>>>     We should really be folding to constants here though.

>>>

>>> OK to install?

>>>

>>>

>>> 2018-01-31  Richard Sandiford  <richard.sandiford@linaro.org>

>>>

>>> gcc/

>>>         PR tree-optimization/81635

>>>         * tree-data-ref.c (split_constant_offset_1): For types that

>>>         wrap on overflow, try to use range info to prove that wrapping

>>>         cannot occur.

>>>

>>> gcc/testsuite/

>>>         PR tree-optimization/81635

>>>         * gcc.dg/vect/bb-slp-pr81635-1.c: New test.

>>>         * gcc.dg/vect/bb-slp-pr81635-2.c: Likewise.

>>>

>>> Index: gcc/tree-data-ref.c

>>> ===================================================================

>>> --- gcc/tree-data-ref.c 2018-01-13 18:02:00.946360352 +0000

>>> +++ gcc/tree-data-ref.c 2018-01-31 13:26:13.488630604 +0000

>>> @@ -704,11 +704,46 @@ split_constant_offset_1 (tree type, tree

>>>            and the outer precision is at least as large as the inner.  */

>>>         tree itype = TREE_TYPE (op0);

>>>         if ((POINTER_TYPE_P (itype)

>>> -            || (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_UNDEFINED (itype)))

>>> +            || (INTEGRAL_TYPE_P (itype) && !TYPE_OVERFLOW_TRAPS (itype)))

>>>             && TYPE_PRECISION (type) >= TYPE_PRECISION (itype)

>>>             && (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type)))

>>>           {

>>> -           split_constant_offset (op0, &var0, off);

>>> +           if (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_WRAPS (itype))

>>> +             {

>>> +               /* Split the unconverted operand and try to prove that

>>> +                  wrapping isn't a problem.  */

>>> +               tree tmp_var, tmp_off;

>>> +               split_constant_offset (op0, &tmp_var, &tmp_off);

>>> +

>>> +               /* See whether we have an SSA_NAME whose range is known

>>> +                  to be [A, B].  */

>>> +               if (TREE_CODE (tmp_var) != SSA_NAME)

>>> +                 return false;

>>> +               wide_int var_min, var_max;

>>> +               if (get_range_info (tmp_var, &var_min, &var_max) != VR_RANGE)

>>> +                 return false;

>>> +

>>> +               /* See whether the range of OP0 (i.e. TMP_VAR + TMP_OFF)

>>> +                  is known to be [A + TMP_OFF, B + TMP_OFF], with all

>>> +                  operations done in ITYPE.  The addition must overflow

>>> +                  at both ends of the range or at neither.  */

>>

>> Hmm, if they overflow at both ends that's still an issue given we effectively

>> widen the addition?  Consider char -> short and [255U, 255U] + 1 which

>> when widened will be 256 but when wrapping in char it's 0...

>>

>>> +               bool overflow[2];

>>> +               signop sgn = TYPE_SIGN (itype);

>>> +               unsigned int prec = TYPE_PRECISION (itype);

>>> +               wide_int woff = wi::to_wide (tmp_off, prec);

>>> +               wide_int op0_min = wi::add (var_min, woff, sgn, &overflow[0]);

>>> +               wi::add (var_max, woff, sgn, &overflow[1]);

>>> +               if (overflow[0] != overflow[1])

>>> +                 return false;

>>> +

>>> +               /* Calculate (ssizetype) OP0 - (ssizetype) TMP_VAR.  */

>>> +               widest_int diff = (widest_int::from (op0_min, sgn)

>>> +                                  - widest_int::from (var_min, sgn));

>>> +               var0 = tmp_var;

>>> +               *off = wide_int_to_tree (ssizetype, diff);

>>

>> ... or is this part trying to deal with that case?  Because I don't understand

>> it - isn't 'off' simply supposed to be 'tmp_off' literally?  I think this needs

>> more explanation...  if there's no overflow that would be all that is to do, no?

>

> Yeah, like you say, this is to handle the case in which they overflow

> at both ends.  So in your example op0_min is 0 and var_min is 255,

> so we end up with the inner char + -255.

>

> The idea is that we don't care whether overflow happens per se.

> All we care about is whether values of the unconverted operand (op0)

> are displaced from the values of the inner variable by a constant

> amount, calculated as diff here.


Ok.  Can you amend the "Calculate (ssizetype) OP0 ... comment by,
say, ".. to handle the case where both ends wrapped"?

Ok with that change.
Richard.

> Thanks,

> Richard

>

>

>>

>> Thanks,

>> Richard.

>>

>>> +             }

>>> +           else

>>> +             split_constant_offset (op0, &var0, off);

>>>             *var = fold_convert (type, var0);

>>>             return true;

>>>           }

>>> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c

>>> ===================================================================

>>> --- /dev/null   2018-01-30 17:30:22.185477046 +0000

>>> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c        2018-01-31 13:26:13.487630644 +0000

>>> @@ -0,0 +1,92 @@

>>> +/* { dg-do compile } */

>>> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */

>>> +/* { dg-require-effective-target vect_double } */

>>> +/* { dg-require-effective-target lp64 } */

>>> +

>>> +void

>>> +f1 (double *p, double *q)

>>> +{

>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>>> +  for (unsigned int i = 0; i < 1000; i += 4)

>>> +    {

>>> +      double a = q[i] + p[i];

>>> +      double b = q[i + 1] + p[i + 1];

>>> +      q[i] = a;

>>> +      q[i + 1] = b;

>>> +    }

>>> +}

>>> +

>>> +void

>>> +f2 (double *p, double *q)

>>> +{

>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>>> +  for (unsigned int i = 2; i < ~0U - 4; i += 4)

>>> +    {

>>> +      double a = q[i] + p[i];

>>> +      double b = q[i + 1] + p[i + 1];

>>> +      q[i] = a;

>>> +      q[i + 1] = b;

>>> +    }

>>> +}

>>> +

>>> +void

>>> +f3 (double *p, double *q)

>>> +{

>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>>> +  for (unsigned int i = 0; i < ~0U - 3; i += 4)

>>> +    {

>>> +      double a = q[i + 2] + p[i + 2];

>>> +      double b = q[i + 3] + p[i + 3];

>>> +      q[i + 2] = a;

>>> +      q[i + 3] = b;

>>> +    }

>>> +}

>>> +

>>> +void

>>> +f4 (double *p, double *q)

>>> +{

>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>>> +  for (unsigned int i = 0; i < 500; i += 6)

>>> +    for (unsigned int j = 0; j < 500; j += 4)

>>> +      {

>>> +       double a = q[j] + p[i];

>>> +       double b = q[j + 1] + p[i + 1];

>>> +       q[i] = a;

>>> +       q[i + 1] = b;

>>> +      }

>>> +}

>>> +

>>> +void

>>> +f5 (double *p, double *q)

>>> +{

>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>>> +  for (unsigned int i = 2; i < 1000; i += 4)

>>> +    {

>>> +      double a = q[i - 2] + p[i - 2];

>>> +      double b = q[i - 1] + p[i - 1];

>>> +      q[i - 2] = a;

>>> +      q[i - 1] = b;

>>> +    }

>>> +}

>>> +

>>> +double p[1000];

>>> +double q[1000];

>>> +

>>> +void

>>> +f6 (int n)

>>> +{

>>> +  for (unsigned int i = 0; i < n; i += 4)

>>> +    {

>>> +      double a = q[i] + p[i];

>>> +      double b = q[i + 1] + p[i + 1];

>>> +      q[i] = a;

>>> +      q[i + 1] = b;

>>> +    }

>>> +}

>>> +

>>> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 6 "slp1" } } */

>>> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c

>>> ===================================================================

>>> --- /dev/null   2018-01-30 17:30:22.185477046 +0000

>>> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c        2018-01-31 13:26:13.487630644 +0000

>>> @@ -0,0 +1,64 @@

>>> +/* { dg-do compile } */

>>> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */

>>> +/* { dg-require-effective-target lp64 } */

>>> +

>>> +double p[1000];

>>> +double q[1000];

>>> +

>>> +void

>>> +f1 (double *p, double *q)

>>> +{

>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>>> +  for (unsigned int i = 2; i < ~0U - 4; i += 4)

>>> +    {

>>> +      double a = q[i + 2] + p[i + 2];

>>> +      double b = q[i + 3] + p[i + 3];

>>> +      q[i + 2] = a;

>>> +      q[i + 3] = b;

>>> +    }

>>> +}

>>> +

>>> +void

>>> +f2 (double *p, double *q)

>>> +{

>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>>> +  for (unsigned int i = 0; i < ~0U - 3; i += 4)

>>> +    {

>>> +      double a = q[i + 4] + p[i + 4];

>>> +      double b = q[i + 5] + p[i + 5];

>>> +      q[i + 4] = a;

>>> +      q[i + 5] = b;

>>> +    }

>>> +}

>>> +

>>> +void

>>> +f3 (double *p, double *q)

>>> +{

>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>>> +  for (unsigned int i = 0; i < 1000; i += 4)

>>> +    {

>>> +      double a = q[i - 2] + p[i - 2];

>>> +      double b = q[i - 1] + p[i - 1];

>>> +      q[i - 2] = a;

>>> +      q[i - 1] = b;

>>> +    }

>>> +}

>>> +

>>> +void

>>> +f4 (double *p, double *q)

>>> +{

>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);

>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);

>>> +  for (unsigned int i = 2; i < 1000; i += 4)

>>> +    {

>>> +      double a = q[i - 4] + p[i - 4];

>>> +      double b = q[i - 3] + p[i - 3];

>>> +      q[i - 4] = a;

>>> +      q[i - 3] = b;

>>> +    }

>>> +}

>>> +

>>> +/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp1" } } */

Patch

Index: gcc/tree-data-ref.c
===================================================================
--- gcc/tree-data-ref.c	2018-01-13 18:02:00.946360352 +0000
+++ gcc/tree-data-ref.c	2018-01-31 13:26:13.488630604 +0000
@@ -704,11 +704,46 @@  split_constant_offset_1 (tree type, tree
 	   and the outer precision is at least as large as the inner.  */
 	tree itype = TREE_TYPE (op0);
 	if ((POINTER_TYPE_P (itype)
-	     || (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_UNDEFINED (itype)))
+	     || (INTEGRAL_TYPE_P (itype) && !TYPE_OVERFLOW_TRAPS (itype)))
 	    && TYPE_PRECISION (type) >= TYPE_PRECISION (itype)
 	    && (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type)))
 	  {
-	    split_constant_offset (op0, &var0, off);
+	    if (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_WRAPS (itype))
+	      {
+		/* Split the unconverted operand and try to prove that
+		   wrapping isn't a problem.  */
+		tree tmp_var, tmp_off;
+		split_constant_offset (op0, &tmp_var, &tmp_off);
+
+		/* See whether we have an SSA_NAME whose range is known
+		   to be [A, B].  */
+		if (TREE_CODE (tmp_var) != SSA_NAME)
+		  return false;
+		wide_int var_min, var_max;
+		if (get_range_info (tmp_var, &var_min, &var_max) != VR_RANGE)
+		  return false;
+
+		/* See whether the range of OP0 (i.e. TMP_VAR + TMP_OFF)
+		   is known to be [A + TMP_OFF, B + TMP_OFF], with all
+		   operations done in ITYPE.  The addition must overflow
+		   at both ends of the range or at neither.  */
+		bool overflow[2];
+		signop sgn = TYPE_SIGN (itype);
+		unsigned int prec = TYPE_PRECISION (itype);
+		wide_int woff = wi::to_wide (tmp_off, prec);
+		wide_int op0_min = wi::add (var_min, woff, sgn, &overflow[0]);
+		wi::add (var_max, woff, sgn, &overflow[1]);
+		if (overflow[0] != overflow[1])
+		  return false;
+
+		/* Calculate (ssizetype) OP0 - (ssizetype) TMP_VAR.  */
+		widest_int diff = (widest_int::from (op0_min, sgn)
+				   - widest_int::from (var_min, sgn));
+		var0 = tmp_var;
+		*off = wide_int_to_tree (ssizetype, diff);
+	      }
+	    else
+	      split_constant_offset (op0, &var0, off);
 	    *var = fold_convert (type, var0);
 	    return true;
 	  }
Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c
===================================================================
--- /dev/null	2018-01-30 17:30:22.185477046 +0000
+++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c	2018-01-31 13:26:13.487630644 +0000
@@ -0,0 +1,92 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-require-effective-target lp64 } */
+
+void
+f1 (double *p, double *q)
+{
+  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
+  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
+  for (unsigned int i = 0; i < 1000; i += 4)
+    {
+      double a = q[i] + p[i];
+      double b = q[i + 1] + p[i + 1];
+      q[i] = a;
+      q[i + 1] = b;
+    }
+}
+
+void
+f2 (double *p, double *q)
+{
+  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
+  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
+  for (unsigned int i = 2; i < ~0U - 4; i += 4)
+    {
+      double a = q[i] + p[i];
+      double b = q[i + 1] + p[i + 1];
+      q[i] = a;
+      q[i + 1] = b;
+    }
+}
+
+void
+f3 (double *p, double *q)
+{
+  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
+  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
+  for (unsigned int i = 0; i < ~0U - 3; i += 4)
+    {
+      double a = q[i + 2] + p[i + 2];
+      double b = q[i + 3] + p[i + 3];
+      q[i + 2] = a;
+      q[i + 3] = b;
+    }
+}
+
+void
+f4 (double *p, double *q)
+{
+  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
+  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
+  for (unsigned int i = 0; i < 500; i += 6)
+    for (unsigned int j = 0; j < 500; j += 4)
+      {
+	double a = q[j] + p[i];
+	double b = q[j + 1] + p[i + 1];
+	q[i] = a;
+	q[i + 1] = b;
+      }
+}
+
+void
+f5 (double *p, double *q)
+{
+  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
+  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
+  for (unsigned int i = 2; i < 1000; i += 4)
+    {
+      double a = q[i - 2] + p[i - 2];
+      double b = q[i - 1] + p[i - 1];
+      q[i - 2] = a;
+      q[i - 1] = b;
+    }
+}
+
+double p[1000];
+double q[1000];
+
+void
+f6 (int n)
+{
+  for (unsigned int i = 0; i < n; i += 4)
+    {
+      double a = q[i] + p[i];
+      double b = q[i + 1] + p[i + 1];
+      q[i] = a;
+      q[i + 1] = b;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 6 "slp1" } } */
Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c
===================================================================
--- /dev/null	2018-01-30 17:30:22.185477046 +0000
+++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c	2018-01-31 13:26:13.487630644 +0000
@@ -0,0 +1,64 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+/* { dg-require-effective-target lp64 } */
+
+double p[1000];
+double q[1000];
+
+void
+f1 (double *p, double *q)
+{
+  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
+  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
+  for (unsigned int i = 2; i < ~0U - 4; i += 4)
+    {
+      double a = q[i + 2] + p[i + 2];
+      double b = q[i + 3] + p[i + 3];
+      q[i + 2] = a;
+      q[i + 3] = b;
+    }
+}
+
+void
+f2 (double *p, double *q)
+{
+  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
+  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
+  for (unsigned int i = 0; i < ~0U - 3; i += 4)
+    {
+      double a = q[i + 4] + p[i + 4];
+      double b = q[i + 5] + p[i + 5];
+      q[i + 4] = a;
+      q[i + 5] = b;
+    }
+}
+
+void
+f3 (double *p, double *q)
+{
+  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
+  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
+  for (unsigned int i = 0; i < 1000; i += 4)
+    {
+      double a = q[i - 2] + p[i - 2];
+      double b = q[i - 1] + p[i - 1];
+      q[i - 2] = a;
+      q[i - 1] = b;
+    }
+}
+
+void
+f4 (double *p, double *q)
+{
+  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
+  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
+  for (unsigned int i = 2; i < 1000; i += 4)
+    {
+      double a = q[i - 4] + p[i - 4];
+      double b = q[i - 3] + p[i - 3];
+      q[i - 4] = a;
+      q[i - 3] = b;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp1" } } */