Message ID | 87h900yznu.fsf@linaro.org |
---|---|
State | New |
Headers | show |
On Thu, Jun 1, 2017 at 8:45 AM, Richard Sandiford <richard.sandiford@linaro.org> wrote: > Some of the SVE patches extend SLP to predicated operations created by > ifcvt. However, ifcvt currently forces the mask into a temporary: > > mask = ifc_temp_var (TREE_TYPE (mask), mask, &gsi); > > and at the moment SLP doesn't handle simple assignments like: > > SSA_NAME = SSA_NAME > SSA_NAME = <constant> > > (It does of course handle: > > SSA_NAME = SSA_NAME op SSA_NAME > SSA_NAME = SSA_NAME op <constant>) > > I realise copy propagation should usually ensure that these simple > assignments don't occur, but normal loop vectorisation handles them > just fine, and SLP does too once we get over the initial validity check. > I thought this patch might be useful even if we decide that we don't want > ifcvt to create a temporary mask in such cases. > > Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install? > > Thanks, > Richard > > > 2017-06-01 Richard Sandiford <richard.sandiford@linaro.org> > > gcc/ > * tree-vect-slp.c (vect_build_slp_tree_1): Allow mixtures of SSA > names and constants, without treating them as separate operations. > Explicitly reject stores. > > gcc/testsuite/ > * gcc.dg/vect/slp-temp-1.c: New test. > * gcc.dg/vect/slp-temp-2.c: Likewise. > * gcc.dg/vect/slp-temp-3.c: Likewise. > > Index: gcc/tree-vect-slp.c > =================================================================== > --- gcc/tree-vect-slp.c 2017-05-18 07:51:12.387750673 +0100 > +++ gcc/tree-vect-slp.c 2017-06-01 07:21:44.094320070 +0100 > @@ -671,6 +671,13 @@ vect_build_slp_tree_1 (vec_info *vinfo, > first_op1 = gimple_assign_rhs2 (stmt); > } > } > + else if ((TREE_CODE_CLASS (rhs_code) == tcc_constant > + || rhs_code == SSA_NAME) > + && (TREE_CODE_CLASS (first_stmt_code) == tcc_constant > + || first_stmt_code == SSA_NAME)) > + /* Merging two simple rvalues is OK and doesn't count as two > + operations. */ > + ; But this doesn't help in the case one stmt has the copy/constant propagated and one not ... > else > { > if (first_stmt_code != rhs_code > @@ -800,11 +807,14 @@ vect_build_slp_tree_1 (vec_info *vinfo, > } > > /* Not memory operation. */ > - if (TREE_CODE_CLASS (rhs_code) != tcc_binary > - && TREE_CODE_CLASS (rhs_code) != tcc_unary > - && TREE_CODE_CLASS (rhs_code) != tcc_expression > - && TREE_CODE_CLASS (rhs_code) != tcc_comparison > - && rhs_code != CALL_EXPR) > + if (REFERENCE_CLASS_P (lhs) > + || (TREE_CODE_CLASS (rhs_code) != tcc_binary > + && TREE_CODE_CLASS (rhs_code) != tcc_unary > + && TREE_CODE_CLASS (rhs_code) != tcc_expression > + && TREE_CODE_CLASS (rhs_code) != tcc_comparison > + && TREE_CODE_CLASS (rhs_code) != tcc_constant > + && rhs_code != CALL_EXPR > + && rhs_code != SSA_NAME)) I think this whole block is dead code ... we can't ever visit stores and the only case the rhs_code == tcc_reference check above would miss is plain decls (but non-indexed decls should be rejected by dataref analysis already). I think a better fix is to ensure we do not have non-copy/constant propagated IL fed into the vectorizer. Richard. > { > if (dump_enabled_p ()) > { > Index: gcc/testsuite/gcc.dg/vect/slp-temp-1.c > =================================================================== > --- /dev/null 2017-06-01 07:09:35.344016119 +0100 > +++ gcc/testsuite/gcc.dg/vect/slp-temp-1.c 2017-06-01 07:39:24.406603119 +0100 > @@ -0,0 +1,71 @@ > +/* { dg-do run { target { lp64 || ilp32 } } } */ > +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */ > + > +void __GIMPLE (startwith ("loop")) > +f (int *x, int n) > +{ > + int i_in; > + int i_out; > + int double_i; > + > + long unsigned int index_0; > + long unsigned int offset_0; > + int *addr_0; > + int temp_0; > + > + long unsigned int index_1; > + long unsigned int offset_1; > + int *addr_1; > + int temp_1; > + > + entry: > + goto loop; > + > + loop: > + i_in = __PHI (entry: 0, latch: i_out); > + double_i = i_in * 2; > + > + index_0 = (long unsigned int) double_i; > + offset_0 = index_0 * 4ul; > + addr_0 = x_1(D) + offset_0; > + temp_0 = 1; > + *addr_0 = temp_0; > + > + index_1 = index_0 + 1ul; > + offset_1 = index_1 * 4ul; > + addr_1 = x_1(D) + offset_1; > + temp_1 = 3; > + *addr_1 = temp_1; > + > + i_out = i_in + 1; > + if (n_2(D) > i_out) > + goto latch; > + else > + goto exit; > + > + latch: > + goto loop; > + > + exit: > + return; > +} > + > +#define N 1024 > + > +int > +main (void) > +{ > + int a[N * 2]; > + f (a, N); > + for (int i = 0; i < N; ++i) > + { > + if (a[i * 2] != 1 > + || a[i * 2 + 1] != 3) > + __builtin_abort (); > + asm volatile ("" ::: "memory"); > + } > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */ > Index: gcc/testsuite/gcc.dg/vect/slp-temp-2.c > =================================================================== > --- /dev/null 2017-06-01 07:09:35.344016119 +0100 > +++ gcc/testsuite/gcc.dg/vect/slp-temp-2.c 2017-06-01 07:39:30.914219838 +0100 > @@ -0,0 +1,71 @@ > +/* { dg-do run { target { lp64 || ilp32 } } } */ > +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */ > + > +void __GIMPLE (startwith ("loop")) > +f (int *x, int n, int foo) > +{ > + int i_in; > + int i_out; > + int double_i; > + > + long unsigned int index_0; > + long unsigned int offset_0; > + int *addr_0; > + int temp_0; > + > + long unsigned int index_1; > + long unsigned int offset_1; > + int *addr_1; > + int temp_1; > + > + entry: > + goto loop; > + > + loop: > + i_in = __PHI (entry: 0, latch: i_out); > + double_i = i_in * 2; > + > + index_0 = (long unsigned int) double_i; > + offset_0 = index_0 * 4ul; > + addr_0 = x_1(D) + offset_0; > + temp_0 = 1; > + *addr_0 = temp_0; > + > + index_1 = index_0 + 1ul; > + offset_1 = index_1 * 4ul; > + addr_1 = x_1(D) + offset_1; > + temp_1 = foo_2(D); > + *addr_1 = temp_1; > + > + i_out = i_in + 1; > + if (n_3(D) > i_out) > + goto latch; > + else > + goto exit; > + > + latch: > + goto loop; > + > + exit: > + return; > +} > + > +#define N 1024 > + > +int > +main (void) > +{ > + int a[N * 2]; > + f (a, N, 11); > + for (int i = 0; i < N; ++i) > + { > + if (a[i * 2] != 1 > + || a[i * 2 + 1] != 11) > + __builtin_abort (); > + asm volatile ("" ::: "memory"); > + } > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */ > Index: gcc/testsuite/gcc.dg/vect/slp-temp-3.c > =================================================================== > --- /dev/null 2017-06-01 07:09:35.344016119 +0100 > +++ gcc/testsuite/gcc.dg/vect/slp-temp-3.c 2017-06-01 07:39:40.925589561 +0100 > @@ -0,0 +1,84 @@ > +/* { dg-do run { target { lp64 || ilp32 } } } */ > +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */ > + > +void __GIMPLE (startwith ("loop")) __attribute__ ((noinline, noclone)) > +f (int *x, int n) > +{ > + int i_in; > + int i_out; > + int double_i; > + > + long unsigned int index_0; > + long unsigned int offset_0; > + int *addr_0; > + int old_0; > + int new_0; > + int temp_0; > + > + long unsigned int index_1; > + long unsigned int offset_1; > + int *addr_1; > + int old_1; > + int new_1; > + int temp_1; > + > + entry: > + goto loop; > + > + loop: > + i_in = __PHI (entry: 0, latch: i_out); > + double_i = i_in * 2; > + > + index_0 = (long unsigned int) double_i; > + offset_0 = index_0 * 4ul; > + addr_0 = x_1(D) + offset_0; > + old_0 = *addr_0; > + new_0 = old_0 + 1; > + temp_0 = new_0; > + *addr_0 = temp_0; > + > + index_1 = index_0 + 1ul; > + offset_1 = index_1 * 4ul; > + addr_1 = x_1(D) + offset_1; > + old_1 = *addr_1; > + new_1 = old_1 + 3; > + temp_1 = new_1; > + *addr_1 = temp_1; > + > + i_out = i_in + 1; > + if (n_2(D) > i_out) > + goto latch; > + else > + goto exit; > + > + latch: > + goto loop; > + > + exit: > + return; > +} > + > +#define N 1024 > + > +int > +main (void) > +{ > + int a[N * 2]; > + for (int i = 0; i < N * 2; ++i) > + { > + a[i] = i * 4; > + asm volatile ("" ::: "memory"); > + } > + f (a, N); > + for (int i = 0; i < N; ++i) > + { > + if (a[i * 2] != i * 8 + 1 > + || a[i * 2 + 1] != i * 8 + 7) > + __builtin_abort (); > + asm volatile ("" ::: "memory"); > + } > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */
Index: gcc/tree-vect-slp.c =================================================================== --- gcc/tree-vect-slp.c 2017-05-18 07:51:12.387750673 +0100 +++ gcc/tree-vect-slp.c 2017-06-01 07:21:44.094320070 +0100 @@ -671,6 +671,13 @@ vect_build_slp_tree_1 (vec_info *vinfo, first_op1 = gimple_assign_rhs2 (stmt); } } + else if ((TREE_CODE_CLASS (rhs_code) == tcc_constant + || rhs_code == SSA_NAME) + && (TREE_CODE_CLASS (first_stmt_code) == tcc_constant + || first_stmt_code == SSA_NAME)) + /* Merging two simple rvalues is OK and doesn't count as two + operations. */ + ; else { if (first_stmt_code != rhs_code @@ -800,11 +807,14 @@ vect_build_slp_tree_1 (vec_info *vinfo, } /* Not memory operation. */ - if (TREE_CODE_CLASS (rhs_code) != tcc_binary - && TREE_CODE_CLASS (rhs_code) != tcc_unary - && TREE_CODE_CLASS (rhs_code) != tcc_expression - && TREE_CODE_CLASS (rhs_code) != tcc_comparison - && rhs_code != CALL_EXPR) + if (REFERENCE_CLASS_P (lhs) + || (TREE_CODE_CLASS (rhs_code) != tcc_binary + && TREE_CODE_CLASS (rhs_code) != tcc_unary + && TREE_CODE_CLASS (rhs_code) != tcc_expression + && TREE_CODE_CLASS (rhs_code) != tcc_comparison + && TREE_CODE_CLASS (rhs_code) != tcc_constant + && rhs_code != CALL_EXPR + && rhs_code != SSA_NAME)) { if (dump_enabled_p ()) { Index: gcc/testsuite/gcc.dg/vect/slp-temp-1.c =================================================================== --- /dev/null 2017-06-01 07:09:35.344016119 +0100 +++ gcc/testsuite/gcc.dg/vect/slp-temp-1.c 2017-06-01 07:39:24.406603119 +0100 @@ -0,0 +1,71 @@ +/* { dg-do run { target { lp64 || ilp32 } } } */ +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */ + +void __GIMPLE (startwith ("loop")) +f (int *x, int n) +{ + int i_in; + int i_out; + int double_i; + + long unsigned int index_0; + long unsigned int offset_0; + int *addr_0; + int temp_0; + + long unsigned int index_1; + long unsigned int offset_1; + int *addr_1; + int temp_1; + + entry: + goto loop; + + loop: + i_in = __PHI (entry: 0, latch: i_out); + double_i = i_in * 2; + + index_0 = (long unsigned int) double_i; + offset_0 = index_0 * 4ul; + addr_0 = x_1(D) + offset_0; + temp_0 = 1; + *addr_0 = temp_0; + + index_1 = index_0 + 1ul; + offset_1 = index_1 * 4ul; + addr_1 = x_1(D) + offset_1; + temp_1 = 3; + *addr_1 = temp_1; + + i_out = i_in + 1; + if (n_2(D) > i_out) + goto latch; + else + goto exit; + + latch: + goto loop; + + exit: + return; +} + +#define N 1024 + +int +main (void) +{ + int a[N * 2]; + f (a, N); + for (int i = 0; i < N; ++i) + { + if (a[i * 2] != 1 + || a[i * 2 + 1] != 3) + __builtin_abort (); + asm volatile ("" ::: "memory"); + } + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */ Index: gcc/testsuite/gcc.dg/vect/slp-temp-2.c =================================================================== --- /dev/null 2017-06-01 07:09:35.344016119 +0100 +++ gcc/testsuite/gcc.dg/vect/slp-temp-2.c 2017-06-01 07:39:30.914219838 +0100 @@ -0,0 +1,71 @@ +/* { dg-do run { target { lp64 || ilp32 } } } */ +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */ + +void __GIMPLE (startwith ("loop")) +f (int *x, int n, int foo) +{ + int i_in; + int i_out; + int double_i; + + long unsigned int index_0; + long unsigned int offset_0; + int *addr_0; + int temp_0; + + long unsigned int index_1; + long unsigned int offset_1; + int *addr_1; + int temp_1; + + entry: + goto loop; + + loop: + i_in = __PHI (entry: 0, latch: i_out); + double_i = i_in * 2; + + index_0 = (long unsigned int) double_i; + offset_0 = index_0 * 4ul; + addr_0 = x_1(D) + offset_0; + temp_0 = 1; + *addr_0 = temp_0; + + index_1 = index_0 + 1ul; + offset_1 = index_1 * 4ul; + addr_1 = x_1(D) + offset_1; + temp_1 = foo_2(D); + *addr_1 = temp_1; + + i_out = i_in + 1; + if (n_3(D) > i_out) + goto latch; + else + goto exit; + + latch: + goto loop; + + exit: + return; +} + +#define N 1024 + +int +main (void) +{ + int a[N * 2]; + f (a, N, 11); + for (int i = 0; i < N; ++i) + { + if (a[i * 2] != 1 + || a[i * 2 + 1] != 11) + __builtin_abort (); + asm volatile ("" ::: "memory"); + } + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */ Index: gcc/testsuite/gcc.dg/vect/slp-temp-3.c =================================================================== --- /dev/null 2017-06-01 07:09:35.344016119 +0100 +++ gcc/testsuite/gcc.dg/vect/slp-temp-3.c 2017-06-01 07:39:40.925589561 +0100 @@ -0,0 +1,84 @@ +/* { dg-do run { target { lp64 || ilp32 } } } */ +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */ + +void __GIMPLE (startwith ("loop")) __attribute__ ((noinline, noclone)) +f (int *x, int n) +{ + int i_in; + int i_out; + int double_i; + + long unsigned int index_0; + long unsigned int offset_0; + int *addr_0; + int old_0; + int new_0; + int temp_0; + + long unsigned int index_1; + long unsigned int offset_1; + int *addr_1; + int old_1; + int new_1; + int temp_1; + + entry: + goto loop; + + loop: + i_in = __PHI (entry: 0, latch: i_out); + double_i = i_in * 2; + + index_0 = (long unsigned int) double_i; + offset_0 = index_0 * 4ul; + addr_0 = x_1(D) + offset_0; + old_0 = *addr_0; + new_0 = old_0 + 1; + temp_0 = new_0; + *addr_0 = temp_0; + + index_1 = index_0 + 1ul; + offset_1 = index_1 * 4ul; + addr_1 = x_1(D) + offset_1; + old_1 = *addr_1; + new_1 = old_1 + 3; + temp_1 = new_1; + *addr_1 = temp_1; + + i_out = i_in + 1; + if (n_2(D) > i_out) + goto latch; + else + goto exit; + + latch: + goto loop; + + exit: + return; +} + +#define N 1024 + +int +main (void) +{ + int a[N * 2]; + for (int i = 0; i < N * 2; ++i) + { + a[i] = i * 4; + asm volatile ("" ::: "memory"); + } + f (a, N); + for (int i = 0; i < N; ++i) + { + if (a[i * 2] != i * 8 + 1 + || a[i * 2 + 1] != i * 8 + 7) + __builtin_abort (); + asm volatile ("" ::: "memory"); + } + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */