Message ID | 87lgc9easx.fsf@linaro.org |
---|---|
State | Accepted |
Commit | c453ccc2335bf4267a154c9385eb50a8c45235a1 |
Headers | show |
Series | Use canonicalize_math_after_vectorization_p for FMA folds | expand |
On Thu, May 24, 2018 at 10:07 AM Richard Sandiford < richard.sandiford@linaro.org> wrote: > The folds in r260348 kicked in before vectorisation, which hurts > for two reasons: > (1) the current suboptimal handling of nothrow meant that we could > drop the flag early and so prevent if-conversion > (2) some architectures provide more scalar forms than vector forms > (true for Advanced SIMD) > (1) is a bug in itself that needs to be fixed eventually, but delaying > the folds is still needed for (2). > Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf > and x86_64-linux-gnu. OK to install? OK. Richard. > (Patch is mostly just reindent.) > Richard > 2018-05-24 Richard Sandiford <richard.sandiford@linaro.org> > gcc/ > * match.pd: Delay FMA folds until after vectorization. > gcc/testsuite/ > * gcc.dg/vect/vect-fma-1.c: New test. > Index: gcc/match.pd > =================================================================== > --- gcc/match.pd 2018-05-18 09:26:37.735714314 +0100 > +++ gcc/match.pd 2018-05-24 09:05:10.432158893 +0100 > @@ -4703,59 +4703,60 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > wi::to_wide (@ipos) + isize)) > (BIT_FIELD_REF @0 @rsize @rpos))))) > -(for fmas (FMA) > +(if (canonicalize_math_after_vectorization_p ()) > + (for fmas (FMA) > + (simplify > + (fmas:c (negate @0) @1 @2) > + (IFN_FNMA @0 @1 @2)) > + (simplify > + (fmas @0 @1 (negate @2)) > + (IFN_FMS @0 @1 @2)) > + (simplify > + (fmas:c (negate @0) @1 (negate @2)) > + (IFN_FNMS @0 @1 @2)) > + (simplify > + (negate (fmas@3 @0 @1 @2)) > + (if (single_use (@3)) > + (IFN_FNMS @0 @1 @2)))) > + > + (simplify > + (IFN_FMS:c (negate @0) @1 @2) > + (IFN_FNMS @0 @1 @2)) > (simplify > - (fmas:c (negate @0) @1 @2) > + (IFN_FMS @0 @1 (negate @2)) > + (IFN_FMA @0 @1 @2)) > + (simplify > + (IFN_FMS:c (negate @0) @1 (negate @2)) > (IFN_FNMA @0 @1 @2)) > (simplify > - (fmas @0 @1 (negate @2)) > - (IFN_FMS @0 @1 @2)) > + (negate (IFN_FMS@3 @0 @1 @2)) > + (if (single_use (@3)) > + (IFN_FNMA @0 @1 @2))) > + > + (simplify > + (IFN_FNMA:c (negate @0) @1 @2) > + (IFN_FMA @0 @1 @2)) > (simplify > - (fmas:c (negate @0) @1 (negate @2)) > + (IFN_FNMA @0 @1 (negate @2)) > (IFN_FNMS @0 @1 @2)) > (simplify > - (negate (fmas@3 @0 @1 @2)) > + (IFN_FNMA:c (negate @0) @1 (negate @2)) > + (IFN_FMS @0 @1 @2)) > + (simplify > + (negate (IFN_FNMA@3 @0 @1 @2)) > (if (single_use (@3)) > - (IFN_FNMS @0 @1 @2)))) > + (IFN_FMS @0 @1 @2))) > -(simplify > - (IFN_FMS:c (negate @0) @1 @2) > - (IFN_FNMS @0 @1 @2)) > -(simplify > - (IFN_FMS @0 @1 (negate @2)) > - (IFN_FMA @0 @1 @2)) > -(simplify > - (IFN_FMS:c (negate @0) @1 (negate @2)) > - (IFN_FNMA @0 @1 @2)) > -(simplify > - (negate (IFN_FMS@3 @0 @1 @2)) > + (simplify > + (IFN_FNMS:c (negate @0) @1 @2) > + (IFN_FMS @0 @1 @2)) > + (simplify > + (IFN_FNMS @0 @1 (negate @2)) > + (IFN_FNMA @0 @1 @2)) > + (simplify > + (IFN_FNMS:c (negate @0) @1 (negate @2)) > + (IFN_FMA @0 @1 @2)) > + (simplify > + (negate (IFN_FNMS@3 @0 @1 @2)) > (if (single_use (@3)) > - (IFN_FNMA @0 @1 @2))) > - > -(simplify > - (IFN_FNMA:c (negate @0) @1 @2) > - (IFN_FMA @0 @1 @2)) > -(simplify > - (IFN_FNMA @0 @1 (negate @2)) > - (IFN_FNMS @0 @1 @2)) > -(simplify > - (IFN_FNMA:c (negate @0) @1 (negate @2)) > - (IFN_FMS @0 @1 @2)) > -(simplify > - (negate (IFN_FNMA@3 @0 @1 @2)) > - (if (single_use (@3)) > - (IFN_FMS @0 @1 @2))) > - > -(simplify > - (IFN_FNMS:c (negate @0) @1 @2) > - (IFN_FMS @0 @1 @2)) > -(simplify > - (IFN_FNMS @0 @1 (negate @2)) > - (IFN_FNMA @0 @1 @2)) > -(simplify > - (IFN_FNMS:c (negate @0) @1 (negate @2)) > - (IFN_FMA @0 @1 @2)) > -(simplify > - (negate (IFN_FNMS@3 @0 @1 @2)) > - (if (single_use (@3)) > - (IFN_FMA @0 @1 @2))) > + (IFN_FMA @0 @1 @2)))) > Index: gcc/testsuite/gcc.dg/vect/vect-fma-1.c > =================================================================== > --- /dev/null 2018-04-20 16:19:46.369131350 +0100 > +++ gcc/testsuite/gcc.dg/vect/vect-fma-1.c 2018-05-24 09:05:10.432158893 +0100 > @@ -0,0 +1,58 @@ > +/* { dg-require-effective-target scalar_all_fma } */ > + > +#include "tree-vect.h" > + > +#define N (VECTOR_BITS * 11 / 64 + 3) > + > +#define DEF(INV) \ > + void __attribute__ ((noipa)) \ > + f_##INV (double *restrict a, double *restrict b, \ > + double *restrict c, double *restrict d) \ > + { \ > + for (int i = 0; i < N; ++i) \ > + { \ > + double mb = (INV & 1 ? -b[i] : b[i]); \ > + double mc = c[i]; \ > + double md = (INV & 2 ? -d[i] : d[i]); \ > + double fma = __builtin_fma (mb, mc, md); \ > + a[i] = (INV & 4 ? -fma : fma); \ > + } \ > + } > + > +#define TEST(INV) \ > + { \ > + f_##INV (a, b, c, d); \ > + for (int i = 0; i < N; ++i) \ > + { \ > + double mb = (INV & 1 ? -b[i] : b[i]); \ > + double mc = c[i]; \ > + double md = (INV & 2 ? -d[i] : d[i]); \ > + double fma = __builtin_fma (mb, mc, md); \ > + double expected = (INV & 4 ? -fma : fma); \ > + if (a[i] != expected) \ > + __builtin_abort (); \ > + asm volatile ("" ::: "memory"); \ > + } \ > + } > + > +#define FOR_EACH_INV(T) \ > + T (0) T (1) T (2) T (3) T (4) T (5) T (6) T (7) > + > +FOR_EACH_INV (DEF) > + > +int > +main (void) > +{ > + double a[N], b[N], c[N], d[N]; > + for (int i = 0; i < N; ++i) > + { > + b[i] = i % 17; > + c[i] = i % 9 + 11; > + d[i] = i % 13 + 14; > + asm volatile ("" ::: "memory"); > + } > + FOR_EACH_INV (TEST) > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 8 "vect" { target vect_double } } } */
Index: gcc/match.pd =================================================================== --- gcc/match.pd 2018-05-18 09:26:37.735714314 +0100 +++ gcc/match.pd 2018-05-24 09:05:10.432158893 +0100 @@ -4703,59 +4703,60 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) wi::to_wide (@ipos) + isize)) (BIT_FIELD_REF @0 @rsize @rpos))))) -(for fmas (FMA) +(if (canonicalize_math_after_vectorization_p ()) + (for fmas (FMA) + (simplify + (fmas:c (negate @0) @1 @2) + (IFN_FNMA @0 @1 @2)) + (simplify + (fmas @0 @1 (negate @2)) + (IFN_FMS @0 @1 @2)) + (simplify + (fmas:c (negate @0) @1 (negate @2)) + (IFN_FNMS @0 @1 @2)) + (simplify + (negate (fmas@3 @0 @1 @2)) + (if (single_use (@3)) + (IFN_FNMS @0 @1 @2)))) + + (simplify + (IFN_FMS:c (negate @0) @1 @2) + (IFN_FNMS @0 @1 @2)) (simplify - (fmas:c (negate @0) @1 @2) + (IFN_FMS @0 @1 (negate @2)) + (IFN_FMA @0 @1 @2)) + (simplify + (IFN_FMS:c (negate @0) @1 (negate @2)) (IFN_FNMA @0 @1 @2)) (simplify - (fmas @0 @1 (negate @2)) - (IFN_FMS @0 @1 @2)) + (negate (IFN_FMS@3 @0 @1 @2)) + (if (single_use (@3)) + (IFN_FNMA @0 @1 @2))) + + (simplify + (IFN_FNMA:c (negate @0) @1 @2) + (IFN_FMA @0 @1 @2)) (simplify - (fmas:c (negate @0) @1 (negate @2)) + (IFN_FNMA @0 @1 (negate @2)) (IFN_FNMS @0 @1 @2)) (simplify - (negate (fmas@3 @0 @1 @2)) + (IFN_FNMA:c (negate @0) @1 (negate @2)) + (IFN_FMS @0 @1 @2)) + (simplify + (negate (IFN_FNMA@3 @0 @1 @2)) (if (single_use (@3)) - (IFN_FNMS @0 @1 @2)))) + (IFN_FMS @0 @1 @2))) -(simplify - (IFN_FMS:c (negate @0) @1 @2) - (IFN_FNMS @0 @1 @2)) -(simplify - (IFN_FMS @0 @1 (negate @2)) - (IFN_FMA @0 @1 @2)) -(simplify - (IFN_FMS:c (negate @0) @1 (negate @2)) - (IFN_FNMA @0 @1 @2)) -(simplify - (negate (IFN_FMS@3 @0 @1 @2)) + (simplify + (IFN_FNMS:c (negate @0) @1 @2) + (IFN_FMS @0 @1 @2)) + (simplify + (IFN_FNMS @0 @1 (negate @2)) + (IFN_FNMA @0 @1 @2)) + (simplify + (IFN_FNMS:c (negate @0) @1 (negate @2)) + (IFN_FMA @0 @1 @2)) + (simplify + (negate (IFN_FNMS@3 @0 @1 @2)) (if (single_use (@3)) - (IFN_FNMA @0 @1 @2))) - -(simplify - (IFN_FNMA:c (negate @0) @1 @2) - (IFN_FMA @0 @1 @2)) -(simplify - (IFN_FNMA @0 @1 (negate @2)) - (IFN_FNMS @0 @1 @2)) -(simplify - (IFN_FNMA:c (negate @0) @1 (negate @2)) - (IFN_FMS @0 @1 @2)) -(simplify - (negate (IFN_FNMA@3 @0 @1 @2)) - (if (single_use (@3)) - (IFN_FMS @0 @1 @2))) - -(simplify - (IFN_FNMS:c (negate @0) @1 @2) - (IFN_FMS @0 @1 @2)) -(simplify - (IFN_FNMS @0 @1 (negate @2)) - (IFN_FNMA @0 @1 @2)) -(simplify - (IFN_FNMS:c (negate @0) @1 (negate @2)) - (IFN_FMA @0 @1 @2)) -(simplify - (negate (IFN_FNMS@3 @0 @1 @2)) - (if (single_use (@3)) - (IFN_FMA @0 @1 @2))) + (IFN_FMA @0 @1 @2)))) Index: gcc/testsuite/gcc.dg/vect/vect-fma-1.c =================================================================== --- /dev/null 2018-04-20 16:19:46.369131350 +0100 +++ gcc/testsuite/gcc.dg/vect/vect-fma-1.c 2018-05-24 09:05:10.432158893 +0100 @@ -0,0 +1,58 @@ +/* { dg-require-effective-target scalar_all_fma } */ + +#include "tree-vect.h" + +#define N (VECTOR_BITS * 11 / 64 + 3) + +#define DEF(INV) \ + void __attribute__ ((noipa)) \ + f_##INV (double *restrict a, double *restrict b, \ + double *restrict c, double *restrict d) \ + { \ + for (int i = 0; i < N; ++i) \ + { \ + double mb = (INV & 1 ? -b[i] : b[i]); \ + double mc = c[i]; \ + double md = (INV & 2 ? -d[i] : d[i]); \ + double fma = __builtin_fma (mb, mc, md); \ + a[i] = (INV & 4 ? -fma : fma); \ + } \ + } + +#define TEST(INV) \ + { \ + f_##INV (a, b, c, d); \ + for (int i = 0; i < N; ++i) \ + { \ + double mb = (INV & 1 ? -b[i] : b[i]); \ + double mc = c[i]; \ + double md = (INV & 2 ? -d[i] : d[i]); \ + double fma = __builtin_fma (mb, mc, md); \ + double expected = (INV & 4 ? -fma : fma); \ + if (a[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +#define FOR_EACH_INV(T) \ + T (0) T (1) T (2) T (3) T (4) T (5) T (6) T (7) + +FOR_EACH_INV (DEF) + +int +main (void) +{ + double a[N], b[N], c[N], d[N]; + for (int i = 0; i < N; ++i) + { + b[i] = i % 17; + c[i] = i % 9 + 11; + d[i] = i % 13 + 14; + asm volatile ("" ::: "memory"); + } + FOR_EACH_INV (TEST) + return 0; +} + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 8 "vect" { target vect_double } } } */