Improve detection of widening multiplication in the vectorizer

Message ID BANLkTi=wUC7Lf2oqY3Gt4m8s0JfwbcJ78Q@mail.gmail.com
State Accepted

Commit Message

Ira Rosen June 1, 2011, 9:23 a.m. UTC
Hi,

The vectorizer expects the widening multiplication pattern to be:

     type a_t, b_t;
     TYPE a_T, b_T, prod_T;

     a_T = (TYPE) a_t;
     b_T = (TYPE) b_t;
     prod_T = a_T * b_T;

where type 'TYPE' is double the size of type 'type'. This works fine
when the types are signed. For unsigned types, the code looks like:

     unsigned type a_t, b_t;
     unsigned TYPE u_prod_T;
     TYPE a_T, b_T, prod_T;

     a_T = (TYPE) a_t;
     b_T = (TYPE) b_t;
     prod_T = a_T * b_T;
     u_prod_T = (unsigned TYPE) prod_T;

i.e., the multiplication is done on the signed types, followed by a cast
to unsigned. This patch adds support for such patterns and generates
WIDEN_MULT_EXPR for the unsigned type.
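
For illustration, a minimal C loop (a hypothetical example in the spirit
of the vect-widen-mult-u16.c test below) that produces exactly this
shape: the C integer promotions turn both unsigned short operands into
signed int, so the multiplication is signed and the assignment converts
the product back to unsigned:

     unsigned short X[N], Y[N];
     unsigned int result[N];

     for (i = 0; i < N; i++)
       result[i] = X[i] * Y[i];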

Another unsupported case is multiplication by a constant (e.g., b_T is
a constant). This patch checks that the constant fits in the smaller
type 'type' and recognizes such cases as widening multiplication.
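
For example (a sketch paralleling the new vect-widen-mult-const-*.c
tests below), the constant 2333 fits in 'short', so the product can be
computed as a short-by-short widening multiplication; a constant such
as 233333333 does not fit in 'short' and is left alone:

     short b[N];
     int a[N];

     for (i = 0; i < N; i++)
       a[i] = b[i] * 2333;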

Bootstrapped and tested on powerpc64-suse-linux. Tested the
vectorization testsuite on arm-linux-gnueabi.
I'll commit the patch shortly if there are no comments/objections.

Ira

ChangeLog:

       * tree-vectorizer.h (vect_recog_func_ptr): Make last argument to be
       a pointer.
       * tree-vect-patterns.c (vect_recog_widen_sum_pattern,
       vect_recog_widen_mult_pattern, vect_recog_dot_prod_pattern,
       vect_recog_pow_pattern): Likewise.
       (vect_pattern_recog_1): Remove declaration.
       (widened_name_p): Remove declaration.  Add new argument to specify
       whether to check that both types are either signed or unsigned.
       (vect_recog_widen_mult_pattern): Update documentation.  Handle
       unsigned patterns and multiplication by constants.
       (vect_pattern_recog_1): Update vect_recog_func references.  Use
       statement information from the statement returned from pattern
       detection functions.
       (vect_pattern_recog): Update vect_recog_func reference.
       * tree-vect-stmts.c (vectorizable_type_promotion): For widening
       multiplication by a constant use the type of the other operand.

testsuite/ChangeLog:

       * lib/target-supports.exp
       (check_effective_target_vect_widen_mult_qi_to_hi): Add NEON as
       supporting target.
       (check_effective_target_vect_widen_mult_hi_to_si): Likewise.
       (check_effective_target_vect_widen_mult_qi_to_hi_pattern): New.
       (check_effective_target_vect_widen_mult_hi_to_si_pattern): New.
       * gcc.dg/vect/vect-widen-mult-u8.c: Expect to be vectorized using
       widening multiplication on targets that support it.
       * gcc.dg/vect/vect-widen-mult-u16.c: Likewise.
       * gcc.dg/vect/vect-widen-mult-const-s16.c: New test.
       * gcc.dg/vect/vect-widen-mult-const-u16.c: New test.
Index: testsuite/lib/target-supports.exp
===================================================================
--- testsuite/lib/target-supports.exp	(revision 174475)
+++ testsuite/lib/target-supports.exp	(working copy)
@@ -2668,7 +2668,8 @@ proc check_effective_target_vect_widen_mult_qi_to_
 	} else {
 	    set et_vect_widen_mult_qi_to_hi_saved 0
 	}
-        if { [istarget powerpc*-*-*] } {
+        if { [istarget powerpc*-*-*]
+              || ([istarget arm*-*-*] && [check_effective_target_arm_neon]) } {
             set et_vect_widen_mult_qi_to_hi_saved 1
         }
     }
@@ -2701,7 +2702,8 @@ proc check_effective_target_vect_widen_mult_hi_to_
 	      || [istarget spu-*-*]
 	      || [istarget ia64-*-*]
 	      || [istarget i?86-*-*]
-	      || [istarget x86_64-*-*] } {
+	      || [istarget x86_64-*-*]
+              || ([istarget arm*-*-*] && [check_effective_target_arm_neon]) } {
             set et_vect_widen_mult_hi_to_si_saved 1
         }
     }
@@ -2710,6 +2712,52 @@ proc check_effective_target_vect_widen_mult_hi_to_
 }
 
 # Return 1 if the target plus current options supports a vector
+# widening multiplication of *char* args into *short* result, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_widen_mult_qi_to_hi_pattern { } {
+    global et_vect_widen_mult_qi_to_hi_pattern
+
+    if [info exists et_vect_widen_mult_qi_to_hi_pattern_saved] {
+        verbose "check_effective_target_vect_widen_mult_qi_to_hi_pattern: using cached result" 2
+    } else {
+        set et_vect_widen_mult_qi_to_hi_pattern_saved 0
+        if { [istarget powerpc*-*-*]
+              || ([istarget arm*-*-*] && [check_effective_target_arm_neon]) } {
+            set et_vect_widen_mult_qi_to_hi_pattern_saved 1
+        }
+    }
+    verbose "check_effective_target_vect_widen_mult_qi_to_hi_pattern: returning $et_vect_widen_mult_qi_to_hi_pattern_saved" 2
+    return $et_vect_widen_mult_qi_to_hi_pattern_saved
+}
+
+# Return 1 if the target plus current options supports a vector
+# widening multiplication of *short* args into *int* result, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_widen_mult_hi_to_si_pattern { } {
+    global et_vect_widen_mult_hi_to_si_pattern
+
+    if [info exists et_vect_widen_mult_hi_to_si_pattern_saved] {
+        verbose "check_effective_target_vect_widen_mult_hi_to_si_pattern: using cached result" 2
+    } else {
+        set et_vect_widen_mult_hi_to_si_pattern_saved 0
+        if { [istarget powerpc*-*-*]
+              || [istarget spu-*-*]
+              || [istarget ia64-*-*]
+              || [istarget i?86-*-*]
+              || [istarget x86_64-*-*]
+              || ([istarget arm*-*-*] && [check_effective_target_arm_neon]) } {
+            set et_vect_widen_mult_hi_to_si_pattern_saved 1
+        }
+    }
+    verbose "check_effective_target_vect_widen_mult_hi_to_si_pattern: returning $et_vect_widen_mult_hi_to_si_pattern_saved" 2
+    return $et_vect_widen_mult_hi_to_si_pattern_saved
+}
+
+# Return 1 if the target plus current options supports a vector
 # dot-product of signed chars, 0 otherwise.
 #
 # This won't change for different subtargets so cache the result.
Index: testsuite/gcc.dg/vect/vect-widen-mult-u8.c
===================================================================
--- testsuite/gcc.dg/vect/vect-widen-mult-u8.c	(revision 174475)
+++ testsuite/gcc.dg/vect/vect-widen-mult-u8.c	(working copy)
@@ -9,7 +9,7 @@ unsigned char X[N] __attribute__ ((__aligned__(__B
 unsigned char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
 unsigned short result[N];
 
-/* char->short widening-mult */
+/* unsigned char-> unsigned short widening-mult.  */
 __attribute__ ((noinline)) int
 foo1(int len) {
   int i;
@@ -28,8 +28,7 @@ int main (void)
   for (i=0; i<N; i++) {
     X[i] = i;
     Y[i] = 64-i;
-    if (i%4 == 0)
-      X[i] = 5;
+    __asm__ volatile ("");
   }
 
   foo1 (N);
@@ -43,5 +42,7 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_qi_to_hi || vect_unpack } } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */
+/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
 
Index: testsuite/gcc.dg/vect/vect-widen-mult-const-s16.c
===================================================================
--- testsuite/gcc.dg/vect/vect-widen-mult-const-s16.c	(revision 0)
+++ testsuite/gcc.dg/vect/vect-widen-mult-const-s16.c	(revision 0)
@@ -0,0 +1,60 @@
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+#include <stdlib.h>
+
+#define N 32
+
+__attribute__ ((noinline)) void 
+foo (int *__restrict a,
+     short *__restrict b,
+     int n)
+{
+  int i;
+
+  for (i = 0; i < n; i++)
+    a[i] = b[i] * 2333;
+
+  for (i = 0; i < n; i++)
+    if (a[i] != b[i] * 2333)
+      abort ();
+}
+
+__attribute__ ((noinline)) void
+bar (int *__restrict a,
+     short *__restrict b,
+     int n)
+{
+  int i;
+
+  for (i = 0; i < n; i++)
+    a[i] = b[i] * (short) 2333;
+
+  for (i = 0; i < n; i++)
+    if (a[i] != b[i] * (short) 2333)
+      abort ();
+}
+
+int main (void)
+{
+  int i;
+  int a[N];
+  short b[N];
+
+  for (i = 0; i < N; i++)
+    {
+      a[i] = 0;
+      b[i] = i;
+      __asm__ volatile ("");
+    }
+
+  foo (a, b, N);
+  bar (a, b, N);
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_widen_mult_hi_to_si } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 2 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
+/* { dg-final { scan-tree-dump-times "pattern recognized" 2 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
Index: testsuite/gcc.dg/vect/vect-widen-mult-const-u16.c
===================================================================
--- testsuite/gcc.dg/vect/vect-widen-mult-const-u16.c	(revision 0)
+++ testsuite/gcc.dg/vect/vect-widen-mult-const-u16.c	(revision 0)
@@ -0,0 +1,77 @@
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+#include <stdlib.h>
+
+#define N 32
+
+__attribute__ ((noinline)) void 
+foo (unsigned int *__restrict a,
+     unsigned short *__restrict b,
+     int n)
+{
+  int i;
+
+  for (i = 0; i < n; i++)
+    a[i] = b[i] * 2333;
+
+  for (i = 0; i < n; i++)
+    if (a[i] != b[i] * 2333)
+      abort ();
+}
+
+__attribute__ ((noinline)) void
+bar (unsigned int *__restrict a,
+     unsigned short *__restrict b,
+     int n)
+{
+  int i;
+
+  for (i = 0; i < n; i++)
+    a[i] = (unsigned short) 2333 * b[i];
+
+  for (i = 0; i < n; i++)
+    if (a[i] != b[i] * (unsigned short) 2333)
+      abort ();
+}
+
+__attribute__ ((noinline)) void
+baz (unsigned int *__restrict a,
+     unsigned short *__restrict b,
+     int n)
+{
+  int i;
+
+  for (i = 0; i < n; i++)
+    a[i] = b[i] * 233333333;
+
+  for (i = 0; i < n; i++)
+    if (a[i] != b[i] * 233333333)
+      abort ();
+}
+
+
+int main (void)
+{
+  int i;
+  unsigned int a[N];
+  unsigned short b[N];
+
+  for (i = 0; i < N; i++)
+    {
+      a[i] = 0;
+      b[i] = i;
+      __asm__ volatile ("");
+    }
+
+  foo (a, b, N);
+  bar (a, b, N);
+  baz (a, b, N);
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" { target vect_widen_mult_hi_to_si } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 2 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
+/* { dg-final { scan-tree-dump-times "pattern recognized" 2 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
Index: testsuite/gcc.dg/vect/vect-widen-mult-u16.c
===================================================================
--- testsuite/gcc.dg/vect/vect-widen-mult-u16.c	(revision 174475)
+++ testsuite/gcc.dg/vect/vect-widen-mult-u16.c	(working copy)
@@ -9,13 +9,11 @@ unsigned short X[N] __attribute__ ((__aligned__(__
 unsigned short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
 unsigned int result[N];
 
-/* short->int widening-mult */
+/* unsigned short->unsigned int widening-mult.  */
 __attribute__ ((noinline)) int
 foo1(int len) {
   int i;
 
-  /* Not vectorized because X[i] and Y[i] are casted to 'int'
-     so the widening multiplication pattern is not recognized.  */
   for (i=0; i<len; i++) {
     result[i] = (unsigned int)(X[i] * Y[i]);
   }
@@ -43,8 +41,8 @@ int main (void)
   return 0;
 }
 
-/*The induction loop is vectorized  */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail *-*-* } } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_pack_trunc } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
+/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */

Comments

Richard Biener June 1, 2011, 9:42 a.m. UTC | #1
On Wed, Jun 1, 2011 at 11:23 AM, Ira Rosen <ira.rosen@linaro.org> wrote:
> [...]

Did you think about moving pass_optimize_widening_mul before
loop optimizations?  Does that pass catch the cases you are
teaching the pattern recognizer?  I think we should try to expose
these more complicated instructions to loop optimizers.

Thanks,
Richard.

Richard Biener June 1, 2011, 12:14 p.m. UTC | #2
On Wed, Jun 1, 2011 at 1:37 PM, Ira Rosen <ira.rosen@linaro.org> wrote:
> On 1 June 2011 12:42, Richard Guenther <richard.guenther@gmail.com> wrote:
>
>> Did you think about moving pass_optimize_widening_mul before
>> loop optimizations?  Does that pass catch the cases you are
>> teaching the pattern recognizer?  I think we should try to expose
>> these more complicated instructions to loop optimizers.
>>
>
> pass_optimize_widening_mul doesn't catch these cases, but I can try to
> teach it instead of the vectorizer.
> I am now testing
>
> Index: passes.c
> ===================================================================
> --- passes.c    (revision 174391)
> +++ passes.c    (working copy)
> @@ -870,6 +870,7 @@
>       NEXT_PASS (pass_split_crit_edges);
>       NEXT_PASS (pass_pre);
>       NEXT_PASS (pass_sink_code);
> +      NEXT_PASS (pass_optimize_widening_mul);
>       NEXT_PASS (pass_tree_loop);
>        {
>          struct opt_pass **p = &pass_tree_loop.pass.sub;
> @@ -934,7 +935,6 @@
>       NEXT_PASS (pass_forwprop);
>       NEXT_PASS (pass_phiopt);
>       NEXT_PASS (pass_fold_builtins);
> -      NEXT_PASS (pass_optimize_widening_mul);
>       NEXT_PASS (pass_tail_calls);
>       NEXT_PASS (pass_rename_ssa_copies);
>       NEXT_PASS (pass_uncprop);
>
> to see how it affects other loop optimizations (vectorizer pattern
> tests obviously fail).

Thanks.  I would hope that we eventually can get rid of the
pattern recognizer ... at least for SSE there is also always
a scalar variant instruction for each vectorized one.

Richard.
Richard Sandiford June 6, 2011, 1:04 p.m. UTC | #3
Richard Guenther <richard.guenther@gmail.com> writes:
> Thanks.  I would hope that we eventually can get rid of the
> pattern recognizer ... at least for SSE there is also always
> a scalar variant instruction for each vectorized one.

AFAIK, that isn't true for ARM and NEON.  E.g. I don't know of a single
instruction that does the scalar equivalent of things like VADDHN
(add values and narrow to high half), VSUBL.U32 (subtract two values
and extend the result), etc.

FWIW, I think MIPS only has minimum and maximum operations for paired
floats, not for single floats or doubles.  I don't have the manuals to
hand to check though.

It's probably OK for the particular case of widening multiplications.
It sounded like you were making a more general statement though.
If so, I think we should try to avoid assuming that every vectorisable
operation has an equivalent scalar machine instruction.
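
Purely as illustration, a sketch of those two operations using NEON
intrinsics (assuming an arm_neon.h toolchain; the function names are
made up for the example, and this is a sketch, not part of the patch):

#include <arm_neon.h>

/* VADDHN: add two int32x4 vectors and narrow each lane to its high
   16 bits -- there is no single scalar ARM instruction for this.  */
int16x4_t
add_narrow_high (int32x4_t a, int32x4_t b)
{
  return vaddhn_s32 (a, b);
}

/* VSUBL.U32: subtract two uint32x2 vectors, widening each lane to
   64 bits.  */
uint64x2_t
sub_widen (uint32x2_t a, uint32x2_t b)
{
  return vsubl_u32 (a, b);
}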

Richard
Richard Biener June 6, 2011, 2:28 p.m. UTC | #4
On Mon, Jun 6, 2011 at 3:04 PM, Richard Sandiford
<richard.sandiford@linaro.org> wrote:
> [...]
> It's probably OK for the particular case of widening multiplications.
> It sounded like you were making a more general statement though.
> If so, I think we should try to avoid assuming that every vectorisable
> operation has an equivalent scalar machine instruction.

Hmm, too bad ;)  Yes, I was suggesting that we assume that.  I guess
for now we can go with the vectorizer pattern matching enhancement
and re-visit re-ordering the passes later (I don't have time right now to
look into the reported issue).

Thanks,
Richard.

H.J. Lu June 7, 2011, 8:50 p.m. UTC | #5
On Wed, Jun 1, 2011 at 2:23 AM, Ira Rosen <ira.rosen@linaro.org> wrote:
> [...]

This caused:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=49318
H.J. Lu June 21, 2011, 12:32 a.m. UTC | #6
On Tue, Jun 7, 2011 at 1:50 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> [...]
>
> This caused:
>
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=49318
>

This also caused:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=49478

Patch

Index: tree-vectorizer.h
===================================================================
--- tree-vectorizer.h	(revision 174475)
+++ tree-vectorizer.h	(working copy)
@@ -896,7 +896,7 @@  extern void vect_slp_transform_bb (basic_block);
 /* Pattern recognition functions.
    Additional pattern recognition functions can (and will) be added
    in the future.  */
-typedef gimple (* vect_recog_func_ptr) (gimple, tree *, tree *);
+typedef gimple (* vect_recog_func_ptr) (gimple *, tree *, tree *);
 #define NUM_PATTERNS 4
 void vect_pattern_recog (loop_vec_info);
 
Index: tree-vect-patterns.c
===================================================================
--- tree-vect-patterns.c	(revision 174475)
+++ tree-vect-patterns.c	(working copy)
@@ -38,16 +38,11 @@  along with GCC; see the file COPYING3.  If not see
 #include "recog.h"
 #include "diagnostic-core.h"
 
-/* Function prototypes */
-static void vect_pattern_recog_1
-  (gimple (* ) (gimple, tree *, tree *), gimple_stmt_iterator);
-static bool widened_name_p (tree, gimple, tree *, gimple *);
-
 /* Pattern recognition functions  */
-static gimple vect_recog_widen_sum_pattern (gimple, tree *, tree *);
-static gimple vect_recog_widen_mult_pattern (gimple, tree *, tree *);
-static gimple vect_recog_dot_prod_pattern (gimple, tree *, tree *);
-static gimple vect_recog_pow_pattern (gimple, tree *, tree *);
+static gimple vect_recog_widen_sum_pattern (gimple *, tree *, tree *);
+static gimple vect_recog_widen_mult_pattern (gimple *, tree *, tree *);
+static gimple vect_recog_dot_prod_pattern (gimple *, tree *, tree *);
+static gimple vect_recog_pow_pattern (gimple *, tree *, tree *);
 static vect_recog_func_ptr vect_vect_recog_func_ptrs[NUM_PATTERNS] = {
 	vect_recog_widen_mult_pattern,
 	vect_recog_widen_sum_pattern,
@@ -61,10 +56,12 @@  static vect_recog_func_ptr vect_vect_recog_func_pt
    is a result of a type-promotion, such that:
      DEF_STMT: NAME = NOP (name0)
    where the type of name0 (HALF_TYPE) is smaller than the type of NAME.
-*/
+   If CHECK_SIGN is TRUE, check that either both types are signed or both are
+   unsigned.  */
 
 static bool
-widened_name_p (tree name, gimple use_stmt, tree *half_type, gimple *def_stmt)
+widened_name_p (tree name, gimple use_stmt, tree *half_type, gimple *def_stmt,
+		bool check_sign)
 {
   tree dummy;
   gimple dummy_gimple;
@@ -98,7 +95,7 @@  static bool
 
   *half_type = TREE_TYPE (oprnd0);
   if (!INTEGRAL_TYPE_P (type) || !INTEGRAL_TYPE_P (*half_type)
-      || (TYPE_UNSIGNED (type) != TYPE_UNSIGNED (*half_type))
+      || ((TYPE_UNSIGNED (type) != TYPE_UNSIGNED (*half_type)) && check_sign)
       || (TYPE_PRECISION (type) < (TYPE_PRECISION (*half_type) * 2)))
     return false;
 
@@ -168,12 +165,12 @@  vect_recog_temp_ssa_var (tree type, gimple stmt)
          inner-loop nested in an outer-loop that us being vectorized).  */
 
 static gimple
-vect_recog_dot_prod_pattern (gimple last_stmt, tree *type_in, tree *type_out)
+vect_recog_dot_prod_pattern (gimple *last_stmt, tree *type_in, tree *type_out)
 {
   gimple stmt;
   tree oprnd0, oprnd1;
   tree oprnd00, oprnd01;
-  stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
+  stmt_vec_info stmt_vinfo = vinfo_for_stmt (*last_stmt);
   tree type, half_type;
   gimple pattern_stmt;
   tree prod_type;
@@ -181,10 +178,10 @@  static gimple
   struct loop *loop = LOOP_VINFO_LOOP (loop_info);
   tree var;
 
-  if (!is_gimple_assign (last_stmt))
+  if (!is_gimple_assign (*last_stmt))
     return NULL;
 
-  type = gimple_expr_type (last_stmt);
+  type = gimple_expr_type (*last_stmt);
 
   /* Look for the following pattern
           DX = (TYPE1) X;
@@ -210,7 +207,7 @@  static gimple
   /* Starting from LAST_STMT, follow the defs of its uses in search
      of the above pattern.  */
 
-  if (gimple_assign_rhs_code (last_stmt) != PLUS_EXPR)
+  if (gimple_assign_rhs_code (*last_stmt) != PLUS_EXPR)
     return NULL;
 
   if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
@@ -231,14 +228,14 @@  static gimple
 
       if (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def)
         return NULL;
-      oprnd0 = gimple_assign_rhs1 (last_stmt);
-      oprnd1 = gimple_assign_rhs2 (last_stmt);
+      oprnd0 = gimple_assign_rhs1 (*last_stmt);
+      oprnd1 = gimple_assign_rhs2 (*last_stmt);
       if (!types_compatible_p (TREE_TYPE (oprnd0), type)
 	  || !types_compatible_p (TREE_TYPE (oprnd1), type))
         return NULL;
-      stmt = last_stmt;
+      stmt = *last_stmt;
 
-      if (widened_name_p (oprnd0, stmt, &half_type, &def_stmt))
+      if (widened_name_p (oprnd0, stmt, &half_type, &def_stmt, true))
         {
           stmt = def_stmt;
           oprnd0 = gimple_assign_rhs1 (stmt);
@@ -247,7 +244,7 @@  static gimple
         half_type = type;
     }
 
-  /* So far so good. Since last_stmt was detected as a (summation) reduction,
+  /* So far so good.  Since *last_stmt was detected as a (summation) reduction,
      we know that oprnd1 is the reduction variable (defined by a loop-header
      phi), and oprnd0 is an ssa-name defined by a stmt in the loop body.
      Left to check that oprnd0 is defined by a (widen_)mult_expr  */
@@ -293,10 +290,10 @@  static gimple
       if (!types_compatible_p (TREE_TYPE (oprnd0), prod_type)
           || !types_compatible_p (TREE_TYPE (oprnd1), prod_type))
         return NULL;
-      if (!widened_name_p (oprnd0, stmt, &half_type0, &def_stmt))
+      if (!widened_name_p (oprnd0, stmt, &half_type0, &def_stmt, true))
         return NULL;
       oprnd00 = gimple_assign_rhs1 (def_stmt);
-      if (!widened_name_p (oprnd1, stmt, &half_type1, &def_stmt))
+      if (!widened_name_p (oprnd1, stmt, &half_type1, &def_stmt, true))
         return NULL;
       oprnd01 = gimple_assign_rhs1 (def_stmt);
       if (!types_compatible_p (half_type0, half_type1))
@@ -322,7 +319,7 @@  static gimple
 
   /* We don't allow changing the order of the computation in the inner-loop
      when doing outer-loop vectorization.  */
-  gcc_assert (!nested_in_vect_loop_p (loop, last_stmt));
+  gcc_assert (!nested_in_vect_loop_p (loop, *last_stmt));
 
   return pattern_stmt;
 }
@@ -342,24 +339,47 @@  static gimple
 
    where type 'TYPE' is at least double the size of type 'type'.
 
+   Also detect unsigned cases:
+
+     unsigned type a_t, b_t;
+     unsigned TYPE u_prod_T;
+     TYPE a_T, b_T, prod_T;
+
+     S1  a_t = ;
+     S2  b_t = ;
+     S3  a_T = (TYPE) a_t;
+     S4  b_T = (TYPE) b_t;
+     S5  prod_T = a_T * b_T;
+     S6  u_prod_T = (unsigned TYPE) prod_T;
+
+   and multiplication by constants:
+
+     type a_t;
+     TYPE a_T, prod_T;
+
+     S1  a_t = ;
+     S3  a_T = (TYPE) a_t;
+     S5  prod_T = a_T * CONST;
+
    Input:
 
-   * LAST_STMT: A stmt from which the pattern search begins. In the example,
-   when this function is called with S5, the pattern {S3,S4,S5} is be detected.
+   * LAST_STMT: A stmt from which the pattern search begins.  In the example,
+   when this function is called with S5, the pattern {S3,S4,S5,(S6)} is
+   detected.
 
    Output:
 
    * TYPE_IN: The type of the input arguments to the pattern.
 
-   * TYPE_OUT: The type of the output  of this pattern.
+   * TYPE_OUT: The type of the output of this pattern.
 
    * Return value: A new stmt that will be used to replace the sequence of
-   stmts that constitute the pattern. In this case it will be:
+   stmts that constitute the pattern.  In this case it will be:
         WIDEN_MULT <a_t, b_t>
 */
 
 static gimple
-vect_recog_widen_mult_pattern (gimple last_stmt,
+vect_recog_widen_mult_pattern (gimple *last_stmt,
 			       tree *type_in,
 			       tree *type_out)
 {
@@ -367,40 +387,111 @@  static gimple
   tree oprnd0, oprnd1;
   tree type, half_type0, half_type1;
   gimple pattern_stmt;
-  tree vectype, vectype_out;
+  tree vectype, vectype_out = NULL_TREE;
   tree dummy;
   tree var;
   enum tree_code dummy_code;
   int dummy_int;
   VEC (tree, heap) *dummy_vec;
+  bool op0_ok, op1_ok;
 
-  if (!is_gimple_assign (last_stmt))
+  if (!is_gimple_assign (*last_stmt))
     return NULL;
 
-  type = gimple_expr_type (last_stmt);
+  type = gimple_expr_type (*last_stmt);
 
   /* Starting from LAST_STMT, follow the defs of its uses in search
      of the above pattern.  */
 
-  if (gimple_assign_rhs_code (last_stmt) != MULT_EXPR)
+  if (gimple_assign_rhs_code (*last_stmt) != MULT_EXPR)
     return NULL;
 
-  oprnd0 = gimple_assign_rhs1 (last_stmt);
-  oprnd1 = gimple_assign_rhs2 (last_stmt);
+  oprnd0 = gimple_assign_rhs1 (*last_stmt);
+  oprnd1 = gimple_assign_rhs2 (*last_stmt);
   if (!types_compatible_p (TREE_TYPE (oprnd0), type)
       || !types_compatible_p (TREE_TYPE (oprnd1), type))
     return NULL;
 
-  /* Check argument 0 */
-  if (!widened_name_p (oprnd0, last_stmt, &half_type0, &def_stmt0))
-    return NULL;
-  oprnd0 = gimple_assign_rhs1 (def_stmt0);
+  /* Check argument 0.  */
+  op0_ok = widened_name_p (oprnd0, *last_stmt, &half_type0, &def_stmt0, false);
+  /* Check argument 1.  */
+  op1_ok = widened_name_p (oprnd1, *last_stmt, &half_type1, &def_stmt1, false);
 
-  /* Check argument 1 */
-  if (!widened_name_p (oprnd1, last_stmt, &half_type1, &def_stmt1))
+  /* In case of multiplication by a constant one of the operands may not match
+     the pattern, but not both.  */
+  if (!op0_ok && !op1_ok)
     return NULL;
-  oprnd1 = gimple_assign_rhs1 (def_stmt1);
 
+  if (op0_ok && op1_ok)
+    {
+      oprnd0 = gimple_assign_rhs1 (def_stmt0);
+      oprnd1 = gimple_assign_rhs1 (def_stmt1);
+    }	       
+  else if (!op0_ok)
+    {
+      if (CONSTANT_CLASS_P (oprnd0)
+	  && TREE_CODE (half_type1) == INTEGER_TYPE
+	  && tree_int_cst_lt (oprnd0, TYPE_MAXVAL (half_type1))
+          && tree_int_cst_lt (TYPE_MINVAL (half_type1), oprnd0))
+        {
+	  /* OPRND0 is a constant of HALF_TYPE1.  */
+ 	  half_type0 = half_type1;
+          oprnd1 = gimple_assign_rhs1 (def_stmt1);
+        }
+      else
+	return NULL;
+    }
+  else if (!op1_ok)
+    {
+      if (CONSTANT_CLASS_P (oprnd1)
+          && TREE_CODE (half_type0) == INTEGER_TYPE
+          && tree_int_cst_lt (oprnd1, TYPE_MAXVAL (half_type0))
+          && tree_int_cst_lt (TYPE_MINVAL (half_type0), oprnd1))
+        {
+	  /* OPRND1 is a constant of HALF_TYPE0.  */
+          half_type1 = half_type0;
+          oprnd0 = gimple_assign_rhs1 (def_stmt0);
+        }
+      else
+        return NULL;
+    }
+
+  /* Handle unsigned case.  Look for
+     S6  u_prod_T = (unsigned TYPE) prod_T;
+     Use unsigned TYPE as the type for WIDEN_MULT_EXPR.  */
+  if (TYPE_UNSIGNED (type) != TYPE_UNSIGNED (half_type0))
+    {
+      tree lhs = gimple_assign_lhs (*last_stmt), use_lhs;
+      imm_use_iterator imm_iter;
+      use_operand_p use_p;
+      int nuses = 0;
+      gimple use_stmt = NULL;
+      tree use_type;
+
+      if (TYPE_UNSIGNED (type) == TYPE_UNSIGNED (half_type1))
+        return NULL;
+
+      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
+        {
+          use_stmt = USE_STMT (use_p);
+          nuses++;
+        }
+
+      if (nuses != 1 || !is_gimple_assign (use_stmt)
+          || gimple_assign_rhs_code (use_stmt) != NOP_EXPR)
+        return NULL;
+
+      use_lhs = gimple_assign_lhs (use_stmt);
+      use_type = TREE_TYPE (use_lhs);
+      if (!INTEGRAL_TYPE_P (use_type)
+          || (TYPE_UNSIGNED (type) == TYPE_UNSIGNED (use_type))
+          || (TYPE_PRECISION (type) != TYPE_PRECISION (use_type)))
+        return NULL;
+
+      type = use_type;
+      *last_stmt = use_stmt;
+    }
+
   if (!types_compatible_p (half_type0, half_type1))
     return NULL;
 
@@ -413,7 +504,7 @@  static gimple
   vectype_out = get_vectype_for_scalar_type (type);
   if (!vectype
       || !vectype_out
-      || !supportable_widening_operation (WIDEN_MULT_EXPR, last_stmt,
+      || !supportable_widening_operation (WIDEN_MULT_EXPR, *last_stmt,
 					  vectype_out, vectype,
 					  &dummy, &dummy, &dummy_code,
 					  &dummy_code, &dummy_int, &dummy_vec))
@@ -462,16 +553,16 @@  static gimple
 */
 
 static gimple
-vect_recog_pow_pattern (gimple last_stmt, tree *type_in, tree *type_out)
+vect_recog_pow_pattern (gimple *last_stmt, tree *type_in, tree *type_out)
 {
   tree fn, base, exp = NULL;
   gimple stmt;
   tree var;
 
-  if (!is_gimple_call (last_stmt) || gimple_call_lhs (last_stmt) == NULL)
+  if (!is_gimple_call (*last_stmt) || gimple_call_lhs (*last_stmt) == NULL)
     return NULL;
 
-  fn = gimple_call_fndecl (last_stmt);
+  fn = gimple_call_fndecl (*last_stmt);
   if (fn == NULL_TREE || DECL_BUILT_IN_CLASS (fn) != BUILT_IN_NORMAL)
    return NULL;
 
@@ -481,8 +572,8 @@  static gimple
     case BUILT_IN_POWI:
     case BUILT_IN_POWF:
     case BUILT_IN_POW:
-      base = gimple_call_arg (last_stmt, 0);
-      exp = gimple_call_arg (last_stmt, 1);
+      base = gimple_call_arg (*last_stmt, 0);
+      exp = gimple_call_arg (*last_stmt, 1);
       if (TREE_CODE (exp) != REAL_CST
 	  && TREE_CODE (exp) != INTEGER_CST)
         return NULL;
@@ -574,21 +665,21 @@  static gimple
 	 inner-loop nested in an outer-loop that us being vectorized).  */
 
 static gimple
-vect_recog_widen_sum_pattern (gimple last_stmt, tree *type_in, tree *type_out)
+vect_recog_widen_sum_pattern (gimple *last_stmt, tree *type_in, tree *type_out)
 {
   gimple stmt;
   tree oprnd0, oprnd1;
-  stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
+  stmt_vec_info stmt_vinfo = vinfo_for_stmt (*last_stmt);
   tree type, half_type;
   gimple pattern_stmt;
   loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
   struct loop *loop = LOOP_VINFO_LOOP (loop_info);
   tree var;
 
-  if (!is_gimple_assign (last_stmt))
+  if (!is_gimple_assign (*last_stmt))
     return NULL;
 
-  type = gimple_expr_type (last_stmt);
+  type = gimple_expr_type (*last_stmt);
 
   /* Look for the following pattern
           DX = (TYPE) X;
@@ -600,25 +691,25 @@  static gimple
   /* Starting from LAST_STMT, follow the defs of its uses in search
      of the above pattern.  */
 
-  if (gimple_assign_rhs_code (last_stmt) != PLUS_EXPR)
+  if (gimple_assign_rhs_code (*last_stmt) != PLUS_EXPR)
     return NULL;
 
   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def)
     return NULL;
 
-  oprnd0 = gimple_assign_rhs1 (last_stmt);
-  oprnd1 = gimple_assign_rhs2 (last_stmt);
+  oprnd0 = gimple_assign_rhs1 (*last_stmt);
+  oprnd1 = gimple_assign_rhs2 (*last_stmt);
   if (!types_compatible_p (TREE_TYPE (oprnd0), type)
       || !types_compatible_p (TREE_TYPE (oprnd1), type))
     return NULL;
 
-  /* So far so good. Since last_stmt was detected as a (summation) reduction,
+  /* So far so good.  Since *last_stmt was detected as a (summation) reduction,
      we know that oprnd1 is the reduction variable (defined by a loop-header
      phi), and oprnd0 is an ssa-name defined by a stmt in the loop body.
      Left to check that oprnd0 is defined by a cast from type 'type' to type
      'TYPE'.  */
 
-  if (!widened_name_p (oprnd0, last_stmt, &half_type, &stmt))
+  if (!widened_name_p (oprnd0, *last_stmt, &half_type, &stmt, true))
     return NULL;
 
   oprnd0 = gimple_assign_rhs1 (stmt);
@@ -639,7 +730,7 @@  static gimple
 
   /* We don't allow changing the order of the computation in the inner-loop
      when doing outer-loop vectorization.  */
-  gcc_assert (!nested_in_vect_loop_p (loop, last_stmt));
+  gcc_assert (!nested_in_vect_loop_p (loop, *last_stmt));
 
   return pattern_stmt;
 }
@@ -669,23 +760,27 @@  static gimple
 
 static void
 vect_pattern_recog_1 (
-	gimple (* vect_recog_func) (gimple, tree *, tree *),
+	gimple (* vect_recog_func) (gimple *, tree *, tree *),
 	gimple_stmt_iterator si)
 {
   gimple stmt = gsi_stmt (si), pattern_stmt;
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  stmt_vec_info stmt_info;
   stmt_vec_info pattern_stmt_info;
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  loop_vec_info loop_vinfo;
   tree pattern_vectype;
   tree type_in, type_out;
   enum tree_code code;
   int i;
   gimple next;
 
-  pattern_stmt = (* vect_recog_func) (stmt, &type_in, &type_out);
+  pattern_stmt = (* vect_recog_func) (&stmt, &type_in, &type_out);
   if (!pattern_stmt)
     return;
 
+  si = gsi_for_stmt (stmt);
+  stmt_info = vinfo_for_stmt (stmt);
+  loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+ 
   if (VECTOR_MODE_P (TYPE_MODE (type_in)))
     {
       /* No need to check target support (already checked by the pattern
@@ -832,7 +927,7 @@  vect_pattern_recog (loop_vec_info loop_vinfo)
   unsigned int nbbs = loop->num_nodes;
   gimple_stmt_iterator si;
   unsigned int i, j;
-  gimple (* vect_recog_func_ptr) (gimple, tree *, tree *);
+  gimple (* vect_recog_func_ptr) (gimple *, tree *, tree *);
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_pattern_recog ===");
Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c	(revision 174475)
+++ tree-vect-stmts.c	(working copy)
@@ -3232,6 +3232,33 @@  vectorizable_type_promotion (gimple stmt, gimple_s
 	fprintf (vect_dump, "use not simple.");
       return false;
     }
+
+  op_type = TREE_CODE_LENGTH (code);
+  if (op_type == binary_op)
+    {
+      bool ok;
+
+      op1 = gimple_assign_rhs2 (stmt);
+      if (code == WIDEN_MULT_EXPR)
+        {
+	  /* For WIDEN_MULT_EXPR, if OP0 is a constant, use the type of
+	     OP1.  */
+          if (CONSTANT_CLASS_P (op0))
+            ok = vect_is_simple_use_1 (op1, loop_vinfo, NULL,
+                             &def_stmt, &def, &dt[1], &vectype_in);
+          else
+            ok = vect_is_simple_use (op1, loop_vinfo, NULL, &def_stmt, &def,
+                                     &dt[1]);
+
+          if (!ok)
+            {
+	      if (vect_print_dump_info (REPORT_DETAILS))
+	        fprintf (vect_dump, "use not simple.");
+              return false;
+            }
+        }        
+    }
+
   /* If op0 is an external or constant def use a vector type with
      the same size as the output vector type.  */
   if (!vectype_in)
@@ -3264,18 +3291,6 @@  vectorizable_type_promotion (gimple stmt, gimple_s
 
   gcc_assert (ncopies >= 1);
 
-  op_type = TREE_CODE_LENGTH (code);
-  if (op_type == binary_op)
-    {
-      op1 = gimple_assign_rhs2 (stmt);
-      if (!vect_is_simple_use (op1, loop_vinfo, NULL, &def_stmt, &def, &dt[1]))
-        {
-	  if (vect_print_dump_info (REPORT_DETAILS))
-	    fprintf (vect_dump, "use not simple.");
-          return false;
-        }
-    }
-
   /* Supportable by target?  */
   if (!supportable_widening_operation (code, stmt, vectype_out, vectype_in,
 				       &decl1, &decl2, &code1, &code2,
@@ -3301,6 +3316,14 @@  vectorizable_type_promotion (gimple stmt, gimple_s
     fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
                         ncopies);
 
+  if (code == WIDEN_MULT_EXPR)
+    {
+      if (CONSTANT_CLASS_P (op0))
+	op0 = fold_convert (TREE_TYPE (op1), op0);
+      else if (CONSTANT_CLASS_P (op1))
+	op1 = fold_convert (TREE_TYPE (op0), op1);
+    }
+
   /* Handle def.  */
   /* In case of multi-step promotion, we first generate promotion operations
      to the intermediate types, and then from that types to the final one.