diff mbox series

[SLP] SLP vectorization: vectorize vector constructors

Message ID 5edb0b00-4ae2-41c0-80ec-76de15d0b110@arm.com
State New
Headers show
Series [SLP] SLP vectorization: vectorize vector constructors | expand

Commit Message

Joel Oct. 11, 2019, 3:32 p.m. UTC
Hi Richard,

Thanks for your help, I've reworked my SLP RFC based on your feedback.
 > I think a better place for the loop searching for CONSTRUCTORs is

 > vect_slp_analyze_bb_1 where I'd put it before the check you remove,

 > and I'd simply append found CONSTRUCTORs to the grouped_stores

 > array

I've moved this check into a separate function and called it from 
vect_slp_analyze_bb_1

 > The fixup you do in vectorizable_operation doesn't

 > belong there either, I'd add a new field to the SLP instance

 > structure refering to the CONSTRUCTOR stmt and do the fixup

 > in vect_schedule_slp_instance instead where you can simply

 > replace the CONSTRUCTOR with the vectorized SSA name then.


Done.

 > +           /* Check that the constructor elements are unique.  */

 > +           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)

 > +             {

 > +               tree prev_val;

 > +               int j;

 > +               FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j,

 > prev_val)

 > +               {

 > +                 if (val == prev_val && i!=j)

 >

 > why's that necessary? (it looks incomplete, also doesn't catch

 > [duplicate] constants)


The thinking was that there was no benefit in vectorizing a constructor of
duplicates, or a vector of constants, although now you mention it that 
thinking
may be flawed. I've removed it

 > You miss to check that CONSTRUCTOR_NELTS == TYPE_VECTOR_SUBPARTS

 > (we can have omitted trailing zeros).

...
 > What happens if you have a vector constructor that is twice

 > as large as the machine supports?  The vectorizer will happily

 > produce a two vector SSA name vectorized result but your

 > CONSTRUCTOR replacement doesn't work here.  I think this should

 > be made work correctly (not give up on that case).


I've reworked the patch to account for this, by checking that the 
vectorized version
has one vectorized stmt at the root of the SLP tree. I'm not sure how to 
handle
a vector constructor twice as large as the machine supports, as far as I 
can see,
when only analyzing a within a basic block, the SSA name of the 
constructor has to
be maintained.


Currently SLP vectorization can build SLP trees starting from reductions or
from group stores. This patch adds a third starting point: vector 
constructors.

For the following test case (compiled with -O3):

char g_d[1024], g_s1[1024], g_s2[1024];
void test_loop(void)
{
   char d = g_d, s1 = g_s1, *s2 = g_s2;
   for ( int y = 0; y < 128; y++ )

   {
     for ( int x = 0; x < 16; x++ )
       d[x] = s1[x] + s2[x];
     d += 16;
   }
}

before patch:
test_loop:
.LFB0:
         .cfi_startproc
         adrp    x0, g_s1
         adrp    x2, g_s2
         add     x3, x0, :lo12:g_s1
         add     x4, x2, :lo12:g_s2
         ldrb    w7, [x2, #:lo12:g_s2]
         ldrb    w1, [x0, #:lo12:g_s1]
         adrp    x0, g_d
         ldrb    w6, [x4, 1]
         add     x0, x0, :lo12:g_d
         ldrb    w5, [x3, 1]
         add     w1, w1, w7
         fmov    s0, w1
         ldrb    w7, [x4, 2]
         add     w5, w5, w6
         ldrb    w1, [x3, 2]
         ldrb    w6, [x4, 3]
         add     x2, x0, 2048
         ins     v0.b[1], w5
         add     w1, w1, w7
         ldrb    w7, [x3, 3]
         ldrb    w5, [x4, 4]
         add     w7, w7, w6
         ldrb    w6, [x3, 4]
         ins     v0.b[2], w1
         ldrb    w8, [x4, 5]
         add     w6, w6, w5
         ldrb    w5, [x3, 5]
         ldrb    w9, [x4, 6]
         add     w5, w5, w8
         ldrb    w1, [x3, 6]
         ins     v0.b[3], w7
         ldrb    w8, [x4, 7]
         add     w1, w1, w9
         ldrb    w11, [x3, 7]
         ldrb    w7, [x4, 8]
         add     w11, w11, w8
         ldrb    w10, [x3, 8]
         ins     v0.b[4], w6
         ldrb    w8, [x4, 9]
         add     w10, w10, w7
         ldrb    w9, [x3, 9]
         ldrb    w7, [x4, 10]
         add     w9, w9, w8
         ldrb    w8, [x3, 10]
         ins     v0.b[5], w5
         ldrb    w6, [x4, 11]
         add     w8, w8, w7
         ldrb    w7, [x3, 11]
         ldrb    w5, [x4, 12]
         add     w7, w7, w6
         ldrb    w6, [x3, 12]
         ins     v0.b[6], w1
         ldrb    w12, [x4, 13]
         add     w6, w6, w5
         ldrb    w5, [x3, 13]
         ldrb    w1, [x3, 14]
         add     w5, w5, w12
         ldrb    w13, [x4, 14]
         ins     v0.b[7], w11
         ldrb    w12, [x4, 15]
         add     w4, w1, w13
         ldrb    w1, [x3, 15]
         add     w1, w1, w12
         ins     v0.b[8], w10
         ins     v0.b[9], w9
         ins     v0.b[10], w8
         ins     v0.b[11], w7
         ins     v0.b[12], w6
         ins     v0.b[13], w5
         ins     v0.b[14], w4
         ins     v0.b[15], w1
         .p2align 3,,7
.L2:
         str     q0, [x0], 16
         cmp     x2, x0
         bne     .L2
         ret
         .cfi_endproc
.LFE0:

After patch:

test_loop:
.LFB0:
         .cfi_startproc
         adrp    x3, g_s1
         adrp    x2, g_s2
         add     x3, x3, :lo12:g_s1
         add     x2, x2, :lo12:g_s2
         adrp    x0, g_d
         add     x0, x0, :lo12:g_d
         add     x1, x0, 2048
         ldr     q1, [x2]
         ldr     q0, [x3]
         add     v0.16b, v0.16b, v1.16b
         .p2align 3,,7
.L2:
         str     q0, [x0], 16
         cmp     x0, x1
         bne     .L2
         ret
         .cfi_endproc
.LFE0:


2019-10-11  Joel Hutton  Joel.Hutton@arm.com

     * tree-vect-slp.c (vect_analyze_slp_instance): Add case for vector 
constructors.
     (vect_bb_slp_scalar_cost): Likewise.
     (vect_ssa_use_outside_bb): New function.
     (vect_slp_check_for_constructors): New function.
     (vect_slp_analyze_bb_1): Add check for vector constructors.
     (vect_schedule_slp_instance): Add case to fixup vector constructor 
stmt.
     * tree-vectorizer.h (SLP_INSTANCE_ROOT_STMT): New field.


gcc/testsuite/ChangeLog:

2019-10-11  Joel Hutton  Joel.Hutton@arm.com

     * gcc.dg/vect/bb-slp-40.c: New test.

bootstrapped and regression tested on aarch64-none-linux-gnu

Comments

Richard Biener Oct. 15, 2019, 12:11 p.m. UTC | #1
On Fri, 11 Oct 2019, Joel Hutton wrote:

> Hi Richard,

> 

> Thanks for your help, I've reworked my SLP RFC based on your feedback.

>  > I think a better place for the loop searching for CONSTRUCTORs is

>  > vect_slp_analyze_bb_1 where I'd put it before the check you remove,

>  > and I'd simply append found CONSTRUCTORs to the grouped_stores

>  > array

> I've moved this check into a separate function and called it from 

> vect_slp_analyze_bb_1

> 

>  > The fixup you do in vectorizable_operation doesn't

>  > belong there either, I'd add a new field to the SLP instance

>  > structure refering to the CONSTRUCTOR stmt and do the fixup

>  > in vect_schedule_slp_instance instead where you can simply

>  > replace the CONSTRUCTOR with the vectorized SSA name then.

> 

> Done.

> 

>  > +           /* Check that the constructor elements are unique.  */

>  > +           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)

>  > +             {

>  > +               tree prev_val;

>  > +               int j;

>  > +               FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j,

>  > prev_val)

>  > +               {

>  > +                 if (val == prev_val && i!=j)

>  >

>  > why's that necessary? (it looks incomplete, also doesn't catch

>  > [duplicate] constants)

> 

> The thinking was that there was no benefit in vectorizing a constructor of

> duplicates, or a vector of constants, although now you mention it that 

> thinking

> may be flawed. I've removed it

> 

>  > You miss to check that CONSTRUCTOR_NELTS == TYPE_VECTOR_SUBPARTS

>  > (we can have omitted trailing zeros).


^^^

I don't see this being handled?  You give up on non-SSA names
but not on the omitted trailing zeros.

> ...

>  > What happens if you have a vector constructor that is twice

>  > as large as the machine supports?  The vectorizer will happily

>  > produce a two vector SSA name vectorized result but your

>  > CONSTRUCTOR replacement doesn't work here.  I think this should

>  > be made work correctly (not give up on that case).

> 

> I've reworked the patch to account for this, by checking that the 

> vectorized version

> has one vectorized stmt at the root of the SLP tree. I'm not sure how to 

> handle

> a vector constructor twice as large as the machine supports, as far as I 

> can see,

> when only analyzing a within a basic block, the SSA name of the 

> constructor has to

> be maintained.


You build a CONSTRUCTOR of vectors, thus

_orig_ssa_1 = { vect_part1_2, vect_part2_3, ... };


+
+         if (constructor)
+           {
+             SLP_INSTANCE_ROOT_STMT (new_instance) = stmt_info->stmt;
+           }
+         else
+           SLP_INSTANCE_ROOT_STMT (new_instance) = NULL;
+

too much vertical space, no {} around single-stmt if clauses

@@ -2725,6 +2760,10 @@ vect_bb_slp_scalar_cost (basic_block bb,
                stmt_vec_info use_stmt_info = vinfo->lookup_stmt 
(use_stmt);
                if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
                  {
+                   /* Check this is not a constructor that will be 
vectorized
+                      away.  */
+                   if (BB_VINFO_GROUPED_STORES (vinfo).contains 
(use_stmt_info))
+                       continue;

hmm, so why not set the slp type on SLP_INSTANCE_ROOT_STMT instead?
In theory the stmt should be part of the SLP tree itself but that's 
probably too awkward to be made work at the moment ;)

vect_ssa_use_outside_bb and vect_slp_check_for_constructors are new
functions which need comments.

+  /* For vector constructors, the same SSA name must be used to maintain 
data
+     flow into other basic blocks.  */
+  if (instance->root == node && SLP_INSTANCE_ROOT_STMT (instance)
+      && SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1
+      && SLP_TREE_VEC_STMTS (node).exists ())
+    {

it should read

  /* Vectorize the instance root.  */

and be in vect_schedule_slp after the vect_schedule_slp_instance.
As said above you need to handle SLP_TREE_NUMBER_OF_VEC_STMTS > 1,
you also cannot simply do "nothing" here, "failing" vectorization
(well, you probably can - DCE will remove the vectorized code - but
at least if there were calls in the SLP tree they will be mangled
by vectorization so the scalar code is wrecked).  SO it should be

 if (SLP_INSTANCE_ROOT_STMT (instance))
   .. you may not fail to replace the scalar root stmt here ..

+         if (CONSTRUCTOR_NELTS (rhs) == 0)
+           vectorizable = false;
+

if you use continue; you can elide the 'vectorizable' variable.

+         if (!vect_ssa_use_outside_bb (gimple_assign_lhs (stmt)))
+           vectorizable = false;
+

why's that?  no comments that clarify ;)  The vector may be
used as argument to a call or as source of a store.  So I'd simply
remove this check (and the function).

Thanks,
Richard.

> Currently SLP vectorization can build SLP trees starting from reductions or

> from group stores. This patch adds a third starting point: vector 

> constructors.

> 

> For the following test case (compiled with -O3):

> 

> char g_d[1024], g_s1[1024], g_s2[1024];

> void test_loop(void)

> {

>    char d = g_d, s1 = g_s1, *s2 = g_s2;

>    for ( int y = 0; y < 128; y++ )

> 

>    {

>      for ( int x = 0; x < 16; x++ )

>        d[x] = s1[x] + s2[x];

>      d += 16;

>    }

> }

> 

> before patch:

> test_loop:

> .LFB0:

>          .cfi_startproc

>          adrp    x0, g_s1

>          adrp    x2, g_s2

>          add     x3, x0, :lo12:g_s1

>          add     x4, x2, :lo12:g_s2

>          ldrb    w7, [x2, #:lo12:g_s2]

>          ldrb    w1, [x0, #:lo12:g_s1]

>          adrp    x0, g_d

>          ldrb    w6, [x4, 1]

>          add     x0, x0, :lo12:g_d

>          ldrb    w5, [x3, 1]

>          add     w1, w1, w7

>          fmov    s0, w1

>          ldrb    w7, [x4, 2]

>          add     w5, w5, w6

>          ldrb    w1, [x3, 2]

>          ldrb    w6, [x4, 3]

>          add     x2, x0, 2048

>          ins     v0.b[1], w5

>          add     w1, w1, w7

>          ldrb    w7, [x3, 3]

>          ldrb    w5, [x4, 4]

>          add     w7, w7, w6

>          ldrb    w6, [x3, 4]

>          ins     v0.b[2], w1

>          ldrb    w8, [x4, 5]

>          add     w6, w6, w5

>          ldrb    w5, [x3, 5]

>          ldrb    w9, [x4, 6]

>          add     w5, w5, w8

>          ldrb    w1, [x3, 6]

>          ins     v0.b[3], w7

>          ldrb    w8, [x4, 7]

>          add     w1, w1, w9

>          ldrb    w11, [x3, 7]

>          ldrb    w7, [x4, 8]

>          add     w11, w11, w8

>          ldrb    w10, [x3, 8]

>          ins     v0.b[4], w6

>          ldrb    w8, [x4, 9]

>          add     w10, w10, w7

>          ldrb    w9, [x3, 9]

>          ldrb    w7, [x4, 10]

>          add     w9, w9, w8

>          ldrb    w8, [x3, 10]

>          ins     v0.b[5], w5

>          ldrb    w6, [x4, 11]

>          add     w8, w8, w7

>          ldrb    w7, [x3, 11]

>          ldrb    w5, [x4, 12]

>          add     w7, w7, w6

>          ldrb    w6, [x3, 12]

>          ins     v0.b[6], w1

>          ldrb    w12, [x4, 13]

>          add     w6, w6, w5

>          ldrb    w5, [x3, 13]

>          ldrb    w1, [x3, 14]

>          add     w5, w5, w12

>          ldrb    w13, [x4, 14]

>          ins     v0.b[7], w11

>          ldrb    w12, [x4, 15]

>          add     w4, w1, w13

>          ldrb    w1, [x3, 15]

>          add     w1, w1, w12

>          ins     v0.b[8], w10

>          ins     v0.b[9], w9

>          ins     v0.b[10], w8

>          ins     v0.b[11], w7

>          ins     v0.b[12], w6

>          ins     v0.b[13], w5

>          ins     v0.b[14], w4

>          ins     v0.b[15], w1

>          .p2align 3,,7

> .L2:

>          str     q0, [x0], 16

>          cmp     x2, x0

>          bne     .L2

>          ret

>          .cfi_endproc

> .LFE0:

> 

> After patch:

> 

> test_loop:

> .LFB0:

>          .cfi_startproc

>          adrp    x3, g_s1

>          adrp    x2, g_s2

>          add     x3, x3, :lo12:g_s1

>          add     x2, x2, :lo12:g_s2

>          adrp    x0, g_d

>          add     x0, x0, :lo12:g_d

>          add     x1, x0, 2048

>          ldr     q1, [x2]

>          ldr     q0, [x3]

>          add     v0.16b, v0.16b, v1.16b

>          .p2align 3,,7

> .L2:

>          str     q0, [x0], 16

>          cmp     x0, x1

>          bne     .L2

>          ret

>          .cfi_endproc

> .LFE0:

> 

> 

> 2019-10-11  Joel Hutton  Joel.Hutton@arm.com

> 

>      * tree-vect-slp.c (vect_analyze_slp_instance): Add case for vector 

> constructors.

>      (vect_bb_slp_scalar_cost): Likewise.

>      (vect_ssa_use_outside_bb): New function.

>      (vect_slp_check_for_constructors): New function.

>      (vect_slp_analyze_bb_1): Add check for vector constructors.

>      (vect_schedule_slp_instance): Add case to fixup vector constructor 

> stmt.

>      * tree-vectorizer.h (SLP_INSTANCE_ROOT_STMT): New field.

> 

> 

> gcc/testsuite/ChangeLog:

> 

> 2019-10-11  Joel Hutton  Joel.Hutton@arm.com

> 

>      * gcc.dg/vect/bb-slp-40.c: New test.

> 

> bootstrapped and regression tested on aarch64-none-linux-gnu

> 


-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
diff mbox series

Patch

From 2bc57c17faa1dd494ed3898298e9fbe91f8a8675 Mon Sep 17 00:00:00 2001
From: Joel Hutton <Joel.Hutton@arm.com>
Date: Wed, 2 Oct 2019 17:38:53 +0100
Subject: [PATCH] SLP Vectorization: Vectorize Vector Constructors

---
 gcc/testsuite/gcc.dg/vect/bb-slp-40.c |  33 +++++++
 gcc/tree-vect-slp.c                   | 127 ++++++++++++++++++++++++++
 gcc/tree-vectorizer.h                 |   5 +
 3 files changed, 165 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-40.c

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-40.c b/gcc/testsuite/gcc.dg/vect/bb-slp-40.c
new file mode 100644
index 0000000000000000000000000000000000000000..51566b716bcda2fe82f50c50e9e9685cb3eb10ae
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-40.c
@@ -0,0 +1,33 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-slp-all" } */
+/* { dg-require-effective-target vect_int } */
+
+char g_d[1024], g_s1[1024], g_s2[1024];
+void foo(void)
+{
+    char *d = g_d, *s1 = g_s1, *s2 = g_s2;
+
+    for ( int y = 0; y < 128; y++ )
+    {
+      d[0 ] = s1[0 ] + s2[0 ];
+      d[1 ] = s1[1 ] + s2[1 ];
+      d[2 ] = s1[2 ] + s2[2 ];
+      d[3 ] = s1[3 ] + s2[3 ];
+      d[4 ] = s1[4 ] + s2[4 ];
+      d[5 ] = s1[5 ] + s2[5 ];
+      d[6 ] = s1[6 ] + s2[6 ];
+      d[7 ] = s1[7 ] + s2[7 ];
+      d[8 ] = s1[8 ] + s2[8 ];
+      d[9 ] = s1[9 ] + s2[9 ];
+      d[10] = s1[10] + s2[10];
+      d[11] = s1[11] + s2[11];
+      d[12] = s1[12] + s2[12];
+      d[13] = s1[13] + s2[13];
+      d[14] = s1[14] + s2[14];
+      d[15] = s1[15] + s2[15];
+      d += 16;
+    }
+}
+
+/* See that we vectorize an SLP instance.  */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "slp1" } } */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 9b86b67734ad3e3506e9cee6a532b68decf24ae6..c4d452e3dfd46acdaa94dc047e4c6114d8295458 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -1922,6 +1922,7 @@  vect_analyze_slp_instance (vec_info *vinfo,
   unsigned int i;
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   vec<stmt_vec_info> scalar_stmts;
+  bool constructor = false;
 
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
@@ -1935,6 +1936,13 @@  vect_analyze_slp_instance (vec_info *vinfo,
       vectype = STMT_VINFO_VECTYPE (stmt_info);
       group_size = REDUC_GROUP_SIZE (stmt_info);
     }
+  else if (is_gimple_assign (stmt_info->stmt)
+	    && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
+    {
+      vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
+      group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
+      constructor = true;
+    }
   else
     {
       gcc_assert (is_a <loop_vec_info> (vinfo));
@@ -1981,6 +1989,25 @@  vect_analyze_slp_instance (vec_info *vinfo,
       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
 	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
     }
+  else if (constructor)
+    {
+      tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
+      tree val;
+      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
+	{
+	  if (TREE_CODE (val) == SSA_NAME)
+	    {
+	      gimple* def = SSA_NAME_DEF_STMT (val);
+	      stmt_vec_info def_info = vinfo->lookup_stmt (def);
+	      /* Value is defined in another basic block.  */
+	      if (!def_info)
+		return false;
+	      scalar_stmts.safe_push (def_info);
+	    }
+	  else
+	    return false;
+	}
+    }
   else
     {
       /* Collect reduction statements.  */
@@ -2038,6 +2065,14 @@  vect_analyze_slp_instance (vec_info *vinfo,
 	  SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
 	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
 	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
+
+	  if (constructor)
+	    {
+	      SLP_INSTANCE_ROOT_STMT (new_instance) = stmt_info->stmt;
+	    }
+	  else
+	    SLP_INSTANCE_ROOT_STMT (new_instance) = NULL;
+
 	  vect_gather_slp_loads (new_instance, node);
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_NOTE, vect_location,
@@ -2725,6 +2760,10 @@  vect_bb_slp_scalar_cost (basic_block bb,
 		stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
 		if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
 		  {
+		    /* Check this is not a constructor that will be vectorized
+		       away.  */
+		    if (BB_VINFO_GROUPED_STORES (vinfo).contains (use_stmt_info))
+			continue;
 		    (*life)[i] = true;
 		    BREAK_FROM_IMM_USE_STMT (use_iter);
 		  }
@@ -2836,6 +2875,72 @@  vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
   return true;
 }
 
+static bool
+vect_ssa_use_outside_bb (tree ssa)
+{
+  imm_use_iterator use_iter;
+  gimple *use_stmt;
+  bool use_outside_of_block = false;
+  gcc_checking_assert (TREE_CODE (ssa) == SSA_NAME);
+  gimple* def = SSA_NAME_DEF_STMT (ssa);
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, ssa)
+  {
+    if (use_stmt->bb != def->bb)
+    {
+      use_outside_of_block = true;
+      BREAK_FROM_IMM_USE_STMT (use_iter);
+    }
+    /* In the following pattern, we consider _1 and vect_1
+       equivalent.
+       _1 = {a,b,c}
+       vect_1 = _1  */
+    else if (is_gimple_assign (use_stmt)
+	     && gimple_assign_rhs_code (use_stmt) == SSA_NAME
+	     && TREE_CODE (gimple_assign_lhs (use_stmt)) == SSA_NAME)
+    {
+      use_outside_of_block = vect_ssa_use_outside_bb (gimple_assign_lhs (use_stmt));
+      BREAK_FROM_IMM_USE_STMT (use_iter);
+    }
+    else
+      BREAK_FROM_IMM_USE_STMT (use_iter);
+  }
+  return use_outside_of_block;
+}
+
+static void
+vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
+{
+  gimple_stmt_iterator gsi;
+
+  for (gsi = bb_vinfo->region_begin;
+      gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      bool vectorizable = true;
+
+      if (is_gimple_assign (stmt)
+	  && gimple_assign_rhs_code (stmt) == CONSTRUCTOR
+	  && TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME
+	  && TREE_CODE (TREE_TYPE (gimple_assign_lhs (stmt))) == VECTOR_TYPE)
+	{
+	  tree rhs = gimple_assign_rhs1 (stmt);
+
+	  if (CONSTRUCTOR_NELTS (rhs) == 0)
+	    vectorizable = false;
+
+	  if (!vect_ssa_use_outside_bb (gimple_assign_lhs (stmt)))
+	    vectorizable = false;
+
+	  if (vectorizable)
+	    {
+	      stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt);
+	      BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
+	    }
+	}
+    }
+}
+
 /* Check if the basic block can be vectorized.  Returns a bb_vec_info
    if so and sets fatal to true if failure is independent of
    current_vector_size.  */
@@ -2908,6 +3013,8 @@  vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
       return NULL;
     }
 
+  vect_slp_check_for_constructors (bb_vinfo);
+
   /* If there are no grouped stores in the region there is no need
      to continue with pattern recog as vect_analyze_slp will fail
      anyway.  */
@@ -4053,6 +4160,26 @@  vect_schedule_slp_instance (slp_tree node, slp_instance instance,
 	FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
 	  STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
       }
+
+  /* For vector constructors, the same SSA name must be used to maintain data
+     flow into other basic blocks.  */
+  if (instance->root == node && SLP_INSTANCE_ROOT_STMT (instance)
+      && SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1
+      && SLP_TREE_VEC_STMTS (node).exists ())
+    {
+	  stmt_vec_info child_stmt_info;
+	  int j;
+	  FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
+	    {
+		gassign *rstmt
+		  = gimple_build_assign (gimple_get_lhs (instance->root_stmt),
+					 gimple_get_lhs (child_stmt_info->stmt));
+		gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt);
+		gsi_replace (&rgsi, rstmt, true);
+		break;
+	    }
+    }
+
 }
 
 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 837fb5ab52537cdf95a413557335b30704f9dc26..906579f0bc0efce955a1cde177fe1404ea8ce843 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -149,6 +149,10 @@  public:
   /* The root of SLP tree.  */
   slp_tree root;
 
+  /* For vector constructors, the constructor stmt that the SLP tree is built
+     from, NULL otherwise.  */
+  gimple *root_stmt;
+
   /* Size of groups of scalar stmts that will be replaced by SIMD stmt/s.  */
   unsigned int group_size;
 
@@ -168,6 +172,7 @@  public:
 #define SLP_INSTANCE_GROUP_SIZE(S)               (S)->group_size
 #define SLP_INSTANCE_UNROLLING_FACTOR(S)         (S)->unrolling_factor
 #define SLP_INSTANCE_LOADS(S)                    (S)->loads
+#define SLP_INSTANCE_ROOT_STMT(S)                (S)->root_stmt
 
 #define SLP_TREE_CHILDREN(S)                     (S)->children
 #define SLP_TREE_SCALAR_STMTS(S)                 (S)->stmts
-- 
2.17.1