diff mbox series

[RFC,SLP] SLP vectorization: vectorize vector constructors

Message ID eea93add-6487-2c7c-8726-fbb79a04aaa6@arm.com
State New
Headers show
Series [RFC,SLP] SLP vectorization: vectorize vector constructors | expand

Commit Message

Joel Oct. 1, 2019, 11:14 a.m. UTC
On 01/10/2019 12:07, Joel wrote:
>

> SLP vectorization: vectorize vector constructors

>

>

> Currently SLP vectorization can build SLP trees staring from 

> reductions or from group stores. This patch adds a third starting 

> point: vector constructors.

>

>

> For the following test case (compiled with -O3 -fno-vect-cost-model):

>

>

> char g_d[1024], g_s1[1024], g_s2[1024]; void test_loop(void) { char /d 

> = g_d, /s1 = g_s1, *s2 = g_s2;

>

>

> for ( int y = 0; y < 128; y++ )

> {

>    for ( int x = 0; x < 16; x++ )

>      d[x] = s1[x] + s2[x];

>    d += 16;

> }

>

> }

>

>

> before patch: test_loop: .LFB0: .cfi_startproc adrp x0, g_s1 adrp x2, 

> g_s2 add x3, x0, :lo12:g_s1 add x4, x2, :lo12:g_s2 ldrb w7, [x2, 

> #:lo12:g_s2] ldrb w1, [x0, #:lo12:g_s1] adrp x0, g_d ldrb w6, [x4, 1] 

> add x0, x0, :lo12:g_d ldrb w5, [x3, 1] add w1, w1, w7 fmov s0, w1 ldrb 

> w7, [x4, 2] add w5, w5, w6 ldrb w1, [x3, 2] ldrb w6, [x4, 3] add x2, 

> x0, 2048 ins v0.b[1], w5 add w1, w1, w7 ldrb w7, [x3, 3] ldrb w5, [x4, 

> 4] add w7, w7, w6 ldrb w6, [x3, 4] ins v0.b[2], w1 ldrb w8, [x4, 5] 

> add w6, w6, w5 ldrb w5, [x3, 5] ldrb w9, [x4, 6] add w5, w5, w8 ldrb 

> w1, [x3, 6] ins v0.b[3], w7 ldrb w8, [x4, 7] add w1, w1, w9 ldrb w11, 

> [x3, 7] ldrb w7, [x4, 8] add w11, w11, w8 ldrb w10, [x3, 8] ins 

> v0.b[4], w6 ldrb w8, [x4, 9] add w10, w10, w7 ldrb w9, [x3, 9] ldrb 

> w7, [x4, 10] add w9, w9, w8 ldrb w8, [x3, 10] ins v0.b[5], w5 ldrb w6, 

> [x4, 11] add w8, w8, w7 ldrb w7, [x3, 11] ldrb w5, [x4, 12] add w7, 

> w7, w6 ldrb w6, [x3, 12] ins v0.b[6], w1 ldrb w12, [x4, 13] add w6, 

> w6, w5 ldrb w5, [x3, 13] ldrb w1, [x3, 14] add w5, w5, w12 ldrb w13, 

> [x4, 14] ins v0.b[7], w11 ldrb w12, [x4, 15] add w4, w1, w13 ldrb w1, 

> [x3, 15] add w1, w1, w12 ins v0.b[8], w10 ins v0.b[9], w9 ins 

> v0.b[10], w8 ins v0.b[11], w7 ins v0.b[12], w6 ins v0.b[13], w5 ins 

> v0.b[14], w4 ins v0.b[15], w1 .p2align 3,,7 .L2: str q0, [x0], 16 cmp 

> x2, x0 bne .L2 ret .cfi_endproc .LFE0:

>

>

> After patch:

>

>

> test_loop: .LFB0: .cfi_startproc adrp x3, g_s1 adrp x2, g_s2 add x3, 

> x3, :lo12:g_s1 add x2, x2, :lo12:g_s2 adrp x0, g_d add x0, x0, 

> :lo12:g_d add x1, x0, 2048 ldr q1, [x2] ldr q0, [x3] add v0.16b, 

> v0.16b, v1.16b .p2align 3,,7 .L2: str q0, [x0], 16 cmp x0, x1 bne .L2 

> ret .cfi_endproc .LFE0:

>

>

>

>

> bootstrapped and tested on aarch64-none-linux-gnu

>

Patch attached:

Comments

Richard Biener Oct. 1, 2019, 11:42 a.m. UTC | #1
On Tue, 1 Oct 2019, Joel Hutton wrote:

> On 01/10/2019 12:07, Joel wrote:

> >

> > SLP vectorization: vectorize vector constructors

> >

> >

> > Currently SLP vectorization can build SLP trees staring from 

> > reductions or from group stores. This patch adds a third starting 

> > point: vector constructors.

> >

> >

> > For the following test case (compiled with -O3 -fno-vect-cost-model):

> >

> >

> > char g_d[1024], g_s1[1024], g_s2[1024]; void test_loop(void) { char /d 

> > = g_d, /s1 = g_s1, *s2 = g_s2;

> >

> >

> > for ( int y = 0; y < 128; y++ )

> > {

> >    for ( int x = 0; x < 16; x++ )

> >      d[x] = s1[x] + s2[x];

> >    d += 16;

> > }

> >

> > }

> >

> >

> > before patch: test_loop: .LFB0: .cfi_startproc adrp x0, g_s1 adrp x2, 

> > g_s2 add x3, x0, :lo12:g_s1 add x4, x2, :lo12:g_s2 ldrb w7, [x2, 

> > #:lo12:g_s2] ldrb w1, [x0, #:lo12:g_s1] adrp x0, g_d ldrb w6, [x4, 1] 

> > add x0, x0, :lo12:g_d ldrb w5, [x3, 1] add w1, w1, w7 fmov s0, w1 ldrb 

> > w7, [x4, 2] add w5, w5, w6 ldrb w1, [x3, 2] ldrb w6, [x4, 3] add x2, 

> > x0, 2048 ins v0.b[1], w5 add w1, w1, w7 ldrb w7, [x3, 3] ldrb w5, [x4, 

> > 4] add w7, w7, w6 ldrb w6, [x3, 4] ins v0.b[2], w1 ldrb w8, [x4, 5] 

> > add w6, w6, w5 ldrb w5, [x3, 5] ldrb w9, [x4, 6] add w5, w5, w8 ldrb 

> > w1, [x3, 6] ins v0.b[3], w7 ldrb w8, [x4, 7] add w1, w1, w9 ldrb w11, 

> > [x3, 7] ldrb w7, [x4, 8] add w11, w11, w8 ldrb w10, [x3, 8] ins 

> > v0.b[4], w6 ldrb w8, [x4, 9] add w10, w10, w7 ldrb w9, [x3, 9] ldrb 

> > w7, [x4, 10] add w9, w9, w8 ldrb w8, [x3, 10] ins v0.b[5], w5 ldrb w6, 

> > [x4, 11] add w8, w8, w7 ldrb w7, [x3, 11] ldrb w5, [x4, 12] add w7, 

> > w7, w6 ldrb w6, [x3, 12] ins v0.b[6], w1 ldrb w12, [x4, 13] add w6, 

> > w6, w5 ldrb w5, [x3, 13] ldrb w1, [x3, 14] add w5, w5, w12 ldrb w13, 

> > [x4, 14] ins v0.b[7], w11 ldrb w12, [x4, 15] add w4, w1, w13 ldrb w1, 

> > [x3, 15] add w1, w1, w12 ins v0.b[8], w10 ins v0.b[9], w9 ins 

> > v0.b[10], w8 ins v0.b[11], w7 ins v0.b[12], w6 ins v0.b[13], w5 ins 

> > v0.b[14], w4 ins v0.b[15], w1 .p2align 3,,7 .L2: str q0, [x0], 16 cmp 

> > x2, x0 bne .L2 ret .cfi_endproc .LFE0:

> >

> >

> > After patch:

> >

> >

> > test_loop: .LFB0: .cfi_startproc adrp x3, g_s1 adrp x2, g_s2 add x3, 

> > x3, :lo12:g_s1 add x2, x2, :lo12:g_s2 adrp x0, g_d add x0, x0, 

> > :lo12:g_d add x1, x0, 2048 ldr q1, [x2] ldr q0, [x3] add v0.16b, 

> > v0.16b, v1.16b .p2align 3,,7 .L2: str q0, [x0], 16 cmp x0, x1 bne .L2 

> > ret .cfi_endproc .LFE0:

> >

> >

> >

> >

> > bootstrapped and tested on aarch64-none-linux-gnu

> >

> Patch attached:


I think a better place for the loop searching for CONSTRUCTORs is
vect_slp_analyze_bb_1 where I'd put it before the check you remove,
and I'd simply append found CONSTRUCTORs to the grouped_stores
array.  The fixup you do in vectorizable_operation doesn't
belong there either, I'd add a new field to the SLP instance
structure refering to the CONSTRUCTOR stmt and do the fixup
in vect_schedule_slp_instance instead where you can simply
replace the CONSTRUCTOR with the vectorized SSA name then.

+           /* Check that the constructor elements are unique.  */
+           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
+             {
+               tree prev_val;
+               int j;
+               FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, 
prev_val)
+               {
+                 if (val == prev_val && i!=j)

why's that necessary? (it looks incomplete, also doesn't catch
[duplicate] constants)

You miss to check that CONSTRUCTOR_NELTS == TYPE_VECTOR_SUBPARTS
(we can have omitted trailing zeros).

What happens if you have a vector constructor that is twice
as large as the machine supports?  The vectorizer will happily
produce a two vector SSA name vectorized result but your
CONSTRUCTOR replacement doesn't work here.  I think this should
be made work correctly (not give up on that case).

Thanks,
Richard.
diff mbox series

Patch

From 7b9e6d02017ffe6f7ab17cbdd48da41ccc5f6db0 Mon Sep 17 00:00:00 2001
From: Joel Hutton <Joel.Hutton@arm.com>
Date: Fri, 27 Sep 2019 10:26:00 +0100
Subject: [PATCH] SLP vectorization: vectorize vector constructors

---
 gcc/tree-vect-slp.c   | 98 ++++++++++++++++++++++++++++++++++++-------
 gcc/tree-vect-stmts.c | 20 +++++++++
 2 files changed, 103 insertions(+), 15 deletions(-)

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 9b86b67734ad3e3506e9cee6a532b68decf24ae6..4c715ebe34dbdb8072e15dc9053f53a1949a070d 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -1923,6 +1923,8 @@  vect_analyze_slp_instance (vec_info *vinfo,
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   vec<stmt_vec_info> scalar_stmts;
 
+  bool constructor = false;
+
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
       scalar_type = TREE_TYPE (DR_REF (dr));
@@ -1935,6 +1937,17 @@  vect_analyze_slp_instance (vec_info *vinfo,
       vectype = STMT_VINFO_VECTYPE (stmt_info);
       group_size = REDUC_GROUP_SIZE (stmt_info);
     }
+  else if (is_gimple_assign (stmt_info->stmt)
+      && TREE_CODE (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)))
+	== VECTOR_TYPE
+      && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
+    {
+      vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
+      group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
+      constructor = true;
+      if (TREE_CODE (vectype) != VECTOR_TYPE)
+	vectype = NULL;
+    }
   else
     {
       gcc_assert (is_a <loop_vec_info> (vinfo));
@@ -1981,6 +1994,32 @@  vect_analyze_slp_instance (vec_info *vinfo,
       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
 	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
     }
+  else if (constructor)
+    {
+      tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
+      tree val;
+      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
+	{
+	  tree prev_val;
+	  int j;
+	  FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, prev_val)
+	    {
+	      if (val == prev_val && i!=j)
+		return false;
+	    }
+	  if (TREE_CODE (val) == SSA_NAME)
+	    {
+	      gimple* def = SSA_NAME_DEF_STMT (val);
+	      stmt_vec_info def_info = vinfo->lookup_stmt (def);
+	      /* Value is defined in another basic block.  */
+	      if (!def_info)
+		return false;
+	      scalar_stmts.safe_push (def_info);
+	    }
+	    else
+	      return false;
+	}
+    }
   else
     {
       /* Collect reduction statements.  */
@@ -2227,6 +2266,50 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 				   max_tree_size);
     }
 
+  /* Find SLP sequences starting from a vector constructor.  */
+  gimple_stmt_iterator gsi;
+  if (is_a <bb_vec_info> (vinfo))
+    {
+      bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
+
+      for (gsi = bb_vinfo->region_begin;
+	  gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
+      {
+	gimple *stmt = gsi_stmt (gsi);
+	bool vectorizable = true;
+	if ( is_gimple_assign (stmt)
+	    && gimple_assign_rhs_code (stmt) == CONSTRUCTOR
+	    && TREE_CODE (TREE_TYPE (gimple_assign_lhs (stmt))) == VECTOR_TYPE)
+	  {
+	    tree rhs = gimple_assign_rhs1 (stmt);
+	    tree val;
+	    int i;
+	    /* Check that the constructor elements are unique.  */
+	    FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
+	      {
+		tree prev_val;
+		int j;
+		FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, prev_val)
+		{
+		  if (val == prev_val && i!=j)
+		    {
+		      vectorizable = false;
+		      break;
+		    }
+		}
+	      }
+
+	    if (vectorizable)
+	      {
+		first_element = bb_vinfo->lookup_stmt (stmt);
+		vect_analyze_slp_instance (vinfo, first_element,
+		    max_tree_size);
+	      }
+	  }
+      }
+    }
+
+
   return opt_result::success ();
 }
 
@@ -2908,21 +2991,6 @@  vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
       return NULL;
     }
 
-  /* If there are no grouped stores in the region there is no need
-     to continue with pattern recog as vect_analyze_slp will fail
-     anyway.  */
-  if (bb_vinfo->grouped_stores.is_empty ())
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "not vectorized: no grouped stores in "
-			 "basic block.\n");
-
-      delete bb_vinfo;
-      return NULL;
-    }
-
-  /* While the rest of the analysis below depends on it in some way.  */
   fatal = false;
 
   vect_pattern_recog (bb_vinfo);
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index b1e97f85d96b352e9e52bbb7c265dda6e1d0f3ad..9e1a762f922d069da4e5bcc3cb05df2bc3d1acff 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -6237,6 +6237,26 @@  vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	  gimple_assign_set_lhs (new_stmt, new_temp);
 	  new_stmt_info
 	    = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+
+	  /* For vector constructors, the same SSA name must be used, so that
+	   data flow into other basic blocks is maintained.  */
+	  imm_use_iterator use_iter;
+	  gimple *use_stmt;
+	  tree lhs = gimple_get_lhs (stmt_info->stmt);
+	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, lhs)
+	    {
+	      if (is_gimple_assign (use_stmt)
+		  && gimple_assign_rhs_code (use_stmt) == CONSTRUCTOR
+		  && use_stmt->bb == stmt_info->stmt->bb)
+		{
+		  gassign *rstmt
+		    = gimple_build_assign (gimple_get_lhs (use_stmt),
+					   gimple_get_lhs (new_stmt));
+		  gimple_stmt_iterator rgsi = gsi_for_stmt (use_stmt);
+		  gsi_replace (&rgsi, rstmt, true);
+		}
+	    }
+
 	  if (vec_cvt_dest)
 	    {
 	      new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp);
-- 
2.17.1