diff mbox

Support subchains of interleaving chains in basic block SLP

Message ID CAKSNEw5aRw6MS8JqnNFK45UJ-9bFwMeOREoJDYy3w4sY8AZRWA@mail.gmail.com
State New
Headers show

Commit Message

Ira Rosen Oct. 16, 2011, 10:47 a.m. UTC
Hi,

This patch allows to vectorize a subchain of interleaved loads in
basic block SLP (in loop vectorization this would be more complicated
because of loop peeling). This patch also swaps operands if necessary
(and possible) to make operations isomorphic.

Bootstrapped and tested on powerpc64-suse-linux.
Committed.

Ira

ChangeLog:

        * tree-vect-stmts.c (vectorizable_load): For SLP without permutation
        treat the first load of the node as the first element in its
        interleaving chain.
        * tree-vect-slp.c (vect_get_and_check_slp_defs): Swap the operands if
        necessary and possible.
        (vect_build_slp_tree): Add new argument.  Allow load groups of any size
        in basic blocks.  Keep all the loads for further permutation check.
        Use the new argument to determine if there is a permutation.  Update
        the recursive calls.
        (vect_supported_load_permutation_p): Allow subchains of interleaving
        chains in basic block vectorization.
        (vect_analyze_slp_instance): Update the call to vect_build_slp_tree.
        Check load permutation based on the new parameter.
        (vect_schedule_slp_instance): Don't start from the first element in
        interleaving chain unless the loads are permuted.

testsuite/ChangeLog:

        * gcc.dg/vect/bb-slp-29.c: New test.
diff mbox

Patch

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 180052)
+++ ChangeLog	(working copy)
@@ -1,3 +1,21 @@ 
+2011-10-16 Ira Rosen  <ira.rosen@linaro.org>
+
+	* tree-vect-stmts.c (vectorizable_load): For SLP without permutation
+	treat the first load of the node as the first element in its
+	interleaving chain.
+	* tree-vect-slp.c (vect_get_and_check_slp_defs): Swap the operands if
+	necessary and possible.
+	(vect_build_slp_tree): Add new argument.  Allow load groups of any size
+	in basic blocks.  Keep all the loads for further permutation check.
+	Use the new argument to determine if there is a permutation.  Update
+	the recursive calls.
+	(vect_supported_load_permutation_p): Allow subchains of interleaving
+	chains in basic block vectorization.
+	(vect_analyze_slp_instance): Update the call to vect_build_slp_tree.
+	Check load permutation based on the new parameter.
+	(vect_schedule_slp_instance): Don't start from the first element in
+	interleaving chain unless the loads are permuted.
+
 2011-10-15  Richard Henderson  <rth@redhat.com>
 
 	* tree-vect-slp.c: Include langhooks.h.
Index: testsuite/gcc.dg/vect/bb-slp-29.c
===================================================================
--- testsuite/gcc.dg/vect/bb-slp-29.c	(revision 0)
+++ testsuite/gcc.dg/vect/bb-slp-29.c	(revision 0)
@@ -0,0 +1,59 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define A 3
+#define B 4
+#define N 256
+
+short src[N], dst[N];
+
+void foo (short * __restrict__ dst, short * __restrict__ src, int h, int stride, int dummy)
+{
+  int i;
+  h /= 16;
+  for (i = 0; i < h; i++)
+    {
+      dst[0] = A*src[0] + B*src[1];
+      dst[1] = A*src[1] + B*src[2];
+      dst[2] = A*src[2] + B*src[3];
+      dst[3] = A*src[3] + B*src[4];
+      dst[4] = A*src[4] + B*src[5];
+      dst[5] = A*src[5] + B*src[6];
+      dst[6] = A*src[6] + B*src[7];
+      dst[7] = A*src[7] + B*src[8];
+      dst += stride;
+      src += stride;
+      if (dummy == 32)
+        abort ();
+   }
+}
+
+
+int main (void)
+{
+  int i;
+
+  check_vect ();
+
+  for (i = 0; i < N; i++)
+    {
+       dst[i] = 0;
+       src[i] = i;
+    }
+
+  foo (dst, src, N, 8, 0);
+
+  for (i = 0; i < N/2; i++)
+    {
+      if (dst[i] != A * src[i] + B * src[i+1])
+        abort ();
+    }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp"  { target { vect_int_mult &&  vect_element_align } } } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
+
Index: testsuite/ChangeLog
===================================================================
--- testsuite/ChangeLog	(revision 180052)
+++ testsuite/ChangeLog	(working copy)
@@ -1,3 +1,7 @@ 
+2011-10-16 Ira Rosen  <ira.rosen@linaro.org>
+
+	* gcc.dg/vect/bb-slp-29.c: New test.
+
 2011-10-15  Paolo Carlini  <paolo.carlini@oracle.com>
 
 	PR c++/50732
Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c	(revision 180052)
+++ tree-vect-stmts.c	(working copy)
@@ -4241,6 +4241,11 @@  vectorizable_load (gimple stmt, gimple_stmt_iterat
   if (strided_load)
     {
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      if (slp
+          && !SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance)
+	  && first_stmt != VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0))
+        first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
+
       /* Check if the chain of loads is already vectorized.  */
       if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
 	{
Index: tree-vect-slp.c
===================================================================
--- tree-vect-slp.c	(revision 180052)
+++ tree-vect-slp.c	(working copy)
@@ -116,13 +116,15 @@  vect_get_and_check_slp_defs (loop_vec_info loop_vi
 {
   tree oprnd;
   unsigned int i, number_of_oprnds;
-  tree def;
+  tree def[2];
   gimple def_stmt;
   enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
   stmt_vec_info stmt_info =
     vinfo_for_stmt (VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0));
   enum gimple_rhs_class rhs_class;
   struct loop *loop = NULL;
+  enum tree_code rhs_code;
+  bool different_types = false;
 
   if (loop_vinfo)
     loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -134,7 +136,7 @@  vect_get_and_check_slp_defs (loop_vec_info loop_vi
     {
       oprnd = gimple_op (stmt, i + 1);
 
-      if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def,
+      if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def[i],
                                &dt[i])
 	  || (!def_stmt && dt[i] != vect_constant_def))
 	{
@@ -189,11 +191,11 @@  vect_get_and_check_slp_defs (loop_vec_info loop_vi
           switch (gimple_code (def_stmt))
             {
               case GIMPLE_PHI:
-                def = gimple_phi_result (def_stmt);
+                def[i] = gimple_phi_result (def_stmt);
                 break;
 
               case GIMPLE_ASSIGN:
-                def = gimple_assign_lhs (def_stmt);
+                def[i] = gimple_assign_lhs (def_stmt);
                 break;
 
               default:
@@ -207,8 +209,8 @@  vect_get_and_check_slp_defs (loop_vec_info loop_vi
 	{
 	  /* op0 of the first stmt of the group - store its info.  */
 	  *first_stmt_dt0 = dt[i];
-	  if (def)
-	    *first_stmt_def0_type = TREE_TYPE (def);
+	  if (def[i])
+	    *first_stmt_def0_type = TREE_TYPE (def[i]);
 	  else
 	    *first_stmt_const_oprnd = oprnd;
 
@@ -228,8 +230,8 @@  vect_get_and_check_slp_defs (loop_vec_info loop_vi
 	    {
 	      /* op1 of the first stmt of the group - store its info.  */
 	      *first_stmt_dt1 = dt[i];
-	      if (def)
-		*first_stmt_def1_type = TREE_TYPE (def);
+	      if (def[i])
+		*first_stmt_def1_type = TREE_TYPE (def[i]);
 	      else
 		{
 		  /* We assume that the stmt contains only one constant
@@ -255,24 +257,56 @@  vect_get_and_check_slp_defs (loop_vec_info loop_vi
 		   && ((*first_stmt_dt0 != dt[i]
                         && !(*first_stmt_dt0 == vect_reduction_def
                              && dt[i] == vect_internal_def))
-		       || (*first_stmt_def0_type && def
+		       || (*first_stmt_def0_type && def[0]
 			   && !types_compatible_p (*first_stmt_def0_type,
-						   TREE_TYPE (def)))))
+						   TREE_TYPE (def[0])))))
 		  || (i == 1
 		      && ((*first_stmt_dt1 != dt[i]
                            && !(*first_stmt_dt1 == vect_reduction_def
                                 && dt[i] == vect_internal_def))
-			  || (*first_stmt_def1_type && def
+			  || (*first_stmt_def1_type && def[1]
 			      && !types_compatible_p (*first_stmt_def1_type,
-						      TREE_TYPE (def)))))
-		  || (!def
+						      TREE_TYPE (def[1])))))
+		  || (!def[i]
 		      && !types_compatible_p (TREE_TYPE (*first_stmt_const_oprnd),
-					      TREE_TYPE (oprnd))))
+					      TREE_TYPE (oprnd)))
+		  || different_types)
 		{
-		  if (vect_print_dump_info (REPORT_SLP))
-		    fprintf (vect_dump, "Build SLP failed: different types ");
+                  if (i != number_of_oprnds - 1)
+                    different_types = true;
+                  else
+	            {
+                      if (is_gimple_assign (stmt)
+                          && (rhs_code = gimple_assign_rhs_code (stmt))
+ 		          && TREE_CODE_CLASS (rhs_code) == tcc_binary
+			  && commutative_tree_code (rhs_code)
+			  && *first_stmt_dt0 == dt[1]
+			  && *first_stmt_dt1 == dt[0]
+			  && def[0] && def[1]
+			  && !(*first_stmt_def0_type
+			       && !types_compatible_p (*first_stmt_def0_type,
+			                               TREE_TYPE (def[1])))
+                          && !(*first_stmt_def1_type
+                               && !types_compatible_p (*first_stmt_def1_type,
+                                                       TREE_TYPE (def[0]))))
+                        {
+                           if (vect_print_dump_info (REPORT_SLP))
+	                     {
+			       fprintf (vect_dump, "Swapping operands of ");
+ 		               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+			     }
 
-		  return false;
+ 			   swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
+ 	                                      gimple_assign_rhs2_ptr (stmt));
+			}
+                      else
+                        {
+         		  if (vect_print_dump_info (REPORT_SLP))
+	        	    fprintf (vect_dump, "Build SLP failed: different types ");
+
+		          return false;
+			}
+		    }
 		}
 	    }
 	}
@@ -286,10 +320,10 @@  vect_get_and_check_slp_defs (loop_vec_info loop_vi
 
 	case vect_internal_def:
         case vect_reduction_def:
-	  if (i == 0)
+	  if ((i == 0 && !different_types) || (i == 1 && different_types))
 	    VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
 	  else
-	    VEC_safe_push (gimple, heap, *def_stmts1, def_stmt);
+ 	    VEC_safe_push (gimple, heap, *def_stmts1, def_stmt);
 	  break;
 
 	default:
@@ -297,7 +331,7 @@  vect_get_and_check_slp_defs (loop_vec_info loop_vi
 	  if (vect_print_dump_info (REPORT_SLP))
 	    {
 	      fprintf (vect_dump, "Build SLP failed: illegal type of def ");
-	      print_generic_expr (vect_dump, def, TDF_SLIM);
+	      print_generic_expr (vect_dump, def[i], TDF_SLIM);
 	    }
 
 	  return false;
@@ -320,7 +354,7 @@  vect_build_slp_tree (loop_vec_info loop_vinfo, bb_
                      int ncopies_for_cost, unsigned int *max_nunits,
                      VEC (int, heap) **load_permutation,
                      VEC (slp_tree, heap) **loads,
-                     unsigned int vectorization_factor)
+                     unsigned int vectorization_factor, bool *loads_permuted)
 {
   VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size);
   VEC (gimple, heap) *def_stmts1 =  VEC_alloc (gimple, heap, group_size);
@@ -531,7 +565,8 @@  vect_build_slp_tree (loop_vec_info loop_vinfo, bb_
 
               /* Check that the size of interleaved loads group is not
                  greater than the SLP group size.  */
-              if (GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
+              if (loop_vinfo
+                  && GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
                 {
                   if (vect_print_dump_info (REPORT_SLP))
                     {
@@ -652,19 +687,22 @@  vect_build_slp_tree (loop_vec_info loop_vinfo, bb_
   /* Strided loads were reached - stop the recursion.  */
   if (stop_recursion)
     {
+      VEC_safe_push (slp_tree, heap, *loads, *node);
       if (permutation)
         {
-          VEC_safe_push (slp_tree, heap, *loads, *node);
+
+          *loads_permuted = true;
           *inside_cost 
             += targetm.vectorize.builtin_vectorization_cost (vec_perm, NULL, 0) 
                * group_size;
         }
       else
-        { 
-          /* We don't check here complex numbers chains, so we keep them in
-	     LOADS for further check in vect_supported_load_permutation_p.  */ 
+        {
+          /* We don't check here complex numbers chains, so we set
+             LOADS_PERMUTED for further check in
+             vect_supported_load_permutation_p.  */
           if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR)
-            VEC_safe_push (slp_tree, heap, *loads, *node);
+            *loads_permuted = true;
         }
 
       return true;
@@ -683,7 +721,7 @@  vect_build_slp_tree (loop_vec_info loop_vinfo, bb_
       if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &left_node, group_size,
 				inside_cost, outside_cost, ncopies_for_cost,
 				max_nunits, load_permutation, loads,
-				vectorization_factor))
+				vectorization_factor, loads_permuted))
 	return false;
 
       SLP_TREE_LEFT (*node) = left_node;
@@ -701,7 +739,7 @@  vect_build_slp_tree (loop_vec_info loop_vinfo, bb_
       if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &right_node, group_size,
 				inside_cost, outside_cost, ncopies_for_cost,
 				max_nunits, load_permutation, loads,
-				vectorization_factor))
+				vectorization_factor, loads_permuted))
 	return false;
 
       SLP_TREE_RIGHT (*node) = right_node;
@@ -887,8 +925,10 @@  vect_supported_load_permutation_p (slp_instance sl
   bool supported, bad_permutation = false;
   sbitmap load_index;
   slp_tree node, other_complex_node;
-  gimple stmt, first = NULL, other_node_first;
+  gimple stmt, first = NULL, other_node_first, load, next_load, first_load;
   unsigned complex_numbers = 0;
+  struct data_reference *dr;
+  bb_vec_info bb_vinfo;
 
   /* FORNOW: permutations are only supported in SLP.  */
   if (!slp_instn)
@@ -1050,6 +1090,76 @@  vect_supported_load_permutation_p (slp_instance sl
         }
     }
 
+  /* In basic block vectorization we allow any subchain of an interleaving
+     chain.
+     FORNOW: not supported in loop SLP because of realignment compications.  */
+  bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
+  bad_permutation = false;
+  /* Check that for every node in the instance teh loads form a subchain.  */
+  if (bb_vinfo)
+    {
+      FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node)
+        {
+          next_load = NULL;
+          first_load = NULL;
+          FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), j, load)
+            {
+              if (!first_load)
+                first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load));
+              else if (first_load
+                         != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)))
+                {
+                  bad_permutation = true;
+	          break;
+	        }
+
+              if (j != 0 && next_load != load)
+                {
+                  bad_permutation = true;
+                  break;
+                }
+
+              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
+            }
+
+          if (bad_permutation)
+            break;
+        }
+
+      /* Check that the alignment of the first load in every subchain, i.e.,
+         the first statement in every load node, is supported.  */
+      if (!bad_permutation)
+        {
+          FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node)
+            {
+              first_load = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
+              if (first_load
+                    != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
+                {
+                  dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
+                  if (vect_supportable_dr_alignment (dr, false)
+ 	               == dr_unaligned_unsupported)
+                    {
+   		      if (vect_print_dump_info (REPORT_SLP))
+		        {
+  	                  fprintf (vect_dump, "unsupported unaligned load ");
+                          print_gimple_stmt (vect_dump, first_load, 0,
+					     TDF_SLIM);
+                        }
+  		      bad_permutation = true;
+                      break;
+                    }
+	        }
+            }
+
+          if (!bad_permutation)
+            {
+              VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn));
+              return true;
+    	    }
+        }
+    }
+
   /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
      GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
      well (unless it's reduction).  */
@@ -1159,6 +1269,7 @@  vect_analyze_slp_instance (loop_vec_info loop_vinf
   VEC (int, heap) *load_permutation;
   VEC (slp_tree, heap) *loads;
   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
+  bool loads_permuted = false;
 
   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
     {
@@ -1250,7 +1361,7 @@  vect_analyze_slp_instance (loop_vec_info loop_vinf
   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                            &inside_cost, &outside_cost, ncopies_for_cost,
 			   &max_nunits, &load_permutation, &loads,
-			   vectorization_factor))
+			   vectorization_factor, &loads_permuted))
     {
       /* Calculate the unrolling factor based on the smallest type.  */
       if (max_nunits > nunits)
@@ -1275,7 +1386,8 @@  vect_analyze_slp_instance (loop_vec_info loop_vinf
       SLP_INSTANCE_LOADS (new_instance) = loads;
       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
       SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
-      if (VEC_length (slp_tree, loads))
+
+      if (loads_permuted)
         {
           if (!vect_supported_load_permutation_p (new_instance, group_size,
                                                   load_permutation))
@@ -2567,10 +2679,11 @@  vect_schedule_slp_instance (slp_tree node, slp_ins
   /* Loads should be inserted before the first load.  */
   if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
       && STMT_VINFO_STRIDED_ACCESS (stmt_info)
-      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
+      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
+      && SLP_INSTANCE_LOAD_PERMUTATION (instance))
     si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
   else if (is_pattern_stmt_p (stmt_info))
-     si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
+    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
   else
     si = gsi_for_stmt (stmt);