[RFC] Tree Loop Unroller Pass

Message ID CAELXzTNzMks8yg-vTc=m9txFQBXWp-UJt--9XnBxWS8sp43_pw@mail.gmail.com
State New
Headers show
Series
  • [RFC] Tree Loop Unroller Pass
Related show

Commit Message

Kugan Vivekanandarajah Feb. 12, 2018, 11:55 p.m.
Implements tree loop unroller using the infrastructure provided.

gcc/ChangeLog:

2018-02-12  Kugan Vivekanandarajah  <kuganv@linaro.org>

    * Makefile.in (OBJS): Add tree-ssa-loop-unroll.o.
    * common.opt (ftree-loop-unroll): New option.
    * passes.def: Add pass_tree_loop_uroll
    * timevar.def (TV_TREE_LOOP_UNROLL): Add.
    * tree-pass.h (make_pass_tree_loop_unroll): Declare.
    * tree-ssa-loop-unroll.c: New file.

Comments

Andrew Pinski Feb. 20, 2018, 7:53 p.m. | #1
On Mon, Feb 12, 2018 at 3:55 PM, Kugan Vivekanandarajah
<kugan.vivekanandarajah@linaro.org> wrote:
> Implements tree loop unroller using the infrastructure provided.

>

> gcc/ChangeLog:

>

> 2018-02-12  Kugan Vivekanandarajah  <kuganv@linaro.org>

>

>     * Makefile.in (OBJS): Add tree-ssa-loop-unroll.o.

>     * common.opt (ftree-loop-unroll): New option.

>     * passes.def: Add pass_tree_loop_uroll

>     * timevar.def (TV_TREE_LOOP_UNROLL): Add.

>     * tree-pass.h (make_pass_tree_loop_unroll): Declare.

>     * tree-ssa-loop-unroll.c: New file.



I think it was decided to name new gimple passes as gimple-* rather
than tree-ssa-*.
The option should most likely just take over -funrol-loops, etc.

Shouldn't:
+  if (targetm.hw_max_mem_read_streams
+      && (max_load_streams = targetm.hw_max_mem_read_streams ()) > 0)
+    {
+      load_streams = count_mem_load_streams (loop, max_load_streams);
+      if (load_streams > 0)
+ {
+   signed t = 1 << (floor_log2 (max_load_streams / load_streams));
+   if (t < n_unroll)
+     n_unroll = t;
+ }
+    }

this be a target hook instead of doing inline here.  It seems too
target specific notion of what a stream is.  Even the notion of a
stream here seems micro-arch specific and not generic enough.


A few more comments about the pass.
You don't check can_duplicate_loop_p on the loop.
You use optimize_function_for_size_p on the whole function instead of
checking if the loop is cold (by checking optimize_loop_for_size_p).

In my version of gimple loop unrolling, I had to add
lpt_decision.already_unrolled and mark the loop as already unrolled so
the rtl loop unroller would not happen again.
-fopt-report does not report when the unrolling has happened unlike
the RTL version.

Thanks,
Andrew

Patch

From 71baaf8393dd79a98b4c0216e56d87083caf0177 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
Date: Mon, 12 Feb 2018 10:44:00 +1100
Subject: [PATCH 2/4] tree-loop-unroller

Change-Id: I58c25b5f2e796d4166af3ea4e50a0f4a3078b6c2
---
 gcc/Makefile.in            |   1 +
 gcc/common.opt             |   4 +
 gcc/passes.def             |   1 +
 gcc/timevar.def            |   1 +
 gcc/tree-pass.h            |   1 +
 gcc/tree-ssa-loop-unroll.c | 268 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 276 insertions(+)
 create mode 100644 gcc/tree-ssa-loop-unroll.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 374bf3e..de3c146 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1536,6 +1536,7 @@  OBJS = \
 	tree-ssa-loop-im.o \
 	tree-ssa-loop-ivcanon.o \
 	tree-ssa-loop-ivopts.o \
+	tree-ssa-loop-unroll.o \
 	tree-ssa-loop-manip.o \
 	tree-ssa-loop-niter.o \
 	tree-ssa-loop-prefetch.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index b20a9aa..ea47b8c 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1770,6 +1770,10 @@  fivopts
 Common Report Var(flag_ivopts) Init(1) Optimization
 Optimize induction variables on trees.
 
+ftree-loop-unroll
+Common Report Var(flag_tree_loop_unroll) Init(1) Optimization
+Perform loop unrolling in gimple.
+
 fjump-tables
 Common Var(flag_jump_tables) Init(1) Optimization
 Use jump tables for sufficiently large switch statements.
diff --git a/gcc/passes.def b/gcc/passes.def
index 9802f08..57f7cc2 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -302,6 +302,7 @@  along with GCC; see the file COPYING3.  If not see
           NEXT_PASS (pass_predcom);
 	  NEXT_PASS (pass_complete_unroll);
 	  NEXT_PASS (pass_slp_vectorize);
+	  NEXT_PASS (pass_tree_loop_unroll);
 	  NEXT_PASS (pass_loop_prefetch);
 	  /* Run IVOPTs after the last pass that uses data-reference analysis
 	     as that doesn't handle TARGET_MEM_REFs.  */
diff --git a/gcc/timevar.def b/gcc/timevar.def
index 91221ae..a6bb847 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -202,6 +202,7 @@  DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution")
 DEFTIMEVAR (TV_CHECK_DATA_DEPS       , "tree check data dependences")
 DEFTIMEVAR (TV_TREE_PREFETCH	     , "tree prefetching")
 DEFTIMEVAR (TV_TREE_LOOP_IVOPTS	     , "tree iv optimization")
+DEFTIMEVAR (TV_TREE_LOOP_UNROLL     , "tree loop unroll")
 DEFTIMEVAR (TV_PREDCOM		     , "predictive commoning")
 DEFTIMEVAR (TV_TREE_CH		     , "tree copy headers")
 DEFTIMEVAR (TV_TREE_SSA_UNCPROP	     , "tree SSA uncprop")
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 93a6a99..2c0740f 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -388,6 +388,7 @@  extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_tree_loop_unroll (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
diff --git a/gcc/tree-ssa-loop-unroll.c b/gcc/tree-ssa-loop-unroll.c
new file mode 100644
index 0000000..04cf092
--- /dev/null
+++ b/gcc/tree-ssa-loop-unroll.c
@@ -0,0 +1,268 @@ 
+
+/* Tree Loop Unroller.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "tree.h"
+#include "tree-pass.h"
+#include "target.h"
+#include "gimple.h"
+#include "cfgloop.h"
+#include "tree-ssa-loop.h"
+#include "tree-scalar-evolution.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-ssa-loop-niter.h"
+#include "tree-ssa-loop-ivopts.h"
+#include "gimple.h"
+#include "predict.h"
+#include "cfghooks.h"
+#include "tree-inline.h"
+#include "gimple-iterator.h"
+#include "fold-const.h"
+#include "tree-data-ref.h"
+#include "params.h"
+
+/* Count the load memory streams in the LOOP.  If the streams are larger
+   (compared to MAX_LOAD_STREAMS), we dont need to compute all of
+   them.  This is used to limit the partial unrolling factor to avoid
+   too many memory load streams.  */
+
+static signed
+count_mem_load_streams (struct loop *loop, signed max_load_streams)
+{
+  basic_block *bbs = get_loop_body (loop);
+  unsigned nbbs = loop->num_nodes;
+  gimple_stmt_iterator gsi;
+  signed count = 0;
+  hash_set <data_reference_p> *dr_set = new hash_set<data_reference_p> ();
+  vec<data_reference_p> datarefs_vec;
+  datarefs_vec.create (20);
+  unsigned k;
+
+  for (unsigned i = 0; i < nbbs; i++)
+    {
+      basic_block bb = bbs[i];
+      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	{
+	  gimple *stmt = gsi_stmt (gsi);
+	  tree access_fn;
+	  vec<tree> access_fns;
+	  datarefs_vec.truncate (0);
+
+	  if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
+	     continue;
+
+	  for (unsigned j = 0; j < datarefs_vec.length (); ++j)
+	    {
+	      data_reference_p dr = datarefs_vec[j];
+	      if (!DR_IS_READ (dr))
+		continue;
+	      access_fns = DR_ACCESS_FNS (dr);
+
+	      FOR_EACH_VEC_ELT (access_fns, k, access_fn)
+		{
+		  if (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
+		    {
+		      if (!dr_set->add (dr))
+			count++;
+		      break;
+		    }
+		}
+	    }
+
+	  if (count > max_load_streams / 2)
+	    break;
+	}
+    }
+  free_data_refs (datarefs_vec);
+  free (dr_set);
+  return count;
+}
+
+/* Verify that loop in a form that is suitable for unrolling.  */
+
+static bool
+loop_form_ok_p (struct loop *loop)
+{
+  if (!single_exit (loop))
+    return false;
+
+  /* Loop has preheader.  */
+  if (!loop_preheader_edge (loop))
+    return false;
+
+  return true;
+}
+
+/* Return false if the cost model does not prefer LOOP to be unrolled.
+   Return true and update the DESC if we decided that the LOOP
+   is to be unrolled.  Also, in this case, determine the factor (NUNROLL)
+   by which the LOOP should be unrolled.  */
+
+static bool
+decide_unroll_iterations (struct loop *loop,
+			  struct tree_niter_desc *desc,
+			  signed *nunroll)
+{
+  /* nunroll = total number of copies of the original loop body in
+     unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
+  signed ninsns = tree_num_loop_insns (loop, &eni_size_weights);
+  signed ninsns_outer = tree_num_loop_insns (loop_outer (loop),
+					     &eni_size_weights);
+  signed n_unroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
+  signed load_streams = 0;
+  signed max_load_streams = -1;
+  signed n_unroll2;
+
+  if (ninsns_outer >= PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS))
+    return false;
+  ninsns_outer -= ninsns;
+  n_unroll2 = (PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) - ninsns_outer)/ ninsns;
+  if (n_unroll2 < n_unroll)
+    n_unroll = n_unroll2;
+
+  if (n_unroll > PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
+    n_unroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
+
+  /* Check for simple loops.  */
+  edge exit = single_exit (loop);
+  if (!number_of_iterations_exit (loop, exit,
+				  desc, false, false)
+      || !integer_onep (desc->assumptions)
+      || !integer_zerop (desc->may_be_zero)
+      || !desc->niter)
+    return false;
+
+  /* Now force nunroll to be power of 2.  */
+  n_unroll = 1 << (floor_log2 (n_unroll));
+  if (targetm.hw_max_mem_read_streams
+      && (max_load_streams = targetm.hw_max_mem_read_streams ()) > 0)
+    {
+      load_streams = count_mem_load_streams (loop, max_load_streams);
+      if (load_streams > 0)
+	{
+	  signed t = 1 << (floor_log2 (max_load_streams / load_streams));
+	  if (t < n_unroll)
+	    n_unroll = t;
+	}
+    }
+
+  if (!can_unroll_loop_p (loop, n_unroll, desc))
+    return false;
+
+  *nunroll = n_unroll;
+  return true;
+}
+
+/* Unroll LOOP based on the cost model.  */
+
+static bool
+unroll_loop (struct loop *loop)
+{
+  struct tree_niter_desc desc;
+  signed n_unroll;
+
+  if (!decide_unroll_iterations (loop, &desc, &n_unroll))
+    return false;
+  if (n_unroll <= 1)
+    return false;
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "\nunrollong loop=%d times=%d\n", loop->num, n_unroll);
+  initialize_original_copy_tables ();
+  tree_unroll_loop (loop, n_unroll,
+		    single_dom_exit (loop), &desc);
+  free_original_copy_tables ();
+  return true;
+}
+
+static unsigned
+execute_tree_loop_unroll ()
+{
+  struct loop *loop;
+  bool changed = false;
+  int todo_flags = 0;
+  if (optimize_function_for_size_p (cfun))
+    return 0;
+
+  estimate_numbers_of_iterations (cfun);
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+    {
+      if (!loop_form_ok_p (loop))
+	continue;
+      if (unroll_loop (loop))
+	{
+	  changed |= true;
+	  free_numbers_of_iterations_estimates (loop);
+	}
+    }
+
+  if (changed)
+    {
+      scev_reset ();
+      todo_flags |= TODO_cleanup_cfg;
+    }
+
+  return todo_flags;
+}
+
+namespace {
+const pass_data pass_data_tree_loop_unroll =
+{
+  GIMPLE_PASS, /* type */
+  "tree-loop-unroll", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_TREE_LOOP_UNROLL, /* tv_id */
+  PROP_ssa, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  (TODO_update_ssa | TODO_verify_all),
+};
+
+class pass_tree_loop_unroll : public gimple_opt_pass
+{
+public:
+  pass_tree_loop_unroll (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_tree_loop_unroll, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  opt_pass * clone () { return new pass_tree_loop_unroll (m_ctxt); }
+  virtual bool gate (function *) { return flag_tree_loop_unroll != 0; }
+  virtual unsigned int execute (function *)
+    {
+      return execute_tree_loop_unroll ();
+    }
+
+}; // class pass_tree_loop_unroll
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_tree_loop_unroll (gcc::context *ctxt)
+{
+  return new pass_tree_loop_unroll (ctxt);
+}
+
-- 
2.7.4