diff mbox

[AArch64] Enable autoprefetcher modelling in the scheduler

Message ID 56290946.50804@arm.com
State Accepted
Commit 2d6bc7fa9eeddc5387fabe559453a499e4653cc6
Headers show

Commit Message

Kyrylo Tkachov Oct. 22, 2015, 4:05 p.m. UTC
Hi all,

This patch enables the autoprefetcher heuristic for scheduling in AArch64.
It is enabled for the Cortex-A53 and Cortex-A57 cores and is off for the other cores,
leaving their behaviour unchanged.

When enabled, the scheduler will try to sort groups of loads or stores in order of the offset from
a common base register.

From what I understand of the relevant scheduling hooks, there are essentially three levels of this:
1) Don't use the autoprefetcher heuristic
2) Use it to order loads/stores but allow other scheduling heuristics to reorder them again to maximise multi-issue opportunities
3) Use it to order loads/stores and keep that order, even if it can harm multi-issue opportunities.

With this patch I get a 0.4% improvement in SPECINT 2006 and 1.7% improvement in SPECFP 2006 on a Cortex-A57
as well as improvements in various streaming workloads.

On Cortex-A53 I see improvements to various streaming workloads and there are no regressions or improvements on SPEC2000.

Bootstrapped and tested on aarch64-none-linux-gnu.

Ok for trunk?

Thanks,
Kyrill

2015-10-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * config/aarch64/aarch64-protos.h
     (struct tune_params): Add autoprefetcher_model field.
     * config/aarch64/aarch64.c: Include params.h
     (generic_tunings): Specify autoprefetcher_model value.
     (cortexa53_tunings): Likewise.
     (cortexa57_tunings): Likewise.
     (cortexa72_tunings): Likewise.
     (thunderx_tunings): Likewise.
     (xgene1_tunings): Likewise.
     (aarch64_first_cycle_multipass_dfa_lookahead_guard): New function.
     (TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define.
     (aarch64_override_options_internal): Set
     PARAM_SCHED_AUTOPREF_QUEUE_DEPTH param.

Comments

James Greenhalgh Oct. 27, 2015, 10:44 a.m. UTC | #1
On Thu, Oct 22, 2015 at 05:05:26PM +0100, Kyrill Tkachov wrote:
> Hi all,
>
> This patch enables the autoprefetcher heuristic for scheduling in AArch64.
> It is enabled for the Cortex-A53, Cortex-A57 cores and is off for the other cores,
> leaving their behaviour unchanged.
>
> When enabled, the scheduler will try to sort groups of loads or stores in
> order of the offset from a common base register.
>
> From what I understand of the relevant scheduling hooks, there are
> essentially three levels of this:
> 1) Don't use the autoprefetcher heuristic
> 2) Use it to order loads/stores but allow other scheduling heuristics to
> reorder them again to maximise multi-issue opportunities
> 3) Use it to order loads/stores and keep that order, even if it can harm
> multi-issue opportunities.
>
> With this patch I get a 0.4% improvement in SPECINT 2006 and 1.7% improvement
> in SPECFP 2006 on a Cortex-A57 as well as improvements in various streaming
> workloads.
>
> On Cortex-A53 I see improvements to various streaming workloads and there's
> no regressions or improvements on SPEC2000.
>
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Ok for trunk?


OK.

Thanks,
James

> 2015-10-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>     * config/aarch64/aarch64-protos.h
>     (struct tune_params): Add autoprefetcher_model field.
>     * config/aarch64/aarch64.c: Include params.h
>     (generic_tunings): Specify autoprefetcher_model value.
>     (cortexa53_tunings): Likewise.
>     (cortexa57_tunings): Likewise.
>     (cortexa72_tunings): Likewise.
>     (thunderx_tunings): Likewise.
>     (xgene1_tunings): Likewise.
>     (aarch64_first_cycle_multipass_dfa_lookahead_guard): New function.
>     (TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define.
>     (aarch64_override_options_internal): Set
>     PARAM_SCHED_AUTOPREF_QUEUE_DEPTH param.
diff mbox

Patch

commit da29c21db2050a6fb3b8c428eb0fc20e63856b6c
Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date:   Wed Sep 30 09:29:59 2015 +0100

    [AArch64] Enable autoprefetcher modelling in the scheduler

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index baaf1bd..07839ef 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -194,6 +194,23 @@  struct tune_params
   int vec_reassoc_width;
   int min_div_recip_mul_sf;
   int min_div_recip_mul_df;
+
+/* An enum specifying how to take into account CPU autoprefetch capabilities
+   during instruction scheduling:
+   - AUTOPREFETCHER_OFF: Do not take autoprefetch capabilities into account.
+   - AUTOPREFETCHER_WEAK: Attempt to sort load/store sequences in order of
+   offsets but allow the pipeline hazard recognizer to alter that order to
+   maximize multi-issue opportunities.
+   - AUTOPREFETCHER_STRONG: Attempt to sort load/store sequences in order of
+   offsets and prefer this even if it restricts multi-issue opportunities.  */
+
+  enum aarch64_autoprefetch_model
+  {
+    AUTOPREFETCHER_OFF,
+    AUTOPREFETCHER_WEAK,
+    AUTOPREFETCHER_STRONG
+  } autoprefetcher_model;
+
   unsigned int extra_tuning_flags;
 };
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d4c5665..4c69dc8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -64,6 +64,7 @@ 
 #include "gimple-fold.h"
 #include "tree-eh.h"
 #include "gimplify.h"
+#include "params.h"
 #include "optabs.h"
 #include "dwarf2.h"
 #include "cfgloop.h"
@@ -364,6 +365,7 @@  static const struct tune_params generic_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
 
@@ -386,6 +388,7 @@  static const struct tune_params cortexa53_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
 
@@ -408,6 +411,7 @@  static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
 };
 
@@ -430,6 +434,7 @@  static const struct tune_params cortexa72_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
 
@@ -451,6 +456,7 @@  static const struct tune_params thunderx_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
 
@@ -472,6 +478,7 @@  static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
 
@@ -7024,6 +7031,19 @@  aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
 }
 
+
+/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD by
+   forwarding to autopref_multipass_dfa_lookahead_guard (haifa-sched.c).
+   The guard only has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
+
+static int
+aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
+						    int ready_index)
+{
+  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
+}
+
+
 /* Vectorizer cost model target hooks.  */
 
 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
@@ -7615,6 +7635,29 @@  aarch64_override_options_internal (struct gcc_options *opts)
   initialize_aarch64_code_model (opts);
   initialize_aarch64_tls_size (opts);
 
+  int queue_depth = 0;
+  switch (aarch64_tune_params.autoprefetcher_model)
+    {
+      case tune_params::AUTOPREFETCHER_OFF:
+	queue_depth = -1;
+	break;
+      case tune_params::AUTOPREFETCHER_WEAK:
+	queue_depth = 0;
+	break;
+      case tune_params::AUTOPREFETCHER_STRONG:
+	queue_depth = max_insn_queue_index + 1;
+	break;
+      default:
+	gcc_unreachable ();
+    }
+
+  /* We don't mind passing in global_options_set here as we don't use
+     the *options_set structs anyway.  */
+  maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
+			 queue_depth,
+			 opts->x_param_values,
+			 global_options_set.x_param_values);
+
   aarch64_override_options_after_change_1 (opts);
 }
 
@@ -13481,6 +13524,10 @@  aarch64_promoted_type (const_tree t)
 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
   aarch64_sched_first_cycle_multipass_dfa_lookahead
 
+#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
+#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
+  aarch64_first_cycle_multipass_dfa_lookahead_guard
+
 #undef TARGET_TRAMPOLINE_INIT
 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init