diff mbox

[2/3,AARCH64] Add scheduler for ThunderX

Message ID 1415926574-3080-3-git-send-email-apinski@cavium.com
State Accepted
Commit 2d41ed582029561858ae03b01234cc05e1730107
Headers show

Commit Message

Andrew Pinski Nov. 14, 2014, 12:56 a.m. UTC
This adds the schedule model for ThunderX. There are a few TODOs in that
not all of the SIMD is model currently.  Also the idea of a simple
shift/extend is not modeled and all cases where there is a shift/extend
is considered as non simple and take up two cycles rather than correct
value of one cycle.  Also the 32bit divide and the 64bit divide
have different cycle counts but there is no way to model that currently.
Also multiply high takes one cycle more than the normal multiply but
there is no way to model that currently either.

Build and tested for aarch64-elf with no regressions.

ChangeLog:
* config/aarch64/aarch64-cores.def (thunderx): Change the scheduler
over to thunderx.
* config/aarch64/aarch64.md: Include thunderx.md.
(generic_sched): Set to no for thunderx.
* config/aarch64/thunderx.md: New file.
---
 gcc/config/aarch64/aarch64-cores.def |    2 +-
 gcc/config/aarch64/aarch64.md        |    3 +-
 gcc/config/aarch64/thunderx.md       |  260 ++++++++++++++++++++++++++++++++++
 3 files changed, 263 insertions(+), 2 deletions(-)
 create mode 100644 gcc/config/aarch64/thunderx.md

Comments

Marcus Shawcroft Nov. 14, 2014, 10:54 a.m. UTC | #1
On 14 November 2014 00:56, Andrew Pinski <apinski@cavium.com> wrote:
> This adds the schedule model for ThunderX. There are a few TODOs in that
> not all of the SIMD is model currently.  Also the idea of a simple
> shift/extend is not modeled and all cases where there is a shift/extend
> is considered as non simple and take up two cycles rather than correct
> value of one cycle.  Also the 32bit divide and the 64bit divide
> have different cycle counts but there is no way to model that currently.
> Also multiply high takes one cycle more than the normal multiply but
> there is no way to model that currently either.
>
> Build and tested for aarch64-elf with no regressions.
>
> ChangeLog:
> * config/aarch64/aarch64-cores.def (thunderx): Change the scheduler
> over to thunderx.
> * config/aarch64/aarch64.md: Include thunderx.md.
> (generic_sched): Set to no for thunderx.
> * config/aarch64/thunderx.md: New file.

OK /Marcus
Sebastian Pop Nov. 17, 2014, 8:04 p.m. UTC | #2
Andrew Pinski wrote:
> diff --git a/gcc/config/aarch64/thunderx.md b/gcc/config/aarch64/thunderx.md
> new file mode 100644
> index 0000000..30e4395
> --- /dev/null
> +++ b/gcc/config/aarch64/thunderx.md
> @@ -0,0 +1,260 @@
> +;; Cavium ThunderX pipeline description
> +;; Copyright (C) 2014 Free Software Foundation, Inc.
> +;;
> +;; Written by Andrew Pinski  <apinski@cavium.com>
> +
> +;; This file is part of GCC.
> +
> +;; GCC is free software; you can redistribute it and/or modify
> +;; it under the terms of the GNU General Public License as published by
> +;; the Free Software Foundation; either version 3, or (at your option)
> +;; any later version.
> +
> +;; GCC is distributed in the hope that it will be useful,
> +;; but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +;; GNU General Public License for more details.
> +
> +;; You should have received a copy of the GNU General Public License
> +;; along with GCC; see the file COPYING3.  If not see
> +;; <http://www.gnu.org/licenses/>.
> +;;   Copyright (C) 2004, 2005, 2006 Cavium Networks.

You should remove this line before commit.
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index b3318c3..471cdd6 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -36,7 +36,7 @@ 
 
 AARCH64_CORE("cortex-a53",  cortexa53, cortexa53, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa53)
 AARCH64_CORE("cortex-a57",  cortexa15, cortexa15, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa57)
-AARCH64_CORE("thunderx",    thunderx,  cortexa53, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx)
+AARCH64_CORE("thunderx",    thunderx,  thunderx, 8,  AARCH64_FL_FPSIMD | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx)
 
 /* V8 big.LITTLE implementations.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 17570ba..80f2db7 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -191,13 +191,14 @@ 
 
 (define_attr "generic_sched" "yes,no"
   (const (if_then_else
-          (eq_attr "tune" "cortexa53,cortexa15")
+          (eq_attr "tune" "cortexa53,cortexa15,thunderx")
           (const_string "no")
           (const_string "yes"))))
 
 ;; Scheduling
 (include "../arm/cortex-a53.md")
 (include "../arm/cortex-a15.md")
+(include "thunderx.md")
 
 ;; -------------------------------------------------------------------
 ;; Jumps and other miscellaneous insns
diff --git a/gcc/config/aarch64/thunderx.md b/gcc/config/aarch64/thunderx.md
new file mode 100644
index 0000000..30e4395
--- /dev/null
+++ b/gcc/config/aarch64/thunderx.md
@@ -0,0 +1,260 @@ 
+;; Cavium ThunderX pipeline description
+;; Copyright (C) 2014 Free Software Foundation, Inc.
+;;
+;; Written by Andrew Pinski  <apinski@cavium.com>
+
+;; This file is part of GCC.
+
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;   Copyright (C) 2004, 2005, 2006 Cavium Networks.
+
+
+;; Thunder is a dual-issue processor that can issue all instructions on
+;; pipe0 and a subset on pipe1.
+
+
+(define_automaton "thunderx_main, thunderx_mult, thunderx_divide, thunderx_simd")
+
+(define_cpu_unit "thunderx_pipe0" "thunderx_main")
+(define_cpu_unit "thunderx_pipe1" "thunderx_main")
+(define_cpu_unit "thunderx_mult" "thunderx_mult")
+(define_cpu_unit "thunderx_divide" "thunderx_divide")
+(define_cpu_unit "thunderx_simd" "thunderx_simd")
+
+(define_insn_reservation "thunderx_add" 1
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "adc_imm,adc_reg,adr,alu_imm,alu_sreg,alus_imm,alus_sreg,extend,logic_imm,logic_reg,logics_imm,logics_reg,mov_imm,mov_reg"))
+  "thunderx_pipe0 | thunderx_pipe1")
+
+(define_insn_reservation "thunderx_shift" 1
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "bfm,extend,shift_imm,shift_reg"))
+  "thunderx_pipe0 | thunderx_pipe1")
+
+
+;; Arthimentic instructions with an extra shift or extend is two cycles.
+;; FIXME: This needs more attributes on aarch64 than what is currently there;
+;;    this is conserative for now.
+;; Except this is not correct as this is only for !(LSL && shift by 0/1/2/3)
+;; Except this is not correct as this is only for !(zero extend)
+
+(define_insn_reservation "thunderx_arith_shift" 2
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "alu_ext,alu_shift_imm,alu_shift_reg,alus_ext,logic_shift_imm,logic_shift_reg,logics_shift_imm,logics_shift_reg,alus_shift_imm"))
+  "thunderx_pipe0 | thunderx_pipe1")
+
+(define_insn_reservation "thunderx_csel" 2
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "csel"))
+  "thunderx_pipe0 | thunderx_pipe1")
+
+;; Multiply and mulitply accumulate and count leading zeros can only happen on pipe 1
+
+(define_insn_reservation "thunderx_mul" 4
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "mul,muls,mla,mlas,clz,smull,umull,smlal,umlal"))
+  "thunderx_pipe1 + thunderx_mult")
+
+;; Multiply high instructions take an extra cycle and cause the muliply unit to
+;; be busy for an extra cycle.
+
+;(define_insn_reservation "thunderx_mul_high" 5
+;  (and (eq_attr "tune" "thunderx")
+;       (eq_attr "type" "smull,umull"))
+;  "thunderx_pipe1 + thunderx_mult")
+
+(define_insn_reservation "thunderx_div32" 22
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "udiv,sdiv"))
+  "thunderx_pipe1 + thunderx_divide, thunderx_divide * 21")
+
+;(define_insn_reservation "thunderx_div64" 38
+;  (and (eq_attr "tune" "thunderx")
+;       (eq_attr "type" "udiv,sdiv")
+;       (eq_attr "mode" "DI"))
+;  "thunderx_pipe1 + thunderx_divide, thunderx_divide * 34")
+
+;; Stores take one cycle in pipe 0
+(define_insn_reservation "thunderx_store" 1
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "store1"))
+  "thunderx_pipe0")
+
+;; Store pair are single issued
+(define_insn_reservation "thunderx_storepair" 1
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "store2"))
+  "thunderx_pipe0 + thunderx_pipe1")
+
+
+;; loads (and load pairs) from L1 take 3 cycles in pipe 0
+(define_insn_reservation "thunderx_load" 3
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "load1, load2"))
+  "thunderx_pipe0")
+
+(define_insn_reservation "thunderx_brj" 1
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "branch,trap,call"))
+  "thunderx_pipe1")
+
+;; FPU
+
+(define_insn_reservation "thunderx_fadd" 4
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "faddd,fadds"))
+  "thunderx_pipe1")
+
+(define_insn_reservation "thunderx_fconst" 1
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "fconsts,fconstd"))
+  "thunderx_pipe1")
+
+;; Moves between fp are 2 cycles including min/max/select/abs/neg
+(define_insn_reservation "thunderx_fmov" 2
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "fmov,f_minmaxs,f_minmaxd,fcsel,ffarithd,ffariths"))
+  "thunderx_pipe1")
+
+(define_insn_reservation "thunderx_fmovgpr" 2
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "f_mrc, f_mcr"))
+  "thunderx_pipe1")
+
+(define_insn_reservation "thunderx_fmul" 6
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "fmacs,fmacd,fmuls,fmuld"))
+  "thunderx_pipe1")
+
+(define_insn_reservation "thunderx_fdivs" 12
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "fdivs"))
+  "thunderx_pipe1 + thunderx_divide, thunderx_divide*8")
+
+(define_insn_reservation "thunderx_fdivd" 22
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "fdivd"))
+  "thunderx_pipe1 + thunderx_divide, thunderx_divide*18")
+
+(define_insn_reservation "thunderx_fsqrts" 17
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "fsqrts"))
+  "thunderx_pipe1 + thunderx_divide, thunderx_divide*13")
+
+(define_insn_reservation "thunderx_fsqrtd" 28
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "fsqrtd"))
+  "thunderx_pipe1 + thunderx_divide, thunderx_divide*31")
+
+;; The rounding conversion inside fp is 4 cycles
+(define_insn_reservation "thunderx_frint" 4
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "f_rints,f_rintd"))
+  "thunderx_pipe1")
+
+;; Float to integer with a move from int to/from float is 6 cycles
+(define_insn_reservation "thunderx_f_cvt" 6
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "f_cvt,f_cvtf2i,f_cvti2f"))
+  "thunderx_pipe1")
+
+;; FP/SIMD load/stores happen in pipe 0
+;; 64bit Loads register/pairs are 4 cycles from L1
+(define_insn_reservation "thunderx_64simd_fp_load" 4
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "f_loadd,f_loads,neon_load1_1reg,\
+			neon_load1_1reg_q,neon_load1_2reg"))
+  "thunderx_pipe0")
+
+;; 128bit load pair is singled issue and 4 cycles from L1
+(define_insn_reservation "thunderx_128simd_pair_load" 4
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_load1_2reg_q"))
+  "thunderx_pipe0+thunderx_pipe1")
+
+;; FP/SIMD Stores takes one cycle in pipe 0
+(define_insn_reservation "thunderx_simd_fp_store" 1
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "f_stored,f_stores,neon_store1_1reg,neon_store1_1reg_q"))
+  "thunderx_pipe0")
+
+;; 64bit neon store pairs are single issue for one cycle
+(define_insn_reservation "thunderx_64neon_storepair" 1
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_store1_2reg"))
+  "thunderx_pipe0 + thunderx_pipe1")
+
+;; 128bit neon store pair are single issued for two cycles
+(define_insn_reservation "thunderx_128neon_storepair" 2
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_store1_2reg_q"))
+  "(thunderx_pipe0 + thunderx_pipe1)*2")
+
+
+;; SIMD/NEON (q forms take an extra cycle)
+
+;; Thunder simd move instruction types - 2/3 cycles
+(define_insn_reservation "thunderx_neon_move" 2
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_logic, neon_bsl, neon_fp_compare_s, \
+			neon_fp_compare_d, neon_move"))
+  "thunderx_pipe1 + thunderx_simd")
+
+(define_insn_reservation "thunderx_neon_move_q" 3
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_logic_q, neon_bsl_q, neon_fp_compare_s_q, \
+			neon_fp_compare_d_q, neon_move_q"))
+  "thunderx_pipe1 + thunderx_simd, thunderx_simd")
+
+
+;; Thunder simd simple/add instruction types - 4/5 cycles
+
+(define_insn_reservation "thunderx_neon_add" 4
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_reduc_add, neon_reduc_minmax, neon_fp_reduc_add_s, \
+			neon_fp_reduc_add_d, neon_fp_to_int_s, neon_fp_to_int_d, \
+			neon_add_halve, neon_sub_halve, neon_qadd, neon_compare, \
+			neon_compare_zero, neon_minmax, neon_abd, neon_add, neon_sub, \
+			neon_fp_minmax_s, neon_fp_minmax_d, neon_reduc_add, neon_cls, \
+			neon_qabs, neon_qneg, neon_fp_addsub_s, neon_fp_addsub_d"))
+  "thunderx_pipe1 + thunderx_simd")
+
+;; BIG NOTE: neon_add_long/neon_sub_long don't have a q form which is incorrect
+
+(define_insn_reservation "thunderx_neon_add_q" 5
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_reduc_add_q, neon_reduc_minmax_q, neon_fp_reduc_add_s_q, \
+			neon_fp_reduc_add_d_q, neon_fp_to_int_s_q, neon_fp_to_int_d_q, \
+			neon_add_halve_q, neon_sub_halve_q, neon_qadd_q, neon_compare_q, \
+			neon_compare_zero_q, neon_minmax_q, neon_abd_q, neon_add_q, neon_sub_q, \
+			neon_fp_minmax_s_q, neon_fp_minmax_d_q, neon_reduc_add_q, neon_cls_q, \
+			neon_qabs_q, neon_qneg_q, neon_fp_addsub_s_q, neon_fp_addsub_d_q, \
+			neon_add_long, neon_sub_long"))
+  "thunderx_pipe1 + thunderx_simd, thunderx_simd")
+
+
+;; Thunder 128bit SIMD reads the upper halve in cycle 2 and writes in the last cycle
+(define_bypass 2 "thunderx_neon_move_q" "thunderx_neon_move_q, thunderx_neon_add_q")
+(define_bypass 4 "thunderx_neon_add_q" "thunderx_neon_move_q, thunderx_neon_add_q")
+
+;; Assume both pipes are needed for unknown and multiple-instruction
+;; patterns.
+
+(define_insn_reservation "thunderx_unknown" 1
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "untyped,multiple"))
+  "thunderx_pipe0 + thunderx_pipe1")
+
+