diff mbox

[AARCH64] add qdf24xx tuning structure

Message ID CABXYE2X2tVgJne4c80QugEkTGyKgjFTLCo=onkK=1E96KvyFyg@mail.gmail.com
State New
Headers show

Commit Message

Jim Wilson June 28, 2016, 1:03 a.m. UTC
On Mon, Jun 13, 2016 at 3:01 AM, Kyrill Tkachov
<kyrylo.tkachov@foss.arm.com> wrote:
> Hi Jim,

>

> On 10/06/16 23:48, Jim Wilson wrote:

>>

>> This adds a tuning structure for qdf24xx.  This was tested with an

>> aarch64-linux bootstrap and a make check, with no regressions.  I also

>> tested it with an x86_64-linux C make check to verify that I didn't

>> break the testsuite for non aarch64 targets.

>

>

> As this also changes code in the arm backend

> it also needs a bootstrap and test on an arm target

> (arm-none-linux-gnueabihf for example).

> Can you please confirm that this passes successfully?


Yes, I forgot to do the bootstrap and make check on arm.

I tried to do that testing, and ran into problems with the armv8-a
assembler warning
    IT blocks containing 32-bit Thumb instructions are deprecated in ARMv8
which messed up my testsuite results so much that they were unusable.
I had to rerun the tests to workaround that, and then got distracted
by other problems, but I have now done an armhf bootstrap and make
check with unpatched (cortex-a57) and patched (qdf24xx) trees and got
the same results.

During the delay, the aarch64 tuning structure changed how the recip
square root approx is handled, so I had to make a trivial change to my
patch to compensate for that, and then redo the aarch64 bootstrap to
make sure it was still OK.  The new patch is attached, which otherwise
the same as the previous patch.  I'm assuming this is still OK to
install, as the previous patch was approved pending test results, but
will wait a bit in case someone ones to object.

Jim

Comments

Christophe Lyon June 29, 2016, 7:38 p.m. UTC | #1
On 28 June 2016 at 10:21, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote:
>

> On 28/06/16 02:03, Jim Wilson wrote:

>>

>> On Mon, Jun 13, 2016 at 3:01 AM, Kyrill Tkachov

>> <kyrylo.tkachov@foss.arm.com> wrote:

>>>

>>> Hi Jim,

>>>

>>> On 10/06/16 23:48, Jim Wilson wrote:

>>>>

>>>> This adds a tuning structure for qdf24xx.  This was tested with an

>>>> aarch64-linux bootstrap and a make check, with no regressions.  I also

>>>> tested it with an x86_64-linux C make check to verify that I didn't

>>>> break the testsuite for non aarch64 targets.

>>>

>>>

>>> As this also changes code in the arm backend

>>> it also needs a bootstrap and test on an arm target

>>> (arm-none-linux-gnueabihf for example).

>>> Can you please confirm that this passes successfully?

>>

>> Yes, I forgot to do the bootstrap and make check on arm.

>>

>> I tried to do that testing, and ran into problems with the armv8-a

>> assembler warning

>>      IT blocks containing 32-bit Thumb instructions are deprecated in

>> ARMv8

>> which messed up my testsuite results so much that they were unusable.

>

>

> Yes, that's PR 67591. For these purposes I usually configure with

I'm looking at this. Progress is slow as I keep being distracted.

Christophe

> something like --with-cpu=cortex-a15 unless I'm actually testing

> ARMv8-A functionality.

>

>> I had to rerun the tests to workaround that, and then got distracted

>> by other problems, but I have now done an armhf bootstrap and make

>> check with unpatched (cortex-a57) and patched (qdf24xx) trees and got

>> the same results.

>

>

> Thanks. That's ok arm-wise.

>

> Kyrill

>

>

>>

>> During the delay, the aarch64 tuning structure changed how the recip

>> square root approx is handled, so I had to make a trivial change to my

>> patch to compensate for that, and then redo the aarch64 bootstrap to

>> make sure it was still OK.  The new patch is attached, which otherwise

>> the same as the previous patch.  I'm assuming this is still OK to

>> install, as the previous patch was approved pending test results, but

>> will wait a bit in case someone ones to object.

>>

>> Jim

>

>
diff mbox

Patch

	gcc/
	* config/aarch64/aarch64-cores.def (qdf24xx): Use qdf24xx tuning.
	* config/aarch64/aarch64.c (qdf24xx_addrcost_table,
	qdf24xx_regmove_cost, qdf24xx_tunings): New.
	* config/arm/aarch64-cost-tables.h (qdf24xx_extra_costs): New.
	* config/arm/arm-cores.def (qdf24xx): Use qdf24xx tuning.
	* config/arm/arm.c (arm_qdf24xx_tune): New.

	gcc/testsuite/
	* gcc.dg/asr_div1.c: Add aarch64 specific dg-options.

Index: config/aarch64/aarch64-cores.def
===================================================================
--- config/aarch64/aarch64-cores.def	(revision 237800)
+++ config/aarch64/aarch64-cores.def	(working copy)
@@ -46,7 +46,7 @@  AARCH64_CORE("cortex-a57",  cortexa57, cortexa57,
 AARCH64_CORE("cortex-a72",  cortexa72, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08")
 AARCH64_CORE("cortex-a73",  cortexa73, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa73, "0x41", "0xd09")
 AARCH64_CORE("exynos-m1",   exynosm1,  exynosm1,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, exynosm1,  "0x53", "0x001")
-AARCH64_CORE("qdf24xx",     qdf24xx,   cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa57, "0x51", "0x800")
+AARCH64_CORE("qdf24xx",     qdf24xx,   cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, qdf24xx,   "0x51", "0x800")
 AARCH64_CORE("thunderx",    thunderx,  thunderx,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx,  "0x43", "0x0a1")
 AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
 
Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c	(revision 237800)
+++ config/aarch64/aarch64.c	(working copy)
@@ -250,6 +250,22 @@  static const struct cpu_addrcost_table xgene1_addr
   0, /* imm_offset  */
 };
 
+static const struct cpu_addrcost_table qdf24xx_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
 static const struct cpu_regmove_cost generic_regmove_cost =
 {
   1, /* GP2GP  */
@@ -308,6 +324,15 @@  static const struct cpu_regmove_cost xgene1_regmov
   2 /* FP2FP  */
 };
 
+static const struct cpu_regmove_cost qdf24xx_regmove_cost =
+{
+  2, /* GP2GP  */
+  /* Avoid the use of int<->fp moves for spilling.  */
+  6, /* GP2FP  */
+  6, /* FP2GP  */
+  4 /* FP2FP  */
+};
+
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost generic_vector_cost =
 {
@@ -647,6 +672,32 @@  static const struct tune_params xgene1_tunings =
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
 
+static const struct tune_params qdf24xx_tunings =
+{
+  &qdf24xx_extra_costs,
+  &qdf24xx_addrcost_table,
+  &qdf24xx_regmove_cost,
+  &generic_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  4, /* memmov_cost  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
+  16,	/* function_align.  */
+  8,	/* jump_align.  */
+  16,	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  64,	/* cache_line_size.  */
+  tune_params::AUTOPREFETCHER_STRONG,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE)		/* tune_flags.  */
+};
+
 /* Support for fine-grained override of the tuning structures.  */
 struct aarch64_tuning_override_function
 {
Index: config/arm/aarch-cost-tables.h
===================================================================
--- config/arm/aarch-cost-tables.h	(revision 237800)
+++ config/arm/aarch-cost-tables.h	(working copy)
@@ -537,4 +537,107 @@  const struct cpu_cost_table xgene1_extra_costs =
   }
 };
 
+const struct cpu_cost_table qdf24xx_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    0,                 /* shift_reg.  */
+    COSTS_N_INSNS (1), /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    0,                 /* log_shift.  */
+    0,                 /* log_shift_reg.  */
+    0,                 /* extend.  */
+    0,                 /* extend_arith.  */
+    0,                 /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,	               /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (2),       /* simple.  */
+      COSTS_N_INSNS (2),       /* flag_setting.  */
+      COSTS_N_INSNS (2),       /* extend.  */
+      COSTS_N_INSNS (2),       /* add.  */
+      COSTS_N_INSNS (2),       /* extend_add.  */
+      COSTS_N_INSNS (4)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (3),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (3),       /* extend_add.  */
+      COSTS_N_INSNS (9)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (2),         /* load.  */
+    COSTS_N_INSNS (2),         /* load_sign_extend.  */
+    COSTS_N_INSNS (2),         /* ldrd.  */
+    COSTS_N_INSNS (2),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (2),         /* loadf.  */
+    COSTS_N_INSNS (2),         /* loadd.  */
+    COSTS_N_INSNS (3),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    COSTS_N_INSNS (1),         /* store_unaligned.  */
+    COSTS_N_INSNS (1),         /* loadv.  */
+    COSTS_N_INSNS (1)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (6),      /* div.  */
+      COSTS_N_INSNS (5),       /* mult.  */
+      COSTS_N_INSNS (5),       /* mult_addsub. */
+      COSTS_N_INSNS (5),       /* fma.  */
+      COSTS_N_INSNS (3),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst. */
+      COSTS_N_INSNS (1),       /* neg.  */
+      COSTS_N_INSNS (2),       /* compare.  */
+      COSTS_N_INSNS (4),       /* widen.  */
+      COSTS_N_INSNS (4),       /* narrow.  */
+      COSTS_N_INSNS (4),       /* toint.  */
+      COSTS_N_INSNS (4),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (11),      /* div.  */
+      COSTS_N_INSNS (6),       /* mult.  */
+      COSTS_N_INSNS (6),       /* mult_addsub.  */
+      COSTS_N_INSNS (6),       /* fma.  */
+      COSTS_N_INSNS (3),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      COSTS_N_INSNS (1),       /* neg.  */
+      COSTS_N_INSNS (2),       /* compare.  */
+      COSTS_N_INSNS (4),       /* widen.  */
+      COSTS_N_INSNS (4),       /* narrow.  */
+      COSTS_N_INSNS (4),       /* toint.  */
+      COSTS_N_INSNS (4),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (1)  /* alu.  */
+  }
+};
+
 #endif /* GCC_AARCH_COST_TABLES_H */
Index: config/arm/arm-cores.def
===================================================================
--- config/arm/arm-cores.def	(revision 237800)
+++ config/arm/arm-cores.def	(working copy)
@@ -173,7 +173,7 @@  ARM_CORE("cortex-a57",	cortexa57, cortexa57,	8A,	A
 ARM_CORE("cortex-a72",	cortexa72, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("cortex-a73",	cortexa73, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a73)
 ARM_CORE("exynos-m1",	exynosm1,  exynosm1,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), exynosm1)
-ARM_CORE("qdf24xx",	qdf24xx,   cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
+ARM_CORE("qdf24xx",	qdf24xx,   cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), qdf24xx)
 ARM_CORE("xgene1",      xgene1,    xgene1,      8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8A),            xgene1)
 
 /* V8 big.LITTLE implementations */
Index: config/arm/arm.c
===================================================================
--- config/arm/arm.c	(revision 237800)
+++ config/arm/arm.c	(working copy)
@@ -2052,6 +2052,29 @@  const struct tune_params arm_xgene1_tune =
   tune_params::SCHED_AUTOPREF_OFF
 };
 
+const struct tune_params arm_qdf24xx_tune =
+{
+  arm_9e_rtx_costs,
+  &qdf24xx_extra_costs,
+  NULL,                                         /* Scheduler cost adjustment.  */
+  arm_default_branch_cost,
+  &arm_default_vec_cost,			/* Vectorizer costs.  */
+  1,						/* Constant limit.  */
+  2,						/* Max cond insns.  */
+  8,						/* Memset max inline.  */
+  4,						/* Issue rate.  */
+  ARM_PREFETCH_BENEFICIAL (0, -1, 64),
+  tune_params::PREF_CONST_POOL_FALSE,
+  tune_params::PREF_LDRD_TRUE,
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE,	/* Thumb.  */
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE,	/* ARM.  */
+  tune_params::DISPARAGE_FLAGS_ALL,
+  tune_params::PREF_NEON_64_FALSE,
+  tune_params::PREF_NEON_STRINGOPS_TRUE,
+  FUSE_OPS (tune_params::FUSE_MOVW_MOVT),
+  tune_params::SCHED_AUTOPREF_FULL
+};
+
 /* Branches can be dual-issued on Cortex-A5, so conditional execution is
    less appealing.  Set max_insns_skipped to a low value.  */
 
Index: testsuite/gcc.dg/asr_div1.c
===================================================================
--- testsuite/gcc.dg/asr_div1.c	(revision 237800)
+++ testsuite/gcc.dg/asr_div1.c	(working copy)
@@ -1,6 +1,7 @@ 
 /* Test division by const int generates only one shift.  */
 /* { dg-do run } */
 /* { dg-options "-O2 -fdump-rtl-combine-all" } */
+/* { dg-options "-O2 -fdump-rtl-combine-all -mtune=cortex-a53" { target aarch64*-*-* } } */
 /* { dg-require-effective-target int32plus } */
 
 extern void abort (void);