diff mbox

[AARCH64] add qdf24xx tuning structure

Message ID CABXYE2WSf-qOwNkoaSo=a8T7dvG2yrbj7aN+_qwNWd2TLjCHUw@mail.gmail.com
State Superseded
Headers show

Commit Message

Jim Wilson June 10, 2016, 10:48 p.m. UTC
This adds a tuning structure for qdf24xx.  This was tested with an
aarch64-linux bootstrap and a make check, with no regressions.  I also
tested it with an x86_64-linux C make check to verify that I didn't
break the testsuite for non aarch64 targets.

I had to change one testcase because it assumes that a divide by
constant will always be emitted as a multiply.  That actually depends
on the relative costs of multiply, shift, and divide instructions.  I
ended up with a divide instruction for my target, as it has reasonably
fast divide instructions.  I fixed it by adding a -mtune=cortex-a53
option for aarch64 to ensure that we always get the multiply insn.

Jim

Comments

James Greenhalgh June 13, 2016, 8:53 a.m. UTC | #1
On Fri, Jun 10, 2016 at 03:48:38PM -0700, Jim Wilson wrote:
> This adds a tuning structure for qdf24xx.  This was tested with an

> aarch64-linux bootstrap and a make check, with no regressions.  I also

> tested it with an x86_64-linux C make check to verify that I didn't

> break the testsuite for non aarch64 targets.

> 

> I had to change one testcase because it assumes that a divide by

> constant will always be emitted as a multiply.  That actually depends

> on the relative costs of multiply, shift, and divide instructions.  I

> ended up with a divide instruction for my target, as it has reasonably

> fast divide instructions.  I fixed it by adding a -mtune=cortex-a53

> option for aarch64 to ensure that we always get the multiply insn.

> 

> Index: config/arm/aarch-cost-tables.h

> ===================================================================

> --- config/arm/aarch-cost-tables.h	(revision 237273)

> +++ config/arm/aarch-cost-tables.h	(working copy)

> @@ -537,4 +537,107 @@ const struct cpu_cost_table xgene1_extra_costs =

>    }

>  };

>  

> +const struct cpu_cost_table qdf24xx_extra_costs =

> +{


<...snip...>

> +  {

> +    /* FP SFmode */

> +    {

> +      COSTS_N_INSNS (6),      /* div.  */

> +      COSTS_N_INSNS (5),       /* mult.  */

> +      COSTS_N_INSNS (5),       /* mult_addsub. */

> +      COSTS_N_INSNS (5),       /* fma.  */

> +      COSTS_N_INSNS (3),       /* addsub.  */

> +      COSTS_N_INSNS (1),       /* fpconst. */

> +      COSTS_N_INSNS (1),       /* neg.  */

> +      COSTS_N_INSNS (2),       /* compare.  */

> +      COSTS_N_INSNS (4),       /* widen.  */

> +      COSTS_N_INSNS (4),       /* narrow.  */

> +      COSTS_N_INSNS (4),       /* toint.  */

> +      COSTS_N_INSNS (4),       /* fromint.  */

> +      COSTS_N_INSNS (2)        /* roundint.  */

> +    },

> +    /* FP DFmode */

> +    {

> +      COSTS_N_INSNS (11),      /* div.  */

> +      COSTS_N_INSNS (6),       /* mult.  */

> +      COSTS_N_INSNS (6),       /* mult_addsub.  */

> +      COSTS_N_INSNS (6),       /* fma.  */

> +      COSTS_N_INSNS (3),       /* addsub.  */

> +      COSTS_N_INSNS (1),       /* fpconst.  */

> +      COSTS_N_INSNS (1),       /* neg.  */

> +      COSTS_N_INSNS (2),       /* compare.  */

> +      COSTS_N_INSNS (4),       /* widen.  */

> +      COSTS_N_INSNS (4),       /* narrow.  */

> +      COSTS_N_INSNS (4),       /* toint.  */

> +      COSTS_N_INSNS (4),       /* fromint.  */

> +      COSTS_N_INSNS (2)        /* roundint.  */

> +    }

> +  },


Have you seen my recent patch for Cortex-A57 that changes the costs there
to be relative to the cost of a floating-point register to floating-point
register move [1]? I found that gave me a number of
improvements due to comparisons in the compiler that assume a move in a
mode is cheap, and other costs will be defined relative to it.

Did you consider that for the qdf24xx costs?

Otherwise, the AArch64 parts look good to me, but you'll want to wait for
an ARM OK too.

Thanks,
James

[1]: https://gcc.gnu.org/ml/gcc-patches/2016-06/msg00251.html
Jim Wilson June 28, 2016, 12:22 a.m. UTC | #2
On Mon, Jun 13, 2016 at 1:53 AM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
> On Fri, Jun 10, 2016 at 03:48:38PM -0700, Jim Wilson wrote:

>> This adds a tuning structure for qdf24xx.  This was tested with an


> Have you seen my recent patch for Cortex-A57 that changes the costs there

> to be relative to the cost of a floating-point register to floating-point

> register move [1]? I found that gave me a number of

> improvements due to comparisons in the compiler that assume a move in a

> mode is cheap, and other costs will be defined relative to it.

>

> Did you consider that for the qdf24xx costs?


No.  We haven't considered that.  However, we have plans to revist
some of this tuning work when we get updated docs, so we can take a
look at this then.

Jim
diff mbox

Patch

	gcc/
	* config/aarch64/aarch64-cores.def (qdf24xx): Use qdf24xx tuning.
	* config/aarch64/aarch64.c (qdf24xx_addrcost_table,
	qdf24xx_regmove_cost, qdf24xx_tunings): New.
	* config/arm/aarch64-cost-tables.h (qdf24xx_extra_costs): New.
	* config/arm/arm-cores.def (qdf24xx): Use qdf24xx tuning.
	* config/arm/arm.c (arm_qdf24xx_tune): New.

	gcc/testsuite/
	* gcc.dg/asr_div1.c: Add aarch64 specific dg-options.

Index: config/aarch64/aarch64-cores.def
===================================================================
--- config/aarch64/aarch64-cores.def	(revision 237273)
+++ config/aarch64/aarch64-cores.def	(working copy)
@@ -45,7 +45,7 @@  AARCH64_CORE("cortex-a53",  cortexa53, cortexa53,
 AARCH64_CORE("cortex-a57",  cortexa57, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07")
 AARCH64_CORE("cortex-a72",  cortexa72, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08")
 AARCH64_CORE("exynos-m1",   exynosm1,  exynosm1,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, exynosm1,  "0x53", "0x001")
-AARCH64_CORE("qdf24xx",     qdf24xx,   cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa57, "0x51", "0x800")
+AARCH64_CORE("qdf24xx",     qdf24xx,   cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, qdf24xx,   "0x51", "0x800")
 AARCH64_CORE("thunderx",    thunderx,  thunderx,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx,  "0x43", "0x0a1")
 AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
 
Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c	(revision 237273)
+++ config/aarch64/aarch64.c	(working copy)
@@ -250,6 +250,22 @@  static const struct cpu_addrcost_table xgene1_addr
   0, /* imm_offset  */
 };
 
+static const struct cpu_addrcost_table qdf24xx_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
 static const struct cpu_regmove_cost generic_regmove_cost =
 {
   1, /* GP2GP  */
@@ -308,6 +324,15 @@  static const struct cpu_regmove_cost xgene1_regmov
   2 /* FP2FP  */
 };
 
+static const struct cpu_regmove_cost qdf24xx_regmove_cost =
+{
+  2, /* GP2GP  */
+  /* Avoid the use of int<->fp moves for spilling.  */
+  6, /* GP2FP  */
+  6, /* FP2GP  */
+  4 /* FP2FP  */
+};
+
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost generic_vector_cost =
 {
@@ -589,6 +614,31 @@  static const struct tune_params xgene1_tunings =
   (AARCH64_EXTRA_TUNE_APPROX_RSQRT)	/* tune_flags.  */
 };
 
+static const struct tune_params qdf24xx_tunings =
+{
+  &qdf24xx_extra_costs,
+  &qdf24xx_addrcost_table,
+  &qdf24xx_regmove_cost,
+  &generic_vector_cost,
+  &generic_branch_cost,
+  4, /* memmov_cost  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
+  16,	/* function_align.  */
+  8,	/* jump_align.  */
+  16,	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  64,	/* cache_line_size.  */
+  tune_params::AUTOPREFETCHER_STRONG,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE)		/* tune_flags.  */
+};
+
 /* Support for fine-grained override of the tuning structures.  */
 struct aarch64_tuning_override_function
 {
Index: config/arm/aarch-cost-tables.h
===================================================================
--- config/arm/aarch-cost-tables.h	(revision 237273)
+++ config/arm/aarch-cost-tables.h	(working copy)
@@ -537,4 +537,107 @@  const struct cpu_cost_table xgene1_extra_costs =
   }
 };
 
+const struct cpu_cost_table qdf24xx_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    0,                 /* shift_reg.  */
+    COSTS_N_INSNS (1), /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    0,                 /* log_shift.  */
+    0,                 /* log_shift_reg.  */
+    0,                 /* extend.  */
+    0,                 /* extend_arith.  */
+    0,                 /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,	               /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (2),       /* simple.  */
+      COSTS_N_INSNS (2),       /* flag_setting.  */
+      COSTS_N_INSNS (2),       /* extend.  */
+      COSTS_N_INSNS (2),       /* add.  */
+      COSTS_N_INSNS (2),       /* extend_add.  */
+      COSTS_N_INSNS (4)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (3),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (3),       /* extend_add.  */
+      COSTS_N_INSNS (9)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (2),         /* load.  */
+    COSTS_N_INSNS (2),         /* load_sign_extend.  */
+    COSTS_N_INSNS (2),         /* ldrd.  */
+    COSTS_N_INSNS (2),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (2),         /* loadf.  */
+    COSTS_N_INSNS (2),         /* loadd.  */
+    COSTS_N_INSNS (3),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    COSTS_N_INSNS (1),         /* store_unaligned.  */
+    COSTS_N_INSNS (1),         /* loadv.  */
+    COSTS_N_INSNS (1)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (6),      /* div.  */
+      COSTS_N_INSNS (5),       /* mult.  */
+      COSTS_N_INSNS (5),       /* mult_addsub. */
+      COSTS_N_INSNS (5),       /* fma.  */
+      COSTS_N_INSNS (3),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst. */
+      COSTS_N_INSNS (1),       /* neg.  */
+      COSTS_N_INSNS (2),       /* compare.  */
+      COSTS_N_INSNS (4),       /* widen.  */
+      COSTS_N_INSNS (4),       /* narrow.  */
+      COSTS_N_INSNS (4),       /* toint.  */
+      COSTS_N_INSNS (4),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (11),      /* div.  */
+      COSTS_N_INSNS (6),       /* mult.  */
+      COSTS_N_INSNS (6),       /* mult_addsub.  */
+      COSTS_N_INSNS (6),       /* fma.  */
+      COSTS_N_INSNS (3),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      COSTS_N_INSNS (1),       /* neg.  */
+      COSTS_N_INSNS (2),       /* compare.  */
+      COSTS_N_INSNS (4),       /* widen.  */
+      COSTS_N_INSNS (4),       /* narrow.  */
+      COSTS_N_INSNS (4),       /* toint.  */
+      COSTS_N_INSNS (4),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (1)  /* alu.  */
+  }
+};
+
 #endif /* GCC_AARCH_COST_TABLES_H */
Index: config/arm/arm-cores.def
===================================================================
--- config/arm/arm-cores.def	(revision 237273)
+++ config/arm/arm-cores.def	(working copy)
@@ -172,7 +172,7 @@  ARM_CORE("cortex-a53",	cortexa53, cortexa53,	8A,	A
 ARM_CORE("cortex-a57",	cortexa57, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("cortex-a72",	cortexa72, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("exynos-m1",	exynosm1,  exynosm1,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), exynosm1)
-ARM_CORE("qdf24xx",	qdf24xx,   cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
+ARM_CORE("qdf24xx",	qdf24xx,   cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), qdf24xx)
 ARM_CORE("xgene1",      xgene1,    xgene1,      8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8A),            xgene1)
 
 /* V8 big.LITTLE implementations */
Index: config/arm/arm.c
===================================================================
--- config/arm/arm.c	(revision 237273)
+++ config/arm/arm.c	(working copy)
@@ -2053,6 +2053,29 @@  const struct tune_params arm_xgene1_tune =
   tune_params::SCHED_AUTOPREF_OFF
 };
 
+const struct tune_params arm_qdf24xx_tune =
+{
+  arm_9e_rtx_costs,
+  &qdf24xx_extra_costs,
+  NULL,                                         /* Scheduler cost adjustment.  */
+  arm_default_branch_cost,
+  &arm_default_vec_cost,			/* Vectorizer costs.  */
+  1,						/* Constant limit.  */
+  2,						/* Max cond insns.  */
+  8,						/* Memset max inline.  */
+  4,						/* Issue rate.  */
+  ARM_PREFETCH_BENEFICIAL (0, -1, 64),
+  tune_params::PREF_CONST_POOL_FALSE,
+  tune_params::PREF_LDRD_TRUE,
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE,	/* Thumb.  */
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE,	/* ARM.  */
+  tune_params::DISPARAGE_FLAGS_ALL,
+  tune_params::PREF_NEON_64_FALSE,
+  tune_params::PREF_NEON_STRINGOPS_TRUE,
+  FUSE_OPS (tune_params::FUSE_MOVW_MOVT),
+  tune_params::SCHED_AUTOPREF_FULL
+};
+
 /* Branches can be dual-issued on Cortex-A5, so conditional execution is
    less appealing.  Set max_insns_skipped to a low value.  */
 
Index: testsuite/gcc.dg/asr_div1.c
===================================================================
--- testsuite/gcc.dg/asr_div1.c	(revision 237273)
+++ testsuite/gcc.dg/asr_div1.c	(working copy)
@@ -1,6 +1,7 @@ 
 /* Test division by const int generates only one shift.  */
 /* { dg-do run } */
 /* { dg-options "-O2 -fdump-rtl-combine-all" } */
+/* { dg-options "-O2 -fdump-rtl-combine-all -mtune=cortex-a53" { target aarch64*-*-* } } */
 /* { dg-require-effective-target int32plus } */
 
 extern void abort (void);