diff mbox

[AArch64] Expand DImode constant stores to two SImode stores when profitable

Message ID 5818B33D.7040208@foss.arm.com
State New
Headers show

Commit Message

Kyrill Tkachov Nov. 1, 2016, 3:22 p.m. UTC
And here is the patch itself.


On 01/11/16 15:21, Kyrill Tkachov wrote:
>

> On 31/10/16 11:54, Kyrill Tkachov wrote:

>>

>> On 24/10/16 17:15, Andrew Pinski wrote:

>>> On Mon, Oct 24, 2016 at 7:27 AM, Kyrill Tkachov

>>> <kyrylo.tkachov@foss.arm.com> wrote:

>>>> Hi all,

>>>>

>>>> When storing a 64-bit immediate that has equal bottom and top halves we

>>>> currently

>>>> synthesize the repeating 32-bit pattern twice and perform a single X-store.

>>>> With this patch we synthesize the 32-bit pattern once into a W register and

>>>> store

>>>> that twice using an STP. This reduces codesize bloat from synthesising the

>>>> same

>>>> constant multiple times at the expense of converting a store to a

>>>> store-pair.

>>>> It will only trigger if we can save two or more instructions, so it will

>>>> only transform:

>>>>          mov     x1, 49370

>>>>          movk    x1, 0xc0da, lsl 32

>>>>          str     x1, [x0]

>>>>

>>>> into:

>>>>

>>>>          mov     w1, 49370

>>>>          stp     w1, w1, [x0]

>>>>

>>>> when optimising for -Os, whereas it will always transform a 4-insn synthesis

>>>> sequence into a two-insn sequence + STP (see comments in the patch).

>>>>

>>>> This patch triggers already but will trigger more with the store merging

>>>> pass

>>>> that I'm working on since that will generate more of these repeating 64-bit

>>>> constants.

>>>> This helps improve codegen on 456.hmmer where store merging can sometimes

>>>> create very

>>>> complex repeating constants and target-specific expand needs to break them

>>>> down.

>>>

>>> Doing STP might be worse on ThunderX 1 than the mov/movk.  Or this

>>> might cause an ICE with -mcpu=thunderx; I can't remember if the check

>>> for slow unaligned store pair word is with the pattern or not.

>>

>> I can't get it to ICE with -mcpu=thunderx.

>> The restriction is just on the STP forming code in the sched-fusion hooks AFAIK.

>>

>>> Basically the rule is

>>> 1) if 4 byte aligned, then it is better to do two str.

>>> 2) If 8 byte aligned, then doing stp is good

>>> 3) Otherwise it is better to do two str.

>>

>> Ok, then I'll make the function just emit two stores and depend on the sched-fusion

>> machinery to fuse them into an STP when appropriate since that has the logic that

>> takes thunderx into account.

>>

>

> Here it is.

> I've confirmed that it emits to STRs for 4 byte aligned stores when -mtune=thunderx

> and still generates STP for the other tunings, though now sched-fusion is responsible for

> merging them, which is ok by me.

>

> Bootstrapped and tested on aarch64.

> Ok for trunk?

>

> Thanks,

> Kyril

>

>

> 2016-11-01  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

>

>     * config/aarch64/aarch64.md (mov<mode>): Call

>     aarch64_split_dimode_const_store on DImode constant stores.

>     * config/aarch64/aarch64-protos.h (aarch64_split_dimode_const_store):

>     New prototype.

>     * config/aarch64/aarch64.c (aarch64_split_dimode_const_store): New

>     function.

>

> 2016-11-01  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

>

>     * gcc.target/aarch64/store_repeating_constant_1.c: New test.

>     * gcc.target/aarch64/store_repeating_constant_2.c: Likewise.

>

>>

>>

>>>

>>> Thanks,

>>> Andrew

>>>

>>>

>>>> Bootstrapped and tested on aarch64-none-linux-gnu.

>>>>

>>>> Ok for trunk?

>>>>

>>>> Thanks,

>>>> Kyrill

>>>>

>>>> 2016-10-24  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

>>>>

>>>>      * config/aarch64/aarch64.md (mov<mode>): Call

>>>>      aarch64_split_dimode_const_store on DImode constant stores.

>>>>      * config/aarch64/aarch64-protos.h (aarch64_split_dimode_const_store):

>>>>      New prototype.

>>>>      * config/aarch64/aarch64.c (aarch64_split_dimode_const_store): New

>>>>      function.

>>>>

>>>> 2016-10-24  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

>>>>

>>>>      * gcc.target/aarch64/store_repeating_constant_1.c: New test.

>>>>      * gcc.target/aarch64/store_repeating_constant_2.c: Likewise.

>>

>
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 4f4989d8b0da91096000d0b51736eaa5b0aa9474..b6ca3dfacb0dc88e5d688905d9d013263d4e8d7f 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -337,6 +337,7 @@  bool aarch64_simd_scalar_immediate_valid_for_move (rtx, machine_mode);
 bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
 bool aarch64_simd_valid_immediate (rtx, machine_mode, bool,
 				   struct simd_immediate_info *);
+bool aarch64_split_dimode_const_store (rtx, rtx);
 bool aarch64_symbolic_address_p (rtx);
 bool aarch64_uimm12_shift (HOST_WIDE_INT);
 bool aarch64_use_return_insn_p (void);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a5c72a65451db639a5a623d15ecc61ceb60d1707..ec77d1cb2a6c63ac1703efc75fc67946e7d24c8e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13178,6 +13178,63 @@  aarch64_expand_movmem (rtx *operands)
   return true;
 }
 
+/* Split a DImode store of a CONST_INT SRC to MEM DST as two
+   SImode stores.  Handle the case when the constant has identical
+   bottom and top halves.  This is beneficial when the two stores can be
+   merged into an STP and we avoid synthesising potentially expensive
+   immediates twice.  Return true if such a split is possible.  */
+
+bool
+aarch64_split_dimode_const_store (rtx dst, rtx src)
+{
+  rtx lo = gen_lowpart (SImode, src);
+  rtx hi = gen_highpart_mode (SImode, DImode, src);
+
+  bool size_p = optimize_function_for_size_p (cfun);
+
+  if (!rtx_equal_p (lo, hi))
+    return false;
+
+  unsigned int orig_cost
+    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
+  unsigned int lo_cost
+    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
+
+  /* We want to transform:
+     MOV	x1, 49370
+     MOVK	x1, 0x140, lsl 16
+     MOVK	x1, 0xc0da, lsl 32
+     MOVK	x1, 0x140, lsl 48
+     STR	x1, [x0]
+   into:
+     MOV	w1, 49370
+     MOVK	w1, 0x140, lsl 16
+     STP	w1, w1, [x0]
+   So we want to perform this only when we save two instructions
+   or more.  When optimizing for size, however, accept any code size
+   savings we can.  */
+  if (size_p && orig_cost <= lo_cost)
+    return false;
+
+  if (!size_p
+      && (orig_cost <= lo_cost + 1))
+    return false;
+
+  rtx mem_lo = adjust_address (dst, SImode, 0);
+  if (!aarch64_mem_pair_operand (mem_lo, SImode))
+    return false;
+
+  rtx tmp_reg = gen_reg_rtx (SImode);
+  aarch64_expand_mov_immediate (tmp_reg, lo);
+  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
+  /* Don't emit an explicit store pair as this may not be always profitable.
+     Let the sched-fusion logic decide whether to merge them.  */
+  emit_move_insn (mem_lo, tmp_reg);
+  emit_move_insn (mem_hi, tmp_reg);
+
+  return true;
+}
+
 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 8861ac18f4e33ada3bc6dde0e4667fd040b1c213..ec423eb4b048609daaf2f81b0ae874f2e4350f69 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1010,6 +1010,11 @@  (define_expand "mov<mode>"
 	(match_operand:GPI 1 "general_operand" ""))]
   ""
   "
+    if (MEM_P (operands[0]) && CONST_INT_P (operands[1])
+	&& <MODE>mode == DImode
+	&& aarch64_split_dimode_const_store (operands[0], operands[1]))
+      DONE;
+
     if (GET_CODE (operands[0]) == MEM && operands[1] != const0_rtx)
       operands[1] = force_reg (<MODE>mode, operands[1]);
 
diff --git a/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_1.c b/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..50d456834bcea10760f6cccecc89e3c21f53bb4c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_1.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+
+void
+foo (unsigned long long *a)
+{
+  a[0] = 0x0140c0da0140c0daULL;
+}
+
+/* { dg-final { scan-assembler-times "movk\\tw.*" 1 } } */
+/* { dg-final { scan-assembler-times "stp\tw\[0-9\]+, w\[0-9\]+.*" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_2.c b/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..c421277989adcf446ad8a7b3ab9060602c03a7ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_2.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+
+/* Check that for -Os we synthesize only the bottom half and then
+   store it twice with an STP rather than synthesizing it twice in each
+   half of an X-reg.  */
+
+void
+foo (unsigned long long *a)
+{
+  a[0] = 0xc0da0000c0daULL;
+}
+
+/* { dg-final { scan-assembler-times "mov\\tw.*" 1 } } */
+/* { dg-final { scan-assembler-times "stp\tw\[0-9\]+, w\[0-9\]+.*" 1 } } */