[AArch64,v2] Improve comparison with complex immediates followed by branch/cset

Message ID 56543105.9090104@arm.com
State New

Commit Message

Kyrylo Tkachov Nov. 24, 2015, 9:42 a.m. UTC
On 23/11/15 15:01, Kyrill Tkachov wrote:
>
> On 23/11/15 14:58, James Greenhalgh wrote:
>> On Mon, Nov 23, 2015 at 10:33:01AM +0000, Kyrill Tkachov wrote:
>>> On 12/11/15 12:05, James Greenhalgh wrote:
>>>> On Tue, Nov 03, 2015 at 03:43:24PM +0000, Kyrill Tkachov wrote:
>>>>> Hi all,
>>>>>
>>>>> Bootstrapped and tested on aarch64.
>>>>>
>>>>> Ok for trunk?
>>>> Comments in-line.
>>>>
>>> Here's an updated patch according to your comments.
>>> Sorry it took so long to respin it, had other things to deal with,
>>> with stage1 closing...
>>>
>>> I've indented the sample code sequences and used valid mnemonics.
>>> These patterns can only match during combine, so I'd expect them to
>>> always split during combine or immediately after, but I don't think
>>> that's a documented guarantee, so I've gated them on !reload_completed.
>>>
>>> I've used IN_RANGE in the predicates.md hunk and added scan-assembler
>>> checks in the tests.
>>>
>>> Is this ok?
>>>
>>> Thanks,
>>> Kyrill
>>>
>>> 2015-11-20  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>>
>>>      * config/aarch64/aarch64.md (*condjump): Rename to...
>>>      (condjump): ... This.
>>>      (*compare_condjump<mode>): New define_insn_and_split.
>>>      (*compare_cstore<mode>_insn): Likewise.
>>>      (*cstore<mode>_insn): Rename to...
>>>      (aarch64_cstore<mode>): ... This.
>>>      * config/aarch64/iterators.md (CMP): Handle ne code.
>>>      * config/aarch64/predicates.md (aarch64_imm24): New predicate.
>>>
>>> 2015-11-20  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>>
>>>      * gcc.target/aarch64/cmpimm_branch_1.c: New test.
>>>      * gcc.target/aarch64/cmpimm_cset_1.c: Likewise.
>>>
>>> commit bb44feed4e6beaae25d9bdffa45073dc61c65838
>>> Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
>>> Date:   Mon Sep 21 10:56:47 2015 +0100
>>>
>>>      [AArch64] Improve comparison with complex immediates
>>>
>>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>>> index 11f6387..3e57d08 100644
>>> --- a/gcc/config/aarch64/aarch64.md
>>> +++ b/gcc/config/aarch64/aarch64.md
>>> @@ -372,7 +372,7 @@ (define_expand "mod<mode>3"
>>>     }
>>>   )
>>>   -(define_insn "*condjump"
>>> +(define_insn "condjump"
>>>     [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
>>>                   [(match_operand 1 "cc_register" "") (const_int 0)])
>>>                  (label_ref (match_operand 2 "" ""))
>>> @@ -397,6 +397,41 @@ (define_insn "*condjump"
>>>                 (const_int 1)))]
>>>   )
>>>   +;; For a 24-bit immediate CST we can optimize the compare for equality
>>> +;; and branch sequence from:
>>> +;;     mov    x0, #imm1
>>> +;;     movk    x0, #imm2, lsl 16 /* x0 contains CST.  */
>>> +;;     cmp    x1, x0
>>> +;;     b<ne,eq> .Label
>>> +;; into the shorter:
>>> +;;     sub    x0, x0, #(CST & 0xfff000)
>>> +;;     subs    x0, x0, #(CST & 0x000fff)
>>
>>    sub x0, x1, #(CST....)  ?
>>
>> The transform doesn't make sense otherwise.
>
> Doh, yes. The source should be x1 of course.
>

Here's what I'll be committing.

Thanks,
Kyrill
2015-11-24  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * config/aarch64/aarch64.md (*condjump): Rename to...
     (condjump): ... This.
     (*compare_condjump<mode>): New define_insn_and_split.
     (*compare_cstore<mode>_insn): Likewise.
     (*cstore<mode>_insn): Rename to...
     (aarch64_cstore<mode>): ... This.
     * config/aarch64/iterators.md (CMP): Handle ne code.
     * config/aarch64/predicates.md (aarch64_imm24): New predicate.

2015-11-24  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * gcc.target/aarch64/cmpimm_branch_1.c: New test.
     * gcc.target/aarch64/cmpimm_cset_1.c: Likewise.


> Kyrill
>
>>
>>> +;;     b<ne,eq> .Label
>>> +(define_insn_and_split "*compare_condjump<mode>"
>>> +  [(set (pc) (if_then_else (EQL
>>> +                  (match_operand:GPI 0 "register_operand" "r")
>>> +                  (match_operand:GPI 1 "aarch64_imm24" "n"))
>>> +               (label_ref:P (match_operand 2 "" ""))
>>> +               (pc)))]
>>> +  "!aarch64_move_imm (INTVAL (operands[1]), <MODE>mode)
>>> +   && !aarch64_plus_operand (operands[1], <MODE>mode)
>>> +   && !reload_completed"
>>> +  "#"
>>> +  "&& true"
>>> +  [(const_int 0)]
>>> +  {
>>> +    HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff;
>>> +    HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000;
>>> +    rtx tmp = gen_reg_rtx (<MODE>mode);
>>> +    emit_insn (gen_add<mode>3 (tmp, operands[0], GEN_INT (-hi_imm)));
>>> +    emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
>>> +    rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
>>> +    rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx);
>>> +    emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2]));
>>> +    DONE;
>>> +  }
>>> +)
>>> +
>>>   (define_expand "casesi"
>>>     [(match_operand:SI 0 "register_operand" "")    ; Index
>>>      (match_operand:SI 1 "const_int_operand" "")    ; Lower bound
>>> @@ -2901,7 +2936,7 @@ (define_expand "cstore<mode>4"
>>>     "
>>>   )
>>>   -(define_insn "*cstore<mode>_insn"
>>> +(define_insn "aarch64_cstore<mode>"
>>>     [(set (match_operand:ALLI 0 "register_operand" "=r")
>>>       (match_operator:ALLI 1 "aarch64_comparison_operator"
>>>        [(match_operand 2 "cc_register" "") (const_int 0)]))]
>>> @@ -2910,6 +2945,40 @@ (define_insn "*cstore<mode>_insn"
>>>     [(set_attr "type" "csel")]
>>>   )
>>>   +;; For a 24-bit immediate CST we can optimize the compare for equality
>>> +;; and branch sequence from:
>>> +;;     mov    x0, #imm1
>>> +;;     movk    x0, #imm2, lsl 16 /* x0 contains CST.  */
>>> +;;     cmp    x1, x0
>>> +;;     cset    x2, <ne,eq>
>>> +;; into the shorter:
>>> +;;     sub    x0, x0, #(CST & 0xfff000)
>>> +;;     subs    x0, x0, #(CST & 0x000fff)
>>> +;;     cset x1, <ne, eq>.
>> Please fix the register allocation in your shorter sequence, these
>> are not equivalent.
>>
>>> +(define_insn_and_split "*compare_cstore<mode>_insn"
>>> +  [(set (match_operand:GPI 0 "register_operand" "=r")
>>> +     (EQL:GPI (match_operand:GPI 1 "register_operand" "r")
>>> +          (match_operand:GPI 2 "aarch64_imm24" "n")))]
>>> +  "!aarch64_move_imm (INTVAL (operands[2]), <MODE>mode)
>>> +   && !aarch64_plus_operand (operands[2], <MODE>mode)
>>> +   && !reload_completed"
>>> +  "#"
>>> +  "&& true"
>>> +  [(const_int 0)]
>>> +  {
>>> +    HOST_WIDE_INT lo_imm = UINTVAL (operands[2]) & 0xfff;
>>> +    HOST_WIDE_INT hi_imm = UINTVAL (operands[2]) & 0xfff000;
>>> +    rtx tmp = gen_reg_rtx (<MODE>mode);
>>> +    emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (-hi_imm)));
>>> +    emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
>>> +    rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
>>> +    rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx);
>>> +    emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp_rtx, cc_reg));
>>> +    DONE;
>>> +  }
>>> +  [(set_attr "type" "csel")]
>>> +)
>>> +
>>>   ;; zero_extend version of the above
>>>   (define_insn "*cstoresi_insn_uxtw"
>>>     [(set (match_operand:DI 0 "register_operand" "=r")
>>> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>>> index c2eb7de..422bc87 100644
>>> --- a/gcc/config/aarch64/iterators.md
>>> +++ b/gcc/config/aarch64/iterators.md
>>> @@ -824,7 +824,8 @@ (define_code_attr cmp_2   [(lt "1") (le "1") (eq "2") (ge "2") (gt "2")
>>>                  (ltu "1") (leu "1") (geu "2") (gtu "2")])
>>>     (define_code_attr CMP [(lt "LT") (le "LE") (eq "EQ") (ge "GE") (gt "GT")
>>> -               (ltu "LTU") (leu "LEU") (geu "GEU") (gtu "GTU")])
>>> +            (ltu "LTU") (leu "LEU") (ne "NE") (geu "GEU")
>>> +            (gtu "GTU")])
>>>     (define_code_attr fix_trunc_optab [(fix "fix_trunc")
>>>                      (unsigned_fix "fixuns_trunc")])
>>> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
>>> index e7f76e0..c0c3ff5 100644
>>> --- a/gcc/config/aarch64/predicates.md
>>> +++ b/gcc/config/aarch64/predicates.md
>>> @@ -145,6 +145,11 @@ (define_predicate "aarch64_imm3"
>>>     (and (match_code "const_int")
>>>          (match_test "(unsigned HOST_WIDE_INT) INTVAL (op) <= 4")))
>>>   +;; An immediate that fits into 24 bits.
>>> +(define_predicate "aarch64_imm24"
>>> +  (and (match_code "const_int")
>>> +       (match_test "IN_RANGE (UINTVAL (op), 0, 0xffffff)")))
>>> +
>>>   (define_predicate "aarch64_pwr_imm3"
>>>     (and (match_code "const_int")
>>>          (match_test "INTVAL (op) != 0
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c
>>> new file mode 100644
>>> index 0000000..7ad736b
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c
>>> @@ -0,0 +1,26 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-save-temps -O2" } */
>>> +
>>> +/* Test that we emit a sub+subs sequence rather than mov+movk+cmp.  */
>>> +
>>> +void g (void);
>>> +void
>>> +foo (int x)
>>> +{
>>> +  if (x != 0x123456)
>>> +    g ();
>>> +}
>>> +
>>> +void
>>> +fool (long long x)
>>> +{
>>> +  if (x != 0x123456)
>>> +    g ();
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler-not "cmp\tw\[0-9\]*.*" } } */
>>> +/* { dg-final { scan-assembler-not "cmp\tx\[0-9\]*.*" } } */
>>> +/* { dg-final { scan-assembler-times "sub\tw\[0-9\]+.*" 1 } } */
>>> +/* { dg-final { scan-assembler-times "sub\tx\[0-9\]+.*" 1 } } */
>>> +/* { dg-final { scan-assembler-times "subs\tw\[0-9\]+.*" 1 } } */
>>> +/* { dg-final { scan-assembler-times "subs\tx\[0-9\]+.*" 1 } } */
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c
>>> new file mode 100644
>>> index 0000000..6a03cc9
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c
>>> @@ -0,0 +1,23 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-save-temps -O2" } */
>>> +
>>> +/* Test that we emit a sub+subs sequence rather than mov+movk+cmp.  */
>>> +
>>> +int
>>> +foo (int x)
>>> +{
>>> +  return x == 0x123456;
>>> +}
>>> +
>>> +long
>>> +fool (long x)
>>> +{
>>> +  return x == 0x123456;
>>> +}
>>> +
>> This test will be broken for ILP32. This should be long long.
>>
>> OK with those comments fixed.
>
> Thanks, I'll prepare an updated version.
>
> Kyrill
>
>> Thanks,
>> James
>>
>
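
A note on why the transform is correct and why it is restricted to equality
comparisons: for any constant CST accepted by the new aarch64_imm24 predicate
(0 <= CST <= 0xffffff), both halves CST & 0xfff000 and CST & 0xfff fit
AArch64's 12-bit (optionally LSL #12-shifted) arithmetic immediates, and
(x - hi) - lo wraps to zero exactly when x == CST.  Only the Z flag of the
final subs is consumed, so only EQ/NE survive the rewrite; other conditions
would also need the C and V flags, which the two-step subtraction does not
reproduce.  A minimal C sketch of that reasoning (function names here are
illustrative, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Mirrors the new predicate's test:
   IN_RANGE (UINTVAL (op), 0, 0xffffff).  */
static int
imm24_p (uint64_t cst)
{
  return cst <= 0xffffff;
}

/* The sub+subs decomposition.  The final subtraction's result is what
   the subs instruction exposes in the Z flag.  */
static int
equals_cst (uint64_t x, uint64_t cst)
{
  uint64_t hi = cst & 0xfff000;  /* sub  x0, x1, #hi  (hi is imm12 << 12) */
  uint64_t lo = cst & 0x000fff;  /* subs x0, x0, #lo  (lo is imm12)       */
  return (x - hi) - lo == 0;
}

int
main (void)
{
  uint64_t cst = 0x123456;       /* The constant used in the new tests.  */
  assert (imm24_p (cst));
  assert (equals_cst (cst, cst));
  assert (!equals_cst (cst + 1, cst));
  assert (!equals_cst (0, cst));
  return 0;
}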

Patch

commit 30cc3774824ba7fd372111a223ade075ad7c49cc
Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date:   Mon Sep 21 10:56:47 2015 +0100

    [AArch64] Improve comparison with complex immediates

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 11f6387..3283cb2 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -372,7 +372,7 @@  (define_expand "mod<mode>3"
   }
 )
 
-(define_insn "*condjump"
+(define_insn "condjump"
   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
 			    [(match_operand 1 "cc_register" "") (const_int 0)])
 			   (label_ref (match_operand 2 "" ""))
@@ -397,6 +397,41 @@  (define_insn "*condjump"
 		      (const_int 1)))]
 )
 
+;; For a 24-bit immediate CST we can optimize the compare for equality
+;; and branch sequence from:
+;; 	mov	x0, #imm1
+;; 	movk	x0, #imm2, lsl 16 /* x0 contains CST.  */
+;; 	cmp	x1, x0
+;; 	b<ne,eq> .Label
+;; into the shorter:
+;; 	sub	x0, x1, #(CST & 0xfff000)
+;; 	subs	x0, x0, #(CST & 0x000fff)
+;; 	b<ne,eq> .Label
+(define_insn_and_split "*compare_condjump<mode>"
+  [(set (pc) (if_then_else (EQL
+			      (match_operand:GPI 0 "register_operand" "r")
+			      (match_operand:GPI 1 "aarch64_imm24" "n"))
+			   (label_ref:P (match_operand 2 "" ""))
+			   (pc)))]
+  "!aarch64_move_imm (INTVAL (operands[1]), <MODE>mode)
+   && !aarch64_plus_operand (operands[1], <MODE>mode)
+   && !reload_completed"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff;
+    HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000;
+    rtx tmp = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_add<mode>3 (tmp, operands[0], GEN_INT (-hi_imm)));
+    emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
+    rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
+    rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx);
+    emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2]));
+    DONE;
+  }
+)
+
 (define_expand "casesi"
   [(match_operand:SI 0 "register_operand" "")	; Index
    (match_operand:SI 1 "const_int_operand" "")	; Lower bound
@@ -2901,7 +2936,7 @@  (define_expand "cstore<mode>4"
   "
 )
 
-(define_insn "*cstore<mode>_insn"
+(define_insn "aarch64_cstore<mode>"
   [(set (match_operand:ALLI 0 "register_operand" "=r")
 	(match_operator:ALLI 1 "aarch64_comparison_operator"
 	 [(match_operand 2 "cc_register" "") (const_int 0)]))]
@@ -2910,6 +2945,40 @@  (define_insn "*cstore<mode>_insn"
   [(set_attr "type" "csel")]
 )
 
+;; For a 24-bit immediate CST we can optimize the compare for equality
+;; and branch sequence from:
+;; 	mov	x0, #imm1
+;; 	movk	x0, #imm2, lsl 16 /* x0 contains CST.  */
+;; 	cmp	x1, x0
+;; 	cset	x2, <ne,eq>
+;; into the shorter:
+;; 	sub	x0, x1, #(CST & 0xfff000)
+;; 	subs	x0, x0, #(CST & 0x000fff)
+;; 	cset x2, <ne, eq>.
+(define_insn_and_split "*compare_cstore<mode>_insn"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	 (EQL:GPI (match_operand:GPI 1 "register_operand" "r")
+		  (match_operand:GPI 2 "aarch64_imm24" "n")))]
+  "!aarch64_move_imm (INTVAL (operands[2]), <MODE>mode)
+   && !aarch64_plus_operand (operands[2], <MODE>mode)
+   && !reload_completed"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    HOST_WIDE_INT lo_imm = UINTVAL (operands[2]) & 0xfff;
+    HOST_WIDE_INT hi_imm = UINTVAL (operands[2]) & 0xfff000;
+    rtx tmp = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (-hi_imm)));
+    emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
+    rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
+    rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx);
+    emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp_rtx, cc_reg));
+    DONE;
+  }
+  [(set_attr "type" "csel")]
+)
+
 ;; zero_extend version of the above
 (define_insn "*cstoresi_insn_uxtw"
   [(set (match_operand:DI 0 "register_operand" "=r")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index c2eb7de..422bc87 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -824,7 +824,8 @@  (define_code_attr cmp_2   [(lt "1") (le "1") (eq "2") (ge "2") (gt "2")
 			   (ltu "1") (leu "1") (geu "2") (gtu "2")])
 
 (define_code_attr CMP [(lt "LT") (le "LE") (eq "EQ") (ge "GE") (gt "GT")
-			   (ltu "LTU") (leu "LEU") (geu "GEU") (gtu "GTU")])
+			(ltu "LTU") (leu "LEU") (ne "NE") (geu "GEU")
+			(gtu "GTU")])
 
 (define_code_attr fix_trunc_optab [(fix "fix_trunc")
 				   (unsigned_fix "fixuns_trunc")])
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index e7f76e0..c0c3ff5 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -145,6 +145,11 @@  (define_predicate "aarch64_imm3"
   (and (match_code "const_int")
        (match_test "(unsigned HOST_WIDE_INT) INTVAL (op) <= 4")))
 
+;; An immediate that fits into 24 bits.
+(define_predicate "aarch64_imm24"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (UINTVAL (op), 0, 0xffffff)")))
+
 (define_predicate "aarch64_pwr_imm3"
   (and (match_code "const_int")
        (match_test "INTVAL (op) != 0
diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c
new file mode 100644
index 0000000..7ad736b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c
@@ -0,0 +1,26 @@ 
+/* { dg-do compile } */
+/* { dg-options "-save-temps -O2" } */
+
+/* Test that we emit a sub+subs sequence rather than mov+movk+cmp.  */
+
+void g (void);
+void
+foo (int x)
+{
+  if (x != 0x123456)
+    g ();
+}
+
+void
+fool (long long x)
+{
+  if (x != 0x123456)
+    g ();
+}
+
+/* { dg-final { scan-assembler-not "cmp\tw\[0-9\]*.*" } } */
+/* { dg-final { scan-assembler-not "cmp\tx\[0-9\]*.*" } } */
+/* { dg-final { scan-assembler-times "sub\tw\[0-9\]+.*" 1 } } */
+/* { dg-final { scan-assembler-times "sub\tx\[0-9\]+.*" 1 } } */
+/* { dg-final { scan-assembler-times "subs\tw\[0-9\]+.*" 1 } } */
+/* { dg-final { scan-assembler-times "subs\tx\[0-9\]+.*" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c
new file mode 100644
index 0000000..f6fd69f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-save-temps -O2" } */
+
+/* Test that we emit a sub+subs sequence rather than mov+movk+cmp.  */
+
+int
+foo (int x)
+{
+  return x == 0x123456;
+}
+
+long long
+fool (long long x)
+{
+  return x == 0x123456;
+}
+
+/* { dg-final { scan-assembler-not "cmp\tw\[0-9\]*.*" } } */
+/* { dg-final { scan-assembler-not "cmp\tx\[0-9\]*.*" } } */
+/* { dg-final { scan-assembler-times "sub\tw\[0-9\]+.*" 1 } } */
+/* { dg-final { scan-assembler-times "sub\tx\[0-9\]+.*" 1 } } */
+/* { dg-final { scan-assembler-times "subs\tw\[0-9\]+.*" 1 } } */
+/* { dg-final { scan-assembler-times "subs\tx\[0-9\]+.*" 1 } } */