[02/nn] Add more vec_duplicate simplifications

Message ID	87o9oyxgdf.fsf@linaro.org
State	New
Headers	show Delivered-To: patch@linaro.org Received-SPF: pass (google.com: domain of gcc-patches-return-464728-patch=linaro.org@gcc.gnu.org designates 209.132.180.131 as permitted sender) client-ip=209.132.180.131; DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:from :to:subject:references:date:in-reply-to:message-id:mime-version :content-type; q=dns; s=default; b=TLV1363ZBEJwNWIcVwK59y98P+tX+ abe+DrT2L/5nGzm7k9qpKJcZk875RjYXREKh9cPQ2v7Kg/73C6RT1E2pFWPzf59r 0aMt4NTp8ukknMadPxbtZ0uZX1u+/J82a+SVm9ykwF30yQiJTjJJXYSOV4QWSgDM 1c06QHdBWoxF5M= Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org From: Richard Sandiford <richard.sandiford@linaro.org> To: gcc-patches@gcc.gnu.org Mail-Followup-To: gcc-patches@gcc.gnu.org, richard.sandiford@linaro.org Subject: [02/nn] Add more vec_duplicate simplifications References: <87wp3mxgir.fsf@linaro.org> Date: Mon, 23 Oct 2017 12:17:48 +0100 In-Reply-To: <87wp3mxgir.fsf@linaro.org> (Richard Sandiford's message of "Mon, 23 Oct 2017 12:14:36 +0100") Message-ID: <87o9oyxgdf.fsf@linaro.org> User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/25.2 (gnu/linux) MIME-Version: 1.0 Content-Type: text/plain
Series	[02/nn] Add more vec_duplicate simplifications \| expand [02/nn] Add more vec_duplicate simplifications

Message ID

87o9oyxgdf.fsf@linaro.org

State

New

Headers

Received-SPF: pass (google.com: domain of
	gcc-patches-return-464728-patch=linaro.org@gcc.gnu.org
	designates 209.132.180.131 as permitted sender)
	client-ip=209.132.180.131; 
DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id
	:list-unsubscribe:list-archive:list-post:list-help:sender:from
	:to:subject:references:date:in-reply-to:message-id:mime-version
	:content-type; q=dns; s=default; b=TLV1363ZBEJwNWIcVwK59y98P+tX+
	abe+DrT2L/5nGzm7k9qpKJcZk875RjYXREKh9cPQ2v7Kg/73C6RT1E2pFWPzf59r
	0aMt4NTp8ukknMadPxbtZ0uZX1u+/J82a+SVm9ykwF30yQiJTjJJXYSOV4QWSgDM
	1c06QHdBWoxF5M=
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
Sender: gcc-patches-owner@gcc.gnu.org
From: Richard Sandiford <richard.sandiford@linaro.org>
To: gcc-patches@gcc.gnu.org
Mail-Followup-To: gcc-patches@gcc.gnu.org, richard.sandiford@linaro.org
Subject: [02/nn] Add more vec_duplicate simplifications
References: <87wp3mxgir.fsf@linaro.org>
Date: Mon, 23 Oct 2017 12:17:48 +0100
In-Reply-To: <87wp3mxgir.fsf@linaro.org> (Richard Sandiford's message of
	"Mon, 23 Oct 2017 12:14:36 +0100")
Message-ID: <87o9oyxgdf.fsf@linaro.org>
User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/25.2 (gnu/linux)
MIME-Version: 1.0
Content-Type: text/plain

Series

[02/nn] Add more vec_duplicate simplifications | expand

Commit Message

Richard Sandiford Oct. 23, 2017, 11:17 a.m. UTC

This patch adds a vec_duplicate_p helper that tests for constant
or non-constant vector duplicates.  Together with the existing
const_vec_duplicate_p, this complements the gen_vec_duplicate
and gen_const_vec_duplicate added by a previous patch.

The patch uses the new routines to add more rtx simplifications
involving vector duplicates.  These mirror simplifications that
we already do for CONST_VECTOR broadcasts and are needed for
variable-length SVE, which uses:

  (const:M (vec_duplicate:M X))

to represent constant broadcasts instead.  The simplifications do
trigger on the testsuite for variable duplicates too, and in each
case I saw the change was an improvement.  E.g.:

- Several targets had this simplification in gcc.dg/pr49948.c
  when compiled at -O3:

    -Failed to match this instruction:
    +Successfully matched this instruction:
     (set (reg:DI 88)
    -    (subreg:DI (vec_duplicate:V2DI (reg/f:DI 75 [ _4 ])) 0))
    +    (reg/f:DI 75 [ _4 ]))

  On aarch64 this gives:

            ret
            .p2align 2
     .L8:
    +       adrp    x1, b
            sub     sp, sp, #80
    -       adrp    x2, b
    -       add     x1, sp, 12
    +       add     x2, sp, 12
            str     wzr, [x0, #:lo12:a]
    +       str     x2, [x1, #:lo12:b]
            mov     w0, 0
    -       dup     v0.2d, x1
    -       str     d0, [x2, #:lo12:b]
            add     sp, sp, 80
            ret
            .size   foo, .-foo

  On x86_64:

            jg      .L2
            leaq    -76(%rsp), %rax
            movl    $0, a(%rip)
    -       movq    %rax, -96(%rsp)
    -       movq    -96(%rsp), %xmm0
    -       punpcklqdq      %xmm0, %xmm0
    -       movq    %xmm0, b(%rip)
    +       movq    %rax, b(%rip)
     .L2:
            xorl    %eax, %eax
            ret

  etc.

- gcc.dg/torture/pr58018.c compiled at -O3 on aarch64 has an instance of:

     Trying 50, 52, 46 -> 53:
     Failed to match this instruction:
     (set (reg:V4SI 167)
    -    (and:V4SI (and:V4SI (vec_duplicate:V4SI (reg:SI 132 [ _165 ]))
    -            (reg:V4SI 209))
    -        (const_vector:V4SI [
    -                (const_int 1 [0x1])
    -                (const_int 1 [0x1])
    -                (const_int 1 [0x1])
    -                (const_int 1 [0x1])
    -            ])))
    +    (and:V4SI (vec_duplicate:V4SI (reg:SI 132 [ _165 ]))
    +        (reg:V4SI 209)))
     Successfully matched this instruction:
     (set (reg:V4SI 163 [ vect_patt_16.14 ])
         (vec_duplicate:V4SI (reg:SI 132 [ _165 ])))
    +Successfully matched this instruction:
    +(set (reg:V4SI 167)
    +    (and:V4SI (reg:V4SI 163 [ vect_patt_16.14 ])
    +        (reg:V4SI 209)))

  where (reg:SI 132) is the result of a scalar comparison and so
  is known to be 0 or 1.  This saves a MOVI and vector AND:

            cmp     w7, 4
            bls     .L15
            dup     v1.4s, w2
    -       lsr     w2, w1, 2
    +       dup     v2.4s, w6
            movi    v3.4s, 0
    -       mov     w0, 0
    -       movi    v2.4s, 0x1
    +       lsr     w2, w1, 2
            mvni    v0.4s, 0
    +       mov     w0, 0
            cmge    v1.4s, v1.4s, v3.4s
            and     v1.16b, v2.16b, v1.16b
    -       dup     v2.4s, w6
    -       and     v1.16b, v1.16b, v2.16b
            .p2align 3
     .L7:
            and     v0.16b, v0.16b, v1.16b

- powerpc64le has many instances of things like:

    -Failed to match this instruction:
    +Successfully matched this instruction:
     (set (reg:V4SI 161 [ vect_cst__24 ])
    -    (vec_select:V4SI (vec_duplicate:V4SI (vec_select:SI (reg:V4SI 143)
    -                (parallel [
    -                        (const_int 0 [0])
    -                    ])))
    -        (parallel [
    -                (const_int 2 [0x2])
    -                (const_int 3 [0x3])
    -                (const_int 0 [0])
    -                (const_int 1 [0x1])
    -            ])))
    +    (vec_duplicate:V4SI (vec_select:SI (reg:V4SI 143)
    +            (parallel [
    +                    (const_int 0 [0])
    +                ]))))

  This removes redundant XXPERMDIs from many tests.

The best way of testing the new simplifications seemed to be
via selftests.  The patch cribs part of David's patch here:
https://gcc.gnu.org/ml/gcc-patches/2016-07/msg00270.html .


2017-10-23  Richard Sandiford  <richard.sandiford@linaro.org>
	    David Malcolm  <dmalcolm@redhat.com>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* rtl.h (vec_duplicate_p): New function.
	* selftest-rtl.c (assert_rtx_eq_at): New function.
	* selftest-rtl.h (ASSERT_RTX_EQ): New macro.
	(assert_rtx_eq_at): Declare.
	* selftest.h (selftest::simplify_rtx_c_tests): Declare.
	* selftest-run-tests.c (selftest::run_tests): Call it.
	* simplify-rtx.c: Include selftest.h and selftest-rtl.h.
	(simplify_unary_operation_1): Recursively handle vector duplicates.
	(simplify_binary_operation_1): Likewise.  Handle VEC_SELECTs of
	vector duplicates.
	(simplify_subreg): Handle subregs of vector duplicates.
	(make_test_reg, test_vector_ops_duplicate, test_vector_ops)
	(selftest::simplify_rtx_c_tests): New functions.

Comments

Jeff Law Oct. 25, 2017, 4:29 p.m. UTC | #1

On 10/23/2017 05:17 AM, Richard Sandiford wrote:
> This patch adds a vec_duplicate_p helper that tests for constant

> or non-constant vector duplicates.  Together with the existing

> const_vec_duplicate_p, this complements the gen_vec_duplicate

> and gen_const_vec_duplicate added by a previous patch.

> 

> The patch uses the new routines to add more rtx simplifications

> involving vector duplicates.  These mirror simplifications that

> we already do for CONST_VECTOR broadcasts and are needed for

> variable-length SVE, which uses:

> 

>   (const:M (vec_duplicate:M X))

> 

> to represent constant broadcasts instead.  The simplifications do

> trigger on the testsuite for variable duplicates too, and in each

> case I saw the change was an improvement.  E.g.:

> 

[ snip ]

> 

> The best way of testing the new simplifications seemed to be

> via selftests.  The patch cribs part of David's patch here:

> https://gcc.gnu.org/ml/gcc-patches/2016-07/msg00270.html .

Cool.  I really wish I had more time to promote David's work by adding
selftests to various things.  There's certainly cases where it's the
most direct and useful way to test certain bits of lower level
infrastructure we have.  Glad to see you found it useful here.



> 

> 

> 2017-10-23  Richard Sandiford  <richard.sandiford@linaro.org>

> 	    David Malcolm  <dmalcolm@redhat.com>

> 	    Alan Hayward  <alan.hayward@arm.com>

> 	    David Sherwood  <david.sherwood@arm.com>

> 

> gcc/

> 	* rtl.h (vec_duplicate_p): New function.

> 	* selftest-rtl.c (assert_rtx_eq_at): New function.

> 	* selftest-rtl.h (ASSERT_RTX_EQ): New macro.

> 	(assert_rtx_eq_at): Declare.

> 	* selftest.h (selftest::simplify_rtx_c_tests): Declare.

> 	* selftest-run-tests.c (selftest::run_tests): Call it.

> 	* simplify-rtx.c: Include selftest.h and selftest-rtl.h.

> 	(simplify_unary_operation_1): Recursively handle vector duplicates.

> 	(simplify_binary_operation_1): Likewise.  Handle VEC_SELECTs of

> 	vector duplicates.

> 	(simplify_subreg): Handle subregs of vector duplicates.

> 	(make_test_reg, test_vector_ops_duplicate, test_vector_ops)

> 	(selftest::simplify_rtx_c_tests): New functions.

Thanks for the examples of how this affects various targets.  Seems like
it ought to be a consistent win when they trigger.

jeff

Christophe Lyon Nov. 10, 2017, 9:38 a.m. UTC | #2

On 25 October 2017 at 18:29, Jeff Law <law@redhat.com> wrote:
> On 10/23/2017 05:17 AM, Richard Sandiford wrote:

>> This patch adds a vec_duplicate_p helper that tests for constant

>> or non-constant vector duplicates.  Together with the existing

>> const_vec_duplicate_p, this complements the gen_vec_duplicate

>> and gen_const_vec_duplicate added by a previous patch.

>>

>> The patch uses the new routines to add more rtx simplifications

>> involving vector duplicates.  These mirror simplifications that

>> we already do for CONST_VECTOR broadcasts and are needed for

>> variable-length SVE, which uses:

>>

>>   (const:M (vec_duplicate:M X))

>>

>> to represent constant broadcasts instead.  The simplifications do

>> trigger on the testsuite for variable duplicates too, and in each

>> case I saw the change was an improvement.  E.g.:

>>

> [ snip ]

>

>>

>> The best way of testing the new simplifications seemed to be

>> via selftests.  The patch cribs part of David's patch here:

>> https://gcc.gnu.org/ml/gcc-patches/2016-07/msg00270.html .

> Cool.  I really wish I had more time to promote David's work by adding

> selftests to various things.  There's certainly cases where it's the

> most direct and useful way to test certain bits of lower level

> infrastructure we have.  Glad to see you found it useful here.

>

>

>

>>

>>

>> 2017-10-23  Richard Sandiford  <richard.sandiford@linaro.org>

>>           David Malcolm  <dmalcolm@redhat.com>

>>           Alan Hayward  <alan.hayward@arm.com>

>>           David Sherwood  <david.sherwood@arm.com>

>>

>> gcc/

>>       * rtl.h (vec_duplicate_p): New function.

>>       * selftest-rtl.c (assert_rtx_eq_at): New function.

>>       * selftest-rtl.h (ASSERT_RTX_EQ): New macro.

>>       (assert_rtx_eq_at): Declare.

>>       * selftest.h (selftest::simplify_rtx_c_tests): Declare.

>>       * selftest-run-tests.c (selftest::run_tests): Call it.

>>       * simplify-rtx.c: Include selftest.h and selftest-rtl.h.

>>       (simplify_unary_operation_1): Recursively handle vector duplicates.

>>       (simplify_binary_operation_1): Likewise.  Handle VEC_SELECTs of

>>       vector duplicates.

>>       (simplify_subreg): Handle subregs of vector duplicates.

>>       (make_test_reg, test_vector_ops_duplicate, test_vector_ops)

>>       (selftest::simplify_rtx_c_tests): New functions.


Hi Richard,

I've noticed that this patch (r254294) causes
FAIL: gcc.dg/vect/vect-126.c (internal compiler error)
FAIL: gcc.dg/vect/vect-126.c -flto -ffat-lto-objects (internal compiler error)
on arm* targets.
Sorry if this has been reported before, I've restarted validations
only recently,
so the process is still catching up.

gcc.log has this:
spawn -ignore SIGHUP
/aci-gcc-fsf/builds/gcc-fsf-gccsrc/obj-arm-none-linux-gnueabihf/gcc3/gcc/xgcc
-B/aci-gcc-fsf/builds/gcc-fsf-gccsrc/obj-arm-none-linux-gnueabihf/gcc3/gcc/
/gcc/testsuite/gcc.dg/vect/vect-126.c -fno-diagnostics-show-caret
-fdiagnostics-color=never -ffast-math -ftree-vectorize
-fno-vect-cost-model -fno-common -O2 -fdump-tree-vect-details -S -o
vect-126.s
during RTL pass: combine
/gcc/testsuite/gcc.dg/vect/vect-126.c: In function 'f5':
/gcc/testsuite/gcc.dg/vect/vect-126.c:53:1: internal compiler error:
in neon_valid_immediate, at config/arm/arm.c:11850
0xf3e6c8 neon_valid_immediate
        /gcc/config/arm/arm.c:11850
0xf3ea9a neon_immediate_valid_for_move(rtx_def*, machine_mode, rtx_def**, int*)
        /gcc/config/arm/arm.c:11968
0xf40a20 arm_rtx_costs_internal
        /gcc/config/arm/arm.c:10695
0xf40a20 arm_rtx_costs
        /gcc/config/arm/arm.c:10946
0xb113ef rtx_cost(rtx_def*, machine_mode, rtx_code, int, bool)
        /gcc/rtlanal.c:4187
0xb1169f set_src_cost
        /gcc/rtl.h:2700
0xb1169f pattern_cost(rtx_def*, bool)
        /gcc/rtlanal.c:5315
0x128bb3b combine_validate_cost
        /gcc/combine.c:893
0x128bb3b try_combine
        /gcc/combine.c:4113
0x12923d5 combine_instructions
        /gcc/combine.c:1452
0x12926ed rest_of_handle_combine
        /gcc/combine.c:14795
0x12926ed execute
        /gcc/combine.c:14840
Please submit a full bug report,


Thanks,

Christophe

> Thanks for the examples of how this affects various targets.  Seems like

> it ought to be a consistent win when they trigger.

>

> jeff

Index: gcc/rtl.h
===================================================================
--- gcc/rtl.h	2017-10-23 11:40:11.485292126 +0100
+++ gcc/rtl.h	2017-10-23 11:41:36.307050364 +0100
@@ -2772,6 +2772,21 @@  const_vec_duplicate_p (T x, T *elt)
   return false;
 }
 
+/* Return true if X is a vector with a duplicated element value, either
+   constant or nonconstant.  Store the duplicated element in *ELT if so.  */
+
+template <typename T>
+inline bool
+vec_duplicate_p (T x, T *elt)
+{
+  if (GET_CODE (x) == VEC_DUPLICATE)
+    {
+      *elt = XEXP (x, 0);
+      return true;
+    }
+  return const_vec_duplicate_p (x, elt);
+}
+
 /* If X is a vector constant with a duplicated element value, return that
    element value, otherwise return X.  */
 
Index: gcc/selftest-rtl.c
===================================================================
--- gcc/selftest-rtl.c	2017-10-23 11:40:11.485292126 +0100
+++ gcc/selftest-rtl.c	2017-10-23 11:41:36.307050364 +0100
@@ -35,6 +35,29 @@  Software Foundation; either version 3, o
 
 namespace selftest {
 
+/* Compare rtx EXPECTED and ACTUAL using rtx_equal_p, calling
+   ::selftest::pass if they are equal, aborting if they are non-equal.
+   LOC is the effective location of the assertion, MSG describes it.  */
+
+void
+assert_rtx_eq_at (const location &loc, const char *msg,
+		  rtx expected, rtx actual)
+{
+  if (rtx_equal_p (expected, actual))
+    ::selftest::pass (loc, msg);
+  else
+    {
+      fprintf (stderr, "%s:%i: %s: FAIL: %s\n", loc.m_file, loc.m_line,
+	       loc.m_function, msg);
+      fprintf (stderr, "  expected: ");
+      print_rtl (stderr, expected);
+      fprintf (stderr, "\n  actual: ");
+      print_rtl (stderr, actual);
+      fprintf (stderr, "\n");
+      abort ();
+    }
+}
+
 /* Compare rtx EXPECTED and ACTUAL by pointer equality, calling
    ::selftest::pass if they are equal, aborting if they are non-equal.
    LOC is the effective location of the assertion, MSG describes it.  */
Index: gcc/selftest-rtl.h
===================================================================
--- gcc/selftest-rtl.h	2017-10-23 11:40:11.485292126 +0100
+++ gcc/selftest-rtl.h	2017-10-23 11:41:36.307050364 +0100
@@ -47,6 +47,15 @@  #define ASSERT_RTL_DUMP_EQ_WITH_REUSE(EX
   assert_rtl_dump_eq (SELFTEST_LOCATION, (EXPECTED_DUMP), (RTX), \
 		      (REUSE_MANAGER))
 
+#define ASSERT_RTX_EQ(EXPECTED, ACTUAL) 				\
+  SELFTEST_BEGIN_STMT							\
+  const char *desc = "ASSERT_RTX_EQ (" #EXPECTED ", " #ACTUAL ")";	\
+  ::selftest::assert_rtx_eq_at (SELFTEST_LOCATION, desc, (EXPECTED),	\
+				(ACTUAL));				\
+  SELFTEST_END_STMT
+
+extern void assert_rtx_eq_at (const location &, const char *, rtx, rtx);
+
 /* Evaluate rtx EXPECTED and ACTUAL and compare them with ==
    (i.e. pointer equality), calling ::selftest::pass if they are
    equal, aborting if they are non-equal.  */
Index: gcc/selftest.h
===================================================================
--- gcc/selftest.h	2017-10-23 11:41:25.513859990 +0100
+++ gcc/selftest.h	2017-10-23 11:41:36.308050364 +0100
@@ -198,6 +198,7 @@  extern void tree_cfg_c_tests ();
 extern void vec_c_tests ();
 extern void wide_int_cc_tests ();
 extern void predict_c_tests ();
+extern void simplify_rtx_c_tests ();
 
 extern int num_passes;
 
Index: gcc/selftest-run-tests.c
===================================================================
--- gcc/selftest-run-tests.c	2017-10-23 11:41:25.872704926 +0100
+++ gcc/selftest-run-tests.c	2017-10-23 11:41:36.308050364 +0100
@@ -94,6 +94,7 @@  selftest::run_tests ()
 
   store_merging_c_tests ();
   predict_c_tests ();
+  simplify_rtx_c_tests ();
 
   /* Run any lang-specific selftests.  */
   lang_hooks.run_lang_selftests ();
Index: gcc/simplify-rtx.c
===================================================================
--- gcc/simplify-rtx.c	2017-10-23 11:41:32.370050264 +0100
+++ gcc/simplify-rtx.c	2017-10-23 11:41:36.309050364 +0100
@@ -33,6 +33,8 @@  Software Foundation; either version 3, o
 #include "diagnostic-core.h"
 #include "varasm.h"
 #include "flags.h"
+#include "selftest.h"
+#include "selftest-rtl.h"
 
 /* Simplification and canonicalization of RTL.  */
 
@@ -925,7 +927,7 @@  exact_int_to_float_conversion_p (const_r
 simplify_unary_operation_1 (enum rtx_code code, machine_mode mode, rtx op)
 {
   enum rtx_code reversed;
-  rtx temp;
+  rtx temp, elt;
   scalar_int_mode inner, int_mode, op_mode, op0_mode;
 
   switch (code)
@@ -1681,6 +1683,28 @@  simplify_unary_operation_1 (enum rtx_cod
       break;
     }
 
+  if (VECTOR_MODE_P (mode) && vec_duplicate_p (op, &elt))
+    {
+      /* Try applying the operator to ELT and see if that simplifies.
+	 We can duplicate the result if so.
+
+	 The reason we don't use simplify_gen_unary is that it isn't
+	 necessarily a win to convert things like:
+
+	   (neg:V (vec_duplicate:V (reg:S R)))
+
+	 to:
+
+	   (vec_duplicate:V (neg:S (reg:S R)))
+
+	 The first might be done entirely in vector registers while the
+	 second might need a move between register files.  */
+      temp = simplify_unary_operation (code, GET_MODE_INNER (mode),
+				       elt, GET_MODE_INNER (GET_MODE (op)));
+      if (temp)
+	return gen_vec_duplicate (mode, temp);
+    }
+
   return 0;
 }
 
@@ -2138,7 +2162,7 @@  simplify_binary_operation (enum rtx_code
 simplify_binary_operation_1 (enum rtx_code code, machine_mode mode,
 			     rtx op0, rtx op1, rtx trueop0, rtx trueop1)
 {
-  rtx tem, reversed, opleft, opright;
+  rtx tem, reversed, opleft, opright, elt0, elt1;
   HOST_WIDE_INT val;
   unsigned int width = GET_MODE_PRECISION (mode);
   scalar_int_mode int_mode, inner_mode;
@@ -3480,6 +3504,9 @@  simplify_binary_operation_1 (enum rtx_co
 	  gcc_assert (XVECLEN (trueop1, 0) == 1);
 	  gcc_assert (CONST_INT_P (XVECEXP (trueop1, 0, 0)));
 
+	  if (vec_duplicate_p (trueop0, &elt0))
+	    return elt0;
+
 	  if (GET_CODE (trueop0) == CONST_VECTOR)
 	    return CONST_VECTOR_ELT (trueop0, INTVAL (XVECEXP
 						      (trueop1, 0, 0)));
@@ -3562,9 +3589,6 @@  simplify_binary_operation_1 (enum rtx_co
 				    tmp_op, gen_rtx_PARALLEL (VOIDmode, vec));
 	      return tmp;
 	    }
-	  if (GET_CODE (trueop0) == VEC_DUPLICATE
-	      && GET_MODE (XEXP (trueop0, 0)) == mode)
-	    return XEXP (trueop0, 0);
 	}
       else
 	{
@@ -3573,6 +3597,11 @@  simplify_binary_operation_1 (enum rtx_co
 		      == GET_MODE_INNER (GET_MODE (trueop0)));
 	  gcc_assert (GET_CODE (trueop1) == PARALLEL);
 
+	  if (vec_duplicate_p (trueop0, &elt0))
+	    /* It doesn't matter which elements are selected by trueop1,
+	       because they are all the same.  */
+	    return gen_vec_duplicate (mode, elt0);
+
 	  if (GET_CODE (trueop0) == CONST_VECTOR)
 	    {
 	      int elt_size = GET_MODE_UNIT_SIZE (mode);
@@ -3873,6 +3902,32 @@  simplify_binary_operation_1 (enum rtx_co
       gcc_unreachable ();
     }
 
+  if (mode == GET_MODE (op0)
+      && mode == GET_MODE (op1)
+      && vec_duplicate_p (op0, &elt0)
+      && vec_duplicate_p (op1, &elt1))
+    {
+      /* Try applying the operator to ELT and see if that simplifies.
+	 We can duplicate the result if so.
+
+	 The reason we don't use simplify_gen_binary is that it isn't
+	 necessarily a win to convert things like:
+
+	   (plus:V (vec_duplicate:V (reg:S R1))
+		   (vec_duplicate:V (reg:S R2)))
+
+	 to:
+
+	   (vec_duplicate:V (plus:S (reg:S R1) (reg:S R2)))
+
+	 The first might be done entirely in vector registers while the
+	 second might need a move between register files.  */
+      tem = simplify_binary_operation (code, GET_MODE_INNER (mode),
+				       elt0, elt1);
+      if (tem)
+	return gen_vec_duplicate (mode, tem);
+    }
+
   return 0;
 }
 
@@ -6021,6 +6076,20 @@  simplify_subreg (machine_mode outermode,
   if (outermode == innermode && !byte)
     return op;
 
+  if (byte % GET_MODE_UNIT_SIZE (innermode) == 0)
+    {
+      rtx elt;
+
+      if (VECTOR_MODE_P (outermode)
+	  && GET_MODE_INNER (outermode) == GET_MODE_INNER (innermode)
+	  && vec_duplicate_p (op, &elt))
+	return gen_vec_duplicate (outermode, elt);
+
+      if (outermode == GET_MODE_INNER (innermode)
+	  && vec_duplicate_p (op, &elt))
+	return elt;
+    }
+
   if (CONST_SCALAR_INT_P (op)
       || CONST_DOUBLE_AS_FLOAT_P (op)
       || GET_CODE (op) == CONST_FIXED
@@ -6326,3 +6395,125 @@  simplify_rtx (const_rtx x)
     }
   return NULL;
 }
+
+#if CHECKING_P
+
+namespace selftest {
+
+/* Make a unique pseudo REG of mode MODE for use by selftests.  */
+
+static rtx
+make_test_reg (machine_mode mode)
+{
+  static int test_reg_num = LAST_VIRTUAL_REGISTER + 1;
+
+  return gen_rtx_REG (mode, test_reg_num++);
+}
+
+/* Test vector simplifications involving VEC_DUPLICATE in which the
+   operands and result have vector mode MODE.  SCALAR_REG is a pseudo
+   register that holds one element of MODE.  */
+
+static void
+test_vector_ops_duplicate (machine_mode mode, rtx scalar_reg)
+{
+  scalar_mode inner_mode = GET_MODE_INNER (mode);
+  rtx duplicate = gen_rtx_VEC_DUPLICATE (mode, scalar_reg);
+  unsigned int nunits = GET_MODE_NUNITS (mode);
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      /* Test some simple unary cases with VEC_DUPLICATE arguments.  */
+      rtx not_scalar_reg = gen_rtx_NOT (inner_mode, scalar_reg);
+      rtx duplicate_not = gen_rtx_VEC_DUPLICATE (mode, not_scalar_reg);
+      ASSERT_RTX_EQ (duplicate,
+		     simplify_unary_operation (NOT, mode,
+					       duplicate_not, mode));
+
+      rtx neg_scalar_reg = gen_rtx_NEG (inner_mode, scalar_reg);
+      rtx duplicate_neg = gen_rtx_VEC_DUPLICATE (mode, neg_scalar_reg);
+      ASSERT_RTX_EQ (duplicate,
+		     simplify_unary_operation (NEG, mode,
+					       duplicate_neg, mode));
+
+      /* Test some simple binary cases with VEC_DUPLICATE arguments.  */
+      ASSERT_RTX_EQ (duplicate,
+		     simplify_binary_operation (PLUS, mode, duplicate,
+						CONST0_RTX (mode)));
+
+      ASSERT_RTX_EQ (duplicate,
+		     simplify_binary_operation (MINUS, mode, duplicate,
+						CONST0_RTX (mode)));
+
+      ASSERT_RTX_PTR_EQ (CONST0_RTX (mode),
+			 simplify_binary_operation (MINUS, mode, duplicate,
+						    duplicate));
+    }
+
+  /* Test a scalar VEC_SELECT of a VEC_DUPLICATE.  */
+  rtx zero_par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+  ASSERT_RTX_PTR_EQ (scalar_reg,
+		     simplify_binary_operation (VEC_SELECT, inner_mode,
+						duplicate, zero_par));
+
+  /* And again with the final element.  */
+  rtx last_index = gen_int_mode (GET_MODE_NUNITS (mode) - 1, word_mode);
+  rtx last_par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, last_index));
+  ASSERT_RTX_PTR_EQ (scalar_reg,
+		     simplify_binary_operation (VEC_SELECT, inner_mode,
+						duplicate, last_par));
+
+  /* Test a scalar subreg of a VEC_DUPLICATE.  */
+  unsigned int offset = subreg_lowpart_offset (inner_mode, mode);
+  ASSERT_RTX_EQ (scalar_reg,
+		 simplify_gen_subreg (inner_mode, duplicate,
+				      mode, offset));
+
+  machine_mode narrower_mode;
+  if (nunits > 2
+      && mode_for_vector (inner_mode, 2).exists (&narrower_mode)
+      && VECTOR_MODE_P (narrower_mode))
+    {
+      /* Test VEC_SELECT of a vector.  */
+      rtx vec_par
+	= gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, const1_rtx, const0_rtx));
+      rtx narrower_duplicate
+	= gen_rtx_VEC_DUPLICATE (narrower_mode, scalar_reg);
+      ASSERT_RTX_EQ (narrower_duplicate,
+		     simplify_binary_operation (VEC_SELECT, narrower_mode,
+						duplicate, vec_par));
+
+      /* Test a vector subreg of a VEC_DUPLICATE.  */
+      unsigned int offset = subreg_lowpart_offset (narrower_mode, mode);
+      ASSERT_RTX_EQ (narrower_duplicate,
+		     simplify_gen_subreg (narrower_mode, duplicate,
+					  mode, offset));
+    }
+}
+
+/* Verify some simplifications involving vectors.  */
+
+static void
+test_vector_ops ()
+{
+  for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i)
+    {
+      machine_mode mode = (machine_mode) i;
+      if (VECTOR_MODE_P (mode))
+	{
+	  rtx scalar_reg = make_test_reg (GET_MODE_INNER (mode));
+	  test_vector_ops_duplicate (mode, scalar_reg);
+	}
+    }
+}
+
+/* Run all of the selftests within this file.  */
+
+void
+simplify_rtx_c_tests ()
+{
+  test_vector_ops ();
+}
+
+} // namespace selftest
+
+#endif /* CHECKING_P */

[02/nn] Add more vec_duplicate simplifications

Commit Message

Comments

Patch