[SVE,fwprop] PR88833 - Redundant moves for WHILELO-based loops

Message ID CAAgBjMndm8upw++qigGPQNTA4nqm+Uck=CcVBpt3cZ74fEGEwA@mail.gmail.com
State New
Headers show
Series
  • [SVE,fwprop] PR88833 - Redundant moves for WHILELO-based loops
Related show

Commit Message

Prathamesh Kulkarni June 21, 2019, 9:51 a.m.
Hi,
The attached patch tries to fix PR88833.
For the following test-case:

subroutine foo(x)
  real :: x(100)
  x = x + 10
end subroutine foo

Assembly with -O3 -march=armv8.2-a+sve:

foo_:
.LFB0:
        .cfi_startproc
        mov     w2, 100
        mov     w3, w2
        mov     x1, 0
        whilelo p0.s, wzr, w2
        fmov    z1.s, #1.0e+1
        .p2align 3,,7
.L2:
        ld1w    z0.s, p0/z, [x0, x1, lsl 2]
        fadd    z0.s, z0.s, z1.s
        st1w    z0.s, p0, [x0, x1, lsl 2]
        incw    x1
        whilelo p0.s, w1, w3
        bne     .L2
        ret

As we can see, it generates extra mov w3, w2. Instead it could have reused w2 in
both whilelo's.

expand produces:
insn 7: reg:SI 97 = 100
insn 8: use (reg:SI 97)
insn 22: reg:SI 105 = 100
insn 23: use (reg:SI 105)

Both reg:SI 97 and reg:SI 105 have only single definitions (and also
single use).

cse2 then replaces 100 with reg:SI 97 in insn 22, which becomes:
insn 22: reg:SI 105 = reg:SI 97.

sched1 then reorders instructions, and insn 7 and insn 22 end up falling
in same basic block Looking at reload dump:

         Choosing alt 3 in insn 7:  (0) r  (1) M {*movsi_aarch64}
          alt=0,overall=0,losers=0,rld_nregs=0
         Choosing alt 0 in insn 2:  (0) =r  (1) r {*movdi_aarch64}
          alt=0,overall=0,losers=0,rld_nregs=0
         Choosing alt 0 in insn 22:  (0) =r  (1) r {*movsi_aarch64}
            1 Non-pseudo reload: reject+=2
            1 Non input pseudo reload: reject++
            Cycle danger: overall += LRA_MAX_REJECT
          alt=0,overall=609,losers=1,rld_nregs=1

which shows, it ends up taking extra register.

The issue here is that cse2 pass is leaving opportunities for
propagating register copies.
To address this, the patch makes following changes to fwprop.c:
(a) Add support for handling UNSPEC in propagate_rtx_1 in a similar
manner to simplify_replace_fn_rtx.
(b) Allow propagating def inside a loop if source of def is a register
in forward_propagate_into.
AFAIU, replacing register by another register shouldn't increase cost.
(c) Integrate fwprop and fwprop_addr, and make fwprop_addr propagate
register copies.

With the patch, fwprop_addr propagates reg:SI 97 in insn 23 and deletes insn 22,
which eliminates the redundant mov.

Does this patch look OK ?
Bootstrapped + tested on x86_64-unknown-linux-gnu and aarch64-linux-gnu.
Cross-testing with SVE in progress.

Thanks,
Prathamesh

Comments

Richard Sandiford June 24, 2019, 9:29 a.m. | #1
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> diff --git a/gcc/fwprop.c b/gcc/fwprop.c

> index 45703fe5f01..93a1a10c9a6 100644

> --- a/gcc/fwprop.c

> +++ b/gcc/fwprop.c

> @@ -547,6 +547,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags)

>  	  tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)),

>  				     SUBREG_BYTE (x));

>  	}

> +

> +      else

> +	{

> +	  rtvec vec;

> +	  rtvec newvec;

> +	  const char *fmt = GET_RTX_FORMAT (code);

> +	  rtx op;

> +

> +	  for (int i = 0; fmt[i]; i++)

> +	    switch (fmt[i])

> +	      {

> +	      case 'E':

> +		vec = XVEC (x, i);

> +		newvec = vec;

> +		for (int j = 0; j < GET_NUM_ELEM (vec); j++)

> +		  {

> +		    op = RTVEC_ELT (vec, j);

> +		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);

> +		    if (op != RTVEC_ELT (vec, j))

> +		      {

> +			if (newvec == vec)

> +			  {

> +			    newvec = shallow_copy_rtvec (vec);

> +			    if (!tem)

> +			      tem = shallow_copy_rtx (x);

> +			    XVEC (tem, i) = newvec;

> +			  }

> +			RTVEC_ELT (newvec, j) = op;

> +		      }

> +		  }

> +	      break;


Misindented break: should be indented two columns further.

> +

> +	      case 'e':

> +		if (XEXP (x, i))

> +		  {

> +		    op = XEXP (x, i);

> +		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);

> +		    if (op != XEXP (x, i))

> +		      {

> +			if (!tem)

> +			  tem = shallow_copy_rtx (x);

> +			XEXP (tem, i) = op;

> +		      }

> +		  }

> +	      break;


Same here.

> +	      }

> +	}

> +

>        break;

>  

>      case RTX_OBJ:

> @@ -1370,10 +1418,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set)

>  

>  /* Given a use USE of an insn, if it has a single reaching

>     definition, try to forward propagate it into that insn.

> -   Return true if cfg cleanup will be needed.  */

> +   Return true if cfg cleanup will be needed.

> +   REG_PROP_ONLY is true if we should only propagate register copies.  */

>  

>  static bool

> -forward_propagate_into (df_ref use)

> +forward_propagate_into (df_ref use, bool reg_prop_only = false)

>  {

>    df_ref def;

>    rtx_insn *def_insn, *use_insn;

> @@ -1394,10 +1443,6 @@ forward_propagate_into (df_ref use)

>    if (DF_REF_IS_ARTIFICIAL (def))

>      return false;

>  

> -  /* Do not propagate loop invariant definitions inside the loop.  */

> -  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father)

> -    return false;

> -

>    /* Check if the use is still present in the insn!  */

>    use_insn = DF_REF_INSN (use);

>    if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE)

> @@ -1415,6 +1460,16 @@ forward_propagate_into (df_ref use)

>    if (!def_set)

>      return false;

>  

> +  if (reg_prop_only && !REG_P (SET_SRC (def_set)))

> +    return false;

> +

> +  /* Allow propagating def inside loop only if source of def_set is

> +     reg, since replacing reg by source reg shouldn't increase cost.  */


Maybe:

  /* Allow propagations into a loop only for reg-to-reg copies, since
     replacing one register by another shouldn't increase the cost.  */

> +

> +  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father

> +      && !REG_P (SET_SRC (def_set)))

> +    return false;


To be extra safe, I think we should check that the SET_DEST is a REG_P
in both cases, to exclude REG-to-SUBREG copies.

Thanks,
Richard
Prathamesh Kulkarni June 24, 2019, 11:36 a.m. | #2
On Mon, 24 Jun 2019 at 14:59, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> > diff --git a/gcc/fwprop.c b/gcc/fwprop.c

> > index 45703fe5f01..93a1a10c9a6 100644

> > --- a/gcc/fwprop.c

> > +++ b/gcc/fwprop.c

> > @@ -547,6 +547,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags)

> >         tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)),

> >                                    SUBREG_BYTE (x));

> >       }

> > +

> > +      else

> > +     {

> > +       rtvec vec;

> > +       rtvec newvec;

> > +       const char *fmt = GET_RTX_FORMAT (code);

> > +       rtx op;

> > +

> > +       for (int i = 0; fmt[i]; i++)

> > +         switch (fmt[i])

> > +           {

> > +           case 'E':

> > +             vec = XVEC (x, i);

> > +             newvec = vec;

> > +             for (int j = 0; j < GET_NUM_ELEM (vec); j++)

> > +               {

> > +                 op = RTVEC_ELT (vec, j);

> > +                 valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);

> > +                 if (op != RTVEC_ELT (vec, j))

> > +                   {

> > +                     if (newvec == vec)

> > +                       {

> > +                         newvec = shallow_copy_rtvec (vec);

> > +                         if (!tem)

> > +                           tem = shallow_copy_rtx (x);

> > +                         XVEC (tem, i) = newvec;

> > +                       }

> > +                     RTVEC_ELT (newvec, j) = op;

> > +                   }

> > +               }

> > +           break;

>

> Misindented break: should be indented two columns further.

>

> > +

> > +           case 'e':

> > +             if (XEXP (x, i))

> > +               {

> > +                 op = XEXP (x, i);

> > +                 valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);

> > +                 if (op != XEXP (x, i))

> > +                   {

> > +                     if (!tem)

> > +                       tem = shallow_copy_rtx (x);

> > +                     XEXP (tem, i) = op;

> > +                   }

> > +               }

> > +           break;

>

> Same here.

>

> > +           }

> > +     }

> > +

> >        break;

> >

> >      case RTX_OBJ:

> > @@ -1370,10 +1418,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set)

> >

> >  /* Given a use USE of an insn, if it has a single reaching

> >     definition, try to forward propagate it into that insn.

> > -   Return true if cfg cleanup will be needed.  */

> > +   Return true if cfg cleanup will be needed.

> > +   REG_PROP_ONLY is true if we should only propagate register copies.  */

> >

> >  static bool

> > -forward_propagate_into (df_ref use)

> > +forward_propagate_into (df_ref use, bool reg_prop_only = false)

> >  {

> >    df_ref def;

> >    rtx_insn *def_insn, *use_insn;

> > @@ -1394,10 +1443,6 @@ forward_propagate_into (df_ref use)

> >    if (DF_REF_IS_ARTIFICIAL (def))

> >      return false;

> >

> > -  /* Do not propagate loop invariant definitions inside the loop.  */

> > -  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father)

> > -    return false;

> > -

> >    /* Check if the use is still present in the insn!  */

> >    use_insn = DF_REF_INSN (use);

> >    if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE)

> > @@ -1415,6 +1460,16 @@ forward_propagate_into (df_ref use)

> >    if (!def_set)

> >      return false;

> >

> > +  if (reg_prop_only && !REG_P (SET_SRC (def_set)))

> > +    return false;

> > +

> > +  /* Allow propagating def inside loop only if source of def_set is

> > +     reg, since replacing reg by source reg shouldn't increase cost.  */

>

> Maybe:

>

>   /* Allow propagations into a loop only for reg-to-reg copies, since

>      replacing one register by another shouldn't increase the cost.  */

>

> > +

> > +  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father

> > +      && !REG_P (SET_SRC (def_set)))

> > +    return false;

>

> To be extra safe, I think we should check that the SET_DEST is a REG_P

> in both cases, to exclude REG-to-SUBREG copies.

Thanks for the suggestions.
Does the attached version look OK ?

Thanks,
Prathamesh
>

> Thanks,

> Richard
diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index 45703fe5f01..c5abebb7832 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -547,6 +547,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags)
 	  tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)),
 				     SUBREG_BYTE (x));
 	}
+
+      else
+	{
+	  rtvec vec;
+	  rtvec newvec;
+	  const char *fmt = GET_RTX_FORMAT (code);
+	  rtx op;
+
+	  for (int i = 0; fmt[i]; i++)
+	    switch (fmt[i])
+	      {
+	      case 'E':
+		vec = XVEC (x, i);
+		newvec = vec;
+		for (int j = 0; j < GET_NUM_ELEM (vec); j++)
+		  {
+		    op = RTVEC_ELT (vec, j);
+		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+		    if (op != RTVEC_ELT (vec, j))
+		      {
+			if (newvec == vec)
+			  {
+			    newvec = shallow_copy_rtvec (vec);
+			    if (!tem)
+			      tem = shallow_copy_rtx (x);
+			    XVEC (tem, i) = newvec;
+			  }
+			RTVEC_ELT (newvec, j) = op;
+		      }
+		  }
+	        break;
+
+	      case 'e':
+		if (XEXP (x, i))
+		  {
+		    op = XEXP (x, i);
+		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+		    if (op != XEXP (x, i))
+		      {
+			if (!tem)
+			  tem = shallow_copy_rtx (x);
+			XEXP (tem, i) = op;
+		      }
+		  }
+	        break;
+	      }
+	}
+
       break;
 
     case RTX_OBJ:
@@ -1370,10 +1418,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set)
 
 /* Given a use USE of an insn, if it has a single reaching
    definition, try to forward propagate it into that insn.
-   Return true if cfg cleanup will be needed.  */
+   Return true if cfg cleanup will be needed.
+   REG_PROP_ONLY is true if we should only propagate register copies.  */
 
 static bool
-forward_propagate_into (df_ref use)
+forward_propagate_into (df_ref use, bool reg_prop_only = false)
 {
   df_ref def;
   rtx_insn *def_insn, *use_insn;
@@ -1394,10 +1443,6 @@ forward_propagate_into (df_ref use)
   if (DF_REF_IS_ARTIFICIAL (def))
     return false;
 
-  /* Do not propagate loop invariant definitions inside the loop.  */
-  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father)
-    return false;
-
   /* Check if the use is still present in the insn!  */
   use_insn = DF_REF_INSN (use);
   if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE)
@@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use)
   if (!def_set)
     return false;
 
+  if (reg_prop_only
+      && !REG_P (SET_SRC (def_set))
+      && !REG_P (SET_DEST (def_set)))
+    return false;
+
+  /* Allow propagations into a loop only for reg-to-reg copies, since
+     replacing one register by another shouldn't increase the cost.  */
+
+  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father
+      && !REG_P (SET_SRC (def_set))
+      && !REG_P (SET_DEST (def_set)))
+    return false;
+
   /* Only try one kind of propagation.  If two are possible, we'll
      do it on the following iterations.  */
   if (forward_propagate_and_simplify (use, def_insn, def_set)
@@ -1483,7 +1541,7 @@ gate_fwprop (void)
 }
 
 static unsigned int
-fwprop (void)
+fwprop (bool fwprop_addr_p)
 {
   unsigned i;
 
@@ -1502,11 +1560,16 @@ fwprop (void)
 
       df_ref use = DF_USES_GET (i);
       if (use)
-	if (DF_REF_TYPE (use) == DF_REF_REG_USE
-	    || DF_REF_BB (use)->loop_father == NULL
-	    /* The outer most loop is not really a loop.  */
-	    || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
-	  forward_propagate_into (use);
+	{
+	  if (DF_REF_TYPE (use) == DF_REF_REG_USE
+	      || DF_REF_BB (use)->loop_father == NULL
+	      /* The outer most loop is not really a loop.  */
+	      || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
+	    forward_propagate_into (use, fwprop_addr_p);
+
+	  else if (fwprop_addr_p)
+	    forward_propagate_into (use, false);
+	}
     }
 
   fwprop_done ();
@@ -1537,7 +1600,7 @@ public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop (); }
+  virtual unsigned int execute (function *) { return fwprop (false); }
 
 }; // class pass_rtl_fwprop
 
@@ -1549,33 +1612,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt)
   return new pass_rtl_fwprop (ctxt);
 }
 
-static unsigned int
-fwprop_addr (void)
-{
-  unsigned i;
-
-  fwprop_init ();
-
-  /* Go through all the uses.  df_uses_create will create new ones at the
-     end, and we'll go through them as well.  */
-  for (i = 0; i < DF_USES_TABLE_SIZE (); i++)
-    {
-      if (!propagations_left)
-	break;
-
-      df_ref use = DF_USES_GET (i);
-      if (use)
-	if (DF_REF_TYPE (use) != DF_REF_REG_USE
-	    && DF_REF_BB (use)->loop_father != NULL
-	    /* The outer most loop is not really a loop.  */
-	    && loop_outer (DF_REF_BB (use)->loop_father) != NULL)
-	  forward_propagate_into (use);
-    }
-
-  fwprop_done ();
-  return 0;
-}
-
 namespace {
 
 const pass_data pass_data_rtl_fwprop_addr =
@@ -1600,7 +1636,7 @@ public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop_addr (); }
+  virtual unsigned int execute (function *) { return fwprop (true); }
 
 }; // class pass_rtl_fwprop_addr
 
diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90
new file mode 100644
index 00000000000..224e6ce5f3d
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr88833.f90
@@ -0,0 +1,9 @@
+! { dg-do assemble { target aarch64_asm_sve_ok } }
+! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" }
+
+subroutine foo(x)
+  real :: x(100)
+  x = x + 10
+end subroutine foo
+
+! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }
Richard Sandiford June 24, 2019, 2:21 p.m. | #3
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use)

>    if (!def_set)

>      return false;

>  

> +  if (reg_prop_only

> +      && !REG_P (SET_SRC (def_set))

> +      && !REG_P (SET_DEST (def_set)))

> +    return false;


This should be:

  if (reg_prop_only
      && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set))))
    return false;

so that we return false if either operand isn't a register.

> +

> +  /* Allow propagations into a loop only for reg-to-reg copies, since

> +     replacing one register by another shouldn't increase the cost.  */

> +

> +  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father

> +      && !REG_P (SET_SRC (def_set))

> +      && !REG_P (SET_DEST (def_set)))

> +    return false;


Same here.

OK with that change, thanks.

Richard
Prathamesh Kulkarni June 24, 2019, 4:11 p.m. | #4
On Mon, 24 Jun 2019 at 19:51, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use)

> >    if (!def_set)

> >      return false;

> >

> > +  if (reg_prop_only

> > +      && !REG_P (SET_SRC (def_set))

> > +      && !REG_P (SET_DEST (def_set)))

> > +    return false;

>

> This should be:

>

>   if (reg_prop_only

>       && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set))))

>     return false;

>

> so that we return false if either operand isn't a register.

Oops, sorry about that  -:(
>

> > +

> > +  /* Allow propagations into a loop only for reg-to-reg copies, since

> > +     replacing one register by another shouldn't increase the cost.  */

> > +

> > +  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father

> > +      && !REG_P (SET_SRC (def_set))

> > +      && !REG_P (SET_DEST (def_set)))

> > +    return false;

>

> Same here.

>

> OK with that change, thanks.

Thanks for the review, will make the changes and commit the patch
after re-testing.

Thanks,
Prathamesh
>

> Richard
Richard Sandiford June 25, 2019, 2:35 p.m. | #5
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni

> <prathamesh.kulkarni@linaro.org> wrote:

>>

>> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford

>> <richard.sandiford@arm.com> wrote:

>> >

>> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

>> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use)

>> > >    if (!def_set)

>> > >      return false;

>> > >

>> > > +  if (reg_prop_only

>> > > +      && !REG_P (SET_SRC (def_set))

>> > > +      && !REG_P (SET_DEST (def_set)))

>> > > +    return false;

>> >

>> > This should be:

>> >

>> >   if (reg_prop_only

>> >       && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set))))

>> >     return false;

>> >

>> > so that we return false if either operand isn't a register.

>> Oops, sorry about that  -:(

>> >

>> > > +

>> > > +  /* Allow propagations into a loop only for reg-to-reg copies, since

>> > > +     replacing one register by another shouldn't increase the cost.  */

>> > > +

>> > > +  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father

>> > > +      && !REG_P (SET_SRC (def_set))

>> > > +      && !REG_P (SET_DEST (def_set)))

>> > > +    return false;

>> >

>> > Same here.

>> >

>> > OK with that change, thanks.

>> Thanks for the review, will make the changes and commit the patch

>> after re-testing.

> Hi,

> Testing the patch showed following failures on 32-bit x86:

>

>   Executed from: g++.target/i386/i386.exp

>     g++:g++.target/i386/pr88152.C   scan-assembler-not vpcmpgt|vpcmpeq|vpsra

>   Executed from: gcc.target/i386/i386.exp

>     gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs:

>     gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t

> ]*\\%eax,[\\t ]*%eax 1

>

> The failure of pr88152.C is also seen on x86_64.

>

> For pr66768.c, and pr90178.c, forwprop replaces register which is

> volatile and frame related respectively.

> To avoid that, the attached patch, makes a stronger constraint that

> src and dest should be a register

> and not have frame_related or volatil flags set, which is checked in

> usable_reg_p().

> Which avoids the failures for both the cases.

> Does it look OK ?


That's not the reason it's a bad transform.  In both cases we're
propagating r2 <- r1 even though

(a) r1 dies in the copy and
(b) fwprop can't replace all uses of r2, because some have multiple
    definitions

This has the effect of making both values live in cases where only one
was previously.

In the case of pr66768.c, fwprop2 is undoing the effect of
cse.c:canon_reg, which tries to pick the best register to use
(see cse.c:make_regs_eqv).  fwprop1 makes the same change,
and made it even before the patch, but the cse.c choice should win.

A (hopefully) conservative fix would be to propagate the copy only if
both registers have a single definition, which you can test with:

  (DF_REG_DEF_COUNT (regno) == 1
   && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno))

In that case, fwprop should see all uses of the destination, and should
be able to replace it in all cases with the source.

> For g++.target/i386/pr88152.C, the issue is that after the patch,

> forwprop1 does following propagation (in f10) which wasn't done

> before:

>

> In insn 10, replacing

>  (unspec:SI [

>             (reg:V2DF 91)

>         ] UNSPEC_MOVMSK)

>  with (unspec:SI [

>             (subreg:V2DF (reg:V2DI 90) 0)

>         ] UNSPEC_MOVMSK)

>

> This later defeats combine because insn 9 gets deleted.

> Without patch, the following combination takes place:

>

> Trying 7 -> 9:

>     7: r90:V2DI=r89:V2DI>r93:V2DI

>       REG_DEAD r93:V2DI

>       REG_DEAD r89:V2DI

>     9: r91:V2DF=r90:V2DI#0

>       REG_DEAD r90:V2DI

> Successfully matched this instruction:

> (set (subreg:V2DI (reg:V2DF 91) 0)

>     (gt:V2DI (reg:V2DI 89)

>         (reg:V2DI 93)))

> allowing combination of insns 7 and 9

>

> and then:

> Trying 6, 9 -> 10:

>     6: r89:V2DI=const_vector

>     9: r91:V2DF#0=r89:V2DI>r93:V2DI

>       REG_DEAD r89:V2DI

>       REG_DEAD r93:V2DI

>    10: r87:SI=unspec[r91:V2DF] 43

>       REG_DEAD r91:V2DF

> Successfully matched this instruction:

> (set (reg:SI 87)

>     (unspec:SI [

>             (lt:V2DF (reg:V2DI 93)

>                 (const_vector:V2DI [

>                         (const_int 0 [0]) repeated x2

>                     ]))

>         ] UNSPEC_MOVMSK))


Eh?  lt:*V2DF*?  Does that mean that it's 0 for false and an all-1 NaN
for true?

Looks like a bug that we manage to fold to that, and manage to match it.

Thanks,
Richard

> allowing combination of insns 6, 9 and 10

> original costs 4 + 8 + 4 = 16

> replacement cost 12

> deferring deletion of insn with uid = 9.

> deferring deletion of insn with uid = 6.

> which deletes insns 2, 3, 6, 7, 9.

>

> With patch, it fails to combine 7->10:

> Trying 7 -> 10:

>     7: r90:V2DI=r89:V2DI>r93:V2DI

>       REG_DEAD r93:V2DI

>       REG_DEAD r89:V2DI

>    10: r87:SI=unspec[r90:V2DI#0] 43

>       REG_DEAD r90:V2DI

> Failed to match this instruction:

> (set (reg:SI 87)

>     (unspec:SI [

>             (subreg:V2DF (gt:V2DI (reg:V2DI 89)

>                     (reg:V2DI 93)) 0)

>         ] UNSPEC_MOVMSK))

>

> and subsequently 6, 7 -> 10

> (attached combine dumps before and after patch).

>

> So IIUC, the issue is that the target does not have a pattern that can

> match the above insn ?

> I tried a simple workaround to "pessimize" the else condition in

> propagate_rtx_1 in patch, to require old_rtx and new_rtx have same

> rtx_code, which at least

> works for this test-case, but not sure if that's the correct approach.

> Could you suggest how to proceed ?

>

> Thanks,

> Prathamesh

>>

>> Thanks,

>> Prathamesh

>> >

>> > Richard
Prathamesh Kulkarni June 26, 2019, 10:10 a.m. | #6
On Tue, 25 Jun 2019 at 20:05, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> > On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni

> > <prathamesh.kulkarni@linaro.org> wrote:

> >>

> >> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford

> >> <richard.sandiford@arm.com> wrote:

> >> >

> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> >> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use)

> >> > >    if (!def_set)

> >> > >      return false;

> >> > >

> >> > > +  if (reg_prop_only

> >> > > +      && !REG_P (SET_SRC (def_set))

> >> > > +      && !REG_P (SET_DEST (def_set)))

> >> > > +    return false;

> >> >

> >> > This should be:

> >> >

> >> >   if (reg_prop_only

> >> >       && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set))))

> >> >     return false;

> >> >

> >> > so that we return false if either operand isn't a register.

> >> Oops, sorry about that  -:(

> >> >

> >> > > +

> >> > > +  /* Allow propagations into a loop only for reg-to-reg copies, since

> >> > > +     replacing one register by another shouldn't increase the cost.  */

> >> > > +

> >> > > +  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father

> >> > > +      && !REG_P (SET_SRC (def_set))

> >> > > +      && !REG_P (SET_DEST (def_set)))

> >> > > +    return false;

> >> >

> >> > Same here.

> >> >

> >> > OK with that change, thanks.

> >> Thanks for the review, will make the changes and commit the patch

> >> after re-testing.

> > Hi,

> > Testing the patch showed following failures on 32-bit x86:

> >

> >   Executed from: g++.target/i386/i386.exp

> >     g++:g++.target/i386/pr88152.C   scan-assembler-not vpcmpgt|vpcmpeq|vpsra

> >   Executed from: gcc.target/i386/i386.exp

> >     gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs:

> >     gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t

> > ]*\\%eax,[\\t ]*%eax 1

> >

> > The failure of pr88152.C is also seen on x86_64.

> >

> > For pr66768.c, and pr90178.c, forwprop replaces register which is

> > volatile and frame related respectively.

> > To avoid that, the attached patch, makes a stronger constraint that

> > src and dest should be a register

> > and not have frame_related or volatil flags set, which is checked in

> > usable_reg_p().

> > Which avoids the failures for both the cases.

> > Does it look OK ?

>

> That's not the reason it's a bad transform.  In both cases we're

> propagating r2 <- r1 even though

>

> (a) r1 dies in the copy and

> (b) fwprop can't replace all uses of r2, because some have multiple

>     definitions

>

> This has the effect of making both values live in cases where only one

> was previously.

>

> In the case of pr66768.c, fwprop2 is undoing the effect of

> cse.c:canon_reg, which tries to pick the best register to use

> (see cse.c:make_regs_eqv).  fwprop1 makes the same change,

> and made it even before the patch, but the cse.c choice should win.

>

> A (hopefully) conservative fix would be to propagate the copy only if

> both registers have a single definition, which you can test with:

>

>   (DF_REG_DEF_COUNT (regno) == 1

>    && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno))

>

> In that case, fwprop should see all uses of the destination, and should

> be able to replace it in all cases with the source.

Ah I see, thanks for the explanation!
The above check works to resolve the failure.
IIUC, !bitmap_bit_p (...) above checks that reg isn't used uninitialized ?
>

> > For g++.target/i386/pr88152.C, the issue is that after the patch,

> > forwprop1 does following propagation (in f10) which wasn't done

> > before:

> >

> > In insn 10, replacing

> >  (unspec:SI [

> >             (reg:V2DF 91)

> >         ] UNSPEC_MOVMSK)

> >  with (unspec:SI [

> >             (subreg:V2DF (reg:V2DI 90) 0)

> >         ] UNSPEC_MOVMSK)

> >

> > This later defeats combine because insn 9 gets deleted.

> > Without patch, the following combination takes place:

> >

> > Trying 7 -> 9:

> >     7: r90:V2DI=r89:V2DI>r93:V2DI

> >       REG_DEAD r93:V2DI

> >       REG_DEAD r89:V2DI

> >     9: r91:V2DF=r90:V2DI#0

> >       REG_DEAD r90:V2DI

> > Successfully matched this instruction:

> > (set (subreg:V2DI (reg:V2DF 91) 0)

> >     (gt:V2DI (reg:V2DI 89)

> >         (reg:V2DI 93)))

> > allowing combination of insns 7 and 9

> >

> > and then:

> > Trying 6, 9 -> 10:

> >     6: r89:V2DI=const_vector

> >     9: r91:V2DF#0=r89:V2DI>r93:V2DI

> >       REG_DEAD r89:V2DI

> >       REG_DEAD r93:V2DI

> >    10: r87:SI=unspec[r91:V2DF] 43

> >       REG_DEAD r91:V2DF

> > Successfully matched this instruction:

> > (set (reg:SI 87)

> >     (unspec:SI [

> >             (lt:V2DF (reg:V2DI 93)

> >                 (const_vector:V2DI [

> >                         (const_int 0 [0]) repeated x2

> >                     ]))

> >         ] UNSPEC_MOVMSK))

>

> Eh?  lt:*V2DF*?  Does that mean that it's 0 for false and an all-1 NaN

> for true?

Well looking at .optimized dump:

  vector(2) long int _2;
  vector(2) double _3;
  int _6;
  signed long _7;
  long int _8;
  signed long _9;
  long int _10;

  <bb 2> [local count: 1073741824]:
  _7 = BIT_FIELD_REF <a_4(D), 64, 0>;
  _8 = _7 < 0 ? -1 : 0;
  _9 = BIT_FIELD_REF <a_4(D), 64, 64>;
  _10 = _9 < 0 ? -1 : 0;
  _2 = {_8, _10};
  _3 = VIEW_CONVERT_EXPR<__m128d>(_2);
  _6 = __builtin_ia32_movmskpd (_3); [tail call]
  return _6;

So AFAIU, we're using -1 for representing true and 0 for false
and casting -1 (literally) to double would change it's value to -NaN ?

The above insn 10 created by combine is then transformed to following insn 22:
(insn 22 9 16 2 (set (reg:SI 0 ax [87])
        (unspec:SI [
                (reg:V2DF 20 xmm0 [93])
            ] UNSPEC_MOVMSK))
"../../stage1-build/gcc/include/emmintrin.h":958:34 4222
{sse2_movmskpd}
     (nil))

deleting insn 10.

Thanks,
Prathamesh

>

> Looks like a bug that we manage to fold to that, and manage to match it.

>

> Thanks,

> Richard

>

> > allowing combination of insns 6, 9 and 10

> > original costs 4 + 8 + 4 = 16

> > replacement cost 12

> > deferring deletion of insn with uid = 9.

> > deferring deletion of insn with uid = 6.

> > which deletes insns 2, 3, 6, 7, 9.

> >

> > With patch, it fails to combine 7->10:

> > Trying 7 -> 10:

> >     7: r90:V2DI=r89:V2DI>r93:V2DI

> >       REG_DEAD r93:V2DI

> >       REG_DEAD r89:V2DI

> >    10: r87:SI=unspec[r90:V2DI#0] 43

> >       REG_DEAD r90:V2DI

> > Failed to match this instruction:

> > (set (reg:SI 87)

> >     (unspec:SI [

> >             (subreg:V2DF (gt:V2DI (reg:V2DI 89)

> >                     (reg:V2DI 93)) 0)

> >         ] UNSPEC_MOVMSK))

> >

> > and subsequently 6, 7 -> 10

> > (attached combine dumps before and after patch).

> >

> > So IIUC, the issue is that the target does not have a pattern that can

> > match the above insn ?

> > I tried a simple workaround to "pessimize" the else condition in

> > propagate_rtx_1 in patch, to require old_rtx and new_rtx have same

> > rtx_code, which at least

> > works for this test-case, but not sure if that's the correct approach.

> > Could you suggest how to proceed ?

> >

> > Thanks,

> > Prathamesh

> >>

> >> Thanks,

> >> Prathamesh

> >> >

> >> > Richard
Richard Sandiford June 26, 2019, 10:35 a.m. | #7
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Tue, 25 Jun 2019 at 20:05, Richard Sandiford

> <richard.sandiford@arm.com> wrote:

>>

>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

>> > On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni

>> > <prathamesh.kulkarni@linaro.org> wrote:

>> >>

>> >> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford

>> >> <richard.sandiford@arm.com> wrote:

>> >> >

>> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

>> >> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use)

>> >> > >    if (!def_set)

>> >> > >      return false;

>> >> > >

>> >> > > +  if (reg_prop_only

>> >> > > +      && !REG_P (SET_SRC (def_set))

>> >> > > +      && !REG_P (SET_DEST (def_set)))

>> >> > > +    return false;

>> >> >

>> >> > This should be:

>> >> >

>> >> >   if (reg_prop_only

>> >> >       && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set))))

>> >> >     return false;

>> >> >

>> >> > so that we return false if either operand isn't a register.

>> >> Oops, sorry about that  -:(

>> >> >

>> >> > > +

>> >> > > +  /* Allow propagations into a loop only for reg-to-reg copies, since

>> >> > > +     replacing one register by another shouldn't increase the cost.  */

>> >> > > +

>> >> > > +  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father

>> >> > > +      && !REG_P (SET_SRC (def_set))

>> >> > > +      && !REG_P (SET_DEST (def_set)))

>> >> > > +    return false;

>> >> >

>> >> > Same here.

>> >> >

>> >> > OK with that change, thanks.

>> >> Thanks for the review, will make the changes and commit the patch

>> >> after re-testing.

>> > Hi,

>> > Testing the patch showed following failures on 32-bit x86:

>> >

>> >   Executed from: g++.target/i386/i386.exp

>> >     g++:g++.target/i386/pr88152.C   scan-assembler-not vpcmpgt|vpcmpeq|vpsra

>> >   Executed from: gcc.target/i386/i386.exp

>> >     gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs:

>> >     gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t

>> > ]*\\%eax,[\\t ]*%eax 1

>> >

>> > The failure of pr88152.C is also seen on x86_64.

>> >

>> > For pr66768.c, and pr90178.c, forwprop replaces register which is

>> > volatile and frame related respectively.

>> > To avoid that, the attached patch, makes a stronger constraint that

>> > src and dest should be a register

>> > and not have frame_related or volatil flags set, which is checked in

>> > usable_reg_p().

>> > Which avoids the failures for both the cases.

>> > Does it look OK ?

>>

>> That's not the reason it's a bad transform.  In both cases we're

>> propagating r2 <- r1 even though

>>

>> (a) r1 dies in the copy and

>> (b) fwprop can't replace all uses of r2, because some have multiple

>>     definitions

>>

>> This has the effect of making both values live in cases where only one

>> was previously.

>>

>> In the case of pr66768.c, fwprop2 is undoing the effect of

>> cse.c:canon_reg, which tries to pick the best register to use

>> (see cse.c:make_regs_eqv).  fwprop1 makes the same change,

>> and made it even before the patch, but the cse.c choice should win.

>>

>> A (hopefully) conservative fix would be to propagate the copy only if

>> both registers have a single definition, which you can test with:

>>

>>   (DF_REG_DEF_COUNT (regno) == 1

>>    && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno))

>>

>> In that case, fwprop should see all uses of the destination, and should

>> be able to replace it in all cases with the source.

> Ah I see, thanks for the explanation!

> The above check works to resolve the failure.

> IIUC, !bitmap_bit_p (...) above checks that reg isn't used uninitialized ?


Right.

>> > For g++.target/i386/pr88152.C, the issue is that after the patch,

>> > forwprop1 does following propagation (in f10) which wasn't done

>> > before:

>> >

>> > In insn 10, replacing

>> >  (unspec:SI [

>> >             (reg:V2DF 91)

>> >         ] UNSPEC_MOVMSK)

>> >  with (unspec:SI [

>> >             (subreg:V2DF (reg:V2DI 90) 0)

>> >         ] UNSPEC_MOVMSK)

>> >

>> > This later defeats combine because insn 9 gets deleted.

>> > Without patch, the following combination takes place:

>> >

>> > Trying 7 -> 9:

>> >     7: r90:V2DI=r89:V2DI>r93:V2DI

>> >       REG_DEAD r93:V2DI

>> >       REG_DEAD r89:V2DI

>> >     9: r91:V2DF=r90:V2DI#0

>> >       REG_DEAD r90:V2DI

>> > Successfully matched this instruction:

>> > (set (subreg:V2DI (reg:V2DF 91) 0)

>> >     (gt:V2DI (reg:V2DI 89)

>> >         (reg:V2DI 93)))

>> > allowing combination of insns 7 and 9

>> >

>> > and then:

>> > Trying 6, 9 -> 10:

>> >     6: r89:V2DI=const_vector

>> >     9: r91:V2DF#0=r89:V2DI>r93:V2DI

>> >       REG_DEAD r89:V2DI

>> >       REG_DEAD r93:V2DI

>> >    10: r87:SI=unspec[r91:V2DF] 43

>> >       REG_DEAD r91:V2DF

>> > Successfully matched this instruction:

>> > (set (reg:SI 87)

>> >     (unspec:SI [

>> >             (lt:V2DF (reg:V2DI 93)

>> >                 (const_vector:V2DI [

>> >                         (const_int 0 [0]) repeated x2

>> >                     ]))

>> >         ] UNSPEC_MOVMSK))

>>

>> Eh?  lt:*V2DF*?  Does that mean that it's 0 for false and an all-1 NaN

>> for true?

> Well looking at .optimized dump:

>

>   vector(2) long int _2;

>   vector(2) double _3;

>   int _6;

>   signed long _7;

>   long int _8;

>   signed long _9;

>   long int _10;

>

>   <bb 2> [local count: 1073741824]:

>   _7 = BIT_FIELD_REF <a_4(D), 64, 0>;

>   _8 = _7 < 0 ? -1 : 0;

>   _9 = BIT_FIELD_REF <a_4(D), 64, 64>;

>   _10 = _9 < 0 ? -1 : 0;

>   _2 = {_8, _10};

>   _3 = VIEW_CONVERT_EXPR<__m128d>(_2);

>   _6 = __builtin_ia32_movmskpd (_3); [tail call]

>   return _6;

>

> So AFAIU, we're using -1 for representing true and 0 for false

> and casting -1 (literally) to double would change it's value to -NaN ?


Exactly.  And for -ffinite-math-only, we might as well then fold the
condition to false. :-)

IMO rtl condition results have to have integral modes and not folding
(subreg:V2DF (lt:V2DI ...) 0) is the correct behaviour.  So I think this
is just a latent bug and shouldn't hold up the patch.

I'm not sure whether:

   reinterpret_cast<__m128d> (a > ...)

is something we expect users to write, or whether it was just
included for completeness.

Thanks,
Richard

> The above insn 10 created by combine is then transformed to following insn 22:

> (insn 22 9 16 2 (set (reg:SI 0 ax [87])

>         (unspec:SI [

>                 (reg:V2DF 20 xmm0 [93])

>             ] UNSPEC_MOVMSK))

> "../../stage1-build/gcc/include/emmintrin.h":958:34 4222

> {sse2_movmskpd}

>      (nil))

>

> deleting insn 10.

>

> Thanks,

> Prathamesh

>

>>

>> Looks like a bug that we manage to fold to that, and manage to match it.

>>

>> Thanks,

>> Richard

>>

>> > allowing combination of insns 6, 9 and 10

>> > original costs 4 + 8 + 4 = 16

>> > replacement cost 12

>> > deferring deletion of insn with uid = 9.

>> > deferring deletion of insn with uid = 6.

>> > which deletes insns 2, 3, 6, 7, 9.

>> >

>> > With patch, it fails to combine 7->10:

>> > Trying 7 -> 10:

>> >     7: r90:V2DI=r89:V2DI>r93:V2DI

>> >       REG_DEAD r93:V2DI

>> >       REG_DEAD r89:V2DI

>> >    10: r87:SI=unspec[r90:V2DI#0] 43

>> >       REG_DEAD r90:V2DI

>> > Failed to match this instruction:

>> > (set (reg:SI 87)

>> >     (unspec:SI [

>> >             (subreg:V2DF (gt:V2DI (reg:V2DI 89)

>> >                     (reg:V2DI 93)) 0)

>> >         ] UNSPEC_MOVMSK))

>> >

>> > and subsequently 6, 7 -> 10

>> > (attached combine dumps before and after patch).

>> >

>> > So IIUC, the issue is that the target does not have a pattern that can

>> > match the above insn ?

>> > I tried a simple workaround to "pessimize" the else condition in

>> > propagate_rtx_1 in patch, to require old_rtx and new_rtx have same

>> > rtx_code, which at least

>> > works for this test-case, but not sure if that's the correct approach.

>> > Could you suggest how to proceed ?

>> >

>> > Thanks,

>> > Prathamesh

>> >>

>> >> Thanks,

>> >> Prathamesh

>> >> >

>> >> > Richard
Prathamesh Kulkarni June 26, 2019, 1:54 p.m. | #8
On Wed, 26 Jun 2019 at 16:05, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> > On Tue, 25 Jun 2019 at 20:05, Richard Sandiford

> > <richard.sandiford@arm.com> wrote:

> >>

> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> >> > On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni

> >> > <prathamesh.kulkarni@linaro.org> wrote:

> >> >>

> >> >> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford

> >> >> <richard.sandiford@arm.com> wrote:

> >> >> >

> >> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> >> >> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use)

> >> >> > >    if (!def_set)

> >> >> > >      return false;

> >> >> > >

> >> >> > > +  if (reg_prop_only

> >> >> > > +      && !REG_P (SET_SRC (def_set))

> >> >> > > +      && !REG_P (SET_DEST (def_set)))

> >> >> > > +    return false;

> >> >> >

> >> >> > This should be:

> >> >> >

> >> >> >   if (reg_prop_only

> >> >> >       && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set))))

> >> >> >     return false;

> >> >> >

> >> >> > so that we return false if either operand isn't a register.

> >> >> Oops, sorry about that  -:(

> >> >> >

> >> >> > > +

> >> >> > > +  /* Allow propagations into a loop only for reg-to-reg copies, since

> >> >> > > +     replacing one register by another shouldn't increase the cost.  */

> >> >> > > +

> >> >> > > +  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father

> >> >> > > +      && !REG_P (SET_SRC (def_set))

> >> >> > > +      && !REG_P (SET_DEST (def_set)))

> >> >> > > +    return false;

> >> >> >

> >> >> > Same here.

> >> >> >

> >> >> > OK with that change, thanks.

> >> >> Thanks for the review, will make the changes and commit the patch

> >> >> after re-testing.

> >> > Hi,

> >> > Testing the patch showed following failures on 32-bit x86:

> >> >

> >> >   Executed from: g++.target/i386/i386.exp

> >> >     g++:g++.target/i386/pr88152.C   scan-assembler-not vpcmpgt|vpcmpeq|vpsra

> >> >   Executed from: gcc.target/i386/i386.exp

> >> >     gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs:

> >> >     gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t

> >> > ]*\\%eax,[\\t ]*%eax 1

> >> >

> >> > The failure of pr88152.C is also seen on x86_64.

> >> >

> >> > For pr66768.c, and pr90178.c, forwprop replaces register which is

> >> > volatile and frame related respectively.

> >> > To avoid that, the attached patch, makes a stronger constraint that

> >> > src and dest should be a register

> >> > and not have frame_related or volatil flags set, which is checked in

> >> > usable_reg_p().

> >> > Which avoids the failures for both the cases.

> >> > Does it look OK ?

> >>

> >> That's not the reason it's a bad transform.  In both cases we're

> >> propagating r2 <- r1 even though

> >>

> >> (a) r1 dies in the copy and

> >> (b) fwprop can't replace all uses of r2, because some have multiple

> >>     definitions

> >>

> >> This has the effect of making both values live in cases where only one

> >> was previously.

> >>

> >> In the case of pr66768.c, fwprop2 is undoing the effect of

> >> cse.c:canon_reg, which tries to pick the best register to use

> >> (see cse.c:make_regs_eqv).  fwprop1 makes the same change,

> >> and made it even before the patch, but the cse.c choice should win.

> >>

> >> A (hopefully) conservative fix would be to propagate the copy only if

> >> both registers have a single definition, which you can test with:

> >>

> >>   (DF_REG_DEF_COUNT (regno) == 1

> >>    && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno))

> >>

> >> In that case, fwprop should see all uses of the destination, and should

> >> be able to replace it in all cases with the source.

> > Ah I see, thanks for the explanation!

> > The above check works to resolve the failure.

> > IIUC, !bitmap_bit_p (...) above checks that reg isn't used uninitialized ?

>

> Right.

>

> >> > For g++.target/i386/pr88152.C, the issue is that after the patch,

> >> > forwprop1 does following propagation (in f10) which wasn't done

> >> > before:

> >> >

> >> > In insn 10, replacing

> >> >  (unspec:SI [

> >> >             (reg:V2DF 91)

> >> >         ] UNSPEC_MOVMSK)

> >> >  with (unspec:SI [

> >> >             (subreg:V2DF (reg:V2DI 90) 0)

> >> >         ] UNSPEC_MOVMSK)

> >> >

> >> > This later defeats combine because insn 9 gets deleted.

> >> > Without patch, the following combination takes place:

> >> >

> >> > Trying 7 -> 9:

> >> >     7: r90:V2DI=r89:V2DI>r93:V2DI

> >> >       REG_DEAD r93:V2DI

> >> >       REG_DEAD r89:V2DI

> >> >     9: r91:V2DF=r90:V2DI#0

> >> >       REG_DEAD r90:V2DI

> >> > Successfully matched this instruction:

> >> > (set (subreg:V2DI (reg:V2DF 91) 0)

> >> >     (gt:V2DI (reg:V2DI 89)

> >> >         (reg:V2DI 93)))

> >> > allowing combination of insns 7 and 9

> >> >

> >> > and then:

> >> > Trying 6, 9 -> 10:

> >> >     6: r89:V2DI=const_vector

> >> >     9: r91:V2DF#0=r89:V2DI>r93:V2DI

> >> >       REG_DEAD r89:V2DI

> >> >       REG_DEAD r93:V2DI

> >> >    10: r87:SI=unspec[r91:V2DF] 43

> >> >       REG_DEAD r91:V2DF

> >> > Successfully matched this instruction:

> >> > (set (reg:SI 87)

> >> >     (unspec:SI [

> >> >             (lt:V2DF (reg:V2DI 93)

> >> >                 (const_vector:V2DI [

> >> >                         (const_int 0 [0]) repeated x2

> >> >                     ]))

> >> >         ] UNSPEC_MOVMSK))

> >>

> >> Eh?  lt:*V2DF*?  Does that mean that it's 0 for false and an all-1 NaN

> >> for true?

> > Well looking at .optimized dump:

> >

> >   vector(2) long int _2;

> >   vector(2) double _3;

> >   int _6;

> >   signed long _7;

> >   long int _8;

> >   signed long _9;

> >   long int _10;

> >

> >   <bb 2> [local count: 1073741824]:

> >   _7 = BIT_FIELD_REF <a_4(D), 64, 0>;

> >   _8 = _7 < 0 ? -1 : 0;

> >   _9 = BIT_FIELD_REF <a_4(D), 64, 64>;

> >   _10 = _9 < 0 ? -1 : 0;

> >   _2 = {_8, _10};

> >   _3 = VIEW_CONVERT_EXPR<__m128d>(_2);

> >   _6 = __builtin_ia32_movmskpd (_3); [tail call]

> >   return _6;

> >

> > So AFAIU, we're using -1 for representing true and 0 for false

> > and casting -1 (literally) to double would change it's value to -NaN ?

>

> Exactly.  And for -ffinite-math-only, we might as well then fold the

> condition to false. :-)

>

> IMO rtl condition results have to have integral modes and not folding

> (subreg:V2DF (lt:V2DI ...) 0) is the correct behaviour.  So I think this

> is just a latent bug and shouldn't hold up the patch.

>

> I'm not sure whether:

>

>    reinterpret_cast<__m128d> (a > ...)

>

> is something we expect users to write, or whether it was just

> included for completeness.

I will report the issue and commit after re-testing patch.
Thanks a lot for the helpful reviews!

Thanks,
Prathamesh
>

> Thanks,

> Richard

>

> > The above insn 10 created by combine is then transformed to following insn 22:

> > (insn 22 9 16 2 (set (reg:SI 0 ax [87])

> >         (unspec:SI [

> >                 (reg:V2DF 20 xmm0 [93])

> >             ] UNSPEC_MOVMSK))

> > "../../stage1-build/gcc/include/emmintrin.h":958:34 4222

> > {sse2_movmskpd}

> >      (nil))

> >

> > deleting insn 10.

> >

> > Thanks,

> > Prathamesh

> >

> >>

> >> Looks like a bug that we manage to fold to that, and manage to match it.

> >>

> >> Thanks,

> >> Richard

> >>

> >> > allowing combination of insns 6, 9 and 10

> >> > original costs 4 + 8 + 4 = 16

> >> > replacement cost 12

> >> > deferring deletion of insn with uid = 9.

> >> > deferring deletion of insn with uid = 6.

> >> > which deletes insns 2, 3, 6, 7, 9.

> >> >

> >> > With patch, it fails to combine 7->10:

> >> > Trying 7 -> 10:

> >> >     7: r90:V2DI=r89:V2DI>r93:V2DI

> >> >       REG_DEAD r93:V2DI

> >> >       REG_DEAD r89:V2DI

> >> >    10: r87:SI=unspec[r90:V2DI#0] 43

> >> >       REG_DEAD r90:V2DI

> >> > Failed to match this instruction:

> >> > (set (reg:SI 87)

> >> >     (unspec:SI [

> >> >             (subreg:V2DF (gt:V2DI (reg:V2DI 89)

> >> >                     (reg:V2DI 93)) 0)

> >> >         ] UNSPEC_MOVMSK))

> >> >

> >> > and subsequently 6, 7 -> 10

> >> > (attached combine dumps before and after patch).

> >> >

> >> > So IIUC, the issue is that the target does not have a pattern that can

> >> > match the above insn ?

> >> > I tried a simple workaround to "pessimize" the else condition in

> >> > propagate_rtx_1 in patch, to require old_rtx and new_rtx have same

> >> > rtx_code, which at least

> >> > works for this test-case, but not sure if that's the correct approach.

> >> > Could you suggest how to proceed ?

> >> >

> >> > Thanks,

> >> > Prathamesh

> >> >>

> >> >> Thanks,

> >> >> Prathamesh

> >> >> >

> >> >> > Richard
Richard Sandiford June 26, 2019, 6:15 p.m. | #9
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Wed, 26 Jun 2019 at 16:05, Richard Sandiford

> <richard.sandiford@arm.com> wrote:

>>

>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

>> > On Tue, 25 Jun 2019 at 20:05, Richard Sandiford

>> > <richard.sandiford@arm.com> wrote:

>> >>

>> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

>> >> > On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni

>> >> > <prathamesh.kulkarni@linaro.org> wrote:

>> >> >>

>> >> >> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford

>> >> >> <richard.sandiford@arm.com> wrote:

>> >> >> >

>> >> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

>> >> >> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use)

>> >> >> > >    if (!def_set)

>> >> >> > >      return false;

>> >> >> > >

>> >> >> > > +  if (reg_prop_only

>> >> >> > > +      && !REG_P (SET_SRC (def_set))

>> >> >> > > +      && !REG_P (SET_DEST (def_set)))

>> >> >> > > +    return false;

>> >> >> >

>> >> >> > This should be:

>> >> >> >

>> >> >> >   if (reg_prop_only

>> >> >> >       && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set))))

>> >> >> >     return false;

>> >> >> >

>> >> >> > so that we return false if either operand isn't a register.

>> >> >> Oops, sorry about that  -:(

>> >> >> >

>> >> >> > > +

>> >> >> > > +  /* Allow propagations into a loop only for reg-to-reg copies, since

>> >> >> > > +     replacing one register by another shouldn't increase the cost.  */

>> >> >> > > +

>> >> >> > > +  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father

>> >> >> > > +      && !REG_P (SET_SRC (def_set))

>> >> >> > > +      && !REG_P (SET_DEST (def_set)))

>> >> >> > > +    return false;

>> >> >> >

>> >> >> > Same here.

>> >> >> >

>> >> >> > OK with that change, thanks.

>> >> >> Thanks for the review, will make the changes and commit the patch

>> >> >> after re-testing.

>> >> > Hi,

>> >> > Testing the patch showed following failures on 32-bit x86:

>> >> >

>> >> >   Executed from: g++.target/i386/i386.exp

>> >> >     g++:g++.target/i386/pr88152.C   scan-assembler-not vpcmpgt|vpcmpeq|vpsra

>> >> >   Executed from: gcc.target/i386/i386.exp

>> >> >     gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs:

>> >> >     gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t

>> >> > ]*\\%eax,[\\t ]*%eax 1

>> >> >

>> >> > The failure of pr88152.C is also seen on x86_64.

>> >> >

>> >> > For pr66768.c, and pr90178.c, forwprop replaces register which is

>> >> > volatile and frame related respectively.

>> >> > To avoid that, the attached patch, makes a stronger constraint that

>> >> > src and dest should be a register

>> >> > and not have frame_related or volatil flags set, which is checked in

>> >> > usable_reg_p().

>> >> > Which avoids the failures for both the cases.

>> >> > Does it look OK ?

>> >>

>> >> That's not the reason it's a bad transform.  In both cases we're

>> >> propagating r2 <- r1 even though

>> >>

>> >> (a) r1 dies in the copy and

>> >> (b) fwprop can't replace all uses of r2, because some have multiple

>> >>     definitions

>> >>

>> >> This has the effect of making both values live in cases where only one

>> >> was previously.

>> >>

>> >> In the case of pr66768.c, fwprop2 is undoing the effect of

>> >> cse.c:canon_reg, which tries to pick the best register to use

>> >> (see cse.c:make_regs_eqv).  fwprop1 makes the same change,

>> >> and made it even before the patch, but the cse.c choice should win.

>> >>

>> >> A (hopefully) conservative fix would be to propagate the copy only if

>> >> both registers have a single definition, which you can test with:

>> >>

>> >>   (DF_REG_DEF_COUNT (regno) == 1

>> >>    && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno))

>> >>

>> >> In that case, fwprop should see all uses of the destination, and should

>> >> be able to replace it in all cases with the source.

>> > Ah I see, thanks for the explanation!

>> > The above check works to resolve the failure.

>> > IIUC, !bitmap_bit_p (...) above checks that reg isn't used uninitialized ?

>>

>> Right.

>>

>> >> > For g++.target/i386/pr88152.C, the issue is that after the patch,

>> >> > forwprop1 does following propagation (in f10) which wasn't done

>> >> > before:

>> >> >

>> >> > In insn 10, replacing

>> >> >  (unspec:SI [

>> >> >             (reg:V2DF 91)

>> >> >         ] UNSPEC_MOVMSK)

>> >> >  with (unspec:SI [

>> >> >             (subreg:V2DF (reg:V2DI 90) 0)

>> >> >         ] UNSPEC_MOVMSK)

>> >> >

>> >> > This later defeats combine because insn 9 gets deleted.

>> >> > Without patch, the following combination takes place:

>> >> >

>> >> > Trying 7 -> 9:

>> >> >     7: r90:V2DI=r89:V2DI>r93:V2DI

>> >> >       REG_DEAD r93:V2DI

>> >> >       REG_DEAD r89:V2DI

>> >> >     9: r91:V2DF=r90:V2DI#0

>> >> >       REG_DEAD r90:V2DI

>> >> > Successfully matched this instruction:

>> >> > (set (subreg:V2DI (reg:V2DF 91) 0)

>> >> >     (gt:V2DI (reg:V2DI 89)

>> >> >         (reg:V2DI 93)))

>> >> > allowing combination of insns 7 and 9

>> >> >

>> >> > and then:

>> >> > Trying 6, 9 -> 10:

>> >> >     6: r89:V2DI=const_vector

>> >> >     9: r91:V2DF#0=r89:V2DI>r93:V2DI

>> >> >       REG_DEAD r89:V2DI

>> >> >       REG_DEAD r93:V2DI

>> >> >    10: r87:SI=unspec[r91:V2DF] 43

>> >> >       REG_DEAD r91:V2DF

>> >> > Successfully matched this instruction:

>> >> > (set (reg:SI 87)

>> >> >     (unspec:SI [

>> >> >             (lt:V2DF (reg:V2DI 93)

>> >> >                 (const_vector:V2DI [

>> >> >                         (const_int 0 [0]) repeated x2

>> >> >                     ]))

>> >> >         ] UNSPEC_MOVMSK))

>> >>

>> >> Eh?  lt:*V2DF*?  Does that mean that it's 0 for false and an all-1 NaN

>> >> for true?

>> > Well looking at .optimized dump:

>> >

>> >   vector(2) long int _2;

>> >   vector(2) double _3;

>> >   int _6;

>> >   signed long _7;

>> >   long int _8;

>> >   signed long _9;

>> >   long int _10;

>> >

>> >   <bb 2> [local count: 1073741824]:

>> >   _7 = BIT_FIELD_REF <a_4(D), 64, 0>;

>> >   _8 = _7 < 0 ? -1 : 0;

>> >   _9 = BIT_FIELD_REF <a_4(D), 64, 64>;

>> >   _10 = _9 < 0 ? -1 : 0;

>> >   _2 = {_8, _10};

>> >   _3 = VIEW_CONVERT_EXPR<__m128d>(_2);

>> >   _6 = __builtin_ia32_movmskpd (_3); [tail call]

>> >   return _6;

>> >

>> > So AFAIU, we're using -1 for representing true and 0 for false

>> > and casting -1 (literally) to double would change it's value to -NaN ?

>>

>> Exactly.  And for -ffinite-math-only, we might as well then fold the

>> condition to false. :-)

>>

>> IMO rtl condition results have to have integral modes and not folding

>> (subreg:V2DF (lt:V2DI ...) 0) is the correct behaviour.  So I think this

>> is just a latent bug and shouldn't hold up the patch.

>>

>> I'm not sure whether:

>>

>>    reinterpret_cast<__m128d> (a > ...)

>>

>> is something we expect users to write, or whether it was just

>> included for completeness.

> I will report the issue and commit after re-testing patch.

> Thanks a lot for the helpful reviews!


Since it seems FP comparison results are OK, I guess it needs to be
fixed after all.  I think the problem is that the simplification
is only done by gen_lowpart_for_combine:

  /* If X is a comparison operator, rewrite it in a new mode.  This
     probably won't match, but may allow further simplifications.  */
  else if (COMPARISON_P (x))
    return gen_rtx_fmt_ee (GET_CODE (x), omode, XEXP (x, 0), XEXP (x, 1));

and triggers via expand_field_assignment when the subreg is on the lhs
of the SET.  For it to work for a general subreg on the rhs, it probably
needs to be moved from here to simplify_subreg.

We'll need to be careful about the conditions under which it
happens though.  The above doesn't for example check whether
the new mode has the same number of elements as the original,
so could generate things like:

   (gt:V4SF (reg:V2DI X) (reg:V2DI Y))

for:

   (set (reg:V2DI res) (gt:V2DI (reg:V2DI X) (reg:V2DI Y)))
   (subreg:V4SF (reg:V2DI res) 0)

I think most code would expect the result of a comparison to have the
same number of elements as the inputs and could ICE if they don't,
so I don't think it's up to the target to decide whether mismatches
are OK.  (But there again, I thought that last time. :-))

We'd also need to check the element sizes, since e.g. a lowpart
(subreg:V2HI ...) of a V2SI comparison isn't the same as a V2HI
comparison.

Thanks,
Richard
Prathamesh Kulkarni July 2, 2019, 11:20 a.m. | #10
On Wed, 26 Jun 2019 at 23:45, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> > On Wed, 26 Jun 2019 at 16:05, Richard Sandiford

> > <richard.sandiford@arm.com> wrote:

> >>

> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> >> > On Tue, 25 Jun 2019 at 20:05, Richard Sandiford

> >> > <richard.sandiford@arm.com> wrote:

> >> >>

> >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> >> >> > On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni

> >> >> > <prathamesh.kulkarni@linaro.org> wrote:

> >> >> >>

> >> >> >> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford

> >> >> >> <richard.sandiford@arm.com> wrote:

> >> >> >> >

> >> >> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> >> >> >> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use)

> >> >> >> > >    if (!def_set)

> >> >> >> > >      return false;

> >> >> >> > >

> >> >> >> > > +  if (reg_prop_only

> >> >> >> > > +      && !REG_P (SET_SRC (def_set))

> >> >> >> > > +      && !REG_P (SET_DEST (def_set)))

> >> >> >> > > +    return false;

> >> >> >> >

> >> >> >> > This should be:

> >> >> >> >

> >> >> >> >   if (reg_prop_only

> >> >> >> >       && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set))))

> >> >> >> >     return false;

> >> >> >> >

> >> >> >> > so that we return false if either operand isn't a register.

> >> >> >> Oops, sorry about that  -:(

> >> >> >> >

> >> >> >> > > +

> >> >> >> > > +  /* Allow propagations into a loop only for reg-to-reg copies, since

> >> >> >> > > +     replacing one register by another shouldn't increase the cost.  */

> >> >> >> > > +

> >> >> >> > > +  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father

> >> >> >> > > +      && !REG_P (SET_SRC (def_set))

> >> >> >> > > +      && !REG_P (SET_DEST (def_set)))

> >> >> >> > > +    return false;

> >> >> >> >

> >> >> >> > Same here.

> >> >> >> >

> >> >> >> > OK with that change, thanks.

> >> >> >> Thanks for the review, will make the changes and commit the patch

> >> >> >> after re-testing.

> >> >> > Hi,

> >> >> > Testing the patch showed following failures on 32-bit x86:

> >> >> >

> >> >> >   Executed from: g++.target/i386/i386.exp

> >> >> >     g++:g++.target/i386/pr88152.C   scan-assembler-not vpcmpgt|vpcmpeq|vpsra

> >> >> >   Executed from: gcc.target/i386/i386.exp

> >> >> >     gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs:

> >> >> >     gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t

> >> >> > ]*\\%eax,[\\t ]*%eax 1

> >> >> >

> >> >> > The failure of pr88152.C is also seen on x86_64.

> >> >> >

> >> >> > For pr66768.c, and pr90178.c, forwprop replaces register which is

> >> >> > volatile and frame related respectively.

> >> >> > To avoid that, the attached patch, makes a stronger constraint that

> >> >> > src and dest should be a register

> >> >> > and not have frame_related or volatil flags set, which is checked in

> >> >> > usable_reg_p().

> >> >> > Which avoids the failures for both the cases.

> >> >> > Does it look OK ?

> >> >>

> >> >> That's not the reason it's a bad transform.  In both cases we're

> >> >> propagating r2 <- r1 even though

> >> >>

> >> >> (a) r1 dies in the copy and

> >> >> (b) fwprop can't replace all uses of r2, because some have multiple

> >> >>     definitions

> >> >>

> >> >> This has the effect of making both values live in cases where only one

> >> >> was previously.

> >> >>

> >> >> In the case of pr66768.c, fwprop2 is undoing the effect of

> >> >> cse.c:canon_reg, which tries to pick the best register to use

> >> >> (see cse.c:make_regs_eqv).  fwprop1 makes the same change,

> >> >> and made it even before the patch, but the cse.c choice should win.

> >> >>

> >> >> A (hopefully) conservative fix would be to propagate the copy only if

> >> >> both registers have a single definition, which you can test with:

> >> >>

> >> >>   (DF_REG_DEF_COUNT (regno) == 1

> >> >>    && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno))

> >> >>

> >> >> In that case, fwprop should see all uses of the destination, and should

> >> >> be able to replace it in all cases with the source.

> >> > Ah I see, thanks for the explanation!

> >> > The above check works to resolve the failure.

> >> > IIUC, !bitmap_bit_p (...) above checks that reg isn't used uninitialized ?

> >>

> >> Right.

> >>

> >> >> > For g++.target/i386/pr88152.C, the issue is that after the patch,

> >> >> > forwprop1 does following propagation (in f10) which wasn't done

> >> >> > before:

> >> >> >

> >> >> > In insn 10, replacing

> >> >> >  (unspec:SI [

> >> >> >             (reg:V2DF 91)

> >> >> >         ] UNSPEC_MOVMSK)

> >> >> >  with (unspec:SI [

> >> >> >             (subreg:V2DF (reg:V2DI 90) 0)

> >> >> >         ] UNSPEC_MOVMSK)

> >> >> >

> >> >> > This later defeats combine because insn 9 gets deleted.

> >> >> > Without patch, the following combination takes place:

> >> >> >

> >> >> > Trying 7 -> 9:

> >> >> >     7: r90:V2DI=r89:V2DI>r93:V2DI

> >> >> >       REG_DEAD r93:V2DI

> >> >> >       REG_DEAD r89:V2DI

> >> >> >     9: r91:V2DF=r90:V2DI#0

> >> >> >       REG_DEAD r90:V2DI

> >> >> > Successfully matched this instruction:

> >> >> > (set (subreg:V2DI (reg:V2DF 91) 0)

> >> >> >     (gt:V2DI (reg:V2DI 89)

> >> >> >         (reg:V2DI 93)))

> >> >> > allowing combination of insns 7 and 9

> >> >> >

> >> >> > and then:

> >> >> > Trying 6, 9 -> 10:

> >> >> >     6: r89:V2DI=const_vector

> >> >> >     9: r91:V2DF#0=r89:V2DI>r93:V2DI

> >> >> >       REG_DEAD r89:V2DI

> >> >> >       REG_DEAD r93:V2DI

> >> >> >    10: r87:SI=unspec[r91:V2DF] 43

> >> >> >       REG_DEAD r91:V2DF

> >> >> > Successfully matched this instruction:

> >> >> > (set (reg:SI 87)

> >> >> >     (unspec:SI [

> >> >> >             (lt:V2DF (reg:V2DI 93)

> >> >> >                 (const_vector:V2DI [

> >> >> >                         (const_int 0 [0]) repeated x2

> >> >> >                     ]))

> >> >> >         ] UNSPEC_MOVMSK))

> >> >>

> >> >> Eh?  lt:*V2DF*?  Does that mean that it's 0 for false and an all-1 NaN

> >> >> for true?

> >> > Well looking at .optimized dump:

> >> >

> >> >   vector(2) long int _2;

> >> >   vector(2) double _3;

> >> >   int _6;

> >> >   signed long _7;

> >> >   long int _8;

> >> >   signed long _9;

> >> >   long int _10;

> >> >

> >> >   <bb 2> [local count: 1073741824]:

> >> >   _7 = BIT_FIELD_REF <a_4(D), 64, 0>;

> >> >   _8 = _7 < 0 ? -1 : 0;

> >> >   _9 = BIT_FIELD_REF <a_4(D), 64, 64>;

> >> >   _10 = _9 < 0 ? -1 : 0;

> >> >   _2 = {_8, _10};

> >> >   _3 = VIEW_CONVERT_EXPR<__m128d>(_2);

> >> >   _6 = __builtin_ia32_movmskpd (_3); [tail call]

> >> >   return _6;

> >> >

> >> > So AFAIU, we're using -1 for representing true and 0 for false

> >> > and casting -1 (literally) to double would change it's value to -NaN ?

> >>

> >> Exactly.  And for -ffinite-math-only, we might as well then fold the

> >> condition to false. :-)

> >>

> >> IMO rtl condition results have to have integral modes and not folding

> >> (subreg:V2DF (lt:V2DI ...) 0) is the correct behaviour.  So I think this

> >> is just a latent bug and shouldn't hold up the patch.

> >>

> >> I'm not sure whether:

> >>

> >>    reinterpret_cast<__m128d> (a > ...)

> >>

> >> is something we expect users to write, or whether it was just

> >> included for completeness.

> > I will report the issue and commit after re-testing patch.

> > Thanks a lot for the helpful reviews!

>

> Since it seems FP comparison results are OK, I guess it needs to be

> fixed after all.  I think the problem is that the simplification

> is only done by gen_lowpart_for_combine:

>

>   /* If X is a comparison operator, rewrite it in a new mode.  This

>      probably won't match, but may allow further simplifications.  */

>   else if (COMPARISON_P (x))

>     return gen_rtx_fmt_ee (GET_CODE (x), omode, XEXP (x, 0), XEXP (x, 1));

>

> and triggers via expand_field_assignment when the subreg is on the lhs

> of the SET.  For it to work for a general subreg on the rhs, it probably

> needs to be moved from here to simplify_subreg.

>

> We'll need to be careful about the conditions under which it

> happens though.  The above doesn't for example check whether

> the new mode has the same number of elements as the original,

> so could generate things like:

>

>    (gt:V4SF (reg:V2DI X) (reg:V2DI Y))

>

> for:

>

>    (set (reg:V2DI res) (gt:V2DI (reg:V2DI X) (reg:V2DI Y)))

>    (subreg:V4SF (reg:V2DI res) 0)

>

> I think most code would expect the result of a comparison to have the

> same number of elements as the inputs and could ICE if they don't,

> so I don't think it's up to the target to decide whether mismatches

> are OK.  (But there again, I thought that last time. :-))

>

> We'd also need to check the element sizes, since e.g. a lowpart

> (subreg:V2HI ...) of a V2SI comparison isn't the same as a V2HI

> comparison.

Hi Richard,
Thanks for the detailed suggestions and sorry for late response, for
some reason, I missed this email earlier -;(
With the changes to simplify_subreg, the test doesn't regress anymore.

IIUC it does the following change:
(subreg:V2DF
  (lt:V2DI (reg:V2DI 93) (const_vector 0)))
->
(lt:V2DF (reg:V2DI 93) (const_vector 0))

which allowed the 6, 7 -> 10 combination which was failing earlier.

Does the attached version look OK ?
Testing in progress.

Thanks,
Prathamesh
>

> Thanks,

> Richard
Richard Sandiford July 2, 2019, 11:29 a.m. | #11
Thanks for fixing this.

Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c

> index 89a46a933fa..79bd0cfbd28 100644

> --- a/gcc/simplify-rtx.c

> +++ b/gcc/simplify-rtx.c

> @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op,

>  	}

>      }

>  

> +  /* If op is a vector comparison operator, rewrite it in a new mode.

> +     This probably won't match, but may allow further simplifications.

> +     Also check if number of elements and size of each element

> +     match for outermode and innermode.  */

> +


Excess blank line after the comment.  IMO the second part of the comment
reads too much like an afterthought.  How about:

  /* If OP is a vector comparison and the subreg is not changing the
     number of elements or the size of the elements, change the result
     of the comparison to the new mode.  */

> +  if (COMPARISON_P (op)

> +      && VECTOR_MODE_P (outermode)

> +      && VECTOR_MODE_P (innermode)

> +      && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode)))

> +      && (known_eq (GET_MODE_UNIT_SIZE (outermode),

> +		    GET_MODE_UNIT_SIZE (innermode))))


Redundant brackets around the known_eq calls.

> +    return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1));


This should use simplify_gen_relational, so that we try to simplify
the new expression.

Richard
Prathamesh Kulkarni July 2, 2019, 12:10 p.m. | #12
On Tue, 2 Jul 2019 at 16:59, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Thanks for fixing this.

>

> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c

> > index 89a46a933fa..79bd0cfbd28 100644

> > --- a/gcc/simplify-rtx.c

> > +++ b/gcc/simplify-rtx.c

> > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op,

> >       }

> >      }

> >

> > +  /* If op is a vector comparison operator, rewrite it in a new mode.

> > +     This probably won't match, but may allow further simplifications.

> > +     Also check if number of elements and size of each element

> > +     match for outermode and innermode.  */

> > +

>

> Excess blank line after the comment.  IMO the second part of the comment

> reads too much like an afterthought.  How about:

>

>   /* If OP is a vector comparison and the subreg is not changing the

>      number of elements or the size of the elements, change the result

>      of the comparison to the new mode.  */

>

> > +  if (COMPARISON_P (op)

> > +      && VECTOR_MODE_P (outermode)

> > +      && VECTOR_MODE_P (innermode)

> > +      && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode)))

> > +      && (known_eq (GET_MODE_UNIT_SIZE (outermode),

> > +                 GET_MODE_UNIT_SIZE (innermode))))

>

> Redundant brackets around the known_eq calls.

>

> > +    return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1));

>

> This should use simplify_gen_relational, so that we try to simplify

> the new expression.

Does the attached version look OK ?

Thanks,
Prathamesh
>

> Richard
diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index 45703fe5f01..e6f37527192 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -448,6 +448,18 @@ enum {
   PR_OPTIMIZE_FOR_SPEED = 4
 };
 
+/* Check that X has a single def.  */
+
+static bool
+reg_single_def_p (rtx x)
+{
+  if (!REG_P (x))
+    return false;
+
+  int regno = REGNO (x);
+  return (DF_REG_DEF_COUNT (regno) == 1
+	  && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (cfun)), regno));
+}
 
 /* Replace all occurrences of OLD in *PX with NEW and try to simplify the
    resulting expression.  Replace *PX with a new RTL expression if an
@@ -547,6 +559,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags)
 	  tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)),
 				     SUBREG_BYTE (x));
 	}
+
+      else
+	{
+	  rtvec vec;
+	  rtvec newvec;
+	  const char *fmt = GET_RTX_FORMAT (code);
+	  rtx op;
+
+	  for (int i = 0; fmt[i]; i++)
+	    switch (fmt[i])
+	      {
+	      case 'E':
+		vec = XVEC (x, i);
+		newvec = vec;
+		for (int j = 0; j < GET_NUM_ELEM (vec); j++)
+		  {
+		    op = RTVEC_ELT (vec, j);
+		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+		    if (op != RTVEC_ELT (vec, j))
+		      {
+			if (newvec == vec)
+			  {
+			    newvec = shallow_copy_rtvec (vec);
+			    if (!tem)
+			      tem = shallow_copy_rtx (x);
+			    XVEC (tem, i) = newvec;
+			  }
+			RTVEC_ELT (newvec, j) = op;
+		      }
+		  }
+	        break;
+
+	      case 'e':
+		if (XEXP (x, i))
+		  {
+		    op = XEXP (x, i);
+		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+		    if (op != XEXP (x, i))
+		      {
+			if (!tem)
+			  tem = shallow_copy_rtx (x);
+			XEXP (tem, i) = op;
+		      }
+		  }
+	        break;
+	      }
+	}
+
       break;
 
     case RTX_OBJ:
@@ -1370,10 +1430,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set)
 
 /* Given a use USE of an insn, if it has a single reaching
    definition, try to forward propagate it into that insn.
-   Return true if cfg cleanup will be needed.  */
+   Return true if cfg cleanup will be needed.
+   REG_PROP_ONLY is true if we should only propagate register copies.  */
 
 static bool
-forward_propagate_into (df_ref use)
+forward_propagate_into (df_ref use, bool reg_prop_only = false)
 {
   df_ref def;
   rtx_insn *def_insn, *use_insn;
@@ -1394,10 +1455,6 @@ forward_propagate_into (df_ref use)
   if (DF_REF_IS_ARTIFICIAL (def))
     return false;
 
-  /* Do not propagate loop invariant definitions inside the loop.  */
-  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father)
-    return false;
-
   /* Check if the use is still present in the insn!  */
   use_insn = DF_REF_INSN (use);
   if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE)
@@ -1415,6 +1472,19 @@ forward_propagate_into (df_ref use)
   if (!def_set)
     return false;
 
+  if (reg_prop_only
+      && (!reg_single_def_p (SET_SRC (def_set))
+	  || !reg_single_def_p (SET_DEST (def_set))))
+    return false;
+
+  /* Allow propagations into a loop only for reg-to-reg copies, since
+     replacing one register by another shouldn't increase the cost.  */
+
+  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father
+      && (!reg_single_def_p (SET_SRC (def_set))
+	  || !reg_single_def_p (SET_DEST (def_set))))
+    return false;
+
   /* Only try one kind of propagation.  If two are possible, we'll
      do it on the following iterations.  */
   if (forward_propagate_and_simplify (use, def_insn, def_set)
@@ -1483,7 +1553,7 @@ gate_fwprop (void)
 }
 
 static unsigned int
-fwprop (void)
+fwprop (bool fwprop_addr_p)
 {
   unsigned i;
 
@@ -1502,11 +1572,16 @@ fwprop (void)
 
       df_ref use = DF_USES_GET (i);
       if (use)
-	if (DF_REF_TYPE (use) == DF_REF_REG_USE
-	    || DF_REF_BB (use)->loop_father == NULL
-	    /* The outer most loop is not really a loop.  */
-	    || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
-	  forward_propagate_into (use);
+	{
+	  if (DF_REF_TYPE (use) == DF_REF_REG_USE
+	      || DF_REF_BB (use)->loop_father == NULL
+	      /* The outer most loop is not really a loop.  */
+	      || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
+	    forward_propagate_into (use, fwprop_addr_p);
+
+	  else if (fwprop_addr_p)
+	    forward_propagate_into (use, false);
+	}
     }
 
   fwprop_done ();
@@ -1537,7 +1612,7 @@ public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop (); }
+  virtual unsigned int execute (function *) { return fwprop (false); }
 
 }; // class pass_rtl_fwprop
 
@@ -1549,33 +1624,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt)
   return new pass_rtl_fwprop (ctxt);
 }
 
-static unsigned int
-fwprop_addr (void)
-{
-  unsigned i;
-
-  fwprop_init ();
-
-  /* Go through all the uses.  df_uses_create will create new ones at the
-     end, and we'll go through them as well.  */
-  for (i = 0; i < DF_USES_TABLE_SIZE (); i++)
-    {
-      if (!propagations_left)
-	break;
-
-      df_ref use = DF_USES_GET (i);
-      if (use)
-	if (DF_REF_TYPE (use) != DF_REF_REG_USE
-	    && DF_REF_BB (use)->loop_father != NULL
-	    /* The outer most loop is not really a loop.  */
-	    && loop_outer (DF_REF_BB (use)->loop_father) != NULL)
-	  forward_propagate_into (use);
-    }
-
-  fwprop_done ();
-  return 0;
-}
-
 namespace {
 
 const pass_data pass_data_rtl_fwprop_addr =
@@ -1600,7 +1648,7 @@ public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop_addr (); }
+  virtual unsigned int execute (function *) { return fwprop (true); }
 
 }; // class pass_rtl_fwprop_addr
 
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 89a46a933fa..c588c91562b 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -6697,6 +6697,17 @@ simplify_subreg (machine_mode outermode, rtx op,
 	}
     }
 
+  /* If OP is a vector comparison and the subreg is not changing the
+     number of elements or the size of the elements, change the result
+     of the comparison to the new mode.  */
+  if (COMPARISON_P (op)
+      && VECTOR_MODE_P (outermode)
+      && VECTOR_MODE_P (innermode)
+      && known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode))
+      && known_eq (GET_MODE_UNIT_SIZE (outermode),
+		    GET_MODE_UNIT_SIZE (innermode)))
+    return simplify_gen_relational (GET_CODE (op), outermode, innermode,
+				    XEXP (op, 0), XEXP (op, 1)); 
   return NULL_RTX;
 }
 
diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90
new file mode 100644
index 00000000000..224e6ce5f3d
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr88833.f90
@@ -0,0 +1,9 @@
+! { dg-do assemble { target aarch64_asm_sve_ok } }
+! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" }
+
+subroutine foo(x)
+  real :: x(100)
+  x = x + 10
+end subroutine foo
+
+! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }
Richard Sandiford July 2, 2019, 12:52 p.m. | #13
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Tue, 2 Jul 2019 at 16:59, Richard Sandiford

> <richard.sandiford@arm.com> wrote:

>>

>> Thanks for fixing this.

>>

>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

>> > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c

>> > index 89a46a933fa..79bd0cfbd28 100644

>> > --- a/gcc/simplify-rtx.c

>> > +++ b/gcc/simplify-rtx.c

>> > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op,

>> >       }

>> >      }

>> >

>> > +  /* If op is a vector comparison operator, rewrite it in a new mode.

>> > +     This probably won't match, but may allow further simplifications.

>> > +     Also check if number of elements and size of each element

>> > +     match for outermode and innermode.  */

>> > +

>>

>> Excess blank line after the comment.  IMO the second part of the comment

>> reads too much like an afterthought.  How about:

>>

>>   /* If OP is a vector comparison and the subreg is not changing the

>>      number of elements or the size of the elements, change the result

>>      of the comparison to the new mode.  */

>>

>> > +  if (COMPARISON_P (op)

>> > +      && VECTOR_MODE_P (outermode)

>> > +      && VECTOR_MODE_P (innermode)

>> > +      && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode)))

>> > +      && (known_eq (GET_MODE_UNIT_SIZE (outermode),

>> > +                 GET_MODE_UNIT_SIZE (innermode))))

>>

>> Redundant brackets around the known_eq calls.

>>

>> > +    return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1));

>>

>> This should use simplify_gen_relational, so that we try to simplify

>> the new expression.

> Does the attached version look OK ?


OK with a suitable changelog (remember to post those!) if it passes testing.

Thanks,
Richard
Prathamesh Kulkarni July 3, 2019, 10:45 a.m. | #14
On Tue, 2 Jul 2019 at 18:22, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> > On Tue, 2 Jul 2019 at 16:59, Richard Sandiford

> > <richard.sandiford@arm.com> wrote:

> >>

> >> Thanks for fixing this.

> >>

> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> >> > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c

> >> > index 89a46a933fa..79bd0cfbd28 100644

> >> > --- a/gcc/simplify-rtx.c

> >> > +++ b/gcc/simplify-rtx.c

> >> > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op,

> >> >       }

> >> >      }

> >> >

> >> > +  /* If op is a vector comparison operator, rewrite it in a new mode.

> >> > +     This probably won't match, but may allow further simplifications.

> >> > +     Also check if number of elements and size of each element

> >> > +     match for outermode and innermode.  */

> >> > +

> >>

> >> Excess blank line after the comment.  IMO the second part of the comment

> >> reads too much like an afterthought.  How about:

> >>

> >>   /* If OP is a vector comparison and the subreg is not changing the

> >>      number of elements or the size of the elements, change the result

> >>      of the comparison to the new mode.  */

> >>

> >> > +  if (COMPARISON_P (op)

> >> > +      && VECTOR_MODE_P (outermode)

> >> > +      && VECTOR_MODE_P (innermode)

> >> > +      && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode)))

> >> > +      && (known_eq (GET_MODE_UNIT_SIZE (outermode),

> >> > +                 GET_MODE_UNIT_SIZE (innermode))))

> >>

> >> Redundant brackets around the known_eq calls.

> >>

> >> > +    return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1));

> >>

> >> This should use simplify_gen_relational, so that we try to simplify

> >> the new expression.

> > Does the attached version look OK ?

>

> OK with a suitable changelog (remember to post those!) if it passes testing.

The change to simplify_subreg regressed avx2-pr54700-1.C -;)

For following test-case:
__attribute__((noipa)) __v8sf
f7 (__v8si a, __v8sf b, __v8sf c)
{
  return a < 0 ? b : c;
}

Before patch, combine shows:
Trying 8, 9 -> 10:
    8: r87:V8SI=const_vector
    9: r89:V8SI=r87:V8SI>r90:V8SI
      REG_DEAD r90:V8SI
      REG_DEAD r87:V8SI
   10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104
      REG_DEAD r92:V8SF
      REG_DEAD r89:V8SI
      REG_DEAD r91:V8SF
Successfully matched this instruction:
(set (reg:V8SF 86)
    (unspec:V8SF [
            (reg:V8SF 92)
            (reg:V8SF 91)
            (subreg:V8SF (lt:V8SI (reg:V8SI 90)
                    (const_vector:V8SI [
                            (const_int 0 [0]) repeated x8
                        ])) 0)
        ] UNSPEC_BLENDV))
allowing combination of insns 8, 9 and 10

After applying patch, combine shows:

Trying 8, 9 -> 10:
    8: r87:V8SI=const_vector
    9: r89:V8SI=r87:V8SI>r90:V8SI
      REG_DEAD r90:V8SI
      REG_DEAD r87:V8SI
   10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104
      REG_DEAD r92:V8SF
      REG_DEAD r89:V8SI
      REG_DEAD r91:V8SF
Failed to match this instruction:
(set (reg:V8SF 86)
    (unspec:V8SF [
            (reg:V8SF 92)
            (reg:V8SF 91)
            (lt:V8SF (reg:V8SI 90)
                (const_vector:V8SI [
                        (const_int 0 [0]) repeated x8
                    ]))
        ] UNSPEC_BLENDV))

Adjusting UNSPEC_BLENDV pattern to match the simplified subreg seems to work.
Does it look OK ?

Testing the attached patch in progress.
(A quick comparison for i386.exp now shows no difference before/after patch).

Thanks,
Prathamesh
>

> Thanks,

> Richard
2019-07-03  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>

	* fwprop.c (reg_single_def_p): New function.
	(propagate_rtx_1): Add unconditional else inside RTX_EXTRA case.
	(forward_propagate_into): New parameter reg_prop_only
	with default value false.
	Propagate def's src into loop only if SET_SRC and SET_DEST
	of def_set have single definitions.
	Likewise if reg_prop_only is set to true.
	(fwprop): New param fwprop_addr_p.
	Integrate fwprop_addr into fwprop.
	(fwprop_addr): Remove.
	(pass_rtl_fwprop_addr::execute): Call fwprop with arg set
	to true.
	(pass_rtl_fwprop::execute): Call fwprop with arg set to false.
	* simplify-rtx.c (simplify_subreg): Add case for vector comparison.
	* i386/config/sse.md (UNSPEC_BLENDV): Adjust pattern.

testsuite/
	* gfortran.dg/pr88833.f90: New test. 

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d7d542524fb..5384fe218f9 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16623,10 +16623,9 @@
 	(unspec:VF_128_256
 	  [(match_operand:VF_128_256 1 "register_operand" "0,0,x")
 	   (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm")
-	   (subreg:VF_128_256
-	     (lt:<sseintvecmode>
+	     (lt:VF_128_256
 	       (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x")
-	       (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)]
+	       (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))]
 	  UNSPEC_BLENDV))]
   "TARGET_SSE4_1"
   "#"
diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index 45703fe5f01..e6f37527192 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -448,6 +448,18 @@ enum {
   PR_OPTIMIZE_FOR_SPEED = 4
 };
 
+/* Check that X has a single def.  */
+
+static bool
+reg_single_def_p (rtx x)
+{
+  if (!REG_P (x))
+    return false;
+
+  int regno = REGNO (x);
+  return (DF_REG_DEF_COUNT (regno) == 1
+	  && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (cfun)), regno));
+}
 
 /* Replace all occurrences of OLD in *PX with NEW and try to simplify the
    resulting expression.  Replace *PX with a new RTL expression if an
@@ -547,6 +559,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags)
 	  tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)),
 				     SUBREG_BYTE (x));
 	}
+
+      else
+	{
+	  rtvec vec;
+	  rtvec newvec;
+	  const char *fmt = GET_RTX_FORMAT (code);
+	  rtx op;
+
+	  for (int i = 0; fmt[i]; i++)
+	    switch (fmt[i])
+	      {
+	      case 'E':
+		vec = XVEC (x, i);
+		newvec = vec;
+		for (int j = 0; j < GET_NUM_ELEM (vec); j++)
+		  {
+		    op = RTVEC_ELT (vec, j);
+		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+		    if (op != RTVEC_ELT (vec, j))
+		      {
+			if (newvec == vec)
+			  {
+			    newvec = shallow_copy_rtvec (vec);
+			    if (!tem)
+			      tem = shallow_copy_rtx (x);
+			    XVEC (tem, i) = newvec;
+			  }
+			RTVEC_ELT (newvec, j) = op;
+		      }
+		  }
+	        break;
+
+	      case 'e':
+		if (XEXP (x, i))
+		  {
+		    op = XEXP (x, i);
+		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+		    if (op != XEXP (x, i))
+		      {
+			if (!tem)
+			  tem = shallow_copy_rtx (x);
+			XEXP (tem, i) = op;
+		      }
+		  }
+	        break;
+	      }
+	}
+
       break;
 
     case RTX_OBJ:
@@ -1370,10 +1430,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set)
 
 /* Given a use USE of an insn, if it has a single reaching
    definition, try to forward propagate it into that insn.
-   Return true if cfg cleanup will be needed.  */
+   Return true if cfg cleanup will be needed.
+   REG_PROP_ONLY is true if we should only propagate register copies.  */
 
 static bool
-forward_propagate_into (df_ref use)
+forward_propagate_into (df_ref use, bool reg_prop_only = false)
 {
   df_ref def;
   rtx_insn *def_insn, *use_insn;
@@ -1394,10 +1455,6 @@ forward_propagate_into (df_ref use)
   if (DF_REF_IS_ARTIFICIAL (def))
     return false;
 
-  /* Do not propagate loop invariant definitions inside the loop.  */
-  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father)
-    return false;
-
   /* Check if the use is still present in the insn!  */
   use_insn = DF_REF_INSN (use);
   if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE)
@@ -1415,6 +1472,19 @@ forward_propagate_into (df_ref use)
   if (!def_set)
     return false;
 
+  if (reg_prop_only
+      && (!reg_single_def_p (SET_SRC (def_set))
+	  || !reg_single_def_p (SET_DEST (def_set))))
+    return false;
+
+  /* Allow propagations into a loop only for reg-to-reg copies, since
+     replacing one register by another shouldn't increase the cost.  */
+
+  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father
+      && (!reg_single_def_p (SET_SRC (def_set))
+	  || !reg_single_def_p (SET_DEST (def_set))))
+    return false;
+
   /* Only try one kind of propagation.  If two are possible, we'll
      do it on the following iterations.  */
   if (forward_propagate_and_simplify (use, def_insn, def_set)
@@ -1483,7 +1553,7 @@ gate_fwprop (void)
 }
 
 static unsigned int
-fwprop (void)
+fwprop (bool fwprop_addr_p)
 {
   unsigned i;
 
@@ -1502,11 +1572,16 @@ fwprop (void)
 
       df_ref use = DF_USES_GET (i);
       if (use)
-	if (DF_REF_TYPE (use) == DF_REF_REG_USE
-	    || DF_REF_BB (use)->loop_father == NULL
-	    /* The outer most loop is not really a loop.  */
-	    || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
-	  forward_propagate_into (use);
+	{
+	  if (DF_REF_TYPE (use) == DF_REF_REG_USE
+	      || DF_REF_BB (use)->loop_father == NULL
+	      /* The outer most loop is not really a loop.  */
+	      || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
+	    forward_propagate_into (use, fwprop_addr_p);
+
+	  else if (fwprop_addr_p)
+	    forward_propagate_into (use, false);
+	}
     }
 
   fwprop_done ();
@@ -1537,7 +1612,7 @@ public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop (); }
+  virtual unsigned int execute (function *) { return fwprop (false); }
 
 }; // class pass_rtl_fwprop
 
@@ -1549,33 +1624,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt)
   return new pass_rtl_fwprop (ctxt);
 }
 
-static unsigned int
-fwprop_addr (void)
-{
-  unsigned i;
-
-  fwprop_init ();
-
-  /* Go through all the uses.  df_uses_create will create new ones at the
-     end, and we'll go through them as well.  */
-  for (i = 0; i < DF_USES_TABLE_SIZE (); i++)
-    {
-      if (!propagations_left)
-	break;
-
-      df_ref use = DF_USES_GET (i);
-      if (use)
-	if (DF_REF_TYPE (use) != DF_REF_REG_USE
-	    && DF_REF_BB (use)->loop_father != NULL
-	    /* The outer most loop is not really a loop.  */
-	    && loop_outer (DF_REF_BB (use)->loop_father) != NULL)
-	  forward_propagate_into (use);
-    }
-
-  fwprop_done ();
-  return 0;
-}
-
 namespace {
 
 const pass_data pass_data_rtl_fwprop_addr =
@@ -1600,7 +1648,7 @@ public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop_addr (); }
+  virtual unsigned int execute (function *) { return fwprop (true); }
 
 }; // class pass_rtl_fwprop_addr
 
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 89a46a933fa..dd2acd4eca9 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -6697,6 +6697,17 @@ simplify_subreg (machine_mode outermode, rtx op,
 	}
     }
 
+  /* If OP is a vector comparison and the subreg is not changing the
+     number of elements or the size of the elements, change the result
+     of the comparison to the new mode.  */
+  if (COMPARISON_P (op)
+      && VECTOR_MODE_P (outermode)
+      && VECTOR_MODE_P (innermode)
+      && known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode))
+      && known_eq (GET_MODE_UNIT_SIZE (outermode),
+		    GET_MODE_UNIT_SIZE (innermode)))
+    return simplify_gen_relational (GET_CODE (op), outermode, innermode,
+				    XEXP (op, 0), XEXP (op, 1));
   return NULL_RTX;
 }
 
diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90
new file mode 100644
index 00000000000..224e6ce5f3d
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr88833.f90
@@ -0,0 +1,9 @@
+! { dg-do assemble { target aarch64_asm_sve_ok } }
+! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" }
+
+subroutine foo(x)
+  real :: x(100)
+  x = x + 10
+end subroutine foo
+
+! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }
Richard Sandiford July 3, 2019, 11:36 a.m. | #15
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Tue, 2 Jul 2019 at 18:22, Richard Sandiford

> <richard.sandiford@arm.com> wrote:

>>

>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

>> > On Tue, 2 Jul 2019 at 16:59, Richard Sandiford

>> > <richard.sandiford@arm.com> wrote:

>> >>

>> >> Thanks for fixing this.

>> >>

>> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

>> >> > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c

>> >> > index 89a46a933fa..79bd0cfbd28 100644

>> >> > --- a/gcc/simplify-rtx.c

>> >> > +++ b/gcc/simplify-rtx.c

>> >> > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op,

>> >> >       }

>> >> >      }

>> >> >

>> >> > +  /* If op is a vector comparison operator, rewrite it in a new mode.

>> >> > +     This probably won't match, but may allow further simplifications.

>> >> > +     Also check if number of elements and size of each element

>> >> > +     match for outermode and innermode.  */

>> >> > +

>> >>

>> >> Excess blank line after the comment.  IMO the second part of the comment

>> >> reads too much like an afterthought.  How about:

>> >>

>> >>   /* If OP is a vector comparison and the subreg is not changing the

>> >>      number of elements or the size of the elements, change the result

>> >>      of the comparison to the new mode.  */

>> >>

>> >> > +  if (COMPARISON_P (op)

>> >> > +      && VECTOR_MODE_P (outermode)

>> >> > +      && VECTOR_MODE_P (innermode)

>> >> > +      && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode)))

>> >> > +      && (known_eq (GET_MODE_UNIT_SIZE (outermode),

>> >> > +                 GET_MODE_UNIT_SIZE (innermode))))

>> >>

>> >> Redundant brackets around the known_eq calls.

>> >>

>> >> > +    return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1));

>> >>

>> >> This should use simplify_gen_relational, so that we try to simplify

>> >> the new expression.

>> > Does the attached version look OK ?

>>

>> OK with a suitable changelog (remember to post those!) if it passes testing.

> The change to simplify_subreg regressed avx2-pr54700-1.C -;)

>

> For following test-case:

> __attribute__((noipa)) __v8sf

> f7 (__v8si a, __v8sf b, __v8sf c)

> {

>   return a < 0 ? b : c;

> }

>

> Before patch, combine shows:

> Trying 8, 9 -> 10:

>     8: r87:V8SI=const_vector

>     9: r89:V8SI=r87:V8SI>r90:V8SI

>       REG_DEAD r90:V8SI

>       REG_DEAD r87:V8SI

>    10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104

>       REG_DEAD r92:V8SF

>       REG_DEAD r89:V8SI

>       REG_DEAD r91:V8SF

> Successfully matched this instruction:

> (set (reg:V8SF 86)

>     (unspec:V8SF [

>             (reg:V8SF 92)

>             (reg:V8SF 91)

>             (subreg:V8SF (lt:V8SI (reg:V8SI 90)

>                     (const_vector:V8SI [

>                             (const_int 0 [0]) repeated x8

>                         ])) 0)

>         ] UNSPEC_BLENDV))

> allowing combination of insns 8, 9 and 10

>

> After applying patch, combine shows:

>

> Trying 8, 9 -> 10:

>     8: r87:V8SI=const_vector

>     9: r89:V8SI=r87:V8SI>r90:V8SI

>       REG_DEAD r90:V8SI

>       REG_DEAD r87:V8SI

>    10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104

>       REG_DEAD r92:V8SF

>       REG_DEAD r89:V8SI

>       REG_DEAD r91:V8SF

> Failed to match this instruction:

> (set (reg:V8SF 86)

>     (unspec:V8SF [

>             (reg:V8SF 92)

>             (reg:V8SF 91)

>             (lt:V8SF (reg:V8SI 90)

>                 (const_vector:V8SI [

>                         (const_int 0 [0]) repeated x8

>                     ]))

>         ] UNSPEC_BLENDV))


Bah.  If the port isn't self-consistent about whether a subreg should
be used, it's tempting to say that a subreg should be used and fix the
places that don't.  At least that way we'd avoid the abomination -
ABOMINATION! - of using NaNs to represent truth.

But I agree it looks like this is the only pattern affected.

> Adjusting UNSPEC_BLENDV pattern to match the simplified subreg seems to work.

> Does it look OK ?

>

> Testing the attached patch in progress.

> (A quick comparison for i386.exp now shows no difference before/after patch).

>

> Thanks,

> Prathamesh

>>

>> Thanks,

>> Richard

>

> 2019-07-03  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>

>

> 	* fwprop.c (reg_single_def_p): New function.

> 	(propagate_rtx_1): Add unconditional else inside RTX_EXTRA case.

> 	(forward_propagate_into): New parameter reg_prop_only

> 	with default value false.

> 	Propagate def's src into loop only if SET_SRC and SET_DEST

> 	of def_set have single definitions.

> 	Likewise if reg_prop_only is set to true.

> 	(fwprop): New param fwprop_addr_p.

> 	Integrate fwprop_addr into fwprop.

> 	(fwprop_addr): Remove.

> 	(pass_rtl_fwprop_addr::execute): Call fwprop with arg set

> 	to true.

> 	(pass_rtl_fwprop::execute): Call fwprop with arg set to false.

> 	* simplify-rtx.c (simplify_subreg): Add case for vector comparison.

> 	* i386/config/sse.md (UNSPEC_BLENDV): Adjust pattern.


typo: config/i386/sse.md

>

> testsuite/

> 	* gfortran.dg/pr88833.f90: New test. 

>

> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md

> index d7d542524fb..5384fe218f9 100644

> --- a/gcc/config/i386/sse.md

> +++ b/gcc/config/i386/sse.md

> @@ -16623,10 +16623,9 @@

>  	(unspec:VF_128_256

>  	  [(match_operand:VF_128_256 1 "register_operand" "0,0,x")

>  	   (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm")

> -	   (subreg:VF_128_256

> -	     (lt:<sseintvecmode>

> +	     (lt:VF_128_256

>  	       (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x")

> -	       (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)]

> +	       (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))]

>  	  UNSPEC_BLENDV))]

>    "TARGET_SSE4_1"

>    "#"


The (lt:...) and its operands should now be indented by two columns fewer
than previously.

OK with that change, thanks.

Richard
Prathamesh Kulkarni July 4, 2019, 6:50 a.m. | #16
On Wed, 3 Jul 2019 at 17:06, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> > On Tue, 2 Jul 2019 at 18:22, Richard Sandiford

> > <richard.sandiford@arm.com> wrote:

> >>

> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> >> > On Tue, 2 Jul 2019 at 16:59, Richard Sandiford

> >> > <richard.sandiford@arm.com> wrote:

> >> >>

> >> >> Thanks for fixing this.

> >> >>

> >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:

> >> >> > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c

> >> >> > index 89a46a933fa..79bd0cfbd28 100644

> >> >> > --- a/gcc/simplify-rtx.c

> >> >> > +++ b/gcc/simplify-rtx.c

> >> >> > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op,

> >> >> >       }

> >> >> >      }

> >> >> >

> >> >> > +  /* If op is a vector comparison operator, rewrite it in a new mode.

> >> >> > +     This probably won't match, but may allow further simplifications.

> >> >> > +     Also check if number of elements and size of each element

> >> >> > +     match for outermode and innermode.  */

> >> >> > +

> >> >>

> >> >> Excess blank line after the comment.  IMO the second part of the comment

> >> >> reads too much like an afterthought.  How about:

> >> >>

> >> >>   /* If OP is a vector comparison and the subreg is not changing the

> >> >>      number of elements or the size of the elements, change the result

> >> >>      of the comparison to the new mode.  */

> >> >>

> >> >> > +  if (COMPARISON_P (op)

> >> >> > +      && VECTOR_MODE_P (outermode)

> >> >> > +      && VECTOR_MODE_P (innermode)

> >> >> > +      && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode)))

> >> >> > +      && (known_eq (GET_MODE_UNIT_SIZE (outermode),

> >> >> > +                 GET_MODE_UNIT_SIZE (innermode))))

> >> >>

> >> >> Redundant brackets around the known_eq calls.

> >> >>

> >> >> > +    return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1));

> >> >>

> >> >> This should use simplify_gen_relational, so that we try to simplify

> >> >> the new expression.

> >> > Does the attached version look OK ?

> >>

> >> OK with a suitable changelog (remember to post those!) if it passes testing.

> > The change to simplify_subreg regressed avx2-pr54700-1.C -;)

> >

> > For following test-case:

> > __attribute__((noipa)) __v8sf

> > f7 (__v8si a, __v8sf b, __v8sf c)

> > {

> >   return a < 0 ? b : c;

> > }

> >

> > Before patch, combine shows:

> > Trying 8, 9 -> 10:

> >     8: r87:V8SI=const_vector

> >     9: r89:V8SI=r87:V8SI>r90:V8SI

> >       REG_DEAD r90:V8SI

> >       REG_DEAD r87:V8SI

> >    10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104

> >       REG_DEAD r92:V8SF

> >       REG_DEAD r89:V8SI

> >       REG_DEAD r91:V8SF

> > Successfully matched this instruction:

> > (set (reg:V8SF 86)

> >     (unspec:V8SF [

> >             (reg:V8SF 92)

> >             (reg:V8SF 91)

> >             (subreg:V8SF (lt:V8SI (reg:V8SI 90)

> >                     (const_vector:V8SI [

> >                             (const_int 0 [0]) repeated x8

> >                         ])) 0)

> >         ] UNSPEC_BLENDV))

> > allowing combination of insns 8, 9 and 10

> >

> > After applying patch, combine shows:

> >

> > Trying 8, 9 -> 10:

> >     8: r87:V8SI=const_vector

> >     9: r89:V8SI=r87:V8SI>r90:V8SI

> >       REG_DEAD r90:V8SI

> >       REG_DEAD r87:V8SI

> >    10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104

> >       REG_DEAD r92:V8SF

> >       REG_DEAD r89:V8SI

> >       REG_DEAD r91:V8SF

> > Failed to match this instruction:

> > (set (reg:V8SF 86)

> >     (unspec:V8SF [

> >             (reg:V8SF 92)

> >             (reg:V8SF 91)

> >             (lt:V8SF (reg:V8SI 90)

> >                 (const_vector:V8SI [

> >                         (const_int 0 [0]) repeated x8

> >                     ]))

> >         ] UNSPEC_BLENDV))

>

> Bah.  If the port isn't self-consistent about whether a subreg should

> be used, it's tempting to say that a subreg should be used and fix the

> places that don't.  At least that way we'd avoid the abomination -

> ABOMINATION! - of using NaNs to represent truth.

>

> But I agree it looks like this is the only pattern affected.

>

> > Adjusting UNSPEC_BLENDV pattern to match the simplified subreg seems to work.

> > Does it look OK ?

> >

> > Testing the attached patch in progress.

> > (A quick comparison for i386.exp now shows no difference before/after patch).

> >

> > Thanks,

> > Prathamesh

> >>

> >> Thanks,

> >> Richard

> >

> > 2019-07-03  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>

> >

> >       * fwprop.c (reg_single_def_p): New function.

> >       (propagate_rtx_1): Add unconditional else inside RTX_EXTRA case.

> >       (forward_propagate_into): New parameter reg_prop_only

> >       with default value false.

> >       Propagate def's src into loop only if SET_SRC and SET_DEST

> >       of def_set have single definitions.

> >       Likewise if reg_prop_only is set to true.

> >       (fwprop): New param fwprop_addr_p.

> >       Integrate fwprop_addr into fwprop.

> >       (fwprop_addr): Remove.

> >       (pass_rtl_fwprop_addr::execute): Call fwprop with arg set

> >       to true.

> >       (pass_rtl_fwprop::execute): Call fwprop with arg set to false.

> >       * simplify-rtx.c (simplify_subreg): Add case for vector comparison.

> >       * i386/config/sse.md (UNSPEC_BLENDV): Adjust pattern.

>

> typo: config/i386/sse.md

>

> >

> > testsuite/

> >       * gfortran.dg/pr88833.f90: New test.

> >

> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md

> > index d7d542524fb..5384fe218f9 100644

> > --- a/gcc/config/i386/sse.md

> > +++ b/gcc/config/i386/sse.md

> > @@ -16623,10 +16623,9 @@

> >       (unspec:VF_128_256

> >         [(match_operand:VF_128_256 1 "register_operand" "0,0,x")

> >          (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm")

> > -        (subreg:VF_128_256

> > -          (lt:<sseintvecmode>

> > +          (lt:VF_128_256

> >              (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x")

> > -            (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)]

> > +            (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))]

> >         UNSPEC_BLENDV))]

> >    "TARGET_SSE4_1"

> >    "#"

>

> The (lt:...) and its operands should now be indented by two columns fewer

> than previously.

>

> OK with that change, thanks.

Thanks, committed in r273040.

Thanks,
Prathamesh
>

> Richard

Patch

diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index 45703fe5f01..93a1a10c9a6 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -547,6 +547,54 @@  propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags)
 	  tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)),
 				     SUBREG_BYTE (x));
 	}
+
+      else
+	{
+	  rtvec vec;
+	  rtvec newvec;
+	  const char *fmt = GET_RTX_FORMAT (code);
+	  rtx op;
+
+	  for (int i = 0; fmt[i]; i++)
+	    switch (fmt[i])
+	      {
+	      case 'E':
+		vec = XVEC (x, i);
+		newvec = vec;
+		for (int j = 0; j < GET_NUM_ELEM (vec); j++)
+		  {
+		    op = RTVEC_ELT (vec, j);
+		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+		    if (op != RTVEC_ELT (vec, j))
+		      {
+			if (newvec == vec)
+			  {
+			    newvec = shallow_copy_rtvec (vec);
+			    if (!tem)
+			      tem = shallow_copy_rtx (x);
+			    XVEC (tem, i) = newvec;
+			  }
+			RTVEC_ELT (newvec, j) = op;
+		      }
+		  }
+	      break;
+
+	      case 'e':
+		if (XEXP (x, i))
+		  {
+		    op = XEXP (x, i);
+		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+		    if (op != XEXP (x, i))
+		      {
+			if (!tem)
+			  tem = shallow_copy_rtx (x);
+			XEXP (tem, i) = op;
+		      }
+		  }
+	      break;
+	      }
+	}
+
       break;
 
     case RTX_OBJ:
@@ -1370,10 +1418,11 @@  forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set)
 
 /* Given a use USE of an insn, if it has a single reaching
    definition, try to forward propagate it into that insn.
-   Return true if cfg cleanup will be needed.  */
+   Return true if cfg cleanup will be needed.
+   REG_PROP_ONLY is true if we should only propagate register copies.  */
 
 static bool
-forward_propagate_into (df_ref use)
+forward_propagate_into (df_ref use, bool reg_prop_only = false)
 {
   df_ref def;
   rtx_insn *def_insn, *use_insn;
@@ -1394,10 +1443,6 @@  forward_propagate_into (df_ref use)
   if (DF_REF_IS_ARTIFICIAL (def))
     return false;
 
-  /* Do not propagate loop invariant definitions inside the loop.  */
-  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father)
-    return false;
-
   /* Check if the use is still present in the insn!  */
   use_insn = DF_REF_INSN (use);
   if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE)
@@ -1415,6 +1460,16 @@  forward_propagate_into (df_ref use)
   if (!def_set)
     return false;
 
+  if (reg_prop_only && !REG_P (SET_SRC (def_set)))
+    return false;
+
+  /* Allow propagating def inside loop only if source of def_set is
+     reg, since replacing reg by source reg shouldn't increase cost.  */
+
+  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father
+      && !REG_P (SET_SRC (def_set)))
+    return false;
+
   /* Only try one kind of propagation.  If two are possible, we'll
      do it on the following iterations.  */
   if (forward_propagate_and_simplify (use, def_insn, def_set)
@@ -1483,7 +1538,7 @@  gate_fwprop (void)
 }
 
 static unsigned int
-fwprop (void)
+fwprop (bool fwprop_addr_p)
 {
   unsigned i;
 
@@ -1502,11 +1557,16 @@  fwprop (void)
 
       df_ref use = DF_USES_GET (i);
       if (use)
-	if (DF_REF_TYPE (use) == DF_REF_REG_USE
-	    || DF_REF_BB (use)->loop_father == NULL
-	    /* The outer most loop is not really a loop.  */
-	    || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
-	  forward_propagate_into (use);
+	{
+	  if (DF_REF_TYPE (use) == DF_REF_REG_USE
+	      || DF_REF_BB (use)->loop_father == NULL
+	      /* The outer most loop is not really a loop.  */
+	      || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
+	    forward_propagate_into (use, fwprop_addr_p);
+
+	  else if (fwprop_addr_p)
+	    forward_propagate_into (use, false);
+	}
     }
 
   fwprop_done ();
@@ -1537,7 +1597,7 @@  public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop (); }
+  virtual unsigned int execute (function *) { return fwprop (false); }
 
 }; // class pass_rtl_fwprop
 
@@ -1549,33 +1609,6 @@  make_pass_rtl_fwprop (gcc::context *ctxt)
   return new pass_rtl_fwprop (ctxt);
 }
 
-static unsigned int
-fwprop_addr (void)
-{
-  unsigned i;
-
-  fwprop_init ();
-
-  /* Go through all the uses.  df_uses_create will create new ones at the
-     end, and we'll go through them as well.  */
-  for (i = 0; i < DF_USES_TABLE_SIZE (); i++)
-    {
-      if (!propagations_left)
-	break;
-
-      df_ref use = DF_USES_GET (i);
-      if (use)
-	if (DF_REF_TYPE (use) != DF_REF_REG_USE
-	    && DF_REF_BB (use)->loop_father != NULL
-	    /* The outer most loop is not really a loop.  */
-	    && loop_outer (DF_REF_BB (use)->loop_father) != NULL)
-	  forward_propagate_into (use);
-    }
-
-  fwprop_done ();
-  return 0;
-}
-
 namespace {
 
 const pass_data pass_data_rtl_fwprop_addr =
@@ -1600,7 +1633,7 @@  public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop_addr (); }
+  virtual unsigned int execute (function *) { return fwprop (true); }
 
 }; // class pass_rtl_fwprop_addr
 
diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90
new file mode 100644
index 00000000000..224e6ce5f3d
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr88833.f90
@@ -0,0 +1,9 @@ 
+! { dg-do assemble { target aarch64_asm_sve_ok } }
+! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" }
+
+subroutine foo(x)
+  real :: x(100)
+  x = x + 10
+end subroutine foo
+
+! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }