diff mbox

[ARM] Fix off-by-one error in arm_evpc_neon_vrev.

Message ID CACUk7=V9zDvb0em-67zrK04ZrCYWNp50YmQabzgS0y+gCB_HBg@mail.gmail.com
State Accepted
Headers show

Commit Message

Ramana Radhakrishnan May 30, 2012, 2:03 a.m. UTC
On 29 May 2012 18:30, Richard Henderson <rth@redhat.com> wrote:
> On 05/26/2012 01:27 AM, Ramana Radhakrishnan wrote:
>>
>> -  for (i = 0; i<  nelt; i += diff)
>> +  for (i = 0; i<  nelt ; i += (diff + 1))
>>      for (j = 0; j<= diff; j += 1)
>> -      if (d->perm[i + j] != i + diff - j)
>> -       return false;
>> +      {
>> +       /* This is guaranteed to be true as the value of diff
>> +          is 7, 3, 1 and we should have enough elements in the
>> +          queue to generate this. Getting a vector mask with a
>> +          value of diff other than these values implies that
>> +          something is wrong by the time we get here.  */
>> +       gcc_assert ((i + j)<  nelt);
>
>
> Yep, that all looks correct.  Unnecessary () in both lines though.

Bah - Thanks - don't know why I put those in :( . Committed to trunk
with those changes. If the release managers don't object, I would like
to backport this to the 4.7 branch after a couple of weeks, to allow the
auto-testers to pick this up, as it really turns on this functionality
in this particular case. This is a significant performance issue in 4.7
for cases where we reverse vectors, and it would be nice to fix it there
(2 loads + 2 generic permutes vs. a single reverse instruction).

regards,
Ramana


2012-05-30  Ramana Radhakrishnan  <ramana.radhakrishnan@linaro.org>

	* config/arm/arm.c (arm_evpc_neon_vrev): Adjust off by one error.
	* gcc.target/arm/neon-vrev.c: New.


>
>
> r~
diff mbox

Patch

Index: gcc/testsuite/gcc.target/arm/neon-vrev.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vrev.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/neon-vrev.c	(revision 187999)
@@ -0,0 +1,105 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint16x4_t
+tst_vrev642_u16 (uint16x4_t __a)
+{
+  uint16x4_t __rv;
+  uint16x4_t __mask1 = { 3, 2, 1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x8_t
+tst_vrev64q2_u16 (uint16x8_t __a)
+{
+  uint16x8_t __rv;
+  uint16x8_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4 };
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint8x8_t
+tst_vrev642_u8 (uint8x8_t __a)
+{
+  uint8x8_t __rv;
+  uint8x8_t __mask1 = { 7, 6, 5, 4, 3, 2, 1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint8x16_t
+tst_vrev64q2_u8 (uint8x16_t __a)
+{
+  uint8x16_t __rv;
+  uint8x16_t __mask1 = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
+  return __builtin_shuffle ( __a, __mask1) ;
+
+}
+
+uint32x2_t
+tst_vrev642_u32 (uint32x2_t __a)
+{
+  uint32x2_t __rv;
+  uint32x2_t __mask1 = {1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+
+}
+
+uint32x4_t
+tst_vrev64q2_u32 (uint32x4_t __a)
+{
+  uint32x4_t __rv;
+  uint32x4_t __mask1 = {1, 0, 3, 2};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x4_t
+tst_vrev322_u16 (uint16x4_t __a)
+{
+  uint16x4_t __mask1 = { 1, 0, 3, 2 };
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint16x8_t
+tst_vrev32q2_u16 (uint16x8_t __a)
+{
+  uint16x8_t __mask1 = { 1, 0, 3, 2, 5, 4, 7, 6 }; 
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint8x8_t
+tst_vrev322_u8 (uint8x8_t __a)
+{
+  uint8x8_t __mask1 = { 3, 2, 1, 0, 7, 6, 5, 4};
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint8x16_t
+tst_vrev32q2_u8 (uint8x16_t __a)
+{
+  uint8x16_t __mask1 = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint8x8_t
+tst_vrev162_u8 (uint8x8_t __a)
+{
+  uint8x8_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6};
+  return __builtin_shuffle (__a, __mask);
+}
+
+uint8x16_t
+tst_vrev16q2_u8 (uint8x16_t __a)
+{
+  uint8x16_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+  return __builtin_shuffle (__a, __mask);
+}
+
+/* { dg-final {scan-assembler-times "vrev32\.16\\t" 2} }  */
+/* { dg-final {scan-assembler-times "vrev32\.8\\t" 2} }  */ 
+/* { dg-final {scan-assembler-times "vrev16\.8\\t" 2} }  */
+/* { dg-final {scan-assembler-times "vrev64\.8\\t" 2} }  */
+/* { dg-final {scan-assembler-times "vrev64\.32\\t" 2} }  */
+/* { dg-final {scan-assembler-times "vrev64\.16\\t" 2} }  */
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 187998)
+++ gcc/config/arm/arm.c	(revision 187999)
@@ -25637,10 +25637,18 @@ 
       return false;
     }
 
-  for (i = 0; i < nelt; i += diff)
+  for (i = 0; i < nelt ; i += diff + 1)
     for (j = 0; j <= diff; j += 1)
-      if (d->perm[i + j] != i + diff - j)
-	return false;
+      {
+	/* This is guaranteed to be true as the value of diff
+	   is 7, 3, 1 and we should have enough elements in the
+	   queue to generate this. Getting a vector mask with a
+	   value of diff other than these values implies that
+	   something is wrong by the time we get here.  */
+	gcc_assert (i + j < nelt);
+	if (d->perm[i + j] != i + diff - j)
+	  return false;
+      }
 
   /* Success! */
   if (d->testing_p)