diff mbox series

[v4,09/10] util/bufferiszero: Add simd acceleration for aarch64

Message ID 20240215081449.848220-10-richard.henderson@linaro.org
State Superseded
Headers show
Series Optimize buffer_is_zero | expand

Commit Message

Richard Henderson Feb. 15, 2024, 8:14 a.m. UTC
Because non-embedded aarch64 is expected to have AdvSIMD enabled, merely
double-check with the compiler flags for __ARM_NEON and don't bother with
a runtime check.  Otherwise, model the loop after the x86 SSE2 function,
and use VADDV to reduce the four vector comparisons.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 util/bufferiszero.c | 74 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

Comments

Alexander Monakov Feb. 15, 2024, 8:47 a.m. UTC | #1
On Wed, 14 Feb 2024, Richard Henderson wrote:

> Because non-embedded aarch64 is expected to have AdvSIMD enabled, merely
> double-check with the compiler flags for __ARM_NEON and don't bother with
> a runtime check.  Otherwise, model the loop after the x86 SSE2 function,
> and use VADDV to reduce the four vector comparisons.

I am not very familiar with Neon but I wonder if this couldn't use SHRN
for the final 128b->64b reduction similar to 2022 Glibc optimizations:
https://inbox.sourceware.org/libc-alpha/20220620174628.2820531-1-danilak@google.com/

In git history I see the previous Neon buffer_is_zero was removed because
it was not faster. Is it because integer LDP was as good as vector loads
at saturating load bandwidth on older cores, and things are different now?

Alexander

> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  util/bufferiszero.c | 74 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 74 insertions(+)
> 
> diff --git a/util/bufferiszero.c b/util/bufferiszero.c
> index 4eef6d47bc..2809b09225 100644
> --- a/util/bufferiszero.c
> +++ b/util/bufferiszero.c
> @@ -214,7 +214,81 @@ bool test_buffer_is_zero_next_accel(void)
>      }
>      return false;
>  }
> +
> +#elif defined(__aarch64__) && defined(__ARM_NEON)
> +#include <arm_neon.h>
> +
> +#define REASSOC_BARRIER(vec0, vec1) asm("" : "+w"(vec0), "+w"(vec1))
> +
> +static bool buffer_is_zero_simd(const void *buf, size_t len)
> +{
> +    uint32x4_t t0, t1, t2, t3;
> +
> +    /* Align head/tail to 16-byte boundaries.  */
> +    const uint32x4_t *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
> +    const uint32x4_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
> +
> +    /* Unaligned loads at head/tail.  */
> +    t0 = vld1q_u32(buf) | vld1q_u32(buf + len - 16);
> +
> +    /* Collect a partial block at tail end.  */
> +    t1 = e[-7] | e[-6];
> +    t2 = e[-5] | e[-4];
> +    t3 = e[-3] | e[-2];
> +    t0 |= e[-1];
> +    REASSOC_BARRIER(t0, t1);
> +    REASSOC_BARRIER(t2, t3);
> +    t0 |= t1;
> +    t2 |= t3;
> +    REASSOC_BARRIER(t0, t2);
> +    t0 |= t2;
> +
> +    /*
> +     * Loop over complete 128-byte blocks.
> +     * With the head and tail removed, e - p >= 14, so the loop
> +     * must iterate at least once.
> +     */
> +    do {
> +        /* Each comparison is [-1,0], so reduction is in [-4..0]. */
> +        if (unlikely(vaddvq_u32(vceqzq_u32(t0)) != -4)) {
> +            return false;
> +        }
> +
> +        t0 = p[0] | p[1];
> +        t1 = p[2] | p[3];
> +        t2 = p[4] | p[5];
> +        t3 = p[6] | p[7];
> +        REASSOC_BARRIER(t0, t1);
> +        REASSOC_BARRIER(t2, t3);
> +        t0 |= t1;
> +        t2 |= t3;
> +        REASSOC_BARRIER(t0, t2);
> +        t0 |= t2;
> +        p += 8;
> +    } while (p < e - 7);
> +
> +    return vaddvq_u32(vceqzq_u32(t0)) == -4;
> +}
> +
> +static biz_accel_fn const accel_table[] = {
> +    buffer_is_zero_int_ge256,
> +    buffer_is_zero_simd,
> +};
> +
> +static unsigned accel_index = 1;
> +#define INIT_ACCEL buffer_is_zero_simd
> +
> +bool test_buffer_is_zero_next_accel(void)
> +{
> +    if (accel_index != 0) {
> +        buffer_is_zero_accel = accel_table[--accel_index];
> +        return true;
> +    }
> +    return false;
> +}
> +
>  #else
> +
>  bool test_buffer_is_zero_next_accel(void)
>  {
>      return false;
>
Richard Henderson Feb. 15, 2024, 5:47 p.m. UTC | #2
On 2/14/24 22:47, Alexander Monakov wrote:
> 
> On Wed, 14 Feb 2024, Richard Henderson wrote:
> 
>> Because non-embedded aarch64 is expected to have AdvSIMD enabled, merely
>> double-check with the compiler flags for __ARM_NEON and don't bother with
>> a runtime check.  Otherwise, model the loop after the x86 SSE2 function,
>> and use VADDV to reduce the four vector comparisons.
> 
> I am not very familiar with Neon but I wonder if this couldn't use SHRN
> for the final 128b->64b reduction similar to 2022 Glibc optimizations:
> https://inbox.sourceware.org/libc-alpha/20220620174628.2820531-1-danilak@google.com/

The reason they use SHRN for memchr is that they have also applied a mask
to the comparison so that they can identify which byte contained the match.
That is not required here, so any reduction will do.


> In git history I see the previous Neon buffer_is_zero was removed because
> it was not faster. Is it because integer LDP was as good as vector loads
> at saturating load bandwidth on older cores, and things are different now?

The old reduction was a bit silly,

-#define DO_NONZERO(X)  (vgetq_lane_u64((X), 0) | vgetq_lane_u64((X), 1))

performing two cross-register-set fetches.  It's also possible that we were saturating the 
load bandwidth on the old Mustang.  This time I'm testing on a Neoverse-N1, which is quite
a few years newer.

The loop kernel compiles to this:

  19c:   ad401c20        ldp     q0, q7, [x1]
  1a0:   ad411823        ldp     q3, q6, [x1, #32]
  1a4:   ad421421        ldp     q1, q5, [x1, #64]
  1a8:   ad431022        ldp     q2, q4, [x1, #96]
  1ac:   91020021        add     x1, x1, #0x80
  1b0:   4ea71c00        orr     v0.16b, v0.16b, v7.16b
  1b4:   4ea61c63        orr     v3.16b, v3.16b, v6.16b
  1b8:   4ea51c21        orr     v1.16b, v1.16b, v5.16b
  1bc:   4ea41c42        orr     v2.16b, v2.16b, v4.16b
  1c0:   4ea31c00        orr     v0.16b, v0.16b, v3.16b
  1c4:   4ea21c21        orr     v1.16b, v1.16b, v2.16b
  1c8:   4ea11c00        orr     v0.16b, v0.16b, v1.16b
  1cc:   eb03003f        cmp     x1, x3
  1d0:   54000162        b.cs    1fc <buffer_is_zero_simd+0xb8>  // b.hs, b.nlast
  1d4:   4ea09800        cmeq    v0.4s, v0.4s, #0
  1d8:   4eb1b800        addv    s0, v0.4s
  1dc:   1e260000        fmov    w0, s0
  1e0:   3100101f        cmn     w0, #0x4
  1e4:   54fffdc0        b.eq    19c <buffer_is_zero_simd+0x58>  // b.none


r~

> 
> Alexander
> 
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   util/bufferiszero.c | 74 +++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 74 insertions(+)
>>
>> diff --git a/util/bufferiszero.c b/util/bufferiszero.c
>> index 4eef6d47bc..2809b09225 100644
>> --- a/util/bufferiszero.c
>> +++ b/util/bufferiszero.c
>> @@ -214,7 +214,81 @@ bool test_buffer_is_zero_next_accel(void)
>>       }
>>       return false;
>>   }
>> +
>> +#elif defined(__aarch64__) && defined(__ARM_NEON)
>> +#include <arm_neon.h>
>> +
>> +#define REASSOC_BARRIER(vec0, vec1) asm("" : "+w"(vec0), "+w"(vec1))
>> +
>> +static bool buffer_is_zero_simd(const void *buf, size_t len)
>> +{
>> +    uint32x4_t t0, t1, t2, t3;
>> +
>> +    /* Align head/tail to 16-byte boundaries.  */
>> +    const uint32x4_t *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
>> +    const uint32x4_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
>> +
>> +    /* Unaligned loads at head/tail.  */
>> +    t0 = vld1q_u32(buf) | vld1q_u32(buf + len - 16);
>> +
>> +    /* Collect a partial block at tail end.  */
>> +    t1 = e[-7] | e[-6];
>> +    t2 = e[-5] | e[-4];
>> +    t3 = e[-3] | e[-2];
>> +    t0 |= e[-1];
>> +    REASSOC_BARRIER(t0, t1);
>> +    REASSOC_BARRIER(t2, t3);
>> +    t0 |= t1;
>> +    t2 |= t3;
>> +    REASSOC_BARRIER(t0, t2);
>> +    t0 |= t2;
>> +
>> +    /*
>> +     * Loop over complete 128-byte blocks.
>> +     * With the head and tail removed, e - p >= 14, so the loop
>> +     * must iterate at least once.
>> +     */
>> +    do {
>> +        /* Each comparison is [-1,0], so reduction is in [-4..0]. */
>> +        if (unlikely(vaddvq_u32(vceqzq_u32(t0)) != -4)) {
>> +            return false;
>> +        }
>> +
>> +        t0 = p[0] | p[1];
>> +        t1 = p[2] | p[3];
>> +        t2 = p[4] | p[5];
>> +        t3 = p[6] | p[7];
>> +        REASSOC_BARRIER(t0, t1);
>> +        REASSOC_BARRIER(t2, t3);
>> +        t0 |= t1;
>> +        t2 |= t3;
>> +        REASSOC_BARRIER(t0, t2);
>> +        t0 |= t2;
>> +        p += 8;
>> +    } while (p < e - 7);
>> +
>> +    return vaddvq_u32(vceqzq_u32(t0)) == -4;
>> +}
>> +
>> +static biz_accel_fn const accel_table[] = {
>> +    buffer_is_zero_int_ge256,
>> +    buffer_is_zero_simd,
>> +};
>> +
>> +static unsigned accel_index = 1;
>> +#define INIT_ACCEL buffer_is_zero_simd
>> +
>> +bool test_buffer_is_zero_next_accel(void)
>> +{
>> +    if (accel_index != 0) {
>> +        buffer_is_zero_accel = accel_table[--accel_index];
>> +        return true;
>> +    }
>> +    return false;
>> +}
>> +
>>   #else
>> +
>>   bool test_buffer_is_zero_next_accel(void)
>>   {
>>       return false;
>>
Alexander Monakov Feb. 15, 2024, 6:46 p.m. UTC | #3
On Thu, 15 Feb 2024, Richard Henderson wrote:

> On 2/14/24 22:47, Alexander Monakov wrote:
> > 
> > On Wed, 14 Feb 2024, Richard Henderson wrote:
> > 
> >> Because non-embedded aarch64 is expected to have AdvSIMD enabled, merely
> >> double-check with the compiler flags for __ARM_NEON and don't bother with
> >> a runtime check.  Otherwise, model the loop after the x86 SSE2 function,
> >> and use VADDV to reduce the four vector comparisons.
> > 
> > I am not very familiar with Neon but I wonder if this couldn't use SHRN
> > for the final 128b->64b reduction similar to 2022 Glibc optimizations:
> > https://inbox.sourceware.org/libc-alpha/20220620174628.2820531-1-danilak@google.com/
> 
> The reason they use SHRN for memchr is that they have also applied a mask
> to the comparison so that they can identify which byte contained the match.
> That is not required here, so any reduction will do.

Right, so we can pick the cheapest reduction method, and if I'm reading
Neoverse-N1 SOG right, SHRN is marginally cheaper than ADDV (latency 2
instead of 3), and it should be generally preferable on other cores, no?

For that matter, cannot UQXTN (unsigned saturating extract narrow) be
used in place of CMEQ+ADDV here?

Alexander
Richard Henderson Feb. 15, 2024, 9:10 p.m. UTC | #4
On 2/15/24 08:46, Alexander Monakov wrote:
> Right, so we can pick the cheapest reduction method, and if I'm reading
> Neoverse-N1 SOG right, SHRN is marginally cheaper than ADDV (latency 2
> instead of 3), and it should be generally preferable on other cores, no?

Fair.

> For that matter, cannot UQXTN (unsigned saturating extract narrow) be
> used in place of CMEQ+ADDV here?

Interesting.  I hadn't thought about using saturation to preserve non-zeroness like that.

Using one 4-cycle insn instead of two 2-cycle insns is interesting as well.  I suppose, since
it's at the end of the dependency chain, the fact that it is restricted to the V1 pipe 
matters not at all.


r~
diff mbox series

Patch

diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 4eef6d47bc..2809b09225 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -214,7 +214,81 @@  bool test_buffer_is_zero_next_accel(void)
     }
     return false;
 }
+
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+#include <arm_neon.h>
+
+#define REASSOC_BARRIER(vec0, vec1) asm("" : "+w"(vec0), "+w"(vec1))
+
+static bool buffer_is_zero_simd(const void *buf, size_t len)
+{
+    uint32x4_t t0, t1, t2, t3;
+
+    /* Align head/tail to 16-byte boundaries.  */
+    const uint32x4_t *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
+    const uint32x4_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
+
+    /* Unaligned loads at head/tail.  */
+    t0 = vld1q_u32(buf) | vld1q_u32(buf + len - 16);
+
+    /* Collect a partial block at tail end.  */
+    t1 = e[-7] | e[-6];
+    t2 = e[-5] | e[-4];
+    t3 = e[-3] | e[-2];
+    t0 |= e[-1];
+    REASSOC_BARRIER(t0, t1);
+    REASSOC_BARRIER(t2, t3);
+    t0 |= t1;
+    t2 |= t3;
+    REASSOC_BARRIER(t0, t2);
+    t0 |= t2;
+
+    /*
+     * Loop over complete 128-byte blocks.
+     * With the head and tail removed, e - p >= 14, so the loop
+     * must iterate at least once.
+     */
+    do {
+        /* Each comparison is [-1,0], so reduction is in [-4..0]. */
+        if (unlikely(vaddvq_u32(vceqzq_u32(t0)) != -4)) {
+            return false;
+        }
+
+        t0 = p[0] | p[1];
+        t1 = p[2] | p[3];
+        t2 = p[4] | p[5];
+        t3 = p[6] | p[7];
+        REASSOC_BARRIER(t0, t1);
+        REASSOC_BARRIER(t2, t3);
+        t0 |= t1;
+        t2 |= t3;
+        REASSOC_BARRIER(t0, t2);
+        t0 |= t2;
+        p += 8;
+    } while (p < e - 7);
+
+    return vaddvq_u32(vceqzq_u32(t0)) == -4;
+}
+
+static biz_accel_fn const accel_table[] = {
+    buffer_is_zero_int_ge256,
+    buffer_is_zero_simd,
+};
+
+static unsigned accel_index = 1;
+#define INIT_ACCEL buffer_is_zero_simd
+
+bool test_buffer_is_zero_next_accel(void)
+{
+    if (accel_index != 0) {
+        buffer_is_zero_accel = accel_table[--accel_index];
+        return true;
+    }
+    return false;
+}
+
 #else
+
 bool test_buffer_is_zero_next_accel(void)
 {
     return false;