[v5,11/54] accel/tcg: Add aarch64 specific support in ldst_atomicity

Message ID 20230515143313.734053-12-richard.henderson@linaro.org
State New
Series tcg: Improve atomicity support

Commit Message

Richard Henderson May 15, 2023, 2:32 p.m. UTC
We have code in atomic128.h noting that through GCC 8, there
was no support for atomic operations on __uint128.  This has
been fixed in GCC 10.  But we can still improve over any
basic compare-and-swap loop using the ldxp/stxp instructions.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/ldst_atomicity.c.inc | 60 ++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 3 deletions(-)
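
For context, a minimal sketch (illustrative only, not QEMU's actual helper, and assuming a toolchain that provides the 16-byte __sync builtin, as aarch64 GCC does) of the compare-and-swap style load that the new LDXP/STXP sequence improves on:

/* Illustrative 16-byte atomic load built from a compare-and-swap loop:
 * exchanging 0 with 0 returns the old value atomically, but it is always a
 * full read-modify-write, so the page must be writable even for a pure load. */
static inline __uint128_t load16_via_cas(__uint128_t *p)
{
    return __sync_val_compare_and_swap(p, 0, 0);
}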

Comments

Peter Maydell May 16, 2023, 1:29 p.m. UTC | #1
On Mon, 15 May 2023 at 15:38, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> We have code in atomic128.h noting that through GCC 8, there
> was no support for atomic operations on __uint128.  This has
> been fixed in GCC 10.  But we can still improve over any
> basic compare-and-swap loop using the ldxp/stxp instructions.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  accel/tcg/ldst_atomicity.c.inc | 60 ++++++++++++++++++++++++++++++++--
>  1 file changed, 57 insertions(+), 3 deletions(-)
>
> diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
> index 69c1c61997..c3b2b35823 100644
> --- a/accel/tcg/ldst_atomicity.c.inc
> +++ b/accel/tcg/ldst_atomicity.c.inc
> @@ -263,7 +263,22 @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
>       * In system mode all guest pages are writable, and for user-only
>       * we have just checked writability.  Try cmpxchg.
>       */
> -#if defined(CONFIG_CMPXCHG128)
> +#if defined(__aarch64__)
> +    /* We can do better than cmpxchg for AArch64.  */
> +    {
> +        uint64_t l, h;
> +        uint32_t fail;
> +
> +        /* The load must be paired with the store to guarantee not tearing. */
> +        asm("0: ldxp %0, %1, %3\n\t"
> +            "stxp %w2, %0, %1, %3\n\t"
> +            "cbnz %w2, 0b"
> +            : "=&r"(l), "=&r"(h), "=&r"(fail) : "Q"(*p));
> +
> +        qemu_build_assert(!HOST_BIG_ENDIAN);
> +        return int128_make128(l, h);
> +    }

The compiler (well, clang 11, anyway) seems able to generate equivalent
code to this inline asm:
https://godbolt.org/z/eraeTn4vs

__int128 do_load2(void *pv) {
    __int128 *p = __builtin_assume_aligned(pv, 16);
    __int128 i = __atomic_load_n(p, __ATOMIC_RELAXED);
    return i;
}

generates

do_load2:                               // @do_load2
        mov     x8, x0
.LBB0_1:                                // =>This Inner Loop Header: Depth=1
        ldxp    x0, x1, [x8]
        stxp    w9, x0, x1, [x8]
        cbnz    w9, .LBB0_1
        ret

thanks
-- PMM
Richard Henderson May 16, 2023, 1:51 p.m. UTC | #2
On 5/16/23 06:29, Peter Maydell wrote:
> On Mon, 15 May 2023 at 15:38, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> We have code in atomic128.h noting that through GCC 8, there
>> was no support for atomic operations on __uint128.  This has
>> been fixed in GCC 10.  But we can still improve over any
>> basic compare-and-swap loop using the ldxp/stxp instructions.
>> [...]
> 
> The compiler (well, clang 11, anyway) seems able to generate equivalent
> code to this inline asm:

See above, where GCC 8 can do nothing, and that is still a supported compiler.


r~
Peter Maydell May 16, 2023, 1:56 p.m. UTC | #3
On Tue, 16 May 2023 at 14:51, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 5/16/23 06:29, Peter Maydell wrote:
> > On Mon, 15 May 2023 at 15:38, Richard Henderson
> > <richard.henderson@linaro.org> wrote:
> >> [...]
> >
> > The compiler (well, clang 11, anyway) seems able to generate equivalent
> > code to this inline asm:
>
> See above, where GCC 8 can do nothing, and that is still a supported compiler.

Yeah, but it'll work fine even without the explicit inline
asm, right? It just might generate slightly worse code.
Is the performance difference critical enough to justify
an inline asm implementation that's only needed for older
compilers?

thanks
-- PMM
Richard Henderson May 16, 2023, 2:04 p.m. UTC | #4
On 5/16/23 06:56, Peter Maydell wrote:
> On Tue, 16 May 2023 at 14:51, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> On 5/16/23 06:29, Peter Maydell wrote:
>>> On Mon, 15 May 2023 at 15:38, Richard Henderson
>>> <richard.henderson@linaro.org> wrote:
>>>> [...]
>>>
>>> The compiler (well, clang 11, anyway) seems able to generate equivalent
>>> code to this inline asm:
>>
>> See above, where GCC 8 can do nothing, and that is still a supported compiler.
> 
> Yeah, but it'll work fine even without the explicit inline
> asm, right? 

No, GCC < 10 does not support __sync_* or __atomic_* on __uint128_t at all.

> Is the performance difference critical enough to justify
> an inline asm implementation that's only needed for older
> compilers ?

Yes.  It's the difference between stop-the-world and not.


r~
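
The compiler-support question discussed in the thread above can be checked with a small stand-alone probe (illustrative only; this is not QEMU's configure test). With a toolchain that supports 16-byte atomics (for example GCC 10+ or a recent clang on aarch64) it compiles to an inline exclusive-pair or LSE sequence; with an older GCC it is either rejected or routed through an out-of-line libatomic call that may not be lock-free.

int main(void)
{
    static __uint128_t mem;

    /* Relaxed 16-byte atomic load and store; inspect the generated code
     * (or the link-time dependency on libatomic) to see what the compiler
     * provides for this target. */
    __uint128_t v = __atomic_load_n(&mem, __ATOMIC_RELAXED);
    __atomic_store_n(&mem, v + 1, __ATOMIC_RELAXED);
    return 0;
}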

Patch

diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
index 69c1c61997..c3b2b35823 100644
--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -263,7 +263,22 @@  static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
      * In system mode all guest pages are writable, and for user-only
      * we have just checked writability.  Try cmpxchg.
      */
-#if defined(CONFIG_CMPXCHG128)
+#if defined(__aarch64__)
+    /* We can do better than cmpxchg for AArch64.  */
+    {
+        uint64_t l, h;
+        uint32_t fail;
+
+        /* The load must be paired with the store to guarantee not tearing. */
+        asm("0: ldxp %0, %1, %3\n\t"
+            "stxp %w2, %0, %1, %3\n\t"
+            "cbnz %w2, 0b"
+            : "=&r"(l), "=&r"(h), "=&r"(fail) : "Q"(*p));
+
+        qemu_build_assert(!HOST_BIG_ENDIAN);
+        return int128_make128(l, h);
+    }
+#elif defined(CONFIG_CMPXCHG128)
     /* Swap 0 with 0, with the side-effect of returning the old value. */
     {
         Int128Alias r;
@@ -759,7 +774,22 @@  store_atomic16(void *pv, Int128Alias val)
         return;
     }
 #endif
-#if defined(CONFIG_CMPXCHG128)
+#if defined(__aarch64__)
+    /* We can do better than cmpxchg for AArch64.  */
+    {
+        uint64_t l, h, t;
+
+        qemu_build_assert(!HOST_BIG_ENDIAN);
+        l = int128_getlo(val.s);
+        h = int128_gethi(val.s);
+
+        asm("0: ldxp %0, xzr, %1\n\t"
+            "stxp %w0, %2, %3, %1\n\t"
+            "cbnz %w0, 0b"
+            : "=&r"(t), "=Q"(*(__uint128_t *)pv) : "r"(l), "r"(h));
+        return;
+    }
+#elif defined(CONFIG_CMPXCHG128)
     {
         __uint128_t *pu = __builtin_assume_aligned(pv, 16);
         __uint128_t o;
@@ -857,7 +887,31 @@  static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
 static void ATTRIBUTE_ATOMIC128_OPT
 store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
 {
-#if defined(CONFIG_ATOMIC128)
+#if defined(__aarch64__)
+    /*
+     * GCC only implements __sync* primitives for int128 on aarch64.
+     * We can do better without the barriers, and integrating the
+     * arithmetic into the load-exclusive/store-conditional pair.
+     */
+    uint64_t tl, th, vl, vh, ml, mh;
+    uint32_t fail;
+
+    qemu_build_assert(!HOST_BIG_ENDIAN);
+    vl = int128_getlo(val.s);
+    vh = int128_gethi(val.s);
+    ml = int128_getlo(msk.s);
+    mh = int128_gethi(msk.s);
+
+    asm("0: ldxp %[l], %[h], %[mem]\n\t"
+        "bic %[l], %[l], %[ml]\n\t"
+        "bic %[h], %[h], %[mh]\n\t"
+        "orr %[l], %[l], %[vl]\n\t"
+        "orr %[h], %[h], %[vh]\n\t"
+        "stxp %w[f], %[l], %[h], %[mem]\n\t"
+        "cbnz %w[f], 0b\n"
+        : [mem] "+Q"(*ps), [f] "=&r"(fail), [l] "=&r"(tl), [h] "=&r"(th)
+        : [vl] "r"(vl), [vh] "r"(vh), [ml] "r"(ml), [mh] "r"(mh));
+#elif defined(CONFIG_ATOMIC128)
     __uint128_t *pu, old, new;
 
     /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
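
For comparison with the inline asm in the last hunk, this is roughly the shape of the compare-exchange formulation used by the CONFIG_ATOMIC128 branch (a sketch with illustrative names, assuming 16-byte __atomic support): read the old value, merge val under ~msk, and retry until the exchange succeeds. The LDXP/STXP version above instead folds the BIC/ORR arithmetic directly into the exclusive pair, so no separate compare step is needed.

static inline void insert_al16_via_cas(__uint128_t *pu, __uint128_t val,
                                       __uint128_t msk)
{
    __uint128_t old, new;

    /* pu must be 16-byte aligned; the initial read only seeds the loop,
     * the compare-exchange below revalidates it on every iteration. */
    old = *pu;
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(pu, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}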