[v1,3/5] include/qemu/atomic.h: default to __atomic functions

Message ID 1453976119-24372-4-git-send-email-alex.bennee@linaro.org
State New

Commit Message

Alex Bennée Jan. 28, 2016, 10:15 a.m. UTC
The __atomic primitives have been available since GCC 4.7 and provide
a richer interface for describing memory ordering requirements. As a
bonus, by using the primitives instead of hand-rolled functions we can
use tools such as the AddressSanitizer, which need well-defined APIs
for their analysis.

If we have __ATOMIC defines we exclusively use the __atomic primitives
for all our atomic access. Otherwise we fall back to the mixture of
__sync and hand-rolled barrier cases.
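
As a rough sketch, the feature test boils down to checking for the
__ATOMIC_* memory-order macros; simplified from the patch below (the
real fallback path keeps the existing arch-specific barriers behind
#ifndef guards):

    #ifdef __ATOMIC_RELAXED
    /* C11-style: ordering is explicit in the fence/load/store */
    #define smp_wmb()  ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); barrier(); })
    #else
    /* legacy fallback: a full __sync barrier */
    #define smp_wmb()  __sync_synchronize()
    #endif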

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>


---

v2 - review updates
 - add reference to docs/atomics.txt at top of file
 - ensure smp_mb() is full __ATOMIC_SEQ_CST
 - make atomic_read/set __ATOMIC_RELAXED
 - make atomic_rcu_read/set __ATOMIC_CONSUME/RELEASE
 - make atomic_mb_read/set __ATOMIC_RELAXED + explicit barriers
 - move some comments about __ATOMIC no longer relevant to bottom half
 - rm un-needed ifdef/ifndefs
 - in cmpxchg return old instead of ptr
 - extra comments on old style volatile atomic access
---
 include/qemu/atomic.h | 178 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 117 insertions(+), 61 deletions(-)

-- 
2.7.0

Comments

Alex Bennée Jan. 29, 2016, 4:06 p.m. UTC | #1
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 28/01/2016 11:15, Alex Bennée wrote:
>> +/* atomic_mb_read/set semantics map Java volatile variables. They are
>> + * less expensive on some platforms (notably POWER & ARM) than fully
>> + * sequentially consistent operations.
>> + *
>> + * As long as they are used as paired operations they are safe to
>> + * use. See docs/atomic.txt for more discussion.
>> + */
>> +
>> +#define atomic_mb_read(ptr)                             \
>> +    ({                                                  \
>> +    typeof(*ptr) _val;                                  \
>> +     __atomic_load(ptr, &_val, __ATOMIC_RELAXED);       \
>> +     smp_rmb();                                         \
>> +    _val;                                               \
>> +    })
>> +
>> +#define atomic_mb_set(ptr, i)  do {                     \
>> +    typeof(*ptr) _val = (i);                            \
>> +    smp_wmb();                                          \
>> +    __atomic_store(ptr, &_val, __ATOMIC_RELAXED);       \
>> +    smp_mb();                                           \
>> +} while(0)
>
> Great... I'll change this to
>
> #if defined(_ARCH_PPC)
> #define atomic_mb_read(ptr)                             \
>     ({                                                  \
>     typeof(*ptr) _val;                                  \
>      __atomic_load(ptr, &_val, __ATOMIC_RELAXED);       \
>      smp_rmb();                                         \
>     _val;                                               \
>     })
>
> #define atomic_mb_set(ptr, i)  do {                     \
>     typeof(*ptr) _val = (i);                            \
>     smp_wmb();                                          \
>     __atomic_store(ptr, &_val, __ATOMIC_RELAXED);       \
>     smp_mb();                                           \
> } while(0)
> #else
> #define atomic_mb_read(ptr)                       \
>     ({                                            \
>     typeof(*ptr) _val;                            \
>      __atomic_load(ptr, &_val, __ATOMIC_SEQ_CST); \
>     _val;                                         \
>     })
>
> #define atomic_mb_set(ptr, i)  do {               \
>     typeof(*ptr) _val = (i);                      \
>     __atomic_store(ptr, &_val, __ATOMIC_SEQ_CST); \
> } while(0)
> #endif
>
> since this benefits x86 (which can generate mov/xchg respectively) and
> aarch64 (where atomic_mb_read/atomic_mb_set map directly to
> ldar/stlr).


The original comment mentioned both POWER and ARM, so I was wondering if
we should also special-case ARMv7?
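
Purely as an illustration (the exact feature test is an open question),
such a special case might just widen the guard in the proposal above,
e.g.:

    /* hypothetical: treat 32-bit ARM like POWER, keep explicit barriers */
    #if defined(_ARCH_PPC) || defined(__arm__)
    ...     /* relaxed load/store + smp_rmb()/smp_wmb()/smp_mb() */
    #else
    ...     /* plain __ATOMIC_SEQ_CST load/store */
    #endif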

>
>> +/* Returns the eventual value, failed or not */

Yeah, this comment is bogus.

>> +#define atomic_cmpxchg(ptr, old, new)                                   \
>> +    ({                                                                  \
>> +    typeof(*ptr) _old = (old), _new = (new);                            \
>> +    __atomic_compare_exchange(ptr, &_old, &_new, false,                 \
>> +                              __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);      \
>> +    _old; /* can this race if cmpxchg not used elsewhere? */            \
>> +    })
>
> How so?


My mistake, I was worried that we weren't following the old
semantics. Having read even more closely, I now understand that _old is
updated by the __atomic function if the update fails. _old is a poor
name, though, because it is _expected at the start and _current in the
case the exchange fails. The GCC documentation says:

    This compares the contents of *ptr with the contents of *expected. If
    equal, the operation is a read-modify-write operation that writes
    desired into *ptr. If they are not equal, the operation is a read and
    the current contents of *ptr are written into *expected. <snip>...

    If desired is written into *ptr then true is returned and memory is
    affected according to the memory order specified by success_memorder.
    There are no restrictions on what memory order can be used here.

I was wondering if this was subtly different from the old
__sync_val_compare_and_swap:

    The “val” version returns the contents of *ptr before the operation.

I think we are OK: if the cmpxchg succeeds, _old was by definition what
was already there. But it is confusing and leads to funny code like
this:

    if (atomic_cmpxchg(&data[i].n, 0, 3) == 0) {
        data[i].ret = -ECANCELED;
        ...

and

    if (atomic_cmpxchg(&s->state, old_state, new_state) == old_state) {
       ...

These might be easier to read if atomic_cmpxchg used bool semantics,
i.e. returned true for a successful cmpxchg.

The old code even has an atomic_bool_cmpxchg which no one seems to use.
I wonder if the correct solution is to convert the atomic_cmpxchg calls
to atomic_cmpxchg_bool calls and remove atomic_cmpxchg from atomic.h?
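
For reference, a minimal sketch of what such a bool-returning variant
could look like on top of the same builtin (the name and exact form here
are hypothetical):

    /* returns true if *ptr was equal to old and the swap happened */
    #define atomic_cmpxchg_bool(ptr, old, new)                              \
        ({                                                                  \
        typeof(*ptr) _old = (old), _new = (new);                            \
        __atomic_compare_exchange(ptr, &_old, &_new, false,                 \
                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);      \
        })

so the call sites above would simply read:

    if (atomic_cmpxchg_bool(&s->state, old_state, new_state)) {
        ...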

What do you think?

>
> Paolo

--
Alex Bennée
Peter Maydell Feb. 4, 2016, 1 p.m. UTC | #2
On 4 February 2016 at 12:40, Paolo Bonzini <pbonzini@redhat.com> wrote:
> I don't know the exact feature test, and I think ARMv7's penalty is much
> less because processors are slower, with a less deep pipeline and
> usually not used in SMP configurations.

Most v7 cores are SMP, and especially most v7 cores with enough
CPU grunt that it's sensible to run QEMU at all are going to be SMP.

thanks
-- PMM
Peter Maydell April 1, 2016, 2:51 p.m. UTC | #3
On 1 April 2016 at 15:30, James Hogan <james.hogan@imgtec.com> wrote:
> Hi Alex,
>
> On Thu, Jan 28, 2016 at 10:15:17AM +0000, Alex Bennée wrote:
>> The __atomic primitives have been available since GCC 4.7 and provide
>> a richer interface for describing memory ordering requirements. As a
>> bonus, by using the primitives instead of hand-rolled functions we can
>> use tools such as the AddressSanitizer, which need well-defined APIs
>> for their analysis.
>>
>> If we have __ATOMIC defines we exclusively use the __atomic primitives
>> for all our atomic access. Otherwise we fall back to the mixture of
>> __sync and hand-rolled barrier cases.
>>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>
> This breaks the build on MIPS32, with the following link error:
>
> cpus.o: In function `icount_warp_rt':
> /work/mips/qemu/vz/cpus.c:343: undefined reference to `__atomic_load_8'
> collect2: error: ld returned 1 exit status
>
> Seemingly __atomic_load_8 is provided by libatomic, so we're missing
> -latomic.

We should not be doing atomic ops on types larger than the native
pointer type. I think there's a patch proposed on list to fix this.
I don't think we should be adding libatomic.

thanks
-- PMM
Alex Bennée April 1, 2016, 4:06 p.m. UTC | #4
James Hogan <james.hogan@imgtec.com> writes:

> Hi Alex,
>
> On Thu, Jan 28, 2016 at 10:15:17AM +0000, Alex Bennée wrote:
>> The __atomic primitives have been available since GCC 4.7 and provide
>> a richer interface for describing memory ordering requirements. As a
>> bonus, by using the primitives instead of hand-rolled functions we can
>> use tools such as the AddressSanitizer, which need well-defined APIs
>> for their analysis.
>>
>> If we have __ATOMIC defines we exclusively use the __atomic primitives
>> for all our atomic access. Otherwise we fall back to the mixture of
>> __sync and hand-rolled barrier cases.
>>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>
> This breaks the build on MIPS32, with the following link error:
>
> cpus.o: In function `icount_warp_rt':
> /work/mips/qemu/vz/cpus.c:343: undefined reference to `__atomic_load_8'
> collect2: error: ld returned 1 exit status


See <1458577386-9984-1-git-send-email-alex.bennee@linaro.org>, which has
two patches: one to avoid doing a wider-than-word atomic load for
icount_warp_rt, and a second that adds a compile-time assert to the
atomic.h code.

I'll be re-spinning those patches on Monday.
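
(For the curious, a rough sketch of the kind of compile-time check the
second patch adds, using QEMU_BUILD_BUG_ON inside the accessor; the
details may differ from the actual series:

    #define atomic_read(ptr)                              \
        ({                                                \
        QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
        typeof(*ptr) _val;                                \
         __atomic_load(ptr, &_val, __ATOMIC_RELAXED);     \
        _val;                                             \
        })

so an 8-byte atomic load on a 32-bit host fails at compile time rather
than at link time.)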

>
> Seemingly __atomic_load_8 is provided by libatomic, so we're missing
> -latomic.
>
> Cheers
> James


--
Alex Bennée

Patch

diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index bd2c075..ec3208a 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -8,6 +8,8 @@ 
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
  *
+ * See docs/atomics.txt for discussion about the guarantees each
+ * atomic primitive is meant to provide.
  */
 
 #ifndef __QEMU_ATOMIC_H
@@ -15,12 +17,116 @@ 
 
 #include "qemu/compiler.h"
 
-/* For C11 atomic ops */
 
 /* Compiler barrier */
 #define barrier()   ({ asm volatile("" ::: "memory"); (void)0; })
 
-#ifndef __ATOMIC_RELAXED
+#ifdef __ATOMIC_RELAXED
+/* For C11 atomic ops */
+
+/* Manual memory barriers
+ *
+ *__atomic_thread_fence does not include a compiler barrier; instead,
+ * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
+ * semantics. If smp_wmb() is a no-op, absence of the barrier means that
+ * the compiler is free to reorder stores on each side of the barrier.
+ * Add one here, and similarly in smp_rmb() and smp_read_barrier_depends().
+ */
+
+#define smp_mb()    ({ barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); barrier(); })
+#define smp_wmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); barrier(); })
+#define smp_rmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); barrier(); })
+
+#define smp_read_barrier_depends() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })
+
+/* Weak atomic operations prevent the compiler moving other
+ * loads/stores past the atomic operation load/store. However there is
+ * no explicit memory barrier for the processor.
+ */
+#define atomic_read(ptr)                          \
+    ({                                            \
+    typeof(*ptr) _val;                            \
+     __atomic_load(ptr, &_val, __ATOMIC_RELAXED); \
+    _val;                                         \
+    })
+
+#define atomic_set(ptr, i)  do {                  \
+    typeof(*ptr) _val = (i);                      \
+    __atomic_store(ptr, &_val, __ATOMIC_RELAXED); \
+} while(0)
+
+/* Atomic RCU operations imply weak memory barriers */
+
+#define atomic_rcu_read(ptr)                      \
+    ({                                            \
+    typeof(*ptr) _val;                            \
+     __atomic_load(ptr, &_val, __ATOMIC_CONSUME); \
+    _val;                                         \
+    })
+
+#define atomic_rcu_set(ptr, i)  do {                    \
+    typeof(*ptr) _val = (i);                            \
+    __atomic_store(ptr, &_val, __ATOMIC_RELEASE);       \
+} while(0)
+
+/* atomic_mb_read/set semantics map Java volatile variables. They are
+ * less expensive on some platforms (notably POWER & ARM) than fully
+ * sequentially consistent operations.
+ *
+ * As long as they are used as paired operations they are safe to
+ * use. See docs/atomic.txt for more discussion.
+ */
+
+#define atomic_mb_read(ptr)                             \
+    ({                                                  \
+    typeof(*ptr) _val;                                  \
+     __atomic_load(ptr, &_val, __ATOMIC_RELAXED);       \
+     smp_rmb();                                         \
+    _val;                                               \
+    })
+
+#define atomic_mb_set(ptr, i)  do {                     \
+    typeof(*ptr) _val = (i);                            \
+    smp_wmb();                                          \
+    __atomic_store(ptr, &_val, __ATOMIC_RELAXED);       \
+    smp_mb();                                           \
+} while(0)
+
+
+/* All the remaining operations are fully sequentially consistent */
+
+#define atomic_xchg(ptr, i)    ({                           \
+    typeof(*ptr) _new = (i), _old;                          \
+    __atomic_exchange(ptr, &_new, &_old, __ATOMIC_SEQ_CST); \
+    _old;                                                   \
+})
+
+/* Returns the eventual value, failed or not */
+#define atomic_cmpxchg(ptr, old, new)                                   \
+    ({                                                                  \
+    typeof(*ptr) _old = (old), _new = (new);                            \
+    __atomic_compare_exchange(ptr, &_old, &_new, false,                 \
+                              __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);      \
+    _old; /* can this race if cmpxchg not used elsewhere? */            \
+    })
+
+/* Provide shorter names for GCC atomic builtins, return old value */
+#define atomic_fetch_inc(ptr)  __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST)
+#define atomic_fetch_dec(ptr)  __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST)
+#define atomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST)
+#define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST)
+#define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST)
+#define atomic_fetch_or(ptr, n)  __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST)
+
+/* And even shorter names that return void.  */
+#define atomic_inc(ptr)    ((void) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST))
+#define atomic_dec(ptr)    ((void) __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST))
+#define atomic_add(ptr, n) ((void) __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST))
+#define atomic_sub(ptr, n) ((void) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST))
+#define atomic_and(ptr, n) ((void) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST))
+#define atomic_or(ptr, n)  ((void) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST))
+
+#else /* __ATOMIC_RELAXED */
 
 /*
  * We use GCC builtin if it's available, as that can use mfence on
@@ -85,8 +191,6 @@ 
 
 #endif /* _ARCH_PPC */
 
-#endif /* C11 atomics */
-
 /*
  * For (host) platforms we don't have explicit barrier definitions
  * for, we use the gcc __sync_synchronize() primitive to generate a
@@ -98,42 +202,22 @@ 
 #endif
 
 #ifndef smp_wmb
-#ifdef __ATOMIC_RELEASE
-/* __atomic_thread_fence does not include a compiler barrier; instead,
- * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
- * semantics. If smp_wmb() is a no-op, absence of the barrier means that
- * the compiler is free to reorder stores on each side of the barrier.
- * Add one here, and similarly in smp_rmb() and smp_read_barrier_depends().
- */
-#define smp_wmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); barrier(); })
-#else
 #define smp_wmb()   __sync_synchronize()
 #endif
-#endif
 
 #ifndef smp_rmb
-#ifdef __ATOMIC_ACQUIRE
-#define smp_rmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); barrier(); })
-#else
 #define smp_rmb()   __sync_synchronize()
 #endif
-#endif
 
 #ifndef smp_read_barrier_depends
-#ifdef __ATOMIC_CONSUME
-#define smp_read_barrier_depends()   ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })
-#else
 #define smp_read_barrier_depends()   barrier()
 #endif
-#endif
 
-#ifndef atomic_read
+/* These will only be atomic if the processor does the fetch or store
+ * in a single issue memory operation
+ */
 #define atomic_read(ptr)       (*(__typeof__(*ptr) volatile*) (ptr))
-#endif
-
-#ifndef atomic_set
 #define atomic_set(ptr, i)     ((*(__typeof__(*ptr) volatile*) (ptr)) = (i))
-#endif
 
 /**
  * atomic_rcu_read - reads a RCU-protected pointer to a local variable
@@ -146,30 +230,18 @@ 
  * Inserts memory barriers on architectures that require them (currently only
  * Alpha) and documents which pointers are protected by RCU.
  *
- * Unless the __ATOMIC_CONSUME memory order is available, atomic_rcu_read also
- * includes a compiler barrier to ensure that value-speculative optimizations
- * (e.g. VSS: Value Speculation Scheduling) does not perform the data read
- * before the pointer read by speculating the value of the pointer.  On new
- * enough compilers, atomic_load takes care of such concern about
- * dependency-breaking optimizations.
+ * atomic_rcu_read also includes a compiler barrier to ensure that
+ * value-speculative optimizations (e.g. VSS: Value Speculation
+ * Scheduling) does not perform the data read before the pointer read
+ * by speculating the value of the pointer.
  *
  * Should match atomic_rcu_set(), atomic_xchg(), atomic_cmpxchg().
  */
-#ifndef atomic_rcu_read
-#ifdef __ATOMIC_CONSUME
-#define atomic_rcu_read(ptr)    ({                \
-    typeof(*ptr) _val;                            \
-     __atomic_load(ptr, &_val, __ATOMIC_CONSUME); \
-    _val;                                         \
-})
-#else
 #define atomic_rcu_read(ptr)    ({                \
     typeof(*ptr) _val = atomic_read(ptr);         \
     smp_read_barrier_depends();                   \
     _val;                                         \
 })
-#endif
-#endif
 
 /**
  * atomic_rcu_set - assigns (publicizes) a pointer to a new data structure
@@ -182,19 +254,10 @@ 
  *
  * Should match atomic_rcu_read().
  */
-#ifndef atomic_rcu_set
-#ifdef __ATOMIC_RELEASE
-#define atomic_rcu_set(ptr, i)  do {              \
-    typeof(*ptr) _val = (i);                      \
-    __atomic_store(ptr, &_val, __ATOMIC_RELEASE); \
-} while(0)
-#else
 #define atomic_rcu_set(ptr, i)  do {              \
     smp_wmb();                                    \
     atomic_set(ptr, i);                           \
 } while (0)
-#endif
-#endif
 
 /* These have the same semantics as Java volatile variables.
  * See http://gee.cs.oswego.edu/dl/jmm/cookbook.html:
@@ -218,13 +281,11 @@ 
  * (see docs/atomics.txt), and I'm not sure that __ATOMIC_ACQ_REL is enough.
  * Just always use the barriers manually by the rules above.
  */
-#ifndef atomic_mb_read
 #define atomic_mb_read(ptr)    ({           \
     typeof(*ptr) _val = atomic_read(ptr);   \
     smp_rmb();                              \
     _val;                                   \
 })
-#endif
 
 #ifndef atomic_mb_set
 #define atomic_mb_set(ptr, i)  do {         \
@@ -237,12 +298,6 @@ 
 #ifndef atomic_xchg
 #if defined(__clang__)
 #define atomic_xchg(ptr, i)    __sync_swap(ptr, i)
-#elif defined(__ATOMIC_SEQ_CST)
-#define atomic_xchg(ptr, i)    ({                           \
-    typeof(*ptr) _new = (i), _old;                          \
-    __atomic_exchange(ptr, &_new, &_old, __ATOMIC_SEQ_CST); \
-    _old;                                                   \
-})
 #else
 /* __sync_lock_test_and_set() is documented to be an acquire barrier only.  */
 #define atomic_xchg(ptr, i)    (smp_mb(), __sync_lock_test_and_set(ptr, i))
@@ -266,4 +321,5 @@ 
 #define atomic_and(ptr, n)     ((void) __sync_fetch_and_and(ptr, n))
 #define atomic_or(ptr, n)      ((void) __sync_fetch_and_or(ptr, n))
 
-#endif
+#endif /* __ATOMIC_RELAXED */
+#endif /* __QEMU_ATOMIC_H */
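
As an aside, a minimal usage sketch of the paired RCU accessors defined
above (names are illustrative; reclamation via call_rcu and the read-side
rcu_read_lock/unlock critical section are omitted for brevity):

    typedef struct Cfg { int timeout; } Cfg;
    static Cfg *current_cfg;

    /* writer: fully initialise the new object, then publish it */
    static void cfg_update(int timeout)
    {
        Cfg *new_cfg = g_new0(Cfg, 1);           /* glib allocator, as used in QEMU */
        new_cfg->timeout = timeout;
        atomic_rcu_set(&current_cfg, new_cfg);   /* release: init before publish */
    }

    /* reader: consume-ordered load, then dereference what was read */
    static int cfg_get_timeout(void)
    {
        Cfg *cfg = atomic_rcu_read(&current_cfg);
        return cfg ? cfg->timeout : -1;
    }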