diff mbox

[v4,2/2] Look ma, no barriers! C11 memory model

Message ID 1415656941-9987-1-git-send-email-ola.liljedahl@linaro.org
State New
Headers show

Commit Message

Ola Liljedahl Nov. 10, 2014, 10:02 p.m. UTC
Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
---
(PATCH 1/2 is the odp_counter.h patch that was sent earlier; I am not resending
it now unless requested. This patch requires that earlier patch.)

Implementation of C11-based memory model for atomic operations. 11 operations
(init/load/store/xchg/cmp_xchg_weak|cmp_xchg_strong/fetch_add/add/fetch_inc/inc
/fetch_dec/dec) in odp_atomic.h for 32- and 64-bit atomic types. 4 operations
(init/load/store/xchg) for pointers. The required memory ordering is now a
parameter to each call just like in C11.

Implementation uses GCC __atomic builtins behind a wrapper type and wrapper API.
The wrappers enforce that only atomic operations are used with atomic types and
also allow the compiler implementation to be complemented and/or overridden
should that be desired (e.g. because of lacking or non-optimal support in the
compiler).

Attempt to remove all explicit memory barriers (odp_sync_stores) and volatile
from code that implements multithreaded synchronization primitives (e.g. locks,
barriers). Rewrote such primitives to use the new atomic operations.

By using the new atomics with acquire, release and SC memory models, a number
of bugs (race conditions) were implicitly fixed. E.g. race conditions in
odp_barrier_sync() (non-atomic wrap of counter), odp_ticketlock_lock()
(missing acquire barrier) and odp_ring enqueue/dequeue (missing release
barrier, had only compiler barrier). Also removed redundant use of barriers
which could improve performance.

Removed odp_mem_barrier() (which is just a compiler barrier) as it is neither
used nor useful.

 example/ipsec/odp_ipsec.c                          |   2 +-
 example/odp_example/odp_example.c                  |   2 +-
 example/timer/odp_timer_test.c                     |   2 +-
 helper/include/odph_ring.h                         |   8 +-
 platform/linux-generic/include/api/odp_atomic.h    | 531 ++++++++++++++-------
 platform/linux-generic/include/api/odp_barrier.h   |  12 +-
 platform/linux-generic/include/api/odp_rwlock.h    |  30 +-
 platform/linux-generic/include/api/odp_spinlock.h  |   4 +-
 .../linux-generic/include/api/odp_ticketlock.h     |  12 +-
 platform/linux-generic/include/odp_spin_internal.h |   9 -
 platform/linux-generic/odp_barrier.c               |  46 +-
 platform/linux-generic/odp_queue.c                 |   3 +-
 platform/linux-generic/odp_ring.c                  | 108 +++--
 platform/linux-generic/odp_rwlock.c                |  67 +--
 platform/linux-generic/odp_spinlock.c              |  16 +-
 platform/linux-generic/odp_ticketlock.c            |  29 +-
 test/api_test/odp_atomic_test.c                    |  72 +--
 test/api_test/odp_atomic_test.h                    |  16 +-
 test/api_test/odp_counter_test.c                   |   2 +-
 19 files changed, 583 insertions(+), 388 deletions(-)
diff mbox

Patch

diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
index da6c48e..579e011 100644
--- a/example/ipsec/odp_ipsec.c
+++ b/example/ipsec/odp_ipsec.c
@@ -1222,7 +1222,7 @@  main(int argc, char *argv[])
 	printf("Num worker threads: %i\n", num_workers);
 
 	/* Create a barrier to synchronize thread startup */
-	odp_barrier_init_count(&sync_barrier, num_workers);
+	odp_barrier_init(&sync_barrier, num_workers);
 
 	/*
 	 * By default core #0 runs Linux kernel background tasks.
diff --git a/example/odp_example/odp_example.c b/example/odp_example/odp_example.c
index d0ec977..7a92eb9 100644
--- a/example/odp_example/odp_example.c
+++ b/example/odp_example/odp_example.c
@@ -1105,7 +1105,7 @@  int main(int argc, char *argv[])
 	odp_shm_print_all();
 
 	/* Barrier to sync test case execution */
-	odp_barrier_init_count(&globals->barrier, num_workers);
+	odp_barrier_init(&globals->barrier, num_workers);
 
 	if (args.proc_mode) {
 		int ret;
diff --git a/example/timer/odp_timer_test.c b/example/timer/odp_timer_test.c
index 78b2ae2..dfbeae9 100644
--- a/example/timer/odp_timer_test.c
+++ b/example/timer/odp_timer_test.c
@@ -372,7 +372,7 @@  int main(int argc, char *argv[])
 	printf("\n");
 
 	/* Barrier to sync test case execution */
-	odp_barrier_init_count(&test_barrier, num_workers);
+	odp_barrier_init(&test_barrier, num_workers);
 
 	/* Create and launch worker threads */
 	odph_linux_pthread_create(thread_tbl, num_workers, first_core,
diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
index 76c1db8..7f41ee8 100644
--- a/helper/include/odph_ring.h
+++ b/helper/include/odph_ring.h
@@ -138,8 +138,8 @@  typedef struct odph_ring {
 		uint32_t sp_enqueue;     /* True, if single producer. */
 		uint32_t size;           /* Size of ring. */
 		uint32_t mask;           /* Mask (size-1) of ring. */
-		uint32_t head;		/* Producer head. */
-		uint32_t tail;		/* Producer tail. */
+		odp_atomic_u32_t head;	 /* Producer head. */
+		odp_atomic_u32_t tail;	 /* Producer tail. */
 	} prod ODP_ALIGNED_CACHE;
 
 	/** @private Consumer */
@@ -147,8 +147,8 @@  typedef struct odph_ring {
 		uint32_t sc_dequeue;     /* True, if single consumer. */
 		uint32_t size;           /* Size of the ring. */
 		uint32_t mask;           /* Mask (size-1) of ring. */
-		uint32_t head;		/* Consumer head. */
-		uint32_t tail;		/* Consumer tail. */
+		odp_atomic_u32_t head;	 /* Consumer head. */
+		odp_atomic_u32_t tail;	 /* Consumer tail. */
 	} cons ODP_ALIGNED_CACHE;
 
 	/** @private Memory space of ring starts here. */
diff --git a/platform/linux-generic/include/api/odp_atomic.h b/platform/linux-generic/include/api/odp_atomic.h
index 5c83b39..1c265ff 100644
--- a/platform/linux-generic/include/api/odp_atomic.h
+++ b/platform/linux-generic/include/api/odp_atomic.h
@@ -4,352 +4,531 @@ 
  * SPDX-License-Identifier:     BSD-3-Clause
  */
 
-
 /**
  * @file
  *
- * ODP atomic operations
+ * ODP atomic types and operations, semantically a subset of C11 atomics.
+ * Scalar and pointer variables wrapped in a struct to avoid accessing directly
+ * without using the required access functions.
+ * Atomic functions must be used to operate on atomic variables!
  */
 
 #ifndef ODP_ATOMIC_H_
 #define ODP_ATOMIC_H_
 
+#include <stdint.h>
+#include <stdbool.h>
+#include <odp_align.h>
+#include <odp_hints.h>
+#include <odp_debug.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-
-#include <odp_std_types.h>
-
 /** @addtogroup odp_synchronizers
  *  Atomic operations.
  *  @{
  */
 
-
 /**
- * Atomic unsigned integer 64 bits
+ * 32-bit (unsigned) atomic type
  */
-typedef volatile uint64_t odp_atomic_u64_t;
+typedef struct {
+	uint32_t v; /**< Actual storage for the atomic variable */
+} odp_atomic_u32_t
+ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
 
 /**
- * Atomic unsigned integer 32 bits
+ * 64-bit (unsigned) atomic type
  */
-typedef volatile uint32_t odp_atomic_u32_t;
+typedef struct {
+	uint64_t v; /**< Actual storage for the atomic variable */
+} odp_atomic_u64_t
+ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
 
+/**
+ * Pointer atomic type
+ */
+typedef struct {
+	void *v; /**< Actual storage for the atomic variable */
+} odp_atomic_ptr_t
+ODP_ALIGNED(sizeof(void *)); /* Enforce alignment! */
+
+typedef enum {
+/** Relaxed memory ordering, no ordering of other accesses enforced */
+	ODP_MEMMODEL_RLX = __ATOMIC_RELAXED,
+/** Acquire memory ordering, synchronize with release stores from another
+ * thread (later accesses cannot move before acquire operation) */
+	ODP_MEMMODEL_ACQ = __ATOMIC_ACQUIRE,
+/** Release memory ordering, synchronize with acquire loads from another
+ * thread (earlier accesses cannot move after release operation) */
+	ODP_MEMMODEL_RLS = __ATOMIC_RELEASE,
+/** Acquire&release memory ordering, synchronize with acquire loads and release
+ * stores in another (one other) thread */
+	ODP_MEMMODEL_ACQ_RLS = __ATOMIC_ACQ_REL,
+/** Sequential consistent memory ordering, synchronize with acquire loads and
+ * release stores in all other threads */
+	ODP_MEMMODEL_SC = __ATOMIC_SEQ_CST
+} odp_memmodel_t;
+
+/*****************************************************************************
+ * Operations on 32-bit atomics
+ * odp_atomic_u32_init - no return value
+ * odp_atomic_u32_load - return current value
+ * odp_atomic_u32_store - no return value
+ * odp_atomic_u32_xchg - return old value
+ * odp_atomic_u32_cmp_xchg_weak - return bool
+ * odp_atomic_u32_fetch_add - return old value
+ * odp_atomic_u32_add - no return value
+ * odp_atomic_u32_fetch_inc - return old value
+ * odp_atomic_u32_inc - no return value
+ * odp_atomic_u32_fetch_dec - return old value
+ * odp_atomic_u32_dec - no return value
+ *****************************************************************************/
 
 /**
- * Initialize atomic uint32
- *
- * @param ptr    An atomic variable
+ * Initialization of 32-bit atomic variable
  *
- * @note The operation is not synchronized with other threads
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param val   Value to initialize the variable with
  */
-static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
+static inline void odp_atomic_u32_init(odp_atomic_u32_t *ptr, uint32_t val)
 {
-	*ptr = 0;
+	__atomic_store_n(&ptr->v, val, __ATOMIC_RELAXED);
 }
 
 /**
- * Load value of atomic uint32
+ * Atomic load of 32-bit atomic variable
  *
- * @param ptr    An atomic variable
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the load
+ * (ODP_MEMMODEL_RLX, ODP_MEMMODEL_ACQ, ODP_MEMMODEL_SC)
  *
- * @return atomic uint32 value
+ * @return Value of the variable
+ */
+static inline uint32_t odp_atomic_u32_load(const odp_atomic_u32_t *ptr,
+		odp_memmodel_t mmodel)
+{
+	return __atomic_load_n(&ptr->v, mmodel);
+}
+
+/**
+ * Atomic store to 32-bit atomic variable
  *
- * @note The operation is not synchronized with other threads
+ * @param ptr  Pointer to a 32-bit atomic variable
+ * @param val  Value to write to the atomic variable
+ * @param mmodel Memory model associated with the store
+ * (ODP_MEMMODEL_RLX, ODP_MEMMODEL_RLS, ODP_MEMMODEL_SC)
  */
-static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
+static inline void odp_atomic_u32_store(odp_atomic_u32_t *ptr,
+		uint32_t val,
+		odp_memmodel_t mmodel)
 {
-	return *ptr;
+	__atomic_store_n(&ptr->v, val, mmodel);
 }
 
 /**
- * Store value to atomic uint32
+ * Atomic exchange (swap) of 32-bit atomic variable
  *
- * @param ptr        An atomic variable
- * @param new_value  Store new_value to a variable
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param val   New value to write
+ * @param mmodel Memory model associated with the exchange operation
  *
- * @note The operation is not synchronized with other threads
+ * @return Old value of variable
+ */
+static inline uint32_t odp_atomic_u32_xchg(odp_atomic_u32_t *ptr,
+		uint32_t val,
+		odp_memmodel_t mmodel)
+
+{
+	return __atomic_exchange_n(&ptr->v, val, mmodel);
+}
+
+/**
+ * Atomic compare and exchange (swap) of 32-bit atomic variable
+ * "Weak" semantics, may fail spuriously and must be used in a loop.
+ *
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param exp_p Pointer to expected value (updated on failure)
+ * @param val   New value to write
+ * @param       succ Memory model associated with a successful compare-and-swap
+ * operation
+ * @param       fail Memory model associated with a failed compare-and-swap
+ * operation (ODP_MEMMODEL_RLX or ODP_MEMMODEL_ACQ)
+ *
+ * @return 1 (true) if exchange successful, 0 (false) if not successful (and
+ * '*exp_p' updated with current value)
  */
-static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
-					uint32_t new_value)
+static inline int odp_atomic_u32_cmp_xchg_weak(odp_atomic_u32_t *ptr,
+		uint32_t *exp_p,
+		uint32_t val,
+		odp_memmodel_t succ,
+		odp_memmodel_t fail)
 {
-	*ptr = new_value;
+	return __atomic_compare_exchange_n(&ptr->v, exp_p, val,
+			true, succ, fail);
 }
 
 /**
- * Fetch and add atomic uint32
+ * Atomic fetch and add to 32-bit atomic variable
+ * @note A - B <=> A + (-B)
  *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
+ * @param mmodel Memory model associated with the add operation
  *
- * @return Value of the variable before the operation
+ * @return Value of the atomic variable before the addition
  */
-static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
-						uint32_t value)
+static inline uint32_t odp_atomic_u32_fetch_add(odp_atomic_u32_t *ptr,
+		uint32_t incr,
+		odp_memmodel_t mmodel)
 {
-	return __sync_fetch_and_add(ptr, value);
+	return __atomic_fetch_add(&ptr->v, incr, mmodel);
 }
 
 /**
- * Fetch and subtract uint32
+ * Atomic add to 32-bit atomic variable
  *
- * @param ptr    An atomic variable
- * @param value  A value to be sub to the variable
- *
- * @return Value of the variable before the operation
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
+ * @param mmodel Memory model associated with the add operation.
  */
-static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
-						uint32_t value)
+static inline void odp_atomic_u32_add(odp_atomic_u32_t *ptr,
+		uint32_t incr,
+		odp_memmodel_t mmodel)
 {
-	return __sync_fetch_and_sub(ptr, value);
+	(void)__atomic_add_fetch(&ptr->v, incr, mmodel);
 }
 
 /**
- * Fetch and increment atomic uint32 by 1
+ * Atomic fetch and increment of 32-bit atomic variable
  *
- * @param ptr    An atomic variable
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the increment operation.
  *
- * @return Value of the variable before the operation
+ * @return Value of the atomic variable before the increment
  */
-#if defined __OCTEON__
-
-static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
+static inline uint32_t odp_atomic_u32_fetch_inc(odp_atomic_u32_t *ptr,
+		odp_memmodel_t mmodel)
 {
-	uint32_t ret;
-
-	__asm__ __volatile__ ("syncws");
-	__asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
-			      "r" (ptr));
-
-	return ret;
+	return __atomic_fetch_add(&ptr->v, 1, mmodel);
 }
 
-#else
+/**
+ * Atomic increment of 32-bit atomic variable
+ *
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the increment operation.
+ */
+static inline void odp_atomic_u32_inc(odp_atomic_u32_t *ptr,
+		odp_memmodel_t mmodel)
 
-static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
 {
-	return odp_atomic_fetch_add_u32(ptr, 1);
+	(void)__atomic_add_fetch(&ptr->v, 1, mmodel);
 }
 
-#endif
-
 /**
- * Increment atomic uint32 by 1
+ * Atomic fetch and decrement of 32-bit atomic variable
  *
- * @param ptr    An atomic variable
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the decrement
+ * operation.
  *
+ * @return Value of the atomic variable before the decrement
  */
-static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
+static inline uint32_t odp_atomic_u32_fetch_dec(odp_atomic_u32_t *ptr,
+		odp_memmodel_t mmodel)
 {
-	odp_atomic_fetch_add_u32(ptr, 1);
+	return __atomic_fetch_sub(&ptr->v, 1, mmodel);
 }
 
 /**
- * Fetch and decrement uint32 by 1
+ * Atomic decrement of 32-bit atomic variable
  *
- * @param ptr    An atomic variable
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the decrement operation.
+ */
+static inline void odp_atomic_u32_dec(odp_atomic_u32_t *ptr,
+		odp_memmodel_t mmodel)
+
+{
+	(void)__atomic_sub_fetch(&ptr->v, 1, mmodel);
+}
+
+/*****************************************************************************
+ * Operations on 64-bit atomics
+ * odp_atomic_u64_init - no return value
+ * odp_atomic_u64_load - return current value
+ * odp_atomic_u64_store - no return value
+ * odp_atomic_u64_xchg - return old value
+ * odp_atomic_u64_cmp_xchg_strong - return bool
+ * odp_atomic_u64_fetch_add - return old value
+ * odp_atomic_u64_add - no return value
+ * odp_atomic_u64_fetch_inc - return old value
+ * odp_atomic_u64_inc - no return value
+ * odp_atomic_u64_fetch_dec - return old value
+ * odp_atomic_u64_dec - no return value
+ *****************************************************************************/
+
+/**
+ * Initialization of 64-bit atomic variable
  *
- * @return Value of the variable before the operation
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param val   Value to initialize the variable with
  */
-static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
+static inline void odp_atomic_u64_init(odp_atomic_u64_t *ptr, uint64_t val)
 {
-	return odp_atomic_fetch_sub_u32(ptr, 1);
+	__atomic_store_n(&ptr->v, val, __ATOMIC_RELAXED);
 }
 
 /**
- * Decrement atomic uint32 by 1
+ * Atomic load of 64-bit atomic variable
  *
- * @param ptr    An atomic variable
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param mmodel Memory model associated with the load
+ * (ODP_MEMMODEL_RLX, ODP_MEMMODEL_ACQ, ODP_MEMMODEL_SC)
  *
+ * @return Value of the variable
  */
-static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
+static inline uint64_t odp_atomic_u64_load(const odp_atomic_u64_t *ptr,
+		odp_memmodel_t mmodel)
 {
-	odp_atomic_fetch_sub_u32(ptr, 1);
+	return __atomic_load_n(&ptr->v, mmodel);
 }
 
 /**
- * Atomic compare and set for 32bit
+ * Atomic store to 64-bit atomic variable
  *
- * @param dst destination location into which the value will be written.
- * @param exp expected value.
- * @param src new value.
- * @return Non-zero on success; 0 on failure.
+ * @param ptr  Pointer to a 64-bit atomic variable
+ * @param val  Value to write to the atomic variable
+ * @param mmodel Memory model associated with the store
+ * (ODP_MEMMODEL_RLX, ODP_MEMMODEL_RLS, ODP_MEMMODEL_SC)
  */
-static inline int
-odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
+static inline void odp_atomic_u64_store(odp_atomic_u64_t *ptr,
+		uint64_t val,
+		odp_memmodel_t mmodel)
 {
-	return __sync_bool_compare_and_swap(dst, exp, src);
+	__atomic_store_n(&ptr->v, val, mmodel);
 }
 
 /**
- * Initialize atomic uint64
+ * Atomic exchange (swap) of 64-bit atomic variable
  *
- * @param ptr    An atomic variable
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param val   New value to write
+ * @param mmodel Memory model associated with the exchange operation
  *
- * @note The operation is not synchronized with other threads
+ * @return Old value of variable
  */
-static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
+static inline uint64_t odp_atomic_u64_xchg(odp_atomic_u64_t *ptr,
+		uint64_t val,
+		odp_memmodel_t mmodel)
+
 {
-	*ptr = 0;
+	return __atomic_exchange_n(&ptr->v, val, mmodel);
 }
 
 /**
- * Load value of atomic uint64
- *
- * @param ptr    An atomic variable
- *
- * @return atomic uint64 value
- *
- * @note The operation is not synchronized with other threads
+ * Atomic compare and exchange (swap) of 64-bit atomic variable
+ * "Strong" semantics, will not fail spuriously.
+ *
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param exp_p Pointer to expected value (updated on failure)
+ * @param val   New value to write
+ * @param       succ Memory model associated with a successful compare-and-swap
+ * operation
+ * @param       fail Memory model associated with a failed compare-and-swap
+ * operation (ODP_MEMMODEL_RLX or ODP_MEMMODEL_ACQ)
+ *
+ * @return 1 (true) if exchange successful, 0 (false) if not successful (and
+ * '*exp_p' updated with current value)
  */
-static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
+static inline int odp_atomic_u64_cmp_xchg_strong(odp_atomic_u64_t *ptr,
+		uint64_t *exp_p,
+		uint64_t val,
+		odp_memmodel_t succ,
+		odp_memmodel_t fail)
 {
-	return *ptr;
+	return __atomic_compare_exchange_n(&ptr->v, exp_p, val,
+			false, succ, fail);
 }
 
 /**
- * Store value to atomic uint64
+ * Atomic fetch and add to 64-bit atomic variable
+ * @note A - B <=> A + (-B)
  *
- * @param ptr        An atomic variable
- * @param new_value  Store new_value to a variable
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
+ * @param mmodel Memory model associated with the add operation
  *
- * @note The operation is not synchronized with other threads
+ * @return Value of the atomic variable before the addition
  */
-static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
-					uint64_t new_value)
+static inline uint64_t odp_atomic_u64_fetch_add(odp_atomic_u64_t *ptr,
+		uint64_t incr,
+		odp_memmodel_t mmodel)
 {
-	*ptr = new_value;
+	return __atomic_fetch_add(&ptr->v, incr, mmodel);
 }
 
 /**
- * Add atomic uint64
- *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
+ * Atomic add to 64-bit atomic variable
  *
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
+ * @param mmodel Memory model associated with the add operation.
  */
-static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t value)
+static inline void odp_atomic_u64_add(odp_atomic_u64_t *ptr,
+		uint64_t incr,
+		odp_memmodel_t mmodel)
 {
-	__sync_fetch_and_add(ptr, value);
+	(void)__atomic_add_fetch(&ptr->v, incr, mmodel);
 }
 
 /**
- * Fetch and add atomic uint64
+ * Atomic fetch and increment of 64-bit atomic variable
  *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param mmodel Memory model associated with the increment operation.
  *
- * @return Value of the variable before the operation
+ * @return Value of the atomic variable before the increment
  */
-
-#if defined __powerpc__ && !defined __powerpc64__
-static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
+static inline uint64_t odp_atomic_u64_fetch_inc(odp_atomic_u64_t *ptr,
+		odp_memmodel_t mmodel)
 {
-	return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
-				    (uint32_t)value);
+	return __atomic_fetch_add(&ptr->v, 1, mmodel);
 }
-#else
-static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
-{
-	return __sync_fetch_and_add(ptr, value);
-}
-#endif
+
 /**
- * Subtract atomic uint64
- *
- * @param ptr    An atomic variable
- * @param value  A value to be subtracted from the variable
+ * Atomic increment of 64-bit atomic variable
  *
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param mmodel Memory model associated with the increment operation.
  */
-static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t value)
+static inline void odp_atomic_u64_inc(odp_atomic_u64_t *ptr,
+		odp_memmodel_t mmodel)
+
 {
-	__sync_fetch_and_sub(ptr, value);
+	(void)__atomic_add_fetch(&ptr->v, 1, mmodel);
 }
 
 /**
- * Fetch and subtract atomic uint64
+ * Atomic fetch and decrement of 64-bit atomic variable
  *
- * @param ptr    An atomic variable
- * @param value  A value to be subtracted from the variable
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param mmodel Memory model associated with the decrement
+ * operation.
  *
- * @return Value of the variable before the operation
+ * @return Value of the atomic variable before the decrement
  */
-#if defined __powerpc__ && !defined __powerpc64__
-static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
+static inline uint64_t odp_atomic_u64_fetch_dec(odp_atomic_u64_t *ptr,
+		odp_memmodel_t mmodel)
 {
-	return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
-				    (uint32_t)value);
+	return __atomic_fetch_sub(&ptr->v, 1, mmodel);
 }
-#else
-static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
+
+/**
+ * Atomic decrement of 64-bit atomic variable
+ *
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param mmodel Memory model associated with the decrement operation.
+ */
+static inline void odp_atomic_u64_dec(odp_atomic_u64_t *ptr,
+		odp_memmodel_t mmodel)
+
 {
-	return __sync_fetch_and_sub(ptr, value);
+	(void)__atomic_sub_fetch(&ptr->v, 1, mmodel);
 }
-#endif
+
+/*****************************************************************************
+ * Operations on pointer atomics
+ * odp_atomic_ptr_init - no return value
+ * odp_atomic_ptr_load - return current value
+ * odp_atomic_ptr_store - no return value
+ * odp_atomic_ptr_xchg - return old value
+ *****************************************************************************/
+
 /**
- * Fetch and increment atomic uint64 by 1
- *
- * @param ptr    An atomic variable
+ * Initialization of pointer atomic variable
  *
- * @return Value of the variable before the operation
+ * @param ptr   Pointer to a pointer atomic variable
+ * @param val   Value to initialize the variable with
  */
-static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
+static inline void odp_atomic_ptr_init(odp_atomic_ptr_t *ptr, void *val)
 {
-	return odp_atomic_fetch_add_u64(ptr, 1);
+	__atomic_store_n(&ptr->v, val, __ATOMIC_RELAXED);
 }
 
 /**
- * Increment atomic uint64 by 1
+ * Atomic load of pointer atomic variable
  *
- * @param ptr    An atomic variable
+ * @param ptr   Pointer to a pointer atomic variable
+ * @param mmodel Memory model associated with the load
+ * (ODP_MEMMODEL_RLX, ODP_MEMMODEL_ACQ, ODP_MEMMODEL_SC)
  *
+ * @return Value of the variable
  */
-static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
+static inline void *odp_atomic_ptr_load(const odp_atomic_ptr_t *ptr,
+		odp_memmodel_t mmodel)
 {
-	odp_atomic_fetch_add_u64(ptr, 1);
+	return __atomic_load_n(&ptr->v, mmodel);
 }
 
 /**
- * Fetch and decrement atomic uint64 by 1
+ * Atomic store to pointer atomic variable
  *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
+ * @param ptr  Pointer to a pointer atomic variable
+ * @param val  Value to write to the atomic variable
+ * @param mmodel Memory model associated with the store
+ * (ODP_MEMMODEL_RLX, ODP_MEMMODEL_RLS, ODP_MEMMODEL_SC)
  */
-static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
+static inline void odp_atomic_ptr_store(odp_atomic_ptr_t *ptr,
+		void *val,
+		odp_memmodel_t mmodel)
 {
-	return odp_atomic_fetch_sub_u64(ptr, 1);
+	__atomic_store_n(&ptr->v, val, mmodel);
 }
 
 /**
- * Decrement atomic uint64 by 1
+ * Atomic exchange (swap) of pointer atomic variable
  *
- * @param ptr    An atomic variable
+ * @param ptr   Pointer to a pointer atomic variable
+ * @param val   New value to write
+ * @param mmodel Memory model associated with the exchange operation
  *
+ * @return Old value of variable
  */
-static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
+static inline void *odp_atomic_ptr_xchg(odp_atomic_ptr_t *ptr,
+		void *val,
+		odp_memmodel_t mmodel)
+
 {
-	odp_atomic_fetch_sub_u64(ptr, 1);
+	return __atomic_exchange_n(&ptr->v, val, mmodel);
 }
 
 /**
- * Atomic compare and set for 64bit
- *
- * @param dst destination location into which the value will be written.
- * @param exp expected value.
- * @param src new value.
- * @return Non-zero on success; 0 on failure.
+ * Atomic compare and exchange (swap) of pointer atomic variable
+ * "Weak" semantics, may fail spuriously and must be used in a loop.
+ *
+ * @param ptr   Pointer to a pointer atomic variable
+ * @param exp_p Pointer to expected value (updated on failure)
+ * @param val   New value to write
+ * @param       succ Memory model associated with a successful compare-and-swap
+ * operation
+ * @param       fail Memory model associated with a failed compare-and-swap
+ * operation (ODP_MEMMODEL_RLX or ODP_MEMMODEL_ACQ)
+ *
+ * @return 1 (true) if exchange successful, 0 (false) if not successful (and
+ * '*exp_p' updated with current value)
  */
-static inline int
-odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
+static inline int odp_atomic_ptr_cmp_xchg_weak(odp_atomic_ptr_t *ptr,
+		void **exp_p,
+		void *val,
+		odp_memmodel_t succ,
+		odp_memmodel_t fail)
 {
-	return __sync_bool_compare_and_swap(dst, exp, src);
+	return __atomic_compare_exchange_n(&ptr->v, exp_p, val,
+			true, succ, fail);
 }
 
 /**
diff --git a/platform/linux-generic/include/api/odp_barrier.h b/platform/linux-generic/include/api/odp_barrier.h
index fb02a9d..4e9bb4a 100644
--- a/platform/linux-generic/include/api/odp_barrier.h
+++ b/platform/linux-generic/include/api/odp_barrier.h
@@ -31,22 +31,22 @@  extern "C" {
  * ODP execution barrier
  */
 typedef struct odp_barrier_t {
-	uint32_t         count;  /**< @private Thread count */
-	odp_atomic_u32_t bar;    /**< @private Barrier counter */
+	uint32_t         num_threads;  /**< @private Thread count (constant) */
+	odp_atomic_u32_t in_barrier;   /**< @private Threads in barrier */
 } odp_barrier_t;
 
 
 /**
- * Init barrier with thread count
+ * Initialize barrier
  *
  * @param barrier    Barrier
- * @param count      Thread count
+ * @param num_threads Number of threads which share the barrier
  */
-void odp_barrier_init_count(odp_barrier_t *barrier, int count);
+void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
 
 
 /**
- * Synchronise thread execution on barrier
+ * Synchronize thread execution on barrier
  *
  * @param barrier    Barrier
  */
diff --git a/platform/linux-generic/include/api/odp_rwlock.h b/platform/linux-generic/include/api/odp_rwlock.h
index a880f92..ce70171 100644
--- a/platform/linux-generic/include/api/odp_rwlock.h
+++ b/platform/linux-generic/include/api/odp_rwlock.h
@@ -10,9 +10,12 @@ 
 /**
  * @file
  *
- * ODP RW Locks
+ * ODP read/write lock
+ * RW lock supports multiple concurrent reads but only one (exclusive) writer.
  */
 
+#include <odp_atomic.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -24,47 +27,48 @@  extern "C" {
 
 /**
  * The odp_rwlock_t type.
- * write lock count is -1,
- * read lock count > 0
+ * Write lock held, count is ~0U
+ * Read lock held, count >0 && <~0U
  */
 typedef struct {
-	volatile int32_t cnt; /**< -1 Write lock,
-				> 0 for Read lock. */
+	odp_atomic_u32_t cnt; /**< == 0: unlocked,
+				   == ~0: locked for write,
+				   > 0 number of concurrent read locks */
 } odp_rwlock_t;
 
 
 /**
- * Initialize the rwlock to an unlocked state.
+ * Initialize a read/write lock.
  *
- * @param rwlock pointer to the RW Lock.
+ * @param rwlock pointer to a RW lock.
  */
 void odp_rwlock_init(odp_rwlock_t *rwlock);
 
 /**
  * Aquire a read lock.
  *
- * @param rwlock pointer to a RW Lock.
+ * @param rwlock pointer to a RW lock.
  */
 void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
 
 /**
  * Release a read lock.
  *
- * @param rwlock pointer to the RW Lock.
+ * @param rwlock pointer to a RW lock.
  */
 void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
 
 /**
- * Aquire a write lock.
+ * Acquire the write lock.
  *
- * @param rwlock pointer to a RW Lock.
+ * @param rwlock pointer to a RW lock.
  */
 void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
 
 /**
- * Release a write lock.
+ * Release the write lock.
  *
- * @param rwlock pointer to a RW Lock.
+ * @param rwlock pointer to a RW lock.
  */
 void odp_rwlock_write_unlock(odp_rwlock_t *rwlock);
 
diff --git a/platform/linux-generic/include/api/odp_spinlock.h b/platform/linux-generic/include/api/odp_spinlock.h
index 462ff97..5d16051 100644
--- a/platform/linux-generic/include/api/odp_spinlock.h
+++ b/platform/linux-generic/include/api/odp_spinlock.h
@@ -19,7 +19,7 @@  extern "C" {
 #endif
 
 
-#include <odp_std_types.h>
+#include <odp_atomic.h>
 
 /** @addtogroup odp_synchronizers
  *  Operations on spinlock.
@@ -30,7 +30,7 @@  extern "C" {
  * ODP spinlock
  */
 typedef struct odp_spinlock_t {
-	volatile int lock;  /**< @private Lock */
+	odp_atomic_u32_t lock;  /**< @private Lock */
 } odp_spinlock_t;
 
 
diff --git a/platform/linux-generic/include/api/odp_ticketlock.h b/platform/linux-generic/include/api/odp_ticketlock.h
index 4a84136..49e7a41 100644
--- a/platform/linux-generic/include/api/odp_ticketlock.h
+++ b/platform/linux-generic/include/api/odp_ticketlock.h
@@ -21,6 +21,7 @@  extern "C" {
 
 #include <odp_std_types.h>
 #include <odp_counter.h>
+#include <odp_atomic.h>
 
 /** @addtogroup odp_synchronizers
  *  @{
@@ -31,7 +32,7 @@  extern "C" {
  */
 typedef struct odp_ticketlock_t {
 	odp_counter_u32_t next_ticket; /**< @private Next ticket */
-	volatile uint32_t cur_ticket;  /**< @private Current ticket */
+	odp_atomic_u32_t cur_ticket;  /**< @private Current ticket */
 } odp_ticketlock_t;
 
 
@@ -70,15 +71,6 @@  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock);
 
 
 /**
- * Test if ticketlock is locked
- *
- * @param ticketlock  Ticketlock
- *
- * @return 1 if the lock is locked, otherwise 0.
- */
-int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock);
-
-/**
  * @}
  */
 
diff --git a/platform/linux-generic/include/odp_spin_internal.h b/platform/linux-generic/include/odp_spin_internal.h
index b7e2071..29c524f 100644
--- a/platform/linux-generic/include/odp_spin_internal.h
+++ b/platform/linux-generic/include/odp_spin_internal.h
@@ -15,15 +15,6 @@  extern "C" {
 
 
 /**
- * GCC memory barrier for ODP internal use
- */
-static inline void odp_mem_barrier(void)
-{
-	__asm__ __volatile__ ("" : : : "memory");
-}
-
-
-/**
  * Spin loop for ODP internal use
  */
 static inline void odp_spin(void)
diff --git a/platform/linux-generic/odp_barrier.c b/platform/linux-generic/odp_barrier.c
index f4a87c8..2cb2cb8 100644
--- a/platform/linux-generic/odp_barrier.c
+++ b/platform/linux-generic/odp_barrier.c
@@ -8,22 +8,22 @@ 
 #include <odp_sync.h>
 #include <odp_spin_internal.h>
 
-void odp_barrier_init_count(odp_barrier_t *barrier, int count)
+void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
 {
-	barrier->count = count;
-	barrier->bar   = 0;
-	odp_sync_stores();
+	barrier->num_threads = num_threads; /* Constant after initialisation */
+	odp_atomic_u32_init(&barrier->in_barrier, 0);
 }
 
 /*
  * Efficient barrier_sync -
  *
  *   Barriers are initialized with a count of the number of callers
- *   that must sync on the barrier before any may proceed.
+ *   that must sync on (enter) the barrier before any may proceed (exit).
  *
  *   To avoid race conditions and to permit the barrier to be fully
- *   reusable, the barrier value cycles between 0..2*count-1. When
- *   synchronizing the wasless variable simply tracks which half of
+ *   reusable, the barrier value cycles between 0..2*count-1 (temporarily
+ *   hitting 2*count before being wrapped). When
+ *   synchronizing, the waslow variable simply tracks which half of
  *   the cycle the barrier was in upon entry.  Exit is when the
  *   barrier crosses to the other half of the cycle.
  */
@@ -31,18 +31,28 @@  void odp_barrier_init_count(odp_barrier_t *barrier, int count)
 void odp_barrier_sync(odp_barrier_t *barrier)
 {
 	uint32_t count;
-	int wasless;
+	bool waslow;
 
-	odp_sync_stores();
-	wasless = barrier->bar < barrier->count;
-	count   = odp_atomic_fetch_inc_u32(&barrier->bar);
+	/* Increase threads in_barrier count, this will automatically release
+	 * the other threads when lower/upper range is switched.
+	 * We use SC memory model since we are synchronizing with all other
+	 * threads that may be released by this update */
+	count = odp_atomic_u32_fetch_add(&barrier->in_barrier, 1,
+					 ODP_MEMMODEL_SC);
+	/* Compute lower or higher range indicator */
+	waslow = count < barrier->num_threads;
 
-	if (count == 2*barrier->count-1) {
-		barrier->bar = 0;
-	} else {
-		while ((barrier->bar < barrier->count) == wasless)
-			odp_spin();
+	/* Check if in_barrier count should wrap */
+	if (count == 2 * barrier->num_threads - 1) {
+		/* Manually wrap the counter */
+		odp_atomic_u32_add(&barrier->in_barrier,
+				   -2 * barrier->num_threads,
+				   ODP_MEMMODEL_SC);
+		/* Fall-through the final part for the barrier */
+	}
+	/* Wait for counter to change half */
+	while ((odp_atomic_u32_load(&barrier->in_barrier, ODP_MEMMODEL_RLX) <
+	       barrier->num_threads) == waslow) {
+		odp_spin();
 	}
-
-	odp_mem_barrier();
 }
diff --git a/platform/linux-generic/odp_queue.c b/platform/linux-generic/odp_queue.c
index 1318bcd..7db63ee 100644
--- a/platform/linux-generic/odp_queue.c
+++ b/platform/linux-generic/odp_queue.c
@@ -214,9 +214,10 @@  int odp_queue_set_context(odp_queue_t handle, void *context)
 {
 	queue_entry_t *queue;
 	queue = queue_to_qentry(handle);
+	/* Release the queue context (to all threads) by installing the
+	 * pointer in the queue metadata */
 	odp_sync_stores();
 	queue->s.param.context = context;
-	odp_sync_stores();
 	return 0;
 }
 
diff --git a/platform/linux-generic/odp_ring.c b/platform/linux-generic/odp_ring.c
index 632aa66..12ab1d8 100644
--- a/platform/linux-generic/odp_ring.c
+++ b/platform/linux-generic/odp_ring.c
@@ -187,10 +187,10 @@  odph_ring_create(const char *name, unsigned count, unsigned flags)
 		r->cons.size = count;
 		r->prod.mask = count-1;
 		r->cons.mask = count-1;
-		r->prod.head = 0;
-		r->cons.head = 0;
-		r->prod.tail = 0;
-		r->cons.tail = 0;
+		odp_atomic_u32_init(&r->prod.head, 0);
+		odp_atomic_u32_init(&r->cons.head, 0);
+		odp_atomic_u32_init(&r->prod.tail, 0);
+		odp_atomic_u32_init(&r->cons.tail, 0);
 
 		TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
 	} else {
@@ -227,7 +227,7 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	uint32_t prod_head, prod_next;
 	uint32_t cons_tail, free_entries;
 	const unsigned max = n;
-	int success;
+	bool success;
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 	int ret;
@@ -237,8 +237,10 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		/* Reset n to the initial burst count */
 		n = max;
 
-		prod_head = r->prod.head;
-		cons_tail = r->cons.tail;
+		prod_head = odp_atomic_u32_load(&r->prod.head,
+				ODP_MEMMODEL_RLX);
+		cons_tail = odp_atomic_u32_load(&r->cons.tail,
+				ODP_MEMMODEL_ACQ);
 		/* The subtraction is done between two unsigned 32bits value
 		 * (the result is always modulo 32 bits even if we have
 		 * prod_head > cons_tail). So 'free_entries' is always between 0
@@ -259,13 +261,15 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		}
 
 		prod_next = prod_head + n;
-		success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
-					      prod_next);
-	} while (odp_unlikely(success == 0));
+		success = odp_atomic_u32_cmp_xchg_weak(&r->prod.head,
+						     &prod_head,
+						     prod_next,
+						     ODP_MEMMODEL_RLX,
+						     ODP_MEMMODEL_RLX);
+	} while (odp_unlikely(!success));
 
 	/* write entries in ring */
 	ENQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/* if we exceed the watermark */
 	if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
@@ -279,10 +283,11 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	 * If there are other enqueues in progress that preceeded us,
 	 * we need to wait for them to complete
 	 */
-	while (odp_unlikely(r->prod.tail != prod_head))
+	while (odp_unlikely(odp_atomic_u32_load(&r->prod.tail,
+					ODP_MEMMODEL_RLX) != prod_head))
 		odp_spin();
 
-	r->prod.tail = prod_next;
+	odp_atomic_u32_store(&r->prod.tail, prod_next, ODP_MEMMODEL_RLS);
 	return ret;
 }
 
@@ -298,8 +303,8 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	uint32_t mask = r->prod.mask;
 	int ret;
 
-	prod_head = r->prod.head;
-	cons_tail = r->cons.tail;
+	prod_head = odp_atomic_u32_load(&r->prod.head, ODP_MEMMODEL_RLX);
+	cons_tail = odp_atomic_u32_load(&r->cons.tail, ODP_MEMMODEL_ACQ);
 	/* The subtraction is done between two unsigned 32bits value
 	 * (the result is always modulo 32 bits even if we have
 	 * prod_head > cons_tail). So 'free_entries' is always between 0
@@ -320,11 +325,10 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	}
 
 	prod_next = prod_head + n;
-	r->prod.head = prod_next;
+	odp_atomic_u32_store(&r->prod.head, prod_next, ODP_MEMMODEL_RLX);
 
 	/* write entries in ring */
 	ENQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/* if we exceed the watermark */
 	if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
@@ -334,7 +338,7 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
 	}
 
-	r->prod.tail = prod_next;
+	odp_atomic_u32_store(&r->prod.tail, prod_next, ODP_MEMMODEL_RLS);
 	return ret;
 }
 
@@ -348,7 +352,7 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 	uint32_t cons_head, prod_tail;
 	uint32_t cons_next, entries;
 	const unsigned max = n;
-	int success;
+	bool success;
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 
@@ -357,8 +361,10 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 		/* Restore n as it may change every loop */
 		n = max;
 
-		cons_head = r->cons.head;
-		prod_tail = r->prod.tail;
+		cons_head = odp_atomic_u32_load(&r->cons.head,
+				ODP_MEMMODEL_RLX);
+		prod_tail = odp_atomic_u32_load(&r->prod.tail,
+				ODP_MEMMODEL_ACQ);
 		/* The subtraction is done between two unsigned 32bits value
 		 * (the result is always modulo 32 bits even if we have
 		 * cons_head > prod_tail). So 'entries' is always between 0
@@ -378,22 +384,25 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 		}
 
 		cons_next = cons_head + n;
-		success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
-					      cons_next);
-	} while (odp_unlikely(success == 0));
+		success = odp_atomic_u32_cmp_xchg_weak(&r->cons.head,
+						     &cons_head,
+						     cons_next,
+						     ODP_MEMMODEL_RLX,
+						     ODP_MEMMODEL_RLX);
+	} while (odp_unlikely(!success));
 
 	/* copy in table */
 	DEQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/*
 	 * If there are other dequeues in progress that preceded us,
 	 * we need to wait for them to complete
 	 */
-	while (odp_unlikely(r->cons.tail != cons_head))
+	while (odp_unlikely(odp_atomic_u32_load(&r->cons.tail,
+					      ODP_MEMMODEL_RLX) != cons_head))
 		odp_spin();
 
-	r->cons.tail = cons_next;
+	odp_atomic_u32_store(&r->cons.tail, cons_next, ODP_MEMMODEL_RLS);
 
 	return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
 }
@@ -409,8 +418,8 @@  int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 
-	cons_head = r->cons.head;
-	prod_tail = r->prod.tail;
+	cons_head = odp_atomic_u32_load(&r->cons.head, ODP_MEMMODEL_RLX);
+	prod_tail = odp_atomic_u32_load(&r->prod.tail, ODP_MEMMODEL_ACQ);
 	/* The subtraction is done between two unsigned 32bits value
 	 * (the result is always modulo 32 bits even if we have
 	 * cons_head > prod_tail). So 'entries' is always between 0
@@ -429,13 +438,12 @@  int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
 	}
 
 	cons_next = cons_head + n;
-	r->cons.head = cons_next;
+	odp_atomic_u32_store(&r->cons.head, cons_next, ODP_MEMMODEL_RLX);
 
 	/* copy in table */
 	DEQUEUE_PTRS();
-	odp_mem_barrier();
 
-	r->cons.tail = cons_next;
+	odp_atomic_u32_store(&r->cons.tail, cons_next, ODP_MEMMODEL_RLS);
 	return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
 }
 
@@ -482,8 +490,10 @@  int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void **obj_table, unsigned n)
  */
 int odph_ring_full(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic_u32_load(&r->prod.tail,
+			ODP_MEMMODEL_RLX);
+	uint32_t cons_tail = odp_atomic_u32_load(&r->cons.tail,
+			ODP_MEMMODEL_RLX);
 	return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
 }
 
@@ -492,8 +502,10 @@  int odph_ring_full(const odph_ring_t *r)
  */
 int odph_ring_empty(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic_u32_load(&r->prod.tail,
+			ODP_MEMMODEL_RLX);
+	uint32_t cons_tail = odp_atomic_u32_load(&r->cons.tail,
+			ODP_MEMMODEL_RLX);
 	return !!(cons_tail == prod_tail);
 }
 
@@ -502,8 +514,10 @@  int odph_ring_empty(const odph_ring_t *r)
  */
 unsigned odph_ring_count(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic_u32_load(&r->prod.tail,
+			ODP_MEMMODEL_RLX);
+	uint32_t cons_tail = odp_atomic_u32_load(&r->cons.tail,
+			ODP_MEMMODEL_RLX);
 	return (prod_tail - cons_tail) & r->prod.mask;
 }
 
@@ -512,8 +526,10 @@  unsigned odph_ring_count(const odph_ring_t *r)
  */
 unsigned odph_ring_free_count(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic_u32_load(&r->prod.tail,
+			ODP_MEMMODEL_RLX);
+	uint32_t cons_tail = odp_atomic_u32_load(&r->cons.tail,
+			ODP_MEMMODEL_RLX);
 	return (cons_tail - prod_tail - 1) & r->prod.mask;
 }
 
@@ -523,10 +539,14 @@  void odph_ring_dump(const odph_ring_t *r)
 	ODP_DBG("ring <%s>@%p\n", r->name, r);
 	ODP_DBG("  flags=%x\n", r->flags);
 	ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
-	ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
-	ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
-	ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
-	ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
+	ODP_DBG("  ct=%"PRIu32"\n", odp_atomic_u32_load(&r->cons.tail,
+							ODP_MEMMODEL_RLX));
+	ODP_DBG("  ch=%"PRIu32"\n", odp_atomic_u32_load(&r->cons.head,
+							ODP_MEMMODEL_RLX));
+	ODP_DBG("  pt=%"PRIu32"\n", odp_atomic_u32_load(&r->prod.tail,
+							ODP_MEMMODEL_RLX));
+	ODP_DBG("  ph=%"PRIu32"\n", odp_atomic_u32_load(&r->prod.head,
+							ODP_MEMMODEL_RLX));
 	ODP_DBG("  used=%u\n", odph_ring_count(r));
 	ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
 	if (r->prod.watermark == r->prod.size)
diff --git a/platform/linux-generic/odp_rwlock.c b/platform/linux-generic/odp_rwlock.c
index 11c8dd7..d7f843c 100644
--- a/platform/linux-generic/odp_rwlock.c
+++ b/platform/linux-generic/odp_rwlock.c
@@ -4,58 +4,69 @@ 
  * SPDX-License-Identifier:     BSD-3-Clause
  */
 
+#include <stdbool.h>
 #include <odp_atomic.h>
 #include <odp_rwlock.h>
-
 #include <odp_spin_internal.h>
 
 void odp_rwlock_init(odp_rwlock_t *rwlock)
 {
-	rwlock->cnt = 0;
+	odp_atomic_u32_init(&rwlock->cnt, 0);
 }
 
 void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
 {
-	int32_t cnt;
-	int  is_locked = 0;
-
-	while (is_locked == 0) {
-		cnt = rwlock->cnt;
-		/* waiting for read lock */
-		if (cnt < 0) {
+	bool gotit;
+	uint32_t cnt = odp_atomic_u32_load(&rwlock->cnt, ODP_MEMMODEL_ACQ);
+	do {
+		/* Wait for any writer to release lock */
+		while ((int32_t)cnt < 0) {
 			odp_spin();
-			continue;
+			cnt = odp_atomic_u32_load(&rwlock->cnt,
+						  ODP_MEMMODEL_RLX);
 		}
-		is_locked = odp_atomic_cmpset_u32(
-					(volatile uint32_t *)&rwlock->cnt,
-					      cnt, cnt + 1);
-	}
+		/* Attempt to take another read lock */
+		gotit = odp_atomic_u32_cmp_xchg_weak(&rwlock->cnt,
+						     &cnt, cnt + 1,
+						     ODP_MEMMODEL_RLX,
+						     ODP_MEMMODEL_RLX);
+		/* If operation fails, 'cnt' will contain current value */
+	} while (!gotit);
 }
 
 void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
 {
-	odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
+	/* Release one read lock by subtracting 1 */
+	/* Even though a reader performs no updates, its loads must still be
+	 * contained within the lock/unlock boundaries, so release order is
+	 * required here */
+	odp_atomic_u32_dec(&rwlock->cnt, ODP_MEMMODEL_RLS);
 }
 
 void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
 {
-	int32_t cnt;
-	int is_locked = 0;
-
-	while (is_locked == 0) {
-		cnt = rwlock->cnt;
-		/* lock aquired, wait */
-		if (cnt != 0) {
+	bool gotit;
+	uint32_t cnt = odp_atomic_u32_load(&rwlock->cnt, ODP_MEMMODEL_ACQ);
+	do {
+		/* Wait for all lock holders to release lock */
+		while (cnt != 0) {
+			/* Lock is busy */
 			odp_spin();
-			continue;
+			cnt = odp_atomic_u32_load(&rwlock->cnt,
+						  ODP_MEMMODEL_RLX);
 		}
-		is_locked = odp_atomic_cmpset_u32(
-					(volatile uint32_t *)&rwlock->cnt,
-					      0, -1);
-	}
+		/* Attempt to take write lock */
+		gotit = odp_atomic_u32_cmp_xchg_weak(&rwlock->cnt,
+						     &cnt,
+						     (uint32_t)-1,
+						     ODP_MEMMODEL_RLX,
+						     ODP_MEMMODEL_RLX);
+		/* If operation fails, 'cnt' will contain current value */
+	} while (!gotit);
 }
 
 void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
 {
-	odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
+	/* Release the write lock by adding 1 */
+	/* We are releasing the lock to all other threads so use SC */
+	odp_atomic_u32_inc(&rwlock->cnt, ODP_MEMMODEL_SC);
 }
diff --git a/platform/linux-generic/odp_spinlock.c b/platform/linux-generic/odp_spinlock.c
index 4eba015..48a6cff 100644
--- a/platform/linux-generic/odp_spinlock.c
+++ b/platform/linux-generic/odp_spinlock.c
@@ -10,31 +10,35 @@ 
 
 void odp_spinlock_init(odp_spinlock_t *spinlock)
 {
-	__sync_lock_release(&spinlock->lock);
+	odp_atomic_u32_init(&spinlock->lock, 0);
 }
 
 
 void odp_spinlock_lock(odp_spinlock_t *spinlock)
 {
-	while (__sync_lock_test_and_set(&spinlock->lock, 1))
-		while (spinlock->lock)
+	/* Outer loop: spin while exchange returns 1 => lock is already taken */
+	while (odp_atomic_u32_xchg(&spinlock->lock, 1, ODP_MEMMODEL_ACQ))
+		/* Inner loop: spin while lock is taken */
+		while (odp_atomic_u32_load(&spinlock->lock, ODP_MEMMODEL_RLX))
 			odp_spin();
 }
 
 
 int odp_spinlock_trylock(odp_spinlock_t *spinlock)
 {
-	return (__sync_lock_test_and_set(&spinlock->lock, 1) == 0);
+	/* Return true if lock was not taken (xchg() returned 0). */
+	return odp_atomic_u32_xchg(&spinlock->lock, 1, ODP_MEMMODEL_ACQ) == 0;
 }
 
 
 void odp_spinlock_unlock(odp_spinlock_t *spinlock)
 {
-	__sync_lock_release(&spinlock->lock);
+	/* Releasing the lock to one other thread so use release model */
+	odp_atomic_u32_store(&spinlock->lock, 0, ODP_MEMMODEL_RLS);
 }
 
 
 int odp_spinlock_is_locked(odp_spinlock_t *spinlock)
 {
-	return spinlock->lock != 0;
+	return odp_atomic_u32_load(&spinlock->lock, ODP_MEMMODEL_RLX) != 0;
 }
diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-generic/odp_ticketlock.c
index 0e1d880..57d453c 100644
--- a/platform/linux-generic/odp_ticketlock.c
+++ b/platform/linux-generic/odp_ticketlock.c
@@ -5,16 +5,13 @@ 
  */
 
 #include <odp_ticketlock.h>
-#include <odp_counter.h>
-#include <odp_sync.h>
 #include <odp_spin_internal.h>
 
 
 void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
 {
 	odp_counter_u32_init(&ticketlock->next_ticket, 0);
-	ticketlock->cur_ticket  = 0;
-	odp_sync_stores();
+	odp_atomic_u32_init(&ticketlock->cur_ticket, 0);
 }
 
 
@@ -24,29 +21,15 @@  void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
 
 	ticket = odp_counter_u32_read_inc(&ticketlock->next_ticket);
 
-	while (ticket != ticketlock->cur_ticket)
+	while (ticket != odp_atomic_u32_load(&ticketlock->cur_ticket,
+					     ODP_MEMMODEL_ACQ))
 		odp_spin();
-
-	odp_mem_barrier();
 }
 
 
 void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
 {
-	odp_sync_stores();
-
-	ticketlock->cur_ticket++;
-
-#if defined __OCTEON__
-	odp_sync_stores();
-#else
-	odp_mem_barrier();
-#endif
-}
-
-
-int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
-{
-	return ticketlock->cur_ticket !=
-		odp_counter_u32_read(&ticketlock->next_ticket);
+	/* We are releasing the lock to one other thread (the next thread
+	 * to acquire the lock) so use release memory model */
+	odp_atomic_u32_inc(&ticketlock->cur_ticket, ODP_MEMMODEL_RLS);
 }
diff --git a/test/api_test/odp_atomic_test.c b/test/api_test/odp_atomic_test.c
index 3ca7674..e1efa30 100644
--- a/test/api_test/odp_atomic_test.c
+++ b/test/api_test/odp_atomic_test.c
@@ -13,7 +13,7 @@ 
 static odp_atomic_u32_t a32u;
 static odp_atomic_u64_t a64u;
 
-static odp_atomic_u32_t numthrds;
+static odp_counter_u32_t numthrds;
 
 static const char * const test_name[] = {
 	"dummy",
@@ -51,15 +51,15 @@  void test_atomic_inc_u32(void)
 	int i;
 
 	for (i = 0; i < CNT; i++)
-		odp_atomic_inc_u32(&a32u);
+		odp_atomic_u32_inc(&a32u, ODP_MEMMODEL_RLX);
 }
 
-void test_atomic_inc_64(void)
+void test_atomic_inc_u64(void)
 {
 	int i;
 
 	for (i = 0; i < CNT; i++)
-		odp_atomic_inc_u64(&a64u);
+		odp_atomic_u64_inc(&a64u, ODP_MEMMODEL_RLX);
 }
 
 void test_atomic_dec_u32(void)
@@ -67,15 +67,15 @@  void test_atomic_dec_u32(void)
 	int i;
 
 	for (i = 0; i < CNT; i++)
-		odp_atomic_dec_u32(&a32u);
+		odp_atomic_u32_dec(&a32u, ODP_MEMMODEL_RLX);
 }
 
-void test_atomic_dec_64(void)
+void test_atomic_dec_u64(void)
 {
 	int i;
 
 	for (i = 0; i < CNT; i++)
-		odp_atomic_dec_u64(&a64u);
+		odp_atomic_u64_dec(&a64u, ODP_MEMMODEL_RLX);
 }
 
 void test_atomic_add_u32(void)
@@ -83,15 +83,15 @@  void test_atomic_add_u32(void)
 	int i;
 
 	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
+		odp_atomic_u32_fetch_add(&a32u, ADD_SUB_CNT, ODP_MEMMODEL_RLX);
 }
 
-void test_atomic_add_64(void)
+void test_atomic_add_u64(void)
 {
 	int i;
 
 	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
+		odp_atomic_u64_fetch_add(&a64u, ADD_SUB_CNT, ODP_MEMMODEL_RLX);
 }
 
 void test_atomic_sub_u32(void)
@@ -99,15 +99,15 @@  void test_atomic_sub_u32(void)
 	int i;
 
 	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
+		odp_atomic_u32_fetch_add(&a32u, -ADD_SUB_CNT, ODP_MEMMODEL_RLX);
 }
 
-void test_atomic_sub_64(void)
+void test_atomic_sub_u64(void)
 {
 	int i;
 
 	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
+		odp_atomic_u64_fetch_add(&a64u, -ADD_SUB_CNT, ODP_MEMMODEL_RLX);
 }
 
 void test_atomic_inc_dec_u32(void)
@@ -122,16 +122,16 @@  void test_atomic_add_sub_u32(void)
 	test_atomic_sub_u32();
 }
 
-void test_atomic_inc_dec_64(void)
+void test_atomic_inc_dec_u64(void)
 {
-	test_atomic_inc_64();
-	test_atomic_dec_64();
+	test_atomic_inc_u64();
+	test_atomic_dec_u64();
 }
 
-void test_atomic_add_sub_64(void)
+void test_atomic_add_sub_u64(void)
 {
-	test_atomic_add_64();
-	test_atomic_sub_64();
+	test_atomic_add_u64();
+	test_atomic_sub_u64();
 }
 
 /**
@@ -145,32 +145,32 @@  void test_atomic_basic(void)
 	test_atomic_add_u32();
 	test_atomic_sub_u32();
 
-	test_atomic_inc_64();
-	test_atomic_dec_64();
-	test_atomic_add_64();
-	test_atomic_sub_64();
+	test_atomic_inc_u64();
+	test_atomic_dec_u64();
+	test_atomic_add_u64();
+	test_atomic_sub_u64();
 }
 
 void test_atomic_init(void)
 {
-	odp_atomic_init_u32(&a32u);
-	odp_atomic_init_u64(&a64u);
+	odp_atomic_u32_init(&a32u, 0);
+	odp_atomic_u64_init(&a64u, 0);
 }
 
 void test_atomic_store(void)
 {
-	odp_atomic_store_u32(&a32u, U32_INIT_VAL);
-	odp_atomic_store_u64(&a64u, U64_INIT_VAL);
+	odp_atomic_u32_store(&a32u, U32_INIT_VAL, ODP_MEMMODEL_RLX);
+	odp_atomic_u64_store(&a64u, U64_INIT_VAL, ODP_MEMMODEL_RLX);
 }
 
 int test_atomic_validate(void)
 {
-	if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
+	if (odp_atomic_u32_load(&a32u, ODP_MEMMODEL_RLX) != U32_INIT_VAL) {
 		ODP_ERR("Atomic u32 usual functions failed\n");
 		return -1;
 	}
 
-	if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
+	if (odp_atomic_u64_load(&a64u, ODP_MEMMODEL_RLX) != U64_INIT_VAL) {
 		ODP_ERR("Atomic u64 usual functions failed\n");
 		return -1;
 	}
@@ -187,10 +187,10 @@  static void *run_thread(void *arg)
 
 	ODP_DBG("Thread %i starts\n", thr);
 
-	odp_atomic_inc_u32(&numthrds);
+	odp_counter_u32_inc(&numthrds);
 
 	/* Wait here until all pthreads are created */
-	while (*(volatile int *)&numthrds < parg->numthrds)
+	while (odp_counter_u32_read(&numthrds) < (uint32_t)parg->numthrds)
 		;
 
 	gettimeofday(&tv0[thr], NULL);
@@ -205,11 +205,11 @@  static void *run_thread(void *arg)
 	case TEST_ADD_SUB_U32:
 		test_atomic_add_sub_u32();
 		break;
-	case TEST_INC_DEC_64:
-		test_atomic_inc_dec_64();
+	case TEST_INC_DEC_U64:
+		test_atomic_inc_dec_u64();
 		break;
-	case TEST_ADD_SUB_64:
-		test_atomic_add_sub_64();
+	case TEST_ADD_SUB_U64:
+		test_atomic_add_sub_u64();
 		break;
 	}
 	gettimeofday(&tv1[thr], NULL);
@@ -262,7 +262,7 @@  int main(int argc, char *argv[])
 	if (pthrdnum == 0)
 		pthrdnum = odp_sys_core_count();
 
-	odp_atomic_init_u32(&numthrds);
+	odp_counter_u32_init(&numthrds, 0);
 	test_atomic_init();
 	test_atomic_store();
 
diff --git a/test/api_test/odp_atomic_test.h b/test/api_test/odp_atomic_test.h
index aaa9d34..4c18ce6 100644
--- a/test/api_test/odp_atomic_test.h
+++ b/test/api_test/odp_atomic_test.h
@@ -25,24 +25,24 @@  typedef enum {
 	TEST_MIX = 1, /* Must be first test case num */
 	TEST_INC_DEC_U32,
 	TEST_ADD_SUB_U32,
-	TEST_INC_DEC_64,
-	TEST_ADD_SUB_64,
+	TEST_INC_DEC_U64,
+	TEST_ADD_SUB_U64,
 	TEST_MAX,
 } odp_test_atomic_t;
 
 
 void test_atomic_inc_dec_u32(void);
 void test_atomic_add_sub_u32(void);
-void test_atomic_inc_dec_64(void);
-void test_atomic_add_sub_64(void);
+void test_atomic_inc_dec_u64(void);
+void test_atomic_add_sub_u64(void);
 void test_atomic_inc_u32(void);
 void test_atomic_dec_u32(void);
 void test_atomic_add_u32(void);
 void test_atomic_sub_u32(void);
-void test_atomic_inc_64(void);
-void test_atomic_dec_64(void);
-void test_atomic_add_64(void);
-void test_atomic_sub_64(void);
+void test_atomic_inc_u64(void);
+void test_atomic_dec_u64(void);
+void test_atomic_add_u64(void);
+void test_atomic_sub_u64(void);
 void test_atomic_init(void);
 void test_atomic_basic(void);
 void test_atomic_store(void);
diff --git a/test/api_test/odp_counter_test.c b/test/api_test/odp_counter_test.c
index abbc3cb..37bf393 100644
--- a/test/api_test/odp_counter_test.c
+++ b/test/api_test/odp_counter_test.c
@@ -341,7 +341,7 @@  int main(int argc, char *argv[])
 		usage();
 		goto err_exit;
 	}
-	odp_barrier_init_count(&barrier, pthrdnum);
+	odp_barrier_init(&barrier, pthrdnum);
 	odp_test_thread_create(run_thread, &thrdarg);
 
 	odp_test_thread_exit(&thrdarg);