diff mbox

[ODP/PATCH,v2] Look ma, no barriers! C11 memory model

Message ID 1413810006-26257-1-git-send-email-ola.liljedahl@linaro.org
State New
Headers show

Commit Message

Ola Liljedahl Oct. 20, 2014, 1 p.m. UTC
Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
---
Added header file odp_counter.h with support for 32- and 64-bit atomic counters
using relaxed memory order. 6 operations (init/read/write/add/read_inc/inc) on
32-bit and 64-bit counters respectively.
Renamed odp_atomic_test to odp_counter_test and changed to use odp_counter.h

Implementation of C11-based memory model for atomic operations. 10 operations
(init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec) in
odp_atomic.h. The required memory ordering is now a parameter to each call just
like in C11.

Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
fall back to GCC __sync builtins which often include unnecessarily heavy
barrier/sync operations (always sequentially consistent).

Attempt to remove all explicit memory barriers (odp_sync_stores) from code that
implements multithreaded synchronization primitives (e.g. locks, barriers).
Rewrote such primitives to use the new atomic operations.

Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
odp_ticketlock_lock() (missing acquire barrier) and odp_ring enqueue/dequeue
(missing release barrier, had only compiler barrier).

 .gitignore                                         |   2 +-
 example/generator/odp_generator.c                  |  43 +-
 example/ipsec/odp_ipsec.c                          |   2 +-
 example/odp_example/odp_example.c                  |   2 +-
 example/timer/odp_timer_test.c                     |   2 +-
 helper/include/odph_ring.h                         |   8 +-
 platform/linux-generic/include/api/odp.h           |   1 +
 platform/linux-generic/include/api/odp_atomic.h    | 838 +++++++++++----------
 platform/linux-generic/include/api/odp_barrier.h   |  10 +-
 platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
 platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
 .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
 .../linux-generic/include/odp_buffer_internal.h    |   2 +-
 platform/linux-generic/include/odp_spin_internal.h |   9 -
 platform/linux-generic/odp_barrier.c               |  49 +-
 platform/linux-generic/odp_buffer.c                |   3 +-
 platform/linux-generic/odp_crypto.c                |   7 +-
 platform/linux-generic/odp_queue.c                 |   7 +-
 platform/linux-generic/odp_ring.c                  |  94 +--
 platform/linux-generic/odp_rwlock.c                |  62 +-
 platform/linux-generic/odp_thread.c                |   9 +-
 platform/linux-generic/odp_ticketlock.c            |  29 +-
 platform/linux-generic/odp_timer.c                 |  22 +-
 test/api_test/Makefile.am                          |   6 +-
 test/api_test/odp_atomic_test.c                    | 362 ---------
 test/api_test/odp_atomic_test.h                    |  60 --
 test/api_test/odp_common.c                         |   1 -
 test/api_test/odp_counter_test.c                   | 361 +++++++++
 28 files changed, 1365 insertions(+), 1014 deletions(-)
 create mode 100644 platform/linux-generic/include/api/odp_counter.h
 delete mode 100644 test/api_test/odp_atomic_test.c
 delete mode 100644 test/api_test/odp_atomic_test.h
 create mode 100644 test/api_test/odp_counter_test.c

Comments

Ola Liljedahl Oct. 20, 2014, 1:07 p.m. UTC | #1
Ignore this version, I forgot one tiny thing... v3 coming real soon now.

On 20 October 2014 15:00, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:

> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
> ---
> Added header file odp_counter.h with support for 32- and 64-bit atomic
> counters
> using relaxed memory order. 6 operations
> (init/read/write/add/read_inc/inc) on
> 32-bit and 64-bit counters respectively.
> Renamed odp_atomic_test to odp_counter_test and changed to use
> odp_counter.h
>
> Implementation of C11-based memory model for atomic operations. 10
> operations
> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
> in
> odp_atomic.h. The required memory ordering is now a parameter to each call
> just
> like in C11.
>
> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
> fall back to GCC __sync builtins which often include unnecessarily heavy
> barrier/sync operations (always sequentially consistent).
>
> Attempt to remove all explicit memory barriers (odp_sync_stores) from code
> that
> implements multithreaded synchronization primitives (e.g. locks, barriers).
> Rewrote such primitives to use the new atomic operations.
>
> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
> enqueue/dequeue
> (missing release barrier, had only compiler barrier).
>
>  .gitignore                                         |   2 +-
>  example/generator/odp_generator.c                  |  43 +-
>  example/ipsec/odp_ipsec.c                          |   2 +-
>  example/odp_example/odp_example.c                  |   2 +-
>  example/timer/odp_timer_test.c                     |   2 +-
>  helper/include/odph_ring.h                         |   8 +-
>  platform/linux-generic/include/api/odp.h           |   1 +
>  platform/linux-generic/include/api/odp_atomic.h    | 838
> +++++++++++----------
>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>  platform/linux-generic/odp_barrier.c               |  49 +-
>  platform/linux-generic/odp_buffer.c                |   3 +-
>  platform/linux-generic/odp_crypto.c                |   7 +-
>  platform/linux-generic/odp_queue.c                 |   7 +-
>  platform/linux-generic/odp_ring.c                  |  94 +--
>  platform/linux-generic/odp_rwlock.c                |  62 +-
>  platform/linux-generic/odp_thread.c                |   9 +-
>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>  platform/linux-generic/odp_timer.c                 |  22 +-
>  test/api_test/Makefile.am                          |   6 +-
>  test/api_test/odp_atomic_test.c                    | 362 ---------
>  test/api_test/odp_atomic_test.h                    |  60 --
>  test/api_test/odp_common.c                         |   1 -
>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>  delete mode 100644 test/api_test/odp_atomic_test.c
>  delete mode 100644 test/api_test/odp_atomic_test.h
>  create mode 100644 test/api_test/odp_counter_test.c
>
> diff --git a/.gitignore b/.gitignore
> index 6342e34..77db4d6 100644
> --- a/.gitignore
> +++ b/.gitignore
> @@ -35,7 +35,7 @@ build/
>  odp_example
>  odp_packet
>  odp_packet_netmap
> -odp_atomic
> +odp_counter
>  odp_shm
>  odp_ring
>  odp_timer_ping
> diff --git a/example/generator/odp_generator.c
> b/example/generator/odp_generator.c
> index eb8b340..252157d 100644
> --- a/example/generator/odp_generator.c
> +++ b/example/generator/odp_generator.c
> @@ -62,10 +62,10 @@ typedef struct {
>   * counters
>  */
>  static struct {
> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
> -       odp_atomic_u64_t ip;    /**< ip packets */
> -       odp_atomic_u64_t udp;   /**< udp packets */
> -       odp_atomic_u64_t icmp;  /**< icmp packets */
> +       odp_counter64_t seq;    /**< ip seq to be send */
> +       odp_counter64_t ip;     /**< ip packets */
> +       odp_counter64_t udp;    /**< udp packets */
> +       odp_counter64_t icmp;   /**< icmp packets */
>  } counters;
>
>  /** * Thread specific arguments
> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_UDPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_UDP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_ICMPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_ICMP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>                 }
>
>                 if (args->appl.interval != 0) {
> +                       uint64_t seq = odp_counter64_read(&counters.seq);
>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
> -                              thr, counters.seq, counters.seq%0xffff);
> +                              thr, seq, seq%0xffff);
>                         /* TODO use odp timer */
>                         usleep(args->appl.interval * 1000);
>                 }
> -               if (args->appl.number != -1 && counters.seq
> -                   >= (unsigned int)args->appl.number) {
> +               if (args->appl.number != -1 &&
> +                   odp_counter64_read(&counters.seq) >=
> +                   (unsigned int)args->appl.number) {
>                         break;
>                 }
>         }
> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>         /* receive number of reply pks until timeout */
>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>                 while (args->appl.timeout >= 0) {
> -                       if (counters.icmp >= (unsigned
> int)args->appl.number)
> +                       if (odp_counter64_read(&counters.icmp) >=
> +                           (unsigned int)args->appl.number)
>                                 break;
>                         /* TODO use odp timer */
>                         sleep(1);
> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>
>         /* print info */
>         if (args->appl.mode == APPL_MODE_UDP) {
> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
> +               printf("  [%02i] total send: %ju\n", thr,
> +                      odp_counter64_read(&counters.seq));
>         } else if (args->appl.mode == APPL_MODE_PING) {
>                 printf("  [%02i] total send: %ju total receive: %ju\n",
> -                      thr, counters.seq, counters.icmp);
> +                      thr, odp_counter64_read(&counters.seq),
> +                      odp_counter64_read(&counters.icmp));
>         }
>         return arg;
>  }
> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                 if (!odp_packet_inflag_ipv4(pkt))
>                         continue;
>
> -               odp_atomic_inc_u64(&counters.ip);
> +               odp_counter64_inc(&counters.ip);
>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>
>                 /* udp */
>                 if (ip->proto == ODPH_IPPROTO_UDP) {
> -                       odp_atomic_inc_u64(&counters.udp);
> +                       odp_counter64_inc(&counters.udp);
>                         udp = (odph_udphdr_t *)(buf + offset);
>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>                                         odp_be_to_cpu_16(udp->length) -
> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                         icmp = (odph_icmphdr_t *)(buf + offset);
>                         /* echo reply */
>                         if (icmp->type == ICMP_ECHOREPLY) {
> -                               odp_atomic_inc_u64(&counters.icmp);
> +                               odp_counter64_inc(&counters.icmp);
>                                 memcpy(&tvsend, buf + offset +
> ODPH_ICMPHDR_LEN,
>                                        sizeof(struct timeval));
>                                 /* TODO This should be changed to use an
> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>         }
>
>         /* init counters */
> -       odp_atomic_init_u64(&counters.seq);
> -       odp_atomic_init_u64(&counters.ip);
> -       odp_atomic_init_u64(&counters.udp);
> -       odp_atomic_init_u64(&counters.icmp);
> +       odp_counter64_init(&counters.seq, 0);
> +       odp_counter64_init(&counters.ip, 0);
> +       odp_counter64_init(&counters.udp, 0);
> +       odp_counter64_init(&counters.icmp, 0);
>
>         /* Reserve memory for args from shared mem */
>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
> index 2f2dc19..76c27d0 100644
> --- a/example/ipsec/odp_ipsec.c
> +++ b/example/ipsec/odp_ipsec.c
> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>         printf("Num worker threads: %i\n", num_workers);
>
>         /* Create a barrier to synchronize thread startup */
> -       odp_barrier_init_count(&sync_barrier, num_workers);
> +       odp_barrier_init(&sync_barrier, num_workers);
>
>         /*
>          * By default core #0 runs Linux kernel background tasks.
> diff --git a/example/odp_example/odp_example.c
> b/example/odp_example/odp_example.c
> index 0e9aa3d..c473395 100644
> --- a/example/odp_example/odp_example.c
> +++ b/example/odp_example/odp_example.c
> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>         odp_shm_print_all();
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&globals->barrier, num_workers);
> +       odp_barrier_init(&globals->barrier, num_workers);
>
>         if (args.proc_mode) {
>                 int ret;
> diff --git a/example/timer/odp_timer_test.c
> b/example/timer/odp_timer_test.c
> index 78b2ae2..dfbeae9 100644
> --- a/example/timer/odp_timer_test.c
> +++ b/example/timer/odp_timer_test.c
> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>         printf("\n");
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&test_barrier, num_workers);
> +       odp_barrier_init(&test_barrier, num_workers);
>
>         /* Create and launch worker threads */
>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
> index 76c1db8..5e78b34 100644
> --- a/helper/include/odph_ring.h
> +++ b/helper/include/odph_ring.h
> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>                 uint32_t sp_enqueue;     /* True, if single producer. */
>                 uint32_t size;           /* Size of ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Producer head. */
> -               uint32_t tail;          /* Producer tail. */
> +               odp_atomic32_t head;    /* Producer head. */
> +               odp_atomic32_t tail;    /* Producer tail. */
>         } prod ODP_ALIGNED_CACHE;
>
>         /** @private Consumer */
> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>                 uint32_t size;           /* Size of the ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Consumer head. */
> -               uint32_t tail;          /* Consumer tail. */
> +               odp_atomic32_t head;    /* Consumer head. */
> +               odp_atomic32_t tail;    /* Consumer tail. */
>         } cons ODP_ALIGNED_CACHE;
>
>         /** @private Memory space of ring starts here. */
> diff --git a/platform/linux-generic/include/api/odp.h
> b/platform/linux-generic/include/api/odp.h
> index 0ee3faf..d124d52 100644
> --- a/platform/linux-generic/include/api/odp.h
> +++ b/platform/linux-generic/include/api/odp.h
> @@ -32,6 +32,7 @@ extern "C" {
>  #include <odp_barrier.h>
>  #include <odp_spinlock.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>
>  #include <odp_init.h>
>  #include <odp_system_info.h>
> diff --git a/platform/linux-generic/include/api/odp_atomic.h
> b/platform/linux-generic/include/api/odp_atomic.h
> index 0cc4cf4..ccdd096 100644
> --- a/platform/linux-generic/include/api/odp_atomic.h
> +++ b/platform/linux-generic/include/api/odp_atomic.h
> @@ -4,464 +4,494 @@
>   * SPDX-License-Identifier:     BSD-3-Clause
>   */
>
> -
>  /**
>   * @file
>   *
> - * ODP atomic operations
> + * ODP atomic types and operations, semantically a subset of C11 atomics.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Atomic functions must be used to operate on atomic variables!
>   */
>
>  #ifndef ODP_ATOMIC_H_
>  #define ODP_ATOMIC_H_
>
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +#include <odp_debug.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
>
> -
> -#include <odp_std_types.h>
> -
> -
> -/**
> - * Atomic integer
> - */
> -typedef volatile int32_t odp_atomic_int_t;
> -
> -/**
> - * Atomic unsigned integer 64 bits
> - */
> -typedef volatile uint64_t odp_atomic_u64_t;
> -
> -/**
> - * Atomic unsigned integer 32 bits
> - */
> -typedef volatile uint32_t odp_atomic_u32_t;
> -
> -
> -/**
> - * Initialize atomic integer
> - *
> - * @param ptr    An integer atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic integer
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic integer value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic integer
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
> new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic integer
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic integer
> - *
> - * @param ptr    An atomic integer variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement atomic integer by 1
> - *
> - * @param ptr    An atomic int variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Initialize atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic uint32 value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic uint32
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
> -                                       uint32_t new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be sub to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
>  /**
> - * Fetch and increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __OCTEON__
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       uint32_t ret;
> -
> -       __asm__ __volatile__ ("syncws");
> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
> -                             "r" (ptr));
> -
> -       return ret;
> -}
> -
> + * 32-bit (unsigned) atomic type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the atomic variable */
> +} odp_atomic32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignement! */
> +
> +typedef enum {
> +       /** Relaxed memory order, no ordering of other accesses enforced */
> +       ODP_MEMORDER_RLX,
> +       /** Acquire memory order, later accesses cannot move before
> +        * acquire operation */
> +       ODP_MEMORDER_ACQ,
> +       /** Release memory order, earlier accesses cannot move after
> +        * release operation */
> +       ODP_MEMORDER_RLS
> +} odp_memorder_t;
> +
>
> +/*****************************************************************************
> + * Just some private helpers
>
> +*****************************************************************************/
> +
> +#ifdef __OCTEON__
> +/* OCTEON Write Memory Barrier */
> +#define COMPILER_HW_BARRIER() __asm __volatile( \
> +       /* Double syncw to work around errata */ \
> +       "syncw\n\tsyncw" : : : )
>  #else
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> +/** Compiler and hardware full memory barrier */
> +#define COMPILER_HW_BARRIER() __sync_synchronize()
> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a
> */
>  #endif
>
> -/**
> - * Increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Atomic compare and set for 32bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> +#define MEMORY "memory"
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomics
> + * odp_atomic32_init - no return value
> + * odp_atomic32_load - return current value
> + * odp_atomic32_store - no return value
> + * odp_atomic32_cmp_xchg_weak - return bool
> + * odp_atomic32_fetch_add - return old value
> + * odp_atomic32_add - no return value
> + * odp_atomic32_fetch_inc - return old value
> + * odp_atomic32_inc - no return value
> + * odp_atomic32_fetch_dec - return old value
> + * odp_atomic32_dec - no return value
> +
> *****************************************************************************/
> +
> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable,
> thus we
> +        * can avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic load of 32-bit atomic variable
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param mmodel Memory model associated with the load
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               uint32_t val;
> +               /* Read of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) read
> variable,
> +                * thus we can avoid using compiler memory barriers */
> +               val = *(__volatile const uint32_t *)&ptr->v;
> +               return val;
> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
> +#if defined __aarch64__
> +               uint32_t val;
> +               __asm __volatile("ldar %w0, [%1]"
> +                               : "=&r"(val)
> +                               : "r"(&ptr->v)
> +                               : MEMORY);
> +               return val;
> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
> +               /* Read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               /* Herb Sutter claims HW barrier not needed on x86? */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#else
> +#warning odp_atomic32_load() may not be efficiently implemented
> +               /* Assume read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic store to 32-bit atomic variable
> + *
> + * @param ptr  Pointer to a 32-bit atomic variable
> + * @param val  Value to write to the atomic variable
> + * @param mmodel Memory model associated with the store
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
> + */
> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Write of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) write
> variable,
> +                * thus we will avoid using compiler memory barriers */
> +               *(__volatile uint32_t *)&ptr->v = val;
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
> +               /* Compiler and HW barrier to prevent earlier accesses from
> +                * moving down */
> +               COMPILER_HW_BARRIER();
> +               /* Write of aligned word is atomic */
> +               ptr->v = val;
> +               /* Compiler and HW barrier to prevent this store from
> moving
> +                * down after a later load-acquire and thus create
> overlapping
> +                * critical sections. Herb Sutter thinks this is needed */
> +               COMPILER_HW_BARRIER();
> +#elif defined __aarch64__
> +               __asm __volatile("stlr %w0, [%1]"
> +                               :
> +                               : "r"(val), "r"(&ptr->v)
> +                               : MEMORY);
> +#elif defined __x86_64__
> +               /* This is actually an atomic exchange operation */
> +               /* Generates good code on x86_64 */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#else
> +#warning odp_atomic32_store_rls() may not be efficiently implemented
> +               /* This is actually an atomic exchange operation */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +
> +/**
> + * Atomic compare and exchange (swap) of 32-bit atomic variable
> + * "Weak" semantics, may fail spuriously and must be used in a loop.
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param exp_p Pointer to expected value (updated on failure)
> + * @param val   New value to write
> + * @param mmodel Memory model associated with the compare-and-swap
> + * operation (ODP_MEMORDER_RLX only)
> + *
> + * @return 1 (true) if exchange successful, 0 (false) if not successful
> (and
> + * '*exp_p' updated with current value)
> + */
> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
> +               uint32_t *exp_p,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               int status;
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "cmp   %0, %3\t\n"
> +                                "bne   1f\t\n"
> +                                "strex %1, %4, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "=&r"(status)
> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Clear exclusive access monitor */
> +                       __asm __volatile("clrex");
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* strex returns 0 on success */
> +               if (odp_unlikely(status != 0)) {
> +                       /* strex failed, reservation was disturbed */
> +                       /* Return potentially changed value */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __mips64__
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               uint32_t status = val;
> +               __asm __volatile("llw %0, [%2]\t\n"
> +                                "bne %0, %3, 1f\t\n"
> +                                "scw %1, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "+&r"(status)
> +                               : "r"(&ptr->v), "r"(exp)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* scw returns 1 on success, 0 on failure */
> +               if (odp_unlikely(status == 0)) {
> +                       /* scw failed, reservation was disturbed */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __x86_64__
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#else
> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic fetch and add to 32-bit atomic variable
> + * @note A - B <=> A + (-B)
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param mmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
> + *
> + * @return Value of the atomic variable before the addition
> + */
> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old_val, tmp;
> +               int status;
> +               do {
> +                       __asm __volatile("ldrex %0, [%3]\t\n"
> +                                        "add   %1, %0, %4\t\n"
> +                                        "strex %2, %1, [%3]\t\n"
> +                                       : "=&r"(old_val), "+&r"(tmp),
> +                                         "=&r"(status)
> +                                       : "r"(&ptr->v), "r"(incr)
> +                                       : MEMORY);
> +               } while (odp_unlikely(status != 0));
> +               return old_val;
> +#elif defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               return old_val;
> +#elif defined __x86_64__
> +               /* Generates good code on x86_64 */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#else
> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +               /* __sync_fetch_and_add() will give us barriers before and
> +                * after, we are fine with this for release operations */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
>  }
>
>  /**
> - * Initialize atomic uint64
> + * Atomic add to 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param mmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
>  {
> -       *ptr = 0;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               return;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>  }
>
>  /**
> - * Load value of atomic uint64
> - *
> - * @param ptr    An atomic variable
> + * Atomic fetch and increment of 32-bit atomic variable
>   *
> - * @return atomic uint64 value
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param mmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @note The operation is not synchronized with other threads
> + * @return Value of the atomic variable before the increment
>   */
> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return *ptr;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic increment instructions
> can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>  }
>
>  /**
> - * Store value to atomic uint64
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> + * Atomic increment of 32-bit atomic variable
>   *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param mmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
> -                                       uint64_t new_value)
> -{
> -       *ptr = new_value;
> -}
> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>
> -/**
> - * Add atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - */
> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
>  {
> -       __sync_fetch_and_add(ptr, value);
> +       /* Default to using odp_atomic32_fetch_inc() */
> +       /* Platforms that support atomic increment instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>  }
>
>  /**
> - * Fetch and add atomic uint64
> + * Atomic fetch and decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param mmodel Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @return Value of the variable before the operation
> + * @return Value of the atomic variable before the decrement
>   */
> -
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic decrement instructions
> can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
>  #endif
> -/**
> - * Subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - */
> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
> -{
> -       __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
>  #endif
> -/**
> - * Fetch and increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u64(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       odp_atomic_fetch_add_u64(ptr, 1);
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>  }
>
>  /**
> - * Fetch and decrement atomic uint64 by 1
> + * Atomic decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memorder Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u64(ptr, 1);
> -}
> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t memorder)
>
> -/**
> - * Decrement atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>  {
> -       odp_atomic_fetch_sub_u64(ptr, 1);
> +       /* Default to using odp_atomic32_fetch_dec() */
> +       /* Platforms that support atomic decrement instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>  }
>
> -/**
> - * Atomic compare and set for 64bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> -}
> +/* We are not exporting these macros */
> +#undef COMPILER_HW_BARRIER
> +#undef MEMORY
>
>  #ifdef __cplusplus
>  }
> diff --git a/platform/linux-generic/include/api/odp_barrier.h
> b/platform/linux-generic/include/api/odp_barrier.h
> index a7b3215..69b1eb8 100644
> --- a/platform/linux-generic/include/api/odp_barrier.h
> +++ b/platform/linux-generic/include/api/odp_barrier.h
> @@ -27,18 +27,18 @@ extern "C" {
>   * ODP execution barrier
>   */
>  typedef struct odp_barrier_t {
> -       int              count;  /**< @private Thread count */
> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
> +       uint32_t       num_threads;  /**< @private Thread count (constant)
> */
> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>  } odp_barrier_t;
>
>
>  /**
>   * Init barrier with thread count
>   *
> - * @param barrier    Barrier
> - * @param count      Thread count
> + * @param barrier     Barrier
> + * @param num_threads Number of threads which share the barrier
>   */
> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>
>
>  /**
> diff --git a/platform/linux-generic/include/api/odp_counter.h
> b/platform/linux-generic/include/api/odp_counter.h
> new file mode 100644
> index 0000000..b93c992
> --- /dev/null
> +++ b/platform/linux-generic/include/api/odp_counter.h
> @@ -0,0 +1,363 @@
> +/* Copyright (c) 2013, Linaro Limited
> + * All rights reserved.
> + *
> + * SPDX-License-Identifier:     BSD-3-Clause
> + */
> +
> +/**
> + * @file
> + *
> + * ODP atomic counter types and operations, suitable for e.g. shared
> statistics.
> + * Relaxed memory model assumed for lowest overhead.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Counter functions must be used to operate on counter variables!
> + */
> +
> +#ifndef ODP_COUNTER_H_
> +#define ODP_COUNTER_H_
> +
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * 32-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the counter variable */
> +} odp_counter32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
> +
> +/**
> + * 64-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint64_t v; /**< Actual storage for the counter variable */
> +       /* Room for other data structures (e.g. spin lock) that might be
> +        * needed to ensure atomicity on some architectures */
> +} odp_counter64_t
> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomic counters
> + * odp_counter32_init - returns no value
> + * odp_counter32_read - returns current value
> + * odp_counter32_write - returns no value
> + * odp_counter32_add - returns no value
> + * odp_counter32_read_inc - returns old value
> + * odp_counter32_inc - returns no value
> +
> *****************************************************************************/
> +
> +/**
> + * Initialize 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Read 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
> +{
> +       uint32_t val;
> +       /* Read of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) read variable, thus
> we
> +        * will avoid using compiler memory barriers */
> +       val = *(__volatile const uint32_t *)&ptr->v;
> +       return val;
> +}
> +
> +/**
> + * Write 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Value to write to the variable
> + */
> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable,
> thus we
> +        * will avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic add to 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "add   %0, %0, %3\t\n"
> +                                "strex %1, %0, [%2]"
> +                                : "=&r"(result), "=&r"(status)
> +                                : "r"(&ptr->v), "Ir" (incr)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +#elif defined __OCTEON__
> +       __asm __volatile("saa %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +       /* Warning odp_counter32_add() may not be efficiently implemented
> */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) of 32-bit counter variable, return original value
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%3]\t\n"
> +                                "add   %1, %0, #1\t\n"
> +                                "strex %2, %1, [%3]"
> +                                : "=&r"(result), "=&r"(tmp), "+&r"(status)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +       return result;
> +#elif defined __OCTEON__
> +       uint32_t old_val;
> +       __asm __volatile("lai %0,(%2)"
> +                        : "=r" (old_val), "+m" (ptr)
> +                        : "r" (ptr)
> +                        : );
> +       return old_val;
> +#elif defined __x86_64__
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter32_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + */
> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
> +{
> +#if defined __OCTEON__
> +       odp_counter32_add(ptr, 1);
> +#else
> +       (void)odp_counter32_read_inc(ptr);
> +#endif
> +}
> +
>
> +/*****************************************************************************
> + * Operations on 64-bit atomic counters
> + * odp_counter64_init
> + * odp_counter64_read
> + * odp_counter64_write
> + * odp_counter64_add
> + * odp_counter64_read_inc
> + * odp_counter64_inc
> +
> *****************************************************************************/
> +
> +/**
> + * Read 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Value of the counter variable
> + */
> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t val;
> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
> +                        "clrex" /* Clear exclusive access monitor */
> +                        : "=&r"(val)
> +                        : "r"(&ptr->v)
> +                        : );
> +       return val;
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Read of aligned quad/double word is atomic */
> +       return ptr->v;
> +#else
> +/* Warning odp_counter64_read() may not be efficiently implemented */
> +       return __sync_fetch_and_or(&ptr->v, 0);
> +#endif
> +}
> +
> +/**
> + * Write 64-bit counter variable
> + *
> + * @param ptr  Pointer to a 64-bit counter variable
> + * @param val  Value to write to the counter variable
> + */
> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t val)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               /* Read counter variable exclusively so we can write to it
> +                * later */
> +               /* Attempt to write the new value */
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "strexd %1, %3, %H3, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(val)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Write of aligned quad/double word is atomic */
> +       ptr->v = val;
> +#else
> +/* Warning odp_counter64_write() may not be efficiently implemented */
> +       /* This is actually an atomic exchange operation */
> +       (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +}
> +
> +/**
> + * Initialize 64-bit counter variable
> + * Perform implementation specific initializations, assign initial value.
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       odp_counter64_write(ptr, val);
> +}
> +
> +/**
> + * Atomic add to 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, %0, %3\t\n"
> +                                "adc    %H0, %H3\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(incr)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __OCTEON__
> +       __asm __volatile("saad %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +/* Warning odp_counter64_add() may not be efficiently implemented */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable and return original value
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
> +                                "adds   %2, %0, #1\t\n"
> +                                "adc    %H2, %H0, #0\t\n"
> +                                "strexd %1, %2, %H2, [%3]"
> +                                : "=&r"(old_val), "=&r"(status),
> "=&r"(tmp)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +       return old_val;
> +#elif defined __OCTEON__
> +       uint64_t old_val;
> +       __asm __volatile("laid %0,(%2)"
> +                       : "=r" (old_val), "+m" (ptr)
> +                       : "r" (ptr)
> +                       : );
> +       return old_val;
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter64_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + */
> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, #1\t\n"
> +                                "adc    %H0, #0\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#else
> +       (void)odp_counter64_read_inc(ptr);
> +#endif
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
> b/platform/linux-generic/include/api/odp_rwlock.h
> index 252ebb2..ff8a9a2 100644
> --- a/platform/linux-generic/include/api/odp_rwlock.h
> +++ b/platform/linux-generic/include/api/odp_rwlock.h
> @@ -10,26 +10,30 @@
>  /**
>   * @file
>   *
> - * ODP RW Locks
> + * ODP read/write lock
> + * An RW lock supports multiple concurrent readers but only one (exclusive)
> writer.
>   */
>
> +#include <odp_atomic.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
>
>  /**
>   * The odp_rwlock_t type.
> - * write lock count is -1,
> - * read lock count > 0
> + * write lock is ~0U
> + * read lock count >0 && <~0U
>   */
>  typedef struct {
> -       volatile int32_t cnt; /**< -1 Write lock,
> -                               > 0 for Read lock. */
> +       odp_atomic32_t cnt; /**< == 0: unlocked,
> +                                == ~0: locked for write,
> +                                > 0 number of concurrent read locks */
>  } odp_rwlock_t;
>
>
>  /**
> - * Initialize the rwlock to an unlocked state.
> + * Initialize the rwlock to the unlocked state.
>   *
>   * @param rwlock pointer to the RW Lock.
>   */
> @@ -50,14 +54,14 @@ void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
>
>  /**
> - * Aquire a write lock.
> + * Acquire the write lock.
>   *
>   * @param rwlock pointer to a RW Lock.
>   */
>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
>
>  /**
> - * Release a write lock.
> + * Release the write lock.
>   *
>   * @param rwlock pointer to a RW Lock.
>   */
> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
> b/platform/linux-generic/include/api/odp_ticketlock.h
> index 6277a18..5933f85 100644
> --- a/platform/linux-generic/include/api/odp_ticketlock.h
> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
> @@ -21,14 +21,15 @@ extern "C" {
>
>  #include <odp_std_types.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>
>
>  /**
>   * ODP ticketlock
>   */
>  typedef struct odp_ticketlock_t {
> -       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
> -       volatile uint32_t cur_ticket;  /**< @private Current ticket */
> +       odp_counter32_t next_ticket; /**< @private Next ticket */
> +       odp_atomic32_t cur_ticket;  /**< @private Current ticket */
>  } odp_ticketlock_t;
>
>
> diff --git a/platform/linux-generic/include/odp_buffer_internal.h
> b/platform/linux-generic/include/odp_buffer_internal.h
> index 2002b51..530ab96 100644
> --- a/platform/linux-generic/include/odp_buffer_internal.h
> +++ b/platform/linux-generic/include/odp_buffer_internal.h
> @@ -88,7 +88,7 @@ typedef struct odp_buffer_hdr_t {
>         uint32_t                 index;      /* buf index in the pool */
>         size_t                   size;       /* max data size */
>         size_t                   cur_offset; /* current offset */
> -       odp_atomic_int_t         ref_count;  /* reference count */
> +       odp_atomic32_t           ref_count;  /* reference count */
>         odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
>         int                      type;       /* type of next header */
>         odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
> diff --git a/platform/linux-generic/include/odp_spin_internal.h
> b/platform/linux-generic/include/odp_spin_internal.h
> index b7e2071..29c524f 100644
> --- a/platform/linux-generic/include/odp_spin_internal.h
> +++ b/platform/linux-generic/include/odp_spin_internal.h
> @@ -15,15 +15,6 @@ extern "C" {
>
>
>  /**
> - * GCC memory barrier for ODP internal use
> - */
> -static inline void odp_mem_barrier(void)
> -{
> -       __asm__ __volatile__ ("" : : : "memory");
> -}
> -
> -
> -/**
>   * Spin loop for ODP internal use
>   */
>  static inline void odp_spin(void)
> diff --git a/platform/linux-generic/odp_barrier.c
> b/platform/linux-generic/odp_barrier.c
> index a82b294..10368b5 100644
> --- a/platform/linux-generic/odp_barrier.c
> +++ b/platform/linux-generic/odp_barrier.c
> @@ -8,41 +8,52 @@
>  #include <odp_sync.h>
>  #include <odp_spin_internal.h>
>
> -void odp_barrier_init_count(odp_barrier_t *barrier, int count)
> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
>  {
> -       barrier->count = count;
> -       barrier->bar = 0;
> -       odp_sync_stores();
> +       barrier->num_threads = num_threads; /* Constant after
> initialisation */
> +       odp_atomic32_init(&barrier->in_barrier, 0);
>  }
>
>  /*
>   * Efficient barrier_sync -
>   *
>   *   Barriers are initialized with a count of the number of callers
> - *   that must sync on the barrier before any may proceed.
> + *   that must sync on (enter) the barrier before any may proceed (exit).
>   *
>   *   To avoid race conditions and to permit the barrier to be fully
> - *   reusable, the barrier value cycles between 0..2*count-1. When
> - *   synchronizing the wasless variable simply tracks which half of
> + *   reusable, the barrier value cycles between 0..2*count-1 (temporarily
> + *   hitting 2*count before being wrapped). When
> + *   synchronizing, the waslow variable simply tracks which half of
>   *   the cycle the barrier was in upon entry.  Exit is when the
>   *   barrier crosses to the other half of the cycle.
>   */
>
>  void odp_barrier_sync(odp_barrier_t *barrier)
>  {
> -       int count;
> -       int wasless;
> +       uint32_t count;
> +       bool waslow;
>
> -       odp_sync_stores();
> -       wasless = barrier->bar < barrier->count;
> -       count = odp_atomic_fetch_inc_int(&barrier->bar);
> +       /* We need both acquire and release barriers but does the order
> +        * matter? Here we start with release and end with acquire. */
>
> -       if (count == 2*barrier->count-1) {
> -               barrier->bar = 0;
> -       } else {
> -               while ((barrier->bar < barrier->count) == wasless)
> -                       odp_spin();
> -       }
> +       /* Increase threads in_barrier count, this will automatically release
> +        * the other threads when lower/upper range is switched */
> +       count = odp_atomic32_fetch_add(&barrier->in_barrier, 1,
> +                                      ODP_MEMORDER_RLS);
> +       /* Compute lower or higher range indicator */
> +       waslow = count < barrier->num_threads;
>
> -       odp_mem_barrier();
> +       /* Check if in_barrier count should wrap */
> +       if (count == 2 * barrier->num_threads - 1) {
> +               /* Manually wrap the counter */
> +               odp_atomic32_add(&barrier->in_barrier,
> +                                -2 * barrier->num_threads,
> +                                ODP_MEMORDER_RLX);
> +               /* Fall-through the final part for the acquire barrier */
> +       }
> +       /* Wait for counter to change half */
> +       while ((odp_atomic32_load(&barrier->in_barrier, ODP_MEMORDER_ACQ) <
> +              barrier->num_threads) == waslow) {
> +               odp_spin();
> +       }
>  }
> diff --git a/platform/linux-generic/odp_buffer.c
> b/platform/linux-generic/odp_buffer.c
> index e54e0e7..fc3506b 100644
> --- a/platform/linux-generic/odp_buffer.c
> +++ b/platform/linux-generic/odp_buffer.c
> @@ -73,7 +73,8 @@ int odp_buffer_snprint(char *str, size_t n, odp_buffer_t
> buf)
>         len += snprintf(&str[len], n-len,
>                         "  cur_offset   %zu\n",       hdr->cur_offset);
>         len += snprintf(&str[len], n-len,
> -                       "  ref_count    %i\n",        hdr->ref_count);
> +                       "  ref_count    %u\n",
> +                       odp_atomic32_load(&hdr->ref_count,
> ODP_MEMORDER_RLX));
>         len += snprintf(&str[len], n-len,
>                         "  type         %i\n",        hdr->type);
>         len += snprintf(&str[len], n-len,
> diff --git a/platform/linux-generic/odp_crypto.c
> b/platform/linux-generic/odp_crypto.c
> index b37ad6b..75b4ce0 100644
> --- a/platform/linux-generic/odp_crypto.c
> +++ b/platform/linux-generic/odp_crypto.c
> @@ -6,7 +6,7 @@
>
>  #include <odp_crypto.h>
>  #include <odp_internal.h>
> -#include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_spinlock.h>
>  #include <odp_sync.h>
>  #include <odp_debug.h>
> @@ -26,7 +26,7 @@
>  #define MAX_SESSIONS 32
>
>  typedef struct {
> -       odp_atomic_u32_t next;
> +       odp_counter32_t   next;
>         uint32_t         max;
>         odp_crypto_generic_session_t sessions[0];
>  } odp_crypto_global_t;
> @@ -58,7 +58,7 @@ odp_crypto_generic_session_t *alloc_session(void)
>         uint32_t idx;
>         odp_crypto_generic_session_t *session = NULL;
>
> -       idx = odp_atomic_fetch_inc_u32(&global->next);
> +       idx = odp_counter32_read_inc(&global->next);
>         if (idx < global->max) {
>                 session = &global->sessions[idx];
>                 session->index = idx;
> @@ -420,6 +420,7 @@ odp_crypto_init_global(void)
>
>         /* Initialize it */
>         global->max = MAX_SESSIONS;
> +       odp_counter32_init(&global->next, 0);
>
>         return 0;
>  }
> diff --git a/platform/linux-generic/odp_queue.c
> b/platform/linux-generic/odp_queue.c
> index 1318bcd..08c0d29 100644
> --- a/platform/linux-generic/odp_queue.c
> +++ b/platform/linux-generic/odp_queue.c
> @@ -214,8 +214,13 @@ int odp_queue_set_context(odp_queue_t handle, void
> *context)
>  {
>         queue_entry_t *queue;
>         queue = queue_to_qentry(handle);
> +       /* Setting a new queue context can be viewed as a release operation,
> +        * all writes to the context must be observable before the context
> +        * is made observable */
>         odp_sync_stores();
> -       queue->s.param.context = context;
> +       queue->s.param.context = context; /* Store-release */
> +       /* Ensure queue modification is globally visible before we return
> +        * and the application might cause the queue to be scheduled */
>         odp_sync_stores();
>         return 0;
>  }
> diff --git a/platform/linux-generic/odp_ring.c
> b/platform/linux-generic/odp_ring.c
> index 632aa66..e5b9c23 100644
> --- a/platform/linux-generic/odp_ring.c
> +++ b/platform/linux-generic/odp_ring.c
> @@ -187,10 +187,10 @@ odph_ring_create(const char *name, unsigned count,
> unsigned flags)
>                 r->cons.size = count;
>                 r->prod.mask = count-1;
>                 r->cons.mask = count-1;
> -               r->prod.head = 0;
> -               r->cons.head = 0;
> -               r->prod.tail = 0;
> -               r->cons.tail = 0;
> +               odp_atomic32_init(&r->prod.head, 0);
> +               odp_atomic32_init(&r->cons.head, 0);
> +               odp_atomic32_init(&r->prod.tail, 0);
> +               odp_atomic32_init(&r->cons.tail, 0);
>
>                 TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
>         } else {
> @@ -227,7 +227,7 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>         uint32_t prod_head, prod_next;
>         uint32_t cons_tail, free_entries;
>         const unsigned max = n;
> -       int success;
> +       bool success;
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>         int ret;
> @@ -237,8 +237,8 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>                 /* Reset n to the initial burst count */
>                 n = max;
>
> -               prod_head = r->prod.head;
> -               cons_tail = r->cons.tail;
> +               prod_head = odp_atomic32_load(&r->prod.head,
> ODP_MEMORDER_RLX);
> +               cons_tail = odp_atomic32_load(&r->cons.tail,
> ODP_MEMORDER_ACQ);
>                 /* The subtraction is done between two unsigned 32bits
> value
>                  * (the result is always modulo 32 bits even if we have
>                  * prod_head > cons_tail). So 'free_entries' is always
> between 0
> @@ -259,13 +259,14 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>                 }
>
>                 prod_next = prod_head + n;
> -               success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
> -                                             prod_next);
> -       } while (odp_unlikely(success == 0));
> +               success = odp_atomic32_cmp_xchg_weak(&r->prod.head,
> +                                                    &prod_head,
> +                                                    prod_next,
> +                                                    ODP_MEMORDER_RLX);
> +       } while (odp_unlikely(!success));
>
>         /* write entries in ring */
>         ENQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /* if we exceed the watermark */
>         if (odp_unlikely(((mask + 1) - free_entries + n) >
> r->prod.watermark)) {
> @@ -279,10 +280,11 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>          * If there are other enqueues in progress that preceeded us,
>          * we need to wait for them to complete
>          */
> -       while (odp_unlikely(r->prod.tail != prod_head))
> +       while (odp_unlikely(odp_atomic32_load(&r->prod.tail,
> +                                             ODP_MEMORDER_RLX) !=
> prod_head))
>                 odp_spin();
>
> -       r->prod.tail = prod_next;
> +       odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
>         return ret;
>  }
>
> @@ -298,8 +300,8 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>         uint32_t mask = r->prod.mask;
>         int ret;
>
> -       prod_head = r->prod.head;
> -       cons_tail = r->cons.tail;
> +       prod_head = odp_atomic32_load(&r->prod.head, ODP_MEMORDER_RLX);
> +       cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_ACQ);
>         /* The subtraction is done between two unsigned 32bits value
>          * (the result is always modulo 32 bits even if we have
>          * prod_head > cons_tail). So 'free_entries' is always between 0
> @@ -320,11 +322,10 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>         }
>
>         prod_next = prod_head + n;
> -       r->prod.head = prod_next;
> +       odp_atomic32_store(&r->prod.head, prod_next, ODP_MEMORDER_RLX);
>
>         /* write entries in ring */
>         ENQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /* if we exceed the watermark */
>         if (odp_unlikely(((mask + 1) - free_entries + n) >
> r->prod.watermark)) {
> @@ -334,7 +335,7 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>                 ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
>         }
>
> -       r->prod.tail = prod_next;
> +       odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
>         return ret;
>  }
>
> @@ -348,7 +349,7 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>         uint32_t cons_head, prod_tail;
>         uint32_t cons_next, entries;
>         const unsigned max = n;
> -       int success;
> +       bool success;
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>
> @@ -357,8 +358,8 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>                 /* Restore n as it may change every loop */
>                 n = max;
>
> -               cons_head = r->cons.head;
> -               prod_tail = r->prod.tail;
> +               cons_head = odp_atomic32_load(&r->cons.head,
> ODP_MEMORDER_RLX);
> +               prod_tail = odp_atomic32_load(&r->prod.tail,
> ODP_MEMORDER_ACQ);
>                 /* The subtraction is done between two unsigned 32bits
> value
>                  * (the result is always modulo 32 bits even if we have
>                  * cons_head > prod_tail). So 'entries' is always between 0
> @@ -378,22 +379,24 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>                 }
>
>                 cons_next = cons_head + n;
> -               success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
> -                                             cons_next);
> -       } while (odp_unlikely(success == 0));
> +               success = odp_atomic32_cmp_xchg_weak(&r->cons.head,
> +                                                    &cons_head,
> +                                                    cons_next,
> +                                                    ODP_MEMORDER_RLX);
> +       } while (odp_unlikely(!success));
>
>         /* copy in table */
>         DEQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /*
>          * If there are other dequeues in progress that preceded us,
>          * we need to wait for them to complete
>          */
> -       while (odp_unlikely(r->cons.tail != cons_head))
> +       while (odp_unlikely(odp_atomic32_load(&r->cons.tail,
> +                                             ODP_MEMORDER_RLX) !=
> cons_head))
>                 odp_spin();
>
> -       r->cons.tail = cons_next;
> +       odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
>
>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>  }
> @@ -409,8 +412,8 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>
> -       cons_head = r->cons.head;
> -       prod_tail = r->prod.tail;
> +       cons_head = odp_atomic32_load(&r->cons.head, ODP_MEMORDER_RLX);
> +       prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_ACQ);
>         /* The subtraction is done between two unsigned 32bits value
>          * (the result is always modulo 32 bits even if we have
>          * cons_head > prod_tail). So 'entries' is always between 0
> @@ -429,13 +432,12 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>         }
>
>         cons_next = cons_head + n;
> -       r->cons.head = cons_next;
> +       odp_atomic32_store(&r->cons.head, cons_next, ODP_MEMORDER_RLX);
>
>         /* copy in table */
>         DEQUEUE_PTRS();
> -       odp_mem_barrier();
>
> -       r->cons.tail = cons_next;
> +       odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>  }
>
> @@ -482,8 +484,8 @@ int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void
> **obj_table, unsigned n)
>   */
>  int odph_ring_full(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
> ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
> ODP_MEMORDER_RLX);
>         return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
>  }
>
> @@ -492,8 +494,8 @@ int odph_ring_full(const odph_ring_t *r)
>   */
>  int odph_ring_empty(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
> ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
> ODP_MEMORDER_RLX);
>         return !!(cons_tail == prod_tail);
>  }
>
> @@ -502,8 +504,8 @@ int odph_ring_empty(const odph_ring_t *r)
>   */
>  unsigned odph_ring_count(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
> ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
> ODP_MEMORDER_RLX);
>         return (prod_tail - cons_tail) & r->prod.mask;
>  }
>
> @@ -512,8 +514,8 @@ unsigned odph_ring_count(const odph_ring_t *r)
>   */
>  unsigned odph_ring_free_count(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
> ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
> ODP_MEMORDER_RLX);
>         return (cons_tail - prod_tail - 1) & r->prod.mask;
>  }
>
> @@ -523,10 +525,14 @@ void odph_ring_dump(const odph_ring_t *r)
>         ODP_DBG("ring <%s>@%p\n", r->name, r);
>         ODP_DBG("  flags=%x\n", r->flags);
>         ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
> -       ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
> -       ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
> -       ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
> -       ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
> +       ODP_DBG("  ct=%"PRIu32"\n", odp_atomic32_load(&r->cons.tail,
> +                                                     ODP_MEMORDER_RLX));
> +       ODP_DBG("  ch=%"PRIu32"\n", odp_atomic32_load(&r->cons.head,
> +                                                     ODP_MEMORDER_RLX));
> +       ODP_DBG("  pt=%"PRIu32"\n", odp_atomic32_load(&r->prod.tail,
> +                                                     ODP_MEMORDER_RLX));
> +       ODP_DBG("  ph=%"PRIu32"\n", odp_atomic32_load(&r->prod.head,
> +                                                     ODP_MEMORDER_RLX));
>         ODP_DBG("  used=%u\n", odph_ring_count(r));
>         ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
>         if (r->prod.watermark == r->prod.size)
> diff --git a/platform/linux-generic/odp_rwlock.c
> b/platform/linux-generic/odp_rwlock.c
> index 11c8dd7..a5fae4d 100644
> --- a/platform/linux-generic/odp_rwlock.c
> +++ b/platform/linux-generic/odp_rwlock.c
> @@ -4,58 +4,64 @@
>   * SPDX-License-Identifier:     BSD-3-Clause
>   */
>
> +#include <stdbool.h>
>  #include <odp_atomic.h>
>  #include <odp_rwlock.h>
> -
>  #include <odp_spin_internal.h>
>
>  void odp_rwlock_init(odp_rwlock_t *rwlock)
>  {
> -       rwlock->cnt = 0;
> +       odp_atomic32_init(&rwlock->cnt, 0);
>  }
>
>  void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
>  {
> -       int32_t cnt;
> -       int  is_locked = 0;
> -
> -       while (is_locked == 0) {
> -               cnt = rwlock->cnt;
> -               /* waiting for read lock */
> -               if (cnt < 0) {
> +       bool gotit;
> +       uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
> +       do {
> +               /* Wait for any writer to release lock */
> +               while ((int32_t)cnt < 0) {
>                         odp_spin();
> -                       continue;
> +                       cnt = odp_atomic32_load(&rwlock->cnt,
> +                                               ODP_MEMORDER_RLX);
>                 }
> -               is_locked = odp_atomic_cmpset_u32(
> -                                       (volatile uint32_t *)&rwlock->cnt,
> -                                             cnt, cnt + 1);
> -       }
> +               /* Attempt to take another read lock */
> +               gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
> +                                                  &cnt, cnt + 1,
> +                                                  ODP_MEMORDER_RLX);
> +               /* If operation fails, 'cnt' will contain current value */
> +       } while (!gotit);
>  }
>
>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
>  {
> -       odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
> +       /* Release one read lock by subtracting 1 */
> +       odp_atomic32_dec(&rwlock->cnt, ODP_MEMORDER_RLS);
>  }
>
>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
>  {
> -       int32_t cnt;
> -       int is_locked = 0;
> -
> -       while (is_locked == 0) {
> -               cnt = rwlock->cnt;
> -               /* lock aquired, wait */
> -               if (cnt != 0) {
> +       bool gotit;
> +       uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
> +       do {
> +               /* Wait for all lock holders to release lock */
> +               while (cnt != 0) {
> +                       /* Lock is busy */
>                         odp_spin();
> -                       continue;
> +                       cnt = odp_atomic32_load(&rwlock->cnt,
> +                                               ODP_MEMORDER_RLX);
>                 }
> -               is_locked = odp_atomic_cmpset_u32(
> -                                       (volatile uint32_t *)&rwlock->cnt,
> -                                             0, -1);
> -       }
> +               /* Attempt to take write lock */
> +               gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
> +                                                  &cnt,
> +                                                  (uint32_t)-1,
> +                                                  ODP_MEMORDER_RLX);
> +               /* If operation fails, 'cnt' will contain current value */
> +       } while (!gotit);
>  }
>
>  void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
>  {
> -       odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
> +       /* Release the write lock by adding 1 */
> +       odp_atomic32_inc(&rwlock->cnt, ODP_MEMORDER_RLS);
>  }
> diff --git a/platform/linux-generic/odp_thread.c
> b/platform/linux-generic/odp_thread.c
> index b869b27..652d317 100644
> --- a/platform/linux-generic/odp_thread.c
> +++ b/platform/linux-generic/odp_thread.c
> @@ -11,7 +11,7 @@
>
>  #include <odp_thread.h>
>  #include <odp_internal.h>
> -#include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_config.h>
>  #include <odp_debug.h>
>  #include <odp_shared_memory.h>
> @@ -31,7 +31,7 @@ typedef struct {
>
>  typedef struct {
>         thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
> -       odp_atomic_int_t num;
> +       odp_counter32_t   num;
>
>  } thread_globals_t;
>
> @@ -58,6 +58,7 @@ int odp_thread_init_global(void)
>                 return -1;
>
>         memset(thread_globals, 0, sizeof(thread_globals_t));
> +       odp_counter32_init(&thread_globals->num, 0);
>         return 0;
>  }
>
> @@ -67,7 +68,7 @@ static int thread_id(void)
>         int id;
>         int cpu;
>
> -       id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
> +       id = (int)odp_counter32_read_inc(&thread_globals->num);
>
>         if (id >= ODP_CONFIG_MAX_THREADS) {
>                 ODP_ERR("Too many threads\n");
> @@ -77,7 +78,7 @@ static int thread_id(void)
>         cpu = sched_getcpu();
>
>         if (cpu < 0) {
> -               ODP_ERR("getcpu failed\n");
> +               ODP_ERR("sched_getcpu failed\n");
>                 return -1;
>         }
>
> diff --git a/platform/linux-generic/odp_ticketlock.c
> b/platform/linux-generic/odp_ticketlock.c
> index be5b885..510aa9f 100644
> --- a/platform/linux-generic/odp_ticketlock.c
> +++ b/platform/linux-generic/odp_ticketlock.c
> @@ -6,15 +6,15 @@
>
>  #include <odp_ticketlock.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_sync.h>
>  #include <odp_spin_internal.h>
>
>
>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>  {
> -       ticketlock->next_ticket = 0;
> -       ticketlock->cur_ticket  = 0;
> -       odp_sync_stores();
> +       odp_counter32_init(&ticketlock->next_ticket, 0);
> +       odp_atomic32_init(&ticketlock->cur_ticket, 0);
>  }
>
>
> @@ -22,30 +22,15 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
>  {
>         uint32_t ticket;
>
> -       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
> +       ticket = odp_counter32_read_inc(&ticketlock->next_ticket);
>
> -       while (ticket != ticketlock->cur_ticket)
> +       while (ticket != odp_atomic32_load(&ticketlock->cur_ticket,
> +                                          ODP_MEMORDER_ACQ))
>                 odp_spin();
> -
> -       odp_mem_barrier();
>  }
>
>
>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>  {
> -       odp_sync_stores();
> -
> -       ticketlock->cur_ticket++;
> -
> -#if defined __OCTEON__
> -       odp_sync_stores();
> -#else
> -       odp_mem_barrier();
> -#endif
> -}
> -
> -
> -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
> -{
> -       return ticketlock->cur_ticket != ticketlock->next_ticket;
> +       odp_atomic32_inc(&ticketlock->cur_ticket, ODP_MEMORDER_RLS);
>  }
> diff --git a/platform/linux-generic/odp_timer.c
> b/platform/linux-generic/odp_timer.c
> index 313c713..fffaa44 100644
> --- a/platform/linux-generic/odp_timer.c
> +++ b/platform/linux-generic/odp_timer.c
> @@ -10,6 +10,7 @@
>  #include <odp_buffer_pool_internal.h>
>  #include <odp_internal.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_spinlock.h>
>  #include <odp_sync.h>
>  #include <odp_debug.h>
> @@ -32,8 +33,8 @@ typedef struct {
>
>  typedef struct {
>         int               allocated;
> -       volatile int      active;
> -       volatile uint64_t cur_tick;
> +       odp_atomic32_t    active;
> +       odp_counter64_t   cur_tick;
>         timer_t           timerid;
>         odp_timer_t       timer_hdl;
>         odp_buffer_pool_t pool;
> @@ -150,16 +151,16 @@ static void notify_function(union sigval sigval)
>
>         timer = sigval.sival_ptr;
>
> -       if (timer->active == 0) {
> +       if (odp_atomic32_load(&timer->active, ODP_MEMORDER_RLX) == 0) {
>                 ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
>                 return;
>         }
>
>         /* ODP_DBG("Tick\n"); */
>
> -       cur_tick = timer->cur_tick++;
> -
> -       odp_sync_stores();
> +       /* Increment and read are not atomic but we are the only writer */
> +       odp_counter64_inc(&timer->cur_tick);
> +       cur_tick = odp_counter64_read(&timer->cur_tick);
>
>         tick = &timer->tick[cur_tick % MAX_TICKS];
>
> @@ -308,6 +309,8 @@ odp_timer_t odp_timer_create(const char *name,
> odp_buffer_pool_t pool,
>
>         timer_hdl = id + 1;
>
> +       odp_atomic32_init(&timer->active, 0);
> +       odp_counter64_init(&timer->cur_tick, 0);
>         timer->timer_hdl     = timer_hdl;
>         timer->pool          = pool;
>         timer->resolution_ns = resolution_ns;
> @@ -318,8 +321,7 @@ odp_timer_t odp_timer_create(const char *name,
> odp_buffer_pool_t pool,
>                 timer->tick[i].list = NULL;
>         }
>
> -       timer->active = 1;
> -       odp_sync_stores();
> +       odp_atomic32_store(&timer->active, 1, ODP_MEMORDER_RLS);
>
>         timer_start(timer);
>
> @@ -340,7 +342,7 @@ odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t
> timer_hdl, uint64_t tmo_tick,
>         id = (int)timer_hdl - 1;
>         timer = &odp_timer.timer[id];
>
> -       cur_tick = timer->cur_tick;
> +       cur_tick = odp_counter64_read(&timer->cur_tick);
>         if (tmo_tick <= cur_tick) {
>                 ODP_DBG("timeout too close\n");
>                 return ODP_TIMER_TMO_INVALID;
> @@ -416,7 +418,7 @@ uint64_t odp_timer_current_tick(odp_timer_t timer_hdl)
>         uint32_t id;
>
>         id = timer_hdl - 1;
> -       return odp_timer.timer[id].cur_tick;
> +       return odp_counter64_read(&odp_timer.timer[id].cur_tick);
>  }
>
>  odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
> diff --git a/test/api_test/Makefile.am b/test/api_test/Makefile.am
> index 5104454..478aa6c 100644
> --- a/test/api_test/Makefile.am
> +++ b/test/api_test/Makefile.am
> @@ -1,12 +1,12 @@
>  include $(top_srcdir)/test/Makefile.inc
>
> -bin_PROGRAMS = odp_atomic odp_shm odp_ring odp_timer_ping
> -odp_atomic_LDFLAGS = $(AM_LDFLAGS) -static
> +bin_PROGRAMS = odp_counter odp_shm odp_ring odp_timer_ping
> +odp_counter_LDFLAGS = $(AM_LDFLAGS) -static
>  odp_shm_LDFLAGS = $(AM_LDFLAGS) -static
>  odp_ring_LDFLAGS = $(AM_LDFLAGS) -static
>  odp_timer_ping_LDFLAGS = $(AM_LDFLAGS) -static
>
> -dist_odp_atomic_SOURCES = odp_atomic_test.c odp_common.c
> +dist_odp_counter_SOURCES = odp_counter_test.c odp_common.c
>  dist_odp_shm_SOURCES = odp_shm_test.c odp_common.c
>  dist_odp_ring_SOURCES = odp_ring_test.c odp_common.c
>  dist_odp_timer_ping_SOURCES = odp_timer_ping.c odp_common.c
> diff --git a/test/api_test/odp_atomic_test.c
> b/test/api_test/odp_atomic_test.c
> deleted file mode 100644
> index 9019d4f..0000000
> --- a/test/api_test/odp_atomic_test.c
> +++ /dev/null
> @@ -1,362 +0,0 @@
> -/* Copyright (c) 2013, Linaro Limited
> - * All rights reserved.
> - *
> - * SPDX-License-Identifier:     BSD-3-Clause
> - */
> -
> -#include <string.h>
> -#include <sys/time.h>
> -#include <odp_debug.h>
> -#include <odp_common.h>
> -#include <odp_atomic_test.h>
> -
> -static odp_atomic_int_t a32;
> -static odp_atomic_u32_t a32u;
> -static odp_atomic_u64_t a64u;
> -
> -static odp_atomic_int_t numthrds;
> -
> -static const char * const test_name[] = {
> -       "dummy",
> -       "test atomic basic ops add/sub/inc/dec",
> -       "test atomic inc/dec of signed word",
> -       "test atomic add/sub of signed word",
> -       "test atomic inc/dec of unsigned word",
> -       "test atomic add/sub of unsigned word",
> -       "test atomic inc/dec of unsigned double word",
> -       "test atomic add/sub of unsigned double word"
> -};
> -
> -static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
> -
> -static void usage(void)
> -{
> -       printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
> -              "\t<testcase> is\n"
> -              "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
> -              "\t\t2 - Test inc dec of signed word\n"
> -              "\t\t3 - Test add sub of signed word\n"
> -              "\t\t4 - Test inc dec of unsigned word\n"
> -              "\t\t5 - Test add sub of unsigned word\n"
> -              "\t\t6 - Test inc dec of double word\n"
> -              "\t\t7 - Test add sub of double word\n"
> -              "\t<num of pthread> is optional\n"
> -              "\t\t<1 - 31> - no of pthreads to start\n"
> -              "\t\tif user doesn't specify this option, then\n"
> -              "\t\tno of pthreads created is equivalent to no of cores\n"
> -              "\t\tavailable in the system\n"
> -              "\tExample usage:\n"
> -              "\t\t./odp_atomic -t 2\n"
> -              "\t\t./odp_atomic -t 3 -n 12\n");
> -}
> -
> -void test_atomic_inc_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_int(&a32);
> -}
> -
> -void test_atomic_inc_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_u32(&a32u);
> -}
> -
> -void test_atomic_inc_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_u64(&a64u);
> -}
> -
> -void test_atomic_dec_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_int(&a32);
> -}
> -
> -void test_atomic_dec_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_u32(&a32u);
> -}
> -
> -void test_atomic_dec_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_u64(&a64u);
> -}
> -
> -void test_atomic_add_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_add_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_add_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_sub_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_sub_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_sub_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_inc_dec_32(void)
> -{
> -       test_atomic_inc_32();
> -       test_atomic_dec_32();
> -}
> -
> -void test_atomic_add_sub_32(void)
> -{
> -       test_atomic_add_32();
> -       test_atomic_sub_32();
> -}
> -
> -void test_atomic_inc_dec_u32(void)
> -{
> -       test_atomic_inc_u32();
> -       test_atomic_dec_u32();
> -}
> -
> -void test_atomic_add_sub_u32(void)
> -{
> -       test_atomic_add_u32();
> -       test_atomic_sub_u32();
> -}
> -
> -void test_atomic_inc_dec_64(void)
> -{
> -       test_atomic_inc_64();
> -       test_atomic_dec_64();
> -}
> -
> -void test_atomic_add_sub_64(void)
> -{
> -       test_atomic_add_64();
> -       test_atomic_sub_64();
> -}
> -
> -/**
> - * Test basic atomic operation like
> - * add/sub/increment/decrement operation.
> - */
> -void test_atomic_basic(void)
> -{
> -       test_atomic_inc_32();
> -       test_atomic_dec_32();
> -       test_atomic_add_32();
> -       test_atomic_sub_32();
> -
> -       test_atomic_inc_u32();
> -       test_atomic_dec_u32();
> -       test_atomic_add_u32();
> -       test_atomic_sub_u32();
> -
> -       test_atomic_inc_64();
> -       test_atomic_dec_64();
> -       test_atomic_add_64();
> -       test_atomic_sub_64();
> -}
> -
> -void test_atomic_init(void)
> -{
> -       odp_atomic_init_int(&a32);
> -       odp_atomic_init_u32(&a32u);
> -       odp_atomic_init_u64(&a64u);
> -}
> -
> -void test_atomic_store(void)
> -{
> -       odp_atomic_store_int(&a32, S32_INIT_VAL);
> -       odp_atomic_store_u32(&a32u, U32_INIT_VAL);
> -       odp_atomic_store_u64(&a64u, U64_INIT_VAL);
> -}
> -
> -int test_atomic_validate(void)
> -{
> -       if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
> -               ODP_ERR("Atomic signed 32 usual functions failed\n");
> -               return -1;
> -       }
> -
> -       if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
> -               ODP_ERR("Atomic u32 usual functions failed\n");
> -               return -1;
> -       }
> -
> -       if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
> -               ODP_ERR("Atomic u64 usual functions failed\n");
> -               return -1;
> -       }
> -
> -       return 0;
> -}
> -
> -static void *run_thread(void *arg)
> -{
> -       pthrd_arg *parg = (pthrd_arg *)arg;
> -       int thr;
> -
> -       thr = odp_thread_id();
> -
> -       ODP_DBG("Thread %i starts\n", thr);
> -
> -       odp_atomic_inc_int(&numthrds);
> -
> -       /* Wait here until all pthreads are created */
> -       while (*(volatile int *)&numthrds < parg->numthrds)
> -               ;
> -
> -       gettimeofday(&tv0[thr], NULL);
> -
> -       switch (parg->testcase) {
> -       case TEST_MIX:
> -               test_atomic_basic();
> -               break;
> -       case TEST_INC_DEC_S32:
> -               test_atomic_inc_dec_32();
> -               break;
> -       case TEST_ADD_SUB_S32:
> -               test_atomic_add_sub_32();
> -               break;
> -       case TEST_INC_DEC_U32:
> -               test_atomic_inc_dec_u32();
> -               break;
> -       case TEST_ADD_SUB_U32:
> -               test_atomic_add_sub_u32();
> -               break;
> -       case TEST_INC_DEC_64:
> -               test_atomic_inc_dec_64();
> -               break;
> -       case TEST_ADD_SUB_64:
> -               test_atomic_add_sub_64();
> -               break;
> -       }
> -       gettimeofday(&tv1[thr], NULL);
> -       fflush(NULL);
> -
> -       printf("Time taken in thread %02d to complete op is %lld usec\n",
> thr,
> -              (tv1[thr].tv_sec - tv0[thr].tv_sec) * 1000000ULL +
> -              (tv1[thr].tv_usec - tv0[thr].tv_usec));
> -
> -       return parg;
> -}
> -
> -int main(int argc, char *argv[])
> -{
> -       pthrd_arg thrdarg;
> -       int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
> -       char c;
> -       int result;
> -
> -       if (argc == 1 || argc % 2 == 0) {
> -               usage();
> -               goto err_exit;
> -       }
> -       if (odp_test_global_init() != 0)
> -               goto err_exit;
> -       odp_print_system_info();
> -
> -       while (cnt != 0) {
> -               sscanf(argv[++i], "-%c", &c);
> -               switch (c) {
> -               case 't':
> -                       sscanf(argv[++i], "%d", &test_type);
> -                       break;
> -               case 'n':
> -                       sscanf(argv[++i], "%d", &pthrdnum);
> -                       break;
> -               default:
> -                       ODP_ERR("Invalid option %c\n", c);
> -                       usage();
> -                       goto err_exit;
> -               }
> -               if (test_type < TEST_MIX || test_type > TEST_MAX ||
> -                   pthrdnum > odp_sys_core_count()) {
> -                       usage();
> -                       goto err_exit;
> -               }
> -               cnt -= 2;
> -       }
> -       if (pthrdnum == 0)
> -               pthrdnum = odp_sys_core_count();
> -
> -       odp_atomic_init_int(&numthrds);
> -       test_atomic_init();
> -       test_atomic_store();
> -
> -       memset(&thrdarg, 0, sizeof(pthrd_arg));
> -       thrdarg.testcase = test_type;
> -       thrdarg.numthrds = pthrdnum;
> -
> -       if ((test_type > 0) && (test_type < TEST_MAX)) {
> -               printf("%s\n", test_name[test_type]);
> -       } else {
> -               ODP_ERR("Invalid test case [%d]\n", test_type);
> -               usage();
> -               goto err_exit;
> -       }
> -       odp_test_thread_create(run_thread, &thrdarg);
> -
> -       odp_test_thread_exit(&thrdarg);
> -
> -       result = test_atomic_validate();
> -
> -       if (result == 0) {
> -               printf("%s_%d_%d Result:pass\n",
> -                      test_name[test_type], test_type, pthrdnum);
> -       } else {
> -               printf("%s_%d_%d Result:fail\n",
> -                      test_name[test_type], test_type, pthrdnum);
> -       }
> -       return 0;
> -
> -err_exit:
> -       return -1;
> -}
> diff --git a/test/api_test/odp_atomic_test.h
> b/test/api_test/odp_atomic_test.h
> deleted file mode 100644
> index 7814da5..0000000
> --- a/test/api_test/odp_atomic_test.h
> +++ /dev/null
> @@ -1,60 +0,0 @@
> -/* Copyright (c) 2013, Linaro Limited
> - * All rights reserved.
> - *
> - * SPDX-License-Identifier:     BSD-3-Clause
> - */
> -
> -#ifndef ODP_ATOMIC_TEST_H_
> -#define ODP_ATOMIC_TEST_H_
> -
> -#include <odp.h>
> -#include <odph_linux.h>
> -
> -/**
> - * add_sub_cnt could be any valid value
> - * so to excercise explicit atomic_add/sub
> - * ops. For now using 5..
> - */
> -#define ADD_SUB_CNT    5
> -
> -#define        CNT 500000
> -#define        S32_INIT_VAL    (1UL << 10)
> -#define        U32_INIT_VAL    (1UL << 10)
> -#define        U64_INIT_VAL    (1ULL << 33)
> -
> -typedef enum {
> -       TEST_MIX = 1, /* Must be first test case num */
> -       TEST_INC_DEC_S32,
> -       TEST_ADD_SUB_S32,
> -       TEST_INC_DEC_U32,
> -       TEST_ADD_SUB_U32,
> -       TEST_INC_DEC_64,
> -       TEST_ADD_SUB_64,
> -       TEST_MAX,
> -} odp_test_atomic_t;
> -
> -
> -void test_atomic_inc_dec_32(void);
> -void test_atomic_add_sub_32(void);
> -void test_atomic_inc_dec_u32(void);
> -void test_atomic_add_sub_u32(void);
> -void test_atomic_inc_dec_64(void);
> -void test_atomic_add_sub_64(void);
> -void test_atomic_inc_32(void);
> -void test_atomic_dec_32(void);
> -void test_atomic_add_32(void);
> -void test_atomic_sub_32(void);
> -void test_atomic_inc_u32(void);
> -void test_atomic_dec_u32(void);
> -void test_atomic_add_u32(void);
> -void test_atomic_sub_u32(void);
> -void test_atomic_inc_64(void);
> -void test_atomic_dec_64(void);
> -void test_atomic_add_64(void);
> -void test_atomic_sub_64(void);
> -void test_atomic_init(void);
> -void test_atomic_basic(void);
> -void test_atomic_store(void);
> -int test_atomic_validate(void);
> -
> -#endif /* ODP_ATOMIC_TEST_H_ */
> diff --git a/test/api_test/odp_common.c b/test/api_test/odp_common.c
> index ed1fc97..198fe8f 100644
> --- a/test/api_test/odp_common.c
> +++ b/test/api_test/odp_common.c
> @@ -14,7 +14,6 @@
>  #include <odp.h>
>  #include <odph_linux.h>
>  #include <odp_common.h>
> -#include <odp_atomic_test.h>
>  #include <odp_shm_test.h>
>
>
> diff --git a/test/api_test/odp_counter_test.c
> b/test/api_test/odp_counter_test.c
> new file mode 100644
> index 0000000..c72328e
> --- /dev/null
> +++ b/test/api_test/odp_counter_test.c
> @@ -0,0 +1,361 @@
> +/* Copyright (c) 2013, Linaro Limited
> + * All rights reserved.
> + *
> + * SPDX-License-Identifier:     BSD-3-Clause
> + */
> +
> +#include <string.h>
> +#include <sys/time.h>
> +#include <odp.h>
> +#include <odp_debug.h>
> +#include <odp_common.h>
> +#include <odph_linux.h>
> +
> +/**
> + * add_sub_cnt could be any valid value
> + * so to exercise explicit atomic_add/sub
> + * ops. For now using 5..
> + */
> +#define ADD_SUB_CNT    5
> +
> +#define        CNT 500000
> +#define        U32_INIT_VAL    (1UL << 10)
> +#define        U64_INIT_VAL    (1ULL << 33)
> +
> +typedef enum {
> +       TEST_MIX = 1, /* Must be first test case num */
> +       TEST_INC_DEC_U32 = 2,
> +       TEST_ADD_SUB_U32 = 3,
> +       TEST_INC_DEC_64 = 4,
> +       TEST_ADD_SUB_64 = 5,
> +       TEST_MAX,
> +} odp_test_counter_t;
> +
> +
> +static uint32_t test_counter_inc_dec_u32(void);
> +static uint32_t test_counter_add_sub_u32(void);
> +static uint32_t test_counter_inc_dec_64(void);
> +static uint32_t test_counter_add_sub_64(void);
> +static uint32_t test_counter_inc_u32(void);
> +static uint32_t test_counter_dec_u32(void);
> +static uint32_t test_counter_add_u32(void);
> +static uint32_t test_counter_sub_u32(void);
> +static uint32_t test_counter_inc_64(void);
> +static uint32_t test_counter_dec_64(void);
> +static uint32_t test_counter_add_64(void);
> +static uint32_t test_counter_sub_64(void);
> +static void test_counter_init(void);
> +static uint32_t test_counter_basic(void);
> +static void test_counter_write(void);
> +static int test_counter_validate(void);
> +
> +static odp_counter32_t a32u;
> +static odp_counter64_t a64u;
> +
> +static odp_barrier_t barrier;
> +
> +static const char * const test_name[] = {
> +       "dummy",
> +       "test atomic counter basic ops add/sub/inc/dec",
> +       "test atomic inc/dec of 32-bit counter",
> +       "test atomic add/sub of 32-bit counter",
> +       "test atomic inc/dec of 64-bit counter",
> +       "test atomic add/sub of 64-bit counter"
> +};
> +
> +static uint64_t accops[MAX_WORKERS];
> +
> +static void usage(void)
> +{
> +       printf("\n./odp_counter -t <testcase> -n <num of threads>\n\n"
> +              "\t<testcase> is\n"
> +              "\t\t1 - Test mix (inc/dec/add/sub on 32- and 64-bit
> counters)\n"
> +              "\t\t2 - Test inc/dec of 32-bit counter\n"
> +              "\t\t3 - Test add/sub of 32-bit counter\n"
> +              "\t\t4 - Test inc/dec of 64-bit counter\n"
> +              "\t\t5 - Test add/sub of 64-bit counter\n"
> +              "\t<num of thread> is optional\n"
> +              "\t\t<1 - 31> - no of threads to start\n"
> +              "\t\tif user doesn't specify this option, then\n"
> +              "\t\tno of threads created is equivalent to no of cores\n"
> +              "\t\tavailable in the system\n"
> +              "\tExample usage:\n"
> +              "\t\t./odp_counter -t 2\n"
> +              "\t\t./odp_counter -t 3 -n 12\n");
> +}
> +
> +static uint32_t test_counter_inc_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter32_inc(&a32u);
> +       return i;
> +}
> +
> +static uint32_t test_counter_inc_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter64_inc(&a64u);
> +       return i;
> +}
> +
> +static uint32_t test_counter_dec_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter32_add(&a32u, (uint32_t)-1);
> +       return i;
> +}
> +
> +static uint32_t test_counter_dec_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter64_add(&a64u, (uint64_t)-1);
> +       return i;
> +}
> +
> +static uint32_t test_counter_add_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter32_add(&a32u, ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_add_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter64_add(&a64u, ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_sub_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter32_add(&a32u, -ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_sub_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter64_add(&a64u, -ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_inc_dec_u32(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_inc_u32();
> +       nops += test_counter_dec_u32();
> +       return nops;
> +}
> +
> +static uint32_t test_counter_add_sub_u32(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_add_u32();
> +       nops += test_counter_sub_u32();
> +       return nops;
> +}
> +
> +static uint32_t test_counter_inc_dec_64(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_inc_64();
> +       nops += test_counter_dec_64();
> +       return nops;
> +}
> +
> +static uint32_t test_counter_add_sub_64(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_add_64();
> +       nops += test_counter_sub_64();
> +       return nops;
> +}
> +
> +/**
> + * Test basic counter operation like
> + * add/sub/increment/decrement operation.
> + */
> +static uint32_t test_counter_basic(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_inc_u32();
> +       nops += test_counter_dec_u32();
> +       nops += test_counter_add_u32();
> +       nops += test_counter_sub_u32();
> +
> +       nops += test_counter_inc_64();
> +       nops += test_counter_dec_64();
> +       nops += test_counter_add_64();
> +       nops += test_counter_sub_64();
> +
> +       return nops;
> +}
> +
> +static void test_counter_init(void)
> +{
> +       odp_counter32_init(&a32u, 0);
> +       odp_counter64_init(&a64u, 0);
> +}
> +
> +static void test_counter_write(void)
> +{
> +       odp_counter32_write(&a32u, U32_INIT_VAL);
> +       odp_counter64_write(&a64u, U64_INIT_VAL);
> +}
> +
> +static int test_counter_validate(void)
> +{
> +       if (odp_counter32_read(&a32u) != U32_INIT_VAL) {
> +               ODP_ERR("Atomic u32 usual functions failed\n");
> +               return -1;
> +       }
> +
> +       if (odp_counter64_read(&a64u) != U64_INIT_VAL) {
> +               ODP_ERR("Atomic u64 usual functions failed\n");
> +               return -1;
> +       }
> +
> +       return 0;
> +}
> +
> +static void *run_thread(void *arg)
> +{
> +       pthrd_arg *parg = (pthrd_arg *)arg;
> +       int thr;
> +       uint64_t nops = 0;
> +       struct timeval tv0, tv1;
> +
> +       thr = odp_thread_id();
> +
> +       ODP_DBG("Thread %i starts\n", thr);
> +
> +       /* Wait here until all threads have arrived */
> +       /* Use multiple barriers to verify that it handles wrap around and
> +        * has no race conditions which could be exposed when invoked back-
> +        * to-back */
> +       odp_barrier_sync(&barrier);
> +       odp_barrier_sync(&barrier);
> +       odp_barrier_sync(&barrier);
> +       odp_barrier_sync(&barrier);
> +
> +       gettimeofday(&tv0, NULL);
> +
> +       switch (parg->testcase) {
> +       case TEST_MIX:
> +               nops += test_counter_basic();
> +               break;
> +       case TEST_INC_DEC_U32:
> +               nops += test_counter_inc_dec_u32();
> +               break;
> +       case TEST_ADD_SUB_U32:
> +               nops += test_counter_add_sub_u32();
> +               break;
> +       case TEST_INC_DEC_64:
> +               nops += test_counter_inc_dec_64();
> +               break;
> +       case TEST_ADD_SUB_64:
> +               nops += test_counter_add_sub_64();
> +               break;
> +       }
> +       gettimeofday(&tv1, NULL);
> +       accops[thr] = nops;
> +       fflush(NULL);
> +
> +       uint64_t usecs = (tv1.tv_sec - tv0.tv_sec) * 1000000ULL +
> +                        tv1.tv_usec - tv0.tv_usec;
> +       printf("Time taken in thread %02d to complete %"PRIu64" op is "
> +              "%"PRIu64" usec, %"PRIu64" ns/op\n",
> +              thr, nops, usecs, 1000 * usecs / nops);
> +
> +       return parg;
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +       pthrd_arg thrdarg;
> +       int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
> +       char c;
> +       int result;
> +
> +       if (argc == 1 || argc % 2 == 0) {
> +               usage();
> +               goto err_exit;
> +       }
> +       if (odp_test_global_init() != 0)
> +               goto err_exit;
> +       odp_print_system_info();
> +
> +       while (cnt != 0) {
> +               sscanf(argv[++i], "-%c", &c);
> +               switch (c) {
> +               case 't':
> +                       sscanf(argv[++i], "%d", &test_type);
> +                       break;
> +               case 'n':
> +                       sscanf(argv[++i], "%d", &pthrdnum);
> +                       break;
> +               default:
> +                       ODP_ERR("Invalid option %c\n", c);
> +                       usage();
> +                       goto err_exit;
> +               }
> +               if (test_type < TEST_MIX || test_type > TEST_MAX ||
> +                   pthrdnum > odp_sys_core_count()) {
> +                       usage();
> +                       goto err_exit;
> +               }
> +               cnt -= 2;
> +       }
> +       if (pthrdnum == 0)
> +               pthrdnum = odp_sys_core_count();
> +
> +       test_counter_init();
> +       test_counter_write();
> +
> +       memset(&thrdarg, 0, sizeof(pthrd_arg));
> +       thrdarg.testcase = test_type;
> +       thrdarg.numthrds = pthrdnum;
> +
> +       if ((test_type > 0) && (test_type < TEST_MAX)) {
> +               printf("%s\n", test_name[test_type]);
> +       } else {
> +               ODP_ERR("Invalid test case [%d]\n", test_type);
> +               usage();
> +               goto err_exit;
> +       }
> +       odp_barrier_init(&barrier, pthrdnum);
> +       odp_test_thread_create(run_thread, &thrdarg);
> +
> +       odp_test_thread_exit(&thrdarg);
> +
> +       result = test_counter_validate();
> +
> +       if (result == 0) {
> +               printf("%s_%d_%d Result:pass\n",
> +                      test_name[test_type], test_type, pthrdnum);
> +       } else {
> +               printf("%s_%d_%d Result:fail\n",
> +                      test_name[test_type], test_type, pthrdnum);
> +       }
> +       return 0;
> +
> +err_exit:
> +       return -1;
> +}
> --
> 1.9.1
>
>
Bill Fischofer Nov. 4, 2014, 2:02 p.m. UTC | #2
On Mon, Oct 20, 2014 at 8:00 AM, Ola Liljedahl <ola.liljedahl@linaro.org>
wrote:

> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>

    Reviewed-by: Bill Fischofer <bill.fischofer@linaro.org>


> ---
> Added header file odp_counter.h with support for 32- and 64-bit atomic
> counters
> using relaxed memory order. 6 operations
> (init/read/write/add/read_inc/inc) on
> 32-bit and 64-bit counters respectively.
> Renamed odp_atomic_test to odp_counter_test and changed to use
> odp_counter.h
>
> Implementation of C11-based memory model for atomic operations. 10
> operations
> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
> in
> odp_atomic.h. The required memory ordering is now a parameter to each call
> just
> like in C11.
>
> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
> fall back to GCC __sync builtins which often include unnecessarily heavy
> barrier/sync operations (always sequentially consistent).
>
> Attempt to remove all explicit memory barriers (odp_sync_stores) from code
> that
> implements multithreaded synchronization primitives (e.g. locks, barriers).
> Rewrote such primitives to use the new atomic operations.
>
> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
> enqueue/dequeue
> (missing release barrier, had only compiler barrier).
>
>  .gitignore                                         |   2 +-
>  example/generator/odp_generator.c                  |  43 +-
>  example/ipsec/odp_ipsec.c                          |   2 +-
>  example/odp_example/odp_example.c                  |   2 +-
>  example/timer/odp_timer_test.c                     |   2 +-
>  helper/include/odph_ring.h                         |   8 +-
>  platform/linux-generic/include/api/odp.h           |   1 +
>  platform/linux-generic/include/api/odp_atomic.h    | 838
> +++++++++++----------
>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>  platform/linux-generic/odp_barrier.c               |  49 +-
>  platform/linux-generic/odp_buffer.c                |   3 +-
>  platform/linux-generic/odp_crypto.c                |   7 +-
>  platform/linux-generic/odp_queue.c                 |   7 +-
>  platform/linux-generic/odp_ring.c                  |  94 +--
>  platform/linux-generic/odp_rwlock.c                |  62 +-
>  platform/linux-generic/odp_thread.c                |   9 +-
>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>  platform/linux-generic/odp_timer.c                 |  22 +-
>  test/api_test/Makefile.am                          |   6 +-
>  test/api_test/odp_atomic_test.c                    | 362 ---------
>  test/api_test/odp_atomic_test.h                    |  60 --
>  test/api_test/odp_common.c                         |   1 -
>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>  delete mode 100644 test/api_test/odp_atomic_test.c
>  delete mode 100644 test/api_test/odp_atomic_test.h
>  create mode 100644 test/api_test/odp_counter_test.c
>
> diff --git a/.gitignore b/.gitignore
> index 6342e34..77db4d6 100644
> --- a/.gitignore
> +++ b/.gitignore
> @@ -35,7 +35,7 @@ build/
>  odp_example
>  odp_packet
>  odp_packet_netmap
> -odp_atomic
> +odp_counter
>  odp_shm
>  odp_ring
>  odp_timer_ping
> diff --git a/example/generator/odp_generator.c
> b/example/generator/odp_generator.c
> index eb8b340..252157d 100644
> --- a/example/generator/odp_generator.c
> +++ b/example/generator/odp_generator.c
> @@ -62,10 +62,10 @@ typedef struct {
>   * counters
>  */
>  static struct {
> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
> -       odp_atomic_u64_t ip;    /**< ip packets */
> -       odp_atomic_u64_t udp;   /**< udp packets */
> -       odp_atomic_u64_t icmp;  /**< icmp packets */
> +       odp_counter64_t seq;    /**< ip seq to be send */
> +       odp_counter64_t ip;     /**< ip packets */
> +       odp_counter64_t udp;    /**< udp packets */
> +       odp_counter64_t icmp;   /**< icmp packets */
>  } counters;
>
>  /** * Thread specific arguments
> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_UDPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_UDP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_ICMPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_ICMP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>                 }
>
>                 if (args->appl.interval != 0) {
> +                       uint64_t seq = odp_counter64_read(&counters.seq);
>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
> -                              thr, counters.seq, counters.seq%0xffff);
> +                              thr, seq, seq%0xffff);
>                         /* TODO use odp timer */
>                         usleep(args->appl.interval * 1000);
>                 }
> -               if (args->appl.number != -1 && counters.seq
> -                   >= (unsigned int)args->appl.number) {
> +               if (args->appl.number != -1 &&
> +                   odp_counter64_read(&counters.seq) >=
> +                   (unsigned int)args->appl.number) {
>                         break;
>                 }
>         }
> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>         /* receive number of reply pks until timeout */
>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>                 while (args->appl.timeout >= 0) {
> -                       if (counters.icmp >= (unsigned
> int)args->appl.number)
> +                       if (odp_counter64_read(&counters.icmp) >=
> +                           (unsigned int)args->appl.number)
>                                 break;
>                         /* TODO use odp timer */
>                         sleep(1);
> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>
>         /* print info */
>         if (args->appl.mode == APPL_MODE_UDP) {
> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
> +               printf("  [%02i] total send: %ju\n", thr,
> +                      odp_counter64_read(&counters.seq));
>         } else if (args->appl.mode == APPL_MODE_PING) {
>                 printf("  [%02i] total send: %ju total receive: %ju\n",
> -                      thr, counters.seq, counters.icmp);
> +                      thr, odp_counter64_read(&counters.seq),
> +                      odp_counter64_read(&counters.icmp));
>         }
>         return arg;
>  }
> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                 if (!odp_packet_inflag_ipv4(pkt))
>                         continue;
>
> -               odp_atomic_inc_u64(&counters.ip);
> +               odp_counter64_inc(&counters.ip);
>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>
>                 /* udp */
>                 if (ip->proto == ODPH_IPPROTO_UDP) {
> -                       odp_atomic_inc_u64(&counters.udp);
> +                       odp_counter64_inc(&counters.udp);
>                         udp = (odph_udphdr_t *)(buf + offset);
>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>                                         odp_be_to_cpu_16(udp->length) -
> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                         icmp = (odph_icmphdr_t *)(buf + offset);
>                         /* echo reply */
>                         if (icmp->type == ICMP_ECHOREPLY) {
> -                               odp_atomic_inc_u64(&counters.icmp);
> +                               odp_counter64_inc(&counters.icmp);
>                                 memcpy(&tvsend, buf + offset +
> ODPH_ICMPHDR_LEN,
>                                        sizeof(struct timeval));
>                                 /* TODO This should be changed to use an
> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>         }
>
>         /* init counters */
> -       odp_atomic_init_u64(&counters.seq);
> -       odp_atomic_init_u64(&counters.ip);
> -       odp_atomic_init_u64(&counters.udp);
> -       odp_atomic_init_u64(&counters.icmp);
> +       odp_counter64_init(&counters.seq, 0);
> +       odp_counter64_init(&counters.ip, 0);
> +       odp_counter64_init(&counters.udp, 0);
> +       odp_counter64_init(&counters.icmp, 0);
>
>         /* Reserve memory for args from shared mem */
>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
> index 2f2dc19..76c27d0 100644
> --- a/example/ipsec/odp_ipsec.c
> +++ b/example/ipsec/odp_ipsec.c
> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>         printf("Num worker threads: %i\n", num_workers);
>
>         /* Create a barrier to synchronize thread startup */
> -       odp_barrier_init_count(&sync_barrier, num_workers);
> +       odp_barrier_init(&sync_barrier, num_workers);
>
>         /*
>          * By default core #0 runs Linux kernel background tasks.
> diff --git a/example/odp_example/odp_example.c
> b/example/odp_example/odp_example.c
> index 0e9aa3d..c473395 100644
> --- a/example/odp_example/odp_example.c
> +++ b/example/odp_example/odp_example.c
> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>         odp_shm_print_all();
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&globals->barrier, num_workers);
> +       odp_barrier_init(&globals->barrier, num_workers);
>
>         if (args.proc_mode) {
>                 int ret;
> diff --git a/example/timer/odp_timer_test.c
> b/example/timer/odp_timer_test.c
> index 78b2ae2..dfbeae9 100644
> --- a/example/timer/odp_timer_test.c
> +++ b/example/timer/odp_timer_test.c
> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>         printf("\n");
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&test_barrier, num_workers);
> +       odp_barrier_init(&test_barrier, num_workers);
>
>         /* Create and launch worker threads */
>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
> index 76c1db8..5e78b34 100644
> --- a/helper/include/odph_ring.h
> +++ b/helper/include/odph_ring.h
> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>                 uint32_t sp_enqueue;     /* True, if single producer. */
>                 uint32_t size;           /* Size of ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Producer head. */
> -               uint32_t tail;          /* Producer tail. */
> +               odp_atomic32_t head;    /* Producer head. */
> +               odp_atomic32_t tail;    /* Producer tail. */
>         } prod ODP_ALIGNED_CACHE;
>
>         /** @private Consumer */
> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>                 uint32_t size;           /* Size of the ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Consumer head. */
> -               uint32_t tail;          /* Consumer tail. */
> +               odp_atomic32_t head;    /* Consumer head. */
> +               odp_atomic32_t tail;    /* Consumer tail. */
>         } cons ODP_ALIGNED_CACHE;
>
>         /** @private Memory space of ring starts here. */
> diff --git a/platform/linux-generic/include/api/odp.h
> b/platform/linux-generic/include/api/odp.h
> index 0ee3faf..d124d52 100644
> --- a/platform/linux-generic/include/api/odp.h
> +++ b/platform/linux-generic/include/api/odp.h
> @@ -32,6 +32,7 @@ extern "C" {
>  #include <odp_barrier.h>
>  #include <odp_spinlock.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>
>  #include <odp_init.h>
>  #include <odp_system_info.h>
> diff --git a/platform/linux-generic/include/api/odp_atomic.h
> b/platform/linux-generic/include/api/odp_atomic.h
> index 0cc4cf4..ccdd096 100644
> --- a/platform/linux-generic/include/api/odp_atomic.h
> +++ b/platform/linux-generic/include/api/odp_atomic.h
> @@ -4,464 +4,494 @@
>   * SPDX-License-Identifier:     BSD-3-Clause
>   */
>
> -
>  /**
>   * @file
>   *
> - * ODP atomic operations
> + * ODP atomic types and operations, semantically a subset of C11 atomics.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Atomic functions must be used to operate on atomic variables!
>   */
>
>  #ifndef ODP_ATOMIC_H_
>  #define ODP_ATOMIC_H_
>
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +#include <odp_debug.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
>
> -
> -#include <odp_std_types.h>
> -
> -
> -/**
> - * Atomic integer
> - */
> -typedef volatile int32_t odp_atomic_int_t;
> -
> -/**
> - * Atomic unsigned integer 64 bits
> - */
> -typedef volatile uint64_t odp_atomic_u64_t;
> -
> -/**
> - * Atomic unsigned integer 32 bits
> - */
> -typedef volatile uint32_t odp_atomic_u32_t;
> -
> -
> -/**
> - * Initialize atomic integer
> - *
> - * @param ptr    An integer atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic integer
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic integer value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic integer
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
> new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic integer
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic integer
> - *
> - * @param ptr    An atomic integer variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement atomic integer by 1
> - *
> - * @param ptr    An atomic int variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Initialize atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic uint32 value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic uint32
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
> -                                       uint32_t new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be sub to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
>  /**
> - * Fetch and increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __OCTEON__
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       uint32_t ret;
> -
> -       __asm__ __volatile__ ("syncws");
> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
> -                             "r" (ptr));
> -
> -       return ret;
> -}
> -
> + * 32-bit (unsigned) atomic type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the atomic variable */
> +} odp_atomic32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
> +
> +typedef enum {
> +       /** Relaxed memory order, no ordering of other accesses enforced */
> +       ODP_MEMORDER_RLX,
> +       /** Acquire memory order, later accesses cannot move before
> +        * acquire operation */
> +       ODP_MEMORDER_ACQ,
> +       /** Release memory order, earlier accesses cannot move after
> +        * release operation */
> +       ODP_MEMORDER_RLS
> +} odp_memorder_t;
> +
>
> +/*****************************************************************************
> + * Just some private helpers
>
> +*****************************************************************************/
> +
> +#ifdef __OCTEON__
> +/* OCTEON Write Memory Barrier */
> +#define COMPILER_HW_BARRIER() __asm __volatile( \
> +       /* Double syncw to work around errata */ \
> +       "syncw\n\tsyncw" : : : )
>  #else
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> +/** Compiler and hardware full memory barrier */
> +#define COMPILER_HW_BARRIER() __sync_synchronize()
> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a
> */
>  #endif
>
> -/**
> - * Increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Atomic compare and set for 32bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> +#define MEMORY "memory"
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomics
> + * odp_atomic32_init - no return value
> + * odp_atomic32_load - return current value
> + * odp_atomic32_store - no return value
> + * odp_atomic32_cmp_xchg_weak - return bool
> + * odp_atomic32_fetch_add - return old value
> + * odp_atomic32_add - no return value
> + * odp_atomic32_fetch_inc - return old value
> + * odp_atomic32_inc - no return value
> + * odp_atomic32_fetch_dec - return old value
> + * odp_atomic32_dec - no return value
> +
> *****************************************************************************/
> +
> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable,
> thus we
> +        * can avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic load of 32-bit atomic variable
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param mmodel Memory model associated with the load
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               uint32_t val;
> +               /* Read of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) read
> variable,
> +                * thus we can avoid using compiler memory barriers */
> +               val = *(__volatile const uint32_t *)&ptr->v;
> +               return val;
> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
> +#if defined __aarch64__
> +               uint32_t val;
> +               __asm __volatile("ldar %w0, [%1]"
> +                               : "=&r"(val)
> +                               : "r"(&ptr->v)
> +                               : MEMORY);
> +               return val;
> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
> +               /* Read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               /* Herb Sutter claims HW barrier not needed on x86? */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#else
> +#warning odp_atomic32_load() may not be efficiently implemented
> +               /* Assume read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic store to 32-bit atomic variable
> + *
> + * @param ptr  Pointer to a 32-bit atomic variable
> + * @param val  Value to write to the atomic variable
> + * @param mmodel Memory model associated with the store
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
> + */
> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Write of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) write
> variable,
> +                * thus we will avoid using compiler memory barriers */
> +               *(__volatile uint32_t *)&ptr->v = val;
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
> +               /* Compiler and HW barrier to prevent earlier accesses from
> +                * moving down */
> +               COMPILER_HW_BARRIER();
> +               /* Write of aligned word is atomic */
> +               ptr->v = val;
> +               /* Compiler and HW barrier to prevent this store from
> moving
> +                * down after a later load-acquire and thus create
> overlapping
> +                * critical sections. Herb Sutter thinks this is needed */
> +               COMPILER_HW_BARRIER();
> +#elif defined __aarch64__
> +               __asm __volatile("stlr %w0, [%1]"
> +                               :
> +                               : "r"(val), "r"(&ptr->v)
> +                               : MEMORY);
> +#elif defined __x86_64__
> +               /* This is actually an atomic exchange operation */
> +               /* Generates good code on x86_64 */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#else
> +#warning odp_atomic32_store_rls() may not be efficiently implemented
> +               /* This is actually an atomic exchange operation */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +
> +/**
> + * Atomic compare and exchange (swap) of 32-bit atomic variable
> + * "Weak" semantics, may fail spuriously and must be used in a loop.
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param exp_p Pointer to expected value (updated on failure)
> + * @param val   New value to write
> + * @param mmodel Memory model associated with the compare-and-swap
> + * operation (ODP_MEMORDER_RLX only)
> + *
> + * @return 1 (true) if exchange successful, 0 (false) if not successful
> (and
> + * '*exp_p' updated with current value)
> + */
> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
> +               uint32_t *exp_p,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               int status;
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "cmp   %0, %3\t\n"
> +                                "bne   1f\t\n"
> +                                "strex %1, %4, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "=&r"(status)
> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Clear exclusive access monitor */
> +                       __asm __volatile("clrex");
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* strex returns 0 on success */
> +               if (odp_unlikely(status != 0)) {
> +                       /* strex failed, reservation was disturbed */
> +                       /* Return potentially changed value */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __mips64__
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               uint32_t status = val;
> +               __asm __volatile("llw %0, [%2]\t\n"
> +                                "bne %0, %3, 1f\t\n"
> +                                "scw %1, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "+&r"(status)
> +                               : "r"(&ptr->v), "r"(exp)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* scw returns 1 on success, 0 on failure */
> +               if (odp_unlikely(status == 0)) {
> +                       /* scw failed, reservation was disturbed */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __x86_64__
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#else
> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic fetch and add to 32-bit atomic variable
> + * @note A - B <=> A + (-B)
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param mmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
> + *
> + * @return Value of the atomic variable before the addition
> + */
> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old_val, tmp;
> +               int status;
> +               do {
> +                       __asm __volatile("ldrex %0, [%3]\t\n"
> +                                        "add   %1, %0, %4\t\n"
> +                                        "strex %2, %1, [%3]\t\n"
> +                                       : "=&r"(old_val), "+&r"(tmp),
> +                                         "=&r"(status)
> +                                       : "r"(&ptr->v), "r"(incr)
> +                                       : MEMORY);
> +               } while (odp_unlikely(status != 0));
> +               return old_val;
> +#elif defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               return old_val;
> +#elif defined __x86_64__
> +               /* Generates good code on x86_64 */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#else
> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +               /* __sync_fetch_and_add() will give us barriers before and
> +                * after, we are fine with this for release operations */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
>  }
>
>  /**
> - * Initialize atomic uint64
> + * Atomic add to 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param mmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
>  {
> -       *ptr = 0;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               return;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>  }
>
>  /**
> - * Load value of atomic uint64
> - *
> - * @param ptr    An atomic variable
> + * Atomic fetch and increment of 32-bit atomic variable
>   *
> - * @return atomic uint64 value
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param mmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @note The operation is not synchronized with other threads
> + * @return Value of the atomic variable before the increment
>   */
> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return *ptr;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic increment instructions
> can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>  }
>
>  /**
> - * Store value to atomic uint64
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> + * Atomic increment of 32-bit atomic variable
>   *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param mmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
> -                                       uint64_t new_value)
> -{
> -       *ptr = new_value;
> -}
> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>
> -/**
> - * Add atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - */
> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
>  {
> -       __sync_fetch_and_add(ptr, value);
> +       /* Default to using odp_atomic32_fetch_inc() */
> +       /* Platforms that support atomic increment instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>  }
>
>  /**
> - * Fetch and add atomic uint64
> + * Atomic fetch and decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param mmodel Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @return Value of the variable before the operation
> + * @return Value of the atomic variable before the decrement
>   */
> -
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic decrement instructions
> can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
>  #endif
> -/**
> - * Subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - */
> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
> -{
> -       __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
>  #endif
> -/**
> - * Fetch and increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u64(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       odp_atomic_fetch_add_u64(ptr, 1);
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>  }
>
>  /**
> - * Fetch and decrement atomic uint64 by 1
> + * Atomic decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memorder Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u64(ptr, 1);
> -}
> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t memorder)
>
> -/**
> - * Decrement atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>  {
> -       odp_atomic_fetch_sub_u64(ptr, 1);
> +       /* Default to using odp_atomic32_fetch_dec() */
> +       /* Platforms that support atomic decrement instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>  }
>
> -/**
> - * Atomic compare and set for 64bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> -}
> +/* We are not exporting this macro */
> +#undef COMPILER_HW_BARRIER
> +#undef MEMORY
>
>  #ifdef __cplusplus
>  }
> diff --git a/platform/linux-generic/include/api/odp_barrier.h
> b/platform/linux-generic/include/api/odp_barrier.h
> index a7b3215..69b1eb8 100644
> --- a/platform/linux-generic/include/api/odp_barrier.h
> +++ b/platform/linux-generic/include/api/odp_barrier.h
> @@ -27,18 +27,18 @@ extern "C" {
>   * ODP execution barrier
>   */
>  typedef struct odp_barrier_t {
> -       int              count;  /**< @private Thread count */
> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
> +       uint32_t       num_threads;  /**< @private Thread count (constant)
> */
> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>  } odp_barrier_t;
>
>
>  /**
>   * Init barrier with thread count
>   *
> - * @param barrier    Barrier
> - * @param count      Thread count
> + * @param barrier     Barrier
> + * @param num_threads Number of threads which share the barrier
>   */
> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>
>
>  /**
> diff --git a/platform/linux-generic/include/api/odp_counter.h
> b/platform/linux-generic/include/api/odp_counter.h
> new file mode 100644
> index 0000000..b93c992
> --- /dev/null
> +++ b/platform/linux-generic/include/api/odp_counter.h
> @@ -0,0 +1,363 @@
> +/* Copyright (c) 2013, Linaro Limited
> + * All rights reserved.
> + *
> + * SPDX-License-Identifier:     BSD-3-Clause
> + */
> +
> +/**
> + * @file
> + *
> + * ODP atomic counter types and operations, suitable for e.g. shared
> statistics.
> + * Relaxed memory model assumed for lowest overhead.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Counter functions must be used to operate on counter variables!
> + */
> +
> +#ifndef ODP_COUNTER_H_
> +#define ODP_COUNTER_H_
> +
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * 32-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the counter variable */
> +} odp_counter32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
> +
> +/**
> + * 64-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint64_t v; /**< Actual storage for the counter variable */
> +       /* Room for other data structures (e.g. spin lock) that might be
> +        * needed to ensure atomicity on some architectures */
> +} odp_counter64_t
> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomic counters
> + * odp_counter32_init - returns no value
> + * odp_counter32_read - returns current value
> + * odp_counter32_write - returns no value
> + * odp_counter32_add - returns no value
> + * odp_counter32_read_inc - returns old value
> + * odp_counter32_inc - returns no value
> +
> *****************************************************************************/
> +
> +/**
> + * Initialize 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Read 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
> +{
> +       uint32_t val;
> +       /* Read of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) read variable, thus
> we
> +        * will avoid using compiler memory barriers */
> +       val = *(__volatile const uint32_t *)&ptr->v;
> +       return val;
> +}
> +
> +/**
> + * Write 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Value to write to the variable
> + */
> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable,
> thus we
> +        * will avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic add to 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "add   %0, %0, %3\t\n"
> +                                "strex %1, %0, [%2]"
> +                                : "=&r"(result), "=&r"(status)
> +                                : "r"(&ptr->v), "Ir" (incr)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +#elif defined __OCTEON__
> +       __asm __volatile("saa %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +       /* Warning odp_counter32_add() may not be efficiently implemented
> */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) of 32-bit counter variable, return original value
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%3]\t\n"
> +                                "add   %1, %0, #1\t\n"
> +                                "strex %2, %1, [%3]"
> +                                : "=&r"(result), "=&r"(tmp), "+&r"(status)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +       return result;
> +#elif defined __OCTEON__
> +       uint32_t old_val;
> +       __asm __volatile("lai %0,(%2)"
> +                        : "=r" (old_val), "+m" (ptr)
> +                        : "r" (ptr)
> +                        : );
> +       return old_val;
> +#elif defined __x86_64__
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter32_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + */
> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
> +{
> +#if defined __OCTEON__
> +       odp_counter32_add(ptr, 1);
> +#else
> +       (void)odp_counter32_read_inc(ptr);
> +#endif
> +}
> +
>
> +/*****************************************************************************
> + * Operations on 64-bit atomic counters
> + * odp_counter64_init
> + * odp_counter64_read
> + * odp_counter64_write
> + * odp_counter64_add
> + * odp_counter64_read_inc
> + * odp_counter64_inc
> +
> *****************************************************************************/
> +
> +/**
> + * Read 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Value of the counter variable
> + */
> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t val;
> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
> +                        "clrex" /* Clear exclusive access monitor */
> +                        : "=&r"(val)
> +                        : "r"(&ptr->v)
> +                        : );
> +       return val;
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Read of aligned quad/double word is atomic */
> +       return ptr->v;
> +#else
> +/* Warning odp_counter64_read() may not be efficiently implemented */
> +       return __sync_fetch_and_or(&ptr->v, 0);
> +#endif
> +}
> +
> +/**
> + * Write 64-bit counter variable
> + *
> + * @param ptr  Pointer to a 64-bit counter variable
> + * @param val  Value to write to the counter variable
> + */
> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t val)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               /* Read counter variable exclusively so we can write to it
> +                * later */
> +               /* Attempt to write the new value */
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "strexd %1, %3, %H3, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(val)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Write of aligned quad/double word is atomic */
> +       ptr->v = val;
> +#else
> +/* Warning odp_counter64_write() may not be efficiently implemented */
> +       /* This is actually a counter exchange operation */
> +       (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +}
> +
> +/**
> + * Initialize 64-bit counter variable
> + * Perform implementation specific initializations, assign initial value.
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       odp_counter64_write(ptr, val);
> +}
> +
> +/**
> + * Atomic add to 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, %0, %3\t\n"
> +                                "adc    %H0, %H3\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(incr)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __OCTEON__
> +       __asm __volatile("saad %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +/* Warning odp_counter64_add() may not be efficiently implemented */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable and return original value
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
> +                                "adds   %2, %0, #1\t\n"
> +                                "adc    %H2, %H0, #0\t\n"
> +                                "strexd %1, %2, %H2, [%3]"
> +                                : "=&r"(old_val), "=&r"(status),
> "=&r"(tmp)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +       return old_val;
> +#elif defined __OCTEON__
> +       uint64_t old_val;
> +       __asm __volatile("laid %0,(%2)"
> +                       : "=r" (old_val), "+m" (ptr)
> +                       : "r" (ptr)
> +                       : );
> +       return old_val;
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter64_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + */
> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, #1\t\n"
> +                                "adc    %H0, #0\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#else
> +       (void)odp_counter64_read_inc(ptr);
> +#endif
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
> b/platform/linux-generic/include/api/odp_rwlock.h
> index 252ebb2..ff8a9a2 100644
> --- a/platform/linux-generic/include/api/odp_rwlock.h
> +++ b/platform/linux-generic/include/api/odp_rwlock.h
> @@ -10,26 +10,30 @@
>  /**
>   * @file
>   *
> - * ODP RW Locks
> + * ODP read/write lock
> + * An RW lock supports multiple concurrent readers but only one (exclusive)
> writer.
>   */
>
> +#include <odp_atomic.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
>
>  /**
>   * The odp_rwlock_t type.
> - * write lock count is -1,
> - * read lock count > 0
> + * write lock is ~0U
> + * read lock count >0 && <~0U
>   */
>  typedef struct {
> -       volatile int32_t cnt; /**< -1 Write lock,
> -                               > 0 for Read lock. */
> +       odp_atomic32_t cnt; /**< == 0: unlocked,
> +                                == ~0: locked for write,
> +                                > 0 number of concurrent read locks */
>  } odp_rwlock_t;
>
>
>  /**
> - * Initialize the rwlock to an unlocked state.
> + * Initialize the rwlock to the unlocked state.
>   *
>   * @param rwlock pointer to the RW Lock.
>   */
> @@ -50,14 +54,14 @@ void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
>
>  /**
> - * Aquire a write lock.
> + * Acquire the write lock.
>   *
>   * @param rwlock pointer to a RW Lock.
>   */
>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
>
>  /**
> - * Release a write lock.
> + * Release the write lock.
>   *
>   * @param rwlock pointer to a RW Lock.
>   */
> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
> b/platform/linux-generic/include/api/odp_ticketlock.h
> index 6277a18..5933f85 100644
> --- a/platform/linux-generic/include/api/odp_ticketlock.h
> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
> @@ -21,14 +21,15 @@ extern "C" {
>
>  #include <odp_std_types.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>
>
>  /**
>   * ODP ticketlock
>   */
>  typedef struct odp_ticketlock_t {
> -       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
> -       volatile uint32_t cur_ticket;  /**< @private Current ticket */
> +       odp_counter32_t next_ticket; /**< @private Next ticket */
> +       odp_atomic32_t cur_ticket;  /**< @private Current ticket */
>  } odp_ticketlock_t;
>
>
> diff --git a/platform/linux-generic/include/odp_buffer_internal.h
> b/platform/linux-generic/include/odp_buffer_internal.h
> index 2002b51..530ab96 100644
> --- a/platform/linux-generic/include/odp_buffer_internal.h
> +++ b/platform/linux-generic/include/odp_buffer_internal.h
> @@ -88,7 +88,7 @@ typedef struct odp_buffer_hdr_t {
>         uint32_t                 index;      /* buf index in the pool */
>         size_t                   size;       /* max data size */
>         size_t                   cur_offset; /* current offset */
> -       odp_atomic_int_t         ref_count;  /* reference count */
> +       odp_atomic32_t           ref_count;  /* reference count */
>         odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
>         int                      type;       /* type of next header */
>         odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
> diff --git a/platform/linux-generic/include/odp_spin_internal.h
> b/platform/linux-generic/include/odp_spin_internal.h
> index b7e2071..29c524f 100644
> --- a/platform/linux-generic/include/odp_spin_internal.h
> +++ b/platform/linux-generic/include/odp_spin_internal.h
> @@ -15,15 +15,6 @@ extern "C" {
>
>
>  /**
> - * GCC memory barrier for ODP internal use
> - */
> -static inline void odp_mem_barrier(void)
> -{
> -       __asm__ __volatile__ ("" : : : "memory");
> -}
> -
> -
> -/**
>   * Spin loop for ODP internal use
>   */
>  static inline void odp_spin(void)
> diff --git a/platform/linux-generic/odp_barrier.c
> b/platform/linux-generic/odp_barrier.c
> index a82b294..10368b5 100644
> --- a/platform/linux-generic/odp_barrier.c
> +++ b/platform/linux-generic/odp_barrier.c
> @@ -8,41 +8,52 @@
>  #include <odp_sync.h>
>  #include <odp_spin_internal.h>
>
> -void odp_barrier_init_count(odp_barrier_t *barrier, int count)
> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
>  {
> -       barrier->count = count;
> -       barrier->bar = 0;
> -       odp_sync_stores();
> +       barrier->num_threads = num_threads; /* Constant after
> initialisation */
> +       odp_atomic32_init(&barrier->in_barrier, 0);
>  }
>
>  /*
>   * Efficient barrier_sync -
>   *
>   *   Barriers are initialized with a count of the number of callers
> - *   that must sync on the barrier before any may proceed.
> + *   that must sync on (enter) the barrier before any may proceed (exit).
>   *
>   *   To avoid race conditions and to permit the barrier to be fully
> - *   reusable, the barrier value cycles between 0..2*count-1. When
> - *   synchronizing the wasless variable simply tracks which half of
> + *   reusable, the barrier value cycles between 0..2*count-1 (temporarily
> + *   hitting 2*count before being wrapped). When
> + *   synchronizing, the waslow variable simply tracks which half of
>   *   the cycle the barrier was in upon entry.  Exit is when the
>   *   barrier crosses to the other half of the cycle.
>   */
>
>  void odp_barrier_sync(odp_barrier_t *barrier)
>  {
> -       int count;
> -       int wasless;
> +       uint32_t count;
> +       bool waslow;
>
> -       odp_sync_stores();
> -       wasless = barrier->bar < barrier->count;
> -       count = odp_atomic_fetch_inc_int(&barrier->bar);
> +       /* We need both acquire and release barriers but does the order
> +        * matter? Here we start with release and end with acquire. */
>
> -       if (count == 2*barrier->count-1) {
> -               barrier->bar = 0;
> -       } else {
> -               while ((barrier->bar < barrier->count) == wasless)
> -                       odp_spin();
> -       }
> +       /* Increase threads in_barrier count, this will automatically
> release
> +        * the other threads when lower/upper range is switched */
> +       count = odp_atomic32_fetch_add(&barrier->in_barrier, 1,
> +                                      ODP_MEMORDER_RLS);
> +       /* Compute lower or higher range indicator */
> +       waslow = count < barrier->num_threads;
>
> -       odp_mem_barrier();
> +       /* Check if in_barrier count should wrap */
> +       if (count == 2 * barrier->num_threads - 1) {
> +               /* Manually wrap the counter */
> +               odp_atomic32_add(&barrier->in_barrier,
> +                                -2 * barrier->num_threads,
> +                                ODP_MEMORDER_RLX);
> +               /* Fall-through the final part for the acquire barrier */
> +       }
> +       /* Wait for counter to change half */
> +       while ((odp_atomic32_load(&barrier->in_barrier, ODP_MEMORDER_ACQ) <
> +              barrier->num_threads) == waslow) {
> +               odp_spin();
> +       }
>  }
> diff --git a/platform/linux-generic/odp_buffer.c
> b/platform/linux-generic/odp_buffer.c
> index e54e0e7..fc3506b 100644
> --- a/platform/linux-generic/odp_buffer.c
> +++ b/platform/linux-generic/odp_buffer.c
> @@ -73,7 +73,8 @@ int odp_buffer_snprint(char *str, size_t n, odp_buffer_t
> buf)
>         len += snprintf(&str[len], n-len,
>                         "  cur_offset   %zu\n",       hdr->cur_offset);
>         len += snprintf(&str[len], n-len,
> -                       "  ref_count    %i\n",        hdr->ref_count);
> +                       "  ref_count    %u\n",
> +                       odp_atomic32_load(&hdr->ref_count,
> ODP_MEMORDER_RLX));
>         len += snprintf(&str[len], n-len,
>                         "  type         %i\n",        hdr->type);
>         len += snprintf(&str[len], n-len,
> diff --git a/platform/linux-generic/odp_crypto.c
> b/platform/linux-generic/odp_crypto.c
> index b37ad6b..75b4ce0 100644
> --- a/platform/linux-generic/odp_crypto.c
> +++ b/platform/linux-generic/odp_crypto.c
> @@ -6,7 +6,7 @@
>
>  #include <odp_crypto.h>
>  #include <odp_internal.h>
> -#include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_spinlock.h>
>  #include <odp_sync.h>
>  #include <odp_debug.h>
> @@ -26,7 +26,7 @@
>  #define MAX_SESSIONS 32
>
>  typedef struct {
> -       odp_atomic_u32_t next;
> +       odp_counter32_t   next;
>         uint32_t         max;
>         odp_crypto_generic_session_t sessions[0];
>  } odp_crypto_global_t;
> @@ -58,7 +58,7 @@ odp_crypto_generic_session_t *alloc_session(void)
>         uint32_t idx;
>         odp_crypto_generic_session_t *session = NULL;
>
> -       idx = odp_atomic_fetch_inc_u32(&global->next);
> +       idx = odp_counter32_read_inc(&global->next);
>         if (idx < global->max) {
>                 session = &global->sessions[idx];
>                 session->index = idx;
> @@ -420,6 +420,7 @@ odp_crypto_init_global(void)
>
>         /* Initialize it */
>         global->max = MAX_SESSIONS;
> +       odp_counter32_init(&global->next, 0);
>
>         return 0;
>  }
> diff --git a/platform/linux-generic/odp_queue.c
> b/platform/linux-generic/odp_queue.c
> index 1318bcd..08c0d29 100644
> --- a/platform/linux-generic/odp_queue.c
> +++ b/platform/linux-generic/odp_queue.c
> @@ -214,8 +214,13 @@ int odp_queue_set_context(odp_queue_t handle, void
> *context)
>  {
>         queue_entry_t *queue;
>         queue = queue_to_qentry(handle);
> +       /* Setting a new queue context can be viewed as a release
> operation,
> +        * all writes to the context must be observable before the context
> +        * is made observable */
>         odp_sync_stores();
> -       queue->s.param.context = context;
> +       queue->s.param.context = context; /* Store-release */
> +       /* Ensure queue modification is globally visible before we return
> +        * and the application might cause the queue to be scheduled */
>         odp_sync_stores();
>         return 0;
>  }
> diff --git a/platform/linux-generic/odp_ring.c
> b/platform/linux-generic/odp_ring.c
> index 632aa66..e5b9c23 100644
> --- a/platform/linux-generic/odp_ring.c
> +++ b/platform/linux-generic/odp_ring.c
> @@ -187,10 +187,10 @@ odph_ring_create(const char *name, unsigned count,
> unsigned flags)
>                 r->cons.size = count;
>                 r->prod.mask = count-1;
>                 r->cons.mask = count-1;
> -               r->prod.head = 0;
> -               r->cons.head = 0;
> -               r->prod.tail = 0;
> -               r->cons.tail = 0;
> +               odp_atomic32_init(&r->prod.head, 0);
> +               odp_atomic32_init(&r->cons.head, 0);
> +               odp_atomic32_init(&r->prod.tail, 0);
> +               odp_atomic32_init(&r->cons.tail, 0);
>
>                 TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
>         } else {
> @@ -227,7 +227,7 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>         uint32_t prod_head, prod_next;
>         uint32_t cons_tail, free_entries;
>         const unsigned max = n;
> -       int success;
> +       bool success;
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>         int ret;
> @@ -237,8 +237,8 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>                 /* Reset n to the initial burst count */
>                 n = max;
>
> -               prod_head = r->prod.head;
> -               cons_tail = r->cons.tail;
> +               prod_head = odp_atomic32_load(&r->prod.head,
> ODP_MEMORDER_RLX);
> +               cons_tail = odp_atomic32_load(&r->cons.tail,
> ODP_MEMORDER_ACQ);
>                 /* The subtraction is done between two unsigned 32bits
> value
>                  * (the result is always modulo 32 bits even if we have
>                  * prod_head > cons_tail). So 'free_entries' is always
> between 0
> @@ -259,13 +259,14 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>                 }
>
>                 prod_next = prod_head + n;
> -               success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
> -                                             prod_next);
> -       } while (odp_unlikely(success == 0));
> +               success = odp_atomic32_cmp_xchg_weak(&r->prod.head,
> +                                                    &prod_head,
> +                                                    prod_next,
> +                                                    ODP_MEMORDER_RLX);
> +       } while (odp_unlikely(!success));
>
>         /* write entries in ring */
>         ENQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /* if we exceed the watermark */
>         if (odp_unlikely(((mask + 1) - free_entries + n) >
> r->prod.watermark)) {
> @@ -279,10 +280,11 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>          * If there are other enqueues in progress that preceeded us,
>          * we need to wait for them to complete
>          */
> -       while (odp_unlikely(r->prod.tail != prod_head))
> +       while (odp_unlikely(odp_atomic32_load(&r->prod.tail,
> +                                             ODP_MEMORDER_RLX) !=
> prod_head))
>                 odp_spin();
>
> -       r->prod.tail = prod_next;
> +       odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
>         return ret;
>  }
>
> @@ -298,8 +300,8 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>         uint32_t mask = r->prod.mask;
>         int ret;
>
> -       prod_head = r->prod.head;
> -       cons_tail = r->cons.tail;
> +       prod_head = odp_atomic32_load(&r->prod.head, ODP_MEMORDER_RLX);
> +       cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_ACQ);
>         /* The subtraction is done between two unsigned 32bits value
>          * (the result is always modulo 32 bits even if we have
>          * prod_head > cons_tail). So 'free_entries' is always between 0
> @@ -320,11 +322,10 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>         }
>
>         prod_next = prod_head + n;
> -       r->prod.head = prod_next;
> +       odp_atomic32_store(&r->prod.head, prod_next, ODP_MEMORDER_RLX);
>
>         /* write entries in ring */
>         ENQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /* if we exceed the watermark */
>         if (odp_unlikely(((mask + 1) - free_entries + n) >
> r->prod.watermark)) {
> @@ -334,7 +335,7 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>                 ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
>         }
>
> -       r->prod.tail = prod_next;
> +       odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
>         return ret;
>  }
>
> @@ -348,7 +349,7 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>         uint32_t cons_head, prod_tail;
>         uint32_t cons_next, entries;
>         const unsigned max = n;
> -       int success;
> +       bool success;
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>
> @@ -357,8 +358,8 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>                 /* Restore n as it may change every loop */
>                 n = max;
>
> -               cons_head = r->cons.head;
> -               prod_tail = r->prod.tail;
> +               cons_head = odp_atomic32_load(&r->cons.head,
> ODP_MEMORDER_RLX);
> +               prod_tail = odp_atomic32_load(&r->prod.tail,
> ODP_MEMORDER_ACQ);
>                 /* The subtraction is done between two unsigned 32bits
> value
>                  * (the result is always modulo 32 bits even if we have
>                  * cons_head > prod_tail). So 'entries' is always between 0
> @@ -378,22 +379,24 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>                 }
>
>                 cons_next = cons_head + n;
> -               success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
> -                                             cons_next);
> -       } while (odp_unlikely(success == 0));
> +               success = odp_atomic32_cmp_xchg_weak(&r->cons.head,
> +                                                    &cons_head,
> +                                                    cons_next,
> +                                                    ODP_MEMORDER_RLX);
> +       } while (odp_unlikely(!success));
>
>         /* copy in table */
>         DEQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /*
>          * If there are other dequeues in progress that preceded us,
>          * we need to wait for them to complete
>          */
> -       while (odp_unlikely(r->cons.tail != cons_head))
> +       while (odp_unlikely(odp_atomic32_load(&r->cons.tail,
> +                                             ODP_MEMORDER_RLX) != cons_head))
>                 odp_spin();
>
> -       r->cons.tail = cons_next;
> +       odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
>
>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>  }
> @@ -409,8 +412,8 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>
> -       cons_head = r->cons.head;
> -       prod_tail = r->prod.tail;
> +       cons_head = odp_atomic32_load(&r->cons.head, ODP_MEMORDER_RLX);
> +       prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_ACQ);
>         /* The subtraction is done between two unsigned 32bits value
>          * (the result is always modulo 32 bits even if we have
>          * cons_head > prod_tail). So 'entries' is always between 0
> @@ -429,13 +432,12 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
>         }
>
>         cons_next = cons_head + n;
> -       r->cons.head = cons_next;
> +       odp_atomic32_store(&r->cons.head, cons_next, ODP_MEMORDER_RLX);
>
>         /* copy in table */
>         DEQUEUE_PTRS();
> -       odp_mem_barrier();
>
> -       r->cons.tail = cons_next;
> +       odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>  }
>
> @@ -482,8 +484,8 @@ int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void **obj_table, unsigned n)
>   */
>  int odph_ring_full(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
>         return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
>  }
>
> @@ -492,8 +494,8 @@ int odph_ring_full(const odph_ring_t *r)
>   */
>  int odph_ring_empty(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
>         return !!(cons_tail == prod_tail);
>  }
>
> @@ -502,8 +504,8 @@ int odph_ring_empty(const odph_ring_t *r)
>   */
>  unsigned odph_ring_count(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
>         return (prod_tail - cons_tail) & r->prod.mask;
>  }
>
> @@ -512,8 +514,8 @@ unsigned odph_ring_count(const odph_ring_t *r)
>   */
>  unsigned odph_ring_free_count(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
>         return (cons_tail - prod_tail - 1) & r->prod.mask;
>  }
>
> @@ -523,10 +525,14 @@ void odph_ring_dump(const odph_ring_t *r)
>         ODP_DBG("ring <%s>@%p\n", r->name, r);
>         ODP_DBG("  flags=%x\n", r->flags);
>         ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
> -       ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
> -       ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
> -       ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
> -       ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
> +       ODP_DBG("  ct=%"PRIu32"\n", odp_atomic32_load(&r->cons.tail,
> +                                                     ODP_MEMORDER_RLX));
> +       ODP_DBG("  ch=%"PRIu32"\n", odp_atomic32_load(&r->cons.head,
> +                                                     ODP_MEMORDER_RLX));
> +       ODP_DBG("  pt=%"PRIu32"\n", odp_atomic32_load(&r->prod.tail,
> +                                                     ODP_MEMORDER_RLX));
> +       ODP_DBG("  ph=%"PRIu32"\n", odp_atomic32_load(&r->prod.head,
> +                                                     ODP_MEMORDER_RLX));
>         ODP_DBG("  used=%u\n", odph_ring_count(r));
>         ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
>         if (r->prod.watermark == r->prod.size)
> diff --git a/platform/linux-generic/odp_rwlock.c b/platform/linux-generic/odp_rwlock.c
> index 11c8dd7..a5fae4d 100644
> --- a/platform/linux-generic/odp_rwlock.c
> +++ b/platform/linux-generic/odp_rwlock.c
> @@ -4,58 +4,64 @@
>   * SPDX-License-Identifier:     BSD-3-Clause
>   */
>
> +#include <stdbool.h>
>  #include <odp_atomic.h>
>  #include <odp_rwlock.h>
> -
>  #include <odp_spin_internal.h>
>
>  void odp_rwlock_init(odp_rwlock_t *rwlock)
>  {
> -       rwlock->cnt = 0;
> +       odp_atomic32_init(&rwlock->cnt, 0);
>  }
>
>  void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
>  {
> -       int32_t cnt;
> -       int  is_locked = 0;
> -
> -       while (is_locked == 0) {
> -               cnt = rwlock->cnt;
> -               /* waiting for read lock */
> -               if (cnt < 0) {
> +       bool gotit;
> +       uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
> +       do {
> +               /* Wait for any writer to release lock */
> +               while ((int32_t)cnt < 0) {
>                         odp_spin();
> -                       continue;
> +                       cnt = odp_atomic32_load(&rwlock->cnt,
> +                                               ODP_MEMORDER_RLX);
>                 }
> -               is_locked = odp_atomic_cmpset_u32(
> -                                       (volatile uint32_t *)&rwlock->cnt,
> -                                             cnt, cnt + 1);
> -       }
> +               /* Attempt to take another read lock */
> +               gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
> +                                                  &cnt, cnt + 1,
> +                                                  ODP_MEMORDER_RLX);
> +               /* If operation fails, 'cnt' will contain current value */
> +       } while (!gotit);
>  }
>
>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
>  {
> -       odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
> +       /* Release one read lock by subtracting 1 */
> +       odp_atomic32_dec(&rwlock->cnt, ODP_MEMORDER_RLS);
>  }
>
>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
>  {
> -       int32_t cnt;
> -       int is_locked = 0;
> -
> -       while (is_locked == 0) {
> -               cnt = rwlock->cnt;
> -               /* lock aquired, wait */
> -               if (cnt != 0) {
> +       bool gotit;
> +       uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
> +       do {
> +               /* Wait for all lock holders to release lock */
> +               while (cnt != 0) {
> +                       /* Lock is busy */
>                         odp_spin();
> -                       continue;
> +                       cnt = odp_atomic32_load(&rwlock->cnt,
> +                                               ODP_MEMORDER_RLX);
>                 }
> -               is_locked = odp_atomic_cmpset_u32(
> -                                       (volatile uint32_t *)&rwlock->cnt,
> -                                             0, -1);
> -       }
> +               /* Attempt to take write lock */
> +               gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
> +                                                  &cnt,
> +                                                  (uint32_t)-1,
> +                                                  ODP_MEMORDER_RLX);
> +               /* If operation fails, 'cnt' will contain current value */
> +       } while (!gotit);
>  }
>
>  void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
>  {
> -       odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
> +       /* Release the write lock by adding 1 */
> +       odp_atomic32_inc(&rwlock->cnt, ODP_MEMORDER_RLS);
>  }
> diff --git a/platform/linux-generic/odp_thread.c b/platform/linux-generic/odp_thread.c
> index b869b27..652d317 100644
> --- a/platform/linux-generic/odp_thread.c
> +++ b/platform/linux-generic/odp_thread.c
> @@ -11,7 +11,7 @@
>
>  #include <odp_thread.h>
>  #include <odp_internal.h>
> -#include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_config.h>
>  #include <odp_debug.h>
>  #include <odp_shared_memory.h>
> @@ -31,7 +31,7 @@ typedef struct {
>
>  typedef struct {
>         thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
> -       odp_atomic_int_t num;
> +       odp_counter32_t   num;
>
>  } thread_globals_t;
>
> @@ -58,6 +58,7 @@ int odp_thread_init_global(void)
>                 return -1;
>
>         memset(thread_globals, 0, sizeof(thread_globals_t));
> +       odp_counter32_init(&thread_globals->num, 0);
>         return 0;
>  }
>
> @@ -67,7 +68,7 @@ static int thread_id(void)
>         int id;
>         int cpu;
>
> -       id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
> +       id = (int)odp_counter32_read_inc(&thread_globals->num);
>
>         if (id >= ODP_CONFIG_MAX_THREADS) {
>                 ODP_ERR("Too many threads\n");
> @@ -77,7 +78,7 @@ static int thread_id(void)
>         cpu = sched_getcpu();
>
>         if (cpu < 0) {
> -               ODP_ERR("getcpu failed\n");
> +               ODP_ERR("sched_getcpu failed\n");
>                 return -1;
>         }
>
> diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-generic/odp_ticketlock.c
> index be5b885..510aa9f 100644
> --- a/platform/linux-generic/odp_ticketlock.c
> +++ b/platform/linux-generic/odp_ticketlock.c
> @@ -6,15 +6,15 @@
>
>  #include <odp_ticketlock.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_sync.h>
>  #include <odp_spin_internal.h>
>
>
>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>  {
> -       ticketlock->next_ticket = 0;
> -       ticketlock->cur_ticket  = 0;
> -       odp_sync_stores();
> +       odp_counter32_init(&ticketlock->next_ticket, 0);
> +       odp_atomic32_init(&ticketlock->cur_ticket, 0);
>  }
>
>
> @@ -22,30 +22,15 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
>  {
>         uint32_t ticket;
>
> -       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
> +       ticket = odp_counter32_read_inc(&ticketlock->next_ticket);
>
> -       while (ticket != ticketlock->cur_ticket)
> +       while (ticket != odp_atomic32_load(&ticketlock->cur_ticket,
> +                                          ODP_MEMORDER_ACQ))
>                 odp_spin();
> -
> -       odp_mem_barrier();
>  }
>
>
>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>  {
> -       odp_sync_stores();
> -
> -       ticketlock->cur_ticket++;
> -
> -#if defined __OCTEON__
> -       odp_sync_stores();
> -#else
> -       odp_mem_barrier();
> -#endif
> -}
> -
> -
> -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
> -{
> -       return ticketlock->cur_ticket != ticketlock->next_ticket;
> +       odp_atomic32_inc(&ticketlock->cur_ticket, ODP_MEMORDER_RLS);
>  }
> diff --git a/platform/linux-generic/odp_timer.c b/platform/linux-generic/odp_timer.c
> index 313c713..fffaa44 100644
> --- a/platform/linux-generic/odp_timer.c
> +++ b/platform/linux-generic/odp_timer.c
> @@ -10,6 +10,7 @@
>  #include <odp_buffer_pool_internal.h>
>  #include <odp_internal.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_spinlock.h>
>  #include <odp_sync.h>
>  #include <odp_debug.h>
> @@ -32,8 +33,8 @@ typedef struct {
>
>  typedef struct {
>         int               allocated;
> -       volatile int      active;
> -       volatile uint64_t cur_tick;
> +       odp_atomic32_t    active;
> +       odp_counter64_t   cur_tick;
>         timer_t           timerid;
>         odp_timer_t       timer_hdl;
>         odp_buffer_pool_t pool;
> @@ -150,16 +151,16 @@ static void notify_function(union sigval sigval)
>
>         timer = sigval.sival_ptr;
>
> -       if (timer->active == 0) {
> +       if (odp_atomic32_load(&timer->active, ODP_MEMORDER_RLX) == 0) {
>                 ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
>                 return;
>         }
>
>         /* ODP_DBG("Tick\n"); */
>
> -       cur_tick = timer->cur_tick++;
> -
> -       odp_sync_stores();
> +       /* Increment and read are not atomic but we are the only writer */
> +       odp_counter64_inc(&timer->cur_tick);
> +       cur_tick = odp_counter64_read(&timer->cur_tick);
>
>         tick = &timer->tick[cur_tick % MAX_TICKS];
>
> @@ -308,6 +309,8 @@ odp_timer_t odp_timer_create(const char *name, odp_buffer_pool_t pool,
>
>         timer_hdl = id + 1;
>
> +       odp_atomic32_init(&timer->active, 0);
> +       odp_counter64_init(&timer->cur_tick, 0);
>         timer->timer_hdl     = timer_hdl;
>         timer->pool          = pool;
>         timer->resolution_ns = resolution_ns;
> @@ -318,8 +321,7 @@ odp_timer_t odp_timer_create(const char *name, odp_buffer_pool_t pool,
>                 timer->tick[i].list = NULL;
>         }
>
> -       timer->active = 1;
> -       odp_sync_stores();
> +       odp_atomic32_store(&timer->active, 1, ODP_MEMORDER_RLS);
>
>         timer_start(timer);
>
> @@ -340,7 +342,7 @@ odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t timer_hdl, uint64_t tmo_tick,
>         id = (int)timer_hdl - 1;
>         timer = &odp_timer.timer[id];
>
> -       cur_tick = timer->cur_tick;
> +       cur_tick = odp_counter64_read(&timer->cur_tick);
>         if (tmo_tick <= cur_tick) {
>                 ODP_DBG("timeout too close\n");
>                 return ODP_TIMER_TMO_INVALID;
> @@ -416,7 +418,7 @@ uint64_t odp_timer_current_tick(odp_timer_t timer_hdl)
>         uint32_t id;
>
>         id = timer_hdl - 1;
> -       return odp_timer.timer[id].cur_tick;
> +       return odp_counter64_read(&odp_timer.timer[id].cur_tick);
>  }
>
>  odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
> diff --git a/test/api_test/Makefile.am b/test/api_test/Makefile.am
> index 5104454..478aa6c 100644
> --- a/test/api_test/Makefile.am
> +++ b/test/api_test/Makefile.am
> @@ -1,12 +1,12 @@
>  include $(top_srcdir)/test/Makefile.inc
>
> -bin_PROGRAMS = odp_atomic odp_shm odp_ring odp_timer_ping
> -odp_atomic_LDFLAGS = $(AM_LDFLAGS) -static
> +bin_PROGRAMS = odp_counter odp_shm odp_ring odp_timer_ping
> +odp_counter_LDFLAGS = $(AM_LDFLAGS) -static
>  odp_shm_LDFLAGS = $(AM_LDFLAGS) -static
>  odp_ring_LDFLAGS = $(AM_LDFLAGS) -static
>  odp_timer_ping_LDFLAGS = $(AM_LDFLAGS) -static
>
> -dist_odp_atomic_SOURCES = odp_atomic_test.c odp_common.c
> +dist_odp_counter_SOURCES = odp_counter_test.c odp_common.c
>  dist_odp_shm_SOURCES = odp_shm_test.c odp_common.c
>  dist_odp_ring_SOURCES = odp_ring_test.c odp_common.c
>  dist_odp_timer_ping_SOURCES = odp_timer_ping.c odp_common.c
> diff --git a/test/api_test/odp_atomic_test.c b/test/api_test/odp_atomic_test.c
> deleted file mode 100644
> index 9019d4f..0000000
> --- a/test/api_test/odp_atomic_test.c
> +++ /dev/null
> @@ -1,362 +0,0 @@
> -/* Copyright (c) 2013, Linaro Limited
> - * All rights reserved.
> - *
> - * SPDX-License-Identifier:     BSD-3-Clause
> - */
> -
> -#include <string.h>
> -#include <sys/time.h>
> -#include <odp_debug.h>
> -#include <odp_common.h>
> -#include <odp_atomic_test.h>
> -
> -static odp_atomic_int_t a32;
> -static odp_atomic_u32_t a32u;
> -static odp_atomic_u64_t a64u;
> -
> -static odp_atomic_int_t numthrds;
> -
> -static const char * const test_name[] = {
> -       "dummy",
> -       "test atomic basic ops add/sub/inc/dec",
> -       "test atomic inc/dec of signed word",
> -       "test atomic add/sub of signed word",
> -       "test atomic inc/dec of unsigned word",
> -       "test atomic add/sub of unsigned word",
> -       "test atomic inc/dec of unsigned double word",
> -       "test atomic add/sub of unsigned double word"
> -};
> -
> -static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
> -
> -static void usage(void)
> -{
> -       printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
> -              "\t<testcase> is\n"
> -              "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
> -              "\t\t2 - Test inc dec of signed word\n"
> -              "\t\t3 - Test add sub of signed word\n"
> -              "\t\t4 - Test inc dec of unsigned word\n"
> -              "\t\t5 - Test add sub of unsigned word\n"
> -              "\t\t6 - Test inc dec of double word\n"
> -              "\t\t7 - Test add sub of double word\n"
> -              "\t<num of pthread> is optional\n"
> -              "\t\t<1 - 31> - no of pthreads to start\n"
> -              "\t\tif user doesn't specify this option, then\n"
> -              "\t\tno of pthreads created is equivalent to no of cores\n"
> -              "\t\tavailable in the system\n"
> -              "\tExample usage:\n"
> -              "\t\t./odp_atomic -t 2\n"
> -              "\t\t./odp_atomic -t 3 -n 12\n");
> -}
> -
> -void test_atomic_inc_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_int(&a32);
> -}
> -
> -void test_atomic_inc_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_u32(&a32u);
> -}
> -
> -void test_atomic_inc_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_u64(&a64u);
> -}
> -
> -void test_atomic_dec_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_int(&a32);
> -}
> -
> -void test_atomic_dec_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_u32(&a32u);
> -}
> -
> -void test_atomic_dec_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_u64(&a64u);
> -}
> -
> -void test_atomic_add_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_add_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_add_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_sub_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_sub_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_sub_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_inc_dec_32(void)
> -{
> -       test_atomic_inc_32();
> -       test_atomic_dec_32();
> -}
> -
> -void test_atomic_add_sub_32(void)
> -{
> -       test_atomic_add_32();
> -       test_atomic_sub_32();
> -}
> -
> -void test_atomic_inc_dec_u32(void)
> -{
> -       test_atomic_inc_u32();
> -       test_atomic_dec_u32();
> -}
> -
> -void test_atomic_add_sub_u32(void)
> -{
> -       test_atomic_add_u32();
> -       test_atomic_sub_u32();
> -}
> -
> -void test_atomic_inc_dec_64(void)
> -{
> -       test_atomic_inc_64();
> -       test_atomic_dec_64();
> -}
> -
> -void test_atomic_add_sub_64(void)
> -{
> -       test_atomic_add_64();
> -       test_atomic_sub_64();
> -}
> -
> -/**
> - * Test basic atomic operation like
> - * add/sub/increment/decrement operation.
> - */
> -void test_atomic_basic(void)
> -{
> -       test_atomic_inc_32();
> -       test_atomic_dec_32();
> -       test_atomic_add_32();
> -       test_atomic_sub_32();
> -
> -       test_atomic_inc_u32();
> -       test_atomic_dec_u32();
> -       test_atomic_add_u32();
> -       test_atomic_sub_u32();
> -
> -       test_atomic_inc_64();
> -       test_atomic_dec_64();
> -       test_atomic_add_64();
> -       test_atomic_sub_64();
> -}
> -
> -void test_atomic_init(void)
> -{
> -       odp_atomic_init_int(&a32);
> -       odp_atomic_init_u32(&a32u);
> -       odp_atomic_init_u64(&a64u);
> -}
> -
> -void test_atomic_store(void)
> -{
> -       odp_atomic_store_int(&a32, S32_INIT_VAL);
> -       odp_atomic_store_u32(&a32u, U32_INIT_VAL);
> -       odp_atomic_store_u64(&a64u, U64_INIT_VAL);
> -}
> -
> -int test_atomic_validate(void)
> -{
> -       if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
> -               ODP_ERR("Atomic signed 32 usual functions failed\n");
> -               return -1;
> -       }
> -
> -       if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
> -               ODP_ERR("Atomic u32 usual functions failed\n");
> -               return -1;
> -       }
> -
> -       if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
> -               ODP_ERR("Atomic u64 usual functions failed\n");
> -               return -1;
> -       }
> -
> -       return 0;
> -}
> -
> -static void *run_thread(void *arg)
> -{
> -       pthrd_arg *parg = (pthrd_arg *)arg;
> -       int thr;
> -
> -       thr = odp_thread_id();
> -
> -       ODP_DBG("Thread %i starts\n", thr);
> -
> -       odp_atomic_inc_int(&numthrds);
> -
> -       /* Wait here until all pthreads are created */
> -       while (*(volatile int *)&numthrds < parg->numthrds)
> -               ;
> -
> -       gettimeofday(&tv0[thr], NULL);
> -
> -       switch (parg->testcase) {
> -       case TEST_MIX:
> -               test_atomic_basic();
> -               break;
> -       case TEST_INC_DEC_S32:
> -               test_atomic_inc_dec_32();
> -               break;
> -       case TEST_ADD_SUB_S32:
> -               test_atomic_add_sub_32();
> -               break;
> -       case TEST_INC_DEC_U32:
> -               test_atomic_inc_dec_u32();
> -               break;
> -       case TEST_ADD_SUB_U32:
> -               test_atomic_add_sub_u32();
> -               break;
> -       case TEST_INC_DEC_64:
> -               test_atomic_inc_dec_64();
> -               break;
> -       case TEST_ADD_SUB_64:
> -               test_atomic_add_sub_64();
> -               break;
> -       }
> -       gettimeofday(&tv1[thr], NULL);
> -       fflush(NULL);
> -
> -       printf("Time taken in thread %02d to complete op is %lld usec\n", thr,
> -              (tv1[thr].tv_sec - tv0[thr].tv_sec) * 1000000ULL +
> -              (tv1[thr].tv_usec - tv0[thr].tv_usec));
> -
> -       return parg;
> -}
> -
> -int main(int argc, char *argv[])
> -{
> -       pthrd_arg thrdarg;
> -       int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
> -       char c;
> -       int result;
> -
> -       if (argc == 1 || argc % 2 == 0) {
> -               usage();
> -               goto err_exit;
> -       }
> -       if (odp_test_global_init() != 0)
> -               goto err_exit;
> -       odp_print_system_info();
> -
> -       while (cnt != 0) {
> -               sscanf(argv[++i], "-%c", &c);
> -               switch (c) {
> -               case 't':
> -                       sscanf(argv[++i], "%d", &test_type);
> -                       break;
> -               case 'n':
> -                       sscanf(argv[++i], "%d", &pthrdnum);
> -                       break;
> -               default:
> -                       ODP_ERR("Invalid option %c\n", c);
> -                       usage();
> -                       goto err_exit;
> -               }
> -               if (test_type < TEST_MIX || test_type > TEST_MAX ||
> -                   pthrdnum > odp_sys_core_count()) {
> -                       usage();
> -                       goto err_exit;
> -               }
> -               cnt -= 2;
> -       }
> -       if (pthrdnum == 0)
> -               pthrdnum = odp_sys_core_count();
> -
> -       odp_atomic_init_int(&numthrds);
> -       test_atomic_init();
> -       test_atomic_store();
> -
> -       memset(&thrdarg, 0, sizeof(pthrd_arg));
> -       thrdarg.testcase = test_type;
> -       thrdarg.numthrds = pthrdnum;
> -
> -       if ((test_type > 0) && (test_type < TEST_MAX)) {
> -               printf("%s\n", test_name[test_type]);
> -       } else {
> -               ODP_ERR("Invalid test case [%d]\n", test_type);
> -               usage();
> -               goto err_exit;
> -       }
> -       odp_test_thread_create(run_thread, &thrdarg);
> -
> -       odp_test_thread_exit(&thrdarg);
> -
> -       result = test_atomic_validate();
> -
> -       if (result == 0) {
> -               printf("%s_%d_%d Result:pass\n",
> -                      test_name[test_type], test_type, pthrdnum);
> -       } else {
> -               printf("%s_%d_%d Result:fail\n",
> -                      test_name[test_type], test_type, pthrdnum);
> -       }
> -       return 0;
> -
> -err_exit:
> -       return -1;
> -}
> diff --git a/test/api_test/odp_atomic_test.h b/test/api_test/odp_atomic_test.h
> deleted file mode 100644
> index 7814da5..0000000
> --- a/test/api_test/odp_atomic_test.h
> +++ /dev/null
> @@ -1,60 +0,0 @@
> -/* Copyright (c) 2013, Linaro Limited
> - * All rights reserved.
> - *
> - * SPDX-License-Identifier:     BSD-3-Clause
> - */
> -
> -#ifndef ODP_ATOMIC_TEST_H_
> -#define ODP_ATOMIC_TEST_H_
> -
> -#include <odp.h>
> -#include <odph_linux.h>
> -
> -/**
> - * add_sub_cnt could be any valid value
> - * so to excercise explicit atomic_add/sub
> - * ops. For now using 5..
> - */
> -#define ADD_SUB_CNT    5
> -
> -#define        CNT 500000
> -#define        S32_INIT_VAL    (1UL << 10)
> -#define        U32_INIT_VAL    (1UL << 10)
> -#define        U64_INIT_VAL    (1ULL << 33)
> -
> -typedef enum {
> -       TEST_MIX = 1, /* Must be first test case num */
> -       TEST_INC_DEC_S32,
> -       TEST_ADD_SUB_S32,
> -       TEST_INC_DEC_U32,
> -       TEST_ADD_SUB_U32,
> -       TEST_INC_DEC_64,
> -       TEST_ADD_SUB_64,
> -       TEST_MAX,
> -} odp_test_atomic_t;
> -
> -
> -void test_atomic_inc_dec_32(void);
> -void test_atomic_add_sub_32(void);
> -void test_atomic_inc_dec_u32(void);
> -void test_atomic_add_sub_u32(void);
> -void test_atomic_inc_dec_64(void);
> -void test_atomic_add_sub_64(void);
> -void test_atomic_inc_32(void);
> -void test_atomic_dec_32(void);
> -void test_atomic_add_32(void);
> -void test_atomic_sub_32(void);
> -void test_atomic_inc_u32(void);
> -void test_atomic_dec_u32(void);
> -void test_atomic_add_u32(void);
> -void test_atomic_sub_u32(void);
> -void test_atomic_inc_64(void);
> -void test_atomic_dec_64(void);
> -void test_atomic_add_64(void);
> -void test_atomic_sub_64(void);
> -void test_atomic_init(void);
> -void test_atomic_basic(void);
> -void test_atomic_store(void);
> -int test_atomic_validate(void);
> -
> -#endif /* ODP_ATOMIC_TEST_H_ */
> diff --git a/test/api_test/odp_common.c b/test/api_test/odp_common.c
> index ed1fc97..198fe8f 100644
> --- a/test/api_test/odp_common.c
> +++ b/test/api_test/odp_common.c
> @@ -14,7 +14,6 @@
>  #include <odp.h>
>  #include <odph_linux.h>
>  #include <odp_common.h>
> -#include <odp_atomic_test.h>
>  #include <odp_shm_test.h>
>
>
> diff --git a/test/api_test/odp_counter_test.c b/test/api_test/odp_counter_test.c
> new file mode 100644
> index 0000000..c72328e
> --- /dev/null
> +++ b/test/api_test/odp_counter_test.c
> @@ -0,0 +1,361 @@
> +/* Copyright (c) 2013, Linaro Limited
> + * All rights reserved.
> + *
> + * SPDX-License-Identifier:     BSD-3-Clause
> + */
> +
> +#include <string.h>
> +#include <sys/time.h>
> +#include <odp.h>
> +#include <odp_debug.h>
> +#include <odp_common.h>
> +#include <odph_linux.h>
> +
> +/**
> + * add_sub_cnt could be any valid value
> + * so to exercise explicit atomic_add/sub
> + * ops. For now using 5..
> + */
> +#define ADD_SUB_CNT    5
> +
> +#define        CNT 500000
> +#define        U32_INIT_VAL    (1UL << 10)
> +#define        U64_INIT_VAL    (1ULL << 33)
> +
> +typedef enum {
> +       TEST_MIX = 1, /* Must be first test case num */
> +       TEST_INC_DEC_U32 = 2,
> +       TEST_ADD_SUB_U32 = 3,
> +       TEST_INC_DEC_64 = 4,
> +       TEST_ADD_SUB_64 = 5,
> +       TEST_MAX,
> +} odp_test_counter_t;
> +
> +
> +static uint32_t test_counter_inc_dec_u32(void);
> +static uint32_t test_counter_add_sub_u32(void);
> +static uint32_t test_counter_inc_dec_64(void);
> +static uint32_t test_counter_add_sub_64(void);
> +static uint32_t test_counter_inc_u32(void);
> +static uint32_t test_counter_dec_u32(void);
> +static uint32_t test_counter_add_u32(void);
> +static uint32_t test_counter_sub_u32(void);
> +static uint32_t test_counter_inc_64(void);
> +static uint32_t test_counter_dec_64(void);
> +static uint32_t test_counter_add_64(void);
> +static uint32_t test_counter_sub_64(void);
> +static void test_counter_init(void);
> +static uint32_t test_counter_basic(void);
> +static void test_counter_write(void);
> +static int test_counter_validate(void);
> +
> +static odp_counter32_t a32u;
> +static odp_counter64_t a64u;
> +
> +static odp_barrier_t barrier;
> +
> +static const char * const test_name[] = {
> +       "dummy",
> +       "test atomic counter basic ops add/sub/inc/dec",
> +       "test atomic inc/dec of 32-bit counter",
> +       "test atomic add/sub of 32-bit counter",
> +       "test atomic inc/dec of 64-bit counter",
> +       "test atomic add/sub of 64-bit counter"
> +};
> +
> +static uint64_t accops[MAX_WORKERS];
> +
> +static void usage(void)
> +{
> +       printf("\n./odp_counter -t <testcase> -n <num of threads>\n\n"
> +              "\t<testcase> is\n"
> +              "\t\t1 - Test mix (inc/dec/add/sub on 32- and 64-bit counters)\n"
> +              "\t\t2 - Test inc/dec of 32-bit counter\n"
> +              "\t\t3 - Test add/sub of 32-bit counter\n"
> +              "\t\t4 - Test inc/dec of 64-bit counter\n"
> +              "\t\t5 - Test add/sub of 64-bit counter\n"
> +              "\t<num of thread> is optional\n"
> +              "\t\t<1 - 31> - no of threads to start\n"
> +              "\t\tif user doesn't specify this option, then\n"
> +              "\t\tno of threads created is equivalent to no of cores\n"
> +              "\t\tavailable in the system\n"
> +              "\tExample usage:\n"
> +              "\t\t./odp_counter -t 2\n"
> +              "\t\t./odp_counter -t 3 -n 12\n");
> +}
> +
> +static uint32_t test_counter_inc_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter32_inc(&a32u);
> +       return i;
> +}
> +
> +static uint32_t test_counter_inc_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter64_inc(&a64u);
> +       return i;
> +}
> +
> +static uint32_t test_counter_dec_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter32_add(&a32u, (uint32_t)-1);
> +       return i;
> +}
> +
> +static uint32_t test_counter_dec_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter64_add(&a64u, (uint64_t)-1);
> +       return i;
> +}
> +
> +static uint32_t test_counter_add_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter32_add(&a32u, ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_add_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter64_add(&a64u, ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_sub_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter32_add(&a32u, -ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_sub_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter64_add(&a64u, -ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_inc_dec_u32(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_inc_u32();
> +       nops += test_counter_dec_u32();
> +       return nops;
> +}
> +
> +static uint32_t test_counter_add_sub_u32(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_add_u32();
> +       nops += test_counter_sub_u32();
> +       return nops;
> +}
> +
> +static uint32_t test_counter_inc_dec_64(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_inc_64();
> +       nops += test_counter_dec_64();
> +       return nops;
> +}
> +
> +static uint32_t test_counter_add_sub_64(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_add_64();
> +       nops += test_counter_sub_64();
> +       return nops;
> +}
> +
> +/**
> + * Test basic counter operation like
> + * add/sub/increment/decrement operation.
> + */
> +static uint32_t test_counter_basic(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_inc_u32();
> +       nops += test_counter_dec_u32();
> +       nops += test_counter_add_u32();
> +       nops += test_counter_sub_u32();
> +
> +       nops += test_counter_inc_64();
> +       nops += test_counter_dec_64();
> +       nops += test_counter_add_64();
> +       nops += test_counter_sub_64();
> +
> +       return nops;
> +}
> +
> +static void test_counter_init(void)
> +{
> +       odp_counter32_init(&a32u, 0);
> +       odp_counter64_init(&a64u, 0);
> +}
> +
> +static void test_counter_write(void)
> +{
> +       odp_counter32_write(&a32u, U32_INIT_VAL);
> +       odp_counter64_write(&a64u, U64_INIT_VAL);
> +}
> +
> +static int test_counter_validate(void)
> +{
> +       if (odp_counter32_read(&a32u) != U32_INIT_VAL) {
> +               ODP_ERR("Atomic u32 usual functions failed\n");
> +               return -1;
> +       }
> +
> +       if (odp_counter64_read(&a64u) != U64_INIT_VAL) {
> +               ODP_ERR("Atomic u64 usual functions failed\n");
> +               return -1;
> +       }
> +
> +       return 0;
> +}
> +
> +static void *run_thread(void *arg)
> +{
> +       pthrd_arg *parg = (pthrd_arg *)arg;
> +       int thr;
> +       uint64_t nops = 0;
> +       struct timeval tv0, tv1;
> +
> +       thr = odp_thread_id();
> +
> +       ODP_DBG("Thread %i starts\n", thr);
> +
> +       /* Wait here until all threads have arrived */
> +       /* Use multiple barriers to verify that it handles wrap around and
> +        * has no race conditions which could be exposed when invoked back-
> +        * to-back */
> +       odp_barrier_sync(&barrier);
> +       odp_barrier_sync(&barrier);
> +       odp_barrier_sync(&barrier);
> +       odp_barrier_sync(&barrier);
> +
> +       gettimeofday(&tv0, NULL);
> +
> +       switch (parg->testcase) {
> +       case TEST_MIX:
> +               nops += test_counter_basic();
> +               break;
> +       case TEST_INC_DEC_U32:
> +               nops += test_counter_inc_dec_u32();
> +               break;
> +       case TEST_ADD_SUB_U32:
> +               nops += test_counter_add_sub_u32();
> +               break;
> +       case TEST_INC_DEC_64:
> +               nops += test_counter_inc_dec_64();
> +               break;
> +       case TEST_ADD_SUB_64:
> +               nops += test_counter_add_sub_64();
> +               break;
> +       }
> +       gettimeofday(&tv1, NULL);
> +       accops[thr] = nops;
> +       fflush(NULL);
> +
> +       uint64_t usecs = (tv1.tv_sec - tv0.tv_sec) * 1000000ULL +
> +                        tv1.tv_usec - tv0.tv_usec;
> +       printf("Time taken in thread %02d to complete %"PRIu64" op is "
> +              "%"PRIu64" usec, %"PRIu64" ns/op\n",
> +              thr, nops, usecs, 1000 * usecs / nops);
> +
> +       return parg;
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +       pthrd_arg thrdarg;
> +       int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
> +       char c;
> +       int result;
> +
> +       if (argc == 1 || argc % 2 == 0) {
> +               usage();
> +               goto err_exit;
> +       }
> +       if (odp_test_global_init() != 0)
> +               goto err_exit;
> +       odp_print_system_info();
> +
> +       while (cnt != 0) {
> +               sscanf(argv[++i], "-%c", &c);
> +               switch (c) {
> +               case 't':
> +                       sscanf(argv[++i], "%d", &test_type);
> +                       break;
> +               case 'n':
> +                       sscanf(argv[++i], "%d", &pthrdnum);
> +                       break;
> +               default:
> +                       ODP_ERR("Invalid option %c\n", c);
> +                       usage();
> +                       goto err_exit;
> +               }
> +               if (test_type < TEST_MIX || test_type > TEST_MAX ||
> +                   pthrdnum > odp_sys_core_count()) {
> +                       usage();
> +                       goto err_exit;
> +               }
> +               cnt -= 2;
> +       }
> +       if (pthrdnum == 0)
> +               pthrdnum = odp_sys_core_count();
> +
> +       test_counter_init();
> +       test_counter_write();
> +
> +       memset(&thrdarg, 0, sizeof(pthrd_arg));
> +       thrdarg.testcase = test_type;
> +       thrdarg.numthrds = pthrdnum;
> +
> +       if ((test_type > 0) && (test_type < TEST_MAX)) {
> +               printf("%s\n", test_name[test_type]);
> +       } else {
> +               ODP_ERR("Invalid test case [%d]\n", test_type);
> +               usage();
> +               goto err_exit;
> +       }
> +       odp_barrier_init(&barrier, pthrdnum);
> +       odp_test_thread_create(run_thread, &thrdarg);
> +
> +       odp_test_thread_exit(&thrdarg);
> +
> +       result = test_counter_validate();
> +
> +       if (result == 0) {
> +               printf("%s_%d_%d Result:pass\n",
> +                      test_name[test_type], test_type, pthrdnum);
> +       } else {
> +               printf("%s_%d_%d Result:fail\n",
> +                      test_name[test_type], test_type, pthrdnum);
> +       }
> +       return 0;
> +
> +err_exit:
> +       return -1;
> +}
> --
> 1.9.1
>
>
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/lng-odp
>
vkamensky Nov. 5, 2014, 7:12 p.m. UTC | #3
Hi Ola,

Below is an example of one issue I noticed.

If you would post arm counter implementation in separate
patch, I would be able to comment on all relevant places,
but with current huge patch I just give one example. Also if it
would be separate patch, we would be able to work on it in
separate thread, till we get it right. But if you would continue
to repost big whole patch with each iteration working on this
issue, it would be quite a pain for me to find out what has
changed, although I would just have one thing in mind - counter
ARM V7 implementation.

> +/**
> + * Atomic add to 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, %0, %3\t\n"
> +                                "adc    %H0, %H3\t\n"
> +                                "strexd %1, %0, %H0, [%2]"

Above looks very wrong to me. Did you test that on BE
system? Please see

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2245f92498b216b50e744423bde17626287409d8

You should use %Q %R instead of %H.

The same issue exists in the other ARM V7 counter64
functions.

Thanks,
Victor

> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(incr)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
> +#elif defined __OCTEON__
> +       __asm __volatile("saad %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +/* Warning odp_counter64_add() may not be efficiently implemented */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
vkamensky Nov. 5, 2014, 8:26 p.m. UTC | #4
Hi Ola,

Please see below general comment about your approach.

> +typedef enum {
> +       /** Relaxed memory order, no ordering of other accesses enforced */
> +       ODP_MEMORDER_RLX,
> +       /** Acquire memory order, later accesses cannot move before
> +        * acquire operation */
> +       ODP_MEMORDER_ACQ,
> +       /** Release memory order, earlier accesses cannot move after
> +        * release operation */
> +       ODP_MEMORDER_RLS
> +} odp_memorder_t;

Why do you have 3 memory models while C11
has 6?

Are you aware of the gcc __atomic builtin extension?
https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
They follow C11 quite closely with few caveats (like run-time
vs built time). Why don't you use those, instead of implementing
them by yourself?

What is the need to reimplement those in ODP?

I am not sure, it seems to me that Petri raised similar point.
In ODP we need to provide atomic operations that map to
possible h/w accelerations that SOCs come up with, like
Octeon atomic and ARM8.1A atomic instructions. If we just
need atomic function for general purpose memory, those
IMHO should come up from somewhere else (like compiler,
C11 implementation, etc).

Basically if linux-generic needs
some of atomic C11 style primitives it could use __atomic
builtins directly or with some tiny wrapup api, that deals
with gcc/clang differences, but those should not be exposed
as ODP apis. I think that point was raised during discussion.

BTW linux-generic atomic implementation (one that implements
ODP API) could use __atomic with relaxed option to do required
operation, but without barrier, if you are not happy with current
implementation that uses old __sync builtins.

Thanks,
Victor
Maxim Uvarov Nov. 5, 2014, 10:46 p.m. UTC | #5
Hello,

I might be too late for that discussion but I think we need to summarize 
all discussion
in all patch threads. Otherwise we are going in loop and Ola needs to do 
that hard work
again and again.

So I think the summary is:
1. ODP supports C99, not C11 and that is why we can not use C11 built in 
atomic operations.
If we think that C99 is bad idea (atomic things can be reason for that) 
we can discuss supporting C11.

2. Yes we need odp abstractions for atomic operations. People like to 
use use-cases and follow examples.
And to not dependent on gcc/glibc is also great benefit. So you can wait 
for changes
in upstream gcc or correct function in your odp arch implementation.

The other case is why linux-generic has to have Octeon (for example) 
optimized code if Octeon SoC is not supported in linux-generic.
I would like to see assembly code in linux-octeon rather than in linux-generic. 
And linux-generic should use gcc built in function. Gcc functions
might not be optimal but they are generic. There is only one reason that 
other platforms can reuse that code. And probably it's
good idea to provide effective functions (I think there are not many 
experts in memory models in atomic operations.)

Abstractions should be very useful for implementations. You just can see 
which scope of work you need to redo in other arch.

3. Ola's patch supports 3 memory models which cover all our hw. I think 
it's good for now. If somebody has other mode then he is
free to go to add or use gcc built in function like Ola did for x86.


Summarizing 1,2 and 3 I think we have options 1. accept Ola's work. 2. 
Switch to C11 and ask Ola to reuse C11 gcc built ins where
it's possible.

Thanks,
Maxim.


On 11/05/2014 11:26 PM, Victor Kamensky wrote:
> Hi Ola,
>
> Please see below general comment about your approach.
>
>> +typedef enum {
>> +       /** Relaxed memory order, no ordering of other accesses enforced */
>> +       ODP_MEMORDER_RLX,
>> +       /** Acquire memory order, later accesses cannot move before
>> +        * acquire operation */
>> +       ODP_MEMORDER_ACQ,
>> +       /** Release memory order, earlier accesses cannot move after
>> +        * release operation */
>> +       ODP_MEMORDER_RLS
>> +} odp_memorder_t;
> Why do you have 3 memory models while C11
> has 6?
>
> Are you aware about gcc __atomic built extenstion?
> https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
> They follow C11 quite closely with few caveats (like run-time
> vs built time). Why don't you use those, instead of implementing
> them by yourself?
>
> What is the need to reimplement those in ODP?
>
> I am not sure, it seems to me that Petri raised similar point.
> In ODP we need to provide atomic operations that map to
> possible h/w accelerations that SOCs come up with, like
> Octeon atomic and ARM8.1A atomic instructions. If we just
> need atomic function for general purpose memory, those
> IMHO should come up from somewhere else (like compiler,
> C11 implementation, etc).
>
> Basically if linux-generic needs
> some of atomic C11 style primitives it could use __atomic
> builtins directly or with some tiny wrapup api, that deals
> with gcc/clang differences, but those should not be exposed
> as ODP apis. I think that point was raised during discussion.
>
> BTW linux-generic atomic implementation (one that implements
> ODP API) could use __atomic with relaxed option to do required
> operation, but without barrier, if you are not happy with current
> implementation that uses old __sync builtins.
>
> Thanks,
> Victor
>
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/lng-odp
Mike Holmes Nov. 5, 2014, 10:54 p.m. UTC | #6
On 5 November 2014 15:26, Victor Kamensky <victor.kamensky@linaro.org>
wrote:

> Hi Ola,
>
> Please see below general comment about your approach.
>
> > +typedef enum {
> > +       /** Relaxed memory order, no ordering of other accesses enforced
> */
> > +       ODP_MEMORDER_RLX,
> > +       /** Acquire memory order, later accesses cannot move before
> > +        * acquire operation */
> > +       ODP_MEMORDER_ACQ,
> > +       /** Release memory order, earlier accesses cannot move after
> > +        * release operation */
> > +       ODP_MEMORDER_RLS
> > +} odp_memorder_t;
>
> Why do you have 3 memory models while C11
> has 6?
>
> Are you aware about gcc __atomic built extenstion?
> https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
> They follow C11 quite closely with few caveats (like run-time
> vs built time). Why don't you use those, instead of implementing
> them by yourself?
>

If we plan to be compiler agnostic do we want to add more gcc mechanisms ?


>
> What is the need to reimplement those in ODP?
>
> I am not sure, it seems to me that Petri raised similar point.
> In ODP we need to provide atomic operations that map to
> possible h/w accelerations that SOCs come up with, like
> Octeon atomic and ARM8.1A atomic instructions. If we just
> need atomic function for general purpose memory, those
> IMHO should come up from somewhere else (like compiler,
> C11 implementation, etc).
>
> Basically if linux-generic needs
> some of atomic C11 style primitives it could use __atomic
> builtins directly or with some tiny wrapup api, that deals
> with gcc/clang differences, but those should not be exposed
> as ODP apis. I think that point was raised during discussion.
>
> BTW linux-generic atomic implementation (one that implements
> ODP API) could use __atomic with relaxed option to do required
> operation, but without barrier, if you are not happy with current
> implementation that uses old __sync builtins.
>
> Thanks,
> Victor
>
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/lng-odp
>
vkamensky Nov. 5, 2014, 11:38 p.m. UTC | #7
On 5 November 2014 14:54, Mike Holmes <mike.holmes@linaro.org> wrote:
>
>
> On 5 November 2014 15:26, Victor Kamensky <victor.kamensky@linaro.org>
> wrote:
>>
>> Hi Ola,
>>
>> Please see below general comment about your approach.
>>
>> > +typedef enum {
>> > +       /** Relaxed memory order, no ordering of other accesses enforced
>> > */
>> > +       ODP_MEMORDER_RLX,
>> > +       /** Acquire memory order, later accesses cannot move before
>> > +        * acquire operation */
>> > +       ODP_MEMORDER_ACQ,
>> > +       /** Release memory order, earlier accesses cannot move after
>> > +        * release operation */
>> > +       ODP_MEMORDER_RLS
>> > +} odp_memorder_t;
>>
>> Why do you have 3 memory models while C11
>> has 6?
>>
>> Are you aware about gcc __atomic built extenstion?
>> https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
>> They follow C11 quite closely with few caveats (like run-time
>> vs built time). Why don't you use those, instead of implementing
>> them by yourself?
>
>
> If we plan to be compiler agnostic do we want to add more gcc mechanisms ?

I wrote about that below - use tiny wrappers; there will
be similar functionality in another compiler ...
I.e look for "__c11_atomic builtins"
http://clang.llvm.org/docs/LanguageExtensions.html

And current and Ola's code is already doing it with __sync
atomics, those are gcc mechanisms. What is different
if __atomic gcc builtin will be used?

Implementing general purpose C11 atomics in ODP
does not make sense to me. IMHO it does not belong
here. ODP is about h/w accelerators abstraction. Where
are h/w accelerators in "C11" (note in quotes) atomics
implementation?

Thanks,
Victor

>>
>>
>> What is the need to reimplement those in ODP?
>>
>> I am not sure, it seems to me that Petri raised similar point.
>> In ODP we need to provide atomic operations that map to
>> possible h/w accelerations that SOCs come up with, like
>> Octeon atomic and ARM8.1A atomic instructions. If we just
>> need atomic function for general purpose memory, those
>> IMHO should come up from somewhere else (like compiler,
>> C11 implementation, etc).
>>
>> Basically if linux-generic needs
>> some of atomic C11 style primitives it could use __atomic
>> builtins directly or with some tiny wrapup api, that deals
>> with gcc/clang differences, but those should not be exposed
>> as ODP apis. I think that point was raised during discussion.
>>
>> BTW linux-generic atomic implementation (one that implements
>> ODP API) could use __atomic with relaxed option to do required
>> operation, but without barrier, if you are not happy with current
>> implementation that uses old __sync builtins.
>>
>> Thanks,
>> Victor
>>
>> _______________________________________________
>> lng-odp mailing list
>> lng-odp@lists.linaro.org
>> http://lists.linaro.org/mailman/listinfo/lng-odp
>
>
>
>
> --
> Mike Holmes
> Linaro  Sr Technical Manager
> LNG - ODP
Savolainen, Petri (NSN - FI/Espoo) Nov. 6, 2014, 8:20 a.m. UTC | #8
> -----Original Message-----
> From: lng-odp-bounces@lists.linaro.org [mailto:lng-odp-
> bounces@lists.linaro.org] On Behalf Of ext Victor Kamensky
> Sent: Thursday, November 06, 2014 1:38 AM
> To: Mike Holmes
> Cc: lng-odp@lists.linaro.org
> Subject: Re: [lng-odp] [ODP/PATCH v2] Look ma, no barriers! C11 memory
> model
> 
> On 5 November 2014 14:54, Mike Holmes <mike.holmes@linaro.org> wrote:
> >
> >
> > On 5 November 2014 15:26, Victor Kamensky <victor.kamensky@linaro.org>
> > wrote:
> >>
> >> Hi Ola,
> >>
> >> Please see below general comment about your approach.
> >>
> >> > +typedef enum {
> >> > +       /** Relaxed memory order, no ordering of other accesses
> enforced
> >> > */
> >> > +       ODP_MEMORDER_RLX,
> >> > +       /** Acquire memory order, later accesses cannot move before
> >> > +        * acquire operation */
> >> > +       ODP_MEMORDER_ACQ,
> >> > +       /** Release memory order, earlier accesses cannot move after
> >> > +        * release operation */
> >> > +       ODP_MEMORDER_RLS
> >> > +} odp_memorder_t;
> >>
> >> Why do you have 3 memory models while C11
> >> has 6?
> >>
> >> Are you aware about gcc __atomic built extenstion?
> >> https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
> >> They follow C11 quite closely with few caveats (like run-time
> >> vs built time). Why don't you use those, instead of implementing
> >> them by yourself?
> >
> >
> > If we plan to be compiler agnostic do we want to add more gcc mechanisms
> ?
> 
> I wrote aboute that below - use tiny wrappers; there will
> be similar functionality in another compiler ...
> I.e look for "__c11_atomic builtins"
> http://clang.llvm.org/docs/LanguageExtensions.html
> 
> And current and Ola's code is already doing it with __sync
> atomics, those are gcc mechanisms. What is different
> if __atomic gcc builtin will be used?
> 
> Implementing general purpose C11 atomics in ODP
> does not make sense to me. IMHO it does not belong
> here. ODP is about h/w accelerators abstraction. Where
> are h/w accelerators in "C11" (note in quotes) atomics
> implementation?
> 
> Thanks,
> Victor
> 

This is exactly the point I have been trying to make on this "C11 atomics" thread. Maybe today, the C11 style atomics fit the ARMv8.0 ISA perfectly, but on the day the ARM ISA gets proper "far atomics", they will not be optimal any more. The atomics API is targeting "the multi-core scalable" way of incrementing those in-memory counters. That process does not include an acq/rel retry cycle.

As Victor and I have noted, SW lock implementation abstraction is not a hugely important goal for the ODP API. GCC __atomic already provides a pretty good abstraction for that. If the user really cares about a lock (or lock-free algorithm) implementation, it's better to write it in assembly and take away all chances for any abstraction to spoil the algorithm.


-Petri
Ola Liljedahl Nov. 7, 2014, 11:30 p.m. UTC | #9
On 5 November 2014 20:12, Victor Kamensky <victor.kamensky@linaro.org> wrote:
> Hi Ola,
>
> Below is an example of one issue I noticed.
>
> If you would post arm counter implementation in separate
> patch, I would be able to comment on all relevant places,
> but with current huge patch I just give one example. Also if it
> would be separate patch, we would be able to work on it in
> separate thread, till we get it right. But if you would continue
> to repost big whole patch with each iteration working on this
> issue, it would be quite a pain for me to find out what has
> changed, although I would just had one thing in mind - counter
> ARM V7 implementation.
I could send preliminary patches with just the ARMv7 implementation
but the complete patch should have support for all architectures, I
think. At least ARMv7, ARMv8/AArch64, MIPS64/OCTEON and x86(-64)
because those are already supported in the existing code and I don't want
to be accused of removing functionality.

>
>> +/**
>> + * Atomic add to 64-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + * @param incr  The value to be added to the counter variable
>> + */
>> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>> +                                "adds   %0, %0, %3\t\n"
>> +                                "adc    %H0, %H3\t\n"
>> +                                "strexd %1, %0, %H0, [%2]"
>
> Above looks very wrong to me. Did you test that on BE
> system? Please see
No, I did not test on a BE system as I don't have any such system available. I was reading a
page describing GCC inline assembler for ARM. Either it did not mention BE
(quite possible as few people have used ARM in BE mode before) or I missed it.

>
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2245f92498b216b50e744423bde17626287409d8
>
> You should use %Q %R instead of %H.
Thanks. Just to be sure, using %Q %R is endian neutral so I won't need any
ifdef for big and little endian?


>
> The same issue exist other ARM V7 counter64
> functions.
OK.

>
> Thanks,
> Victor
>
>> +                                : "=&r"(old_val), "=&r"(status)
>> +                                : "r"(&ptr->v), "r"(incr)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>> +#elif defined __OCTEON__
>> +       __asm __volatile("saad %[inc], (%[base])"
>> +                        : "+m" (*ptr)
>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>> +                        : );
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +/* Warning odp_counter64_add() may not be efficiently implemented */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#endif
Ola Liljedahl Nov. 7, 2014, 11:48 p.m. UTC | #10
On 6 November 2014 00:38, Victor Kamensky <victor.kamensky@linaro.org> wrote:
> On 5 November 2014 14:54, Mike Holmes <mike.holmes@linaro.org> wrote:
>>
>>
>> On 5 November 2014 15:26, Victor Kamensky <victor.kamensky@linaro.org>
>> wrote:
>>>
>>> Hi Ola,
>>>
>>> Please see below general comment about your approach.
>>>
>>> > +typedef enum {
>>> > +       /** Relaxed memory order, no ordering of other accesses enforced
>>> > */
>>> > +       ODP_MEMORDER_RLX,
>>> > +       /** Acquire memory order, later accesses cannot move before
>>> > +        * acquire operation */
>>> > +       ODP_MEMORDER_ACQ,
>>> > +       /** Release memory order, earlier accesses cannot move after
>>> > +        * release operation */
>>> > +       ODP_MEMORDER_RLS
>>> > +} odp_memorder_t;
>>>
>>> Why do you have 3 memory models while C11
>>> has 6?
>>>
>>> Are you aware about gcc __atomic built extenstion?
>>> https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
No I wasn't aware of this. Thanks for the link, I will have a deep look.
Possibly all my atomics work is redundant, that is no problem to me as I
learnt a lot doing it. I can kill my darlings any time.

>>> They follow C11 quite closely with few caveats (like run-time
>>> vs built time). Why don't you use those, instead of implementing
>>> them by yourself?
Because of ignorance. Somehow my searches never found this page,
just the pages on the __sync builtins and the pages on C11. I guess it
depends on what you search for.

>>
>>
>> If we plan to be compiler agnostic do we want to add more gcc mechanisms ?
>
> I wrote aboute that below - use tiny wrappers; there will
> be similar functionality in another compiler ...
> I.e look for "__c11_atomic builtins"
> http://clang.llvm.org/docs/LanguageExtensions.html
>
> And current and Ola's code is already doing it with __sync
> atomics, those are gcc mechanisms. What is different
> if __atomic gcc builtin will be used?
>
> Implementing general purpose C11 atomics in ODP
> does not make sense to me. IMHO it does not belong
> here. ODP is about h/w accelerators abstraction. Where
> are h/w accelerators in "C11" (note in quotes) atomics
> implementation?
Because a lot of the ODP components requires atomic operations.
The thread barrier, the different locks, the "lock-less" ring, the timer etc.
They don't use them today, instead either using heavy __sync builtins
and/or using barriers. Or not using atomics at all (e.g. the timer
implementation)
which is wrong and creates race conditions. Barriers are so 1990's...
People mostly get them wrong as well (too few or too many). And barriers are bad
for performance as well. There are other problems with the current ad-hoc
model, e.g. a load-barrier (e.g. odp_synch_loads) is missing (needed
when acquiring locks). Actually the user (e.g. lock implementer) does not
need load and store barriers but one-sided acquire and release barriers
which are semantically different (they order both types of accesses, not just
loads or stores, it is the direction which is important).

The C11/C++11 memory model is well-defined and target agnostic. Using
C11 atomics enables the programmer to work on a higher abstraction level
which better describes the intent of the user and can then be mapped to
different underlying implementations. Using HW specific operations is not
the way to program efficiently.

odp_atomic.h does not have to be a public API if it is only used by the ODP
implementation. Perhaps we don't need any ODP API's (internal or public) for
this at all if the ODP implementations may use compiler specific functionality
without wrappers.

-- Ola

>
> Thanks,
> Victor
>
>>>
>>>
>>> What is the need to reimplement those in ODP?
>>>
>>> I am not sure, it seems to me that Petri raised similar point.
>>> In ODP we need to provide atomic operations that map to
>>> possible h/w accelerations that SOCs come up with, like
>>> Octeon atomic and ARM8.1A atomic instructions. If we just
>>> need atomic function for general purpose memory, those
>>> IMHO should come up from somewhere else (like compiler,
>>> C11 implementation, etc).
>>>
>>> Basically if linux-generic needs
>>> some of atomic C11 style primitives it could use __atomic
>>> builtins directly or with some tiny wrapup api, that deals
>>> with gcc/clang differences, but those should not be exposed
>>> as ODP apis. I think that point was raised during discussion.
>>>
>>> BTW linux-generic atomic implementation (one that implements
>>> ODP API) could use __atomic with relaxed option to do required
>>> operation, but without barrier, if you are not happy with current
>>> implementation that uses old __sync builtins.
>>>
>>> Thanks,
>>> Victor
>>>
>>> _______________________________________________
>>> lng-odp mailing list
>>> lng-odp@lists.linaro.org
>>> http://lists.linaro.org/mailman/listinfo/lng-odp
>>
>>
>>
>>
>> --
>> Mike Holmes
>> Linaro  Sr Technical Manager
>> LNG - ODP
Ola Liljedahl Nov. 8, 2014, 12:16 a.m. UTC | #11
On 6 November 2014 09:20, Savolainen, Petri (NSN - FI/Espoo)
<petri.savolainen@nsn.com> wrote:
>
>
>> -----Original Message-----
>> From: lng-odp-bounces@lists.linaro.org [mailto:lng-odp-
>> bounces@lists.linaro.org] On Behalf Of ext Victor Kamensky
>> Sent: Thursday, November 06, 2014 1:38 AM
>> To: Mike Holmes
>> Cc: lng-odp@lists.linaro.org
>> Subject: Re: [lng-odp] [ODP/PATCH v2] Look ma, no barriers! C11 memory
>> model
>>
>> On 5 November 2014 14:54, Mike Holmes <mike.holmes@linaro.org> wrote:
>> >
>> >
>> > On 5 November 2014 15:26, Victor Kamensky <victor.kamensky@linaro.org>
>> > wrote:
>> >>
>> >> Hi Ola,
>> >>
>> >> Please see below general comment about your approach.
>> >>
>> >> > +typedef enum {
>> >> > +       /** Relaxed memory order, no ordering of other accesses
>> enforced
>> >> > */
>> >> > +       ODP_MEMORDER_RLX,
>> >> > +       /** Acquire memory order, later accesses cannot move before
>> >> > +        * acquire operation */
>> >> > +       ODP_MEMORDER_ACQ,
>> >> > +       /** Release memory order, earlier accesses cannot move after
>> >> > +        * release operation */
>> >> > +       ODP_MEMORDER_RLS
>> >> > +} odp_memorder_t;
>> >>
>> >> Why do you have 3 memory models while C11
>> >> has 6?
>> >>
>> >> Are you aware about gcc __atomic built extenstion?
>> >> https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
>> >> They follow C11 quite closely with few caveats (like run-time
>> >> vs built time). Why don't you use those, instead of implementing
>> >> them by yourself?
>> >
>> >
>> > If we plan to be compiler agnostic do we want to add more gcc mechanisms
>> ?
>>
>> I wrote aboute that below - use tiny wrappers; there will
>> be similar functionality in another compiler ...
>> I.e look for "__c11_atomic builtins"
>> http://clang.llvm.org/docs/LanguageExtensions.html
>>
>> And current and Ola's code is already doing it with __sync
My code uses only __sync builtins if it is compiled for some
unknown/unsupported architecture. But yes it does create a
potential compiler portability problem (but at least clang seems
good at implementing GCC extensions). Using a standard mechanism
such as C11 atomics would avoid this...

>> atomics, those are gcc mechanisms. What is different
>> if __atomic gcc builtin will be used?
>>
>> Implementing general purpose C11 atomics in ODP
>> does not make sense to me. IMHO it does not belong
>> here. ODP is about h/w accelerators abstraction. Where
>> are h/w accelerators in "C11" (note in quotes) atomics
>> implementation?
>>
>> Thanks,
>> Victor
>>
>
> This is exactly the point I have been trying to make on this "C11 atomics" thread. Maybe today, the C11 style atomics fit ARMv8.0 ISA perfectly, but the day when ARM ISA will have proper "far atomics" - it's not optimal any more. The atomics API is targeting "the multi-core scalable" way of incrementing those in  memory counters. That process does not include aqc/rel retry cycle.
If the current odp_atomics.h is intended only for counters, then both the
name and the implementation are wrong.

Acquire/release has nothing to do with LL/SC. Acquire and release
are memory orderings which can be associated with any atomic operation
(they don't make sense for non-atomic operations). ARMv8 load-acquire
is a load instruction that can be used e.g. in ticketlock_lock() when waiting
for the 'current' variable to become equal to your ticket. Memory accesses
after this load must be prevented from moving up before the load-acquire.
Memory accesses before this load-acquire are allowed to move down after
load-acquire. A DMB or sync (PPC or MIPS) is unnecessarily heavy, why
wait for *all* preceding stores to be globally observable before we can acquire
the lock? A "far" atomic update with release ordering makes sense when
incrementing the ticketlock 'current' variable in order to release the lock.
This avoids the DMB or SYNC before the increment operation. We have
benchmarks that show the detrimental effects of full barriers.


My odp_counter.h API uses relaxed memory order. fetch_and_add, add,
fetch_and_inc etc. can be mapped directly to the corresponding atomic instructions
if such are available. See the implementation for OCTEON that uses
laa, saa, lai etc.


>
> As Victor and I have noted, SW lock implementation abstraction is not hugely important goal for ODP API. GCC __atomic provide already pretty good abstraction for that. If user really cares about lock (or lock free algorithm) implementation, it's better to write it in assembly and takeout all changes from any abstraction to spoil the algorithm.
I disagree 100% with this. There is no need to write anything at
all in assembler. The inline assembler in the atomics implementation
could be replaced by the proper compiler support. Indeed I asked if
we couldn't relax our requirement of C99 compatibility and allow C11
usage in the implementation as well. But this was denied so I set out
to recreate the necessary support in a C99 compliant way. Victor has
pointed to a different approach which avoids the usage of a proprietary
atomics API and I will have a look at this.

I also believe that SW lock and synchronization performance will be very
important for some ODP implementations and I prefer not to have to reimplement
all of linux-generic just to be able to do it in a more efficient and
scalable way.
Doing it in linux-generic will also benefit many others; many ODP implementations
might borrow SW implementations from linux-generic.


-- Ola

>
>
> -Petri
>
>
>
>
>
>
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/lng-odp
Savolainen, Petri (NSN - FI/Espoo) Nov. 10, 2014, 8:57 a.m. UTC | #12
> > This is exactly the point I have been trying to make on this "C11
> atomics" thread. Maybe today, the C11 style atomics fit ARMv8.0 ISA
> perfectly, but the day when ARM ISA will have proper "far atomics" - it's
> not optimal any more. The atomics API is targeting "the multi-core
> scalable" way of incrementing those in  memory counters. That process does
> not include aqc/rel retry cycle.
> If the current odp_atomics.h is indented only for counters, then both the
> name and the implementation are wrong.

"atomic" is more familiar for people that have used these (far) atomic instructions before (e.g. "atomic add" in ISA, not "counter add"). It's also in align with similar kernel and DPDK APIs, which makes porting work easier between these three. I think either one could be used, but "atomic" is more familiar.


> 
> Acquire/release has nothing to do with LL/SC. Acquire and release
> are memory orderings which can be associated with any atomic operation
> (they don't make sense for non-atomic operations).

Any atomic? Meaning also the "far atomic" instructions? If an application uses only far atomics (other atomics are used through various lock implementations), does it still need to define acq/rel ordering?

> ARMv8 load-acquire
> is a load instruction that can be used e.g. in ticketlock_lock() when
> waiting
> for the 'current' variable to become equal to your ticket. Memory accesses
> after this load must be prevented from moving up before the load-acquire.
> Memory accesses before this load-acquire are allowed to move down after
> load-acquire. A DMB or sync (PPC or MIPS) is unnecessarily heavy, why
> wait for *all* preceding stores to be globally observable before we can
> acquire
> the lock? A "far" atomic update with release ordering makes sense when
> incrementing the ticketlock 'current' variable in order to release the
> lock.
> This avoid the DMB or SYNC before the increment operation. We have
> benchmarks that should the detrimental effects of full barriers.

This is all correct for a lock implementation. When implementing different locks for ARMv8 you should take advantage of those features of ISA. You can optimize lock implementations for ARM as you wish.

We are now discussing whether the API needs to expose the acq/rel/etc. C11 memory models. I think it should not; that is too low-level a detail.

> 
> 
> My odp_counter.h API uses relaxed memory order. fetch_and_add, add,
> fetch_and_inc etc can be mapped directly to atomic corresponding
> instructions
> if such are available. See the implementation for OCTEON that uses
> laa, saa, lai etc.
> 
> 
> >
> > As Victor and I have noted, SW lock implementation abstraction is not
> hugely important goal for ODP API. GCC __atomic provide already pretty
> good abstraction for that. If user really cares about lock (or lock free
> algorithm) implementation, it's better to write it in assembly and takeout
> all changes from any abstraction to spoil the algorithm.
> I disagree 100% with this. There is no need to write anything at
> all in assembler. The inline assembler in the atomics implementation
> could be replaced by the proper compiler support. Indeed I asked if
> we couldn't relax our requirement of C99 compatibility and allow C11
> usage in the implementation as well. But this as denied so I set out
> to recreate the necessary support in a C99 compliant way. Victor has
> pointed to a different approach which avoids the usage of a proprietary
> atomics API and I will have a look at this.

Victor and I have mentioned the __atomic built-ins in this same context many times before (during the past months). It's an implementation trade-off whether one uses __atomic or direct assembly (abstraction vs full control). Abstraction comes with a cost - e.g. you cannot be sure that a "relaxed __atomic add by one" always uses the optimal "atomic increment" instruction on all compiler versions, etc. It may generate a functionally correct but less scalable sequence of instructions (e.g. by not using far atomics).

As said above, GCC __atomic is a pretty good abstraction towards C11 atomics, and thus the ODP API does not have to duplicate it. It's also safe to use __atomic in the linux-generic implementation.


> 
> I also believe that SW lock and synchronization performance will be very
> important for some ODP implementations and I prefer not have reimplement
> all of linux-generic just to be able to do it in a more efficient and
> scalable way.
> Doing it in linux-generic will also benefit many others, many ODP
> implementation
> might borrow SW-implementations from linux-generic.
>

True. You can #ifdef and optimize all lock/barrier implementations for ARM in linux-generic without changing the API. 

-Petri
Ola Liljedahl Nov. 10, 2014, 10:47 a.m. UTC | #13
On 10 November 2014 09:57, Savolainen, Petri (NSN - FI/Espoo)
<petri.savolainen@nsn.com> wrote:
>> > This is exactly the point I have been trying to make on this "C11
>> atomics" thread. Maybe today, the C11 style atomics fit ARMv8.0 ISA
>> perfectly, but the day when ARM ISA will have proper "far atomics" - it's
>> not optimal any more. The atomics API is targeting "the multi-core
>> scalable" way of incrementing those in  memory counters. That process does
>> not include aqc/rel retry cycle.
>> If the current odp_atomics.h is indented only for counters, then both the
>> name and the implementation are wrong.
>
> "atomic" is more familiar for people that have used these (far) atomic instructions before (e.g. "atomic add" in ISA, not "counter add"). It's also in align with similar kernel and DPDK APIs, which makes porting work easier between these three. I think either one could be used, but "atomic" is more familiar.
Naming user-level APIs after some machine instruction that may be
used to implement a part of the function seems like a bad choice to
me.

There will be an odp_atomic.h as well but this API will allow the user
to specify the memory model for the atomic access. Just like you do in
C11/C++11. Time to learn the new ways of doing multithreaded
programming. It is actually a lot simpler than using barriers.

Since the shared counters cannot be used for synchronization (because
they don't guarantee any memory model), they are not called atomics.


>
>
>>
>> Acquire/release has nothing to do with LL/SC. Acquire and release
>> are memory orderings which can be associated with any atomic operation
>> (they don't make sense for non-atomic operations).
>
> Any atomic? Meaning also the "far atomic" instructions? If application uses only far atomics (others atomics are used through various lock implementations), does it need to define acq/rel ordering still?
In ARMv8, acquire and release memory models are options to most if not
all instructions that perform memory accesses, be they loads, stores
or atomic RMW operations. If you don't specify acquire or release
ordering, you get relaxed ordering.

>
>> ARMv8 load-acquire
>> is a load instruction that can be used e.g. in ticketlock_lock() when
>> waiting
>> for the 'current' variable to become equal to your ticket. Memory accesses
>> after this load must be prevented from moving up before the load-acquire.
>> Memory accesses before this load-acquire are allowed to move down after
>> load-acquire. A DMB or sync (PPC or MIPS) is unnecessarily heavy, why
>> wait for *all* preceding stores to be globally observable before we can
>> acquire
>> the lock? A "far" atomic update with release ordering makes sense when
>> incrementing the ticketlock 'current' variable in order to release the
>> lock.
>> This avoid the DMB or SYNC before the increment operation. We have
>> benchmarks that should the detrimental effects of full barriers.
>
> This is all correct for a lock implementation. When implementing different locks for ARMv8 you should take advantage of those features of ISA. You can optimize lock implementations for ARM as you wish.
The implementation of locks and other functions that use atomic types
and operations can be generic and target independent (on a source code
level) by using C11-style atomics. odp_atomic(_internal).h will then
translate the operations to what is required on the actual hardware
(and may thus use barriers).

Why not implement ODP components in a well-defined and hardware
independent way (and more robust as well me thinks, the current lack
of and redundant use of barriers shows the problems with using an
ad-hoc approach to multithreaded programming)? The lock code will
express the intent of different operations (e.g. here we acquire or
release the data associated with the lock flag) and this is
automatically translated to the most efficient implementation for
target HW. When you realize that every lock function will need a load
with acquire ordering, every unlock function will need a store with
release ordering and every atomic RMW (e.g. the ring enqueue/dequeue
operations) will need both acquire and release operations,
multithreaded programming becomes simpler and easier to get correct.

>
> We are now discussing whether API needs to expose acq/rel/etc C11 memory models. I think it should not, it's too low level detail.
So perhaps odp_atomic.h should be odp_atomic_internal.h? I am OK with
this. But the users of atomics (e.g. lock implementors) need to know
when to use the different memory models. Just as they would have to
know when to use a barrier which I mean is an even lower level detail.

The lock-less timer implementation requires proper atomic operations
with relaxed, acquire, release and possibly SC (when there are
multiple consumers of your stores) memory models. This is how you do
multithreaded programming in a portable way without barriers/fences.


>
>>
>>
>> My odp_counter.h API uses relaxed memory order. fetch_and_add, add,
>> fetch_and_inc etc can be mapped directly to atomic corresponding
>> instructions
>> if such are available. See the implementation for OCTEON that uses
>> laa, saa, lai etc.
>>
>>
>> >
>> > As Victor and I have noted, SW lock implementation abstraction is not
>> hugely important goal for ODP API. GCC __atomic provide already pretty
>> good abstraction for that. If user really cares about lock (or lock free
>> algorithm) implementation, it's better to write it in assembly and takeout
>> all changes from any abstraction to spoil the algorithm.
>> I disagree 100% with this. There is no need to write anything at
>> all in assembler. The inline assembler in the atomics implementation
>> could be replaced by the proper compiler support. Indeed I asked if
>> we couldn't relax our requirement of C99 compatibility and allow C11
>> usage in the implementation as well. But this as denied so I set out
>> to recreate the necessary support in a C99 compliant way. Victor has
>> pointed to a different approach which avoids the usage of a proprietary
>> atomics API and I will have a look at this.
>
> Victor and I have mentioned __atomics built-inns in this same context already many times before (during past months). It's implementation trade-off whether one uses __atomic or direct assembly (abstraction vs full control). Abstraction comes with a cost - e.g. you cannot be sure that a "relaxed __atomic add by one" always uses the optimal "atomic increment" instruction on all compiler versions, etc. It may generate functionally correct but less scalable sequence of instructions (e.g. by not using far atomics).
That's why wrapping the counters in an ODP API enables the ODP
implementer to generate more optimal code if the compiler is not up to
the task. I am not taking this away. The new odp_counter.h API has
better performance than the current odp_atomic.h so there is no extra
cost to this abstraction.


>
> As said above GCC __atomic is pretty good abstraction towards C11 atomics, and thus ODP API does not have to duplicate it. It's also safe to use __atomics in linux-generic implementation.
I prefer to hide these compiler-specific functions behind an API.
There is precedent for that in ODP, e.g. odp_sync.h attempts to
abstract some hardware- and compiler-specific operations (otherwise
we could have used __sync_synchronize() instead of odp_sync_stores()
and #ifdef's for hardware-specific usage of barriers (dmb, syncw
etc)).

>
>
>>
>> I also believe that SW lock and synchronization performance will be very
>> important for some ODP implementations and I prefer not have reimplement
>> all of linux-generic just to be able to do it in a more efficient and
>> scalable way.
>> Doing it in linux-generic will also benefit many others, many ODP
>> implementation
>> might borrow SW-implementations from linux-generic.
>>
>
> True. You can #ifdef and optimize all lock/barrier implementations for ARM in linux-generic without changing the API.
You don't need the ifdef's and the barriers in the linux-generic code,
a better odp_atomics.h will handle this automatically for you. But the
current odp_atomics.h does not provide for this.


I feel a very strong not invented here syndrome in this discussion.
However I cannot let your old-fashioned views of multithreaded
programming (ad-hoc and fragile use of barriers, target specific
#ifdef's everywhere, unabstracted use of compiler specific
functionality) stand in the way of a better approach (use well-defined
memory model, move target and compiler specific code into specific
abstraction headers).

>
> -Petri
>
diff mbox

Patch

diff --git a/.gitignore b/.gitignore
index 6342e34..77db4d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,7 +35,7 @@  build/
 odp_example
 odp_packet
 odp_packet_netmap
-odp_atomic
+odp_counter
 odp_shm
 odp_ring
 odp_timer_ping
diff --git a/example/generator/odp_generator.c b/example/generator/odp_generator.c
index eb8b340..252157d 100644
--- a/example/generator/odp_generator.c
+++ b/example/generator/odp_generator.c
@@ -62,10 +62,10 @@  typedef struct {
  * counters
 */
 static struct {
-	odp_atomic_u64_t seq;	/**< ip seq to be send */
-	odp_atomic_u64_t ip;	/**< ip packets */
-	odp_atomic_u64_t udp;	/**< udp packets */
-	odp_atomic_u64_t icmp;	/**< icmp packets */
+	odp_counter64_t seq;	/**< ip seq to be send */
+	odp_counter64_t ip;	/**< ip packets */
+	odp_counter64_t udp;	/**< udp packets */
+	odp_counter64_t icmp;	/**< icmp packets */
 } counters;
 
 /** * Thread specific arguments
@@ -201,7 +201,7 @@  static void pack_udp_pkt(odp_buffer_t obuf)
 	ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_UDPHDR_LEN +
 				       ODPH_IPV4HDR_LEN);
 	ip->proto = ODPH_IPPROTO_UDP;
-	seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
+	seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
 	ip->id = odp_cpu_to_be_16(seq);
 	ip->chksum = 0;
 	odph_ipv4_csum_update(pkt);
@@ -258,7 +258,7 @@  static void pack_icmp_pkt(odp_buffer_t obuf)
 	ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_ICMPHDR_LEN +
 				       ODPH_IPV4HDR_LEN);
 	ip->proto = ODPH_IPPROTO_ICMP;
-	seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
+	seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
 	ip->id = odp_cpu_to_be_16(seq);
 	ip->chksum = 0;
 	odph_ipv4_csum_update(pkt);
@@ -334,13 +334,15 @@  static void *gen_send_thread(void *arg)
 		}
 
 		if (args->appl.interval != 0) {
+			uint64_t seq = odp_counter64_read(&counters.seq);
 			printf("  [%02i] send pkt no:%ju seq %ju\n",
-			       thr, counters.seq, counters.seq%0xffff);
+			       thr, seq, seq%0xffff);
 			/* TODO use odp timer */
 			usleep(args->appl.interval * 1000);
 		}
-		if (args->appl.number != -1 && counters.seq
-		    >= (unsigned int)args->appl.number) {
+		if (args->appl.number != -1 &&
+		    odp_counter64_read(&counters.seq) >=
+		    (unsigned int)args->appl.number) {
 			break;
 		}
 	}
@@ -348,7 +350,8 @@  static void *gen_send_thread(void *arg)
 	/* receive number of reply pks until timeout */
 	if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
 		while (args->appl.timeout >= 0) {
-			if (counters.icmp >= (unsigned int)args->appl.number)
+			if (odp_counter64_read(&counters.icmp) >=
+			    (unsigned int)args->appl.number)
 				break;
 			/* TODO use odp timer */
 			sleep(1);
@@ -358,10 +361,12 @@  static void *gen_send_thread(void *arg)
 
 	/* print info */
 	if (args->appl.mode == APPL_MODE_UDP) {
-		printf("  [%02i] total send: %ju\n", thr, counters.seq);
+		printf("  [%02i] total send: %ju\n", thr,
+		       odp_counter64_read(&counters.seq));
 	} else if (args->appl.mode == APPL_MODE_PING) {
 		printf("  [%02i] total send: %ju total receive: %ju\n",
-		       thr, counters.seq, counters.icmp);
+		       thr, odp_counter64_read(&counters.seq),
+		       odp_counter64_read(&counters.icmp));
 	}
 	return arg;
 }
@@ -395,7 +400,7 @@  static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
 		if (!odp_packet_inflag_ipv4(pkt))
 			continue;
 
-		odp_atomic_inc_u64(&counters.ip);
+		odp_counter64_inc(&counters.ip);
 		rlen += sprintf(msg, "receive Packet proto:IP ");
 		buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
 		ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
@@ -405,7 +410,7 @@  static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
 
 		/* udp */
 		if (ip->proto == ODPH_IPPROTO_UDP) {
-			odp_atomic_inc_u64(&counters.udp);
+			odp_counter64_inc(&counters.udp);
 			udp = (odph_udphdr_t *)(buf + offset);
 			rlen += sprintf(msg + rlen, "UDP payload %d ",
 					odp_be_to_cpu_16(udp->length) -
@@ -417,7 +422,7 @@  static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
 			icmp = (odph_icmphdr_t *)(buf + offset);
 			/* echo reply */
 			if (icmp->type == ICMP_ECHOREPLY) {
-				odp_atomic_inc_u64(&counters.icmp);
+				odp_counter64_inc(&counters.icmp);
 				memcpy(&tvsend, buf + offset + ODPH_ICMPHDR_LEN,
 				       sizeof(struct timeval));
 				/* TODO This should be changed to use an
@@ -530,10 +535,10 @@  int main(int argc, char *argv[])
 	}
 
 	/* init counters */
-	odp_atomic_init_u64(&counters.seq);
-	odp_atomic_init_u64(&counters.ip);
-	odp_atomic_init_u64(&counters.udp);
-	odp_atomic_init_u64(&counters.icmp);
+	odp_counter64_init(&counters.seq, 0);
+	odp_counter64_init(&counters.ip, 0);
+	odp_counter64_init(&counters.udp, 0);
+	odp_counter64_init(&counters.icmp, 0);
 
 	/* Reserve memory for args from shared mem */
 	shm = odp_shm_reserve("shm_args", sizeof(args_t),
diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
index 2f2dc19..76c27d0 100644
--- a/example/ipsec/odp_ipsec.c
+++ b/example/ipsec/odp_ipsec.c
@@ -1223,7 +1223,7 @@  main(int argc, char *argv[])
 	printf("Num worker threads: %i\n", num_workers);
 
 	/* Create a barrier to synchronize thread startup */
-	odp_barrier_init_count(&sync_barrier, num_workers);
+	odp_barrier_init(&sync_barrier, num_workers);
 
 	/*
 	 * By default core #0 runs Linux kernel background tasks.
diff --git a/example/odp_example/odp_example.c b/example/odp_example/odp_example.c
index 0e9aa3d..c473395 100644
--- a/example/odp_example/odp_example.c
+++ b/example/odp_example/odp_example.c
@@ -1120,7 +1120,7 @@  int main(int argc, char *argv[])
 	odp_shm_print_all();
 
 	/* Barrier to sync test case execution */
-	odp_barrier_init_count(&globals->barrier, num_workers);
+	odp_barrier_init(&globals->barrier, num_workers);
 
 	if (args.proc_mode) {
 		int ret;
diff --git a/example/timer/odp_timer_test.c b/example/timer/odp_timer_test.c
index 78b2ae2..dfbeae9 100644
--- a/example/timer/odp_timer_test.c
+++ b/example/timer/odp_timer_test.c
@@ -372,7 +372,7 @@  int main(int argc, char *argv[])
 	printf("\n");
 
 	/* Barrier to sync test case execution */
-	odp_barrier_init_count(&test_barrier, num_workers);
+	odp_barrier_init(&test_barrier, num_workers);
 
 	/* Create and launch worker threads */
 	odph_linux_pthread_create(thread_tbl, num_workers, first_core,
diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
index 76c1db8..5e78b34 100644
--- a/helper/include/odph_ring.h
+++ b/helper/include/odph_ring.h
@@ -138,8 +138,8 @@  typedef struct odph_ring {
 		uint32_t sp_enqueue;     /* True, if single producer. */
 		uint32_t size;           /* Size of ring. */
 		uint32_t mask;           /* Mask (size-1) of ring. */
-		uint32_t head;		/* Producer head. */
-		uint32_t tail;		/* Producer tail. */
+		odp_atomic32_t head;	/* Producer head. */
+		odp_atomic32_t tail;	/* Producer tail. */
 	} prod ODP_ALIGNED_CACHE;
 
 	/** @private Consumer */
@@ -147,8 +147,8 @@  typedef struct odph_ring {
 		uint32_t sc_dequeue;     /* True, if single consumer. */
 		uint32_t size;           /* Size of the ring. */
 		uint32_t mask;           /* Mask (size-1) of ring. */
-		uint32_t head;		/* Consumer head. */
-		uint32_t tail;		/* Consumer tail. */
+		odp_atomic32_t head;	/* Consumer head. */
+		odp_atomic32_t tail;	/* Consumer tail. */
 	} cons ODP_ALIGNED_CACHE;
 
 	/** @private Memory space of ring starts here. */
diff --git a/platform/linux-generic/include/api/odp.h b/platform/linux-generic/include/api/odp.h
index 0ee3faf..d124d52 100644
--- a/platform/linux-generic/include/api/odp.h
+++ b/platform/linux-generic/include/api/odp.h
@@ -32,6 +32,7 @@  extern "C" {
 #include <odp_barrier.h>
 #include <odp_spinlock.h>
 #include <odp_atomic.h>
+#include <odp_counter.h>
 
 #include <odp_init.h>
 #include <odp_system_info.h>
diff --git a/platform/linux-generic/include/api/odp_atomic.h b/platform/linux-generic/include/api/odp_atomic.h
index 0cc4cf4..ccdd096 100644
--- a/platform/linux-generic/include/api/odp_atomic.h
+++ b/platform/linux-generic/include/api/odp_atomic.h
@@ -4,464 +4,494 @@ 
  * SPDX-License-Identifier:     BSD-3-Clause
  */
 
-
 /**
  * @file
  *
- * ODP atomic operations
+ * ODP atomic types and operations, semantically a subset of C11 atomics.
+ * Scalar variable wrapped in a struct to avoid accessing scalar directly
+ * without using the required access functions.
+ * Atomic functions must be used to operate on atomic variables!
  */
 
 #ifndef ODP_ATOMIC_H_
 #define ODP_ATOMIC_H_
 
+#include <stdint.h>
+#include <odp_align.h>
+#include <odp_hints.h>
+#include <odp_debug.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-
-#include <odp_std_types.h>
-
-
-/**
- * Atomic integer
- */
-typedef volatile int32_t odp_atomic_int_t;
-
-/**
- * Atomic unsigned integer 64 bits
- */
-typedef volatile uint64_t odp_atomic_u64_t;
-
-/**
- * Atomic unsigned integer 32 bits
- */
-typedef volatile uint32_t odp_atomic_u32_t;
-
-
-/**
- * Initialize atomic integer
- *
- * @param ptr    An integer atomic variable
- *
- * @note The operation is not synchronized with other threads
- */
-static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
-{
-	*ptr = 0;
-}
-
-/**
- * Load value of atomic integer
- *
- * @param ptr    An atomic variable
- *
- * @return atomic integer value
- *
- * @note The operation is not synchronized with other threads
- */
-static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
-{
-	return *ptr;
-}
-
-/**
- * Store value to atomic integer
- *
- * @param ptr        An atomic variable
- * @param new_value  Store new_value to a variable
- *
- * @note The operation is not synchronized with other threads
- */
-static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int new_value)
-{
-	*ptr = new_value;
-}
-
-/**
- * Fetch and add atomic integer
- *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int value)
-{
-	return __sync_fetch_and_add(ptr, value);
-}
-
-/**
- * Fetch and subtract atomic integer
- *
- * @param ptr    An atomic integer variable
- * @param value  A value to be subtracted from the variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int value)
-{
-	return __sync_fetch_and_sub(ptr, value);
-}
-
-/**
- * Fetch and increment atomic integer by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
-{
-	return odp_atomic_fetch_add_int(ptr, 1);
-}
-
-/**
- * Increment atomic integer by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
-{
-	odp_atomic_fetch_add_int(ptr, 1);
-}
-
-/**
- * Fetch and decrement atomic integer by 1
- *
- * @param ptr    An atomic int variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
-{
-	return odp_atomic_fetch_sub_int(ptr, 1);
-}
-
-/**
- * Decrement atomic integer by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
-{
-	odp_atomic_fetch_sub_int(ptr, 1);
-}
-
-/**
- * Initialize atomic uint32
- *
- * @param ptr    An atomic variable
- *
- * @note The operation is not synchronized with other threads
- */
-static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
-{
-	*ptr = 0;
-}
-
-/**
- * Load value of atomic uint32
- *
- * @param ptr    An atomic variable
- *
- * @return atomic uint32 value
- *
- * @note The operation is not synchronized with other threads
- */
-static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
-{
-	return *ptr;
-}
-
-/**
- * Store value to atomic uint32
- *
- * @param ptr        An atomic variable
- * @param new_value  Store new_value to a variable
- *
- * @note The operation is not synchronized with other threads
- */
-static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
-					uint32_t new_value)
-{
-	*ptr = new_value;
-}
-
-/**
- * Fetch and add atomic uint32
- *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
- *
- * @return Value of the variable before the operation
- */
-static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
-						uint32_t value)
-{
-	return __sync_fetch_and_add(ptr, value);
-}
-
-/**
- * Fetch and subtract uint32
- *
- * @param ptr    An atomic variable
- * @param value  A value to be sub to the variable
- *
- * @return Value of the variable before the operation
- */
-static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
-						uint32_t value)
-{
-	return __sync_fetch_and_sub(ptr, value);
-}
-
 /**
- * Fetch and increment atomic uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-#if defined __OCTEON__
-
-static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
-{
-	uint32_t ret;
-
-	__asm__ __volatile__ ("syncws");
-	__asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
-			      "r" (ptr));
-
-	return ret;
-}
-
+ * 32-bit (unsigned) atomic type
+ */
+typedef struct {
+	uint32_t v; /**< Actual storage for the atomic variable */
+} odp_atomic32_t
+ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignement! */
+
+typedef enum {
+	/** Relaxed memory order, no ordering of other accesses enforced */
+	ODP_MEMORDER_RLX,
+	/** Acquire memory order, later accesses cannot move before
+	 * acquire operation */
+	ODP_MEMORDER_ACQ,
+	/** Release memory order, earlier accesses cannot move after
+	 * release operation */
+	ODP_MEMORDER_RLS
+} odp_memorder_t;
+
+/*****************************************************************************
+ * Just some private helpers
+*****************************************************************************/
+
+#ifdef __OCTEON__
+/* OCTEON Write Memory Barrier */
+#define COMPILER_HW_BARRIER() __asm __volatile( \
+	/* Double syncw to work around errata */ \
+	"syncw\n\tsyncw" : : : )
 #else
-
-static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
-{
-	return odp_atomic_fetch_add_u32(ptr, 1);
-}
-
+/** Compiler and hardware full memory barrier */
+#define COMPILER_HW_BARRIER() __sync_synchronize()
+/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a */
 #endif
 
-/**
- * Increment atomic uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
-{
-	odp_atomic_fetch_add_u32(ptr, 1);
-}
-
-/**
- * Fetch and decrement uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
-{
-	return odp_atomic_fetch_sub_u32(ptr, 1);
-}
-
-/**
- * Decrement atomic uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
-{
-	odp_atomic_fetch_sub_u32(ptr, 1);
-}
-
-/**
- * Atomic compare and set for 32bit
- *
- * @param dst destination location into which the value will be written.
- * @param exp expected value.
- * @param src new value.
- * @return Non-zero on success; 0 on failure.
- */
-static inline int
-odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
-{
-	return __sync_bool_compare_and_swap(dst, exp, src);
+#define MEMORY "memory"
+
+/*****************************************************************************
+ * Operations on 32-bit atomics
+ * odp_atomic32_init - no return value
+ * odp_atomic32_load - return current value
+ * odp_atomic32_store - no return value
+ * odp_atomic32_cmp_xchg_weak - return bool
+ * odp_atomic32_fetch_add - return old value
+ * odp_atomic32_add - no return value
+ * odp_atomic32_fetch_inc - return old value
+ * odp_atomic32_inc - no return value
+ * odp_atomic32_fetch_dec - return old value
+ * odp_atomic32_dec - no return value
+ *****************************************************************************/
+
+static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
+{
+	/* Write of aligned word is atomic */
+	/* Cast to volatile to force compiler to (re-) write variable, thus we
+	 * can avoid using compiler memory barriers */
+	*(__volatile uint32_t *)&ptr->v = val;
+}
+
+/**
+ * Atomic load of 32-bit atomic variable
+ *
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the load
+ * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
+ *
+ * @return Value of the variable
+ */
+static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
+		odp_memorder_t mmodel)
+{
+	if (mmodel == ODP_MEMORDER_RLX) {
+		uint32_t val;
+		/* Read of aligned word is atomic */
+		/* Cast to volatile to force compiler to (re-) read variable,
+		 * thus we can avoid using compiler memory barriers */
+		val = *(__volatile const uint32_t *)&ptr->v;
+		return val;
+	} else if (mmodel == ODP_MEMORDER_ACQ) {
+#if defined __aarch64__
+		uint32_t val;
+		__asm __volatile("ldar %w0, [%1]"
+				: "=&r"(val)
+				: "r"(&ptr->v)
+				: MEMORY);
+		return val;
+#elif defined __arm__  || defined __mips64__ || defined __x86_64__
+		/* Read of aligned word is atomic */
+		uint32_t val = ptr->v;
+		/* To prevent later accesses from moving up */
+		/* Herb Sutter claims HW barrier not needed on x86? */
+		COMPILER_HW_BARRIER();
+		return val;
+#else
+#warning odp_atomic32_load() may not be efficiently implemented
+		/* Assume read of aligned word is atomic */
+		uint32_t val = ptr->v;
+		/* To prevent later accesses from moving up */
+		COMPILER_HW_BARRIER();
+		return val;
+#endif
+	} else {
+		ODP_ABORT("Invalid memory model %u\n", mmodel);
+	}
+}
+
+/**
+ * Atomic store to 32-bit atomic variable
+ *
+ * @param ptr  Pointer to a 32-bit atomic variable
+ * @param val  Value to write to the atomic variable
+ * @param mmodel Memory model associated with the store
+ * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
+ */
+static inline void odp_atomic32_store(odp_atomic32_t *ptr,
+		uint32_t val,
+		odp_memorder_t mmodel)
+{
+	if (mmodel == ODP_MEMORDER_RLX) {
+		/* Write of aligned word is atomic */
+		/* Cast to volatile to force compiler to (re-) write variable,
+		 * thus we will avoid using compiler memory barriers */
+		*(__volatile uint32_t *)&ptr->v = val;
+	} else if (mmodel == ODP_MEMORDER_RLS) {
+#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
+		/* Compiler and HW barrier to prevent earlier accesses from
+		 * moving down */
+		COMPILER_HW_BARRIER();
+		/* Write of aligned word is atomic */
+		ptr->v = val;
+		/* Compiler and HW barrier to prevent this store from moving
+		 * down after a later load-acquire and thus create overlapping
+		 * critical sections. Herb Sutter thinks this is needed */
+		COMPILER_HW_BARRIER();
+#elif defined __aarch64__
+		__asm __volatile("stlr %w0, [%1]"
+				:
+				: "r"(val), "r"(&ptr->v)
+				: MEMORY);
+#elif defined __x86_64__
+		/* This is actually an atomic exchange operation */
+		/* Generates good code on x86_64 */
+		(void)__sync_lock_test_and_set(&ptr->v, val);
+#else
+#warning odp_atomic32_store_rls() may not be efficiently implemented
+		/* This is actually an atomic exchange operation */
+		(void)__sync_lock_test_and_set(&ptr->v, val);
+#endif
+	} else {
+		ODP_ABORT("Invalid memory model %u\n", mmodel);
+	}
+}
+
+
+/**
+ * Atomic compare and exchange (swap) of 32-bit atomic variable
+ * "Weak" semantics, may fail spuriously and must be used in a loop.
+ *
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param exp_p Pointer to expected value (updated on failure)
+ * @param val   New value to write
+ * @param mmodel Memory model associated with the compare-and-swap
+ * operation (ODP_MEMORDER_RLX only)
+ *
+ * @return 1 (true) if exchange successful, 0 (false) if not successful (and
+ * '*exp_p' updated with current value)
+ */
+static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
+		uint32_t *exp_p,
+		uint32_t val,
+		odp_memorder_t mmodel)
+{
+	if (mmodel == ODP_MEMORDER_RLX) {
+#if defined __arm__ /* A32/T32 ISA */
+		uint32_t old;
+		uint32_t exp = *exp_p;
+		int status;
+		__asm __volatile("ldrex %0, [%2]\t\n"
+				 "cmp   %0, %3\t\n"
+				 "bne   1f\t\n"
+				 "strex %1, %4, [%2]\t\n"
+				 "1:\t\n"
+				: "=&r"(old), "=&r"(status)
+				: "r"(&ptr->v), "r"(exp), "r"(val)
+				: MEMORY);
+		if (odp_unlikely(old != exp)) {
+			/* Value has changed, can't proceed */
+			/* Clear exclusive access monitor */
+			__asm __volatile("clrex");
+			/* Return current value */
+			*exp_p = old;
+			return 0;
+		}
+		/* strex returns 0 on success */
+		if (odp_unlikely(status != 0)) {
+			/* strex failed, reservation was disturbed */
+			/* Return potentially changed value */
+			*exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
+			return 0;
+		}
+		return 1;
+#elif defined __mips64__
+		uint32_t old;
+		uint32_t exp = *exp_p;
+		uint32_t status = val;
+		__asm __volatile("llw %0, [%2]\t\n"
+				 "bne %0, %3, 1f\t\n"
+				 "scw %1, [%2]\t\n"
+				 "1:\t\n"
+				: "=&r"(old), "+&r"(status)
+				: "r"(&ptr->v), "r"(exp)
+				: MEMORY);
+		if (odp_unlikely(old != exp)) {
+			/* Value has changed, can't proceed */
+			/* Return current value */
+			*exp_p = old;
+			return 0;
+		}
+		/* scw returns 1 on success, 0 on failure */
+		if (odp_unlikely(status == 0)) {
+			/* scw failed, reservation was disturbed */
+			*exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
+			return 0;
+		}
+		return 1;
+#elif defined __x86_64__
+		uint32_t exp = *exp_p;
+		uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp, val);
+		if (odp_unlikely(old != exp)) {
+			/* Return the unexpected content of '*ptr' */
+			*exp_p = old;
+			return 0;
+		} else {
+			return 1;
+		}
+#else
+#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
+		uint32_t exp = *exp_p;
+		uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp, val);
+		if (odp_unlikely(old != exp)) {
+			/* Return the unexpected content of '*ptr' */
+			*exp_p = old;
+			return 0;
+		} else {
+			return 1;
+		}
+#endif
+	} else {
+		ODP_ABORT("Invalid memory model %u\n", mmodel);
+	}
+}
+
+/**
+ * Atomic fetch and add to 32-bit atomic variable
+ * @note A - B <=> A + (-B)
+ *
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
+ * @param mmodel Memory model associated with the add
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
+ *
+ * @return Value of the atomic variable before the addition
+ */
+static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
+		uint32_t incr,
+		odp_memorder_t mmodel)
+{
+	if (mmodel == ODP_MEMORDER_RLX) {
+#if defined __arm__ /* A32/T32 ISA */
+		uint32_t old_val, tmp;
+		int status;
+		do {
+			__asm __volatile("ldrex %0, [%3]\t\n"
+					 "add   %1, %0, %4\t\n"
+					 "strex %2, %1, [%3]\t\n"
+					: "=&r"(old_val), "+&r"(tmp),
+					  "=&r"(status)
+					: "r"(&ptr->v), "r"(incr)
+					: MEMORY);
+		} while (odp_unlikely(status != 0));
+		return old_val;
+#elif defined __OCTEON__
+		uint32_t old_val;
+		__asm __volatile("laa %0,(%2),%3"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr), "r" (incr)
+				: MEMORY);
+		return old_val;
+#elif defined __x86_64__
+		/* Generates good code on x86_64 */
+		return __sync_fetch_and_add(&ptr->v, incr);
+#else
+#warning odp_atomic32_fetch_add() may not be efficiently implemented
+		return __sync_fetch_and_add(&ptr->v, incr);
+#endif
+	} else if (mmodel == ODP_MEMORDER_RLS) {
+#if defined __OCTEON__
+		uint32_t old_val;
+		COMPILER_HW_BARRIER();
+		__asm __volatile("laa %0,(%2),%3"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr), "r" (incr)
+				: MEMORY);
+		COMPILER_HW_BARRIER();
+		return old_val;
+#endif
+		/* __sync_fetch_and_add() will give us barriers before and
+		 * after, we are fine with this for release operations */
+		return __sync_fetch_and_add(&ptr->v, incr);
+	} else {
+		ODP_ABORT("Invalid memory model %u\n", mmodel);
+	}
 }
 
 /**
- * Initialize atomic uint64
+ * Atomic add to 32-bit atomic variable
  *
- * @param ptr    An atomic variable
- *
- * @note The operation is not synchronized with other threads
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
+ * @param mmodel Memory model associated with the add
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
  */
-static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
+static inline void odp_atomic32_add(odp_atomic32_t *ptr,
+		uint32_t incr,
+		odp_memorder_t mmodel)
 {
-	*ptr = 0;
+	if (mmodel == ODP_MEMORDER_RLX) {
+		/* Platforms that support atomic add instructions can add
+		 * their implementations here */
+#if defined __OCTEON__
+		__asm __volatile("saa %[inc], (%[base])"
+				: "+m" (*ptr)
+				: [inc] "r" (incr), [base] "r" (ptr)
+				: MEMORY);
+		return;
+#endif
+	} else if (mmodel == ODP_MEMORDER_RLS) {
+		/* Platforms that support atomic add instructions can add
+		 * their implementations here */
+#if defined __OCTEON__
+		COMPILER_HW_BARRIER();
+		__asm __volatile("saa %[inc], (%[base])"
+				: "+m" (*ptr)
+				: [inc] "r" (incr), [base] "r" (ptr)
+				: MEMORY);
+		COMPILER_HW_BARRIER();
+		return;
+#endif
+	}
+	/* Default to using odp_atomic32_fetch_add() */
+	(void)odp_atomic32_fetch_add(ptr, incr, mmodel);
 }
 
 /**
- * Load value of atomic uint64
- *
- * @param ptr    An atomic variable
+ * Atomic fetch and increment of 32-bit atomic variable
  *
- * @return atomic uint64 value
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the increment
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
  *
- * @note The operation is not synchronized with other threads
+ * @return Value of the atomic variable before the increment
  */
-static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
+static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
+		odp_memorder_t mmodel)
 {
-	return *ptr;
+	if (mmodel == ODP_MEMORDER_RLX) {
+		/* Platforms that support atomic increment instructions can add
+		 * their implementations here */
+#if defined __OCTEON__
+		uint32_t old_val;
+		__asm __volatile("lai %0,(%2)"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr)
+				: MEMORY);
+		return old_val;
+#endif
+	} else if (mmodel == ODP_MEMORDER_RLS) {
+#if defined __OCTEON__
+		uint32_t old_val;
+		COMPILER_HW_BARRIER();
+		__asm __volatile("lai %0,(%2)"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr)
+				: MEMORY);
+		COMPILER_HW_BARRIER();
+		return old_val;
+#endif
+	}
+	/* Default to using odp_atomic32_fetch_add() */
+	return odp_atomic32_fetch_add(ptr, 1, mmodel);
 }
 
 /**
- * Store value to atomic uint64
- *
- * @param ptr        An atomic variable
- * @param new_value  Store new_value to a variable
+ * Atomic increment of 32-bit atomic variable
  *
- * @note The operation is not synchronized with other threads
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the increment
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
  */
-static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
-					uint64_t new_value)
-{
-	*ptr = new_value;
-}
+static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
+		odp_memorder_t mmodel)
 
-/**
- * Add atomic uint64
- *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
- *
- */
-static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t value)
 {
-	__sync_fetch_and_add(ptr, value);
+	/* Default to using odp_atomic32_fetch_inc() */
+	/* Platforms that support atomic increment instructions can add
+	 * their implementations here */
+	(void)odp_atomic32_fetch_inc(ptr, mmodel);
 }
 
 /**
- * Fetch and add atomic uint64
+ * Atomic fetch and decrement of 32-bit atomic variable
  *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the decrement
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
  *
- * @return Value of the variable before the operation
+ * @return Value of the atomic variable before the decrement
  */
-
-#if defined __powerpc__ && !defined __powerpc64__
-static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
+static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
+		odp_memorder_t mmodel)
 {
-	return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
-				    (uint32_t)value);
-}
-#else
-static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
-{
-	return __sync_fetch_and_add(ptr, value);
-}
+	if (mmodel == ODP_MEMORDER_RLX) {
+		/* Platforms that support atomic decrement instructions can add
+		 * their implementations here */
+#if defined __OCTEON__
+		uint32_t old_val;
+		__asm __volatile("lad %0,(%2)"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr)
+				: MEMORY);
+		return old_val;
 #endif
-/**
- * Subtract atomic uint64
- *
- * @param ptr    An atomic variable
- * @param value  A value to be subtracted from the variable
- *
- */
-static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t value)
-{
-	__sync_fetch_and_sub(ptr, value);
-}
-
-/**
- * Fetch and subtract atomic uint64
- *
- * @param ptr    An atomic variable
- * @param value  A value to be subtracted from the variable
- *
- * @return Value of the variable before the operation
- */
-#if defined __powerpc__ && !defined __powerpc64__
-static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
-{
-	return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
-				    (uint32_t)value);
-}
-#else
-static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
-{
-	return __sync_fetch_and_sub(ptr, value);
-}
+	} else if (mmodel == ODP_MEMORDER_RLS) {
+#if defined __OCTEON__
+		uint32_t old_val;
+		COMPILER_HW_BARRIER();
+		__asm __volatile("lad %0,(%2)"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr)
+				: MEMORY);
+		COMPILER_HW_BARRIER();
+		return old_val;
 #endif
-/**
- * Fetch and increment atomic uint64 by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
-{
-	return odp_atomic_fetch_add_u64(ptr, 1);
-}
-
-/**
- * Increment atomic uint64 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
-{
-	odp_atomic_fetch_add_u64(ptr, 1);
+	}
+	/* Default to using odp_atomic32_fetch_add() */
+	return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
 }
 
 /**
- * Fetch and decrement atomic uint64 by 1
+ * Atomic decrement of 32-bit atomic variable
  *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param memorder Memory model associated with the decrement
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
  */
-static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
-{
-	return odp_atomic_fetch_sub_u64(ptr, 1);
-}
+static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
+		odp_memorder_t memorder)
 
-/**
- * Decrement atomic uint64 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
 {
-	odp_atomic_fetch_sub_u64(ptr, 1);
+	/* Default to using odp_atomic32_fetch_dec() */
+	/* Platforms that support atomic decrement instructions can add
+	 * their implementations here */
+	(void)odp_atomic32_fetch_dec(ptr, memorder);
 }
 
-/**
- * Atomic compare and set for 64bit
- *
- * @param dst destination location into which the value will be written.
- * @param exp expected value.
- * @param src new value.
- * @return Non-zero on success; 0 on failure.
- */
-static inline int
-odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
-{
-	return __sync_bool_compare_and_swap(dst, exp, src);
-}
+/* We are not exporting this macro */
+#undef COMPILER_HW_BARRIER
+#undef MEMORY
 
 #ifdef __cplusplus
 }
diff --git a/platform/linux-generic/include/api/odp_barrier.h b/platform/linux-generic/include/api/odp_barrier.h
index a7b3215..69b1eb8 100644
--- a/platform/linux-generic/include/api/odp_barrier.h
+++ b/platform/linux-generic/include/api/odp_barrier.h
@@ -27,18 +27,18 @@  extern "C" {
  * ODP execution barrier
  */
 typedef struct odp_barrier_t {
-	int              count;  /**< @private Thread count */
-	odp_atomic_int_t bar;    /**< @private Barrier counter */
+	uint32_t       num_threads;  /**< @private Thread count (constant) */
+	odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
 } odp_barrier_t;
 
 
 /**
  * Init barrier with thread count
  *
- * @param barrier    Barrier
- * @param count      Thread count
+ * @param barrier     Barrier
+ * @param num_threads Number of threads which share the barrier
  */
-void odp_barrier_init_count(odp_barrier_t *barrier, int count);
+void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
 
 
 /**
diff --git a/platform/linux-generic/include/api/odp_counter.h b/platform/linux-generic/include/api/odp_counter.h
new file mode 100644
index 0000000..b93c992
--- /dev/null
+++ b/platform/linux-generic/include/api/odp_counter.h
@@ -0,0 +1,363 @@ 
+/* Copyright (c) 2013, Linaro Limited
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier:     BSD-3-Clause
+ */
+
+/**
+ * @file
+ *
+ * ODP atomic counter types and operations, suitable for e.g. shared statistics.
+ * Relaxed memory model assumed for lowest overhead.
+ * Scalar variable wrapped in a struct to avoid accessing scalar directly
+ * without using the required access functions.
+ * Counter functions must be used to operate on counter variables!
+ */
+
+#ifndef ODP_COUNTER_H_
+#define ODP_COUNTER_H_
+
+#include <stdint.h>
+#include <odp_align.h>
+#include <odp_hints.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * 32-bit (unsigned) atomic counter type
+ */
+typedef struct {
+	uint32_t v; /**< Actual storage for the counter variable */
+} odp_counter32_t
+ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
+
+/**
+ * 64-bit (unsigned) atomic counter type
+ */
+typedef struct {
+	uint64_t v; /**< Actual storage for the counter variable */
+	/* Room for other data structures (e.g. spin lock) that might be
+	 * needed to ensure atomicity on some architectures */
+} odp_counter64_t
+ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
+
+/*****************************************************************************
+ * Operations on 32-bit atomic counters
+ * odp_counter32_init - returns no value
+ * odp_counter32_read - returns current value
+ * odp_counter32_write - returns no value
+ * odp_counter32_add - returns no value
+ * odp_counter32_read_inc - returns old value
+ * odp_counter32_inc - returns no value
+ *****************************************************************************/
+
+/**
+ * Initialize 32-bit counter variable
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ * @param val   Initial value
+ */
+static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t val)
+{
+	/* No implementation requires any other type of initialization */
+	*(__volatile uint32_t *)&ptr->v = val;
+}
+
+/**
+ * Read 32-bit counter variable
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ *
+ * @return Value of the variable
+ */
+static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
+{
+	uint32_t val;
+	/* Read of aligned word is atomic */
+	/* Cast to volatile to force compiler to (re-) read variable, thus we
+	 * will avoid using compiler memory barriers */
+	val = *(__volatile const uint32_t *)&ptr->v;
+	return val;
+}
+
+/**
+ * Write 32-bit counter variable
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ * @param val   Value to write to the variable
+ */
+static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t val)
+{
+	/* Write of aligned word is atomic */
+	/* Cast to volatile to force compiler to (re-) write variable, thus we
+	 * will avoid using compiler memory barriers */
+	*(__volatile uint32_t *)&ptr->v = val;
+}
+
+/**
+ * Atomic add to 32-bit counter variable
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ * @param incr  The value to be added to the counter variable
+ */
+static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t incr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint32_t result;
+	int status;
+	do {
+		__asm __volatile("ldrex %0, [%2]\t\n"
+				 "add   %0, %0, %3\t\n"
+				 "strex %1, %0, [%2]"
+				 : "=&r"(result), "=&r"(status)
+				 : "r"(&ptr->v), "Ir" (incr)
+				 : );
+	} while (odp_unlikely(status != 0));
+#elif defined __OCTEON__
+	__asm __volatile("saa %[inc], (%[base])"
+			 : "+m" (*ptr)
+			 : [inc] "r" (incr), [base] "r" (ptr)
+			 : );
+#elif defined __x86_64__
+	/* Generates good code on x86_64 */
+	(void)__sync_fetch_and_add(&ptr->v, incr);
+#else
+	/* Warning odp_counter32_add() may not be efficiently implemented */
+	(void)__sync_fetch_and_add(&ptr->v, incr);
+#endif
+}
+
+/**
+ * Atomic increment (+1) of 32-bit counter variable, return original value
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ *
+ * @return Original value of counter
+ */
+static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint32_t result, tmp;
+	int status;
+	do {
+		__asm __volatile("ldrex %0, [%3]\t\n"
+				 "add   %1, %0, #1\t\n"
+				 "strex %2, %1, [%3]"
+				 : "=&r"(result), "=&r"(tmp), "+&r"(status)
+				 : "r"(&ptr->v)
+				 : );
+	} while (odp_unlikely(status != 0));
+	return result;
+#elif defined __OCTEON__
+	uint32_t old_val;
+	__asm __volatile("lai %0,(%2)"
+			 : "=r" (old_val), "+m" (ptr)
+			 : "r" (ptr)
+			 : );
+	return old_val;
+#elif defined __x86_64__
+	return __sync_fetch_and_add(&ptr->v, 1);
+#else
+/* Warning odp_counter32_read_inc() may not be efficiently implemented */
+	return __sync_fetch_and_add(&ptr->v, 1);
+#endif
+}
+
+/**
+ * Atomic increment (+1) 32-bit counter variable
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ */
+static inline void odp_counter32_inc(odp_counter32_t *ptr)
+{
+#if defined __OCTEON__
+	odp_counter32_add(ptr, 1);
+#else
+	(void)odp_counter32_read_inc(ptr);
+#endif
+}
+
+/*****************************************************************************
+ * Operations on 64-bit atomic counters
+ * odp_counter64_init
+ * odp_counter64_read
+ * odp_counter64_write
+ * odp_counter64_add
+ * odp_counter64_read_inc
+ * odp_counter64_inc
+ *****************************************************************************/
+
+/**
+ * Read 64-bit counter variable
+ *
+ * @param ptr   Pointer to a 64-bit counter variable
+ *
+ * @return Value of the counter variable
+ */
+static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t val;
+	__asm __volatile("ldrexd %0, %H0, [%1]\n\t"
+			 "clrex" /* Clear exclusive access monitor */
+			 : "=&r"(val)
+			 : "r"(&ptr->v)
+			 : );
+	return val;
+#elif defined __x86_64__ || defined __aarch64__
+	/* Read of aligned quad/double word is atomic */
+	return ptr->v;
+#else
+/* Warning odp_counter64_read() may not be efficiently implemented */
+	return __sync_fetch_and_or(&ptr->v, 0);
+#endif
+}
+
+/**
+ * Write 64-bit counter variable
+ *
+ * @param ptr  Pointer to a 64-bit counter variable
+ * @param val  Value to write to the counter variable
+ */
+static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t val)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t old_val;
+	int status;
+	do {
+		/* Read counter variable exclusively so we can write to it
+		 * later */
+		/* Attempt to write the new value */
+		__asm __volatile("ldrexd %0, %H0, [%2]\t\n"
+				 "strexd %1, %3, %H3, [%2]"
+				 : "=&r"(old_val), "=&r"(status)
+				 : "r"(&ptr->v), "r"(val)
+				 : );
+	} while (odp_unlikely(status != 0)); /* Retry until write succeeds */
+#elif defined __x86_64__ || defined __aarch64__
+	/* Write of aligned quad/double word is atomic */
+	ptr->v = val;
+#else
+/* Warning odp_counter64_write() may not be efficiently implemented */
+	/* This is actually an atomic exchange operation */
+	(void)__sync_lock_test_and_set(&ptr->v, val);
+#endif
+}
+
+/**
+ * Initialize 64-bit counter variable
+ * Perform implementation specific initializations, assign initial value.
+ *
+ * @param ptr   Pointer to a 64-bit counter variable
+ * @param val   Initial value
+ */
+static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t val)
+{
+	/* No implementation requires any other type of initialization */
+	odp_counter64_write(ptr, val);
+}
+
+/**
+ * Atomic add to 64-bit counter variable
+ *
+ * @param ptr   Pointer to a 64-bit counter variable
+ * @param incr  The value to be added to the counter variable
+ */
+static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t old_val;
+	int status;
+	do {
+		__asm __volatile("ldrexd %0, %H0, [%2]\t\n"
+				 "adds   %0, %0, %3\t\n"
+				 "adc    %H0, %H3\t\n"
+				 "strexd %1, %0, %H0, [%2]"
+				 : "=&r"(old_val), "=&r"(status)
+				 : "r"(&ptr->v), "r"(incr)
+				 : );
+	} while (odp_unlikely(status != 0)); /* Retry until write succeeds */
+#elif defined __OCTEON__
+	__asm __volatile("saad %[inc], (%[base])"
+			 : "+m" (*ptr)
+			 : [inc] "r" (incr), [base] "r" (ptr)
+			 : );
+#elif defined __x86_64__
+	/* Generates good code on x86_64 */
+	(void)__sync_fetch_and_add(&ptr->v, incr);
+#else
+/* Warning odp_counter64_add() may not be efficiently implemented */
+	(void)__sync_fetch_and_add(&ptr->v, incr);
+#endif
+}
+
+
+/**
+ * Atomic increment (+1) 64-bit counter variable and return original value
+ *
+ * @param ptr   Pointer to a 64-bit counter variable
+ *
+ * @return Original value of counter
+ */
+static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t old_val, tmp;
+	int status;
+	do {
+		__asm __volatile("ldrexd %0, %H0, [%3]\t\n"
+				 "adds   %2, %0, #1\t\n"
+				 "adc    %H2, %H0, #0\t\n"
+				 "strexd %1, %2, %H2, [%3]"
+				 : "=&r"(old_val), "=&r"(status), "=&r"(tmp)
+				 : "r"(&ptr->v)
+				 : );
+	} while (odp_unlikely(status != 0)); /* Retry until write succeeds */
+	return old_val;
+#elif defined __OCTEON__
+	uint64_t old_val;
+	__asm __volatile("laid %0,(%2)"
+			: "=r" (old_val), "+m" (ptr)
+			: "r" (ptr)
+			: );
+	return old_val;
+#elif defined __x86_64__
+	/* Generates good code on x86_64 */
+	return __sync_fetch_and_add(&ptr->v, 1);
+#else
+/* Warning odp_counter64_read_inc() may not be efficiently implemented */
+	return __sync_fetch_and_add(&ptr->v, 1);
+#endif
+}
+
+/**
+ * Atomic increment (+1) 64-bit counter variable
+ *
+ * @param ptr   Pointer to a 64-bit counter variable
+ */
+static inline void odp_counter64_inc(odp_counter64_t *ptr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t old_val;
+	int status;
+	do {
+		__asm __volatile("ldrexd %0, %H0, [%2]\t\n"
+				 "adds   %0, #1\t\n"
+				 "adc    %H0, #0\t\n"
+				 "strexd %1, %0, %H0, [%2]"
+				 : "=&r"(old_val), "=&r"(status)
+				 : "r"(&ptr->v)
+				 : );
+	} while (odp_unlikely(status != 0)); /* Retry until write succeeds */
+#else
+	(void)odp_counter64_read_inc(ptr);
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/platform/linux-generic/include/api/odp_rwlock.h b/platform/linux-generic/include/api/odp_rwlock.h
index 252ebb2..ff8a9a2 100644
--- a/platform/linux-generic/include/api/odp_rwlock.h
+++ b/platform/linux-generic/include/api/odp_rwlock.h
@@ -10,26 +10,30 @@ 
 /**
  * @file
  *
- * ODP RW Locks
+ * ODP read/write lock
+ * An RW lock supports multiple concurrent readers but only one exclusive writer.
  */
 
+#include <odp_atomic.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /**
  * The odp_rwlock_t type.
- * write lock count is -1,
- * read lock count > 0
+ * write lock is ~0U
+ * read lock count >0 && <~0U
  */
 typedef struct {
-	volatile int32_t cnt; /**< -1 Write lock,
-				> 0 for Read lock. */
+	odp_atomic32_t cnt; /**< == 0: unlocked,
+				 == ~0: locked for write,
+				 > 0 number of concurrent read locks */
 } odp_rwlock_t;
 
 
 /**
- * Initialize the rwlock to an unlocked state.
+ * Initialize the rwlock to the unlocked state.
  *
  * @param rwlock pointer to the RW Lock.
  */
@@ -50,14 +54,14 @@  void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
 void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
 
 /**
- * Aquire a write lock.
+ * Acquire the write lock.
  *
  * @param rwlock pointer to a RW Lock.
  */
 void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
 
 /**
- * Release a write lock.
+ * Release the write lock.
  *
  * @param rwlock pointer to a RW Lock.
  */
diff --git a/platform/linux-generic/include/api/odp_ticketlock.h b/platform/linux-generic/include/api/odp_ticketlock.h
index 6277a18..5933f85 100644
--- a/platform/linux-generic/include/api/odp_ticketlock.h
+++ b/platform/linux-generic/include/api/odp_ticketlock.h
@@ -21,14 +21,15 @@  extern "C" {
 
 #include <odp_std_types.h>
 #include <odp_atomic.h>
+#include <odp_counter.h>
 
 
 /**
  * ODP ticketlock
  */
 typedef struct odp_ticketlock_t {
-	odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
-	volatile uint32_t cur_ticket;  /**< @private Current ticket */
+	odp_counter32_t next_ticket; /**< @private Next ticket */
+	odp_atomic32_t cur_ticket;  /**< @private Current ticket */
 } odp_ticketlock_t;
 
 
diff --git a/platform/linux-generic/include/odp_buffer_internal.h b/platform/linux-generic/include/odp_buffer_internal.h
index 2002b51..530ab96 100644
--- a/platform/linux-generic/include/odp_buffer_internal.h
+++ b/platform/linux-generic/include/odp_buffer_internal.h
@@ -88,7 +88,7 @@  typedef struct odp_buffer_hdr_t {
 	uint32_t                 index;	     /* buf index in the pool */
 	size_t                   size;       /* max data size */
 	size_t                   cur_offset; /* current offset */
-	odp_atomic_int_t         ref_count;  /* reference count */
+	odp_atomic32_t           ref_count;  /* reference count */
 	odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
 	int                      type;       /* type of next header */
 	odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
diff --git a/platform/linux-generic/include/odp_spin_internal.h b/platform/linux-generic/include/odp_spin_internal.h
index b7e2071..29c524f 100644
--- a/platform/linux-generic/include/odp_spin_internal.h
+++ b/platform/linux-generic/include/odp_spin_internal.h
@@ -15,15 +15,6 @@  extern "C" {
 
 
 /**
- * GCC memory barrier for ODP internal use
- */
-static inline void odp_mem_barrier(void)
-{
-	__asm__ __volatile__ ("" : : : "memory");
-}
-
-
-/**
  * Spin loop for ODP internal use
  */
 static inline void odp_spin(void)
diff --git a/platform/linux-generic/odp_barrier.c b/platform/linux-generic/odp_barrier.c
index a82b294..10368b5 100644
--- a/platform/linux-generic/odp_barrier.c
+++ b/platform/linux-generic/odp_barrier.c
@@ -8,41 +8,52 @@ 
 #include <odp_sync.h>
 #include <odp_spin_internal.h>
 
-void odp_barrier_init_count(odp_barrier_t *barrier, int count)
+void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
 {
-	barrier->count = count;
-	barrier->bar = 0;
-	odp_sync_stores();
+	barrier->num_threads = num_threads; /* Constant after initialisation */
+	odp_atomic32_init(&barrier->in_barrier, 0);
 }
 
 /*
  * Efficient barrier_sync -
  *
  *   Barriers are initialized with a count of the number of callers
- *   that must sync on the barrier before any may proceed.
+ *   that must sync on (enter) the barrier before any may proceed (exit).
  *
  *   To avoid race conditions and to permit the barrier to be fully
- *   reusable, the barrier value cycles between 0..2*count-1. When
- *   synchronizing the wasless variable simply tracks which half of
+ *   reusable, the barrier value cycles between 0..2*num_threads-1 (temporarily
+ *   hitting 2*num_threads before being wrapped). When
+ *   synchronizing, the waslow variable simply tracks which half of
  *   the cycle the barrier was in upon entry.  Exit is when the
  *   barrier crosses to the other half of the cycle.
  */
 
 void odp_barrier_sync(odp_barrier_t *barrier)
 {
-	int count;
-	int wasless;
+	uint32_t count;
+	bool waslow;
 
-	odp_sync_stores();
-	wasless = barrier->bar < barrier->count;
-	count = odp_atomic_fetch_inc_int(&barrier->bar);
+	/* We need both acquire and release barriers but does the order
+	 * matter? Here we start with release and end with acquire. */
 
-	if (count == 2*barrier->count-1) {
-		barrier->bar = 0;
-	} else {
-		while ((barrier->bar < barrier->count) == wasless)
-			odp_spin();
-	}
+	/* Increase threads in_barrier count, this will automatically release
+	 * the other threads when lower/upper range is switched */
+	count = odp_atomic32_fetch_add(&barrier->in_barrier, 1,
+				       ODP_MEMORDER_RLS);
+	/* Compute lower or higher range indicator */
+	waslow = count < barrier->num_threads;
 
-	odp_mem_barrier();
+	/* Check if in_barrier count should wrap */
+	if (count == 2 * barrier->num_threads - 1) {
+		/* Manually wrap the counter */
+		odp_atomic32_add(&barrier->in_barrier,
+				 -2 * barrier->num_threads,
+				 ODP_MEMORDER_RLX);
+		/* Fall-through the final part for the acquire barrier */
+	}
+	/* Wait for counter to change half */
+	while ((odp_atomic32_load(&barrier->in_barrier, ODP_MEMORDER_ACQ) <
+	       barrier->num_threads) == waslow) {
+		odp_spin();
+	}
 }
diff --git a/platform/linux-generic/odp_buffer.c b/platform/linux-generic/odp_buffer.c
index e54e0e7..fc3506b 100644
--- a/platform/linux-generic/odp_buffer.c
+++ b/platform/linux-generic/odp_buffer.c
@@ -73,7 +73,8 @@  int odp_buffer_snprint(char *str, size_t n, odp_buffer_t buf)
 	len += snprintf(&str[len], n-len,
 			"  cur_offset   %zu\n",       hdr->cur_offset);
 	len += snprintf(&str[len], n-len,
-			"  ref_count    %i\n",        hdr->ref_count);
+			"  ref_count    %u\n",
+			odp_atomic32_load(&hdr->ref_count, ODP_MEMORDER_RLX));
 	len += snprintf(&str[len], n-len,
 			"  type         %i\n",        hdr->type);
 	len += snprintf(&str[len], n-len,
diff --git a/platform/linux-generic/odp_crypto.c b/platform/linux-generic/odp_crypto.c
index b37ad6b..75b4ce0 100644
--- a/platform/linux-generic/odp_crypto.c
+++ b/platform/linux-generic/odp_crypto.c
@@ -6,7 +6,7 @@ 
 
 #include <odp_crypto.h>
 #include <odp_internal.h>
-#include <odp_atomic.h>
+#include <odp_counter.h>
 #include <odp_spinlock.h>
 #include <odp_sync.h>
 #include <odp_debug.h>
@@ -26,7 +26,7 @@ 
 #define MAX_SESSIONS 32
 
 typedef struct {
-	odp_atomic_u32_t next;
+	odp_counter32_t   next;
 	uint32_t         max;
 	odp_crypto_generic_session_t sessions[0];
 } odp_crypto_global_t;
@@ -58,7 +58,7 @@  odp_crypto_generic_session_t *alloc_session(void)
 	uint32_t idx;
 	odp_crypto_generic_session_t *session = NULL;
 
-	idx = odp_atomic_fetch_inc_u32(&global->next);
+	idx = odp_counter32_read_inc(&global->next);
 	if (idx < global->max) {
 		session = &global->sessions[idx];
 		session->index = idx;
@@ -420,6 +420,7 @@  odp_crypto_init_global(void)
 
 	/* Initialize it */
 	global->max = MAX_SESSIONS;
+	odp_counter32_init(&global->next, 0);
 
 	return 0;
 }
diff --git a/platform/linux-generic/odp_queue.c b/platform/linux-generic/odp_queue.c
index 1318bcd..08c0d29 100644
--- a/platform/linux-generic/odp_queue.c
+++ b/platform/linux-generic/odp_queue.c
@@ -214,8 +214,13 @@  int odp_queue_set_context(odp_queue_t handle, void *context)
 {
 	queue_entry_t *queue;
 	queue = queue_to_qentry(handle);
+	/* Setting a new queue context can be viewed as a release operation,
+	 * all writes to the context must be observable before the context
+	 * is made observable */
 	odp_sync_stores();
-	queue->s.param.context = context;
+	queue->s.param.context = context; /* Store-release */
+	/* Ensure queue modification is globally visible before we return
+	 * and the application might cause the queue to be scheduled */
 	odp_sync_stores();
 	return 0;
 }
diff --git a/platform/linux-generic/odp_ring.c b/platform/linux-generic/odp_ring.c
index 632aa66..e5b9c23 100644
--- a/platform/linux-generic/odp_ring.c
+++ b/platform/linux-generic/odp_ring.c
@@ -187,10 +187,10 @@  odph_ring_create(const char *name, unsigned count, unsigned flags)
 		r->cons.size = count;
 		r->prod.mask = count-1;
 		r->cons.mask = count-1;
-		r->prod.head = 0;
-		r->cons.head = 0;
-		r->prod.tail = 0;
-		r->cons.tail = 0;
+		odp_atomic32_init(&r->prod.head, 0);
+		odp_atomic32_init(&r->cons.head, 0);
+		odp_atomic32_init(&r->prod.tail, 0);
+		odp_atomic32_init(&r->cons.tail, 0);
 
 		TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
 	} else {
@@ -227,7 +227,7 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	uint32_t prod_head, prod_next;
 	uint32_t cons_tail, free_entries;
 	const unsigned max = n;
-	int success;
+	bool success;
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 	int ret;
@@ -237,8 +237,8 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		/* Reset n to the initial burst count */
 		n = max;
 
-		prod_head = r->prod.head;
-		cons_tail = r->cons.tail;
+		prod_head = odp_atomic32_load(&r->prod.head, ODP_MEMORDER_RLX);
+		cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_ACQ);
 		/* The subtraction is done between two unsigned 32bits value
 		 * (the result is always modulo 32 bits even if we have
 		 * prod_head > cons_tail). So 'free_entries' is always between 0
@@ -259,13 +259,14 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		}
 
 		prod_next = prod_head + n;
-		success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
-					      prod_next);
-	} while (odp_unlikely(success == 0));
+		success = odp_atomic32_cmp_xchg_weak(&r->prod.head,
+						     &prod_head,
+						     prod_next,
+						     ODP_MEMORDER_RLX);
+	} while (odp_unlikely(!success));
 
 	/* write entries in ring */
 	ENQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/* if we exceed the watermark */
 	if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
@@ -279,10 +280,11 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	 * If there are other enqueues in progress that preceeded us,
 	 * we need to wait for them to complete
 	 */
-	while (odp_unlikely(r->prod.tail != prod_head))
+	while (odp_unlikely(odp_atomic32_load(&r->prod.tail,
+					      ODP_MEMORDER_RLX) != prod_head))
 		odp_spin();
 
-	r->prod.tail = prod_next;
+	odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
 	return ret;
 }
 
@@ -298,8 +300,8 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	uint32_t mask = r->prod.mask;
 	int ret;
 
-	prod_head = r->prod.head;
-	cons_tail = r->cons.tail;
+	prod_head = odp_atomic32_load(&r->prod.head, ODP_MEMORDER_RLX);
+	cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_ACQ);
 	/* The subtraction is done between two unsigned 32bits value
 	 * (the result is always modulo 32 bits even if we have
 	 * prod_head > cons_tail). So 'free_entries' is always between 0
@@ -320,11 +322,10 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	}
 
 	prod_next = prod_head + n;
-	r->prod.head = prod_next;
+	odp_atomic32_store(&r->prod.head, prod_next, ODP_MEMORDER_RLX);
 
 	/* write entries in ring */
 	ENQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/* if we exceed the watermark */
 	if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
@@ -334,7 +335,7 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
 	}
 
-	r->prod.tail = prod_next;
+	odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
 	return ret;
 }
 
@@ -348,7 +349,7 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 	uint32_t cons_head, prod_tail;
 	uint32_t cons_next, entries;
 	const unsigned max = n;
-	int success;
+	bool success;
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 
@@ -357,8 +358,8 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 		/* Restore n as it may change every loop */
 		n = max;
 
-		cons_head = r->cons.head;
-		prod_tail = r->prod.tail;
+		cons_head = odp_atomic32_load(&r->cons.head, ODP_MEMORDER_RLX);
+		prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_ACQ);
 		/* The subtraction is done between two unsigned 32bits value
 		 * (the result is always modulo 32 bits even if we have
 		 * cons_head > prod_tail). So 'entries' is always between 0
@@ -378,22 +379,24 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 		}
 
 		cons_next = cons_head + n;
-		success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
-					      cons_next);
-	} while (odp_unlikely(success == 0));
+		success = odp_atomic32_cmp_xchg_weak(&r->cons.head,
+						     &cons_head,
+						     cons_next,
+						     ODP_MEMORDER_RLX);
+	} while (odp_unlikely(!success));
 
 	/* copy in table */
 	DEQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/*
 	 * If there are other dequeues in progress that preceded us,
 	 * we need to wait for them to complete
 	 */
-	while (odp_unlikely(r->cons.tail != cons_head))
+	while (odp_unlikely(odp_atomic32_load(&r->cons.tail,
+					      ODP_MEMORDER_RLX) != cons_head))
 		odp_spin();
 
-	r->cons.tail = cons_next;
+	odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
 
 	return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
 }
@@ -409,8 +412,8 @@  int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 
-	cons_head = r->cons.head;
-	prod_tail = r->prod.tail;
+	cons_head = odp_atomic32_load(&r->cons.head, ODP_MEMORDER_RLX);
+	prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_ACQ);
 	/* The subtraction is done between two unsigned 32bits value
 	 * (the result is always modulo 32 bits even if we have
 	 * cons_head > prod_tail). So 'entries' is always between 0
@@ -429,13 +432,12 @@  int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
 	}
 
 	cons_next = cons_head + n;
-	r->cons.head = cons_next;
+	odp_atomic32_store(&r->cons.head, cons_next, ODP_MEMORDER_RLX);
 
 	/* copy in table */
 	DEQUEUE_PTRS();
-	odp_mem_barrier();
 
-	r->cons.tail = cons_next;
+	odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
 	return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
 }
 
@@ -482,8 +484,8 @@  int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void **obj_table, unsigned n)
  */
 int odph_ring_full(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
+	uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
 	return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
 }
 
@@ -492,8 +494,8 @@  int odph_ring_full(const odph_ring_t *r)
  */
 int odph_ring_empty(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
+	uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
 	return !!(cons_tail == prod_tail);
 }
 
@@ -502,8 +504,8 @@  int odph_ring_empty(const odph_ring_t *r)
  */
 unsigned odph_ring_count(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
+	uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
 	return (prod_tail - cons_tail) & r->prod.mask;
 }
 
@@ -512,8 +514,8 @@  unsigned odph_ring_count(const odph_ring_t *r)
  */
 unsigned odph_ring_free_count(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
+	uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
 	return (cons_tail - prod_tail - 1) & r->prod.mask;
 }
 
@@ -523,10 +525,14 @@  void odph_ring_dump(const odph_ring_t *r)
 	ODP_DBG("ring <%s>@%p\n", r->name, r);
 	ODP_DBG("  flags=%x\n", r->flags);
 	ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
-	ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
-	ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
-	ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
-	ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
+	ODP_DBG("  ct=%"PRIu32"\n", odp_atomic32_load(&r->cons.tail,
+						      ODP_MEMORDER_RLX));
+	ODP_DBG("  ch=%"PRIu32"\n", odp_atomic32_load(&r->cons.head,
+						      ODP_MEMORDER_RLX));
+	ODP_DBG("  pt=%"PRIu32"\n", odp_atomic32_load(&r->prod.tail,
+						      ODP_MEMORDER_RLX));
+	ODP_DBG("  ph=%"PRIu32"\n", odp_atomic32_load(&r->prod.head,
+						      ODP_MEMORDER_RLX));
 	ODP_DBG("  used=%u\n", odph_ring_count(r));
 	ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
 	if (r->prod.watermark == r->prod.size)
diff --git a/platform/linux-generic/odp_rwlock.c b/platform/linux-generic/odp_rwlock.c
index 11c8dd7..a5fae4d 100644
--- a/platform/linux-generic/odp_rwlock.c
+++ b/platform/linux-generic/odp_rwlock.c
@@ -4,58 +4,64 @@ 
  * SPDX-License-Identifier:     BSD-3-Clause
  */
 
+#include <stdbool.h>
 #include <odp_atomic.h>
 #include <odp_rwlock.h>
-
 #include <odp_spin_internal.h>
 
 void odp_rwlock_init(odp_rwlock_t *rwlock)
 {
-	rwlock->cnt = 0;
+	odp_atomic32_init(&rwlock->cnt, 0);
 }
 
 void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
 {
-	int32_t cnt;
-	int  is_locked = 0;
-
-	while (is_locked == 0) {
-		cnt = rwlock->cnt;
-		/* waiting for read lock */
-		if (cnt < 0) {
+	bool gotit;
+	uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
+	do {
+		/* Wait for any writer to release lock */
+		while ((int32_t)cnt < 0) {
 			odp_spin();
-			continue;
+			cnt = odp_atomic32_load(&rwlock->cnt,
+						ODP_MEMORDER_RLX);
 		}
-		is_locked = odp_atomic_cmpset_u32(
-					(volatile uint32_t *)&rwlock->cnt,
-					      cnt, cnt + 1);
-	}
+		/* Attempt to take another read lock */
+		gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
+						   &cnt, cnt + 1,
+						   ODP_MEMORDER_RLX);
+		/* If operation fails, 'cnt' will contain current value */
+	} while (!gotit);
 }
 
 void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
 {
-	odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
+	/* Release one read lock by subtracting 1 */
+	odp_atomic32_dec(&rwlock->cnt, ODP_MEMORDER_RLS);
 }
 
 void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
 {
-	int32_t cnt;
-	int is_locked = 0;
-
-	while (is_locked == 0) {
-		cnt = rwlock->cnt;
-		/* lock aquired, wait */
-		if (cnt != 0) {
+	bool gotit;
+	uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
+	do {
+		/* Wait for all lock holders to release lock */
+		while (cnt != 0) {
+			/* Lock is busy */
 			odp_spin();
-			continue;
+			cnt = odp_atomic32_load(&rwlock->cnt,
+						ODP_MEMORDER_RLX);
 		}
-		is_locked = odp_atomic_cmpset_u32(
-					(volatile uint32_t *)&rwlock->cnt,
-					      0, -1);
-	}
+		/* Attempt to take write lock */
+		gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
+						   &cnt,
+						   (uint32_t)-1,
+						   ODP_MEMORDER_RLX);
+		/* If operation fails, 'cnt' will contain current value */
+	} while (!gotit);
 }
 
 void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
 {
-	odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
+	/* Release the write lock: adding 1 wraps ~0U back to 0 (unlocked) */
+	odp_atomic32_inc(&rwlock->cnt, ODP_MEMORDER_RLS);
 }
diff --git a/platform/linux-generic/odp_thread.c b/platform/linux-generic/odp_thread.c
index b869b27..652d317 100644
--- a/platform/linux-generic/odp_thread.c
+++ b/platform/linux-generic/odp_thread.c
@@ -11,7 +11,7 @@ 
 
 #include <odp_thread.h>
 #include <odp_internal.h>
-#include <odp_atomic.h>
+#include <odp_counter.h>
 #include <odp_config.h>
 #include <odp_debug.h>
 #include <odp_shared_memory.h>
@@ -31,7 +31,7 @@  typedef struct {
 
 typedef struct {
 	thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
-	odp_atomic_int_t num;
+	odp_counter32_t   num;
 
 } thread_globals_t;
 
@@ -58,6 +58,7 @@  int odp_thread_init_global(void)
 		return -1;
 
 	memset(thread_globals, 0, sizeof(thread_globals_t));
+	odp_counter32_init(&thread_globals->num, 0);
 	return 0;
 }
 
@@ -67,7 +68,7 @@  static int thread_id(void)
 	int id;
 	int cpu;
 
-	id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
+	id = (int)odp_counter32_read_inc(&thread_globals->num);
 
 	if (id >= ODP_CONFIG_MAX_THREADS) {
 		ODP_ERR("Too many threads\n");
@@ -77,7 +78,7 @@  static int thread_id(void)
 	cpu = sched_getcpu();
 
 	if (cpu < 0) {
-		ODP_ERR("getcpu failed\n");
+		ODP_ERR("sched_getcpu failed\n");
 		return -1;
 	}
 
diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-generic/odp_ticketlock.c
index be5b885..510aa9f 100644
--- a/platform/linux-generic/odp_ticketlock.c
+++ b/platform/linux-generic/odp_ticketlock.c
@@ -6,15 +6,15 @@ 
 
 #include <odp_ticketlock.h>
 #include <odp_atomic.h>
+#include <odp_counter.h>
 #include <odp_sync.h>
 #include <odp_spin_internal.h>
 
 
 void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
 {
-	ticketlock->next_ticket = 0;
-	ticketlock->cur_ticket  = 0;
-	odp_sync_stores();
+	odp_counter32_init(&ticketlock->next_ticket, 0);
+	odp_atomic32_init(&ticketlock->cur_ticket, 0);
 }
 
 
@@ -22,30 +22,15 @@  void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
 {
 	uint32_t ticket;
 
-	ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
+	ticket = odp_counter32_read_inc(&ticketlock->next_ticket);
 
-	while (ticket != ticketlock->cur_ticket)
+	while (ticket != odp_atomic32_load(&ticketlock->cur_ticket,
+					   ODP_MEMORDER_ACQ))
 		odp_spin();
-
-	odp_mem_barrier();
 }
 
 
 void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
 {
-	odp_sync_stores();
-
-	ticketlock->cur_ticket++;
-
-#if defined __OCTEON__
-	odp_sync_stores();
-#else
-	odp_mem_barrier();
-#endif
-}
-
-
-int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
-{
-	return ticketlock->cur_ticket != ticketlock->next_ticket;
+	odp_atomic32_inc(&ticketlock->cur_ticket, ODP_MEMORDER_RLS);
 }
diff --git a/platform/linux-generic/odp_timer.c b/platform/linux-generic/odp_timer.c
index 313c713..fffaa44 100644
--- a/platform/linux-generic/odp_timer.c
+++ b/platform/linux-generic/odp_timer.c
@@ -10,6 +10,7 @@ 
 #include <odp_buffer_pool_internal.h>
 #include <odp_internal.h>
 #include <odp_atomic.h>
+#include <odp_counter.h>
 #include <odp_spinlock.h>
 #include <odp_sync.h>
 #include <odp_debug.h>
@@ -32,8 +33,8 @@  typedef struct {
 
 typedef struct {
 	int               allocated;
-	volatile int      active;
-	volatile uint64_t cur_tick;
+	odp_atomic32_t    active;
+	odp_counter64_t   cur_tick;
 	timer_t           timerid;
 	odp_timer_t       timer_hdl;
 	odp_buffer_pool_t pool;
@@ -150,16 +151,16 @@  static void notify_function(union sigval sigval)
 
 	timer = sigval.sival_ptr;
 
-	if (timer->active == 0) {
+	if (odp_atomic32_load(&timer->active, ODP_MEMORDER_RLX) == 0) {
 		ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
 		return;
 	}
 
 	/* ODP_DBG("Tick\n"); */
 
-	cur_tick = timer->cur_tick++;
-
-	odp_sync_stores();
+	/* Increment and read are not atomic but we are the only writer */
+	odp_counter64_inc(&timer->cur_tick);
+	cur_tick = odp_counter64_read(&timer->cur_tick);
 
 	tick = &timer->tick[cur_tick % MAX_TICKS];
 
@@ -308,6 +309,8 @@  odp_timer_t odp_timer_create(const char *name, odp_buffer_pool_t pool,
 
 	timer_hdl = id + 1;
 
+	odp_atomic32_init(&timer->active, 0);
+	odp_counter64_init(&timer->cur_tick, 0);
 	timer->timer_hdl     = timer_hdl;
 	timer->pool          = pool;
 	timer->resolution_ns = resolution_ns;
@@ -318,8 +321,7 @@  odp_timer_t odp_timer_create(const char *name, odp_buffer_pool_t pool,
 		timer->tick[i].list = NULL;
 	}
 
-	timer->active = 1;
-	odp_sync_stores();
+	odp_atomic32_store(&timer->active, 1, ODP_MEMORDER_RLS);
 
 	timer_start(timer);
 
@@ -340,7 +342,7 @@  odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t timer_hdl, uint64_t tmo_tick,
 	id = (int)timer_hdl - 1;
 	timer = &odp_timer.timer[id];
 
-	cur_tick = timer->cur_tick;
+	cur_tick = odp_counter64_read(&timer->cur_tick);
 	if (tmo_tick <= cur_tick) {
 		ODP_DBG("timeout too close\n");
 		return ODP_TIMER_TMO_INVALID;
@@ -416,7 +418,7 @@  uint64_t odp_timer_current_tick(odp_timer_t timer_hdl)
 	uint32_t id;
 
 	id = timer_hdl - 1;
-	return odp_timer.timer[id].cur_tick;
+	return odp_counter64_read(&odp_timer.timer[id].cur_tick);
 }
 
 odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
diff --git a/test/api_test/Makefile.am b/test/api_test/Makefile.am
index 5104454..478aa6c 100644
--- a/test/api_test/Makefile.am
+++ b/test/api_test/Makefile.am
@@ -1,12 +1,12 @@ 
 include $(top_srcdir)/test/Makefile.inc
 
-bin_PROGRAMS = odp_atomic odp_shm odp_ring odp_timer_ping
-odp_atomic_LDFLAGS = $(AM_LDFLAGS) -static
+bin_PROGRAMS = odp_counter odp_shm odp_ring odp_timer_ping
+odp_counter_LDFLAGS = $(AM_LDFLAGS) -static
 odp_shm_LDFLAGS = $(AM_LDFLAGS) -static
 odp_ring_LDFLAGS = $(AM_LDFLAGS) -static
 odp_timer_ping_LDFLAGS = $(AM_LDFLAGS) -static
 
-dist_odp_atomic_SOURCES = odp_atomic_test.c odp_common.c
+dist_odp_counter_SOURCES = odp_counter_test.c odp_common.c
 dist_odp_shm_SOURCES = odp_shm_test.c odp_common.c
 dist_odp_ring_SOURCES = odp_ring_test.c odp_common.c
 dist_odp_timer_ping_SOURCES = odp_timer_ping.c odp_common.c
diff --git a/test/api_test/odp_atomic_test.c b/test/api_test/odp_atomic_test.c
deleted file mode 100644
index 9019d4f..0000000
--- a/test/api_test/odp_atomic_test.c
+++ /dev/null
@@ -1,362 +0,0 @@ 
-/* Copyright (c) 2013, Linaro Limited
- * All rights reserved.
- *
- * SPDX-License-Identifier:     BSD-3-Clause
- */
-
-#include <string.h>
-#include <sys/time.h>
-#include <odp_debug.h>
-#include <odp_common.h>
-#include <odp_atomic_test.h>
-
-static odp_atomic_int_t a32;
-static odp_atomic_u32_t a32u;
-static odp_atomic_u64_t a64u;
-
-static odp_atomic_int_t numthrds;
-
-static const char * const test_name[] = {
-	"dummy",
-	"test atomic basic ops add/sub/inc/dec",
-	"test atomic inc/dec of signed word",
-	"test atomic add/sub of signed word",
-	"test atomic inc/dec of unsigned word",
-	"test atomic add/sub of unsigned word",
-	"test atomic inc/dec of unsigned double word",
-	"test atomic add/sub of unsigned double word"
-};
-
-static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
-
-static void usage(void)
-{
-	printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
-	       "\t<testcase> is\n"
-	       "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
-	       "\t\t2 - Test inc dec of signed word\n"
-	       "\t\t3 - Test add sub of signed word\n"
-	       "\t\t4 - Test inc dec of unsigned word\n"
-	       "\t\t5 - Test add sub of unsigned word\n"
-	       "\t\t6 - Test inc dec of double word\n"
-	       "\t\t7 - Test add sub of double word\n"
-	       "\t<num of pthread> is optional\n"
-	       "\t\t<1 - 31> - no of pthreads to start\n"
-	       "\t\tif user doesn't specify this option, then\n"
-	       "\t\tno of pthreads created is equivalent to no of cores\n"
-	       "\t\tavailable in the system\n"
-	       "\tExample usage:\n"
-	       "\t\t./odp_atomic -t 2\n"
-	       "\t\t./odp_atomic -t 3 -n 12\n");
-}
-
-void test_atomic_inc_32(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_inc_int(&a32);
-}
-
-void test_atomic_inc_u32(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_inc_u32(&a32u);
-}
-
-void test_atomic_inc_64(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_inc_u64(&a64u);
-}
-
-void test_atomic_dec_32(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_dec_int(&a32);
-}
-
-void test_atomic_dec_u32(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_dec_u32(&a32u);
-}
-
-void test_atomic_dec_64(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_dec_u64(&a64u);
-}
-
-void test_atomic_add_32(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
-}
-
-void test_atomic_add_u32(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
-}
-
-void test_atomic_add_64(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
-}
-
-void test_atomic_sub_32(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
-}
-
-void test_atomic_sub_u32(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
-}
-
-void test_atomic_sub_64(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
-}
-
-void test_atomic_inc_dec_32(void)
-{
-	test_atomic_inc_32();
-	test_atomic_dec_32();
-}
-
-void test_atomic_add_sub_32(void)
-{
-	test_atomic_add_32();
-	test_atomic_sub_32();
-}
-
-void test_atomic_inc_dec_u32(void)
-{
-	test_atomic_inc_u32();
-	test_atomic_dec_u32();
-}
-
-void test_atomic_add_sub_u32(void)
-{
-	test_atomic_add_u32();
-	test_atomic_sub_u32();
-}
-
-void test_atomic_inc_dec_64(void)
-{
-	test_atomic_inc_64();
-	test_atomic_dec_64();
-}
-
-void test_atomic_add_sub_64(void)
-{
-	test_atomic_add_64();
-	test_atomic_sub_64();
-}
-
-/**
- * Test basic atomic operation like
- * add/sub/increment/decrement operation.
- */
-void test_atomic_basic(void)
-{
-	test_atomic_inc_32();
-	test_atomic_dec_32();
-	test_atomic_add_32();
-	test_atomic_sub_32();
-
-	test_atomic_inc_u32();
-	test_atomic_dec_u32();
-	test_atomic_add_u32();
-	test_atomic_sub_u32();
-
-	test_atomic_inc_64();
-	test_atomic_dec_64();
-	test_atomic_add_64();
-	test_atomic_sub_64();
-}
-
-void test_atomic_init(void)
-{
-	odp_atomic_init_int(&a32);
-	odp_atomic_init_u32(&a32u);
-	odp_atomic_init_u64(&a64u);
-}
-
-void test_atomic_store(void)
-{
-	odp_atomic_store_int(&a32, S32_INIT_VAL);
-	odp_atomic_store_u32(&a32u, U32_INIT_VAL);
-	odp_atomic_store_u64(&a64u, U64_INIT_VAL);
-}
-
-int test_atomic_validate(void)
-{
-	if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
-		ODP_ERR("Atomic signed 32 usual functions failed\n");
-		return -1;
-	}
-
-	if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
-		ODP_ERR("Atomic u32 usual functions failed\n");
-		return -1;
-	}
-
-	if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
-		ODP_ERR("Atomic u64 usual functions failed\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-static void *run_thread(void *arg)
-{
-	pthrd_arg *parg = (pthrd_arg *)arg;
-	int thr;
-
-	thr = odp_thread_id();
-
-	ODP_DBG("Thread %i starts\n", thr);
-
-	odp_atomic_inc_int(&numthrds);
-
-	/* Wait here until all pthreads are created */
-	while (*(volatile int *)&numthrds < parg->numthrds)
-		;
-
-	gettimeofday(&tv0[thr], NULL);
-
-	switch (parg->testcase) {
-	case TEST_MIX:
-		test_atomic_basic();
-		break;
-	case TEST_INC_DEC_S32:
-		test_atomic_inc_dec_32();
-		break;
-	case TEST_ADD_SUB_S32:
-		test_atomic_add_sub_32();
-		break;
-	case TEST_INC_DEC_U32:
-		test_atomic_inc_dec_u32();
-		break;
-	case TEST_ADD_SUB_U32:
-		test_atomic_add_sub_u32();
-		break;
-	case TEST_INC_DEC_64:
-		test_atomic_inc_dec_64();
-		break;
-	case TEST_ADD_SUB_64:
-		test_atomic_add_sub_64();
-		break;
-	}
-	gettimeofday(&tv1[thr], NULL);
-	fflush(NULL);
-
-	printf("Time taken in thread %02d to complete op is %lld usec\n", thr,
-	       (tv1[thr].tv_sec - tv0[thr].tv_sec) * 1000000ULL +
-	       (tv1[thr].tv_usec - tv0[thr].tv_usec));
-
-	return parg;
-}
-
-int main(int argc, char *argv[])
-{
-	pthrd_arg thrdarg;
-	int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
-	char c;
-	int result;
-
-	if (argc == 1 || argc % 2 == 0) {
-		usage();
-		goto err_exit;
-	}
-	if (odp_test_global_init() != 0)
-		goto err_exit;
-	odp_print_system_info();
-
-	while (cnt != 0) {
-		sscanf(argv[++i], "-%c", &c);
-		switch (c) {
-		case 't':
-			sscanf(argv[++i], "%d", &test_type);
-			break;
-		case 'n':
-			sscanf(argv[++i], "%d", &pthrdnum);
-			break;
-		default:
-			ODP_ERR("Invalid option %c\n", c);
-			usage();
-			goto err_exit;
-		}
-		if (test_type < TEST_MIX || test_type > TEST_MAX ||
-		    pthrdnum > odp_sys_core_count()) {
-			usage();
-			goto err_exit;
-		}
-		cnt -= 2;
-	}
-	if (pthrdnum == 0)
-		pthrdnum = odp_sys_core_count();
-
-	odp_atomic_init_int(&numthrds);
-	test_atomic_init();
-	test_atomic_store();
-
-	memset(&thrdarg, 0, sizeof(pthrd_arg));
-	thrdarg.testcase = test_type;
-	thrdarg.numthrds = pthrdnum;
-
-	if ((test_type > 0) && (test_type < TEST_MAX)) {
-		printf("%s\n", test_name[test_type]);
-	} else {
-		ODP_ERR("Invalid test case [%d]\n", test_type);
-		usage();
-		goto err_exit;
-	}
-	odp_test_thread_create(run_thread, &thrdarg);
-
-	odp_test_thread_exit(&thrdarg);
-
-	result = test_atomic_validate();
-
-	if (result == 0) {
-		printf("%s_%d_%d Result:pass\n",
-		       test_name[test_type], test_type, pthrdnum);
-	} else {
-		printf("%s_%d_%d Result:fail\n",
-		       test_name[test_type], test_type, pthrdnum);
-	}
-	return 0;
-
-err_exit:
-	return -1;
-}
diff --git a/test/api_test/odp_atomic_test.h b/test/api_test/odp_atomic_test.h
deleted file mode 100644
index 7814da5..0000000
--- a/test/api_test/odp_atomic_test.h
+++ /dev/null
@@ -1,60 +0,0 @@ 
-/* Copyright (c) 2013, Linaro Limited
- * All rights reserved.
- *
- * SPDX-License-Identifier:     BSD-3-Clause
- */
-
-#ifndef ODP_ATOMIC_TEST_H_
-#define ODP_ATOMIC_TEST_H_
-
-#include <odp.h>
-#include <odph_linux.h>
-
-/**
- * add_sub_cnt could be any valid value
- * so to excercise explicit atomic_add/sub
- * ops. For now using 5..
- */
-#define ADD_SUB_CNT	5
-
-#define	CNT 500000
-#define	S32_INIT_VAL	(1UL << 10)
-#define	U32_INIT_VAL	(1UL << 10)
-#define	U64_INIT_VAL	(1ULL << 33)
-
-typedef enum {
-	TEST_MIX = 1, /* Must be first test case num */
-	TEST_INC_DEC_S32,
-	TEST_ADD_SUB_S32,
-	TEST_INC_DEC_U32,
-	TEST_ADD_SUB_U32,
-	TEST_INC_DEC_64,
-	TEST_ADD_SUB_64,
-	TEST_MAX,
-} odp_test_atomic_t;
-
-
-void test_atomic_inc_dec_32(void);
-void test_atomic_add_sub_32(void);
-void test_atomic_inc_dec_u32(void);
-void test_atomic_add_sub_u32(void);
-void test_atomic_inc_dec_64(void);
-void test_atomic_add_sub_64(void);
-void test_atomic_inc_32(void);
-void test_atomic_dec_32(void);
-void test_atomic_add_32(void);
-void test_atomic_sub_32(void);
-void test_atomic_inc_u32(void);
-void test_atomic_dec_u32(void);
-void test_atomic_add_u32(void);
-void test_atomic_sub_u32(void);
-void test_atomic_inc_64(void);
-void test_atomic_dec_64(void);
-void test_atomic_add_64(void);
-void test_atomic_sub_64(void);
-void test_atomic_init(void);
-void test_atomic_basic(void);
-void test_atomic_store(void);
-int test_atomic_validate(void);
-
-#endif /* ODP_ATOMIC_TEST_H_ */
diff --git a/test/api_test/odp_common.c b/test/api_test/odp_common.c
index ed1fc97..198fe8f 100644
--- a/test/api_test/odp_common.c
+++ b/test/api_test/odp_common.c
@@ -14,7 +14,6 @@ 
 #include <odp.h>
 #include <odph_linux.h>
 #include <odp_common.h>
-#include <odp_atomic_test.h>
 #include <odp_shm_test.h>
 
 
diff --git a/test/api_test/odp_counter_test.c b/test/api_test/odp_counter_test.c
new file mode 100644
index 0000000..c72328e
--- /dev/null
+++ b/test/api_test/odp_counter_test.c
@@ -0,0 +1,361 @@ 
+/* Copyright (c) 2013, Linaro Limited
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier:     BSD-3-Clause
+ */
+
+#include <string.h>
+#include <sys/time.h>
+#include <odp.h>
+#include <odp_debug.h>
+#include <odp_common.h>
+#include <odph_linux.h>
+
+/**
+ * ADD_SUB_CNT could be any valid value;
+ * it is the step used to exercise the explicit
+ * add/sub ops. For now using 5.
+ */
+#define ADD_SUB_CNT	5
+
+#define	CNT 500000
+#define	U32_INIT_VAL	(1UL << 10)
+#define	U64_INIT_VAL	(1ULL << 33)
+
+typedef enum { /* test case numbers accepted by the -t option */
+	TEST_MIX = 1, /* Must be first test case num */
+	TEST_INC_DEC_U32 = 2, /* inc/dec on the 32-bit counter */
+	TEST_ADD_SUB_U32 = 3, /* add/sub on the 32-bit counter */
+	TEST_INC_DEC_64 = 4, /* inc/dec on the 64-bit counter */
+	TEST_ADD_SUB_64 = 5, /* add/sub on the 64-bit counter */
+	TEST_MAX, /* one past the last valid test case */
+} odp_test_counter_t;
+
+
+static uint32_t test_counter_inc_dec_u32(void);
+static uint32_t test_counter_add_sub_u32(void);
+static uint32_t test_counter_inc_dec_64(void);
+static uint32_t test_counter_add_sub_64(void);
+static uint32_t test_counter_inc_u32(void);
+static uint32_t test_counter_dec_u32(void);
+static uint32_t test_counter_add_u32(void);
+static uint32_t test_counter_sub_u32(void);
+static uint32_t test_counter_inc_64(void);
+static uint32_t test_counter_dec_64(void);
+static uint32_t test_counter_add_64(void);
+static uint32_t test_counter_sub_64(void);
+static void test_counter_init(void);
+static uint32_t test_counter_basic(void);
+static void test_counter_write(void);
+static int test_counter_validate(void);
+
+static odp_counter32_t a32u; /* 32-bit counter updated by all threads */
+static odp_counter64_t a64u; /* 64-bit counter updated by all threads */
+
+static odp_barrier_t barrier; /* workers sync here before timing starts */
+
+static const char * const test_name[] = { /* indexed by odp_test_counter_t */
+	"dummy", /* placeholder: test case numbering starts at 1 */
+	"test atomic counter basic ops add/sub/inc/dec", /* TEST_MIX */
+	"test atomic inc/dec of 32-bit counter", /* TEST_INC_DEC_U32 */
+	"test atomic add/sub of 32-bit counter", /* TEST_ADD_SUB_U32 */
+	"test atomic inc/dec of 64-bit counter", /* TEST_INC_DEC_64 */
+	"test atomic add/sub of 64-bit counter" /* TEST_ADD_SUB_64 */
+};
+
+static uint64_t accops[MAX_WORKERS]; /* op count per thread id */
+
+static void usage(void) /* print the command-line help text to stdout */
+{
+	printf("\n./odp_counter -t <testcase> -n <num of threads>\n\n"
+	       "\t<testcase> is\n"
+	       "\t\t1 - Test mix (inc/dec/add/sub on 32- and 64-bit counters)\n"
+	       "\t\t2 - Test inc/dec of 32-bit counter\n"
+	       "\t\t3 - Test add/sub of 32-bit counter\n"
+	       "\t\t4 - Test inc/dec of 64-bit counter\n"
+	       "\t\t5 - Test add/sub of 64-bit counter\n"
+	       "\t<num of threads> is optional\n"
+	       "\t\t<1 - 31> - no of threads to start\n"
+	       "\t\tif user doesn't specify this option, then\n"
+	       "\t\tno of threads created is equivalent to no of cores\n"
+	       "\t\tavailable in the system\n"
+	       "\tExample usage:\n"
+	       "\t\t./odp_counter -t 2\n"
+	       "\t\t./odp_counter -t 3 -n 12\n");
+}
+
+static uint32_t test_counter_inc_u32(void)
+{
+	int ops = 0; /* 32-bit increments issued so far; returned at the end */
+
+	for (; ops < CNT; ops++)
+		odp_counter32_inc(&a32u);
+	return ops;
+}
+
+static uint32_t test_counter_inc_64(void)
+{
+	int ops = 0; /* 64-bit increments issued so far; returned at the end */
+
+	for (; ops < CNT; ops++)
+		odp_counter64_inc(&a64u);
+	return ops;
+}
+
+static uint32_t test_counter_dec_u32(void)
+{
+	int ops = 0; /* decrements issued so far, each done as add of -1 */
+
+	for (; ops < CNT; ops++)
+		odp_counter32_add(&a32u, (uint32_t)-1);
+	return ops;
+}
+
+static uint32_t test_counter_dec_64(void)
+{
+	int ops = 0; /* decrements issued so far, each done as add of -1 */
+
+	for (; ops < CNT; ops++)
+		odp_counter64_add(&a64u, (uint64_t)-1);
+	return ops;
+}
+
+static uint32_t test_counter_add_u32(void)
+{
+	int ops = 0; /* additions of ADD_SUB_CNT issued so far */
+
+	for (; ops < (CNT / ADD_SUB_CNT); ops++)
+		odp_counter32_add(&a32u, ADD_SUB_CNT);
+	return ops;
+}
+
+static uint32_t test_counter_add_64(void)
+{
+	int ops = 0; /* additions of ADD_SUB_CNT issued so far */
+
+	for (; ops < (CNT / ADD_SUB_CNT); ops++)
+		odp_counter64_add(&a64u, ADD_SUB_CNT);
+	return ops;
+}
+
+static uint32_t test_counter_sub_u32(void)
+{
+	int ops = 0; /* subtractions issued so far, done as add of -ADD_SUB_CNT */
+
+	for (; ops < (CNT / ADD_SUB_CNT); ops++)
+		odp_counter32_add(&a32u, -ADD_SUB_CNT);
+	return ops;
+}
+
+static uint32_t test_counter_sub_64(void)
+{
+	int ops = 0; /* subtractions issued so far, done as add of -ADD_SUB_CNT */
+
+	for (; ops < (CNT / ADD_SUB_CNT); ops++)
+		odp_counter64_add(&a64u, -ADD_SUB_CNT);
+	return ops;
+}
+
+/* 32-bit inc pass followed by dec pass; returns the combined op count. */
+static uint32_t test_counter_inc_dec_u32(void)
+{
+	uint32_t incs = test_counter_inc_u32();
+	uint32_t decs = test_counter_dec_u32();
+	return incs + decs;
+}
+
+/* 32-bit add pass followed by sub pass; returns the combined op count. */
+static uint32_t test_counter_add_sub_u32(void)
+{
+	uint32_t adds = test_counter_add_u32();
+	uint32_t subs = test_counter_sub_u32();
+	return adds + subs;
+}
+
+/* 64-bit inc pass followed by dec pass; returns the combined op count. */
+static uint32_t test_counter_inc_dec_64(void)
+{
+	uint32_t incs = test_counter_inc_64();
+	uint32_t decs = test_counter_dec_64();
+	return incs + decs;
+}
+
+/* 64-bit add pass followed by sub pass; returns the combined op count. */
+static uint32_t test_counter_add_sub_64(void)
+{
+	uint32_t adds = test_counter_add_64();
+	uint32_t subs = test_counter_sub_64();
+	return adds + subs;
+}
+
+/**
+ * Run all eight counter workloads back to back: inc, dec, add and sub on
+ * the 32-bit counter, then the same four on the 64-bit counter.
+ */
+static uint32_t test_counter_basic(void)
+{
+	uint32_t total = test_counter_inc_u32();
+
+	total += test_counter_dec_u32();
+	total += test_counter_add_u32();
+	total += test_counter_sub_u32();
+
+	total += test_counter_inc_64();
+	total += test_counter_dec_64();
+	total += test_counter_add_64();
+	total += test_counter_sub_64();
+
+	return total;
+}
+
+static void test_counter_init(void) /* initialise both counters to 0 */
+{
+	odp_counter64_init(&a64u, 0); /* 64-bit counter */
+	odp_counter32_init(&a32u, 0); /* 32-bit counter */
+}
+
+static void test_counter_write(void) /* load the pre-test start values */
+{
+	odp_counter64_write(&a64u, U64_INIT_VAL); /* 64-bit counter */
+	odp_counter32_write(&a32u, U32_INIT_VAL); /* 32-bit counter */
+}
+
+/* Returns 0 if both counters read U32_INIT_VAL / U64_INIT_VAL, else -1. */
+static int test_counter_validate(void)
+{
+	int status = 0;
+
+	if (odp_counter32_read(&a32u) != U32_INIT_VAL) {
+		ODP_ERR("Atomic u32 usual functions failed\n");
+		status = -1;
+	} else if (odp_counter64_read(&a64u) != U64_INIT_VAL) {
+		ODP_ERR("Atomic u64 usual functions failed\n");
+		status = -1;
+	}
+	return status;
+}
+
+static void *run_thread(void *arg) /* worker: time the selected workload */
+{
+	pthrd_arg *parg = (pthrd_arg *)arg;
+	int thr;
+	uint64_t nops = 0; /* operations performed by this thread */
+	struct timeval tv0, tv1; /* timestamps around the workload */
+
+	thr = odp_thread_id();
+
+	ODP_DBG("Thread %i starts\n", thr);
+
+	/* Wait here until all threads have arrived */
+	/* Use multiple barriers to verify that it handles wrap around and
+	 * has no race conditions which could be exposed when invoked back-
+	 * to-back */
+	odp_barrier_sync(&barrier);
+	odp_barrier_sync(&barrier);
+	odp_barrier_sync(&barrier);
+	odp_barrier_sync(&barrier);
+
+	gettimeofday(&tv0, NULL);
+
+	switch (parg->testcase) { /* no default: nops stays 0 */
+	case TEST_MIX:
+		nops += test_counter_basic();
+		break;
+	case TEST_INC_DEC_U32:
+		nops += test_counter_inc_dec_u32();
+		break;
+	case TEST_ADD_SUB_U32:
+		nops += test_counter_add_sub_u32();
+		break;
+	case TEST_INC_DEC_64:
+		nops += test_counter_inc_dec_64();
+		break;
+	case TEST_ADD_SUB_64:
+		nops += test_counter_add_sub_64();
+		break;
+	}
+	gettimeofday(&tv1, NULL);
+	accops[thr] = nops; /* record op count, indexed by thread id */
+	fflush(NULL); /* flush all open output streams */
+
+	uint64_t usecs = (tv1.tv_sec - tv0.tv_sec) * 1000000ULL +
+			 tv1.tv_usec - tv0.tv_usec;
+	printf("Time taken in thread %02d to complete %"PRIu64" op is "
+	       "%"PRIu64" usec, %"PRIu64" ns/op\n",
+	       thr, nops, usecs, 1000 * usecs / nops);
+
+	return parg; /* hands the argument pointer back to the caller */
+}
+
+/* Parse "-t <testcase> [-n <threads>]" (any order), start the workers and
+ * validate the counters. Returns 0 on pass, -1 on bad usage or failure. */
+int main(int argc, char *argv[])
+{
+	pthrd_arg thrdarg;
+	int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
+	char c;
+	int result;
+
+	if (argc == 1 || argc % 2 == 0) {
+		usage();
+		goto err_exit;
+	}
+	if (odp_test_global_init() != 0)
+		goto err_exit;
+	odp_print_system_info();
+
+	while (cnt != 0) {
+		/* A failed match would leave c (or the value) unset */
+		if (sscanf(argv[++i], "-%c", &c) != 1)
+			c = '?';
+		switch (c) {
+		case 't':
+			if (sscanf(argv[++i], "%d", &test_type) != 1)
+				test_type = 0;
+			break;
+		case 'n':
+			if (sscanf(argv[++i], "%d", &pthrdnum) != 1)
+				pthrdnum = -1;
+			break;
+		default:
+			ODP_ERR("Invalid option %c\n", c);
+			usage();
+			goto err_exit;
+		}
+		cnt -= 2;
+	}
+	/* Validate only after every option is parsed so that "-n" may
+	 * precede "-t"; TEST_MAX itself is not a runnable test case */
+	if (test_type < TEST_MIX || test_type >= TEST_MAX) {
+		ODP_ERR("Invalid test case [%d]\n", test_type);
+		usage();
+		goto err_exit;
+	}
+	if (pthrdnum < 0 || pthrdnum > odp_sys_core_count()) {
+		usage();
+		goto err_exit;
+	}
+	if (pthrdnum == 0)
+		pthrdnum = odp_sys_core_count();
+
+	test_counter_init();
+	test_counter_write();
+
+	memset(&thrdarg, 0, sizeof(pthrd_arg));
+	thrdarg.testcase = test_type;
+	thrdarg.numthrds = pthrdnum;
+
+	printf("%s\n", test_name[test_type]);
+	odp_barrier_init(&barrier, pthrdnum);
+	odp_test_thread_create(run_thread, &thrdarg);
+	odp_test_thread_exit(&thrdarg);
+
+	result = test_counter_validate();
+	printf("%s_%d_%d Result:%s\n", test_name[test_type], test_type,
+	       pthrdnum, result == 0 ? "pass" : "fail");
+	/* A failed validation must also be visible in the exit status */
+	return result == 0 ? 0 : -1;
+
+err_exit:
+	return -1;
+}