
[ODP/PATCH,v3] Look ma, no barriers! C11 memory model

Message ID 1413810465-26478-1-git-send-email-ola.liljedahl@linaro.org
State New

Commit Message

Ola Liljedahl Oct. 20, 2014, 1:07 p.m. UTC
Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
---
Added header file odp_counter.h with support for 32- and 64-bit atomic counters
using relaxed memory order: six operations (init/read/write/add/read_inc/inc)
on 32-bit and 64-bit counters respectively.
Renamed odp_atomic_test to odp_counter_test and changed it to use odp_counter.h.
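
Example usage of the counter API (the packet counter and the stats_* helpers
below are illustrative only; the odp_counter64_* calls are those defined in
odp_counter.h):

#include <odp_counter.h>

static odp_counter64_t pkts_rx;                 /* shared statistics counter */

static void stats_init(void)
{
	odp_counter64_init(&pkts_rx, 0);        /* start at zero */
}

static void stats_on_rx_pkt(void)
{
	odp_counter64_inc(&pkts_rx);            /* relaxed, low overhead */
}

static uint64_t stats_read(void)
{
	return odp_counter64_read(&pkts_rx);    /* plain snapshot, no ordering */
}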

Implemented a C11-based memory model for atomic operations: 10 operations
(init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec) in
odp_atomic.h. The required memory ordering is now a parameter to each call,
just like in C11.
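
For illustration, how the memory order parameter is meant to be used (the
'ready' flag and 'shared_data' variable below are hypothetical; the
odp_atomic32_* calls and ODP_MEMORDER_* values are those defined in
odp_atomic.h):

/* Producer: write the data, then publish it with a releasing store, so the
 * earlier plain store cannot be reordered after the flag update */
shared_data = 42;
odp_atomic32_store(&ready, 1, ODP_MEMORDER_RLS);

/* Consumer: wait for the flag with an acquiring load; later accesses cannot
 * be reordered before it, so shared_data == 42 is guaranteed to be seen */
while (odp_atomic32_load(&ready, ODP_MEMORDER_ACQ) == 0)
	; /* spin */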

Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
fall back to GCC __sync builtins which often include unnecessarily heavy
barrier/sync operations (always sequentially consistent).

Attempt to remove all explicit memory barriers (odp_sync_stores) from code that
implements multithreaded synchronization primitives (e.g. locks, barriers).
Rewrote such primitives to use the new atomic operations.

Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
odp_ticketlock_lock() (missing acquire barrier) and odp_ring enqueue/dequeue
(missing release barrier, had only compiler barrier).
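
As an illustration of the ring enqueue fix, the producer side follows the
release/acquire pattern (a sketch only, not the actual odp_ring.c diff; the
write_objects() helper and the index variables are hypothetical, prod.tail is
the odp_atomic32_t producer tail from odph_ring.h):

/* Store the enqueued object pointers first... */
write_objects(r, prod_head, obj_table, n);
/* ...then publish the new producer tail with release semantics so that a
 * consumer which acquire-loads prod.tail also observes the object stores.
 * The previous compiler-only barrier did not constrain the hardware. */
odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);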

 .gitignore                                         |   2 +-
 example/generator/odp_generator.c                  |  43 +-
 example/ipsec/odp_ipsec.c                          |   2 +-
 example/odp_example/odp_example.c                  |   2 +-
 example/timer/odp_timer_test.c                     |   2 +-
 helper/include/odph_ring.h                         |   8 +-
 platform/linux-generic/include/api/odp.h           |   1 +
 platform/linux-generic/include/api/odp_atomic.h    | 838 +++++++++++----------
 platform/linux-generic/include/api/odp_barrier.h   |  10 +-
 platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
 platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
 .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
 .../linux-generic/include/odp_buffer_internal.h    |   2 +-
 platform/linux-generic/include/odp_spin_internal.h |   9 -
 platform/linux-generic/odp_barrier.c               |  49 +-
 platform/linux-generic/odp_buffer.c                |   3 +-
 platform/linux-generic/odp_crypto.c                |   7 +-
 platform/linux-generic/odp_queue.c                 |   7 +-
 platform/linux-generic/odp_ring.c                  |  94 +--
 platform/linux-generic/odp_rwlock.c                |  62 +-
 platform/linux-generic/odp_thread.c                |   9 +-
 platform/linux-generic/odp_ticketlock.c            |  29 +-
 platform/linux-generic/odp_timer.c                 |  22 +-
 test/api_test/Makefile.am                          |   6 +-
 test/api_test/odp_atomic_test.c                    | 362 ---------
 test/api_test/odp_atomic_test.h                    |  60 --
 test/api_test/odp_common.c                         |   1 -
 test/api_test/odp_counter_test.c                   | 361 +++++++++
 28 files changed, 1365 insertions(+), 1014 deletions(-)
 create mode 100644 platform/linux-generic/include/api/odp_counter.h
 delete mode 100644 test/api_test/odp_atomic_test.c
 delete mode 100644 test/api_test/odp_atomic_test.h
 create mode 100644 test/api_test/odp_counter_test.c

Comments

Savolainen, Petri (NSN - FI/Espoo) Oct. 20, 2014, 3:35 p.m. UTC | #1
Hi,

This patch should be split into many. It modifies the atomic API, introduces a new API (counters), and fixes various implementation issues (e.g. the barrier).

I'd prefer to define counters after v1.0, since there are so many other APIs to be "finalized" before v1.0. Also, atomics other than "relaxed" ones are mainly needed by lock/synchronization algorithm implementations, which do not have the highest priority to abstract (there are few C lines, especially application C lines, in lock implementations).

You are removing many odp_sync_stores - are you sure all of those are redundant? Those are mainly needed to synchronize stores on user data, not on the lock implementation itself.


> void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>  {
> -	ticketlock->next_ticket = 0;
> -	ticketlock->cur_ticket  = 0;
> -	odp_sync_stores();

This ensures that previous writes (e.g. memset) to the data don't overlap with writes to the data when the user is holding the lock for the first time.

> +	odp_counter32_init(&ticketlock->next_ticket, 0);
> +	odp_atomic32_init(&ticketlock->cur_ticket, 0);
>  }
 
 
> @@ -22,30 +22,15 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
>  {
>  	uint32_t ticket;
>  
> -	ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
> +	ticket = odp_counter32_read_inc(&ticketlock->next_ticket);
>
> -	while (ticket != ticketlock->cur_ticket)
> +	while (ticket != odp_atomic32_load(&ticketlock->cur_ticket,
> +					   ODP_MEMORDER_ACQ))
>  		odp_spin();
> -
> -	odp_mem_barrier();
>  }

Why is next_ticket a counter, but cur_ticket an atomic?
 
 
>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>  {
> -	odp_sync_stores();

Sync stores to user data before releasing the lock.

> -
> -	ticketlock->cur_ticket++;
> -
> -#if defined __OCTEON__
> -	odp_sync_stores();

On Octeon, push out ticket update for better performance.

> -#else
> -	odp_mem_barrier();
> -#endif
> -}
> -
> -
> -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
> -{
> -	return ticketlock->cur_ticket != ticketlock->next_ticket;
> +	odp_atomic32_inc(&ticketlock->cur_ticket, ODP_MEMORDER_RLS);
>  }


-Petri
Ola Liljedahl Oct. 20, 2014, 5:20 p.m. UTC | #2
On 20 October 2014 17:35, Savolainen, Petri (NSN - FI/Espoo) <
petri.savolainen@nsn.com> wrote:

> Hi,
>
> This patch should be split into many. It's modifying atomic API,
> introducing a new API (counters) as well as fixing various implementation
> issues (e.g. barrier).
>
> I'd prefer to define counters after v1.0 due to the fact that there are so
> many other APIs to be "finalize" before v1.0. Also other than "relaxed"
> atomics are mainly needed by lock/synchronization algorithm
> implementations, which do not have the highest priority to abstract (low
> number of (especially application) C lines in lock implementations).
>
And I was under the impression that Nokia was disappointed with the amount of
code (LOCs) in ODP?

Relaxed atomics cannot be used for synchronization and lock
implementations; you need acquire and release operations for that.
Only the atomic counters use relaxed memory order, because the counters
are not used for synchronization and we want to optimize them for
low overhead and good scalability.



> You are removing many odp_sync_stores - are you sure all of those are
> redundant? Those are mainly needed to synchronize  stores on user data, not
> on the lock implementation itself.
>
The release operations include the necessary (one-sided) barriers; these
barriers cover all loads and stores issued by the current processor. Watch
the referenced Herb Sutter presentations; they are very good.



>
>
> > void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
> >  {
> > -     ticketlock->next_ticket = 0;
> > -     ticketlock->cur_ticket  = 0;
> > -     odp_sync_stores();
>
> This ensures that previous writes (e.g. memset) to the data don't overlap
> with writes to data when user is holding the lock the first time.
>
For other threads to be able to safely access this ticketlock, they must
either have been created after this lock was initialized or have been
notified in some way about the existence of this lock. Such thread creation,
notification or synchronization (e.g. odp_barrier_sync) would have included
the necessary barriers. If no such synchronization has occurred before
some other thread accesses the lock, the program has a race condition
(nothing stopped that thread from accessing the lock *before* it was
initialized) and the behavior is undefined. odp_sync_stores() just makes
sure all processors have seen the preceding stores; it does not synchronize
with the application.

The thread that initialized the lock can also immediately acquire it; it
does not need to wait for other threads to have seen the lock initialization
(as they cannot access the lock until they have been notified). The executing
thread will see its own writes (e.g. lock initialization, data memset) in
program order, so the sync doesn't change anything here.
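
A sketch of the startup pattern being relied on here (odp_barrier_sync() and
the odp_ticketlock_*() calls are ODP API; the 'shared' struct and
'startup_barrier' names are hypothetical):

/* Thread 0: initialize, then synchronize with the workers */
odp_ticketlock_init(&shared->lock);     /* plain init, no sync needed */
odp_barrier_sync(&startup_barrier);     /* makes the init visible to workers */

/* Worker threads: only touch the lock after the barrier */
odp_barrier_sync(&startup_barrier);
odp_ticketlock_lock(&shared->lock);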



>
> > +     odp_counter32_init(&ticketlock->next_ticket, 0);
> > +     odp_atomic32_init(&ticketlock->cur_ticket, 0);
> >  }
>
>
> > @@ -22,30 +22,15 @@ void odp_ticketlock_lock(odp_ticketlock_t
> *ticketlock)
> >  {
> >       uint32_t ticket;
> >
> > -     ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
> > +     ticket = odp_counter32_read_inc(&ticketlock->next_ticket);
> >
> > -     while (ticket != ticketlock->cur_ticket)
> > +     while (ticket != odp_atomic32_load(&ticketlock->cur_ticket,
> > +                                        ODP_MEMORDER_ACQ))
> >               odp_spin();
> > -
> > -     odp_mem_barrier();
> >  }
>
> Why next_ticket is counter, but cur_ticket is atomic ?
>
next_ticket is just there for handing out tickets; it is not used for
ownership of the lock.
cur_ticket is the actual lock variable that you acquire and release, and
ownership of the lock implies exclusive access to the associated user data
structure. Acquiring the lock variable is done with acquire memory order, so
it implies a one-sided acquire barrier (i.e. later accesses from this
processor cannot be ordered before the acquire load).
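
For reference, the same ticketlock expressed directly in standard C11 atomics
(not part of the patch, just to show how the memory orders map):

#include <stdatomic.h>

static atomic_uint next_ticket, cur_ticket;

static void lock(void)
{
	/* handing out a ticket only needs atomicity, hence relaxed */
	unsigned int t = atomic_fetch_add_explicit(&next_ticket, 1,
						   memory_order_relaxed);
	/* acquiring load: later accesses cannot move before it */
	while (atomic_load_explicit(&cur_ticket, memory_order_acquire) != t)
		; /* spin */
}

static void unlock(void)
{
	/* releasing increment: earlier accesses cannot move after it */
	atomic_fetch_add_explicit(&cur_ticket, 1, memory_order_release);
}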


>
> >  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
> >  {
> > -     odp_sync_stores();
>
> Sync stores to user data before releasing the lock.
>
The acquire/release design updates the cur_ticket lock variable with the
release memory order. This forces all earlier memory accesses (from this
processor) to complete before the lock is written. Accesses after the
release are still allowed to move before the releasing store (e.g.
speculative load in order to decrease load-to-use latency).
void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
{
        odp_atomic32_inc(&ticketlock->cur_ticket, ODP_MEMORDER_RLS);
}




>
> > -
> > -     ticketlock->cur_ticket++;
> > -
> > -#if defined __OCTEON__
> > -     odp_sync_stores();
>
> On Octeon, push out ticket update for better performance.
>
The odp_atomic32_inc() operation with the release memory order will include
HW barriers (before and after, as seems to be required); in the case of
OCTEON this is "syncw", which also has the side effect of flushing the
write buffer. In any case, Jerin has promised to have a look at the code
and make any OCTEON-specific optimizations. (Do you still need
syncw;syncw?)


> > -#else
> > -     odp_mem_barrier();
> > -#endif
> > -}
> > -
> > -
> > -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
> > -{
> > -     return ticketlock->cur_ticket != ticketlock->next_ticket;
> > +     odp_atomic32_inc(&ticketlock->cur_ticket, ODP_MEMORDER_RLS);
>
Here is the actual releasing store (atomic RMW operation with release
memory order). diff is playing tricks with us.


-- Ola

>  }
>
>
> -Petri
>
>
>
>
Ola Liljedahl Nov. 4, 2014, 1:48 p.m. UTC | #3
Ping!

I really need this new, working atomics support merged ASAP because I have a
new lock-less implementation of the timer API which uses atomic operations.
I haven't seen any real criticism of the content of the patch, so there
is nothing to change.

-- Ola


On 20 October 2014 15:07, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:

> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
> ---
> Added header file odp_counter.h with support for 32- and 64-bit atomic
> counters
> using relaxed memory order. 6 operations
> (init/read/write/add/read_inc/inc) on
> 32-bit and 64-bit counters respectively.
> Renamed odp_atomic_test to odp_counter_test and changed to use
> odp_counter.h
>
> Implementation of C11-based memory model for atomic operations. 10
> operations
> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
> in
> odp_atomic.h. The required memory ordering is now a parameter to each call
> just
> like in C11.
>
> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
> fall back to GCC __sync builtins which often include unnecessarily heavy
> barrier/sync operations (always sequentially consistent).
>
> Attempt to remove all explicit memory barriers (odp_sync_stores) from code
> that
> implements multithreaded synchronization primitives (e.g. locks, barriers).
> Rewrote such primitives to use the new atomic operations.
>
> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
> enqueue/dequeue
> (missing release barrier, had only compiler barrier).
>
>  .gitignore                                         |   2 +-
>  example/generator/odp_generator.c                  |  43 +-
>  example/ipsec/odp_ipsec.c                          |   2 +-
>  example/odp_example/odp_example.c                  |   2 +-
>  example/timer/odp_timer_test.c                     |   2 +-
>  helper/include/odph_ring.h                         |   8 +-
>  platform/linux-generic/include/api/odp.h           |   1 +
>  platform/linux-generic/include/api/odp_atomic.h    | 838
> +++++++++++----------
>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>  platform/linux-generic/odp_barrier.c               |  49 +-
>  platform/linux-generic/odp_buffer.c                |   3 +-
>  platform/linux-generic/odp_crypto.c                |   7 +-
>  platform/linux-generic/odp_queue.c                 |   7 +-
>  platform/linux-generic/odp_ring.c                  |  94 +--
>  platform/linux-generic/odp_rwlock.c                |  62 +-
>  platform/linux-generic/odp_thread.c                |   9 +-
>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>  platform/linux-generic/odp_timer.c                 |  22 +-
>  test/api_test/Makefile.am                          |   6 +-
>  test/api_test/odp_atomic_test.c                    | 362 ---------
>  test/api_test/odp_atomic_test.h                    |  60 --
>  test/api_test/odp_common.c                         |   1 -
>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>  delete mode 100644 test/api_test/odp_atomic_test.c
>  delete mode 100644 test/api_test/odp_atomic_test.h
>  create mode 100644 test/api_test/odp_counter_test.c
>
> diff --git a/.gitignore b/.gitignore
> index 6342e34..77db4d6 100644
> --- a/.gitignore
> +++ b/.gitignore
> @@ -35,7 +35,7 @@ build/
>  odp_example
>  odp_packet
>  odp_packet_netmap
> -odp_atomic
> +odp_counter
>  odp_shm
>  odp_ring
>  odp_timer_ping
> diff --git a/example/generator/odp_generator.c
> b/example/generator/odp_generator.c
> index eb8b340..252157d 100644
> --- a/example/generator/odp_generator.c
> +++ b/example/generator/odp_generator.c
> @@ -62,10 +62,10 @@ typedef struct {
>   * counters
>  */
>  static struct {
> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
> -       odp_atomic_u64_t ip;    /**< ip packets */
> -       odp_atomic_u64_t udp;   /**< udp packets */
> -       odp_atomic_u64_t icmp;  /**< icmp packets */
> +       odp_counter64_t seq;    /**< ip seq to be send */
> +       odp_counter64_t ip;     /**< ip packets */
> +       odp_counter64_t udp;    /**< udp packets */
> +       odp_counter64_t icmp;   /**< icmp packets */
>  } counters;
>
>  /** * Thread specific arguments
> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_UDPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_UDP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_ICMPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_ICMP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>                 }
>
>                 if (args->appl.interval != 0) {
> +                       uint64_t seq = odp_counter64_read(&counters.seq);
>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
> -                              thr, counters.seq, counters.seq%0xffff);
> +                              thr, seq, seq%0xffff);
>                         /* TODO use odp timer */
>                         usleep(args->appl.interval * 1000);
>                 }
> -               if (args->appl.number != -1 && counters.seq
> -                   >= (unsigned int)args->appl.number) {
> +               if (args->appl.number != -1 &&
> +                   odp_counter64_read(&counters.seq) >=
> +                   (unsigned int)args->appl.number) {
>                         break;
>                 }
>         }
> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>         /* receive number of reply pks until timeout */
>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>                 while (args->appl.timeout >= 0) {
> -                       if (counters.icmp >= (unsigned
> int)args->appl.number)
> +                       if (odp_counter64_read(&counters.icmp) >=
> +                           (unsigned int)args->appl.number)
>                                 break;
>                         /* TODO use odp timer */
>                         sleep(1);
> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>
>         /* print info */
>         if (args->appl.mode == APPL_MODE_UDP) {
> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
> +               printf("  [%02i] total send: %ju\n", thr,
> +                      odp_counter64_read(&counters.seq));
>         } else if (args->appl.mode == APPL_MODE_PING) {
>                 printf("  [%02i] total send: %ju total receive: %ju\n",
> -                      thr, counters.seq, counters.icmp);
> +                      thr, odp_counter64_read(&counters.seq),
> +                      odp_counter64_read(&counters.icmp));
>         }
>         return arg;
>  }
> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                 if (!odp_packet_inflag_ipv4(pkt))
>                         continue;
>
> -               odp_atomic_inc_u64(&counters.ip);
> +               odp_counter64_inc(&counters.ip);
>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>
>                 /* udp */
>                 if (ip->proto == ODPH_IPPROTO_UDP) {
> -                       odp_atomic_inc_u64(&counters.udp);
> +                       odp_counter64_inc(&counters.udp);
>                         udp = (odph_udphdr_t *)(buf + offset);
>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>                                         odp_be_to_cpu_16(udp->length) -
> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                         icmp = (odph_icmphdr_t *)(buf + offset);
>                         /* echo reply */
>                         if (icmp->type == ICMP_ECHOREPLY) {
> -                               odp_atomic_inc_u64(&counters.icmp);
> +                               odp_counter64_inc(&counters.icmp);
>                                 memcpy(&tvsend, buf + offset +
> ODPH_ICMPHDR_LEN,
>                                        sizeof(struct timeval));
>                                 /* TODO This should be changed to use an
> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>         }
>
>         /* init counters */
> -       odp_atomic_init_u64(&counters.seq);
> -       odp_atomic_init_u64(&counters.ip);
> -       odp_atomic_init_u64(&counters.udp);
> -       odp_atomic_init_u64(&counters.icmp);
> +       odp_counter64_init(&counters.seq, 0);
> +       odp_counter64_init(&counters.ip, 0);
> +       odp_counter64_init(&counters.udp, 0);
> +       odp_counter64_init(&counters.icmp, 0);
>
>         /* Reserve memory for args from shared mem */
>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
> index 2f2dc19..76c27d0 100644
> --- a/example/ipsec/odp_ipsec.c
> +++ b/example/ipsec/odp_ipsec.c
> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>         printf("Num worker threads: %i\n", num_workers);
>
>         /* Create a barrier to synchronize thread startup */
> -       odp_barrier_init_count(&sync_barrier, num_workers);
> +       odp_barrier_init(&sync_barrier, num_workers);
>
>         /*
>          * By default core #0 runs Linux kernel background tasks.
> diff --git a/example/odp_example/odp_example.c
> b/example/odp_example/odp_example.c
> index 0e9aa3d..c473395 100644
> --- a/example/odp_example/odp_example.c
> +++ b/example/odp_example/odp_example.c
> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>         odp_shm_print_all();
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&globals->barrier, num_workers);
> +       odp_barrier_init(&globals->barrier, num_workers);
>
>         if (args.proc_mode) {
>                 int ret;
> diff --git a/example/timer/odp_timer_test.c
> b/example/timer/odp_timer_test.c
> index 78b2ae2..dfbeae9 100644
> --- a/example/timer/odp_timer_test.c
> +++ b/example/timer/odp_timer_test.c
> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>         printf("\n");
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&test_barrier, num_workers);
> +       odp_barrier_init(&test_barrier, num_workers);
>
>         /* Create and launch worker threads */
>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
> index 76c1db8..5e78b34 100644
> --- a/helper/include/odph_ring.h
> +++ b/helper/include/odph_ring.h
> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>                 uint32_t sp_enqueue;     /* True, if single producer. */
>                 uint32_t size;           /* Size of ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Producer head. */
> -               uint32_t tail;          /* Producer tail. */
> +               odp_atomic32_t head;    /* Producer head. */
> +               odp_atomic32_t tail;    /* Producer tail. */
>         } prod ODP_ALIGNED_CACHE;
>
>         /** @private Consumer */
> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>                 uint32_t size;           /* Size of the ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Consumer head. */
> -               uint32_t tail;          /* Consumer tail. */
> +               odp_atomic32_t head;    /* Consumer head. */
> +               odp_atomic32_t tail;    /* Consumer tail. */
>         } cons ODP_ALIGNED_CACHE;
>
>         /** @private Memory space of ring starts here. */
> diff --git a/platform/linux-generic/include/api/odp.h
> b/platform/linux-generic/include/api/odp.h
> index 0ee3faf..d124d52 100644
> --- a/platform/linux-generic/include/api/odp.h
> +++ b/platform/linux-generic/include/api/odp.h
> @@ -32,6 +32,7 @@ extern "C" {
>  #include <odp_barrier.h>
>  #include <odp_spinlock.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>
>  #include <odp_init.h>
>  #include <odp_system_info.h>
> diff --git a/platform/linux-generic/include/api/odp_atomic.h
> b/platform/linux-generic/include/api/odp_atomic.h
> index 0cc4cf4..ccaad02 100644
> --- a/platform/linux-generic/include/api/odp_atomic.h
> +++ b/platform/linux-generic/include/api/odp_atomic.h
> @@ -4,464 +4,494 @@
>   * SPDX-License-Identifier:     BSD-3-Clause
>   */
>
> -
>  /**
>   * @file
>   *
> - * ODP atomic operations
> + * ODP atomic types and operations, semantically a subset of C11 atomics.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Atomic functions must be used to operate on atomic variables!
>   */
>
>  #ifndef ODP_ATOMIC_H_
>  #define ODP_ATOMIC_H_
>
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +#include <odp_debug.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
>
> -
> -#include <odp_std_types.h>
> -
> -
> -/**
> - * Atomic integer
> - */
> -typedef volatile int32_t odp_atomic_int_t;
> -
> -/**
> - * Atomic unsigned integer 64 bits
> - */
> -typedef volatile uint64_t odp_atomic_u64_t;
> -
> -/**
> - * Atomic unsigned integer 32 bits
> - */
> -typedef volatile uint32_t odp_atomic_u32_t;
> -
> -
> -/**
> - * Initialize atomic integer
> - *
> - * @param ptr    An integer atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic integer
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic integer value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic integer
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
> new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic integer
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic integer
> - *
> - * @param ptr    An atomic integer variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement atomic integer by 1
> - *
> - * @param ptr    An atomic int variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Initialize atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic uint32 value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic uint32
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
> -                                       uint32_t new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be sub to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
>  /**
> - * Fetch and increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __OCTEON__
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       uint32_t ret;
> -
> -       __asm__ __volatile__ ("syncws");
> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
> -                             "r" (ptr));
> -
> -       return ret;
> -}
> -
> + * 32-bit (unsigned) atomic type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the atomic variable */
> +} odp_atomic32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
> +
> +typedef enum {
> +       /** Relaxed memory order, no ordering of other accesses enforced */
> +       ODP_MEMORDER_RLX,
> +       /** Acquire memory order, later accesses cannot move before
> +        * acquire operation */
> +       ODP_MEMORDER_ACQ,
> +       /** Release memory order, earlier accesses cannot move after
> +        * release operation */
> +       ODP_MEMORDER_RLS
> +} odp_memorder_t;
> +
>
> +/*****************************************************************************
> + * Just some private helpers
>
> +*****************************************************************************/
> +
> +#ifdef __OCTEON__
> +/* OCTEON Write Memory Barrier */
> +#define COMPILER_HW_BARRIER() __asm __volatile( \
> +       /* Double syncw to work around errata */ \
> +       "syncw\n\tsyncw" : : : )
>  #else
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> +/** Compiler and hardware full memory barrier */
> +#define COMPILER_HW_BARRIER() __sync_synchronize()
> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a
> */
>  #endif
>
> -/**
> - * Increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Atomic compare and set for 32bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> +#define MEMORY "memory"
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomics
> + * odp_atomic32_init - no return value
> + * odp_atomic32_load - return current value
> + * odp_atomic32_store - no return value
> + * odp_atomic32_cmp_xchg_weak - return bool
> + * odp_atomic32_fetch_add - return old value
> + * odp_atomic32_add - no return value
> + * odp_atomic32_fetch_inc - return old value
> + * odp_atomic32_inc - no return value
> + * odp_atomic32_fetch_dec - return old value
> + * odp_atomic32_dec - no return value
> +
> *****************************************************************************/
> +
> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable,
> thus we
> +        * can avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic load of 32-bit atomic variable
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the load
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               uint32_t val;
> +               /* Read of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) read
> variable,
> +                * thus we can avoid using compiler memory barriers */
> +               val = *(__volatile const uint32_t *)&ptr->v;
> +               return val;
> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
> +#if defined __aarch64__
> +               uint32_t val;
> +               __asm __volatile("ldar %w0, [%1]"
> +                               : "=&r"(val)
> +                               : "r"(&ptr->v)
> +                               : MEMORY);
> +               return val;
> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
> +               /* Read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               /* Herb Sutter claims HW barrier not needed on x86? */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#else
> +#warning odp_atomic32_load() may not be efficiently implemented
> +               /* Assume read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic store to 32-bit atomic variable
> + *
> + * @param ptr  Pointer to a 32-bit atomic variable
> + * @param val  Value to write to the atomic variable
> + * @param memmodel Memory model associated with the store
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
> + */
> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Write of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) write
> variable,
> +                * thus we will avoid using compiler memory barriers */
> +               *(__volatile uint32_t *)&ptr->v = val;
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
> +               /* Compiler and HW barrier to prevent earlier accesses from
> +                * moving down */
> +               COMPILER_HW_BARRIER();
> +               /* Write of aligned word is atomic */
> +               ptr->v = val;
> +               /* Compiler and HW barrier to prevent this store from
> moving
> +                * down after a later load-acquire and thus create
> overlapping
> +                * critical sections. Herb Sutter thinks this is needed */
> +               COMPILER_HW_BARRIER();
> +#elif defined __aarch64__
> +               __asm __volatile("stlr %w0, [%1]"
> +                               :
> +                               : "r"(val), "r"(&ptr->v)
> +                               : MEMORY);
> +#elif defined __x86_64__
> +               /* This is actually an atomic exchange operation */
> +               /* Generates good code on x86_64 */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#else
> +#warning odp_atomic32_store_rls() may not be efficiently implemented
> +               /* This is actually an atomic exchange operation */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +
> +/**
> + * Atomic compare and exchange (swap) of 32-bit atomic variable
> + * "Weak" semantics, may fail spuriously and must be used in a loop.
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param exp_p Pointer to expected value (updated on failure)
> + * @param val   New value to write
> + * @param       memmodel Memory model associated with the compare-and-swap
> + * operation (ODP_MEMORDER_RLX only)
> + *
> + * @return 1 (true) if exchange successful, 0 (false) if not successful
> (and
> + * '*exp_p' updated with current value)
> + */
> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
> +               uint32_t *exp_p,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               int status;
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "cmp   %0, %3\t\n"
> +                                "bne   1f\t\n"
> +                                "strex %1, %4, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "=&r"(status)
> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Clear exclusive access monitor */
> +                       __asm __volatile("clrex");
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* strex returns 0 on success */
> +               if (odp_unlikely(status != 0)) {
> +                       /* strex failed, reservation was disturbed */
> +                       /* Return potentially changed value */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __mips64__
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               uint32_t status = val;
> +               __asm __volatile("llw %0, [%2]\t\n"
> +                                "bne %0, %3, 1f\t\n"
> +                                "scw %1, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "+&r"(status)
> +                               : "r"(&ptr->v), "r"(exp)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* scw returns 1 on success, 0 on failure */
> +               if (odp_unlikely(status == 0)) {
> +                       /* scw failed, reservation was disturbed */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __x86_64__
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#else
> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic fetch and add to 32-bit atomic variable
> + * @note A - B <=> A + (-B)
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param memmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
> + *
> + * @return Value of the atomic variable before the addition
> + */
> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old_val, tmp;
> +               int status;
> +               do {
> +                       __asm __volatile("ldrex %0, [%3]\t\n"
> +                                        "add   %1, %0, %4\t\n"
> +                                        "strex %2, %1, [%3]\t\n"
> +                                       : "=&r"(old_val), "=&r"(tmp),
> +                                         "=&r"(status)
> +                                       : "r"(&ptr->v), "r"(incr)
> +                                       : MEMORY);
> +               } while (odp_unlikely(status != 0));
> +               return old_val;
> +#elif defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               return old_val;
> +#elif defined __x86_64__
> +               /* Generates good code on x86_64 */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#else
> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +               /* __sync_fetch_and_add() will give us barriers before and
> +                * after, we are fine with this for release operations */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
>  }
>
>  /**
> - * Initialize atomic uint64
> + * Atomic add to 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param memmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
>  {
> -       *ptr = 0;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               return;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>  }
>
>  /**
> - * Load value of atomic uint64
> - *
> - * @param ptr    An atomic variable
> + * Atomic fetch and increment of 32-bit atomic variable
>   *
> - * @return atomic uint64 value
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @note The operation is not synchronized with other threads
> + * @return Value of the atomic variable before the increment
>   */
> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return *ptr;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic increment instructions
> can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>  }
>
>  /**
> - * Store value to atomic uint64
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> + * Atomic increment of 32-bit atomic variable
>   *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
> -                                       uint64_t new_value)
> -{
> -       *ptr = new_value;
> -}
> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>
> -/**
> - * Add atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - */
> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
>  {
> -       __sync_fetch_and_add(ptr, value);
> +       /* Default to using odp_atomic32_fetch_inc() */
> +       /* Platforms that support atomic increment instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>  }
>
>  /**
> - * Fetch and add atomic uint64
> + * Atomic fetch and decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @return Value of the variable before the operation
> + * @return Value of the atomic variable before the decrement
>   */
> -
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic decrement instructions
> can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
>  #endif
> -/**
> - * Subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - */
> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
> -{
> -       __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
>  #endif
> -/**
> - * Fetch and increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u64(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       odp_atomic_fetch_add_u64(ptr, 1);
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>  }
>
>  /**
> - * Fetch and decrement atomic uint64 by 1
> + * Atomic decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u64(ptr, 1);
> -}
> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t memorder)
>
> -/**
> - * Decrement atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>  {
> -       odp_atomic_fetch_sub_u64(ptr, 1);
> +       /* Default to using odp_atomic32_fetch_dec() */
> +       /* Platforms that support atomic decrement instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>  }
>
> -/**
> - * Atomic compare and set for 64bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> -}
> +/* We are not exporting this macro */
> +#undef COMPILER_HW_BARRIER
> +#undef MEMORY
>
>  #ifdef __cplusplus
>  }
> diff --git a/platform/linux-generic/include/api/odp_barrier.h
> b/platform/linux-generic/include/api/odp_barrier.h
> index a7b3215..69b1eb8 100644
> --- a/platform/linux-generic/include/api/odp_barrier.h
> +++ b/platform/linux-generic/include/api/odp_barrier.h
> @@ -27,18 +27,18 @@ extern "C" {
>   * ODP execution barrier
>   */
>  typedef struct odp_barrier_t {
> -       int              count;  /**< @private Thread count */
> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
> +       uint32_t       num_threads;  /**< @private Thread count (constant)
> */
> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>  } odp_barrier_t;
>
>
>  /**
>   * Init barrier with thread count
>   *
> - * @param barrier    Barrier
> - * @param count      Thread count
> + * @param barrier     Barrier
> + * @param num_threads Number of threads which share the barrier
>   */
> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>
>
>  /**
> diff --git a/platform/linux-generic/include/api/odp_counter.h
> b/platform/linux-generic/include/api/odp_counter.h
> new file mode 100644
> index 0000000..f937d27
> --- /dev/null
> +++ b/platform/linux-generic/include/api/odp_counter.h
> @@ -0,0 +1,363 @@
> +/* Copyright (c) 2013, Linaro Limited
> + * All rights reserved.
> + *
> + * SPDX-License-Identifier:     BSD-3-Clause
> + */
> +
> +/**
> + * @file
> + *
> + * ODP atomic counter types and operations, suitable for e.g. shared
> statistics.
> + * Relaxed memory model assumed for lowest overhead.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Counter functions must be used to operate on counter variables!
> + */
> +
> +#ifndef ODP_COUNTER_H_
> +#define ODP_COUNTER_H_
> +
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * 32-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the counter variable */
> +} odp_counter32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
> +
> +/**
> + * 64-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint64_t v; /**< Actual storage for the counter variable */
> +       /* Room for other data structures (e.g. spin lock) that might be
> +        * needed to ensure atomicity on some architectures */
> +} odp_counter64_t
> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomic counters
> + * odp_counter32_init - returns no value
> + * odp_counter32_read - returns current value
> + * odp_counter32_write - returns no value
> + * odp_counter32_add - returns no value
> + * odp_counter32_read_inc - returns old value
> + * odp_counter32_inc - returns no value
> +
> *****************************************************************************/
> +
> +/**
> + * Initialize 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Read 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
> +{
> +       uint32_t val;
> +       /* Read of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) read variable, thus
> we
> +        * will avoid using compiler memory barriers */
> +       val = *(__volatile const uint32_t *)&ptr->v;
> +       return val;
> +}
> +
> +/**
> + * Write 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Value to write to the variable
> + */
> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable,
> thus we
> +        * will avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic add to 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "add   %0, %0, %3\t\n"
> +                                "strex %1, %0, [%2]"
> +                                : "=&r"(result), "=&r"(status)
> +                                : "r"(&ptr->v), "Ir" (incr)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +#elif defined __OCTEON__
> +       __asm __volatile("saa %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +       /* Warning odp_counter32_add() may not be efficiently implemented
> */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) of 32-bit counter variable, return original value
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%3]\t\n"
> +                                "add   %1, %0, #1\t\n"
> +                                "strex %2, %1, [%3]"
> +                                : "=&r"(result), "=&r"(tmp), "=&r"(status)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +       return result;
> +#elif defined __OCTEON__
> +       uint32_t old_val;
> +       __asm __volatile("lai %0,(%2)"
> +                        : "=r" (old_val), "+m" (ptr)
> +                        : "r" (ptr)
> +                        : );
> +       return old_val;
> +#elif defined __x86_64__
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter32_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + */
> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
> +{
> +#if defined __OCTEON__
> +       odp_counter32_add(ptr, 1);
> +#else
> +       (void)odp_counter32_read_inc(ptr);
> +#endif
> +}
> +
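A short usage sketch for the 32-bit counter API above (the stats struct and function names below are made up for illustration; only the odp_counter32_* calls come from this patch):

        #include <odp_counter.h>

        struct pkt_stats {
                odp_counter32_t rx_pkts; /* shared, updated by many threads */
        };

        static struct pkt_stats stats;

        static void stats_init(void)
        {
                odp_counter32_init(&stats.rx_pkts, 0);
        }

        static void on_rx_burst(uint32_t npkts)
        {
                /* Relaxed atomic add, no ordering with other memory accesses */
                odp_counter32_add(&stats.rx_pkts, npkts);
        }

        static uint32_t stats_rx_pkts(void)
        {
                return odp_counter32_read(&stats.rx_pkts);
        }
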
>
> +/*****************************************************************************
> + * Operations on 64-bit atomic counters
> + * odp_counter64_init
> + * odp_counter64_read
> + * odp_counter64_write
> + * odp_counter64_add
> + * odp_counter64_read_inc
> + * odp_counter64_inc
> +
> *****************************************************************************/
> +
> +/**
> + * Read 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Value of the counter variable
> + */
> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t val;
> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
> +                        "clrex" /* Clear exclusive access monitor */
> +                        : "=&r"(val)
> +                        : "r"(&ptr->v)
> +                        : );
> +       return val;
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Read of aligned quad/double word is atomic */
> +       return ptr->v;
> +#else
> +/* Warning odp_counter64_read() may not be efficiently implemented */
> +       return __sync_fetch_and_or(&ptr->v, 0);
> +#endif
> +}
> +
> +/**
> + * Write 64-bit counter variable
> + *
> + * @param ptr  Pointer to a 64-bit counter variable
> + * @param val  Value to write to the counter variable
> + */
> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t val)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               /* Read counter variable exclusively so we can write to it
> +                * later */
> +               /* Attempt to write the new value */
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "strexd %1, %3, %H3, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(val)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Write of aligned quad/double word is atomic */
> +       ptr->v = val;
> +#else
> +/* Warning odp_counter64_write() may not be efficiently implemented */
> +       /* This is actually an atomic exchange operation */
> +       (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +}
> +
> +/**
> + * Initialize 64-bit counter variable
> + * Perform implementation specific initializations, assign initial value.
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       odp_counter64_write(ptr, val);
> +}
> +
> +/**
> + * Atomic add to 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, %0, %3\t\n"
> +                                "adc    %H0, %H3\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(incr)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __OCTEON__
> +       __asm __volatile("saad %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +/* Warning odp_counter64_add() may not be efficiently implemented */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable and return original value
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
> +                                "adds   %2, %0, #1\t\n"
> +                                "adc    %H2, %H0, #0\t\n"
> +                                "strexd %1, %2, %H2, [%3]"
> +                                : "=&r"(old_val), "=&r"(status),
> "=&r"(tmp)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +       return old_val;
> +#elif defined __OCTEON__
> +       uint64_t old_val;
> +       __asm __volatile("laid %0,(%2)"
> +                       : "=r" (old_val), "+m" (ptr)
> +                       : "r" (ptr)
> +                       : );
> +       return old_val;
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter64_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + */
> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, #1\t\n"
> +                                "adc    %H0, #0\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#else
> +       (void)odp_counter64_read_inc(ptr);
> +#endif
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
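Regarding the "room for other data structures (e.g. spin lock)" note in odp_counter64_t: on a 32-bit target without 64-bit LL/SC or CAS, one conceivable fallback (purely hypothetical, not what this patch implements) would embed a spinlock next to the value:

        #include <odp_spinlock.h>

        /* Hypothetical fallback layout for targets without 64-bit atomics */
        typedef struct {
                uint64_t v;
                odp_spinlock_t lock; /* protects 'v' */
        } odp_counter64_t;

        static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
        {
                odp_spinlock_lock(&ptr->lock);
                ptr->v += incr;
                odp_spinlock_unlock(&ptr->lock);
        }

(odp_counter64_init() would then also have to call odp_spinlock_init().)
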
> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
> b/platform/linux-generic/include/api/odp_rwlock.h
> index 252ebb2..ff8a9a2 100644
> --- a/platform/linux-generic/include/api/odp_rwlock.h
> +++ b/platform/linux-generic/include/api/odp_rwlock.h
> @@ -10,26 +10,30 @@
>  /**
>   * @file
>   *
> - * ODP RW Locks
> + * ODP read/write lock
> + * An RW lock supports multiple concurrent readers but only one (exclusive) writer.
>   */
>
> +#include <odp_atomic.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
>
>  /**
>   * The odp_rwlock_t type.
> - * write lock count is -1,
> - * read lock count > 0
> + * write lock is ~0U
> + * read lock count >0 && <~0U
>   */
>  typedef struct {
> -       volatile int32_t cnt; /**< -1 Write lock,
> -                               > 0 for Read lock. */
> +       odp_atomic32_t cnt; /**< == 0: unlocked,
> +                                == ~0: locked for write,
> +                                > 0 number of concurrent read locks */
>  } odp_rwlock_t;
>
>
>  /**
> - * Initialize the rwlock to an unlocked state.
> + * Initialize the rwlock to the unlocked state.
>   *
>   * @param rwlock pointer to the RW Lock.
>   */
> @@ -50,14 +54,14 @@ void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
>
>  /**
> - * Aquire a write lock.
> + * Acquire the write lock.
>   *
>   * @param rwlock pointer to a RW Lock.
>   */
>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
>
>  /**
> - * Release a write lock.
> + * Release the write lock.
>   *
>   * @param rwlock pointer to a RW Lock.
>   */
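The new cnt encoding (0 = unlocked, ~0 = write-locked, otherwise the number of readers) also makes non-blocking variants easy to express. A hypothetical odp_rwlock_read_trylock(), not part of this patch, just to illustrate the encoding:

        /* Hypothetical helper, not included in this patch */
        static inline int odp_rwlock_read_trylock(odp_rwlock_t *rwlock)
        {
                uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);

                if (cnt == (uint32_t)-1)
                        return 0; /* write-locked */
                /* Single attempt; the weak CAS may also fail spuriously */
                return odp_atomic32_cmp_xchg_weak(&rwlock->cnt, &cnt, cnt + 1,
                                                  ODP_MEMORDER_RLX) ? 1 : 0;
        }
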
> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
> b/platform/linux-generic/include/api/odp_ticketlock.h
> index 6277a18..5933f85 100644
> --- a/platform/linux-generic/include/api/odp_ticketlock.h
> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
> @@ -21,14 +21,15 @@ extern "C" {
>
>  #include <odp_std_types.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>
>
>  /**
>   * ODP ticketlock
>   */
>  typedef struct odp_ticketlock_t {
> -       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
> -       volatile uint32_t cur_ticket;  /**< @private Current ticket */
> +       odp_counter32_t next_ticket; /**< @private Next ticket */
> +       odp_atomic32_t cur_ticket;  /**< @private Current ticket */
>  } odp_ticketlock_t;
>
>
> diff --git a/platform/linux-generic/include/odp_buffer_internal.h
> b/platform/linux-generic/include/odp_buffer_internal.h
> index 2002b51..530ab96 100644
> --- a/platform/linux-generic/include/odp_buffer_internal.h
> +++ b/platform/linux-generic/include/odp_buffer_internal.h
> @@ -88,7 +88,7 @@ typedef struct odp_buffer_hdr_t {
>         uint32_t                 index;      /* buf index in the pool */
>         size_t                   size;       /* max data size */
>         size_t                   cur_offset; /* current offset */
> -       odp_atomic_int_t         ref_count;  /* reference count */
> +       odp_atomic32_t           ref_count;  /* reference count */
>         odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
>         int                      type;       /* type of next header */
>         odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
> diff --git a/platform/linux-generic/include/odp_spin_internal.h
> b/platform/linux-generic/include/odp_spin_internal.h
> index b7e2071..29c524f 100644
> --- a/platform/linux-generic/include/odp_spin_internal.h
> +++ b/platform/linux-generic/include/odp_spin_internal.h
> @@ -15,15 +15,6 @@ extern "C" {
>
>
>  /**
> - * GCC memory barrier for ODP internal use
> - */
> -static inline void odp_mem_barrier(void)
> -{
> -       __asm__ __volatile__ ("" : : : "memory");
> -}
> -
> -
> -/**
>   * Spin loop for ODP internal use
>   */
>  static inline void odp_spin(void)
> diff --git a/platform/linux-generic/odp_barrier.c
> b/platform/linux-generic/odp_barrier.c
> index a82b294..10368b5 100644
> --- a/platform/linux-generic/odp_barrier.c
> +++ b/platform/linux-generic/odp_barrier.c
> @@ -8,41 +8,52 @@
>  #include <odp_sync.h>
>  #include <odp_spin_internal.h>
>
> -void odp_barrier_init_count(odp_barrier_t *barrier, int count)
> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
>  {
> -       barrier->count = count;
> -       barrier->bar = 0;
> -       odp_sync_stores();
> +       barrier->num_threads = num_threads; /* Constant after
> initialisation */
> +       odp_atomic32_init(&barrier->in_barrier, 0);
>  }
>
>  /*
>   * Efficient barrier_sync -
>   *
>   *   Barriers are initialized with a count of the number of callers
> - *   that must sync on the barrier before any may proceed.
> + *   that must sync on (enter) the barrier before any may proceed (exit).
>   *
>   *   To avoid race conditions and to permit the barrier to be fully
> - *   reusable, the barrier value cycles between 0..2*count-1. When
> - *   synchronizing the wasless variable simply tracks which half of
> + *   reusable, the barrier value cycles between 0..2*count-1 (temporarily
> + *   hitting 2*count before being wrapped). When
> + *   synchronizing, the waslow variable simply tracks which half of
>   *   the cycle the barrier was in upon entry.  Exit is when the
>   *   barrier crosses to the other half of the cycle.
>   */
>
>  void odp_barrier_sync(odp_barrier_t *barrier)
>  {
> -       int count;
> -       int wasless;
> +       uint32_t count;
> +       bool waslow;
>
> -       odp_sync_stores();
> -       wasless = barrier->bar < barrier->count;
> -       count = odp_atomic_fetch_inc_int(&barrier->bar);
> +       /* We need both release and acquire ordering: the entry increment
> +        * below is a release and the exit spin-wait loads are acquires. */
>
> -       if (count == 2*barrier->count-1) {
> -               barrier->bar = 0;
> -       } else {
> -               while ((barrier->bar < barrier->count) == wasless)
> -                       odp_spin();
> -       }
> +       /* Increase the in_barrier count; this automatically releases
> +        * the other threads once the lower/upper half is switched */
> +       count = odp_atomic32_fetch_add(&barrier->in_barrier, 1,
> +                                      ODP_MEMORDER_RLS);
> +       /* Compute lower or higher range indicator */
> +       waslow = count < barrier->num_threads;
>
> -       odp_mem_barrier();
> +       /* Check if in_barrier count should wrap */
> +       if (count == 2 * barrier->num_threads - 1) {
> +               /* Manually wrap the counter */
> +               odp_atomic32_add(&barrier->in_barrier,
> +                                -2 * barrier->num_threads,
> +                                ODP_MEMORDER_RLX);
> +               /* Fall-through the final part for the acquire barrier */
> +       }
> +       /* Wait for counter to change half */
> +       while ((odp_atomic32_load(&barrier->in_barrier, ODP_MEMORDER_ACQ) <
> +              barrier->num_threads) == waslow) {
> +               odp_spin();
> +       }
>  }
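A usage sketch for the reworked barrier (thread creation is omitted; NUM_WORKERS and worker_fn are illustrative names only):

        #include <odp.h>

        #define NUM_WORKERS 4

        static odp_barrier_t barrier;

        /* Called once before the workers are launched */
        static void setup(void)
        {
                odp_barrier_init(&barrier, NUM_WORKERS);
        }

        /* Each of the NUM_WORKERS threads runs this */
        static void *worker_fn(void *arg)
        {
                /* ... per-thread initialisation ... */
                odp_barrier_sync(&barrier); /* nobody passes until all arrive */
                /* ... parallel work ... */
                return arg;
        }

With four workers the in_barrier value cycles through 0..7: the first sync releases once it reaches 4, and in the second sync the thread whose fetch_add returns 7 wraps it back to 0.
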
> diff --git a/platform/linux-generic/odp_buffer.c
> b/platform/linux-generic/odp_buffer.c
> index e54e0e7..fc3506b 100644
> --- a/platform/linux-generic/odp_buffer.c
> +++ b/platform/linux-generic/odp_buffer.c
> @@ -73,7 +73,8 @@ int odp_buffer_snprint(char *str, size_t n, odp_buffer_t
> buf)
>         len += snprintf(&str[len], n-len,
>                         "  cur_offset   %zu\n",       hdr->cur_offset);
>         len += snprintf(&str[len], n-len,
> -                       "  ref_count    %i\n",        hdr->ref_count);
> +                       "  ref_count    %u\n",
> +                       odp_atomic32_load(&hdr->ref_count,
> ODP_MEMORDER_RLX));
>         len += snprintf(&str[len], n-len,
>                         "  type         %i\n",        hdr->type);
>         len += snprintf(&str[len], n-len,
> diff --git a/platform/linux-generic/odp_crypto.c
> b/platform/linux-generic/odp_crypto.c
> index b37ad6b..75b4ce0 100644
> --- a/platform/linux-generic/odp_crypto.c
> +++ b/platform/linux-generic/odp_crypto.c
> @@ -6,7 +6,7 @@
>
>  #include <odp_crypto.h>
>  #include <odp_internal.h>
> -#include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_spinlock.h>
>  #include <odp_sync.h>
>  #include <odp_debug.h>
> @@ -26,7 +26,7 @@
>  #define MAX_SESSIONS 32
>
>  typedef struct {
> -       odp_atomic_u32_t next;
> +       odp_counter32_t   next;
>         uint32_t         max;
>         odp_crypto_generic_session_t sessions[0];
>  } odp_crypto_global_t;
> @@ -58,7 +58,7 @@ odp_crypto_generic_session_t *alloc_session(void)
>         uint32_t idx;
>         odp_crypto_generic_session_t *session = NULL;
>
> -       idx = odp_atomic_fetch_inc_u32(&global->next);
> +       idx = odp_counter32_read_inc(&global->next);
>         if (idx < global->max) {
>                 session = &global->sessions[idx];
>                 session->index = idx;
> @@ -420,6 +420,7 @@ odp_crypto_init_global(void)
>
>         /* Initialize it */
>         global->max = MAX_SESSIONS;
> +       odp_counter32_init(&global->next, 0);
>
>         return 0;
>  }
> diff --git a/platform/linux-generic/odp_queue.c
> b/platform/linux-generic/odp_queue.c
> index 1318bcd..08c0d29 100644
> --- a/platform/linux-generic/odp_queue.c
> +++ b/platform/linux-generic/odp_queue.c
> @@ -214,8 +214,13 @@ int odp_queue_set_context(odp_queue_t handle, void
> *context)
>  {
>         queue_entry_t *queue;
>         queue = queue_to_qentry(handle);
> +       /* Setting a new queue context can be viewed as a release
> operation,
> +        * all writes to the context must be observable before the context
> +        * is made observable */
>         odp_sync_stores();
> -       queue->s.param.context = context;
> +       queue->s.param.context = context; /* Store-release */
> +       /* Ensure queue modification is globally visible before we return
> +        * and the application might cause the queue to be scheduled */
>         odp_sync_stores();
>         return 0;
>  }
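To make the intended release/acquire pairing concrete: the patch set has no pointer-sized atomic type, so here is the same idea expressed with the GCC 4.7+ __atomic builtins (illustration only, not a suggested change):

        /* Writer side (odp_queue_set_context): publish the context */
        __atomic_store_n(&queue->s.param.context, context, __ATOMIC_RELEASE);

        /* Reader side (e.g. the corresponding get function): */
        void *ctx = __atomic_load_n(&queue->s.param.context, __ATOMIC_ACQUIRE);
        /* All writes made to the context object before the release store
         * are now guaranteed to be visible through 'ctx' */
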
> diff --git a/platform/linux-generic/odp_ring.c
> b/platform/linux-generic/odp_ring.c
> index 632aa66..e5b9c23 100644
> --- a/platform/linux-generic/odp_ring.c
> +++ b/platform/linux-generic/odp_ring.c
> @@ -187,10 +187,10 @@ odph_ring_create(const char *name, unsigned count,
> unsigned flags)
>                 r->cons.size = count;
>                 r->prod.mask = count-1;
>                 r->cons.mask = count-1;
> -               r->prod.head = 0;
> -               r->cons.head = 0;
> -               r->prod.tail = 0;
> -               r->cons.tail = 0;
> +               odp_atomic32_init(&r->prod.head, 0);
> +               odp_atomic32_init(&r->cons.head, 0);
> +               odp_atomic32_init(&r->prod.tail, 0);
> +               odp_atomic32_init(&r->cons.tail, 0);
>
>                 TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
>         } else {
> @@ -227,7 +227,7 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>         uint32_t prod_head, prod_next;
>         uint32_t cons_tail, free_entries;
>         const unsigned max = n;
> -       int success;
> +       bool success;
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>         int ret;
> @@ -237,8 +237,8 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>                 /* Reset n to the initial burst count */
>                 n = max;
>
> -               prod_head = r->prod.head;
> -               cons_tail = r->cons.tail;
> +               prod_head = odp_atomic32_load(&r->prod.head,
> ODP_MEMORDER_RLX);
> +               cons_tail = odp_atomic32_load(&r->cons.tail,
> ODP_MEMORDER_ACQ);
>                 /* The subtraction is done between two unsigned 32bits
> value
>                  * (the result is always modulo 32 bits even if we have
>                  * prod_head > cons_tail). So 'free_entries' is always
> between 0
> @@ -259,13 +259,14 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>                 }
>
>                 prod_next = prod_head + n;
> -               success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
> -                                             prod_next);
> -       } while (odp_unlikely(success == 0));
> +               success = odp_atomic32_cmp_xchg_weak(&r->prod.head,
> +                                                    &prod_head,
> +                                                    prod_next,
> +                                                    ODP_MEMORDER_RLX);
> +       } while (odp_unlikely(!success));
>
>         /* write entries in ring */
>         ENQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /* if we exceed the watermark */
>         if (odp_unlikely(((mask + 1) - free_entries + n) >
> r->prod.watermark)) {
> @@ -279,10 +280,11 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>          * If there are other enqueues in progress that preceeded us,
>          * we need to wait for them to complete
>          */
> -       while (odp_unlikely(r->prod.tail != prod_head))
> +       while (odp_unlikely(odp_atomic32_load(&r->prod.tail,
> +                                             ODP_MEMORDER_RLX) !=
> prod_head))
>                 odp_spin();
>
> -       r->prod.tail = prod_next;
> +       odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
>         return ret;
>  }
>
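The acquire on cons.tail paired with the release on prod.tail above is the classic producer/consumer protocol. A stripped-down single-producer/single-consumer sketch using the new API (RING_SIZE, slot[] and the function names are illustrative; both atomics are assumed initialised to 0 with odp_atomic32_init()):

        #define RING_SIZE 256 /* power of two */

        static void *slot[RING_SIZE];
        static odp_atomic32_t prod_tail; /* written only by the producer */
        static odp_atomic32_t cons_tail; /* written only by the consumer */

        static int sp_enqueue(void *obj)
        {
                uint32_t pt = odp_atomic32_load(&prod_tail, ODP_MEMORDER_RLX);
                uint32_t ct = odp_atomic32_load(&cons_tail, ODP_MEMORDER_ACQ);

                if (pt - ct == RING_SIZE)
                        return -1; /* full */
                slot[pt & (RING_SIZE - 1)] = obj;
                /* Release: the slot write must be visible before the new tail */
                odp_atomic32_store(&prod_tail, pt + 1, ODP_MEMORDER_RLS);
                return 0;
        }

        static int sc_dequeue(void **obj)
        {
                uint32_t ct = odp_atomic32_load(&cons_tail, ODP_MEMORDER_RLX);
                uint32_t pt = odp_atomic32_load(&prod_tail, ODP_MEMORDER_ACQ);

                if (ct == pt)
                        return -1; /* empty */
                *obj = slot[ct & (RING_SIZE - 1)];
                /* Release: the slot read must complete before the slot is reused */
                odp_atomic32_store(&cons_tail, ct + 1, ODP_MEMORDER_RLS);
                return 0;
        }
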
> @@ -298,8 +300,8 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>         uint32_t mask = r->prod.mask;
>         int ret;
>
> -       prod_head = r->prod.head;
> -       cons_tail = r->cons.tail;
> +       prod_head = odp_atomic32_load(&r->prod.head, ODP_MEMORDER_RLX);
> +       cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_ACQ);
>         /* The subtraction is done between two unsigned 32bits value
>          * (the result is always modulo 32 bits even if we have
>          * prod_head > cons_tail). So 'free_entries' is always between 0
> @@ -320,11 +322,10 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>         }
>
>         prod_next = prod_head + n;
> -       r->prod.head = prod_next;
> +       odp_atomic32_store(&r->prod.head, prod_next, ODP_MEMORDER_RLX);
>
>         /* write entries in ring */
>         ENQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /* if we exceed the watermark */
>         if (odp_unlikely(((mask + 1) - free_entries + n) >
> r->prod.watermark)) {
> @@ -334,7 +335,7 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
> const *obj_table,
>                 ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
>         }
>
> -       r->prod.tail = prod_next;
> +       odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
>         return ret;
>  }
>
> @@ -348,7 +349,7 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>         uint32_t cons_head, prod_tail;
>         uint32_t cons_next, entries;
>         const unsigned max = n;
> -       int success;
> +       bool success;
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>
> @@ -357,8 +358,8 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>                 /* Restore n as it may change every loop */
>                 n = max;
>
> -               cons_head = r->cons.head;
> -               prod_tail = r->prod.tail;
> +               cons_head = odp_atomic32_load(&r->cons.head,
> ODP_MEMORDER_RLX);
> +               prod_tail = odp_atomic32_load(&r->prod.tail,
> ODP_MEMORDER_ACQ);
>                 /* The subtraction is done between two unsigned 32bits
> value
>                  * (the result is always modulo 32 bits even if we have
>                  * cons_head > prod_tail). So 'entries' is always between 0
> @@ -378,22 +379,24 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>                 }
>
>                 cons_next = cons_head + n;
> -               success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
> -                                             cons_next);
> -       } while (odp_unlikely(success == 0));
> +               success = odp_atomic32_cmp_xchg_weak(&r->cons.head,
> +                                                    &cons_head,
> +                                                    cons_next,
> +                                                    ODP_MEMORDER_RLX);
> +       } while (odp_unlikely(!success));
>
>         /* copy in table */
>         DEQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /*
>          * If there are other dequeues in progress that preceded us,
>          * we need to wait for them to complete
>          */
> -       while (odp_unlikely(r->cons.tail != cons_head))
> +       while (odp_unlikely(odp_atomic32_load(&r->cons.tail,
> +                                             ODP_MEMORDER_RLX) !=
> cons_head))
>                 odp_spin();
>
> -       r->cons.tail = cons_next;
> +       odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
>
>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>  }
> @@ -409,8 +412,8 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>
> -       cons_head = r->cons.head;
> -       prod_tail = r->prod.tail;
> +       cons_head = odp_atomic32_load(&r->cons.head, ODP_MEMORDER_RLX);
> +       prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_ACQ);
>         /* The subtraction is done between two unsigned 32bits value
>          * (the result is always modulo 32 bits even if we have
>          * cons_head > prod_tail). So 'entries' is always between 0
> @@ -429,13 +432,12 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void
> **obj_table,
>         }
>
>         cons_next = cons_head + n;
> -       r->cons.head = cons_next;
> +       odp_atomic32_store(&r->cons.head, cons_next, ODP_MEMORDER_RLX);
>
>         /* copy in table */
>         DEQUEUE_PTRS();
> -       odp_mem_barrier();
>
> -       r->cons.tail = cons_next;
> +       odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>  }
>
> @@ -482,8 +484,8 @@ int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void
> **obj_table, unsigned n)
>   */
>  int odph_ring_full(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
> ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
> ODP_MEMORDER_RLX);
>         return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
>  }
>
> @@ -492,8 +494,8 @@ int odph_ring_full(const odph_ring_t *r)
>   */
>  int odph_ring_empty(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
> ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
> ODP_MEMORDER_RLX);
>         return !!(cons_tail == prod_tail);
>  }
>
> @@ -502,8 +504,8 @@ int odph_ring_empty(const odph_ring_t *r)
>   */
>  unsigned odph_ring_count(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
> ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
> ODP_MEMORDER_RLX);
>         return (prod_tail - cons_tail) & r->prod.mask;
>  }
>
> @@ -512,8 +514,8 @@ unsigned odph_ring_count(const odph_ring_t *r)
>   */
>  unsigned odph_ring_free_count(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
> ODP_MEMORDER_RLX);
> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
> ODP_MEMORDER_RLX);
>         return (cons_tail - prod_tail - 1) & r->prod.mask;
>  }
>
> @@ -523,10 +525,14 @@ void odph_ring_dump(const odph_ring_t *r)
>         ODP_DBG("ring <%s>@%p\n", r->name, r);
>         ODP_DBG("  flags=%x\n", r->flags);
>         ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
> -       ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
> -       ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
> -       ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
> -       ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
> +       ODP_DBG("  ct=%"PRIu32"\n", odp_atomic32_load(&r->cons.tail,
> +                                                     ODP_MEMORDER_RLX));
> +       ODP_DBG("  ch=%"PRIu32"\n", odp_atomic32_load(&r->cons.head,
> +                                                     ODP_MEMORDER_RLX));
> +       ODP_DBG("  pt=%"PRIu32"\n", odp_atomic32_load(&r->prod.tail,
> +                                                     ODP_MEMORDER_RLX));
> +       ODP_DBG("  ph=%"PRIu32"\n", odp_atomic32_load(&r->prod.head,
> +                                                     ODP_MEMORDER_RLX));
>         ODP_DBG("  used=%u\n", odph_ring_count(r));
>         ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
>         if (r->prod.watermark == r->prod.size)
> diff --git a/platform/linux-generic/odp_rwlock.c
> b/platform/linux-generic/odp_rwlock.c
> index 11c8dd7..a5fae4d 100644
> --- a/platform/linux-generic/odp_rwlock.c
> +++ b/platform/linux-generic/odp_rwlock.c
> @@ -4,58 +4,64 @@
>   * SPDX-License-Identifier:     BSD-3-Clause
>   */
>
> +#include <stdbool.h>
>  #include <odp_atomic.h>
>  #include <odp_rwlock.h>
> -
>  #include <odp_spin_internal.h>
>
>  void odp_rwlock_init(odp_rwlock_t *rwlock)
>  {
> -       rwlock->cnt = 0;
> +       odp_atomic32_init(&rwlock->cnt, 0);
>  }
>
>  void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
>  {
> -       int32_t cnt;
> -       int  is_locked = 0;
> -
> -       while (is_locked == 0) {
> -               cnt = rwlock->cnt;
> -               /* waiting for read lock */
> -               if (cnt < 0) {
> +       bool gotit;
> +       uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
> +       do {
> +               /* Wait for any writer to release lock */
> +               while ((int32_t)cnt < 0) {
>                         odp_spin();
> -                       continue;
> +                       cnt = odp_atomic32_load(&rwlock->cnt,
> +                                               ODP_MEMORDER_RLX);
>                 }
> -               is_locked = odp_atomic_cmpset_u32(
> -                                       (volatile uint32_t *)&rwlock->cnt,
> -                                             cnt, cnt + 1);
> -       }
> +               /* Attempt to take another read lock */
> +               gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
> +                                                  &cnt, cnt + 1,
> +                                                  ODP_MEMORDER_RLX);
> +               /* If operation fails, 'cnt' will contain current value */
> +       } while (!gotit);
>  }
>
>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
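A side note on the cmp_xchg_weak() loop used here: since the weak variant may fail spuriously and reloads the expected value on failure (as the comment says), the same idiom generalises to any read-modify-write. A small hypothetical example, an atomic maximum, just to illustrate the pattern:

        /* Hypothetical helper built on the new API, not part of this patch */
        static void atomic32_max(odp_atomic32_t *ptr, uint32_t val)
        {
                uint32_t cur = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);

                do {
                        if (cur >= val)
                                return; /* already at least 'val' */
                        /* On failure 'cur' is reloaded with the current value */
                } while (!odp_atomic32_cmp_xchg_weak(ptr, &cur, val,
                                                     ODP_MEMORDER_RLX));
        }
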
>  {
> -       odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
> +       /* Release one read lock by subtracting 1 */
> +       odp_atomic32_dec(&rwlock->cnt, ODP_MEMORDER_RLS);
>  }
>
>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
>  {
> -       int32_t cnt;
> -       int is_locked = 0;
> -
> -       while (is_locked == 0) {
> -               cnt = rwlock->cnt;
> -               /* lock aquired, wait */
> -               if (cnt != 0) {
> +       bool gotit;
> +       uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
> +       do {
> +               /* Wait for all lock holders to release lock */
> +               while (cnt != 0) {
> +                       /* Lock is busy */
>                         odp_spin();
> -                       continue;
> +                       cnt = odp_atomic32_load(&rwlock->cnt,
> +                                               ODP_MEMORDER_RLX);
>                 }
> -               is_locked = odp_atomic_cmpset_u32(
> -                                       (volatile uint32_t *)&rwlock->cnt,
> -                                             0, -1);
> -       }
> +               /* Attempt to take write lock */
> +               gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
> +                                                  &cnt,
> +                                                  (uint32_t)-1,
> +                                                  ODP_MEMORDER_RLX);
> +               /* If operation fails, 'cnt' will contain current value */
> +       } while (!gotit);
>  }
>
>  void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
>  {
> -       odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
> +       /* Release the write lock by adding 1 */
> +       odp_atomic32_inc(&rwlock->cnt, ODP_MEMORDER_RLS);
>  }
> diff --git a/platform/linux-generic/odp_thread.c
> b/platform/linux-generic/odp_thread.c
> index b869b27..652d317 100644
> --- a/platform/linux-generic/odp_thread.c
> +++ b/platform/linux-generic/odp_thread.c
> @@ -11,7 +11,7 @@
>
>  #include <odp_thread.h>
>  #include <odp_internal.h>
> -#include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_config.h>
>  #include <odp_debug.h>
>  #include <odp_shared_memory.h>
> @@ -31,7 +31,7 @@ typedef struct {
>
>  typedef struct {
>         thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
> -       odp_atomic_int_t num;
> +       odp_counter32_t   num;
>
>  } thread_globals_t;
>
> @@ -58,6 +58,7 @@ int odp_thread_init_global(void)
>                 return -1;
>
>         memset(thread_globals, 0, sizeof(thread_globals_t));
> +       odp_counter32_init(&thread_globals->num, 0);
>         return 0;
>  }
>
> @@ -67,7 +68,7 @@ static int thread_id(void)
>         int id;
>         int cpu;
>
> -       id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
> +       id = (int)odp_counter32_read_inc(&thread_globals->num);
>
>         if (id >= ODP_CONFIG_MAX_THREADS) {
>                 ODP_ERR("Too many threads\n");
> @@ -77,7 +78,7 @@ static int thread_id(void)
>         cpu = sched_getcpu();
>
>         if (cpu < 0) {
> -               ODP_ERR("getcpu failed\n");
> +               ODP_ERR("sched_getcpu failed\n");
>                 return -1;
>         }
>
> diff --git a/platform/linux-generic/odp_ticketlock.c
> b/platform/linux-generic/odp_ticketlock.c
> index be5b885..510aa9f 100644
> --- a/platform/linux-generic/odp_ticketlock.c
> +++ b/platform/linux-generic/odp_ticketlock.c
> @@ -6,15 +6,15 @@
>
>  #include <odp_ticketlock.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_sync.h>
>  #include <odp_spin_internal.h>
>
>
>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>  {
> -       ticketlock->next_ticket = 0;
> -       ticketlock->cur_ticket  = 0;
> -       odp_sync_stores();
> +       odp_counter32_init(&ticketlock->next_ticket, 0);
> +       odp_atomic32_init(&ticketlock->cur_ticket, 0);
>  }
>
>
> @@ -22,30 +22,15 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
>  {
>         uint32_t ticket;
>
> -       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
> +       ticket = odp_counter32_read_inc(&ticketlock->next_ticket);
>
> -       while (ticket != ticketlock->cur_ticket)
> +       while (ticket != odp_atomic32_load(&ticketlock->cur_ticket,
> +                                          ODP_MEMORDER_ACQ))
>                 odp_spin();
> -
> -       odp_mem_barrier();
>  }
>
>
>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>  {
> -       odp_sync_stores();
> -
> -       ticketlock->cur_ticket++;
> -
> -#if defined __OCTEON__
> -       odp_sync_stores();
> -#else
> -       odp_mem_barrier();
> -#endif
> -}
> -
> -
> -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
> -{
> -       return ticketlock->cur_ticket != ticketlock->next_ticket;
> +       odp_atomic32_inc(&ticketlock->cur_ticket, ODP_MEMORDER_RLS);
>  }
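One observation on this hunk: odp_ticketlock_is_locked() is removed without a replacement, so any existing caller will fail to link unless the function moved elsewhere in the patch. If dropping it is not intentional, a sketch of how it could be kept on top of the new types (relaxed ordering should be enough for a snapshot-style query):

        int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
        {
                return odp_counter32_read(&ticketlock->next_ticket) !=
                       odp_atomic32_load(&ticketlock->cur_ticket,
                                         ODP_MEMORDER_RLX);
        }
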
> diff --git a/platform/linux-generic/odp_timer.c
> b/platform/linux-generic/odp_timer.c
> index 313c713..fffaa44 100644
> --- a/platform/linux-generic/odp_timer.c
> +++ b/platform/linux-generic/odp_timer.c
> @@ -10,6 +10,7 @@
>  #include <odp_buffer_pool_internal.h>
>  #include <odp_internal.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>  #include <odp_spinlock.h>
>  #include <odp_sync.h>
>  #include <odp_debug.h>
> @@ -32,8 +33,8 @@ typedef struct {
>
>  typedef struct {
>         int               allocated;
> -       volatile int      active;
> -       volatile uint64_t cur_tick;
> +       odp_atomic32_t    active;
> +       odp_counter64_t   cur_tick;
>         timer_t           timerid;
>         odp_timer_t       timer_hdl;
>         odp_buffer_pool_t pool;
> @@ -150,16 +151,16 @@ static void notify_function(union sigval sigval)
>
>         timer = sigval.sival_ptr;
>
> -       if (timer->active == 0) {
> +       if (odp_atomic32_load(&timer->active, ODP_MEMORDER_RLX) == 0) {
>                 ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
>                 return;
>         }
>
>         /* ODP_DBG("Tick\n"); */
>
> -       cur_tick = timer->cur_tick++;
> -
> -       odp_sync_stores();
> +       /* Increment and read are not atomic but we are the only writer */
> +       odp_counter64_inc(&timer->cur_tick);
> +       cur_tick = odp_counter64_read(&timer->cur_tick);
>
>         tick = &timer->tick[cur_tick % MAX_TICKS];
>
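Since odp_counter64_read_inc() already returns the pre-increment value, the two calls above could be folded into one (sketch only; equivalent under the single-writer assumption stated in the comment):

        /* Increment the tick and derive the new value from the returned old one */
        cur_tick = odp_counter64_read_inc(&timer->cur_tick) + 1;

        tick = &timer->tick[cur_tick % MAX_TICKS];

Note in passing that both the patched code and this sketch use the post-increment value, whereas the old code (cur_tick = timer->cur_tick++) used the pre-increment one; presumably that change is intentional.
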
> @@ -308,6 +309,8 @@ odp_timer_t odp_timer_create(const char *name,
> odp_buffer_pool_t pool,
>
>         timer_hdl = id + 1;
>
> +       odp_atomic32_init(&timer->active, 0);
> +       odp_counter64_init(&timer->cur_tick, 0);
>         timer->timer_hdl     = timer_hdl;
>         timer->pool          = pool;
>         timer->resolution_ns = resolution_ns;
> @@ -318,8 +321,7 @@ odp_timer_t odp_timer_create(const char *name,
> odp_buffer_pool_t pool,
>                 timer->tick[i].list = NULL;
>         }
>
> -       timer->active = 1;
> -       odp_sync_stores();
> +       odp_atomic32_store(&timer->active, 1, ODP_MEMORDER_RLS);
>
>         timer_start(timer);
>
> @@ -340,7 +342,7 @@ odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t
> timer_hdl, uint64_t tmo_tick,
>         id = (int)timer_hdl - 1;
>         timer = &odp_timer.timer[id];
>
> -       cur_tick = timer->cur_tick;
> +       cur_tick = odp_counter64_read(&timer->cur_tick);
>         if (tmo_tick <= cur_tick) {
>                 ODP_DBG("timeout too close\n");
>                 return ODP_TIMER_TMO_INVALID;
> @@ -416,7 +418,7 @@ uint64_t odp_timer_current_tick(odp_timer_t timer_hdl)
>         uint32_t id;
>
>         id = timer_hdl - 1;
> -       return odp_timer.timer[id].cur_tick;
> +       return odp_counter64_read(&odp_timer.timer[id].cur_tick);
>  }
>
>  odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
> diff --git a/test/api_test/Makefile.am b/test/api_test/Makefile.am
> index 5104454..478aa6c 100644
> --- a/test/api_test/Makefile.am
> +++ b/test/api_test/Makefile.am
> @@ -1,12 +1,12 @@
>  include $(top_srcdir)/test/Makefile.inc
>
> -bin_PROGRAMS = odp_atomic odp_shm odp_ring odp_timer_ping
> -odp_atomic_LDFLAGS = $(AM_LDFLAGS) -static
> +bin_PROGRAMS = odp_counter odp_shm odp_ring odp_timer_ping
> +odp_counter_LDFLAGS = $(AM_LDFLAGS) -static
>  odp_shm_LDFLAGS = $(AM_LDFLAGS) -static
>  odp_ring_LDFLAGS = $(AM_LDFLAGS) -static
>  odp_timer_ping_LDFLAGS = $(AM_LDFLAGS) -static
>
> -dist_odp_atomic_SOURCES = odp_atomic_test.c odp_common.c
> +dist_odp_counter_SOURCES = odp_counter_test.c odp_common.c
>  dist_odp_shm_SOURCES = odp_shm_test.c odp_common.c
>  dist_odp_ring_SOURCES = odp_ring_test.c odp_common.c
>  dist_odp_timer_ping_SOURCES = odp_timer_ping.c odp_common.c
> diff --git a/test/api_test/odp_atomic_test.c
> b/test/api_test/odp_atomic_test.c
> deleted file mode 100644
> index 9019d4f..0000000
> --- a/test/api_test/odp_atomic_test.c
> +++ /dev/null
> @@ -1,362 +0,0 @@
> -/* Copyright (c) 2013, Linaro Limited
> - * All rights reserved.
> - *
> - * SPDX-License-Identifier:     BSD-3-Clause
> - */
> -
> -#include <string.h>
> -#include <sys/time.h>
> -#include <odp_debug.h>
> -#include <odp_common.h>
> -#include <odp_atomic_test.h>
> -
> -static odp_atomic_int_t a32;
> -static odp_atomic_u32_t a32u;
> -static odp_atomic_u64_t a64u;
> -
> -static odp_atomic_int_t numthrds;
> -
> -static const char * const test_name[] = {
> -       "dummy",
> -       "test atomic basic ops add/sub/inc/dec",
> -       "test atomic inc/dec of signed word",
> -       "test atomic add/sub of signed word",
> -       "test atomic inc/dec of unsigned word",
> -       "test atomic add/sub of unsigned word",
> -       "test atomic inc/dec of unsigned double word",
> -       "test atomic add/sub of unsigned double word"
> -};
> -
> -static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
> -
> -static void usage(void)
> -{
> -       printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
> -              "\t<testcase> is\n"
> -              "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
> -              "\t\t2 - Test inc dec of signed word\n"
> -              "\t\t3 - Test add sub of signed word\n"
> -              "\t\t4 - Test inc dec of unsigned word\n"
> -              "\t\t5 - Test add sub of unsigned word\n"
> -              "\t\t6 - Test inc dec of double word\n"
> -              "\t\t7 - Test add sub of double word\n"
> -              "\t<num of pthread> is optional\n"
> -              "\t\t<1 - 31> - no of pthreads to start\n"
> -              "\t\tif user doesn't specify this option, then\n"
> -              "\t\tno of pthreads created is equivalent to no of cores\n"
> -              "\t\tavailable in the system\n"
> -              "\tExample usage:\n"
> -              "\t\t./odp_atomic -t 2\n"
> -              "\t\t./odp_atomic -t 3 -n 12\n");
> -}
> -
> -void test_atomic_inc_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_int(&a32);
> -}
> -
> -void test_atomic_inc_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_u32(&a32u);
> -}
> -
> -void test_atomic_inc_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_u64(&a64u);
> -}
> -
> -void test_atomic_dec_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_int(&a32);
> -}
> -
> -void test_atomic_dec_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_u32(&a32u);
> -}
> -
> -void test_atomic_dec_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_u64(&a64u);
> -}
> -
> -void test_atomic_add_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_add_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_add_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_sub_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_sub_u32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_sub_64(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_inc_dec_32(void)
> -{
> -       test_atomic_inc_32();
> -       test_atomic_dec_32();
> -}
> -
> -void test_atomic_add_sub_32(void)
> -{
> -       test_atomic_add_32();
> -       test_atomic_sub_32();
> -}
> -
> -void test_atomic_inc_dec_u32(void)
> -{
> -       test_atomic_inc_u32();
> -       test_atomic_dec_u32();
> -}
> -
> -void test_atomic_add_sub_u32(void)
> -{
> -       test_atomic_add_u32();
> -       test_atomic_sub_u32();
> -}
> -
> -void test_atomic_inc_dec_64(void)
> -{
> -       test_atomic_inc_64();
> -       test_atomic_dec_64();
> -}
> -
> -void test_atomic_add_sub_64(void)
> -{
> -       test_atomic_add_64();
> -       test_atomic_sub_64();
> -}
> -
> -/**
> - * Test basic atomic operation like
> - * add/sub/increment/decrement operation.
> - */
> -void test_atomic_basic(void)
> -{
> -       test_atomic_inc_32();
> -       test_atomic_dec_32();
> -       test_atomic_add_32();
> -       test_atomic_sub_32();
> -
> -       test_atomic_inc_u32();
> -       test_atomic_dec_u32();
> -       test_atomic_add_u32();
> -       test_atomic_sub_u32();
> -
> -       test_atomic_inc_64();
> -       test_atomic_dec_64();
> -       test_atomic_add_64();
> -       test_atomic_sub_64();
> -}
> -
> -void test_atomic_init(void)
> -{
> -       odp_atomic_init_int(&a32);
> -       odp_atomic_init_u32(&a32u);
> -       odp_atomic_init_u64(&a64u);
> -}
> -
> -void test_atomic_store(void)
> -{
> -       odp_atomic_store_int(&a32, S32_INIT_VAL);
> -       odp_atomic_store_u32(&a32u, U32_INIT_VAL);
> -       odp_atomic_store_u64(&a64u, U64_INIT_VAL);
> -}
> -
> -int test_atomic_validate(void)
> -{
> -       if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
> -               ODP_ERR("Atomic signed 32 usual functions failed\n");
> -               return -1;
> -       }
> -
> -       if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
> -               ODP_ERR("Atomic u32 usual functions failed\n");
> -               return -1;
> -       }
> -
> -       if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
> -               ODP_ERR("Atomic u64 usual functions failed\n");
> -               return -1;
> -       }
> -
> -       return 0;
> -}
> -
> -static void *run_thread(void *arg)
> -{
> -       pthrd_arg *parg = (pthrd_arg *)arg;
> -       int thr;
> -
> -       thr = odp_thread_id();
> -
> -       ODP_DBG("Thread %i starts\n", thr);
> -
> -       odp_atomic_inc_int(&numthrds);
> -
> -       /* Wait here until all pthreads are created */
> -       while (*(volatile int *)&numthrds < parg->numthrds)
> -               ;
> -
> -       gettimeofday(&tv0[thr], NULL);
> -
> -       switch (parg->testcase) {
> -       case TEST_MIX:
> -               test_atomic_basic();
> -               break;
> -       case TEST_INC_DEC_S32:
> -               test_atomic_inc_dec_32();
> -               break;
> -       case TEST_ADD_SUB_S32:
> -               test_atomic_add_sub_32();
> -               break;
> -       case TEST_INC_DEC_U32:
> -               test_atomic_inc_dec_u32();
> -               break;
> -       case TEST_ADD_SUB_U32:
> -               test_atomic_add_sub_u32();
> -               break;
> -       case TEST_INC_DEC_64:
> -               test_atomic_inc_dec_64();
> -               break;
> -       case TEST_ADD_SUB_64:
> -               test_atomic_add_sub_64();
> -               break;
> -       }
> -       gettimeofday(&tv1[thr], NULL);
> -       fflush(NULL);
> -
> -       printf("Time taken in thread %02d to complete op is %lld usec\n",
> thr,
> -              (tv1[thr].tv_sec - tv0[thr].tv_sec) * 1000000ULL +
> -              (tv1[thr].tv_usec - tv0[thr].tv_usec));
> -
> -       return parg;
> -}
> -
> -int main(int argc, char *argv[])
> -{
> -       pthrd_arg thrdarg;
> -       int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
> -       char c;
> -       int result;
> -
> -       if (argc == 1 || argc % 2 == 0) {
> -               usage();
> -               goto err_exit;
> -       }
> -       if (odp_test_global_init() != 0)
> -               goto err_exit;
> -       odp_print_system_info();
> -
> -       while (cnt != 0) {
> -               sscanf(argv[++i], "-%c", &c);
> -               switch (c) {
> -               case 't':
> -                       sscanf(argv[++i], "%d", &test_type);
> -                       break;
> -               case 'n':
> -                       sscanf(argv[++i], "%d", &pthrdnum);
> -                       break;
> -               default:
> -                       ODP_ERR("Invalid option %c\n", c);
> -                       usage();
> -                       goto err_exit;
> -               }
> -               if (test_type < TEST_MIX || test_type > TEST_MAX ||
> -                   pthrdnum > odp_sys_core_count()) {
> -                       usage();
> -                       goto err_exit;
> -               }
> -               cnt -= 2;
> -       }
> -       if (pthrdnum == 0)
> -               pthrdnum = odp_sys_core_count();
> -
> -       odp_atomic_init_int(&numthrds);
> -       test_atomic_init();
> -       test_atomic_store();
> -
> -       memset(&thrdarg, 0, sizeof(pthrd_arg));
> -       thrdarg.testcase = test_type;
> -       thrdarg.numthrds = pthrdnum;
> -
> -       if ((test_type > 0) && (test_type < TEST_MAX)) {
> -               printf("%s\n", test_name[test_type]);
> -       } else {
> -               ODP_ERR("Invalid test case [%d]\n", test_type);
> -               usage();
> -               goto err_exit;
> -       }
> -       odp_test_thread_create(run_thread, &thrdarg);
> -
> -       odp_test_thread_exit(&thrdarg);
> -
> -       result = test_atomic_validate();
> -
> -       if (result == 0) {
> -               printf("%s_%d_%d Result:pass\n",
> -                      test_name[test_type], test_type, pthrdnum);
> -       } else {
> -               printf("%s_%d_%d Result:fail\n",
> -                      test_name[test_type], test_type, pthrdnum);
> -       }
> -       return 0;
> -
> -err_exit:
> -       return -1;
> -}
> diff --git a/test/api_test/odp_atomic_test.h
> b/test/api_test/odp_atomic_test.h
> deleted file mode 100644
> index 7814da5..0000000
> --- a/test/api_test/odp_atomic_test.h
> +++ /dev/null
> @@ -1,60 +0,0 @@
> -/* Copyright (c) 2013, Linaro Limited
> - * All rights reserved.
> - *
> - * SPDX-License-Identifier:     BSD-3-Clause
> - */
> -
> -#ifndef ODP_ATOMIC_TEST_H_
> -#define ODP_ATOMIC_TEST_H_
> -
> -#include <odp.h>
> -#include <odph_linux.h>
> -
> -/**
> - * add_sub_cnt could be any valid value
> - * so to excercise explicit atomic_add/sub
> - * ops. For now using 5..
> - */
> -#define ADD_SUB_CNT    5
> -
> -#define        CNT 500000
> -#define        S32_INIT_VAL    (1UL << 10)
> -#define        U32_INIT_VAL    (1UL << 10)
> -#define        U64_INIT_VAL    (1ULL << 33)
> -
> -typedef enum {
> -       TEST_MIX = 1, /* Must be first test case num */
> -       TEST_INC_DEC_S32,
> -       TEST_ADD_SUB_S32,
> -       TEST_INC_DEC_U32,
> -       TEST_ADD_SUB_U32,
> -       TEST_INC_DEC_64,
> -       TEST_ADD_SUB_64,
> -       TEST_MAX,
> -} odp_test_atomic_t;
> -
> -
> -void test_atomic_inc_dec_32(void);
> -void test_atomic_add_sub_32(void);
> -void test_atomic_inc_dec_u32(void);
> -void test_atomic_add_sub_u32(void);
> -void test_atomic_inc_dec_64(void);
> -void test_atomic_add_sub_64(void);
> -void test_atomic_inc_32(void);
> -void test_atomic_dec_32(void);
> -void test_atomic_add_32(void);
> -void test_atomic_sub_32(void);
> -void test_atomic_inc_u32(void);
> -void test_atomic_dec_u32(void);
> -void test_atomic_add_u32(void);
> -void test_atomic_sub_u32(void);
> -void test_atomic_inc_64(void);
> -void test_atomic_dec_64(void);
> -void test_atomic_add_64(void);
> -void test_atomic_sub_64(void);
> -void test_atomic_init(void);
> -void test_atomic_basic(void);
> -void test_atomic_store(void);
> -int test_atomic_validate(void);
> -
> -#endif /* ODP_ATOMIC_TEST_H_ */
> diff --git a/test/api_test/odp_common.c b/test/api_test/odp_common.c
> index ed1fc97..198fe8f 100644
> --- a/test/api_test/odp_common.c
> +++ b/test/api_test/odp_common.c
> @@ -14,7 +14,6 @@
>  #include <odp.h>
>  #include <odph_linux.h>
>  #include <odp_common.h>
> -#include <odp_atomic_test.h>
>  #include <odp_shm_test.h>
>
>
> diff --git a/test/api_test/odp_counter_test.c
> b/test/api_test/odp_counter_test.c
> new file mode 100644
> index 0000000..c72328e
> --- /dev/null
> +++ b/test/api_test/odp_counter_test.c
> @@ -0,0 +1,361 @@
> +/* Copyright (c) 2013, Linaro Limited
> + * All rights reserved.
> + *
> + * SPDX-License-Identifier:     BSD-3-Clause
> + */
> +
> +#include <string.h>
> +#include <sys/time.h>
> +#include <odp.h>
> +#include <odp_debug.h>
> +#include <odp_common.h>
> +#include <odph_linux.h>
> +
> +/**
> + * ADD_SUB_CNT can be any valid value; it is
> + * used to exercise the explicit counter add/sub
> + * operations. For now it is 5.
> + */
> +#define ADD_SUB_CNT    5
> +
> +#define        CNT 500000
> +#define        U32_INIT_VAL    (1UL << 10)
> +#define        U64_INIT_VAL    (1ULL << 33)
> +
> +typedef enum {
> +       TEST_MIX = 1, /* Must be first test case num */
> +       TEST_INC_DEC_U32 = 2,
> +       TEST_ADD_SUB_U32 = 3,
> +       TEST_INC_DEC_64 = 4,
> +       TEST_ADD_SUB_64 = 5,
> +       TEST_MAX,
> +} odp_test_counter_t;
> +
> +
> +static uint32_t test_counter_inc_dec_u32(void);
> +static uint32_t test_counter_add_sub_u32(void);
> +static uint32_t test_counter_inc_dec_64(void);
> +static uint32_t test_counter_add_sub_64(void);
> +static uint32_t test_counter_inc_u32(void);
> +static uint32_t test_counter_dec_u32(void);
> +static uint32_t test_counter_add_u32(void);
> +static uint32_t test_counter_sub_u32(void);
> +static uint32_t test_counter_inc_64(void);
> +static uint32_t test_counter_dec_64(void);
> +static uint32_t test_counter_add_64(void);
> +static uint32_t test_counter_sub_64(void);
> +static void test_counter_init(void);
> +static uint32_t test_counter_basic(void);
> +static void test_counter_write(void);
> +static int test_counter_validate(void);
> +
> +static odp_counter32_t a32u;
> +static odp_counter64_t a64u;
> +
> +static odp_barrier_t barrier;
> +
> +static const char * const test_name[] = {
> +       "dummy",
> +       "test atomic counter basic ops add/sub/inc/dec",
> +       "test atomic inc/dec of 32-bit counter",
> +       "test atomic add/sub of 32-bit counter",
> +       "test atomic inc/dec of 64-bit counter",
> +       "test atomic add/sub of 64-bit counter"
> +};
> +
> +static uint64_t accops[MAX_WORKERS];
> +
> +static void usage(void)
> +{
> +       printf("\n./odp_counter -t <testcase> -n <num of threads>\n\n"
> +              "\t<testcase> is\n"
> +              "\t\t1 - Test mix (inc/dec/add/sub on 32- and 64-bit
> counters)\n"
> +              "\t\t2 - Test inc/dec of 32-bit counter\n"
> +              "\t\t3 - Test add/sub of 32-bit counter\n"
> +              "\t\t4 - Test inc/dec of 64-bit counter\n"
> +              "\t\t5 - Test add/sub of 64-bit counter\n"
> +              "\t<num of threads> is optional\n"
> +              "\t\t<1 - 31> - no of threads to start\n"
> +              "\t\tif user doesn't specify this option, then\n"
> +              "\t\tno of threads created is equivalent to no of cores\n"
> +              "\t\tavailable in the system\n"
> +              "\tExample usage:\n"
> +              "\t\t./odp_counter -t 2\n"
> +              "\t\t./odp_counter -t 3 -n 12\n");
> +}
> +
> +static uint32_t test_counter_inc_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter32_inc(&a32u);
> +       return i;
> +}
> +
> +static uint32_t test_counter_inc_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter64_inc(&a64u);
> +       return i;
> +}
> +
> +static uint32_t test_counter_dec_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter32_add(&a32u, (uint32_t)-1);
> +       return i;
> +}
> +
> +static uint32_t test_counter_dec_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < CNT; i++)
> +               odp_counter64_add(&a64u, (uint64_t)-1);
> +       return i;
> +}
> +
> +static uint32_t test_counter_add_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter32_add(&a32u, ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_add_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter64_add(&a64u, ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_sub_u32(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter32_add(&a32u, -ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_sub_64(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> +               odp_counter64_add(&a64u, -ADD_SUB_CNT);
> +       return i;
> +}
> +
> +static uint32_t test_counter_inc_dec_u32(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_inc_u32();
> +       nops += test_counter_dec_u32();
> +       return nops;
> +}
> +
> +static uint32_t test_counter_add_sub_u32(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_add_u32();
> +       nops += test_counter_sub_u32();
> +       return nops;
> +}
> +
> +static uint32_t test_counter_inc_dec_64(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_inc_64();
> +       nops += test_counter_dec_64();
> +       return nops;
> +}
> +
> +static uint32_t test_counter_add_sub_64(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_add_64();
> +       nops += test_counter_sub_64();
> +       return nops;
> +}
> +
> +/**
> + * Test basic counter operation like
> + * add/sub/increment/decrement operation.
> + */
> +static uint32_t test_counter_basic(void)
> +{
> +       uint32_t nops = 0;
> +       nops += test_counter_inc_u32();
> +       nops += test_counter_dec_u32();
> +       nops += test_counter_add_u32();
> +       nops += test_counter_sub_u32();
> +
> +       nops += test_counter_inc_64();
> +       nops += test_counter_dec_64();
> +       nops += test_counter_add_64();
> +       nops += test_counter_sub_64();
> +
> +       return nops;
> +}
> +
> +static void test_counter_init(void)
> +{
> +       odp_counter32_init(&a32u, 0);
> +       odp_counter64_init(&a64u, 0);
> +}
> +
> +static void test_counter_write(void)
> +{
> +       odp_counter32_write(&a32u, U32_INIT_VAL);
> +       odp_counter64_write(&a64u, U64_INIT_VAL);
> +}
> +
> +static int test_counter_validate(void)
> +{
> +       if (odp_counter32_read(&a32u) != U32_INIT_VAL) {
> +               ODP_ERR("32-bit counter validation failed\n");
> +               return -1;
> +       }
> +
> +       if (odp_counter64_read(&a64u) != U64_INIT_VAL) {
> +               ODP_ERR("64-bit counter validation failed\n");
> +               return -1;
> +       }
> +
> +       return 0;
> +}
> +
> +static void *run_thread(void *arg)
> +{
> +       pthrd_arg *parg = (pthrd_arg *)arg;
> +       int thr;
> +       uint64_t nops = 0;
> +       struct timeval tv0, tv1;
> +
> +       thr = odp_thread_id();
> +
> +       ODP_DBG("Thread %i starts\n", thr);
> +
> +       /* Wait here until all threads have arrived */
> +       /* Use multiple barriers to verify that it handles wrap around and
> +        * has no race conditions which could be exposed when invoked back-
> +        * to-back */
> +       odp_barrier_sync(&barrier);
> +       odp_barrier_sync(&barrier);
> +       odp_barrier_sync(&barrier);
> +       odp_barrier_sync(&barrier);
> +
> +       gettimeofday(&tv0, NULL);
> +
> +       switch (parg->testcase) {
> +       case TEST_MIX:
> +               nops += test_counter_basic();
> +               break;
> +       case TEST_INC_DEC_U32:
> +               nops += test_counter_inc_dec_u32();
> +               break;
> +       case TEST_ADD_SUB_U32:
> +               nops += test_counter_add_sub_u32();
> +               break;
> +       case TEST_INC_DEC_64:
> +               nops += test_counter_inc_dec_64();
> +               break;
> +       case TEST_ADD_SUB_64:
> +               nops += test_counter_add_sub_64();
> +               break;
> +       }
> +       gettimeofday(&tv1, NULL);
> +       accops[thr] = nops;
> +       fflush(NULL);
> +
> +       uint64_t usecs = (tv1.tv_sec - tv0.tv_sec) * 1000000ULL +
> +                        tv1.tv_usec - tv0.tv_usec;
> +       printf("Time taken in thread %02d to complete %"PRIu64" op is "
> +              "%"PRIu64" usec, %"PRIu64" ns/op\n",
> +              thr, nops, usecs, 1000 * usecs / nops);
> +
> +       return parg;
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +       pthrd_arg thrdarg;
> +       int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
> +       char c;
> +       int result;
> +
> +       if (argc == 1 || argc % 2 == 0) {
> +               usage();
> +               goto err_exit;
> +       }
> +       if (odp_test_global_init() != 0)
> +               goto err_exit;
> +       odp_print_system_info();
> +
> +       while (cnt != 0) {
> +               sscanf(argv[++i], "-%c", &c);
> +               switch (c) {
> +               case 't':
> +                       sscanf(argv[++i], "%d", &test_type);
> +                       break;
> +               case 'n':
> +                       sscanf(argv[++i], "%d", &pthrdnum);
> +                       break;
> +               default:
> +                       ODP_ERR("Invalid option %c\n", c);
> +                       usage();
> +                       goto err_exit;
> +               }
> +               if (test_type < TEST_MIX || test_type > TEST_MAX ||
> +                   pthrdnum > odp_sys_core_count()) {
> +                       usage();
> +                       goto err_exit;
> +               }
> +               cnt -= 2;
> +       }
> +       if (pthrdnum == 0)
> +               pthrdnum = odp_sys_core_count();
> +
> +       test_counter_init();
> +       test_counter_write();
> +
> +       memset(&thrdarg, 0, sizeof(pthrd_arg));
> +       thrdarg.testcase = test_type;
> +       thrdarg.numthrds = pthrdnum;
> +
> +       if ((test_type > 0) && (test_type < TEST_MAX)) {
> +               printf("%s\n", test_name[test_type]);
> +       } else {
> +               ODP_ERR("Invalid test case [%d]\n", test_type);
> +               usage();
> +               goto err_exit;
> +       }
> +       odp_barrier_init(&barrier, pthrdnum);
> +       odp_test_thread_create(run_thread, &thrdarg);
> +
> +       odp_test_thread_exit(&thrdarg);
> +
> +       result = test_counter_validate();
> +
> +       if (result == 0) {
> +               printf("%s_%d_%d Result:pass\n",
> +                      test_name[test_type], test_type, pthrdnum);
> +       } else {
> +               printf("%s_%d_%d Result:fail\n",
> +                      test_name[test_type], test_type, pthrdnum);
> +       }
> +       return 0;
> +
> +err_exit:
> +       return -1;
> +}
> --
> 1.9.1
>
>
Bill Fischofer Nov. 4, 2014, 1:51 p.m. UTC | #4
+1  Merge and refine if needed. Time is ticking.

On Tue, Nov 4, 2014 at 7:48 AM, Ola Liljedahl <ola.liljedahl@linaro.org>
wrote:

> Ping!
>
> I really need this new working atomics support merged ASAP because I have
> a new lock-less implementation of the timer API which uses atomic
> operations. I haven't seen any real criticism against the content of the
> patch so there is nothing to change.
>
> -- Ola
>
>
> On 20 October 2014 15:07, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>
>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>> ---
>> Added header file odp_counter.h with support for 32- and 64-bit atomic
>> counters
>> using relaxed memory order. 6 operations
>> (init/read/write/add/read_inc/inc) on
>> 32-bit and 64-bit counters respectively.
>> Renamed odp_atomic_test to odp_counter_test and changed to use
>> odp_counter.h
>>
>> Implementation of C11-based memory model for atomic operations. 10
>> operations
>> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
>> in
>> odp_atomic.h. The required memory ordering is now a parameter to each
>> call just
>> like in C11.
>>
>> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
>> fall back to GCC __sync builtins which often include unnecessarily heavy
>> barrier/sync operations (always sequentially consistent).
>>
>> Attempt to remove all explicit memory barriers (odp_sync_stores) from
>> code that
>> implements multithreaded synchronization primitives (e.g. locks,
>> barriers).
>> Rewrote such primitives to use the new atomic operations.
>>
>> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
>> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
>> enqueue/dequeue
>> (missing release barrier, had only compiler barrier).
>>
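To make the new call convention concrete: the required ordering is passed per call, as in the sketch below. The variable and function names are made up for illustration; only the odp_atomic32_* calls and ODP_MEMORDER_* constants come from the patch.

#include <odp_atomic.h>

static odp_atomic32_t shared_cnt;

static void example(void)
{
        odp_atomic32_init(&shared_cnt, 0);

        /* Relaxed add, e.g. a statistics update with no ordering needs */
        odp_atomic32_add(&shared_cnt, 1, ODP_MEMORDER_RLX);

        /* Acquire load: later accesses cannot move before this read */
        uint32_t val = odp_atomic32_load(&shared_cnt, ODP_MEMORDER_ACQ);
        (void)val;
}
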
>>  .gitignore                                         |   2 +-
>>  example/generator/odp_generator.c                  |  43 +-
>>  example/ipsec/odp_ipsec.c                          |   2 +-
>>  example/odp_example/odp_example.c                  |   2 +-
>>  example/timer/odp_timer_test.c                     |   2 +-
>>  helper/include/odph_ring.h                         |   8 +-
>>  platform/linux-generic/include/api/odp.h           |   1 +
>>  platform/linux-generic/include/api/odp_atomic.h    | 838
>> +++++++++++----------
>>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>>  platform/linux-generic/odp_barrier.c               |  49 +-
>>  platform/linux-generic/odp_buffer.c                |   3 +-
>>  platform/linux-generic/odp_crypto.c                |   7 +-
>>  platform/linux-generic/odp_queue.c                 |   7 +-
>>  platform/linux-generic/odp_ring.c                  |  94 +--
>>  platform/linux-generic/odp_rwlock.c                |  62 +-
>>  platform/linux-generic/odp_thread.c                |   9 +-
>>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>>  platform/linux-generic/odp_timer.c                 |  22 +-
>>  test/api_test/Makefile.am                          |   6 +-
>>  test/api_test/odp_atomic_test.c                    | 362 ---------
>>  test/api_test/odp_atomic_test.h                    |  60 --
>>  test/api_test/odp_common.c                         |   1 -
>>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>>  delete mode 100644 test/api_test/odp_atomic_test.c
>>  delete mode 100644 test/api_test/odp_atomic_test.h
>>  create mode 100644 test/api_test/odp_counter_test.c
>>
>> diff --git a/.gitignore b/.gitignore
>> index 6342e34..77db4d6 100644
>> --- a/.gitignore
>> +++ b/.gitignore
>> @@ -35,7 +35,7 @@ build/
>>  odp_example
>>  odp_packet
>>  odp_packet_netmap
>> -odp_atomic
>> +odp_counter
>>  odp_shm
>>  odp_ring
>>  odp_timer_ping
>> diff --git a/example/generator/odp_generator.c
>> b/example/generator/odp_generator.c
>> index eb8b340..252157d 100644
>> --- a/example/generator/odp_generator.c
>> +++ b/example/generator/odp_generator.c
>> @@ -62,10 +62,10 @@ typedef struct {
>>   * counters
>>  */
>>  static struct {
>> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
>> -       odp_atomic_u64_t ip;    /**< ip packets */
>> -       odp_atomic_u64_t udp;   /**< udp packets */
>> -       odp_atomic_u64_t icmp;  /**< icmp packets */
>> +       odp_counter64_t seq;    /**< ip seq to be send */
>> +       odp_counter64_t ip;     /**< ip packets */
>> +       odp_counter64_t udp;    /**< udp packets */
>> +       odp_counter64_t icmp;   /**< icmp packets */
>>  } counters;
>>
>>  /** * Thread specific arguments
>> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>> ODPH_UDPHDR_LEN +
>>                                        ODPH_IPV4HDR_LEN);
>>         ip->proto = ODPH_IPPROTO_UDP;
>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>>         ip->id = odp_cpu_to_be_16(seq);
>>         ip->chksum = 0;
>>         odph_ipv4_csum_update(pkt);
>> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>> ODPH_ICMPHDR_LEN +
>>                                        ODPH_IPV4HDR_LEN);
>>         ip->proto = ODPH_IPPROTO_ICMP;
>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>>         ip->id = odp_cpu_to_be_16(seq);
>>         ip->chksum = 0;
>>         odph_ipv4_csum_update(pkt);
>> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>>                 }
>>
>>                 if (args->appl.interval != 0) {
>> +                       uint64_t seq = odp_counter64_read(&counters.seq);
>>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
>> -                              thr, counters.seq, counters.seq%0xffff);
>> +                              thr, seq, seq%0xffff);
>>                         /* TODO use odp timer */
>>                         usleep(args->appl.interval * 1000);
>>                 }
>> -               if (args->appl.number != -1 && counters.seq
>> -                   >= (unsigned int)args->appl.number) {
>> +               if (args->appl.number != -1 &&
>> +                   odp_counter64_read(&counters.seq) >=
>> +                   (unsigned int)args->appl.number) {
>>                         break;
>>                 }
>>         }
>> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>>         /* receive number of reply pks until timeout */
>>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>>                 while (args->appl.timeout >= 0) {
>> -                       if (counters.icmp >= (unsigned
>> int)args->appl.number)
>> +                       if (odp_counter64_read(&counters.icmp) >=
>> +                           (unsigned int)args->appl.number)
>>                                 break;
>>                         /* TODO use odp timer */
>>                         sleep(1);
>> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>>
>>         /* print info */
>>         if (args->appl.mode == APPL_MODE_UDP) {
>> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
>> +               printf("  [%02i] total send: %ju\n", thr,
>> +                      odp_counter64_read(&counters.seq));
>>         } else if (args->appl.mode == APPL_MODE_PING) {
>>                 printf("  [%02i] total send: %ju total receive: %ju\n",
>> -                      thr, counters.seq, counters.icmp);
>> +                      thr, odp_counter64_read(&counters.seq),
>> +                      odp_counter64_read(&counters.icmp));
>>         }
>>         return arg;
>>  }
>> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>                 if (!odp_packet_inflag_ipv4(pkt))
>>                         continue;
>>
>> -               odp_atomic_inc_u64(&counters.ip);
>> +               odp_counter64_inc(&counters.ip);
>>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
>> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>
>>                 /* udp */
>>                 if (ip->proto == ODPH_IPPROTO_UDP) {
>> -                       odp_atomic_inc_u64(&counters.udp);
>> +                       odp_counter64_inc(&counters.udp);
>>                         udp = (odph_udphdr_t *)(buf + offset);
>>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>>                                         odp_be_to_cpu_16(udp->length) -
>> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>                         icmp = (odph_icmphdr_t *)(buf + offset);
>>                         /* echo reply */
>>                         if (icmp->type == ICMP_ECHOREPLY) {
>> -                               odp_atomic_inc_u64(&counters.icmp);
>> +                               odp_counter64_inc(&counters.icmp);
>>                                 memcpy(&tvsend, buf + offset +
>> ODPH_ICMPHDR_LEN,
>>                                        sizeof(struct timeval));
>>                                 /* TODO This should be changed to use an
>> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>>         }
>>
>>         /* init counters */
>> -       odp_atomic_init_u64(&counters.seq);
>> -       odp_atomic_init_u64(&counters.ip);
>> -       odp_atomic_init_u64(&counters.udp);
>> -       odp_atomic_init_u64(&counters.icmp);
>> +       odp_counter64_init(&counters.seq, 0);
>> +       odp_counter64_init(&counters.ip, 0);
>> +       odp_counter64_init(&counters.udp, 0);
>> +       odp_counter64_init(&counters.icmp, 0);
>>
>>         /* Reserve memory for args from shared mem */
>>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
>> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
>> index 2f2dc19..76c27d0 100644
>> --- a/example/ipsec/odp_ipsec.c
>> +++ b/example/ipsec/odp_ipsec.c
>> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>>         printf("Num worker threads: %i\n", num_workers);
>>
>>         /* Create a barrier to synchronize thread startup */
>> -       odp_barrier_init_count(&sync_barrier, num_workers);
>> +       odp_barrier_init(&sync_barrier, num_workers);
>>
>>         /*
>>          * By default core #0 runs Linux kernel background tasks.
>> diff --git a/example/odp_example/odp_example.c
>> b/example/odp_example/odp_example.c
>> index 0e9aa3d..c473395 100644
>> --- a/example/odp_example/odp_example.c
>> +++ b/example/odp_example/odp_example.c
>> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>>         odp_shm_print_all();
>>
>>         /* Barrier to sync test case execution */
>> -       odp_barrier_init_count(&globals->barrier, num_workers);
>> +       odp_barrier_init(&globals->barrier, num_workers);
>>
>>         if (args.proc_mode) {
>>                 int ret;
>> diff --git a/example/timer/odp_timer_test.c
>> b/example/timer/odp_timer_test.c
>> index 78b2ae2..dfbeae9 100644
>> --- a/example/timer/odp_timer_test.c
>> +++ b/example/timer/odp_timer_test.c
>> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>>         printf("\n");
>>
>>         /* Barrier to sync test case execution */
>> -       odp_barrier_init_count(&test_barrier, num_workers);
>> +       odp_barrier_init(&test_barrier, num_workers);
>>
>>         /* Create and launch worker threads */
>>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
>> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
>> index 76c1db8..5e78b34 100644
>> --- a/helper/include/odph_ring.h
>> +++ b/helper/include/odph_ring.h
>> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>>                 uint32_t sp_enqueue;     /* True, if single producer. */
>>                 uint32_t size;           /* Size of ring. */
>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>> -               uint32_t head;          /* Producer head. */
>> -               uint32_t tail;          /* Producer tail. */
>> +               odp_atomic32_t head;    /* Producer head. */
>> +               odp_atomic32_t tail;    /* Producer tail. */
>>         } prod ODP_ALIGNED_CACHE;
>>
>>         /** @private Consumer */
>> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>>                 uint32_t size;           /* Size of the ring. */
>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>> -               uint32_t head;          /* Consumer head. */
>> -               uint32_t tail;          /* Consumer tail. */
>> +               odp_atomic32_t head;    /* Consumer head. */
>> +               odp_atomic32_t tail;    /* Consumer tail. */
>>         } cons ODP_ALIGNED_CACHE;
>>
>>         /** @private Memory space of ring starts here. */
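With head and tail now being odp_atomic32_t, the ring enqueue/dequeue fix mentioned in the commit message amounts to publishing the producer tail with release semantics and reading it with acquire semantics. A minimal sketch of that pattern; the helper functions are hypothetical and not the patch's odp_ring.c code:

#include <odph_ring.h>
#include <odp_atomic.h>

/* Producer: ring slots have been written, now publish the new tail */
static void ring_publish_tail(odph_ring_t *r, uint32_t new_tail)
{
        /* Release: earlier slot writes cannot move after this store */
        odp_atomic32_store(&r->prod.tail, new_tail, ODP_MEMORDER_RLS);
}

/* Consumer: observe the producer tail before reading any slots */
static uint32_t ring_observe_tail(odph_ring_t *r)
{
        /* Acquire: later slot reads cannot move before this load */
        return odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_ACQ);
}
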
>> diff --git a/platform/linux-generic/include/api/odp.h
>> b/platform/linux-generic/include/api/odp.h
>> index 0ee3faf..d124d52 100644
>> --- a/platform/linux-generic/include/api/odp.h
>> +++ b/platform/linux-generic/include/api/odp.h
>> @@ -32,6 +32,7 @@ extern "C" {
>>  #include <odp_barrier.h>
>>  #include <odp_spinlock.h>
>>  #include <odp_atomic.h>
>> +#include <odp_counter.h>
>>
>>  #include <odp_init.h>
>>  #include <odp_system_info.h>
>> diff --git a/platform/linux-generic/include/api/odp_atomic.h
>> b/platform/linux-generic/include/api/odp_atomic.h
>> index 0cc4cf4..ccaad02 100644
>> --- a/platform/linux-generic/include/api/odp_atomic.h
>> +++ b/platform/linux-generic/include/api/odp_atomic.h
>> @@ -4,464 +4,494 @@
>>   * SPDX-License-Identifier:     BSD-3-Clause
>>   */
>>
>> -
>>  /**
>>   * @file
>>   *
>> - * ODP atomic operations
>> + * ODP atomic types and operations, semantically a subset of C11 atomics.
>> + * The scalar value is wrapped in a struct so that it cannot be accessed
>> + * directly, bypassing the required access functions.
>> + * Atomic functions must be used to operate on atomic variables!
>>   */
>>
>>  #ifndef ODP_ATOMIC_H_
>>  #define ODP_ATOMIC_H_
>>
>> +#include <stdint.h>
>> +#include <odp_align.h>
>> +#include <odp_hints.h>
>> +#include <odp_debug.h>
>> +
>>  #ifdef __cplusplus
>>  extern "C" {
>>  #endif
>>
>> -
>> -#include <odp_std_types.h>
>> -
>> -
>> -/**
>> - * Atomic integer
>> - */
>> -typedef volatile int32_t odp_atomic_int_t;
>> -
>> -/**
>> - * Atomic unsigned integer 64 bits
>> - */
>> -typedef volatile uint64_t odp_atomic_u64_t;
>> -
>> -/**
>> - * Atomic unsigned integer 32 bits
>> - */
>> -typedef volatile uint32_t odp_atomic_u32_t;
>> -
>> -
>> -/**
>> - * Initialize atomic integer
>> - *
>> - * @param ptr    An integer atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
>> -{
>> -       *ptr = 0;
>> -}
>> -
>> -/**
>> - * Load value of atomic integer
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return atomic integer value
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
>> -{
>> -       return *ptr;
>> -}
>> -
>> -/**
>> - * Store value to atomic integer
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
>> new_value)
>> -{
>> -       *ptr = new_value;
>> -}
>> -
>> -/**
>> - * Fetch and add atomic integer
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
>> value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and subtract atomic integer
>> - *
>> - * @param ptr    An atomic integer variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
>> value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and increment atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Increment atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Fetch and decrement atomic integer by 1
>> - *
>> - * @param ptr    An atomic int variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Decrement atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Initialize atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       *ptr = 0;
>> -}
>> -
>> -/**
>> - * Load value of atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return atomic uint32 value
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return *ptr;
>> -}
>> -
>> -/**
>> - * Store value to atomic uint32
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
>> -                                       uint32_t new_value)
>> -{
>> -       *ptr = new_value;
>> -}
>> -
>> -/**
>> - * Fetch and add atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>> -                                               uint32_t value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and subtract uint32
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be sub to the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
>> -                                               uint32_t value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> -
>>  /**
>> - * Fetch and increment atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -#if defined __OCTEON__
>> -
>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       uint32_t ret;
>> -
>> -       __asm__ __volatile__ ("syncws");
>> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
>> -                             "r" (ptr));
>> -
>> -       return ret;
>> -}
>> -
>> + * 32-bit (unsigned) atomic type
>> + */
>> +typedef struct {
>> +       uint32_t v; /**< Actual storage for the atomic variable */
>> +} odp_atomic32_t
>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>> +
>> +typedef enum {
>> +       /** Relaxed memory order, no ordering of other accesses enforced
>> */
>> +       ODP_MEMORDER_RLX,
>> +       /** Acquire memory order, later accesses cannot move before
>> +        * acquire operation */
>> +       ODP_MEMORDER_ACQ,
>> +       /** Release memory order, earlier accesses cannot move after
>> +        * release operation */
>> +       ODP_MEMORDER_RLS
>> +} odp_memorder_t;
>> +
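To show how these orderings are meant to be combined, a ticket lock of the kind this patch fixes could in principle look like the sketch below. The type and function names are made up and this is not the patch's odp_ticketlock.c; only the odp_atomic32_* calls and memory orders come from this header.

#include <odp_atomic.h>

typedef struct {
        odp_atomic32_t next_ticket; /* next ticket to hand out */
        odp_atomic32_t cur_ticket;  /* ticket currently being served */
} my_ticketlock_t;

static void my_lock(my_ticketlock_t *lk)
{
        /* Taking a ticket needs no ordering by itself */
        uint32_t ticket = odp_atomic32_fetch_inc(&lk->next_ticket,
                                                 ODP_MEMORDER_RLX);

        /* Acquire: the critical section cannot move above the load that
         * observes our ticket being served */
        while (odp_atomic32_load(&lk->cur_ticket, ODP_MEMORDER_ACQ) != ticket)
                ; /* spin */
}

static void my_unlock(my_ticketlock_t *lk)
{
        uint32_t cur = odp_atomic32_load(&lk->cur_ticket, ODP_MEMORDER_RLX);

        /* Release: the critical section cannot move below this store */
        odp_atomic32_store(&lk->cur_ticket, cur + 1, ODP_MEMORDER_RLS);
}
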
>>
>> +/*****************************************************************************
>> + * Just some private helpers
>>
>> +*****************************************************************************/
>> +
>> +#ifdef __OCTEON__
>> +/* OCTEON Write Memory Barrier */
>> +#define COMPILER_HW_BARRIER() __asm __volatile( \
>> +       /* Double syncw to work around errata */ \
>> +       "syncw\n\tsyncw" : : : )
>>  #else
>> -
>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_u32(ptr, 1);
>> -}
>> -
>> +/** Compiler and hardware full memory barrier */
>> +#define COMPILER_HW_BARRIER() __sync_synchronize()
>> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a
>> */
>>  #endif
>>
>> -/**
>> - * Increment atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_u32(ptr, 1);
>> -}
>> -
>> -/**
>> - * Fetch and decrement uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_u32(ptr, 1);
>> -}
>> -
>> -/**
>> - * Decrement atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_u32(ptr, 1);
>> -}
>> -
>> -/**
>> - * Atomic compare and set for 32bit
>> - *
>> - * @param dst destination location into which the value will be written.
>> - * @param exp expected value.
>> - * @param src new value.
>> - * @return Non-zero on success; 0 on failure.
>> - */
>> -static inline int
>> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
>> -{
>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>> +#define MEMORY "memory"
>> +
>>
>> +/*****************************************************************************
>> + * Operations on 32-bit atomics
>> + * odp_atomic32_init - no return value
>> + * odp_atomic32_load - return current value
>> + * odp_atomic32_store - no return value
>> + * odp_atomic32_cmp_xchg_weak - return bool
>> + * odp_atomic32_fetch_add - return old value
>> + * odp_atomic32_add - no return value
>> + * odp_atomic32_fetch_inc - return old value
>> + * odp_atomic32_inc - no return value
>> + * odp_atomic32_fetch_dec - return old value
>> + * odp_atomic32_dec - no return value
>> +
>> *****************************************************************************/
>> +
>> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
>> +{
>> +       /* Write of aligned word is atomic */
>> +       /* Cast to volatile to force compiler to (re-) write variable,
>> thus we
>> +        * can avoid using compiler memory barriers */
>> +       *(__volatile uint32_t *)&ptr->v = val;
>> +}
>> +
>> +/**
>> + * Atomic load of 32-bit atomic variable
>> + *
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the load
>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
>> + *
>> + * @return Value of the variable
>> + */
>> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               uint32_t val;
>> +               /* Read of aligned word is atomic */
>> +               /* Cast to volatile to force compiler to (re-) read
>> variable,
>> +                * thus we can avoid using compiler memory barriers */
>> +               val = *(__volatile const uint32_t *)&ptr->v;
>> +               return val;
>> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
>> +#if defined __aarch64__
>> +               uint32_t val;
>> +               __asm __volatile("ldar %w0, [%1]"
>> +                               : "=&r"(val)
>> +                               : "r"(&ptr->v)
>> +                               : MEMORY);
>> +               return val;
>> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
>> +               /* Read of aligned word is atomic */
>> +               uint32_t val = ptr->v;
>> +               /* To prevent later accesses from moving up */
>> +               /* Herb Sutter claims HW barrier not needed on x86? */
>> +               COMPILER_HW_BARRIER();
>> +               return val;
>> +#else
>> +#warning odp_atomic32_load() may not be efficiently implemented
>> +               /* Assume read of aligned word is atomic */
>> +               uint32_t val = ptr->v;
>> +               /* To prevent later accesses from moving up */
>> +               COMPILER_HW_BARRIER();
>> +               return val;
>> +#endif
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>> +}
>> +
>> +/**
>> + * Atomic store to 32-bit atomic variable
>> + *
>> + * @param ptr  Pointer to a 32-bit atomic variable
>> + * @param val  Value to write to the atomic variable
>> + * @param memmodel Memory model associated with the store
>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
>> + */
>> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
>> +               uint32_t val,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Write of aligned word is atomic */
>> +               /* Cast to volatile to force compiler to (re-) write
>> variable,
>> +                * thus we will avoid using compiler memory barriers */
>> +               *(__volatile uint32_t *)&ptr->v = val;
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
>> +               /* Compiler and HW barrier to prevent earlier accesses
>> from
>> +                * moving down */
>> +               COMPILER_HW_BARRIER();
>> +               /* Write of aligned word is atomic */
>> +               ptr->v = val;
>> +               /* Compiler and HW barrier to prevent this store from
>> moving
>> +                * down after a later load-acquire and thus create
>> overlapping
>> +                * critical sections. Herb Sutter thinks this is needed */
>> +               COMPILER_HW_BARRIER();
>> +#elif defined __aarch64__
>> +               __asm __volatile("stlr %w0, [%1]"
>> +                               :
>> +                               : "r"(val), "r"(&ptr->v)
>> +                               : MEMORY);
>> +#elif defined __x86_64__
>> +               /* This is actually an atomic exchange operation */
>> +               /* Generates good code on x86_64 */
>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>> +#else
>> +#warning odp_atomic32_store() release may not be efficiently implemented
>> +               /* This is actually an atomic exchange operation */
>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>> +#endif
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>> +}
>> +
>> +
>> +/**
>> + * Atomic compare and exchange (swap) of 32-bit atomic variable
>> + * "Weak" semantics, may fail spuriously and must be used in a loop.
>> + *
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param exp_p Pointer to expected value (updated on failure)
>> + * @param val   New value to write
>> + * @param       memmodel Memory model associated with the
>> compare-and-swap
>> + * operation (ODP_MEMORDER_RLX only)
>> + *
>> + * @return 1 (true) if exchange successful, 0 (false) if not successful
>> (and
>> + * '*exp_p' updated with current value)
>> + */
>> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
>> +               uint32_t *exp_p,
>> +               uint32_t val,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +#if defined __arm__ /* A32/T32 ISA */
>> +               uint32_t old;
>> +               uint32_t exp = *exp_p;
>> +               int status;
>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>> +                                "cmp   %0, %3\t\n"
>> +                                "bne   1f\t\n"
>> +                                "strex %1, %4, [%2]\t\n"
>> +                                "1:\t\n"
>> +                               : "=&r"(old), "=&r"(status)
>> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
>> +                               : MEMORY);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       /* Clear exclusive access monitor */
>> +                       __asm __volatile("clrex");
>> +                       /* Return current value */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               }
>> +               /* strex returns 0 on success */
>> +               if (odp_unlikely(status != 0)) {
>> +                       /* strex failed, reservation was disturbed */
>> +                       /* Return potentially changed value */
>> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
>> +                       return 0;
>> +               }
>> +               return 1;
>> +#elif defined __mips64__
>> +               uint32_t old;
>> +               uint32_t exp = *exp_p;
>> +               uint32_t status = val;
>> +               __asm __volatile("llw %0, [%2]\t\n"
>> +                                "bne %0, %3, 1f\t\n"
>> +                                "scw %1, [%2]\t\n"
>> +                                "1:\t\n"
>> +                               : "=&r"(old), "+&r"(status)
>> +                               : "r"(&ptr->v), "r"(exp)
>> +                               : MEMORY);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       /* Return current value */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               }
>> +               /* scw returns 1 on success, 0 on failure */
>> +               if (odp_unlikely(status == 0)) {
>> +                       /* scw failed, reservation was disturbed */
>> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
>> +                       return 0;
>> +               }
>> +               return 1;
>> +#elif defined __x86_64__
>> +               uint32_t exp = *exp_p;
>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
>> val);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Return the unexpected content of '*ptr' */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               } else {
>> +                       return 1;
>> +               }
>> +#else
>> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
>> +               uint32_t exp = *exp_p;
>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
>> val);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Return the unexpected content of '*ptr' */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               } else {
>> +                       return 1;
>> +               }
>> +#endif
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>> +}
>> +
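Since the weak variant may fail spuriously, it is meant to be retried in a loop. A small illustration with a hypothetical helper that keeps a running maximum; only the two odp_atomic32_* calls come from this header:

#include <odp_atomic.h>

/* Hypothetical helper: raise *max to 'sample' if sample is larger */
static void update_max(odp_atomic32_t *max, uint32_t sample)
{
        uint32_t cur = odp_atomic32_load(max, ODP_MEMORDER_RLX);

        while (sample > cur) {
                /* On failure 'cur' is refreshed with the current value */
                if (odp_atomic32_cmp_xchg_weak(max, &cur, sample,
                                               ODP_MEMORDER_RLX))
                        break;
        }
}
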
>> +/**
>> + * Atomic fetch and add to 32-bit atomic variable
>> + * @note A - B <=> A + (-B)
>> + *
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>> + * @param memmodel Memory model associated with the add
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>> + *
>> + * @return Value of the atomic variable before the addition
>> + */
>> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
>> +               uint32_t incr,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +#if defined __arm__ /* A32/T32 ISA */
>> +               uint32_t old_val, tmp;
>> +               int status;
>> +               do {
>> +                       __asm __volatile("ldrex %0, [%3]\t\n"
>> +                                        "add   %1, %0, %4\t\n"
>> +                                        "strex %2, %1, [%3]\t\n"
>> +                                       : "=&r"(old_val), "=&r"(tmp),
>> +                                         "=&r"(status)
>> +                                       : "r"(&ptr->v), "r"(incr)
>> +                                       : MEMORY);
>> +               } while (odp_unlikely(status != 0));
>> +               return old_val;
>> +#elif defined __OCTEON__
>> +               uint32_t old_val;
>> +               __asm __volatile("laa %0,(%2),%3"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr), "r" (incr)
>> +                               : MEMORY);
>> +               return old_val;
>> +#elif defined __x86_64__
>> +               /* Generates good code on x86_64 */
>> +               return __sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
>> +               return __sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("laa %0,(%2),%3"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr), "r" (incr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return old_val;
>> +#endif
>> +               /* __sync_fetch_and_add() will give us barriers before and
>> +                * after, we are fine with this for release operations */
>> +               return __sync_fetch_and_add(&ptr->v, incr);
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>>  }
>>
>>  /**
>> - * Initialize atomic uint64
>> + * Atomic add to 32-bit atomic variable
>>   *
>> - * @param ptr    An atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>> + * @param memmodel Memory model associated with the add
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   */
>> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
>> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
>> +               uint32_t incr,
>> +               odp_memorder_t mmodel)
>>  {
>> -       *ptr = 0;
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Platforms that support atomic add instructions can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               __asm __volatile("saa %[inc], (%[base])"
>> +                               : "+m" (*ptr)
>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>> +                               : MEMORY);
>> +               return;
>> +#endif
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +               /* Platforms that support atomic add instructions can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("saa %[inc], (%[base])"
>> +                               : "+m" (*ptr)
>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return;
>> +#endif
>> +       }
>> +       /* Default to using odp_atomic32_fetch_add() */
>> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>>  }
>>
>>  /**
>> - * Load value of atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> + * Atomic fetch and increment of 32-bit atomic variable
>>   *
>> - * @return atomic uint64 value
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the increment
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @return Value of the atomic variable before the increment
>>   */
>> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
>> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>>  {
>> -       return *ptr;
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Platforms that support atomic increment instructions
>> can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               __asm __volatile("lai %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               return old_val;
>> +#endif
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("lai %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return old_val;
>> +#endif
>> +       }
>> +       /* Default to using odp_atomic32_fetch_add() */
>> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>>  }
>>
>>  /**
>> - * Store value to atomic uint64
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> + * Atomic increment of 32-bit atomic variable
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the increment
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   */
>> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
>> -                                       uint64_t new_value)
>> -{
>> -       *ptr = new_value;
>> -}
>> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>>
>> -/**
>> - * Add atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - */
>> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
>> value)
>>  {
>> -       __sync_fetch_and_add(ptr, value);
>> +       /* Default to using odp_atomic32_fetch_inc() */
>> +       /* Platforms that support atomic increment instructions can add
>> +        * their implementations here */
>> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>>  }
>>
>>  /**
>> - * Fetch and add atomic uint64
>> + * Atomic fetch and decrement of 32-bit atomic variable
>>   *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the decrement
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   *
>> - * @return Value of the variable before the operation
>> + * @return Value of the atomic variable before the decrement
>>   */
>> -
>> -#if defined __powerpc__ && !defined __powerpc64__
>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>>  {
>> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
>> -                                   (uint32_t)value);
>> -}
>> -#else
>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Platforms that support atomic decrement instructions
>> can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               __asm __volatile("lad %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               return old_val;
>>  #endif
>> -/**
>> - * Subtract atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - */
>> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
>> value)
>> -{
>> -       __sync_fetch_and_sub(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and subtract atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -#if defined __powerpc__ && !defined __powerpc64__
>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
>> -                                   (uint32_t)value);
>> -}
>> -#else
>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("lad %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return old_val;
>>  #endif
>> -/**
>> - * Fetch and increment atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_u64(ptr, 1);
>> -}
>> -
>> -/**
>> - * Increment atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_u64(ptr, 1);
>> +       }
>> +       /* Default to using odp_atomic32_fetch_add() */
>> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>>  }
>>
>>  /**
>> - * Fetch and decrement atomic uint64 by 1
>> + * Atomic decrement of 32-bit atomic variable
>>   *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the decrement
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   */
>> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>> -}
>> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
>> +               odp_memorder_t memorder)
>>
>> -/**
>> - * Decrement atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>>  {
>> -       odp_atomic_fetch_sub_u64(ptr, 1);
>> +       /* Default to using odp_atomic32_fetch_dec() */
>> +       /* Platforms that support atomic decrement instructions can add
>> +        * their implementations here */
>> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>>  }
>>
>> -/**
>> - * Atomic compare and set for 64bit
>> - *
>> - * @param dst destination location into which the value will be written.
>> - * @param exp expected value.
>> - * @param src new value.
>> - * @return Non-zero on success; 0 on failure.
>> - */
>> -static inline int
>> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
>> -{
>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>> -}
>> +/* We are not exporting this macro */
>> +#undef COMPILER_HW_BARRIER
>> +#undef MEMORY
>>
>>  #ifdef __cplusplus
>>  }
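
Not part of the patch: a minimal sketch of how the new memory-order parameter is meant to be used, relying only on the odp_atomic32_load/store calls and the ODP_MEMORDER_ACQ/RLS values visible above. The names msg, ready, producer and consumer are hypothetical.

/* Producer publishes a payload with store-release; the consumer observes it
 * with load-acquire. 'ready' must first be set up with
 * odp_atomic32_init(&ready, 0). */
#include <odp_atomic.h>

static uint32_t msg;         /* payload, written before publication */
static odp_atomic32_t ready; /* 0 = not published, 1 = published */

static void producer(void)
{
        msg = 42;
        /* Release: the write to 'msg' is visible before 'ready' reads 1 */
        odp_atomic32_store(&ready, 1, ODP_MEMORDER_RLS);
}

static uint32_t consumer(void)
{
        /* Acquire: once 'ready' reads 1, the write to 'msg' is visible */
        while (odp_atomic32_load(&ready, ODP_MEMORDER_ACQ) == 0)
                ;
        return msg;
}
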
>> diff --git a/platform/linux-generic/include/api/odp_barrier.h
>> b/platform/linux-generic/include/api/odp_barrier.h
>> index a7b3215..69b1eb8 100644
>> --- a/platform/linux-generic/include/api/odp_barrier.h
>> +++ b/platform/linux-generic/include/api/odp_barrier.h
>> @@ -27,18 +27,18 @@ extern "C" {
>>   * ODP execution barrier
>>   */
>>  typedef struct odp_barrier_t {
>> -       int              count;  /**< @private Thread count */
>> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
>> +       uint32_t       num_threads;  /**< @private Thread count
>> (constant) */
>> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>>  } odp_barrier_t;
>>
>>
>>  /**
>>   * Init barrier with thread count
>>   *
>> - * @param barrier    Barrier
>> - * @param count      Thread count
>> + * @param barrier     Barrier
>> + * @param num_threads Number of threads which share the barrier
>>   */
>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>>
>>
>>  /**
>> diff --git a/platform/linux-generic/include/api/odp_counter.h
>> b/platform/linux-generic/include/api/odp_counter.h
>> new file mode 100644
>> index 0000000..f937d27
>> --- /dev/null
>> +++ b/platform/linux-generic/include/api/odp_counter.h
>> @@ -0,0 +1,363 @@
>> +/* Copyright (c) 2013, Linaro Limited
>> + * All rights reserved.
>> + *
>> + * SPDX-License-Identifier:     BSD-3-Clause
>> + */
>> +
>> +/**
>> + * @file
>> + *
>> + * ODP atomic counter types and operations, suitable for e.g. shared
>> + * statistics. A relaxed memory model is assumed for lowest overhead.
>> + * The scalar variable is wrapped in a struct so that it cannot be
>> + * accessed directly; the counter functions below must be used to
>> + * operate on counter variables.
>> + */
>> +
>> +#ifndef ODP_COUNTER_H_
>> +#define ODP_COUNTER_H_
>> +
>> +#include <stdint.h>
>> +#include <odp_align.h>
>> +#include <odp_hints.h>
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +/**
>> + * 32-bit (unsigned) atomic counter type
>> + */
>> +typedef struct {
>> +       uint32_t v; /**< Actual storage for the counter variable */
>> +} odp_counter32_t
>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>> +
>> +/**
>> + * 64-bit (unsigned) atomic counter type
>> + */
>> +typedef struct {
>> +       uint64_t v; /**< Actual storage for the counter variable */
>> +       /* Room for other data structures (e.g. spin lock) that might be
>> +        * needed to ensure atomicity on some architectures */
>> +} odp_counter64_t
>> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
>> +
>>
>> +/*****************************************************************************
>> + * Operations on 32-bit atomic counters
>> + * odp_counter32_init - returns no value
>> + * odp_counter32_read - returns current value
>> + * odp_counter32_write - returns no value
>> + * odp_counter32_add - returns no value
>> + * odp_counter32_read_inc - returns old value
>> + * odp_counter32_inc - returns no value
>> +
>> *****************************************************************************/
>> +
>> +/**
>> + * Initialize 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + * @param val   Initial value
>> + */
>> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t val)
>> +{
>> +       /* No implementation requires any other type of initialization */
>> +       *(__volatile uint32_t *)&ptr->v = val;
>> +}
>> +
>> +/**
>> + * Read 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + *
>> + * @return Value of the variable
>> + */
>> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
>> +{
>> +       uint32_t val;
>> +       /* Read of aligned word is atomic */
>> +       /* Cast to volatile to force compiler to (re-) read variable,
>> thus we
>> +        * will avoid using compiler memory barriers */
>> +       val = *(__volatile const uint32_t *)&ptr->v;
>> +       return val;
>> +}
>> +
>> +/**
>> + * Write 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + * @param val   Value to write to the variable
>> + */
>> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t
>> val)
>> +{
>> +       /* Write of aligned word is atomic */
>> +       /* Cast to volatile to force compiler to (re-) write variable,
>> thus we
>> +        * will avoid using compiler memory barriers */
>> +       *(__volatile uint32_t *)&ptr->v = val;
>> +}
>> +
>> +/**
>> + * Atomic add to 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + * @param incr  The value to be added to the counter variable
>> + */
>> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t incr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint32_t result;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>> +                                "add   %0, %0, %3\t\n"
>> +                                "strex %1, %0, [%2]"
>> +                                : "=&r"(result), "=&r"(status)
>> +                                : "r"(&ptr->v), "Ir" (incr)
>> +                                : );
>> +       } while (odp_unlikely(status != 0));
>> +#elif defined __OCTEON__
>> +       __asm __volatile("saa %[inc], (%[base])"
>> +                        : "+m" (*ptr)
>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>> +                        : );
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +       /* Warning odp_counter32_add() may not be efficiently implemented
>> */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>> +}
>> +
>> +/**
>> + * Atomic increment (+1) of 32-bit counter variable, return original
>> value
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + *
>> + * @return Original value of counter
>> + */
>> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint32_t result, tmp;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrex %0, [%3]\t\n"
>> +                                "add   %1, %0, #1\t\n"
>> +                                "strex %2, %1, [%3]"
>> +                                : "=&r"(result), "=&r"(tmp),
>> "=&r"(status)
>> +                                : "r"(&ptr->v)
>> +                                : );
>> +       } while (odp_unlikely(status != 0));
>> +       return result;
>> +#elif defined __OCTEON__
>> +       uint32_t old_val;
>> +       __asm __volatile("lai %0,(%2)"
>> +                        : "=r" (old_val), "+m" (ptr)
>> +                        : "r" (ptr)
>> +                        : );
>> +       return old_val;
>> +#elif defined __x86_64__
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#else
>> +/* Warning odp_counter32_read_inc() may not be efficiently implemented */
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#endif
>> +}
>> +
>> +/**
>> + * Atomic increment (+1) 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + */
>> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
>> +{
>> +#if defined __OCTEON__
>> +       odp_counter32_add(ptr, 1);
>> +#else
>> +       (void)odp_counter32_read_inc(ptr);
>> +#endif
>> +}
>> +
>>
>> +/*****************************************************************************
>> + * Operations on 64-bit atomic counters
>> + * odp_counter64_init
>> + * odp_counter64_read
>> + * odp_counter64_write
>> + * odp_counter64_add
>> + * odp_counter64_read_inc
>> + * odp_counter64_inc
>> +
>> *****************************************************************************/
>> +
>> +/**
>> + * Read 64-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + *
>> + * @return Value of the counter variable
>> + */
>> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t val;
>> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
>> +                        "clrex" /* Clear exclusive access monitor */
>> +                        : "=&r"(val)
>> +                        : "r"(&ptr->v)
>> +                        : );
>> +       return val;
>> +#elif defined __x86_64__ || defined __aarch64__
>> +       /* Read of aligned quad/double word is atomic */
>> +       return ptr->v;
>> +#else
>> +/* Warning odp_counter64_read() may not be efficiently implemented */
>> +       return __sync_fetch_and_or(&ptr->v, 0);
>> +#endif
>> +}
>> +
>> +/**
>> + * Write 64-bit counter variable
>> + *
>> + * @param ptr  Pointer to a 64-bit counter variable
>> + * @param val  Value to write to the counter variable
>> + */
>> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t
>> val)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               /* Read counter variable exclusively so we can write to it
>> +                * later */
>> +               /* Attempt to write the new value */
>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>> +                                "strexd %1, %3, %H3, [%2]"
>> +                                : "=&r"(old_val), "=&r"(status)
>> +                                : "r"(&ptr->v), "r"(val)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>> succeeds */
>> +#elif defined __x86_64__ || defined __aarch64__
>> +       /* Write of aligned quad/double word is atomic */
>> +       ptr->v = val;
>> +#else
>> +/* Warning odp_counter64_write() may not be efficiently implemented */
>> +       /* This is actually a counter exchange operation */
>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>> +#endif
>> +}
>> +
>> +/**
>> + * Initialize 64-bit counter variable
>> + * Perform implementation-specific initialization and assign the initial value.
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + * @param val   Initial value
>> + */
>> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t val)
>> +{
>> +       /* No implementation requires any other type of initialization */
>> +       odp_counter64_write(ptr, val);
>> +}
>> +
>> +/**
>> + * Atomic add to 64-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + * @param incr  The value to be added to the counter variable
>> + */
>> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>> +                                "adds   %0, %0, %3\t\n"
>> +                                "adc    %H0, %H3\t\n"
>> +                                "strexd %1, %0, %H0, [%2]"
>> +                                : "=&r"(old_val), "=&r"(status)
>> +                                : "r"(&ptr->v), "r"(incr)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>> succeeds */
>> +#elif defined __OCTEON__
>> +       __asm __volatile("saad %[inc], (%[base])"
>> +                        : "+m" (*ptr)
>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>> +                        : );
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +/* Warning odp_counter64_add() may not be efficiently implemented */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>> +}
>> +
>> +
>> +/**
>> + * Atomic increment (+1) 64-bit counter variable and return original
>> value
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + *
>> + * @return Original value of counter
>> + */
>> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val, tmp;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
>> +                                "adds   %2, %0, #1\t\n"
>> +                                "adc    %H2, %H0, #0\t\n"
>> +                                "strexd %1, %2, %H2, [%3]"
>> +                                : "=&r"(old_val), "=&r"(status),
>> "=&r"(tmp)
>> +                                : "r"(&ptr->v)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>> succeeds */
>> +       return old_val;
>> +#elif defined __OCTEON__
>> +       uint64_t old_val;
>> +       __asm __volatile("laid %0,(%2)"
>> +                       : "=r" (old_val), "+m" (ptr)
>> +                       : "r" (ptr)
>> +                       : );
>> +       return old_val;
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#else
>> +/* Warning odp_counter64_read_inc() may not be efficiently implemented */
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#endif
>> +}
>> +
>> +/**
>> + * Atomic increment (+1) 64-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + */
>> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>> +                                "adds   %0, #1\t\n"
>> +                                "adc    %H0, #0\t\n"
>> +                                "strexd %1, %0, %H0, [%2]"
>> +                                : "=&r"(old_val), "=&r"(status)
>> +                                : "r"(&ptr->v)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>> succeeds */
>> +#else
>> +       (void)odp_counter64_read_inc(ptr);
>> +#endif
>> +}
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif
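
Not part of the patch: a minimal usage sketch of the new counter API, relying only on the odp_counter64_init/add/read operations defined in this file. pkt_count and the function names are hypothetical.

#include <odp_counter.h>

/* Shared packet counter: updated by several workers, read by a control
 * thread. Relaxed ordering is sufficient for statistics. */
static odp_counter64_t pkt_count;

static void stats_init(void)
{
        odp_counter64_init(&pkt_count, 0);
}

static void worker_account(uint64_t npkts)
{
        odp_counter64_add(&pkt_count, npkts); /* atomic, relaxed */
}

static uint64_t stats_read(void)
{
        return odp_counter64_read(&pkt_count); /* atomic snapshot */
}
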
>> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
>> b/platform/linux-generic/include/api/odp_rwlock.h
>> index 252ebb2..ff8a9a2 100644
>> --- a/platform/linux-generic/include/api/odp_rwlock.h
>> +++ b/platform/linux-generic/include/api/odp_rwlock.h
>> @@ -10,26 +10,30 @@
>>  /**
>>   * @file
>>   *
>> - * ODP RW Locks
>> + * ODP read/write lock
>> + * RW lock support multiple concurrent reads but only one (exclusive)
>> writer.
>>   */
>>
>> +#include <odp_atomic.h>
>> +
>>  #ifdef __cplusplus
>>  extern "C" {
>>  #endif
>>
>>  /**
>>   * The odp_rwlock_t type.
>> - * write lock count is -1,
>> - * read lock count > 0
>> + * write locked when count == ~0U,
>> + * read locked when count > 0 && count < ~0U
>>   */
>>  typedef struct {
>> -       volatile int32_t cnt; /**< -1 Write lock,
>> -                               > 0 for Read lock. */
>> +       odp_atomic32_t cnt; /**< == 0: unlocked,
>> +                                == ~0: locked for write,
>> +                                > 0 number of concurrent read locks */
>>  } odp_rwlock_t;
>>
>>
>>  /**
>> - * Initialize the rwlock to an unlocked state.
>> + * Initialize the rwlock to the unlocked state.
>>   *
>>   * @param rwlock pointer to the RW Lock.
>>   */
>> @@ -50,14 +54,14 @@ void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
>>
>>  /**
>> - * Aquire a write lock.
>> + * Acquire the write lock.
>>   *
>>   * @param rwlock pointer to a RW Lock.
>>   */
>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
>>
>>  /**
>> - * Release a write lock.
>> + * Release the write lock.
>>   *
>>   * @param rwlock pointer to a RW Lock.
>>   */
>> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
>> b/platform/linux-generic/include/api/odp_ticketlock.h
>> index 6277a18..5933f85 100644
>> --- a/platform/linux-generic/include/api/odp_ticketlock.h
>> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
>> @@ -21,14 +21,15 @@ extern "C" {
>>
>>  #include <odp_std_types.h>
>>  #include <odp_atomic.h>
>> +#include <odp_counter.h>
>>
>>
>>  /**
>>   * ODP ticketlock
>>   */
>>  typedef struct odp_ticketlock_t {
>> -       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
>> -       volatile uint32_t cur_ticket;  /**< @private Current ticket */
>> +       odp_counter32_t next_ticket; /**< @private Next ticket */
>> +       odp_atomic32_t cur_ticket;  /**< @private Current ticket */
>>  } odp_ticketlock_t;
>>
>>
>> diff --git a/platform/linux-generic/include/odp_buffer_internal.h
>> b/platform/linux-generic/include/odp_buffer_internal.h
>> index 2002b51..530ab96 100644
>> --- a/platform/linux-generic/include/odp_buffer_internal.h
>> +++ b/platform/linux-generic/include/odp_buffer_internal.h
>> @@ -88,7 +88,7 @@ typedef struct odp_buffer_hdr_t {
>>         uint32_t                 index;      /* buf index in the pool */
>>         size_t                   size;       /* max data size */
>>         size_t                   cur_offset; /* current offset */
>> -       odp_atomic_int_t         ref_count;  /* reference count */
>> +       odp_atomic32_t           ref_count;  /* reference count */
>>         odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
>>         int                      type;       /* type of next header */
>>         odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
>> diff --git a/platform/linux-generic/include/odp_spin_internal.h
>> b/platform/linux-generic/include/odp_spin_internal.h
>> index b7e2071..29c524f 100644
>> --- a/platform/linux-generic/include/odp_spin_internal.h
>> +++ b/platform/linux-generic/include/odp_spin_internal.h
>> @@ -15,15 +15,6 @@ extern "C" {
>>
>>
>>  /**
>> - * GCC memory barrier for ODP internal use
>> - */
>> -static inline void odp_mem_barrier(void)
>> -{
>> -       __asm__ __volatile__ ("" : : : "memory");
>> -}
>> -
>> -
>> -/**
>>   * Spin loop for ODP internal use
>>   */
>>  static inline void odp_spin(void)
>> diff --git a/platform/linux-generic/odp_barrier.c
>> b/platform/linux-generic/odp_barrier.c
>> index a82b294..10368b5 100644
>> --- a/platform/linux-generic/odp_barrier.c
>> +++ b/platform/linux-generic/odp_barrier.c
>> @@ -8,41 +8,52 @@
>>  #include <odp_sync.h>
>>  #include <odp_spin_internal.h>
>>
>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count)
>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
>>  {
>> -       barrier->count = count;
>> -       barrier->bar = 0;
>> -       odp_sync_stores();
>> +       barrier->num_threads = num_threads; /* Constant after
>> initialisation */
>> +       odp_atomic32_init(&barrier->in_barrier, 0);
>>  }
>>
>>  /*
>>   * Efficient barrier_sync -
>>   *
>>   *   Barriers are initialized with a count of the number of callers
>> - *   that must sync on the barrier before any may proceed.
>> + *   that must sync on (enter) the barrier before any may proceed (exit).
>>   *
>>   *   To avoid race conditions and to permit the barrier to be fully
>> - *   reusable, the barrier value cycles between 0..2*count-1. When
>> - *   synchronizing the wasless variable simply tracks which half of
>> + *   reusable, the barrier value cycles between 0..2*count-1 (temporarily
>> + *   hitting 2*count before being wrapped). When
>> + *   synchronizing, the waslow variable simply tracks which half of
>>   *   the cycle the barrier was in upon entry.  Exit is when the
>>   *   barrier crosses to the other half of the cycle.
>>   */
>>
>>  void odp_barrier_sync(odp_barrier_t *barrier)
>>  {
>> -       int count;
>> -       int wasless;
>> +       uint32_t count;
>> +       bool waslow;
>>
>> -       odp_sync_stores();
>> -       wasless = barrier->bar < barrier->count;
>> -       count = odp_atomic_fetch_inc_int(&barrier->bar);
>> +       /* We need both release and acquire semantics: release when
>> +        * entering the barrier and acquire when leaving it. */
>>
>> -       if (count == 2*barrier->count-1) {
>> -               barrier->bar = 0;
>> -       } else {
>> -               while ((barrier->bar < barrier->count) == wasless)
>> -                       odp_spin();
>> -       }
>> +       /* Increase the in_barrier count; this automatically releases
>> +        * the other threads when the lower/upper range is switched */
>> +       count = odp_atomic32_fetch_add(&barrier->in_barrier, 1,
>> +                                      ODP_MEMORDER_RLS);
>> +       /* Compute lower or higher range indicator */
>> +       waslow = count < barrier->num_threads;
>>
>> -       odp_mem_barrier();
>> +       /* Check if in_barrier count should wrap */
>> +       if (count == 2 * barrier->num_threads - 1) {
>> +               /* Manually wrap the counter */
>> +               odp_atomic32_add(&barrier->in_barrier,
>> +                                -2 * barrier->num_threads,
>> +                                ODP_MEMORDER_RLX);
>> +               /* Fall-through the final part for the acquire barrier */
>> +       }
>> +       /* Wait for counter to change half */
>> +       while ((odp_atomic32_load(&barrier->in_barrier, ODP_MEMORDER_ACQ) <
>> +              barrier->num_threads) == waslow) {
>> +               odp_spin();
>> +       }
>>  }
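
Not part of the patch: a usage sketch of the reworked barrier, assuming the odp_barrier_init/odp_barrier_sync signatures above. The thread functions are hypothetical.

#include <odp_barrier.h>

static odp_barrier_t barrier;

static void control_setup(uint32_t num_workers)
{
        /* Thread count is constant after init */
        odp_barrier_init(&barrier, num_workers);
}

static void worker(void)
{
        for (;;) {
                /* ... phase 1 work ... */
                odp_barrier_sync(&barrier); /* wait until all workers arrive */
                /* ... phase 2 work ... */
                odp_barrier_sync(&barrier); /* barrier is immediately reusable */
        }
}
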
>> diff --git a/platform/linux-generic/odp_buffer.c
>> b/platform/linux-generic/odp_buffer.c
>> index e54e0e7..fc3506b 100644
>> --- a/platform/linux-generic/odp_buffer.c
>> +++ b/platform/linux-generic/odp_buffer.c
>> @@ -73,7 +73,8 @@ int odp_buffer_snprint(char *str, size_t n,
>> odp_buffer_t buf)
>>         len += snprintf(&str[len], n-len,
>>                         "  cur_offset   %zu\n",       hdr->cur_offset);
>>         len += snprintf(&str[len], n-len,
>> -                       "  ref_count    %i\n",        hdr->ref_count);
>> +                       "  ref_count    %u\n",
>> +                       odp_atomic32_load(&hdr->ref_count,
>> ODP_MEMORDER_RLX));
>>         len += snprintf(&str[len], n-len,
>>                         "  type         %i\n",        hdr->type);
>>         len += snprintf(&str[len], n-len,
>> diff --git a/platform/linux-generic/odp_crypto.c
>> b/platform/linux-generic/odp_crypto.c
>> index b37ad6b..75b4ce0 100644
>> --- a/platform/linux-generic/odp_crypto.c
>> +++ b/platform/linux-generic/odp_crypto.c
>> @@ -6,7 +6,7 @@
>>
>>  #include <odp_crypto.h>
>>  #include <odp_internal.h>
>> -#include <odp_atomic.h>
>> +#include <odp_counter.h>
>>  #include <odp_spinlock.h>
>>  #include <odp_sync.h>
>>  #include <odp_debug.h>
>> @@ -26,7 +26,7 @@
>>  #define MAX_SESSIONS 32
>>
>>  typedef struct {
>> -       odp_atomic_u32_t next;
>> +       odp_counter32_t   next;
>>         uint32_t         max;
>>         odp_crypto_generic_session_t sessions[0];
>>  } odp_crypto_global_t;
>> @@ -58,7 +58,7 @@ odp_crypto_generic_session_t *alloc_session(void)
>>         uint32_t idx;
>>         odp_crypto_generic_session_t *session = NULL;
>>
>> -       idx = odp_atomic_fetch_inc_u32(&global->next);
>> +       idx = odp_counter32_read_inc(&global->next);
>>         if (idx < global->max) {
>>                 session = &global->sessions[idx];
>>                 session->index = idx;
>> @@ -420,6 +420,7 @@ odp_crypto_init_global(void)
>>
>>         /* Initialize it */
>>         global->max = MAX_SESSIONS;
>> +       odp_counter32_init(&global->next, 0);
>>
>>         return 0;
>>  }
>> diff --git a/platform/linux-generic/odp_queue.c
>> b/platform/linux-generic/odp_queue.c
>> index 1318bcd..08c0d29 100644
>> --- a/platform/linux-generic/odp_queue.c
>> +++ b/platform/linux-generic/odp_queue.c
>> @@ -214,8 +214,13 @@ int odp_queue_set_context(odp_queue_t handle, void
>> *context)
>>  {
>>         queue_entry_t *queue;
>>         queue = queue_to_qentry(handle);
>> +       /* Setting a new queue context can be viewed as a release
>> +        * operation: all writes to the context must be observable before
>> +        * the context pointer itself is made observable */
>>         odp_sync_stores();
>> -       queue->s.param.context = context;
>> +       queue->s.param.context = context; /* Store-release */
>> +       /* Ensure queue modification is globally visible before we return
>> +        * and the application might cause the queue to be scheduled */
>>         odp_sync_stores();
>>         return 0;
>>  }
>> diff --git a/platform/linux-generic/odp_ring.c
>> b/platform/linux-generic/odp_ring.c
>> index 632aa66..e5b9c23 100644
>> --- a/platform/linux-generic/odp_ring.c
>> +++ b/platform/linux-generic/odp_ring.c
>> @@ -187,10 +187,10 @@ odph_ring_create(const char *name, unsigned count,
>> unsigned flags)
>>                 r->cons.size = count;
>>                 r->prod.mask = count-1;
>>                 r->cons.mask = count-1;
>> -               r->prod.head = 0;
>> -               r->cons.head = 0;
>> -               r->prod.tail = 0;
>> -               r->cons.tail = 0;
>> +               odp_atomic32_init(&r->prod.head, 0);
>> +               odp_atomic32_init(&r->cons.head, 0);
>> +               odp_atomic32_init(&r->prod.tail, 0);
>> +               odp_atomic32_init(&r->cons.tail, 0);
>>
>>                 TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
>>         } else {
>> @@ -227,7 +227,7 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
>> const *obj_table,
>>         uint32_t prod_head, prod_next;
>>         uint32_t cons_tail, free_entries;
>>         const unsigned max = n;
>> -       int success;
>> +       bool success;
>>         unsigned i;
>>         uint32_t mask = r->prod.mask;
>>         int ret;
>> @@ -237,8 +237,8 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
>> const *obj_table,
>>                 /* Reset n to the initial burst count */
>>                 n = max;
>>
>> -               prod_head = r->prod.head;
>> -               cons_tail = r->cons.tail;
>> +               prod_head = odp_atomic32_load(&r->prod.head,
>> ODP_MEMORDER_RLX);
>> +               cons_tail = odp_atomic32_load(&r->cons.tail,
>> ODP_MEMORDER_ACQ);
>>                 /* The subtraction is done between two unsigned 32bits
>> value
>>                  * (the result is always modulo 32 bits even if we have
>>                  * prod_head > cons_tail). So 'free_entries' is always
>> between 0
>> @@ -259,13 +259,14 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void
>> * const *obj_table,
>>                 }
>>
>>                 prod_next = prod_head + n;
>> -               success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
>> -                                             prod_next);
>> -       } while (odp_unlikely(success == 0));
>> +               success = odp_atomic32_cmp_xchg_weak(&r->prod.head,
>> +                                                    &prod_head,
>> +                                                    prod_next,
>> +                                                    ODP_MEMORDER_RLX);
>> +       } while (odp_unlikely(!success));
>>
>>         /* write entries in ring */
>>         ENQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>>         /* if we exceed the watermark */
>>         if (odp_unlikely(((mask + 1) - free_entries + n) >
>> r->prod.watermark)) {
>> @@ -279,10 +280,11 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void
>> * const *obj_table,
>>          * If there are other enqueues in progress that preceeded us,
>>          * we need to wait for them to complete
>>          */
>> -       while (odp_unlikely(r->prod.tail != prod_head))
>> +       while (odp_unlikely(odp_atomic32_load(&r->prod.tail,
>> +                                             ODP_MEMORDER_RLX) !=
>> prod_head))
>>                 odp_spin();
>>
>> -       r->prod.tail = prod_next;
>> +       odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
>>         return ret;
>>  }
>>
>> @@ -298,8 +300,8 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
>> const *obj_table,
>>         uint32_t mask = r->prod.mask;
>>         int ret;
>>
>> -       prod_head = r->prod.head;
>> -       cons_tail = r->cons.tail;
>> +       prod_head = odp_atomic32_load(&r->prod.head, ODP_MEMORDER_RLX);
>> +       cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_ACQ);
>>         /* The subtraction is done between two unsigned 32bits value
>>          * (the result is always modulo 32 bits even if we have
>>          * prod_head > cons_tail). So 'free_entries' is always between 0
>> @@ -320,11 +322,10 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void
>> * const *obj_table,
>>         }
>>
>>         prod_next = prod_head + n;
>> -       r->prod.head = prod_next;
>> +       odp_atomic32_store(&r->prod.head, prod_next, ODP_MEMORDER_RLX);
>>
>>         /* write entries in ring */
>>         ENQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>>         /* if we exceed the watermark */
>>         if (odp_unlikely(((mask + 1) - free_entries + n) >
>> r->prod.watermark)) {
>> @@ -334,7 +335,7 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
>> const *obj_table,
>>                 ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
>>         }
>>
>> -       r->prod.tail = prod_next;
>> +       odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
>>         return ret;
>>  }
>>
>> @@ -348,7 +349,7 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
>> **obj_table,
>>         uint32_t cons_head, prod_tail;
>>         uint32_t cons_next, entries;
>>         const unsigned max = n;
>> -       int success;
>> +       bool success;
>>         unsigned i;
>>         uint32_t mask = r->prod.mask;
>>
>> @@ -357,8 +358,8 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
>> **obj_table,
>>                 /* Restore n as it may change every loop */
>>                 n = max;
>>
>> -               cons_head = r->cons.head;
>> -               prod_tail = r->prod.tail;
>> +               cons_head = odp_atomic32_load(&r->cons.head,
>> ODP_MEMORDER_RLX);
>> +               prod_tail = odp_atomic32_load(&r->prod.tail,
>> ODP_MEMORDER_ACQ);
>>                 /* The subtraction is done between two unsigned 32bits
>> value
>>                  * (the result is always modulo 32 bits even if we have
>>                  * cons_head > prod_tail). So 'entries' is always between
>> 0
>> @@ -378,22 +379,24 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
>> **obj_table,
>>                 }
>>
>>                 cons_next = cons_head + n;
>> -               success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
>> -                                             cons_next);
>> -       } while (odp_unlikely(success == 0));
>> +               success = odp_atomic32_cmp_xchg_weak(&r->cons.head,
>> +                                                    &cons_head,
>> +                                                    cons_next,
>> +                                                    ODP_MEMORDER_RLX);
>> +       } while (odp_unlikely(!success));
>>
>>         /* copy in table */
>>         DEQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>>         /*
>>          * If there are other dequeues in progress that preceded us,
>>          * we need to wait for them to complete
>>          */
>> -       while (odp_unlikely(r->cons.tail != cons_head))
>> +       while (odp_unlikely(odp_atomic32_load(&r->cons.tail,
>> +                                             ODP_MEMORDER_RLX) !=
>> cons_head))
>>                 odp_spin();
>>
>> -       r->cons.tail = cons_next;
>> +       odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
>>
>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>  }
>> @@ -409,8 +412,8 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void
>> **obj_table,
>>         unsigned i;
>>         uint32_t mask = r->prod.mask;
>>
>> -       cons_head = r->cons.head;
>> -       prod_tail = r->prod.tail;
>> +       cons_head = odp_atomic32_load(&r->cons.head, ODP_MEMORDER_RLX);
>> +       prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_ACQ);
>>         /* The subtraction is done between two unsigned 32bits value
>>          * (the result is always modulo 32 bits even if we have
>>          * cons_head > prod_tail). So 'entries' is always between 0
>> @@ -429,13 +432,12 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void
>> **obj_table,
>>         }
>>
>>         cons_next = cons_head + n;
>> -       r->cons.head = cons_next;
>> +       odp_atomic32_store(&r->cons.head, cons_next, ODP_MEMORDER_RLX);
>>
>>         /* copy in table */
>>         DEQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>> -       r->cons.tail = cons_next;
>> +       odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>  }
>>
>> @@ -482,8 +484,8 @@ int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void
>> **obj_table, unsigned n)
>>   */
>>  int odph_ring_full(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
>> ODP_MEMORDER_RLX);
>> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
>> ODP_MEMORDER_RLX);
>>         return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
>>  }
>>
>> @@ -492,8 +494,8 @@ int odph_ring_full(const odph_ring_t *r)
>>   */
>>  int odph_ring_empty(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
>> ODP_MEMORDER_RLX);
>> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
>> ODP_MEMORDER_RLX);
>>         return !!(cons_tail == prod_tail);
>>  }
>>
>> @@ -502,8 +504,8 @@ int odph_ring_empty(const odph_ring_t *r)
>>   */
>>  unsigned odph_ring_count(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
>> ODP_MEMORDER_RLX);
>> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
>> ODP_MEMORDER_RLX);
>>         return (prod_tail - cons_tail) & r->prod.mask;
>>  }
>>
>> @@ -512,8 +514,8 @@ unsigned odph_ring_count(const odph_ring_t *r)
>>   */
>>  unsigned odph_ring_free_count(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
>> ODP_MEMORDER_RLX);
>> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
>> ODP_MEMORDER_RLX);
>>         return (cons_tail - prod_tail - 1) & r->prod.mask;
>>  }
>>
>> @@ -523,10 +525,14 @@ void odph_ring_dump(const odph_ring_t *r)
>>         ODP_DBG("ring <%s>@%p\n", r->name, r);
>>         ODP_DBG("  flags=%x\n", r->flags);
>>         ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
>> -       ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
>> -       ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
>> -       ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
>> -       ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
>> +       ODP_DBG("  ct=%"PRIu32"\n", odp_atomic32_load(&r->cons.tail,
>> +                                                     ODP_MEMORDER_RLX));
>> +       ODP_DBG("  ch=%"PRIu32"\n", odp_atomic32_load(&r->cons.head,
>> +                                                     ODP_MEMORDER_RLX));
>> +       ODP_DBG("  pt=%"PRIu32"\n", odp_atomic32_load(&r->prod.tail,
>> +                                                     ODP_MEMORDER_RLX));
>> +       ODP_DBG("  ph=%"PRIu32"\n", odp_atomic32_load(&r->prod.head,
>> +                                                     ODP_MEMORDER_RLX));
>>         ODP_DBG("  used=%u\n", odph_ring_count(r));
>>         ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
>>         if (r->prod.watermark == r->prod.size)
>> diff --git a/platform/linux-generic/odp_rwlock.c
>> b/platform/linux-generic/odp_rwlock.c
>> index 11c8dd7..a5fae4d 100644
>> --- a/platform/linux-generic/odp_rwlock.c
>> +++ b/platform/linux-generic/odp_rwlock.c
>> @@ -4,58 +4,64 @@
>>   * SPDX-License-Identifier:     BSD-3-Clause
>>   */
>>
>> +#include <stdbool.h>
>>  #include <odp_atomic.h>
>>  #include <odp_rwlock.h>
>> -
>>  #include <odp_spin_internal.h>
>>
>>  void odp_rwlock_init(odp_rwlock_t *rwlock)
>>  {
>> -       rwlock->cnt = 0;
>> +       odp_atomic32_init(&rwlock->cnt, 0);
>>  }
>>
>>  void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
>>  {
>> -       int32_t cnt;
>> -       int  is_locked = 0;
>> -
>> -       while (is_locked == 0) {
>> -               cnt = rwlock->cnt;
>> -               /* waiting for read lock */
>> -               if (cnt < 0) {
>> +       bool gotit;
>> +       uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
>> +       do {
>> +               /* Wait for any writer to release lock */
>> +               while ((int32_t)cnt < 0) {
>>                         odp_spin();
>> -                       continue;
>> +                       cnt = odp_atomic32_load(&rwlock->cnt,
>> +                                               ODP_MEMORDER_RLX);
>>                 }
>> -               is_locked = odp_atomic_cmpset_u32(
>> -                                       (volatile uint32_t *)&rwlock->cnt,
>> -                                             cnt, cnt + 1);
>> -       }
>> +               /* Attempt to take another read lock */
>> +               gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
>> +                                                  &cnt, cnt + 1,
>> +                                                  ODP_MEMORDER_RLX);
>> +               /* If operation fails, 'cnt' will contain current value */
>> +       } while (!gotit);
>>  }
>>
>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
>>  {
>> -       odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>> +       /* Release one read lock by subtracting 1 */
>> +       odp_atomic32_dec(&rwlock->cnt, ODP_MEMORDER_RLS);
>>  }
>>
>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
>>  {
>> -       int32_t cnt;
>> -       int is_locked = 0;
>> -
>> -       while (is_locked == 0) {
>> -               cnt = rwlock->cnt;
>> -               /* lock aquired, wait */
>> -               if (cnt != 0) {
>> +       bool gotit;
>> +       uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
>> +       do {
>> +               /* Wait for all lock holders to release lock */
>> +               while (cnt != 0) {
>> +                       /* Lock is busy */
>>                         odp_spin();
>> -                       continue;
>> +                       cnt = odp_atomic32_load(&rwlock->cnt,
>> +                                               ODP_MEMORDER_RLX);
>>                 }
>> -               is_locked = odp_atomic_cmpset_u32(
>> -                                       (volatile uint32_t *)&rwlock->cnt,
>> -                                             0, -1);
>> -       }
>> +               /* Attempt to take write lock */
>> +               gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
>> +                                                  &cnt,
>> +                                                  (uint32_t)-1,
>> +                                                  ODP_MEMORDER_RLX);
>> +               /* If operation fails, 'cnt' will contain current value */
>> +       } while (!gotit);
>>  }
>>
>>  void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
>>  {
>> -       odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>> +       /* Release the write lock by adding 1 (wraps ~0U back to 0) */
>> +       odp_atomic32_inc(&rwlock->cnt, ODP_MEMORDER_RLS);
>>  }
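
Not part of the patch: a usage sketch of the reworked RW lock, assuming the odp_rwlock_* functions declared in odp_rwlock.h. The table and function names are hypothetical.

#include <odp_rwlock.h>

static odp_rwlock_t tbl_lock;
static uint32_t tbl[64];

static void tbl_init(void)
{
        odp_rwlock_init(&tbl_lock);
}

static uint32_t tbl_lookup(unsigned i)
{
        uint32_t v;
        odp_rwlock_read_lock(&tbl_lock);   /* shared with other readers */
        v = tbl[i];
        odp_rwlock_read_unlock(&tbl_lock);
        return v;
}

static void tbl_update(unsigned i, uint32_t v)
{
        odp_rwlock_write_lock(&tbl_lock);  /* exclusive */
        tbl[i] = v;
        odp_rwlock_write_unlock(&tbl_lock);
}
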
>> diff --git a/platform/linux-generic/odp_thread.c
>> b/platform/linux-generic/odp_thread.c
>> index b869b27..652d317 100644
>> --- a/platform/linux-generic/odp_thread.c
>> +++ b/platform/linux-generic/odp_thread.c
>> @@ -11,7 +11,7 @@
>>
>>  #include <odp_thread.h>
>>  #include <odp_internal.h>
>> -#include <odp_atomic.h>
>> +#include <odp_counter.h>
>>  #include <odp_config.h>
>>  #include <odp_debug.h>
>>  #include <odp_shared_memory.h>
>> @@ -31,7 +31,7 @@ typedef struct {
>>
>>  typedef struct {
>>         thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
>> -       odp_atomic_int_t num;
>> +       odp_counter32_t   num;
>>
>>  } thread_globals_t;
>>
>> @@ -58,6 +58,7 @@ int odp_thread_init_global(void)
>>                 return -1;
>>
>>         memset(thread_globals, 0, sizeof(thread_globals_t));
>> +       odp_counter32_init(&thread_globals->num, 0);
>>         return 0;
>>  }
>>
>> @@ -67,7 +68,7 @@ static int thread_id(void)
>>         int id;
>>         int cpu;
>>
>> -       id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
>> +       id = (int)odp_counter32_read_inc(&thread_globals->num);
>>
>>         if (id >= ODP_CONFIG_MAX_THREADS) {
>>                 ODP_ERR("Too many threads\n");
>> @@ -77,7 +78,7 @@ static int thread_id(void)
>>         cpu = sched_getcpu();
>>
>>         if (cpu < 0) {
>> -               ODP_ERR("getcpu failed\n");
>> +               ODP_ERR("sched_getcpu failed\n");
>>                 return -1;
>>         }
>>
>> diff --git a/platform/linux-generic/odp_ticketlock.c
>> b/platform/linux-generic/odp_ticketlock.c
>> index be5b885..510aa9f 100644
>> --- a/platform/linux-generic/odp_ticketlock.c
>> +++ b/platform/linux-generic/odp_ticketlock.c
>> @@ -6,15 +6,15 @@
>>
>>  #include <odp_ticketlock.h>
>>  #include <odp_atomic.h>
>> +#include <odp_counter.h>
>>  #include <odp_sync.h>
>>  #include <odp_spin_internal.h>
>>
>>
>>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>>  {
>> -       ticketlock->next_ticket = 0;
>> -       ticketlock->cur_ticket  = 0;
>> -       odp_sync_stores();
>> +       odp_counter32_init(&ticketlock->next_ticket, 0);
>> +       odp_atomic32_init(&ticketlock->cur_ticket, 0);
>>  }
>>
>>
>> @@ -22,30 +22,15 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
>>  {
>>         uint32_t ticket;
>>
>> -       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
>> +       ticket = odp_counter32_read_inc(&ticketlock->next_ticket);
>>
>> -       while (ticket != ticketlock->cur_ticket)
>> +       while (ticket != odp_atomic32_load(&ticketlock->cur_ticket,
>> +                                          ODP_MEMORDER_ACQ))
>>                 odp_spin();
>> -
>> -       odp_mem_barrier();
>>  }
>>
>>
>>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>>  {
>> -       odp_sync_stores();
>> -
>> -       ticketlock->cur_ticket++;
>> -
>> -#if defined __OCTEON__
>> -       odp_sync_stores();
>> -#else
>> -       odp_mem_barrier();
>> -#endif
>> -}
>> -
>> -
>> -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
>> -{
>> -       return ticketlock->cur_ticket != ticketlock->next_ticket;
>> +       odp_atomic32_inc(&ticketlock->cur_ticket, ODP_MEMORDER_RLS);
>>  }
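
Not part of the patch: a usage sketch of the reworked ticket lock, assuming the odp_ticketlock_init/lock/unlock functions implemented above. The counter and function names are hypothetical.

#include <odp_ticketlock.h>

static odp_ticketlock_t qlock;
static uint64_t enqueued;

static void q_init(void)
{
        odp_ticketlock_init(&qlock);
}

static void q_account(uint64_t n)
{
        odp_ticketlock_lock(&qlock);   /* threads admitted in ticket (FIFO) order */
        enqueued += n;                 /* critical section */
        odp_ticketlock_unlock(&qlock);
}
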
>> diff --git a/platform/linux-generic/odp_timer.c
>> b/platform/linux-generic/odp_timer.c
>> index 313c713..fffaa44 100644
>> --- a/platform/linux-generic/odp_timer.c
>> +++ b/platform/linux-generic/odp_timer.c
>> @@ -10,6 +10,7 @@
>>  #include <odp_buffer_pool_internal.h>
>>  #include <odp_internal.h>
>>  #include <odp_atomic.h>
>> +#include <odp_counter.h>
>>  #include <odp_spinlock.h>
>>  #include <odp_sync.h>
>>  #include <odp_debug.h>
>> @@ -32,8 +33,8 @@ typedef struct {
>>
>>  typedef struct {
>>         int               allocated;
>> -       volatile int      active;
>> -       volatile uint64_t cur_tick;
>> +       odp_atomic32_t    active;
>> +       odp_counter64_t   cur_tick;
>>         timer_t           timerid;
>>         odp_timer_t       timer_hdl;
>>         odp_buffer_pool_t pool;
>> @@ -150,16 +151,16 @@ static void notify_function(union sigval sigval)
>>
>>         timer = sigval.sival_ptr;
>>
>> -       if (timer->active == 0) {
>> +       if (odp_atomic32_load(&timer->active, ODP_MEMORDER_RLX) == 0) {
>>                 ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
>>                 return;
>>         }
>>
>>         /* ODP_DBG("Tick\n"); */
>>
>> -       cur_tick = timer->cur_tick++;
>> -
>> -       odp_sync_stores();
>> +       /* Increment and read are not atomic but we are the only writer */
>> +       odp_counter64_inc(&timer->cur_tick);
>> +       cur_tick = odp_counter64_read(&timer->cur_tick);
>>
>>         tick = &timer->tick[cur_tick % MAX_TICKS];
>>
>> @@ -308,6 +309,8 @@ odp_timer_t odp_timer_create(const char *name,
>> odp_buffer_pool_t pool,
>>
>>         timer_hdl = id + 1;
>>
>> +       odp_atomic32_init(&timer->active, 0);
>> +       odp_counter64_init(&timer->cur_tick, 0);
>>         timer->timer_hdl     = timer_hdl;
>>         timer->pool          = pool;
>>         timer->resolution_ns = resolution_ns;
>> @@ -318,8 +321,7 @@ odp_timer_t odp_timer_create(const char *name,
>> odp_buffer_pool_t pool,
>>                 timer->tick[i].list = NULL;
>>         }
>>
>> -       timer->active = 1;
>> -       odp_sync_stores();
>> +       odp_atomic32_store(&timer->active, 1, ODP_MEMORDER_RLS);
>>
>>         timer_start(timer);
>>
>> @@ -340,7 +342,7 @@ odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t
>> timer_hdl, uint64_t tmo_tick,
>>         id = (int)timer_hdl - 1;
>>         timer = &odp_timer.timer[id];
>>
>> -       cur_tick = timer->cur_tick;
>> +       cur_tick = odp_counter64_read(&timer->cur_tick);
>>         if (tmo_tick <= cur_tick) {
>>                 ODP_DBG("timeout too close\n");
>>                 return ODP_TIMER_TMO_INVALID;
>> @@ -416,7 +418,7 @@ uint64_t odp_timer_current_tick(odp_timer_t timer_hdl)
>>         uint32_t id;
>>
>>         id = timer_hdl - 1;
>> -       return odp_timer.timer[id].cur_tick;
>> +       return odp_counter64_read(&odp_timer.timer[id].cur_tick);
>>  }
>>
>>  odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
>> diff --git a/test/api_test/Makefile.am b/test/api_test/Makefile.am
>> index 5104454..478aa6c 100644
>> --- a/test/api_test/Makefile.am
>> +++ b/test/api_test/Makefile.am
>> @@ -1,12 +1,12 @@
>>  include $(top_srcdir)/test/Makefile.inc
>>
>> -bin_PROGRAMS = odp_atomic odp_shm odp_ring odp_timer_ping
>> -odp_atomic_LDFLAGS = $(AM_LDFLAGS) -static
>> +bin_PROGRAMS = odp_counter odp_shm odp_ring odp_timer_ping
>> +odp_counter_LDFLAGS = $(AM_LDFLAGS) -static
>>  odp_shm_LDFLAGS = $(AM_LDFLAGS) -static
>>  odp_ring_LDFLAGS = $(AM_LDFLAGS) -static
>>  odp_timer_ping_LDFLAGS = $(AM_LDFLAGS) -static
>>
>> -dist_odp_atomic_SOURCES = odp_atomic_test.c odp_common.c
>> +dist_odp_counter_SOURCES = odp_counter_test.c odp_common.c
>>  dist_odp_shm_SOURCES = odp_shm_test.c odp_common.c
>>  dist_odp_ring_SOURCES = odp_ring_test.c odp_common.c
>>  dist_odp_timer_ping_SOURCES = odp_timer_ping.c odp_common.c
>> diff --git a/test/api_test/odp_atomic_test.c
>> b/test/api_test/odp_atomic_test.c
>> deleted file mode 100644
>> index 9019d4f..0000000
>> --- a/test/api_test/odp_atomic_test.c
>> +++ /dev/null
>> @@ -1,362 +0,0 @@
>> -/* Copyright (c) 2013, Linaro Limited
>> - * All rights reserved.
>> - *
>> - * SPDX-License-Identifier:     BSD-3-Clause
>> - */
>> -
>> -#include <string.h>
>> -#include <sys/time.h>
>> -#include <odp_debug.h>
>> -#include <odp_common.h>
>> -#include <odp_atomic_test.h>
>> -
>> -static odp_atomic_int_t a32;
>> -static odp_atomic_u32_t a32u;
>> -static odp_atomic_u64_t a64u;
>> -
>> -static odp_atomic_int_t numthrds;
>> -
>> -static const char * const test_name[] = {
>> -       "dummy",
>> -       "test atomic basic ops add/sub/inc/dec",
>> -       "test atomic inc/dec of signed word",
>> -       "test atomic add/sub of signed word",
>> -       "test atomic inc/dec of unsigned word",
>> -       "test atomic add/sub of unsigned word",
>> -       "test atomic inc/dec of unsigned double word",
>> -       "test atomic add/sub of unsigned double word"
>> -};
>> -
>> -static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
>> -
>> -static void usage(void)
>> -{
>> -       printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
>> -              "\t<testcase> is\n"
>> -              "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
>> -              "\t\t2 - Test inc dec of signed word\n"
>> -              "\t\t3 - Test add sub of signed word\n"
>> -              "\t\t4 - Test inc dec of unsigned word\n"
>> -              "\t\t5 - Test add sub of unsigned word\n"
>> -              "\t\t6 - Test inc dec of double word\n"
>> -              "\t\t7 - Test add sub of double word\n"
>> -              "\t<num of pthread> is optional\n"
>> -              "\t\t<1 - 31> - no of pthreads to start\n"
>> -              "\t\tif user doesn't specify this option, then\n"
>> -              "\t\tno of pthreads created is equivalent to no of cores\n"
>> -              "\t\tavailable in the system\n"
>> -              "\tExample usage:\n"
>> -              "\t\t./odp_atomic -t 2\n"
>> -              "\t\t./odp_atomic -t 3 -n 12\n");
>> -}
>> -
>> -void test_atomic_inc_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < CNT; i++)
>> -               odp_atomic_inc_int(&a32);
>> -}
>> -
>> -void test_atomic_inc_u32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < CNT; i++)
>> -               odp_atomic_inc_u32(&a32u);
>> -}
>> -
>> -void test_atomic_inc_64(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < CNT; i++)
>> -               odp_atomic_inc_u64(&a64u);
>> -}
>> -
>> -void test_atomic_dec_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < CNT; i++)
>> -               odp_atomic_dec_int(&a32);
>> -}
>> -
>> -void test_atomic_dec_u32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < CNT; i++)
>> -               odp_atomic_dec_u32(&a32u);
>> -}
>> -
>> -void test_atomic_dec_64(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < CNT; i++)
>> -               odp_atomic_dec_u64(&a64u);
>> -}
>> -
>> -void test_atomic_add_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
>> -}
>> -
>> -void test_atomic_add_u32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
>> -}
>> -
>> -void test_atomic_add_64(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
>> -}
>> -
>> -void test_atomic_sub_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
>> -}
>> -
>> -void test_atomic_sub_u32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
>> -}
>> -
>> -void test_atomic_sub_64(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
>> -}
>> -
>> -void test_atomic_inc_dec_32(void)
>> -{
>> -       test_atomic_inc_32();
>> -       test_atomic_dec_32();
>> -}
>> -
>> -void test_atomic_add_sub_32(void)
>> -{
>> -       test_atomic_add_32();
>> -       test_atomic_sub_32();
>> -}
>> -
>> -void test_atomic_inc_dec_u32(void)
>> -{
>> -       test_atomic_inc_u32();
>> -       test_atomic_dec_u32();
>> -}
>> -
>> -void test_atomic_add_sub_u32(void)
>> -{
>> -       test_atomic_add_u32();
>> -       test_atomic_sub_u32();
>> -}
>> -
>> -void test_atomic_inc_dec_64(void)
>> -{
>> -       test_atomic_inc_64();
>> -       test_atomic_dec_64();
>> -}
>> -
>> -void test_atomic_add_sub_64(void)
>> -{
>> -       test_atomic_add_64();
>> -       test_atomic_sub_64();
>> -}
>> -
>> -/**
>> - * Test basic atomic operation like
>> - * add/sub/increment/decrement operation.
>> - */
>> -void test_atomic_basic(void)
>> -{
>> -       test_atomic_inc_32();
>> -       test_atomic_dec_32();
>> -       test_atomic_add_32();
>> -       test_atomic_sub_32();
>> -
>> -       test_atomic_inc_u32();
>> -       test_atomic_dec_u32();
>> -       test_atomic_add_u32();
>> -       test_atomic_sub_u32();
>> -
>> -       test_atomic_inc_64();
>> -       test_atomic_dec_64();
>> -       test_atomic_add_64();
>> -       test_atomic_sub_64();
>> -}
>> -
>> -void test_atomic_init(void)
>> -{
>> -       odp_atomic_init_int(&a32);
>> -       odp_atomic_init_u32(&a32u);
>> -       odp_atomic_init_u64(&a64u);
>> -}
>> -
>> -void test_atomic_store(void)
>> -{
>> -       odp_atomic_store_int(&a32, S32_INIT_VAL);
>> -       odp_atomic_store_u32(&a32u, U32_INIT_VAL);
>> -       odp_atomic_store_u64(&a64u, U64_INIT_VAL);
>> -}
>> -
>> -int test_atomic_validate(void)
>> -{
>> -       if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
>> -               ODP_ERR("Atomic signed 32 usual functions failed\n");
>> -               return -1;
>> -       }
>> -
>> -       if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
>> -               ODP_ERR("Atomic u32 usual functions failed\n");
>> -               return -1;
>> -       }
>> -
>> -       if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
>> -               ODP_ERR("Atomic u64 usual functions failed\n");
>> -               return -1;
>> -       }
>> -
>> -       return 0;
>> -}
>> -
>> -static void *run_thread(void *arg)
>> -{
>> -       pthrd_arg *parg = (pthrd_arg *)arg;
>> -       int thr;
>> -
>> -       thr = odp_thread_id();
>> -
>> -       ODP_DBG("Thread %i starts\n", thr);
>> -
>> -       odp_atomic_inc_int(&numthrds);
>> -
>> -       /* Wait here until all pthreads are created */
>> -       while (*(volatile int *)&numthrds < parg->numthrds)
>> -               ;
>> -
>> -       gettimeofday(&tv0[thr], NULL);
>> -
>> -       switch (parg->testcase) {
>> -       case TEST_MIX:
>> -               test_atomic_basic();
>> -               break;
>> -       case TEST_INC_DEC_S32:
>> -               test_atomic_inc_dec_32();
>> -               break;
>> -       case TEST_ADD_SUB_S32:
>> -               test_atomic_add_sub_32();
>> -               break;
>> -       case TEST_INC_DEC_U32:
>> -               test_atomic_inc_dec_u32();
>> -               break;
>> -       case TEST_ADD_SUB_U32:
>> -               test_atomic_add_sub_u32();
>> -               break;
>> -       case TEST_INC_DEC_64:
>> -               test_atomic_inc_dec_64();
>> -               break;
>> -       case TEST_ADD_SUB_64:
>> -               test_atomic_add_sub_64();
>> -               break;
>> -       }
>> -       gettimeofday(&tv1[thr], NULL);
>> -       fflush(NULL);
>> -
>> -       printf("Time taken in thread %02d to complete op is %lld usec\n",
>> thr,
>> -              (tv1[thr].tv_sec - tv0[thr].tv_sec) * 1000000ULL +
>> -              (tv1[thr].tv_usec - tv0[thr].tv_usec));
>> -
>> -       return parg;
>> -}
>> -
>> -int main(int argc, char *argv[])
>> -{
>> -       pthrd_arg thrdarg;
>> -       int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
>> -       char c;
>> -       int result;
>> -
>> -       if (argc == 1 || argc % 2 == 0) {
>> -               usage();
>> -               goto err_exit;
>> -       }
>> -       if (odp_test_global_init() != 0)
>> -               goto err_exit;
>> -       odp_print_system_info();
>> -
>> -       while (cnt != 0) {
>> -               sscanf(argv[++i], "-%c", &c);
>> -               switch (c) {
>> -               case 't':
>> -                       sscanf(argv[++i], "%d", &test_type);
>> -                       break;
>> -               case 'n':
>> -                       sscanf(argv[++i], "%d", &pthrdnum);
>> -                       break;
>> -               default:
>> -                       ODP_ERR("Invalid option %c\n", c);
>> -                       usage();
>> -                       goto err_exit;
>> -               }
>> -               if (test_type < TEST_MIX || test_type > TEST_MAX ||
>> -                   pthrdnum > odp_sys_core_count()) {
>> -                       usage();
>> -                       goto err_exit;
>> -               }
>> -               cnt -= 2;
>> -       }
>> -       if (pthrdnum == 0)
>> -               pthrdnum = odp_sys_core_count();
>> -
>> -       odp_atomic_init_int(&numthrds);
>> -       test_atomic_init();
>> -       test_atomic_store();
>> -
>> -       memset(&thrdarg, 0, sizeof(pthrd_arg));
>> -       thrdarg.testcase = test_type;
>> -       thrdarg.numthrds = pthrdnum;
>> -
>> -       if ((test_type > 0) && (test_type < TEST_MAX)) {
>> -               printf("%s\n", test_name[test_type]);
>> -       } else {
>> -               ODP_ERR("Invalid test case [%d]\n", test_type);
>> -               usage();
>> -               goto err_exit;
>> -       }
>> -       odp_test_thread_create(run_thread, &thrdarg);
>> -
>> -       odp_test_thread_exit(&thrdarg);
>> -
>> -       result = test_atomic_validate();
>> -
>> -       if (result == 0) {
>> -               printf("%s_%d_%d Result:pass\n",
>> -                      test_name[test_type], test_type, pthrdnum);
>> -       } else {
>> -               printf("%s_%d_%d Result:fail\n",
>> -                      test_name[test_type], test_type, pthrdnum);
>> -       }
>> -       return 0;
>> -
>> -err_exit:
>> -       return -1;
>> -}
>> diff --git a/test/api_test/odp_atomic_test.h
>> b/test/api_test/odp_atomic_test.h
>> deleted file mode 100644
>> index 7814da5..0000000
>> --- a/test/api_test/odp_atomic_test.h
>> +++ /dev/null
>> @@ -1,60 +0,0 @@
>> -/* Copyright (c) 2013, Linaro Limited
>> - * All rights reserved.
>> - *
>> - * SPDX-License-Identifier:     BSD-3-Clause
>> - */
>> -
>> -#ifndef ODP_ATOMIC_TEST_H_
>> -#define ODP_ATOMIC_TEST_H_
>> -
>> -#include <odp.h>
>> -#include <odph_linux.h>
>> -
>> -/**
>> - * add_sub_cnt could be any valid value
>> - * so to excercise explicit atomic_add/sub
>> - * ops. For now using 5..
>> - */
>> -#define ADD_SUB_CNT    5
>> -
>> -#define        CNT 500000
>> -#define        S32_INIT_VAL    (1UL << 10)
>> -#define        U32_INIT_VAL    (1UL << 10)
>> -#define        U64_INIT_VAL    (1ULL << 33)
>> -
>> -typedef enum {
>> -       TEST_MIX = 1, /* Must be first test case num */
>> -       TEST_INC_DEC_S32,
>> -       TEST_ADD_SUB_S32,
>> -       TEST_INC_DEC_U32,
>> -       TEST_ADD_SUB_U32,
>> -       TEST_INC_DEC_64,
>> -       TEST_ADD_SUB_64,
>> -       TEST_MAX,
>> -} odp_test_atomic_t;
>> -
>> -
>> -void test_atomic_inc_dec_32(void);
>> -void test_atomic_add_sub_32(void);
>> -void test_atomic_inc_dec_u32(void);
>> -void test_atomic_add_sub_u32(void);
>> -void test_atomic_inc_dec_64(void);
>> -void test_atomic_add_sub_64(void);
>> -void test_atomic_inc_32(void);
>> -void test_atomic_dec_32(void);
>> -void test_atomic_add_32(void);
>> -void test_atomic_sub_32(void);
>> -void test_atomic_inc_u32(void);
>> -void test_atomic_dec_u32(void);
>> -void test_atomic_add_u32(void);
>> -void test_atomic_sub_u32(void);
>> -void test_atomic_inc_64(void);
>> -void test_atomic_dec_64(void);
>> -void test_atomic_add_64(void);
>> -void test_atomic_sub_64(void);
>> -void test_atomic_init(void);
>> -void test_atomic_basic(void);
>> -void test_atomic_store(void);
>> -int test_atomic_validate(void);
>> -
>> -#endif /* ODP_ATOMIC_TEST_H_ */
>> diff --git a/test/api_test/odp_common.c b/test/api_test/odp_common.c
>> index ed1fc97..198fe8f 100644
>> --- a/test/api_test/odp_common.c
>> +++ b/test/api_test/odp_common.c
>> @@ -14,7 +14,6 @@
>>  #include <odp.h>
>>  #include <odph_linux.h>
>>  #include <odp_common.h>
>> -#include <odp_atomic_test.h>
>>  #include <odp_shm_test.h>
>>
>>
>> diff --git a/test/api_test/odp_counter_test.c
>> b/test/api_test/odp_counter_test.c
>> new file mode 100644
>> index 0000000..c72328e
>> --- /dev/null
>> +++ b/test/api_test/odp_counter_test.c
>> @@ -0,0 +1,361 @@
>> +/* Copyright (c) 2013, Linaro Limited
>> + * All rights reserved.
>> + *
>> + * SPDX-License-Identifier:     BSD-3-Clause
>> + */
>> +
>> +#include <string.h>
>> +#include <sys/time.h>
>> +#include <odp.h>
>> +#include <odp_debug.h>
>> +#include <odp_common.h>
>> +#include <odph_linux.h>
>> +
>> +/**
>> + * ADD_SUB_CNT can be any valid value,
>> + * so as to exercise the explicit add/sub
>> + * ops. For now using 5.
>> + */
>> +#define ADD_SUB_CNT    5
>> +
>> +#define        CNT 500000
>> +#define        U32_INIT_VAL    (1UL << 10)
>> +#define        U64_INIT_VAL    (1ULL << 33)
>> +
>> +typedef enum {
>> +       TEST_MIX = 1, /* Must be first test case num */
>> +       TEST_INC_DEC_U32 = 2,
>> +       TEST_ADD_SUB_U32 = 3,
>> +       TEST_INC_DEC_64 = 4,
>> +       TEST_ADD_SUB_64 = 5,
>> +       TEST_MAX,
>> +} odp_test_counter_t;
>> +
>> +
>> +static uint32_t test_counter_inc_dec_u32(void);
>> +static uint32_t test_counter_add_sub_u32(void);
>> +static uint32_t test_counter_inc_dec_64(void);
>> +static uint32_t test_counter_add_sub_64(void);
>> +static uint32_t test_counter_inc_u32(void);
>> +static uint32_t test_counter_dec_u32(void);
>> +static uint32_t test_counter_add_u32(void);
>> +static uint32_t test_counter_sub_u32(void);
>> +static uint32_t test_counter_inc_64(void);
>> +static uint32_t test_counter_dec_64(void);
>> +static uint32_t test_counter_add_64(void);
>> +static uint32_t test_counter_sub_64(void);
>> +static void test_counter_init(void);
>> +static uint32_t test_counter_basic(void);
>> +static void test_counter_write(void);
>> +static int test_counter_validate(void);
>> +
>> +static odp_counter32_t a32u;
>> +static odp_counter64_t a64u;
>> +
>> +static odp_barrier_t barrier;
>> +
>> +static const char * const test_name[] = {
>> +       "dummy",
>> +       "test atomic counter basic ops add/sub/inc/dec",
>> +       "test atomic inc/dec of 32-bit counter",
>> +       "test atomic add/sub of 32-bit counter",
>> +       "test atomic inc/dec of 64-bit counter",
>> +       "test atomic add/sub of 64-bit counter"
>> +};
>> +
>> +static uint64_t accops[MAX_WORKERS];
>> +
>> +static void usage(void)
>> +{
>> +       printf("\n./odp_counter -t <testcase> -n <num of threads>\n\n"
>> +              "\t<testcase> is\n"
>> +              "\t\t1 - Test mix (inc/dec/add/sub on 32- and 64-bit
>> counters)\n"
>> +              "\t\t2 - Test inc/dec of 32-bit counter\n"
>> +              "\t\t3 - Test add/sub of 32-bit counter\n"
>> +              "\t\t4 - Test inc/dec of 64-bit counter\n"
>> +              "\t\t5 - Test add/sub of 64-bit counter\n"
>> +              "\t<num of thread> is optional\n"
>> +              "\t\t<1 - 31> - no of threads to start\n"
>> +              "\t\tif user doesn't specify this option, then\n"
>> +              "\t\tno of threads created is equivalent to no of cores\n"
>> +              "\t\tavailable in the system\n"
>> +              "\tExample usage:\n"
>> +              "\t\t./odp_counter -t 2\n"
>> +              "\t\t./odp_counter -t 3 -n 12\n");
>> +}
>> +
>> +static uint32_t test_counter_inc_u32(void)
>> +{
>> +       int i;
>> +
>> +       for (i = 0; i < CNT; i++)
>> +               odp_counter32_inc(&a32u);
>> +       return i;
>> +}
>> +
>> +static uint32_t test_counter_inc_64(void)
>> +{
>> +       int i;
>> +
>> +       for (i = 0; i < CNT; i++)
>> +               odp_counter64_inc(&a64u);
>> +       return i;
>> +}
>> +
>> +static uint32_t test_counter_dec_u32(void)
>> +{
>> +       int i;
>> +
>> +       for (i = 0; i < CNT; i++)
>> +               odp_counter32_add(&a32u, (uint32_t)-1);
>> +       return i;
>> +}
>> +
>> +static uint32_t test_counter_dec_64(void)
>> +{
>> +       int i;
>> +
>> +       for (i = 0; i < CNT; i++)
>> +               odp_counter64_add(&a64u, (uint64_t)-1);
>> +       return i;
>> +}
>> +
>> +static uint32_t test_counter_add_u32(void)
>> +{
>> +       int i;
>> +
>> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> +               odp_counter32_add(&a32u, ADD_SUB_CNT);
>> +       return i;
>> +}
>> +
>> +static uint32_t test_counter_add_64(void)
>> +{
>> +       int i;
>> +
>> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> +               odp_counter64_add(&a64u, ADD_SUB_CNT);
>> +       return i;
>> +}
>> +
>> +static uint32_t test_counter_sub_u32(void)
>> +{
>> +       int i;
>> +
>> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> +               odp_counter32_add(&a32u, -ADD_SUB_CNT);
>> +       return i;
>> +}
>> +
>> +static uint32_t test_counter_sub_64(void)
>> +{
>> +       int i;
>> +
>> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> +               odp_counter64_add(&a64u, -ADD_SUB_CNT);
>> +       return i;
>> +}
>> +
>> +static uint32_t test_counter_inc_dec_u32(void)
>> +{
>> +       uint32_t nops = 0;
>> +       nops += test_counter_inc_u32();
>> +       nops += test_counter_dec_u32();
>> +       return nops;
>> +}
>> +
>> +static uint32_t test_counter_add_sub_u32(void)
>> +{
>> +       uint32_t nops = 0;
>> +       nops += test_counter_add_u32();
>> +       nops += test_counter_sub_u32();
>> +       return nops;
>> +}
>> +
>> +static uint32_t test_counter_inc_dec_64(void)
>> +{
>> +       uint32_t nops = 0;
>> +       nops += test_counter_inc_64();
>> +       nops += test_counter_dec_64();
>> +       return nops;
>> +}
>> +
>> +static uint32_t test_counter_add_sub_64(void)
>> +{
>> +       uint32_t nops = 0;
>> +       nops += test_counter_add_64();
>> +       nops += test_counter_sub_64();
>> +       return nops;
>> +}
>> +
>> +/**
>> + * Test basic counter operation like
>> + * add/sub/increment/decrement operation.
>> + */
>> +static uint32_t test_counter_basic(void)
>> +{
>> +       uint32_t nops = 0;
>> +       nops += test_counter_inc_u32();
>> +       nops += test_counter_dec_u32();
>> +       nops += test_counter_add_u32();
>> +       nops += test_counter_sub_u32();
>> +
>> +       nops += test_counter_inc_64();
>> +       nops += test_counter_dec_64();
>> +       nops += test_counter_add_64();
>> +       nops += test_counter_sub_64();
>> +
>> +       return nops;
>> +}
>> +
>> +static void test_counter_init(void)
>> +{
>> +       odp_counter32_init(&a32u, 0);
>> +       odp_counter64_init(&a64u, 0);
>> +}
>> +
>> +static void test_counter_write(void)
>> +{
>> +       odp_counter32_write(&a32u, U32_INIT_VAL);
>> +       odp_counter64_write(&a64u, U64_INIT_VAL);
>> +}
>> +
>> +static int test_counter_validate(void)
>> +{
>> +       if (odp_counter32_read(&a32u) != U32_INIT_VAL) {
>> +               ODP_ERR("Atomic u32 usual functions failed\n");
>> +               return -1;
>> +       }
>> +
>> +       if (odp_counter64_read(&a64u) != U64_INIT_VAL) {
>> +               ODP_ERR("Atomic u64 usual functions failed\n");
>> +               return -1;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static void *run_thread(void *arg)
>> +{
>> +       pthrd_arg *parg = (pthrd_arg *)arg;
>> +       int thr;
>> +       uint64_t nops = 0;
>> +       struct timeval tv0, tv1;
>> +
>> +       thr = odp_thread_id();
>> +
>> +       ODP_DBG("Thread %i starts\n", thr);
>> +
>> +       /* Wait here until all threads have arrived */
>> +       /* Use multiple barriers to verify that it handles wrap around and
>> +        * has no race conditions which could be exposed when invoked back-
>> +        * to-back */
>> +       odp_barrier_sync(&barrier);
>> +       odp_barrier_sync(&barrier);
>> +       odp_barrier_sync(&barrier);
>> +       odp_barrier_sync(&barrier);
>> +
>> +       gettimeofday(&tv0, NULL);
>> +
>> +       switch (parg->testcase) {
>> +       case TEST_MIX:
>> +               nops += test_counter_basic();
>> +               break;
>> +       case TEST_INC_DEC_U32:
>> +               nops += test_counter_inc_dec_u32();
>> +               break;
>> +       case TEST_ADD_SUB_U32:
>> +               nops += test_counter_add_sub_u32();
>> +               break;
>> +       case TEST_INC_DEC_64:
>> +               nops += test_counter_inc_dec_64();
>> +               break;
>> +       case TEST_ADD_SUB_64:
>> +               nops += test_counter_add_sub_64();
>> +               break;
>> +       }
>> +       gettimeofday(&tv1, NULL);
>> +       accops[thr] = nops;
>> +       fflush(NULL);
>> +
>> +       uint64_t usecs = (tv1.tv_sec - tv0.tv_sec) * 1000000ULL +
>> +                        tv1.tv_usec - tv0.tv_usec;
>> +       printf("Time taken in thread %02d to complete %"PRIu64" op is "
>> +              "%"PRIu64" usec, %"PRIu64" ns/op\n",
>> +              thr, nops, usecs, 1000 * usecs / nops);
>> +
>> +       return parg;
>> +}
>> +
>> +int main(int argc, char *argv[])
>> +{
>> +       pthrd_arg thrdarg;
>> +       int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
>> +       char c;
>> +       int result;
>> +
>> +       if (argc == 1 || argc % 2 == 0) {
>> +               usage();
>> +               goto err_exit;
>> +       }
>> +       if (odp_test_global_init() != 0)
>> +               goto err_exit;
>> +       odp_print_system_info();
>> +
>> +       while (cnt != 0) {
>> +               sscanf(argv[++i], "-%c", &c);
>> +               switch (c) {
>> +               case 't':
>> +                       sscanf(argv[++i], "%d", &test_type);
>> +                       break;
>> +               case 'n':
>> +                       sscanf(argv[++i], "%d", &pthrdnum);
>> +                       break;
>> +               default:
>> +                       ODP_ERR("Invalid option %c\n", c);
>> +                       usage();
>> +                       goto err_exit;
>> +               }
>> +               if (test_type < TEST_MIX || test_type > TEST_MAX ||
>> +                   pthrdnum > odp_sys_core_count()) {
>> +                       usage();
>> +                       goto err_exit;
>> +               }
>> +               cnt -= 2;
>> +       }
>> +       if (pthrdnum == 0)
>> +               pthrdnum = odp_sys_core_count();
>> +
>> +       test_counter_init();
>> +       test_counter_write();
>> +
>> +       memset(&thrdarg, 0, sizeof(pthrd_arg));
>> +       thrdarg.testcase = test_type;
>> +       thrdarg.numthrds = pthrdnum;
>> +
>> +       if ((test_type > 0) && (test_type < TEST_MAX)) {
>> +               printf("%s\n", test_name[test_type]);
>> +       } else {
>> +               ODP_ERR("Invalid test case [%d]\n", test_type);
>> +               usage();
>> +               goto err_exit;
>> +       }
>> +       odp_barrier_init(&barrier, pthrdnum);
>> +       odp_test_thread_create(run_thread, &thrdarg);
>> +
>> +       odp_test_thread_exit(&thrdarg);
>> +
>> +       result = test_counter_validate();
>> +
>> +       if (result == 0) {
>> +               printf("%s_%d_%d Result:pass\n",
>> +                      test_name[test_type], test_type, pthrdnum);
>> +       } else {
>> +               printf("%s_%d_%d Result:fail\n",
>> +                      test_name[test_type], test_type, pthrdnum);
>> +       }
>> +       return 0;
>> +
>> +err_exit:
>> +       return -1;
>> +}
>> --
>> 1.9.1
>>
>>
>
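A minimal usage sketch (not part of the patch) of the relaxed-memory-order counter API that odp_counter_test.c above exercises. The odp_counter64_t type and the odp_counter64_init/inc/add/read calls are taken from the patch; the stats struct, its field names and the call sites are illustrative assumptions only.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <odp.h>

/* Hypothetical per-application statistics block */
static struct {
	odp_counter64_t rx_pkts;	/* packets seen by any worker */
	odp_counter64_t rx_bytes;	/* bytes seen by any worker */
} stats;

static void stats_init(void)
{
	odp_counter64_init(&stats.rx_pkts, 0);
	odp_counter64_init(&stats.rx_bytes, 0);
}

/* Safe to call concurrently from any number of worker threads */
static void stats_account(uint32_t pkt_len)
{
	odp_counter64_inc(&stats.rx_pkts);
	odp_counter64_add(&stats.rx_bytes, pkt_len);
}

/* Reads are relaxed, so this is a statistical snapshot, not a
 * synchronized view of both counters at one instant */
static void stats_report(void)
{
	printf("pkts=%" PRIu64 " bytes=%" PRIu64 "\n",
	       odp_counter64_read(&stats.rx_pkts),
	       odp_counter64_read(&stats.rx_bytes));
}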
Mike Holmes Nov. 4, 2014, 1:58 p.m. UTC | #5
Ola, you need to get someone to add Reviewed-by (did a proper visual
inspection) or Tested-by (executed a binary and nothing blew up) to the
patch.

Bill, if you reviewed it you could add Reviewed-by and then Maxim can pull
it in; normally that is done on a line below the Signed-off-by.

Mike

On 4 November 2014 08:51, Bill Fischofer <bill.fischofer@linaro.org> wrote:

> +1  Merge and refine if needed. Time is ticking.
>
> On Tue, Nov 4, 2014 at 7:48 AM, Ola Liljedahl <ola.liljedahl@linaro.org>
> wrote:
>
>> Ping!
>>
>> I really need this new working atomics support merged ASAP because I have
>> a new lock-less implementation of the timer API which uses atomic
>> operations. I haven't seen any real criticism against the content of the
>> patch so there is nothing to change.
>>
>> -- Ola
>>
>>
>> On 20 October 2014 15:07, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>>
>>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>>> ---
>>> Added header file odp_counter.h with support for 32- and 64-bit atomic
>>> counters
>>> using relaxed memory order. 6 operations
>>> (init/read/write/add/read_inc/inc) on
>>> 32-bit and 64-bit counters respectively.
>>> Renamed odp_atomic_test to odp_counter_test and changed to use
>>> odp_counter.h
>>>
>>> Implementation of C11-based memory model for atomic operations. 10
>>> operations
>>> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
>>> in
>>> odp_atomic.h. The required memory ordering is now a parameter to each
>>> call just
>>> like in C11.
>>>
>>> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
>>> fall back to GCC __sync builtins which often include unnecessarily heavy
>>> barrier/sync operations (always sequentially consistent).
>>>
>>> Attempt to remove all explicit memory barriers (odp_sync_stores) from
>>> code that
>>> implements multithreaded synchronization primitives (e.g. locks,
>>> barriers).
>>> Rewrote such primitives to use the new atomic operations.
>>>
>>> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
>>> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
>>> enqueue/dequeue
>>> (missing release barrier, had only compiler barrier).
>>>
>>>  .gitignore                                         |   2 +-
>>>  example/generator/odp_generator.c                  |  43 +-
>>>  example/ipsec/odp_ipsec.c                          |   2 +-
>>>  example/odp_example/odp_example.c                  |   2 +-
>>>  example/timer/odp_timer_test.c                     |   2 +-
>>>  helper/include/odph_ring.h                         |   8 +-
>>>  platform/linux-generic/include/api/odp.h           |   1 +
>>>  platform/linux-generic/include/api/odp_atomic.h    | 838
>>> +++++++++++----------
>>>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>>>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>>>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>>>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>>>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>>>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>>>  platform/linux-generic/odp_barrier.c               |  49 +-
>>>  platform/linux-generic/odp_buffer.c                |   3 +-
>>>  platform/linux-generic/odp_crypto.c                |   7 +-
>>>  platform/linux-generic/odp_queue.c                 |   7 +-
>>>  platform/linux-generic/odp_ring.c                  |  94 +--
>>>  platform/linux-generic/odp_rwlock.c                |  62 +-
>>>  platform/linux-generic/odp_thread.c                |   9 +-
>>>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>>>  platform/linux-generic/odp_timer.c                 |  22 +-
>>>  test/api_test/Makefile.am                          |   6 +-
>>>  test/api_test/odp_atomic_test.c                    | 362 ---------
>>>  test/api_test/odp_atomic_test.h                    |  60 --
>>>  test/api_test/odp_common.c                         |   1 -
>>>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>>>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>>>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>>>  delete mode 100644 test/api_test/odp_atomic_test.c
>>>  delete mode 100644 test/api_test/odp_atomic_test.h
>>>  create mode 100644 test/api_test/odp_counter_test.c
>>>
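Before the diff itself, a minimal sketch (not part of the patch) of how the explicitly ordered odp_atomic32_* operations introduced in odp_atomic.h below can be composed into a ticket lock, in the spirit of the ticketlock rewrite described in the commit message. The odp_atomic32_* calls and ODP_MEMORDER_* orders are taken from the patch; the lock type, field and function names here are illustrative assumptions and do not reproduce the actual odp_ticketlock.c changes.

#include <stdint.h>
#include <odp.h>

typedef struct {
	odp_atomic32_t next_ticket;	/* next ticket to hand out */
	odp_atomic32_t cur_ticket;	/* ticket currently being served */
} example_ticketlock_t;

static void example_ticketlock_init(example_ticketlock_t *lock)
{
	odp_atomic32_init(&lock->next_ticket, 0);
	odp_atomic32_init(&lock->cur_ticket, 0);
}

static void example_ticketlock_lock(example_ticketlock_t *lock)
{
	/* Taking a ticket needs atomicity but no ordering */
	uint32_t ticket = odp_atomic32_fetch_inc(&lock->next_ticket,
						 ODP_MEMORDER_RLX);

	/* Acquire load: accesses in the critical section cannot move above
	 * the point where we observe that our ticket is being served */
	while (odp_atomic32_load(&lock->cur_ticket, ODP_MEMORDER_ACQ) != ticket)
		; /* spin */
}

static void example_ticketlock_unlock(example_ticketlock_t *lock)
{
	/* Release: critical-section accesses cannot move below this point */
	(void)odp_atomic32_fetch_add(&lock->cur_ticket, 1, ODP_MEMORDER_RLS);
}

/* odp_atomic32_cmp_xchg_weak() may fail spuriously, so lock-less updates
 * retry in a loop; on failure 'old' is refreshed with the current value.
 * example_atomic32_or() is an illustrative helper, not part of the patch. */
static void example_atomic32_or(odp_atomic32_t *a, uint32_t bits)
{
	uint32_t old = odp_atomic32_load(a, ODP_MEMORDER_RLX);

	while (!odp_atomic32_cmp_xchg_weak(a, &old, old | bits,
					   ODP_MEMORDER_RLX))
		; /* retry with the updated 'old' */
}

The acquire ordering on the spinning load corresponds to the missing acquire barrier that the commit message says was fixed in odp_ticketlock_lock().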
>>> diff --git a/.gitignore b/.gitignore
>>> index 6342e34..77db4d6 100644
>>> --- a/.gitignore
>>> +++ b/.gitignore
>>> @@ -35,7 +35,7 @@ build/
>>>  odp_example
>>>  odp_packet
>>>  odp_packet_netmap
>>> -odp_atomic
>>> +odp_counter
>>>  odp_shm
>>>  odp_ring
>>>  odp_timer_ping
>>> diff --git a/example/generator/odp_generator.c
>>> b/example/generator/odp_generator.c
>>> index eb8b340..252157d 100644
>>> --- a/example/generator/odp_generator.c
>>> +++ b/example/generator/odp_generator.c
>>> @@ -62,10 +62,10 @@ typedef struct {
>>>   * counters
>>>  */
>>>  static struct {
>>> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
>>> -       odp_atomic_u64_t ip;    /**< ip packets */
>>> -       odp_atomic_u64_t udp;   /**< udp packets */
>>> -       odp_atomic_u64_t icmp;  /**< icmp packets */
>>> +       odp_counter64_t seq;    /**< ip seq to be send */
>>> +       odp_counter64_t ip;     /**< ip packets */
>>> +       odp_counter64_t udp;    /**< udp packets */
>>> +       odp_counter64_t icmp;   /**< icmp packets */
>>>  } counters;
>>>
>>>  /** * Thread specific arguments
>>> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_UDPHDR_LEN +
>>>                                        ODPH_IPV4HDR_LEN);
>>>         ip->proto = ODPH_IPPROTO_UDP;
>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
>>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>>>         ip->id = odp_cpu_to_be_16(seq);
>>>         ip->chksum = 0;
>>>         odph_ipv4_csum_update(pkt);
>>> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_ICMPHDR_LEN +
>>>                                        ODPH_IPV4HDR_LEN);
>>>         ip->proto = ODPH_IPPROTO_ICMP;
>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
>>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>>>         ip->id = odp_cpu_to_be_16(seq);
>>>         ip->chksum = 0;
>>>         odph_ipv4_csum_update(pkt);
>>> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>>>                 }
>>>
>>>                 if (args->appl.interval != 0) {
>>> +                       uint64_t seq = odp_counter64_read(&counters.seq);
>>>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
>>> -                              thr, counters.seq, counters.seq%0xffff);
>>> +                              thr, seq, seq%0xffff);
>>>                         /* TODO use odp timer */
>>>                         usleep(args->appl.interval * 1000);
>>>                 }
>>> -               if (args->appl.number != -1 && counters.seq
>>> -                   >= (unsigned int)args->appl.number) {
>>> +               if (args->appl.number != -1 &&
>>> +                   odp_counter64_read(&counters.seq) >=
>>> +                   (unsigned int)args->appl.number) {
>>>                         break;
>>>                 }
>>>         }
>>> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>>>         /* receive number of reply pks until timeout */
>>>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>>>                 while (args->appl.timeout >= 0) {
>>> -                       if (counters.icmp >= (unsigned
>>> int)args->appl.number)
>>> +                       if (odp_counter64_read(&counters.icmp) >=
>>> +                           (unsigned int)args->appl.number)
>>>                                 break;
>>>                         /* TODO use odp timer */
>>>                         sleep(1);
>>> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>>>
>>>         /* print info */
>>>         if (args->appl.mode == APPL_MODE_UDP) {
>>> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
>>> +               printf("  [%02i] total send: %ju\n", thr,
>>> +                      odp_counter64_read(&counters.seq));
>>>         } else if (args->appl.mode == APPL_MODE_PING) {
>>>                 printf("  [%02i] total send: %ju total receive: %ju\n",
>>> -                      thr, counters.seq, counters.icmp);
>>> +                      thr, odp_counter64_read(&counters.seq),
>>> +                      odp_counter64_read(&counters.icmp));
>>>         }
>>>         return arg;
>>>  }
>>> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
>>> pkt_tbl[], unsigned len)
>>>                 if (!odp_packet_inflag_ipv4(pkt))
>>>                         continue;
>>>
>>> -               odp_atomic_inc_u64(&counters.ip);
>>> +               odp_counter64_inc(&counters.ip);
>>>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>>>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>>>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
>>> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
>>> pkt_tbl[], unsigned len)
>>>
>>>                 /* udp */
>>>                 if (ip->proto == ODPH_IPPROTO_UDP) {
>>> -                       odp_atomic_inc_u64(&counters.udp);
>>> +                       odp_counter64_inc(&counters.udp);
>>>                         udp = (odph_udphdr_t *)(buf + offset);
>>>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>>>                                         odp_be_to_cpu_16(udp->length) -
>>> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
>>> pkt_tbl[], unsigned len)
>>>                         icmp = (odph_icmphdr_t *)(buf + offset);
>>>                         /* echo reply */
>>>                         if (icmp->type == ICMP_ECHOREPLY) {
>>> -                               odp_atomic_inc_u64(&counters.icmp);
>>> +                               odp_counter64_inc(&counters.icmp);
>>>                                 memcpy(&tvsend, buf + offset + ODPH_ICMPHDR_LEN,
>>>                                        sizeof(struct timeval));
>>>                                 /* TODO This should be changed to use an
>>> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>>>         }
>>>
>>>         /* init counters */
>>> -       odp_atomic_init_u64(&counters.seq);
>>> -       odp_atomic_init_u64(&counters.ip);
>>> -       odp_atomic_init_u64(&counters.udp);
>>> -       odp_atomic_init_u64(&counters.icmp);
>>> +       odp_counter64_init(&counters.seq, 0);
>>> +       odp_counter64_init(&counters.ip, 0);
>>> +       odp_counter64_init(&counters.udp, 0);
>>> +       odp_counter64_init(&counters.icmp, 0);
>>>
>>>         /* Reserve memory for args from shared mem */
>>>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
>>> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
>>> index 2f2dc19..76c27d0 100644
>>> --- a/example/ipsec/odp_ipsec.c
>>> +++ b/example/ipsec/odp_ipsec.c
>>> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>>>         printf("Num worker threads: %i\n", num_workers);
>>>
>>>         /* Create a barrier to synchronize thread startup */
>>> -       odp_barrier_init_count(&sync_barrier, num_workers);
>>> +       odp_barrier_init(&sync_barrier, num_workers);
>>>
>>>         /*
>>>          * By default core #0 runs Linux kernel background tasks.
>>> diff --git a/example/odp_example/odp_example.c
>>> b/example/odp_example/odp_example.c
>>> index 0e9aa3d..c473395 100644
>>> --- a/example/odp_example/odp_example.c
>>> +++ b/example/odp_example/odp_example.c
>>> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>>>         odp_shm_print_all();
>>>
>>>         /* Barrier to sync test case execution */
>>> -       odp_barrier_init_count(&globals->barrier, num_workers);
>>> +       odp_barrier_init(&globals->barrier, num_workers);
>>>
>>>         if (args.proc_mode) {
>>>                 int ret;
>>> diff --git a/example/timer/odp_timer_test.c
>>> b/example/timer/odp_timer_test.c
>>> index 78b2ae2..dfbeae9 100644
>>> --- a/example/timer/odp_timer_test.c
>>> +++ b/example/timer/odp_timer_test.c
>>> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>>>         printf("\n");
>>>
>>>         /* Barrier to sync test case execution */
>>> -       odp_barrier_init_count(&test_barrier, num_workers);
>>> +       odp_barrier_init(&test_barrier, num_workers);
>>>
>>>         /* Create and launch worker threads */
>>>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
>>> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
>>> index 76c1db8..5e78b34 100644
>>> --- a/helper/include/odph_ring.h
>>> +++ b/helper/include/odph_ring.h
>>> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>>>                 uint32_t sp_enqueue;     /* True, if single producer. */
>>>                 uint32_t size;           /* Size of ring. */
>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>> -               uint32_t head;          /* Producer head. */
>>> -               uint32_t tail;          /* Producer tail. */
>>> +               odp_atomic32_t head;    /* Producer head. */
>>> +               odp_atomic32_t tail;    /* Producer tail. */
>>>         } prod ODP_ALIGNED_CACHE;
>>>
>>>         /** @private Consumer */
>>> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>>>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>>>                 uint32_t size;           /* Size of the ring. */
>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>> -               uint32_t head;          /* Consumer head. */
>>> -               uint32_t tail;          /* Consumer tail. */
>>> +               odp_atomic32_t head;    /* Consumer head. */
>>> +               odp_atomic32_t tail;    /* Consumer tail. */
>>>         } cons ODP_ALIGNED_CACHE;
>>>
>>>         /** @private Memory space of ring starts here. */
>>> diff --git a/platform/linux-generic/include/api/odp.h
>>> b/platform/linux-generic/include/api/odp.h
>>> index 0ee3faf..d124d52 100644
>>> --- a/platform/linux-generic/include/api/odp.h
>>> +++ b/platform/linux-generic/include/api/odp.h
>>> @@ -32,6 +32,7 @@ extern "C" {
>>>  #include <odp_barrier.h>
>>>  #include <odp_spinlock.h>
>>>  #include <odp_atomic.h>
>>> +#include <odp_counter.h>
>>>
>>>  #include <odp_init.h>
>>>  #include <odp_system_info.h>
>>> diff --git a/platform/linux-generic/include/api/odp_atomic.h
>>> b/platform/linux-generic/include/api/odp_atomic.h
>>> index 0cc4cf4..ccaad02 100644
>>> --- a/platform/linux-generic/include/api/odp_atomic.h
>>> +++ b/platform/linux-generic/include/api/odp_atomic.h
>>> @@ -4,464 +4,494 @@
>>>   * SPDX-License-Identifier:     BSD-3-Clause
>>>   */
>>>
>>> -
>>>  /**
>>>   * @file
>>>   *
>>> - * ODP atomic operations
>>> + * ODP atomic types and operations, semantically a subset of C11 atomics.
>>> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
>>> + * without using the required access functions.
>>> + * Atomic functions must be used to operate on atomic variables!
>>>   */
>>>
>>>  #ifndef ODP_ATOMIC_H_
>>>  #define ODP_ATOMIC_H_
>>>
>>> +#include <stdint.h>
>>> +#include <odp_align.h>
>>> +#include <odp_hints.h>
>>> +#include <odp_debug.h>
>>> +
>>>  #ifdef __cplusplus
>>>  extern "C" {
>>>  #endif
>>>
>>> -
>>> -#include <odp_std_types.h>
>>> -
>>> -
>>> -/**
>>> - * Atomic integer
>>> - */
>>> -typedef volatile int32_t odp_atomic_int_t;
>>> -
>>> -/**
>>> - * Atomic unsigned integer 64 bits
>>> - */
>>> -typedef volatile uint64_t odp_atomic_u64_t;
>>> -
>>> -/**
>>> - * Atomic unsigned integer 32 bits
>>> - */
>>> -typedef volatile uint32_t odp_atomic_u32_t;
>>> -
>>> -
>>> -/**
>>> - * Initialize atomic integer
>>> - *
>>> - * @param ptr    An integer atomic variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       *ptr = 0;
>>> -}
>>> -
>>> -/**
>>> - * Load value of atomic integer
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return atomic integer value
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       return *ptr;
>>> -}
>>> -
>>> -/**
>>> - * Store value to atomic integer
>>> - *
>>> - * @param ptr        An atomic variable
>>> - * @param new_value  Store new_value to a variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
>>> new_value)
>>> -{
>>> -       *ptr = new_value;
>>> -}
>>> -
>>> -/**
>>> - * Fetch and add atomic integer
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
>>> value)
>>> -{
>>> -       return __sync_fetch_and_add(ptr, value);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and subtract atomic integer
>>> - *
>>> - * @param ptr    An atomic integer variable
>>> - * @param value  A value to be subtracted from the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
>>> value)
>>> -{
>>> -       return __sync_fetch_and_sub(ptr, value);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and increment atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_add_int(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Increment atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_add_int(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and decrement atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic int variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_sub_int(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Decrement atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_sub_int(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Initialize atomic uint32
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       *ptr = 0;
>>> -}
>>> -
>>> -/**
>>> - * Load value of atomic uint32
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return atomic uint32 value
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       return *ptr;
>>> -}
>>> -
>>> -/**
>>> - * Store value to atomic uint32
>>> - *
>>> - * @param ptr        An atomic variable
>>> - * @param new_value  Store new_value to a variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
>>> -                                       uint32_t new_value)
>>> -{
>>> -       *ptr = new_value;
>>> -}
>>> -
>>> -/**
>>> - * Fetch and add atomic uint32
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>>> -                                               uint32_t value)
>>> -{
>>> -       return __sync_fetch_and_add(ptr, value);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and subtract uint32
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be sub to the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
>>> -                                               uint32_t value)
>>> -{
>>> -       return __sync_fetch_and_sub(ptr, value);
>>> -}
>>> -
>>>  /**
>>> - * Fetch and increment atomic uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -#if defined __OCTEON__
>>> -
>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       uint32_t ret;
>>> -
>>> -       __asm__ __volatile__ ("syncws");
>>> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
>>> -                             "r" (ptr));
>>> -
>>> -       return ret;
>>> -}
>>> -
>>> + * 32-bit (unsigned) atomic type
>>> + */
>>> +typedef struct {
>>> +       uint32_t v; /**< Actual storage for the atomic variable */
>>> +} odp_atomic32_t
>>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>> +
>>> +typedef enum {
>>> +       /** Relaxed memory order, no ordering of other accesses enforced */
>>> +       ODP_MEMORDER_RLX,
>>> +       /** Acquire memory order, later accesses cannot move before
>>> +        * acquire operation */
>>> +       ODP_MEMORDER_ACQ,
>>> +       /** Release memory order, earlier accesses cannot move after
>>> +        * release operation */
>>> +       ODP_MEMORDER_RLS
>>> +} odp_memorder_t;
>>> +
>>>
>>> +/*****************************************************************************
>>> + * Just some private helpers
>>>
>>> +*****************************************************************************/
>>> +
>>> +#ifdef __OCTEON__
>>> +/* OCTEON Write Memory Barrier */
>>> +#define COMPILER_HW_BARRIER() __asm __volatile( \
>>> +       /* Double syncw to work around errata */ \
>>> +       "syncw\n\tsyncw" : : : )
>>>  #else
>>> -
>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_add_u32(ptr, 1);
>>> -}
>>> -
>>> +/** Compiler and hardware full memory barrier */
>>> +#define COMPILER_HW_BARRIER() __sync_synchronize()
>>> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a */
>>>  #endif
>>>
>>> -/**
>>> - * Increment atomic uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_add_u32(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and decrement uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_sub_u32(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Decrement atomic uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_sub_u32(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Atomic compare and set for 32bit
>>> - *
>>> - * @param dst destination location into which the value will be written.
>>> - * @param exp expected value.
>>> - * @param src new value.
>>> - * @return Non-zero on success; 0 on failure.
>>> - */
>>> -static inline int
>>> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
>>> -{
>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>> +#define MEMORY "memory"
>>> +
>>>
>>> +/*****************************************************************************
>>> + * Operations on 32-bit atomics
>>> + * odp_atomic32_init - no return value
>>> + * odp_atomic32_load - return current value
>>> + * odp_atomic32_store - no return value
>>> + * odp_atomic32_cmp_xchg_weak - return bool
>>> + * odp_atomic32_fetch_add - return old value
>>> + * odp_atomic32_add - no return value
>>> + * odp_atomic32_fetch_inc - return old value
>>> + * odp_atomic32_inc - no return value
>>> + * odp_atomic32_fetch_dec - return old value
>>> + * odp_atomic32_dec - no return value
>>> +
>>> *****************************************************************************/
>>> +
>>> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
>>> +{
>>> +       /* Write of aligned word is atomic */
>>> +       /* Cast to volatile to force compiler to (re-) write variable, thus we
>>> +        * can avoid using compiler memory barriers */
>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>> +}
>>> +
>>> +/**
>>> + * Atomic load of 32-bit atomic variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param memmodel Memory model associated with the load
>>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
>>> + *
>>> + * @return Value of the variable
>>> + */
>>> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
>>> +               odp_memorder_t mmodel)
>>> +{
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +               uint32_t val;
>>> +               /* Read of aligned word is atomic */
>>> +               /* Cast to volatile to force compiler to (re-) read variable,
>>> +                * thus we can avoid using compiler memory barriers */
>>> +               val = *(__volatile const uint32_t *)&ptr->v;
>>> +               return val;
>>> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
>>> +#if defined __aarch64__
>>> +               uint32_t val;
>>> +               __asm __volatile("ldar %w0, [%1]"
>>> +                               : "=&r"(val)
>>> +                               : "r"(&ptr->v)
>>> +                               : MEMORY);
>>> +               return val;
>>> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
>>> +               /* Read of aligned word is atomic */
>>> +               uint32_t val = ptr->v;
>>> +               /* To prevent later accesses from moving up */
>>> +               /* Herb Sutter claims HW barrier not needed on x86? */
>>> +               COMPILER_HW_BARRIER();
>>> +               return val;
>>> +#else
>>> +#warning odp_atomic32_load() may not be efficiently implemented
>>> +               /* Assume read of aligned word is atomic */
>>> +               uint32_t val = ptr->v;
>>> +               /* To prevent later accesses from moving up */
>>> +               COMPILER_HW_BARRIER();
>>> +               return val;
>>> +#endif
>>> +       } else {
>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>> +       }
>>> +}
>>> +
>>> +/**
>>> + * Atomic store to 32-bit atomic variable
>>> + *
>>> + * @param ptr  Pointer to a 32-bit atomic variable
>>> + * @param val  Value to write to the atomic variable
>>> + * @param memmodel Memory model associated with the store
>>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
>>> + */
>>> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
>>> +               uint32_t val,
>>> +               odp_memorder_t mmodel)
>>> +{
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +               /* Write of aligned word is atomic */
>>> +               /* Cast to volatile to force compiler to (re-) write variable,
>>> +                * thus we will avoid using compiler memory barriers */
>>> +               *(__volatile uint32_t *)&ptr->v = val;
>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
>>> +               /* Compiler and HW barrier to prevent earlier accesses from
>>> +                * moving down */
>>> +               COMPILER_HW_BARRIER();
>>> +               /* Write of aligned word is atomic */
>>> +               ptr->v = val;
>>> +               /* Compiler and HW barrier to prevent this store from moving
>>> +                * down after a later load-acquire and thus create overlapping
>>> +                * critical sections. Herb Sutter thinks this is needed */
>>> +               COMPILER_HW_BARRIER();
>>> +#elif defined __aarch64__
>>> +               __asm __volatile("stlr %w0, [%1]"
>>> +                               :
>>> +                               : "r"(val), "r"(&ptr->v)
>>> +                               : MEMORY);
>>> +#elif defined __x86_64__
>>> +               /* This is actually an atomic exchange operation */
>>> +               /* Generates good code on x86_64 */
>>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>>> +#else
>>> +#warning odp_atomic32_store_rls() may not be efficiently implemented
>>> +               /* This is actually an atomic exchange operation */
>>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>>> +#endif
>>> +       } else {
>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>> +       }
>>> +}
>>> +
>>> +
>>> +/**
>>> + * Atomic compare and exchange (swap) of 32-bit atomic variable
>>> + * "Weak" semantics, may fail spuriously and must be used in a loop.
>>> + *
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param exp_p Pointer to expected value (updated on failure)
>>> + * @param val   New value to write
>>> + * @param       memmodel Memory model associated with the compare-and-swap
>>> + * operation (ODP_MEMORDER_RLX only)
>>> + *
>>> + * @return 1 (true) if exchange successful, 0 (false) if not successful (and
>>> + * '*exp_p' updated with current value)
>>> + */
>>> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
>>> +               uint32_t *exp_p,
>>> +               uint32_t val,
>>> +               odp_memorder_t mmodel)
>>> +{
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +               uint32_t old;
>>> +               uint32_t exp = *exp_p;
>>> +               int status;
>>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>>> +                                "cmp   %0, %3\t\n"
>>> +                                "bne   1f\t\n"
>>> +                                "strex %1, %4, [%2]\t\n"
>>> +                                "1:\t\n"
>>> +                               : "=&r"(old), "=&r"(status)
>>> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
>>> +                               : MEMORY);
>>> +               if (odp_unlikely(old != exp)) {
>>> +                       /* Value has changed, can't proceed */
>>> +                       /* Clear exclusive access monitor */
>>> +                       __asm __volatile("clrex");
>>> +                       /* Return current value */
>>> +                       *exp_p = old;
>>> +                       return 0;
>>> +               }
>>> +               /* strex returns 0 on success */
>>> +               if (odp_unlikely(status != 0)) {
>>> +                       /* strex failed, reservation was disturbed */
>>> +                       /* Return potentially changed value */
>>> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
>>> +                       return 0;
>>> +               }
>>> +               return 1;
>>> +#elif defined __mips64__
>>> +               uint32_t old;
>>> +               uint32_t exp = *exp_p;
>>> +               uint32_t status = val;
>>> +               __asm __volatile("llw %0, [%2]\t\n"
>>> +                                "bne %0, %3, 1f\t\n"
>>> +                                "scw %1, [%2]\t\n"
>>> +                                "1:\t\n"
>>> +                               : "=&r"(old), "+&r"(status)
>>> +                               : "r"(&ptr->v), "r"(exp)
>>> +                               : MEMORY);
>>> +               if (odp_unlikely(old != exp)) {
>>> +                       /* Value has changed, can't proceed */
>>> +                       /* Return current value */
>>> +                       *exp_p = old;
>>> +                       return 0;
>>> +               }
>>> +               /* scw returns 1 on success, 0 on failure */
>>> +               if (odp_unlikely(status == 0)) {
>>> +                       /* scw failed, reservation was disturbed */
>>> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
>>> +                       return 0;
>>> +               }
>>> +               return 1;
>>> +#elif defined __x86_64__
>>> +               uint32_t exp = *exp_p;
>>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp, val);
>>> +               if (odp_unlikely(old != exp)) {
>>> +                       /* Return the unexpected content of '*ptr' */
>>> +                       *exp_p = old;
>>> +                       return 0;
>>> +               } else {
>>> +                       return 1;
>>> +               }
>>> +#else
>>> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
>>> +               uint32_t exp = *exp_p;
>>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp, val);
>>> +               if (odp_unlikely(old != exp)) {
>>> +                       /* Return the unexpected content of '*ptr' */
>>> +                       *exp_p = old;
>>> +                       return 0;
>>> +               } else {
>>> +                       return 1;
>>> +               }
>>> +#endif
>>> +       } else {
>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>> +       }
>>> +}
>>> +
>>> +/**
>>> + * Atomic fetch and add to 32-bit atomic variable
>>> + * @note A - B <=> A + (-B)
>>> + *
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param incr  The value to be added to the atomic variable
>>> + * @param memmodel Memory model associated with the add
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>> + *
>>> + * @return Value of the atomic variable before the addition
>>> + */
>>> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
>>> +               uint32_t incr,
>>> +               odp_memorder_t mmodel)
>>> +{
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +               uint32_t old_val, tmp;
>>> +               int status;
>>> +               do {
>>> +                       __asm __volatile("ldrex %0, [%3]\t\n"
>>> +                                        "add   %1, %0, %4\t\n"
>>> +                                        "strex %2, %1, [%3]\t\n"
>>> +                                       : "=&r"(old_val), "=&r"(tmp),
>>> +                                         "=&r"(status)
>>> +                                       : "r"(&ptr->v), "r"(incr)
>>> +                                       : MEMORY);
>>> +               } while (odp_unlikely(status != 0));
>>> +               return old_val;
>>> +#elif defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               __asm __volatile("laa %0,(%2),%3"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr), "r" (incr)
>>> +                               : MEMORY);
>>> +               return old_val;
>>> +#elif defined __x86_64__
>>> +               /* Generates good code on x86_64 */
>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>> +#else
>>> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>> +#endif
>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>> +#if defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               COMPILER_HW_BARRIER();
>>> +               __asm __volatile("laa %0,(%2),%3"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr), "r" (incr)
>>> +                               : MEMORY);
>>> +               COMPILER_HW_BARRIER();
>>> +               return old_val;
>>> +#endif
>>> +               /* __sync_fetch_and_add() will give us barriers before and
>>> +                * after, we are fine with this for release operations */
>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>> +       } else {
>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>> +       }
>>>  }
>>>
>>>  /**
>>> - * Initialize atomic uint64
>>> + * Atomic add to 32-bit atomic variable
>>>   *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param incr  The value to be added to the atomic variable
>>> + * @param memmodel Memory model associated with the add
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>   */
>>> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
>>> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
>>> +               uint32_t incr,
>>> +               odp_memorder_t mmodel)
>>>  {
>>> -       *ptr = 0;
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +               /* Platforms that support atomic add instructions can add
>>> +                * their implementations here */
>>> +#if defined __OCTEON__
>>> +               __asm __volatile("saa %[inc], (%[base])"
>>> +                               : "+m" (*ptr)
>>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>>> +                               : MEMORY);
>>> +               return;
>>> +#endif
>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>> +               /* Platforms that support atomic add instructions can add
>>> +                * their implementations here */
>>> +#if defined __OCTEON__
>>> +               COMPILER_HW_BARRIER();
>>> +               __asm __volatile("saa %[inc], (%[base])"
>>> +                               : "+m" (*ptr)
>>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>>> +                               : MEMORY);
>>> +               COMPILER_HW_BARRIER();
>>> +               return;
>>> +#endif
>>> +       }
>>> +       /* Default to using odp_atomic32_fetch_add() */
>>> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>>>  }
>>>
>>>  /**
>>> - * Load value of atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> + * Atomic fetch and increment of 32-bit atomic variable
>>>   *
>>> - * @return atomic uint64 value
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param memmodel Memory model associated with the increment
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>   *
>>> - * @note The operation is not synchronized with other threads
>>> + * @return Value of the atomic variable before the increment
>>>   */
>>> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
>>> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
>>> +               odp_memorder_t mmodel)
>>>  {
>>> -       return *ptr;
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +               /* Platforms that support atomic increment instructions can add
>>> +                * their implementations here */
>>> +#if defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               __asm __volatile("lai %0,(%2)"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr)
>>> +                               : MEMORY);
>>> +               return old_val;
>>> +#endif
>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>> +#if defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               COMPILER_HW_BARRIER();
>>> +               __asm __volatile("lai %0,(%2)"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr)
>>> +                               : MEMORY);
>>> +               COMPILER_HW_BARRIER();
>>> +               return old_val;
>>> +#endif
>>> +       }
>>> +       /* Default to using odp_atomic32_fetch_add() */
>>> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>>>  }
>>>
>>>  /**
>>> - * Store value to atomic uint64
>>> - *
>>> - * @param ptr        An atomic variable
>>> - * @param new_value  Store new_value to a variable
>>> + * Atomic increment of 32-bit atomic variable
>>>   *
>>> - * @note The operation is not synchronized with other threads
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param memmodel Memory model associated with the increment
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>   */
>>> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
>>> -                                       uint64_t new_value)
>>> -{
>>> -       *ptr = new_value;
>>> -}
>>> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
>>> +               odp_memorder_t mmodel)
>>>
>>> -/**
>>> - * Add atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
>>> value)
>>>  {
>>> -       __sync_fetch_and_add(ptr, value);
>>> +       /* Default to using odp_atomic32_fetch_inc() */
>>> +       /* Platforms that support atomic increment instructions can add
>>> +        * their implementations here */
>>> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>>>  }
>>>
>>>  /**
>>> - * Fetch and add atomic uint64
>>> + * Atomic fetch and decrement of 32-bit atomic variable
>>>   *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param memmodel Memory model associated with the decrement
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>   *
>>> - * @return Value of the variable before the operation
>>> + * @return Value of the atomic variable before the decrement
>>>   */
>>> -
>>> -#if defined __powerpc__ && !defined __powerpc64__
>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
>>> +               odp_memorder_t mmodel)
>>>  {
>>> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
>>> -                                   (uint32_t)value);
>>> -}
>>> -#else
>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> -{
>>> -       return __sync_fetch_and_add(ptr, value);
>>> -}
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +               /* Platforms that support atomic decrement instructions can add
>>> +                * their implementations here */
>>> +#if defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               __asm __volatile("lad %0,(%2)"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr)
>>> +                               : MEMORY);
>>> +               return old_val;
>>>  #endif
>>> -/**
>>> - * Subtract atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be subtracted from the variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
>>> value)
>>> -{
>>> -       __sync_fetch_and_sub(ptr, value);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and subtract atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be subtracted from the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -#if defined __powerpc__ && !defined __powerpc64__
>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> -{
>>> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
>>> -                                   (uint32_t)value);
>>> -}
>>> -#else
>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> -{
>>> -       return __sync_fetch_and_sub(ptr, value);
>>> -}
>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>> +#if defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               COMPILER_HW_BARRIER();
>>> +               __asm __volatile("lad %0,(%2)"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr)
>>> +                               : MEMORY);
>>> +               COMPILER_HW_BARRIER();
>>> +               return old_val;
>>>  #endif
>>> -/**
>>> - * Fetch and increment atomic uint64 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_add_u64(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Increment atomic uint64 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_add_u64(ptr, 1);
>>> +       }
>>> +       /* Default to using odp_atomic32_fetch_add() */
>>> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>>>  }
>>>
>>>  /**
>>> - * Fetch and decrement atomic uint64 by 1
>>> + * Atomic decrement of 32-bit atomic variable
>>>   *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> + * @param ptr      Pointer to a 32-bit atomic variable
>>> + * @param memorder Memory model associated with the decrement
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>   */
>>> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>>> -}
>>> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
>>> +               odp_memorder_t memorder)
>>>
>>> -/**
>>> - * Decrement atomic uint64 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>>>  {
>>> -       odp_atomic_fetch_sub_u64(ptr, 1);
>>> +       /* Default to using odp_atomic32_fetch_dec() */
>>> +       /* Platforms that support atomic decrement instructions can add
>>> +        * their implementations here */
>>> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>>>  }
>>>
>>> -/**
>>> - * Atomic compare and set for 64bit
>>> - *
>>> - * @param dst destination location into which the value will be written.
>>> - * @param exp expected value.
>>> - * @param src new value.
>>> - * @return Non-zero on success; 0 on failure.
>>> - */
>>> -static inline int
>>> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
>>> -{
>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>> -}
>>> +/* We are not exporting these macros */
>>> +#undef COMPILER_HW_BARRIER
>>> +#undef MEMORY
>>>
>>>  #ifdef __cplusplus
>>>  }
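
To check my reading of the new per-call memory order parameter, here is a
minimal sketch of how I understand ODP_MEMORDER_RLS/ODP_MEMORDER_ACQ are meant
to pair (the 'ready'/'payload' names are invented for illustration, only the
odp_atomic32_* calls and order names come from this header):

#include <odp_atomic.h>

static odp_atomic32_t ready;   /* odp_atomic32_init(&ready, 0) done at start-up */
static uint32_t payload;

static void producer(void)
{
        payload = 42;                                     /* plain store */
        /* Release: the payload store cannot be reordered past this */
        odp_atomic32_store(&ready, 1, ODP_MEMORDER_RLS);
}

static uint32_t consumer(void)
{
        /* Acquire: pairs with the release store in producer() */
        while (odp_atomic32_load(&ready, ODP_MEMORDER_ACQ) == 0)
                ;                                         /* spin */
        return payload;                                   /* observes 42 */
}

ODP_MEMORDER_RLX would then be reserved for cases where no ordering is needed
at all, e.g. statistics.
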
>>> diff --git a/platform/linux-generic/include/api/odp_barrier.h
>>> b/platform/linux-generic/include/api/odp_barrier.h
>>> index a7b3215..69b1eb8 100644
>>> --- a/platform/linux-generic/include/api/odp_barrier.h
>>> +++ b/platform/linux-generic/include/api/odp_barrier.h
>>> @@ -27,18 +27,18 @@ extern "C" {
>>>   * ODP execution barrier
>>>   */
>>>  typedef struct odp_barrier_t {
>>> -       int              count;  /**< @private Thread count */
>>> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
>>> +       uint32_t       num_threads;  /**< @private Thread count (constant) */
>>> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>>>  } odp_barrier_t;
>>>
>>>
>>>  /**
>>>   * Init barrier with thread count
>>>   *
>>> - * @param barrier    Barrier
>>> - * @param count      Thread count
>>> + * @param barrier     Barrier
>>> + * @param num_threads Number of threads which share the barrier
>>>   */
>>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
>>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>>>
>>>
>>>  /**
>>> diff --git a/platform/linux-generic/include/api/odp_counter.h
>>> b/platform/linux-generic/include/api/odp_counter.h
>>> new file mode 100644
>>> index 0000000..f937d27
>>> --- /dev/null
>>> +++ b/platform/linux-generic/include/api/odp_counter.h
>>> @@ -0,0 +1,363 @@
>>> +/* Copyright (c) 2013, Linaro Limited
>>> + * All rights reserved.
>>> + *
>>> + * SPDX-License-Identifier:     BSD-3-Clause
>>> + */
>>> +
>>> +/**
>>> + * @file
>>> + *
>>> + * ODP atomic counter types and operations, suitable for e.g. shared
>>> + * statistics. Relaxed memory model assumed for lowest overhead.
>>> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
>>> + * without using the required access functions.
>>> + * Counter functions must be used to operate on counter variables!
>>> + */
>>> +
>>> +#ifndef ODP_COUNTER_H_
>>> +#define ODP_COUNTER_H_
>>> +
>>> +#include <stdint.h>
>>> +#include <odp_align.h>
>>> +#include <odp_hints.h>
>>> +
>>> +#ifdef __cplusplus
>>> +extern "C" {
>>> +#endif
>>> +
>>> +/**
>>> + * 32-bit (unsigned) atomic counter type
>>> + */
>>> +typedef struct {
>>> +       uint32_t v; /**< Actual storage for the counter variable */
>>> +} odp_counter32_t
>>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>> +
>>> +/**
>>> + * 64-bit (unsigned) atomic counter type
>>> + */
>>> +typedef struct {
>>> +       uint64_t v; /**< Actual storage for the counter variable */
>>> +       /* Room for other data structures (e.g. spin lock) that might be
>>> +        * needed to ensure atomicity on some architectures */
>>> +} odp_counter64_t
>>> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
>>> +
>>>
>>> +/*****************************************************************************
>>> + * Operations on 32-bit atomic counters
>>> + * odp_counter32_init - returns no value
>>> + * odp_counter32_read - returns current value
>>> + * odp_counter32_write - returns no value
>>> + * odp_counter32_add - returns no value
>>> + * odp_counter32_read_inc - returns old value
>>> + * odp_counter32_inc - returns no value
>>> +
>>> *****************************************************************************/
>>> +
>>> +/**
>>> + * Initialize 32-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + * @param val   Initial value
>>> + */
>>> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t
>>> val)
>>> +{
>>> +       /* No implementation requires any other type of initialization */
>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>> +}
>>> +
>>> +/**
>>> + * Read 32-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + *
>>> + * @return Value of the variable
>>> + */
>>> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
>>> +{
>>> +       uint32_t val;
>>> +       /* Read of aligned word is atomic */
>>> +       /* Cast to volatile to force compiler to (re-) read variable, thus
>>> +        * we will avoid using compiler memory barriers */
>>> +       val = *(__volatile const uint32_t *)&ptr->v;
>>> +       return val;
>>> +}
>>> +
>>> +/**
>>> + * Write 32-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + * @param val   Value to write to the variable
>>> + */
>>> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t
>>> val)
>>> +{
>>> +       /* Write of aligned word is atomic */
>>> +       /* Cast to volatile to force compiler to (re-) write variable, thus
>>> +        * we will avoid using compiler memory barriers */
>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>> +}
>>> +
>>> +/**
>>> + * Atomic add to 32-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + * @param incr  The value to be added to the counter variable
>>> + */
>>> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t
>>> incr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint32_t result;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>>> +                                "add   %0, %0, %3\t\n"
>>> +                                "strex %1, %0, [%2]"
>>> +                                : "=&r"(result), "=&r"(status)
>>> +                                : "r"(&ptr->v), "Ir" (incr)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0));
>>> +#elif defined __OCTEON__
>>> +       __asm __volatile("saa %[inc], (%[base])"
>>> +                        : "+m" (*ptr)
>>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>>> +                        : );
>>> +#elif defined __x86_64__
>>> +       /* Generates good code on x86_64 */
>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>> +#else
>>> +       /* Warning odp_counter32_add() may not be efficiently implemented */
>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>> +#endif
>>> +}
>>> +
>>> +/**
>>> + * Atomic increment (+1) of 32-bit counter variable, return original
>>> value
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + *
>>> + * @return Original value of counter
>>> + */
>>> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint32_t result, tmp;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrex %0, [%3]\t\n"
>>> +                                "add   %1, %0, #1\t\n"
>>> +                                "strex %2, %1, [%3]"
>>> +                                : "=&r"(result), "=&r"(tmp),
>>> "=&r"(status)
>>> +                                : "r"(&ptr->v)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0));
>>> +       return result;
>>> +#elif defined __OCTEON__
>>> +       uint32_t old_val;
>>> +       __asm __volatile("lai %0,(%2)"
>>> +                        : "=r" (old_val), "+m" (ptr)
>>> +                        : "r" (ptr)
>>> +                        : );
>>> +       return old_val;
>>> +#elif defined __x86_64__
>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>> +#else
>>> +/* Warning odp_counter32_read_inc() may not be efficiently implemented */
>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>> +#endif
>>> +}
>>> +
>>> +/**
>>> + * Atomic increment (+1) 32-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + */
>>> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
>>> +{
>>> +#if defined __OCTEON__
>>> +       odp_counter32_add(ptr, 1);
>>> +#else
>>> +       (void)odp_counter32_read_inc(ptr);
>>> +#endif
>>> +}
>>> +
>>>
>>> +/*****************************************************************************
>>> + * Operations on 64-bit atomic counters
>>> + * odp_counter64_init
>>> + * odp_counter64_read
>>> + * odp_counter64_write
>>> + * odp_counter64_add
>>> + * odp_counter64_read_inc
>>> + * odp_counter64_inc
>>> +
>>> *****************************************************************************/
>>> +
>>> +/**
>>> + * Read 64-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 64-bit counter variable
>>> + *
>>> + * @return Value of the counter variable
>>> + */
>>> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t val;
>>> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
>>> +                        "clrex" /* Clear exclusive access monitor */
>>> +                        : "=&r"(val)
>>> +                        : "r"(&ptr->v)
>>> +                        : );
>>> +       return val;
>>> +#elif defined __x86_64__ || defined __aarch64__
>>> +       /* Read of aligned quad/double word is atomic */
>>> +       return ptr->v;
>>> +#else
>>> +/* Warning odp_counter64_read() may not be efficiently implemented */
>>> +       return __sync_fetch_and_or(&ptr->v, 0);
>>> +#endif
>>> +}
>>> +
>>> +/**
>>> + * Write 64-bit counter variable
>>> + *
>>> + * @param ptr  Pointer to a 64-bit counter variable
>>> + * @param val  Value to write to the counter variable
>>> + */
>>> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t
>>> val)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t old_val;
>>> +       int status;
>>> +       do {
>>> +               /* Read counter variable exclusively so we can write to it
>>> +                * later */
>>> +               /* Attempt to write the new value */
>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>> +                                "strexd %1, %3, %H3, [%2]"
>>> +                                : "=&r"(old_val), "=&r"(status)
>>> +                                : "r"(&ptr->v), "r"(val)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>>> +#elif defined __x86_64__ || defined __aarch64__
>>> +       /* Write of aligned quad/double word is atomic */
>>> +       ptr->v = val;
>>> +#else
>>> +/* Warning odp_counter64_write() may not be efficiently implemented */
>>> +       /* This is actually a counter exchange operation */
>>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>> +#endif
>>> +}
>>> +
>>> +/**
>>> + * Initialize 64-bit counter variable
>>> + * Perform implementation specific initializations, assign initial value.
>>> + *
>>> + * @param ptr   Pointer to a 64-bit counter variable
>>> + * @param val   Initial value
>>> + */
>>> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t
>>> val)
>>> +{
>>> +       /* No implementation requires any other type of initialization */
>>> +       odp_counter64_write(ptr, val);
>>> +}
>>> +
>>> +/**
>>> + * Atomic add to 64-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 64-bit counter variable
>>> + * @param incr  The value to be added to the counter variable
>>> + */
>>> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t
>>> incr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t old_val;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>> +                                "adds   %0, %0, %3\t\n"
>>> +                                "adc    %H0, %H3\t\n"
>>> +                                "strexd %1, %0, %H0, [%2]"
>>> +                                : "=&r"(old_val), "=&r"(status)
>>> +                                : "r"(&ptr->v), "r"(incr)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>>> +#elif defined __OCTEON__
>>> +       __asm __volatile("saad %[inc], (%[base])"
>>> +                        : "+m" (*ptr)
>>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>>> +                        : );
>>> +#elif defined __x86_64__
>>> +       /* Generates good code on x86_64 */
>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>> +#else
>>> +/* Warning odp_counter64_add() may not be efficiently implemented */
>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>> +#endif
>>> +}
>>> +
>>> +
>>> +/**
>>> + * Atomic increment (+1) 64-bit counter variable and return original value
>>> + *
>>> + * @param ptr   Pointer to a 64-bit counter variable
>>> + *
>>> + * @return Original value of counter
>>> + */
>>> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t old_val, tmp;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
>>> +                                "adds   %2, %0, #1\t\n"
>>> +                                "adc    %H2, %H0, #0\t\n"
>>> +                                "strexd %1, %2, %H2, [%3]"
>>> +                                : "=&r"(old_val), "=&r"(status),
>>> "=&r"(tmp)
>>> +                                : "r"(&ptr->v)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>>> +       return old_val;
>>> +#elif defined __OCTEON__
>>> +       uint64_t old_val;
>>> +       __asm __volatile("laid %0,(%2)"
>>> +                       : "=r" (old_val), "+m" (ptr)
>>> +                       : "r" (ptr)
>>> +                       : );
>>> +       return old_val;
>>> +#elif defined __x86_64__
>>> +       /* Generates good code on x86_64 */
>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>> +#else
>>> +/* Warning odp_counter64_read_inc() may not be efficiently implemented */
>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>> +#endif
>>> +}
>>> +
>>> +/**
>>> + * Atomic increment (+1) 64-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 64-bit counter variable
>>> + */
>>> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t old_val;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>> +                                "adds   %0, #1\t\n"
>>> +                                "adc    %H0, #0\t\n"
>>> +                                "strexd %1, %0, %H0, [%2]"
>>> +                                : "=&r"(old_val), "=&r"(status)
>>> +                                : "r"(&ptr->v)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>>> +#else
>>> +       (void)odp_counter64_read_inc(ptr);
>>> +#endif
>>> +}
>>> +
>>> +#ifdef __cplusplus
>>> +}
>>> +#endif
>>> +
>>> +#endif
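
For reference, a sketch of how I expect the counter API to be used for shared
statistics (the struct and helper names below are invented, only the
odp_counter64_* calls come from the header above):

#include <stdio.h>
#include <inttypes.h>
#include <odp_counter.h>

struct port_stats {
        odp_counter64_t packets;
        odp_counter64_t octets;
};

static void port_stats_init(struct port_stats *s)
{
        odp_counter64_init(&s->packets, 0);
        odp_counter64_init(&s->octets, 0);
}

/* Called concurrently from any number of worker threads */
static void port_stats_update(struct port_stats *s, uint64_t pkt_len)
{
        odp_counter64_inc(&s->packets);         /* relaxed, no ordering implied */
        odp_counter64_add(&s->octets, pkt_len);
}

/* Each read is atomic, but the two counters are only approximately
 * consistent with each other - fine for statistics */
static void port_stats_print(const struct port_stats *s)
{
        printf("packets %" PRIu64 " octets %" PRIu64 "\n",
               odp_counter64_read(&s->packets),
               odp_counter64_read(&s->octets));
}
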
>>> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
>>> b/platform/linux-generic/include/api/odp_rwlock.h
>>> index 252ebb2..ff8a9a2 100644
>>> --- a/platform/linux-generic/include/api/odp_rwlock.h
>>> +++ b/platform/linux-generic/include/api/odp_rwlock.h
>>> @@ -10,26 +10,30 @@
>>>  /**
>>>   * @file
>>>   *
>>> - * ODP RW Locks
>>> + * ODP read/write lock
>>> + * An RW lock supports multiple concurrent readers but only one (exclusive)
>>> + * writer.
>>>   */
>>>
>>> +#include <odp_atomic.h>
>>> +
>>>  #ifdef __cplusplus
>>>  extern "C" {
>>>  #endif
>>>
>>>  /**
>>>   * The odp_rwlock_t type.
>>> - * write lock count is -1,
>>> - * read lock count > 0
>>> + * write lock is ~0U
>>> + * read lock count >0 && <~0U
>>>   */
>>>  typedef struct {
>>> -       volatile int32_t cnt; /**< -1 Write lock,
>>> -                               > 0 for Read lock. */
>>> +       odp_atomic32_t cnt; /**< == 0: unlocked,
>>> +                                == ~0: locked for write,
>>> +                                > 0 number of concurrent read locks */
>>>  } odp_rwlock_t;
>>>
>>>
>>>  /**
>>> - * Initialize the rwlock to an unlocked state.
>>> + * Initialize the rwlock to the unlocked state.
>>>   *
>>>   * @param rwlock pointer to the RW Lock.
>>>   */
>>> @@ -50,14 +54,14 @@ void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
>>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
>>>
>>>  /**
>>> - * Aquire a write lock.
>>> + * Acquire the write lock.
>>>   *
>>>   * @param rwlock pointer to a RW Lock.
>>>   */
>>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
>>>
>>>  /**
>>> - * Release a write lock.
>>> + * Release the write lock.
>>>   *
>>>   * @param rwlock pointer to a RW Lock.
>>>   */
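
A small usage sketch of the documented semantics (0 = unlocked, >0 = number of
readers, ~0 = write locked); the table and helpers below are invented, only
the odp_rwlock_* calls are from this header:

#include <odp_rwlock.h>

static odp_rwlock_t tbl_lock;   /* odp_rwlock_init(&tbl_lock) called once */
static uint32_t tbl[64];

static uint32_t tbl_lookup(unsigned idx)
{
        uint32_t val;

        odp_rwlock_read_lock(&tbl_lock);    /* shared with other readers */
        val = tbl[idx];
        odp_rwlock_read_unlock(&tbl_lock);
        return val;
}

static void tbl_update(unsigned idx, uint32_t val)
{
        odp_rwlock_write_lock(&tbl_lock);   /* exclusive, waits out all readers */
        tbl[idx] = val;
        odp_rwlock_write_unlock(&tbl_lock);
}
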
>>> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
>>> b/platform/linux-generic/include/api/odp_ticketlock.h
>>> index 6277a18..5933f85 100644
>>> --- a/platform/linux-generic/include/api/odp_ticketlock.h
>>> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
>>> @@ -21,14 +21,15 @@ extern "C" {
>>>
>>>  #include <odp_std_types.h>
>>>  #include <odp_atomic.h>
>>> +#include <odp_counter.h>
>>>
>>>
>>>  /**
>>>   * ODP ticketlock
>>>   */
>>>  typedef struct odp_ticketlock_t {
>>> -       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
>>> -       volatile uint32_t cur_ticket;  /**< @private Current ticket */
>>> +       odp_counter32_t next_ticket; /**< @private Next ticket */
>>> +       odp_atomic32_t cur_ticket;  /**< @private Current ticket */
>>>  } odp_ticketlock_t;
>>>
>>>
>>> diff --git a/platform/linux-generic/include/odp_buffer_internal.h
>>> b/platform/linux-generic/include/odp_buffer_internal.h
>>> index 2002b51..530ab96 100644
>>> --- a/platform/linux-generic/include/odp_buffer_internal.h
>>> +++ b/platform/linux-generic/include/odp_buffer_internal.h
>>> @@ -88,7 +88,7 @@ typedef struct odp_buffer_hdr_t {
>>>         uint32_t                 index;      /* buf index in the pool */
>>>         size_t                   size;       /* max data size */
>>>         size_t                   cur_offset; /* current offset */
>>> -       odp_atomic_int_t         ref_count;  /* reference count */
>>> +       odp_atomic32_t           ref_count;  /* reference count */
>>>         odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
>>>         int                      type;       /* type of next header */
>>>         odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
>>> diff --git a/platform/linux-generic/include/odp_spin_internal.h
>>> b/platform/linux-generic/include/odp_spin_internal.h
>>> index b7e2071..29c524f 100644
>>> --- a/platform/linux-generic/include/odp_spin_internal.h
>>> +++ b/platform/linux-generic/include/odp_spin_internal.h
>>> @@ -15,15 +15,6 @@ extern "C" {
>>>
>>>
>>>  /**
>>> - * GCC memory barrier for ODP internal use
>>> - */
>>> -static inline void odp_mem_barrier(void)
>>> -{
>>> -       __asm__ __volatile__ ("" : : : "memory");
>>> -}
>>> -
>>> -
>>> -/**
>>>   * Spin loop for ODP internal use
>>>   */
>>>  static inline void odp_spin(void)
>>> diff --git a/platform/linux-generic/odp_barrier.c
>>> b/platform/linux-generic/odp_barrier.c
>>> index a82b294..10368b5 100644
>>> --- a/platform/linux-generic/odp_barrier.c
>>> +++ b/platform/linux-generic/odp_barrier.c
>>> @@ -8,41 +8,52 @@
>>>  #include <odp_sync.h>
>>>  #include <odp_spin_internal.h>
>>>
>>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count)
>>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
>>>  {
>>> -       barrier->count = count;
>>> -       barrier->bar = 0;
>>> -       odp_sync_stores();
>>> +       barrier->num_threads = num_threads; /* Constant after initialisation */
>>> +       odp_atomic32_init(&barrier->in_barrier, 0);
>>>  }
>>>
>>>  /*
>>>   * Efficient barrier_sync -
>>>   *
>>>   *   Barriers are initialized with a count of the number of callers
>>> - *   that must sync on the barrier before any may proceed.
>>> + *   that must sync on (enter) the barrier before any may proceed
>>> (exit).
>>>   *
>>>   *   To avoid race conditions and to permit the barrier to be fully
>>> - *   reusable, the barrier value cycles between 0..2*count-1. When
>>> - *   synchronizing the wasless variable simply tracks which half of
>>> + *   reusable, the barrier value cycles between 0..2*count-1 (temporarily
>>> + *   hitting 2*count before being wrapped). When
>>> + *   synchronizing, the waslow variable simply tracks which half of
>>>   *   the cycle the barrier was in upon entry.  Exit is when the
>>>   *   barrier crosses to the other half of the cycle.
>>>   */
>>>
>>>  void odp_barrier_sync(odp_barrier_t *barrier)
>>>  {
>>> -       int count;
>>> -       int wasless;
>>> +       uint32_t count;
>>> +       bool waslow;
>>>
>>> -       odp_sync_stores();
>>> -       wasless = barrier->bar < barrier->count;
>>> -       count = odp_atomic_fetch_inc_int(&barrier->bar);
>>> +       /* We need both acquire and release barriers but does the order
>>> +        * matter? Here we start with release and end with acquire. */
>>>
>>> -       if (count == 2*barrier->count-1) {
>>> -               barrier->bar = 0;
>>> -       } else {
>>> -               while ((barrier->bar < barrier->count) == wasless)
>>> -                       odp_spin();
>>> -       }
>>> +       /* Increase the in_barrier count; this will automatically release
>>> +        * the other threads when the lower/upper range is switched */
>>> +       count = odp_atomic32_fetch_add(&barrier->in_barrier, 1,
>>> +                                      ODP_MEMORDER_RLS);
>>> +       /* Compute lower or higher range indicator */
>>> +       waslow = count < barrier->num_threads;
>>>
>>> -       odp_mem_barrier();
>>> +       /* Check if in_barrier count should wrap */
>>> +       if (count == 2 * barrier->num_threads - 1) {
>>> +               /* Manually wrap the counter */
>>> +               odp_atomic32_add(&barrier->in_barrier,
>>> +                                -2 * barrier->num_threads,
>>> +                                ODP_MEMORDER_RLX);
>>> +               /* Fall-through the final part for the acquire barrier */
>>> +       }
>>> +       /* Wait for counter to change half */
>>> +       while ((odp_atomic32_load(&barrier->in_barrier,
>>> ODP_MEMORDER_ACQ) <
>>> +              barrier->num_threads) == waslow) {
>>> +               odp_spin();
>>> +       }
>>>  }
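
To convince myself the wrap logic is right, a usage sketch plus a walk-through
with three threads (NUM_WORKERS and the do_phase_* helpers are invented):

#include <odp_barrier.h>

#define NUM_WORKERS 3

static odp_barrier_t phase_barrier;  /* odp_barrier_init(&phase_barrier,
                                      * NUM_WORKERS) called once in main */

static void do_phase_one(void) { /* per-thread work */ }
static void do_phase_two(void) { /* work that needs every thread's phase one */ }

static void worker(void)
{
        do_phase_one();
        odp_barrier_sync(&phase_barrier);   /* all phase-one stores visible here */
        do_phase_two();
}

Walk-through: on the first sync the fetch_add returns 0, 1 and 2 (all threads
see "waslow"); the third add makes in_barrier == 3 and releases everyone. On
the second sync it returns 3, 4 and 5; the thread that gets 5 == 2*3-1 adds
-6, so after all three adds the counter is back at 0 and the barrier can be
reused indefinitely.
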
>>> diff --git a/platform/linux-generic/odp_buffer.c
>>> b/platform/linux-generic/odp_buffer.c
>>> index e54e0e7..fc3506b 100644
>>> --- a/platform/linux-generic/odp_buffer.c
>>> +++ b/platform/linux-generic/odp_buffer.c
>>> @@ -73,7 +73,8 @@ int odp_buffer_snprint(char *str, size_t n,
>>> odp_buffer_t buf)
>>>         len += snprintf(&str[len], n-len,
>>>                         "  cur_offset   %zu\n",       hdr->cur_offset);
>>>         len += snprintf(&str[len], n-len,
>>> -                       "  ref_count    %i\n",        hdr->ref_count);
>>> +                       "  ref_count    %u\n",
>>> +                       odp_atomic32_load(&hdr->ref_count,
>>> ODP_MEMORDER_RLX));
>>>         len += snprintf(&str[len], n-len,
>>>                         "  type         %i\n",        hdr->type);
>>>         len += snprintf(&str[len], n-len,
>>> diff --git a/platform/linux-generic/odp_crypto.c
>>> b/platform/linux-generic/odp_crypto.c
>>> index b37ad6b..75b4ce0 100644
>>> --- a/platform/linux-generic/odp_crypto.c
>>> +++ b/platform/linux-generic/odp_crypto.c
>>> @@ -6,7 +6,7 @@
>>>
>>>  #include <odp_crypto.h>
>>>  #include <odp_internal.h>
>>> -#include <odp_atomic.h>
>>> +#include <odp_counter.h>
>>>  #include <odp_spinlock.h>
>>>  #include <odp_sync.h>
>>>  #include <odp_debug.h>
>>> @@ -26,7 +26,7 @@
>>>  #define MAX_SESSIONS 32
>>>
>>>  typedef struct {
>>> -       odp_atomic_u32_t next;
>>> +       odp_counter32_t   next;
>>>         uint32_t         max;
>>>         odp_crypto_generic_session_t sessions[0];
>>>  } odp_crypto_global_t;
>>> @@ -58,7 +58,7 @@ odp_crypto_generic_session_t *alloc_session(void)
>>>         uint32_t idx;
>>>         odp_crypto_generic_session_t *session = NULL;
>>>
>>> -       idx = odp_atomic_fetch_inc_u32(&global->next);
>>> +       idx = odp_counter32_read_inc(&global->next);
>>>         if (idx < global->max) {
>>>                 session = &global->sessions[idx];
>>>                 session->index = idx;
>>> @@ -420,6 +420,7 @@ odp_crypto_init_global(void)
>>>
>>>         /* Initialize it */
>>>         global->max = MAX_SESSIONS;
>>> +       odp_counter32_init(&global->next, 0);
>>>
>>>         return 0;
>>>  }
>>> diff --git a/platform/linux-generic/odp_queue.c
>>> b/platform/linux-generic/odp_queue.c
>>> index 1318bcd..08c0d29 100644
>>> --- a/platform/linux-generic/odp_queue.c
>>> +++ b/platform/linux-generic/odp_queue.c
>>> @@ -214,8 +214,13 @@ int odp_queue_set_context(odp_queue_t handle, void
>>> *context)
>>>  {
>>>         queue_entry_t *queue;
>>>         queue = queue_to_qentry(handle);
>>> +       /* Setting a new queue context can be viewed as a release operation:
>>> +        * all writes to the context must be observable before the context
>>> +        * itself is made observable */
>>>         odp_sync_stores();
>>> -       queue->s.param.context = context;
>>> +       queue->s.param.context = context; /* Store-release */
>>> +       /* Ensure queue modification is globally visible before we return
>>> +        * and the application might cause the queue to be scheduled */
>>>         odp_sync_stores();
>>>         return 0;
>>>  }
>>> diff --git a/platform/linux-generic/odp_ring.c
>>> b/platform/linux-generic/odp_ring.c
>>> index 632aa66..e5b9c23 100644
>>> --- a/platform/linux-generic/odp_ring.c
>>> +++ b/platform/linux-generic/odp_ring.c
>>> @@ -187,10 +187,10 @@ odph_ring_create(const char *name, unsigned count,
>>> unsigned flags)
>>>                 r->cons.size = count;
>>>                 r->prod.mask = count-1;
>>>                 r->cons.mask = count-1;
>>> -               r->prod.head = 0;
>>> -               r->cons.head = 0;
>>> -               r->prod.tail = 0;
>>> -               r->cons.tail = 0;
>>> +               odp_atomic32_init(&r->prod.head, 0);
>>> +               odp_atomic32_init(&r->cons.head, 0);
>>> +               odp_atomic32_init(&r->prod.tail, 0);
>>> +               odp_atomic32_init(&r->cons.tail, 0);
>>>
>>>                 TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
>>>         } else {
>>> @@ -227,7 +227,7 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
>>> const *obj_table,
>>>         uint32_t prod_head, prod_next;
>>>         uint32_t cons_tail, free_entries;
>>>         const unsigned max = n;
>>> -       int success;
>>> +       bool success;
>>>         unsigned i;
>>>         uint32_t mask = r->prod.mask;
>>>         int ret;
>>> @@ -237,8 +237,8 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
>>> const *obj_table,
>>>                 /* Reset n to the initial burst count */
>>>                 n = max;
>>>
>>> -               prod_head = r->prod.head;
>>> -               cons_tail = r->cons.tail;
>>> +               prod_head = odp_atomic32_load(&r->prod.head,
>>> ODP_MEMORDER_RLX);
>>> +               cons_tail = odp_atomic32_load(&r->cons.tail,
>>> ODP_MEMORDER_ACQ);
>>>                 /* The subtraction is done between two unsigned 32bits
>>> value
>>>                  * (the result is always modulo 32 bits even if we have
>>>                  * prod_head > cons_tail). So 'free_entries' is always
>>> between 0
>>> @@ -259,13 +259,14 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void
>>> * const *obj_table,
>>>                 }
>>>
>>>                 prod_next = prod_head + n;
>>> -               success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
>>> -                                             prod_next);
>>> -       } while (odp_unlikely(success == 0));
>>> +               success = odp_atomic32_cmp_xchg_weak(&r->prod.head,
>>> +                                                    &prod_head,
>>> +                                                    prod_next,
>>> +                                                    ODP_MEMORDER_RLX);
>>> +       } while (odp_unlikely(!success));
>>>
>>>         /* write entries in ring */
>>>         ENQUEUE_PTRS();
>>> -       odp_mem_barrier();
>>>
>>>         /* if we exceed the watermark */
>>>         if (odp_unlikely(((mask + 1) - free_entries + n) >
>>> r->prod.watermark)) {
>>> @@ -279,10 +280,11 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void
>>> * const *obj_table,
>>>          * If there are other enqueues in progress that preceeded us,
>>>          * we need to wait for them to complete
>>>          */
>>> -       while (odp_unlikely(r->prod.tail != prod_head))
>>> +       while (odp_unlikely(odp_atomic32_load(&r->prod.tail,
>>> +                                             ODP_MEMORDER_RLX) !=
>>> prod_head))
>>>                 odp_spin();
>>>
>>> -       r->prod.tail = prod_next;
>>> +       odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
>>>         return ret;
>>>  }
>>>
>>> @@ -298,8 +300,8 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
>>> const *obj_table,
>>>         uint32_t mask = r->prod.mask;
>>>         int ret;
>>>
>>> -       prod_head = r->prod.head;
>>> -       cons_tail = r->cons.tail;
>>> +       prod_head = odp_atomic32_load(&r->prod.head, ODP_MEMORDER_RLX);
>>> +       cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_ACQ);
>>>         /* The subtraction is done between two unsigned 32bits value
>>>          * (the result is always modulo 32 bits even if we have
>>>          * prod_head > cons_tail). So 'free_entries' is always between 0
>>> @@ -320,11 +322,10 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void
>>> * const *obj_table,
>>>         }
>>>
>>>         prod_next = prod_head + n;
>>> -       r->prod.head = prod_next;
>>> +       odp_atomic32_store(&r->prod.head, prod_next, ODP_MEMORDER_RLX);
>>>
>>>         /* write entries in ring */
>>>         ENQUEUE_PTRS();
>>> -       odp_mem_barrier();
>>>
>>>         /* if we exceed the watermark */
>>>         if (odp_unlikely(((mask + 1) - free_entries + n) >
>>> r->prod.watermark)) {
>>> @@ -334,7 +335,7 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
>>> const *obj_table,
>>>                 ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
>>>         }
>>>
>>> -       r->prod.tail = prod_next;
>>> +       odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
>>>         return ret;
>>>  }
>>>
>>> @@ -348,7 +349,7 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
>>> **obj_table,
>>>         uint32_t cons_head, prod_tail;
>>>         uint32_t cons_next, entries;
>>>         const unsigned max = n;
>>> -       int success;
>>> +       bool success;
>>>         unsigned i;
>>>         uint32_t mask = r->prod.mask;
>>>
>>> @@ -357,8 +358,8 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
>>> **obj_table,
>>>                 /* Restore n as it may change every loop */
>>>                 n = max;
>>>
>>> -               cons_head = r->cons.head;
>>> -               prod_tail = r->prod.tail;
>>> +               cons_head = odp_atomic32_load(&r->cons.head,
>>> ODP_MEMORDER_RLX);
>>> +               prod_tail = odp_atomic32_load(&r->prod.tail,
>>> ODP_MEMORDER_ACQ);
>>>                 /* The subtraction is done between two unsigned 32bits
>>> value
>>>                  * (the result is always modulo 32 bits even if we have
>>>                  * cons_head > prod_tail). So 'entries' is always
>>> between 0
>>> @@ -378,22 +379,24 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
>>> **obj_table,
>>>                 }
>>>
>>>                 cons_next = cons_head + n;
>>> -               success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
>>> -                                             cons_next);
>>> -       } while (odp_unlikely(success == 0));
>>> +               success = odp_atomic32_cmp_xchg_weak(&r->cons.head,
>>> +                                                    &cons_head,
>>> +                                                    cons_next,
>>> +                                                    ODP_MEMORDER_RLX);
>>> +       } while (odp_unlikely(!success));
>>>
>>>         /* copy in table */
>>>         DEQUEUE_PTRS();
>>> -       odp_mem_barrier();
>>>
>>>         /*
>>>          * If there are other dequeues in progress that preceded us,
>>>          * we need to wait for them to complete
>>>          */
>>> -       while (odp_unlikely(r->cons.tail != cons_head))
>>> +       while (odp_unlikely(odp_atomic32_load(&r->cons.tail,
>>> +                                             ODP_MEMORDER_RLX) !=
>>> cons_head))
>>>                 odp_spin();
>>>
>>> -       r->cons.tail = cons_next;
>>> +       odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
>>>
>>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>>  }
>>> @@ -409,8 +412,8 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void
>>> **obj_table,
>>>         unsigned i;
>>>         uint32_t mask = r->prod.mask;
>>>
>>> -       cons_head = r->cons.head;
>>> -       prod_tail = r->prod.tail;
>>> +       cons_head = odp_atomic32_load(&r->cons.head, ODP_MEMORDER_RLX);
>>> +       prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_ACQ);
>>>         /* The subtraction is done between two unsigned 32bits value
>>>          * (the result is always modulo 32 bits even if we have
>>>          * cons_head > prod_tail). So 'entries' is always between 0
>>> @@ -429,13 +432,12 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void
>>> **obj_table,
>>>         }
>>>
>>>         cons_next = cons_head + n;
>>> -       r->cons.head = cons_next;
>>> +       odp_atomic32_store(&r->cons.head, cons_next, ODP_MEMORDER_RLX);
>>>
>>>         /* copy in table */
>>>         DEQUEUE_PTRS();
>>> -       odp_mem_barrier();
>>>
>>> -       r->cons.tail = cons_next;
>>> +       odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
>>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>>  }
>>>
>>> @@ -482,8 +484,8 @@ int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void
>>> **obj_table, unsigned n)
>>>   */
>>>  int odph_ring_full(const odph_ring_t *r)
>>>  {
>>> -       uint32_t prod_tail = r->prod.tail;
>>> -       uint32_t cons_tail = r->cons.tail;
>>> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
>>> ODP_MEMORDER_RLX);
>>> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
>>> ODP_MEMORDER_RLX);
>>>         return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
>>>  }
>>>
>>> @@ -492,8 +494,8 @@ int odph_ring_full(const odph_ring_t *r)
>>>   */
>>>  int odph_ring_empty(const odph_ring_t *r)
>>>  {
>>> -       uint32_t prod_tail = r->prod.tail;
>>> -       uint32_t cons_tail = r->cons.tail;
>>> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
>>> ODP_MEMORDER_RLX);
>>> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
>>> ODP_MEMORDER_RLX);
>>>         return !!(cons_tail == prod_tail);
>>>  }
>>>
>>> @@ -502,8 +504,8 @@ int odph_ring_empty(const odph_ring_t *r)
>>>   */
>>>  unsigned odph_ring_count(const odph_ring_t *r)
>>>  {
>>> -       uint32_t prod_tail = r->prod.tail;
>>> -       uint32_t cons_tail = r->cons.tail;
>>> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
>>> ODP_MEMORDER_RLX);
>>> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
>>> ODP_MEMORDER_RLX);
>>>         return (prod_tail - cons_tail) & r->prod.mask;
>>>  }
>>>
>>> @@ -512,8 +514,8 @@ unsigned odph_ring_count(const odph_ring_t *r)
>>>   */
>>>  unsigned odph_ring_free_count(const odph_ring_t *r)
>>>  {
>>> -       uint32_t prod_tail = r->prod.tail;
>>> -       uint32_t cons_tail = r->cons.tail;
>>> +       uint32_t prod_tail = odp_atomic32_load(&r->prod.tail,
>>> ODP_MEMORDER_RLX);
>>> +       uint32_t cons_tail = odp_atomic32_load(&r->cons.tail,
>>> ODP_MEMORDER_RLX);
>>>         return (cons_tail - prod_tail - 1) & r->prod.mask;
>>>  }
>>>
>>> @@ -523,10 +525,14 @@ void odph_ring_dump(const odph_ring_t *r)
>>>         ODP_DBG("ring <%s>@%p\n", r->name, r);
>>>         ODP_DBG("  flags=%x\n", r->flags);
>>>         ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
>>> -       ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
>>> -       ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
>>> -       ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
>>> -       ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
>>> +       ODP_DBG("  ct=%"PRIu32"\n", odp_atomic32_load(&r->cons.tail,
>>> +                                                     ODP_MEMORDER_RLX));
>>> +       ODP_DBG("  ch=%"PRIu32"\n", odp_atomic32_load(&r->cons.head,
>>> +                                                     ODP_MEMORDER_RLX));
>>> +       ODP_DBG("  pt=%"PRIu32"\n", odp_atomic32_load(&r->prod.tail,
>>> +                                                     ODP_MEMORDER_RLX));
>>> +       ODP_DBG("  ph=%"PRIu32"\n", odp_atomic32_load(&r->prod.head,
>>> +                                                     ODP_MEMORDER_RLX));
>>>         ODP_DBG("  used=%u\n", odph_ring_count(r));
>>>         ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
>>>         if (r->prod.watermark == r->prod.size)
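
The acquire/release pairing now in the ring follows the usual pattern: the
producer publishes entries with a release store of prod.tail and the consumer
synchronises with an acquire load of it. A distilled single-producer/
single-consumer sketch of just that pairing (struct and names invented,
fullness check omitted, only the odp_atomic32_* calls are from the patch):

#include <odp_atomic.h>

struct spsc_ring {
        uint32_t       mask;        /* size - 1, size is a power of two */
        odp_atomic32_t prod_tail;   /* written by the producer only */
        odp_atomic32_t cons_head;   /* written by the consumer only */
        void          *slots[0];
};

static void spsc_enq(struct spsc_ring *r, void *obj)
{
        uint32_t t = odp_atomic32_load(&r->prod_tail, ODP_MEMORDER_RLX);

        r->slots[t & r->mask] = obj;              /* 1. write the entry...    */
        odp_atomic32_store(&r->prod_tail, t + 1,  /* 2. ...then publish it;   */
                           ODP_MEMORDER_RLS);     /* release keeps 1 before 2 */
}

static void *spsc_deq(struct spsc_ring *r)
{
        uint32_t h = odp_atomic32_load(&r->cons_head, ODP_MEMORDER_RLX);
        void *obj;

        /* Acquire pairs with the producer's release store of prod_tail */
        while (odp_atomic32_load(&r->prod_tail, ODP_MEMORDER_ACQ) == h)
                ;                                 /* ring empty, spin */
        obj = r->slots[h & r->mask];              /* entry write is visible */
        odp_atomic32_store(&r->cons_head, h + 1, ODP_MEMORDER_RLX);
        return obj;
}
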
>>> diff --git a/platform/linux-generic/odp_rwlock.c
>>> b/platform/linux-generic/odp_rwlock.c
>>> index 11c8dd7..a5fae4d 100644
>>> --- a/platform/linux-generic/odp_rwlock.c
>>> +++ b/platform/linux-generic/odp_rwlock.c
>>> @@ -4,58 +4,64 @@
>>>   * SPDX-License-Identifier:     BSD-3-Clause
>>>   */
>>>
>>> +#include <stdbool.h>
>>>  #include <odp_atomic.h>
>>>  #include <odp_rwlock.h>
>>> -
>>>  #include <odp_spin_internal.h>
>>>
>>>  void odp_rwlock_init(odp_rwlock_t *rwlock)
>>>  {
>>> -       rwlock->cnt = 0;
>>> +       odp_atomic32_init(&rwlock->cnt, 0);
>>>  }
>>>
>>>  void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
>>>  {
>>> -       int32_t cnt;
>>> -       int  is_locked = 0;
>>> -
>>> -       while (is_locked == 0) {
>>> -               cnt = rwlock->cnt;
>>> -               /* waiting for read lock */
>>> -               if (cnt < 0) {
>>> +       bool gotit;
>>> +       uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
>>> +       do {
>>> +               /* Wait for any writer to release lock */
>>> +               while ((int32_t)cnt < 0) {
>>>                         odp_spin();
>>> -                       continue;
>>> +                       cnt = odp_atomic32_load(&rwlock->cnt,
>>> +                                               ODP_MEMORDER_RLX);
>>>                 }
>>> -               is_locked = odp_atomic_cmpset_u32(
>>> -                                       (volatile uint32_t
>>> *)&rwlock->cnt,
>>> -                                             cnt, cnt + 1);
>>> -       }
>>> +               /* Attempt to take another read lock */
>>> +               gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
>>> +                                                  &cnt, cnt + 1,
>>> +                                                  ODP_MEMORDER_RLX);
>>> +               /* If operation fails, 'cnt' will contain current value */
>>> +       } while (!gotit);
>>>  }
>>>
>>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
>>>  {
>>> -       odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>>> +       /* Release one read lock by subtracting 1 */
>>> +       odp_atomic32_dec(&rwlock->cnt, ODP_MEMORDER_RLS);
>>>  }
>>>
>>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
>>>  {
>>> -       int32_t cnt;
>>> -       int is_locked = 0;
>>> -
>>> -       while (is_locked == 0) {
>>> -               cnt = rwlock->cnt;
>>> -               /* lock aquired, wait */
>>> -               if (cnt != 0) {
>>> +       bool gotit;
>>> +       uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
>>> +       do {
>>> +               /* Wait for all lock holders to release lock */
>>> +               while (cnt != 0) {
>>> +                       /* Lock is busy */
>>>                         odp_spin();
>>> -                       continue;
>>> +                       cnt = odp_atomic32_load(&rwlock->cnt,
>>> +                                               ODP_MEMORDER_RLX);
>>>                 }
>>> -               is_locked = odp_atomic_cmpset_u32(
>>> -                                       (volatile uint32_t
>>> *)&rwlock->cnt,
>>> -                                             0, -1);
>>> -       }
>>> +               /* Attempt to take write lock */
>>> +               gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
>>> +                                                  &cnt,
>>> +                                                  (uint32_t)-1,
>>> +                                                  ODP_MEMORDER_RLX);
>>> +               /* If operation fails, 'cnt' will contain current value */
>>> +       } while (!gotit);
>>>  }
>>>
>>>  void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
>>>  {
>>> -       odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>>> +       /* Release the write lock by adding 1 */
>>> +       odp_atomic32_inc(&rwlock->cnt, ODP_MEMORDER_RLS);
>>>  }
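
The cmp_xchg_weak loop used above generalises well; as a sanity check of the
signature (pointer, &expected, desired, order - with 'expected' reloaded on
failure), a bounded-increment sketch with an invented function name:

#include <stdbool.h>
#include <odp_atomic.h>

/* Add one to *cnt unless it is already at 'limit'; returns false when full */
static bool bounded_inc(odp_atomic32_t *cnt, uint32_t limit)
{
        uint32_t old = odp_atomic32_load(cnt, ODP_MEMORDER_RLX);
        bool ok;

        do {
                if (old >= limit)
                        return false;
                /* On failure 'old' is refreshed with the current value and
                 * the limit is re-checked before the next attempt */
                ok = odp_atomic32_cmp_xchg_weak(cnt, &old, old + 1,
                                                ODP_MEMORDER_RLX);
        } while (!ok);
        return true;
}
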
>>> diff --git a/platform/linux-generic/odp_thread.c
>>> b/platform/linux-generic/odp_thread.c
>>> index b869b27..652d317 100644
>>> --- a/platform/linux-generic/odp_thread.c
>>> +++ b/platform/linux-generic/odp_thread.c
>>> @@ -11,7 +11,7 @@
>>>
>>>  #include <odp_thread.h>
>>>  #include <odp_internal.h>
>>> -#include <odp_atomic.h>
>>> +#include <odp_counter.h>
>>>  #include <odp_config.h>
>>>  #include <odp_debug.h>
>>>  #include <odp_shared_memory.h>
>>> @@ -31,7 +31,7 @@ typedef struct {
>>>
>>>  typedef struct {
>>>         thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
>>> -       odp_atomic_int_t num;
>>> +       odp_counter32_t   num;
>>>
>>>  } thread_globals_t;
>>>
>>> @@ -58,6 +58,7 @@ int odp_thread_init_global(void)
>>>                 return -1;
>>>
>>>         memset(thread_globals, 0, sizeof(thread_globals_t));
>>> +       odp_counter32_init(&thread_globals->num, 0);
>>>         return 0;
>>>  }
>>>
>>> @@ -67,7 +68,7 @@ static int thread_id(void)
>>>         int id;
>>>         int cpu;
>>>
>>> -       id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
>>> +       id = (int)odp_counter32_read_inc(&thread_globals->num);
>>>
>>>         if (id >= ODP_CONFIG_MAX_THREADS) {
>>>                 ODP_ERR("Too many threads\n");
>>> @@ -77,7 +78,7 @@ static int thread_id(void)
>>>         cpu = sched_getcpu();
>>>
>>>         if (cpu < 0) {
>>> -               ODP_ERR("getcpu failed\n");
>>> +               ODP_ERR("sched_getcpu failed\n");
>>>                 return -1;
>>>         }
>>>
>>> diff --git a/platform/linux-generic/odp_ticketlock.c
>>> b/platform/linux-generic/odp_ticketlock.c
>>> index be5b885..510aa9f 100644
>>> --- a/platform/linux-generic/odp_ticketlock.c
>>> +++ b/platform/linux-generic/odp_ticketlock.c
>>> @@ -6,15 +6,15 @@
>>>
>>>  #include <odp_ticketlock.h>
>>>  #include <odp_atomic.h>
>>> +#include <odp_counter.h>
>>>  #include <odp_sync.h>
>>>  #include <odp_spin_internal.h>
>>>
>>>
>>>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>>>  {
>>> -       ticketlock->next_ticket = 0;
>>> -       ticketlock->cur_ticket  = 0;
>>> -       odp_sync_stores();
>>> +       odp_counter32_init(&ticketlock->next_ticket, 0);
>>> +       odp_atomic32_init(&ticketlock->cur_ticket, 0);
>>>  }
>>>
>>>
>>> @@ -22,30 +22,15 @@ void odp_ticketlock_lock(odp_ticketlock_t
>>> *ticketlock)
>>>  {
>>>         uint32_t ticket;
>>>
>>> -       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
>>> +       ticket = odp_counter32_read_inc(&ticketlock->next_ticket);
>>>
>>> -       while (ticket != ticketlock->cur_ticket)
>>> +       while (ticket != odp_atomic32_load(&ticketlock->cur_ticket,
>>> +                                          ODP_MEMORDER_ACQ))
>>>                 odp_spin();
>>> -
>>> -       odp_mem_barrier();
>>>  }
>>>
>>>
>>>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>>>  {
>>> -       odp_sync_stores();
>>> -
>>> -       ticketlock->cur_ticket++;
>>> -
>>> -#if defined __OCTEON__
>>> -       odp_sync_stores();
>>> -#else
>>> -       odp_mem_barrier();
>>> -#endif
>>> -}
>>> -
>>> -
>>> -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
>>> -{
>>> -       return ticketlock->cur_ticket != ticketlock->next_ticket;
>>> +       odp_atomic32_inc(&ticketlock->cur_ticket, ODP_MEMORDER_RLS);
>>>  }
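
With the ACQ load in lock() and the RLS increment in unlock(), the critical
section is bracketed the same way a conventional mutex would bracket it. A
minimal usage sketch (the shared counter is invented for illustration):

#include <odp_ticketlock.h>

static odp_ticketlock_t tlock;   /* odp_ticketlock_init(&tlock) called once */
static uint64_t shared_counter;

static void bump(void)
{
        odp_ticketlock_lock(&tlock);     /* acquire: nothing moves above this */
        shared_counter++;                /* protected, FIFO-fair across threads */
        odp_ticketlock_unlock(&tlock);   /* release: visible to the next ticket */
}
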
>>> diff --git a/platform/linux-generic/odp_timer.c
>>> b/platform/linux-generic/odp_timer.c
>>> index 313c713..fffaa44 100644
>>> --- a/platform/linux-generic/odp_timer.c
>>> +++ b/platform/linux-generic/odp_timer.c
>>> @@ -10,6 +10,7 @@
>>>  #include <odp_buffer_pool_internal.h>
>>>  #include <odp_internal.h>
>>>  #include <odp_atomic.h>
>>> +#include <odp_counter.h>
>>>  #include <odp_spinlock.h>
>>>  #include <odp_sync.h>
>>>  #include <odp_debug.h>
>>> @@ -32,8 +33,8 @@ typedef struct {
>>>
>>>  typedef struct {
>>>         int               allocated;
>>> -       volatile int      active;
>>> -       volatile uint64_t cur_tick;
>>> +       odp_atomic32_t    active;
>>> +       odp_counter64_t   cur_tick;
>>>         timer_t           timerid;
>>>         odp_timer_t       timer_hdl;
>>>         odp_buffer_pool_t pool;
>>> @@ -150,16 +151,16 @@ static void notify_function(union sigval sigval)
>>>
>>>         timer = sigval.sival_ptr;
>>>
>>> -       if (timer->active == 0) {
>>> +       if (odp_atomic32_load(&timer->active, ODP_MEMORDER_RLX) == 0) {
>>>                 ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
>>>                 return;
>>>         }
>>>
>>>         /* ODP_DBG("Tick\n"); */
>>>
>>> -       cur_tick = timer->cur_tick++;
>>> -
>>> -       odp_sync_stores();
>>> +       /* Increment and read are not atomic but we are the only writer */
>>> +       odp_counter64_inc(&timer->cur_tick);
>>> +       cur_tick = odp_counter64_read(&timer->cur_tick);
>>>
>>>         tick = &timer->tick[cur_tick % MAX_TICKS];
>>>
>>> @@ -308,6 +309,8 @@ odp_timer_t odp_timer_create(const char *name,
>>> odp_buffer_pool_t pool,
>>>
>>>         timer_hdl = id + 1;
>>>
>>> +       odp_atomic32_init(&timer->active, 0);
>>> +       odp_counter64_init(&timer->cur_tick, 0);
>>>         timer->timer_hdl     = timer_hdl;
>>>         timer->pool          = pool;
>>>         timer->resolution_ns = resolution_ns;
>>> @@ -318,8 +321,7 @@ odp_timer_t odp_timer_create(const char *name,
>>> odp_buffer_pool_t pool,
>>>                 timer->tick[i].list = NULL;
>>>         }
>>>
>>> -       timer->active = 1;
>>> -       odp_sync_stores();
>>> +       odp_atomic32_store(&timer->active, 1, ODP_MEMORDER_RLS);
>>>
>>>         timer_start(timer);
>>>
>>> @@ -340,7 +342,7 @@ odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t
>>> timer_hdl, uint64_t tmo_tick,
>>>         id = (int)timer_hdl - 1;
>>>         timer = &odp_timer.timer[id];
>>>
>>> -       cur_tick = timer->cur_tick;
>>> +       cur_tick = odp_counter64_read(&timer->cur_tick);
>>>         if (tmo_tick <= cur_tick) {
>>>                 ODP_DBG("timeout too close\n");
>>>                 return ODP_TIMER_TMO_INVALID;
>>> @@ -416,7 +418,7 @@ uint64_t odp_timer_current_tick(odp_timer_t
>>> timer_hdl)
>>>         uint32_t id;
>>>
>>>         id = timer_hdl - 1;
>>> -       return odp_timer.timer[id].cur_tick;
>>> +       return odp_counter64_read(&odp_timer.timer[id].cur_tick);
>>>  }
>>>
>>>  odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
>>> diff --git a/test/api_test/Makefile.am b/test/api_test/Makefile.am
>>> index 5104454..478aa6c 100644
>>> --- a/test/api_test/Makefile.am
>>> +++ b/test/api_test/Makefile.am
>>> @@ -1,12 +1,12 @@
>>>  include $(top_srcdir)/test/Makefile.inc
>>>
>>> -bin_PROGRAMS = odp_atomic odp_shm odp_ring odp_timer_ping
>>> -odp_atomic_LDFLAGS = $(AM_LDFLAGS) -static
>>> +bin_PROGRAMS = odp_counter odp_shm odp_ring odp_timer_ping
>>> +odp_counter_LDFLAGS = $(AM_LDFLAGS) -static
>>>  odp_shm_LDFLAGS = $(AM_LDFLAGS) -static
>>>  odp_ring_LDFLAGS = $(AM_LDFLAGS) -static
>>>  odp_timer_ping_LDFLAGS = $(AM_LDFLAGS) -static
>>>
>>> -dist_odp_atomic_SOURCES = odp_atomic_test.c odp_common.c
>>> +dist_odp_counter_SOURCES = odp_counter_test.c odp_common.c
>>>  dist_odp_shm_SOURCES = odp_shm_test.c odp_common.c
>>>  dist_odp_ring_SOURCES = odp_ring_test.c odp_common.c
>>>  dist_odp_timer_ping_SOURCES = odp_timer_ping.c odp_common.c
>>> diff --git a/test/api_test/odp_atomic_test.c
>>> b/test/api_test/odp_atomic_test.c
>>> deleted file mode 100644
>>> index 9019d4f..0000000
>>> --- a/test/api_test/odp_atomic_test.c
>>> +++ /dev/null
>>> @@ -1,362 +0,0 @@
>>> -/* Copyright (c) 2013, Linaro Limited
>>> - * All rights reserved.
>>> - *
>>> - * SPDX-License-Identifier:     BSD-3-Clause
>>> - */
>>> -
>>> -#include <string.h>
>>> -#include <sys/time.h>
>>> -#include <odp_debug.h>
>>> -#include <odp_common.h>
>>> -#include <odp_atomic_test.h>
>>> -
>>> -static odp_atomic_int_t a32;
>>> -static odp_atomic_u32_t a32u;
>>> -static odp_atomic_u64_t a64u;
>>> -
>>> -static odp_atomic_int_t numthrds;
>>> -
>>> -static const char * const test_name[] = {
>>> -       "dummy",
>>> -       "test atomic basic ops add/sub/inc/dec",
>>> -       "test atomic inc/dec of signed word",
>>> -       "test atomic add/sub of signed word",
>>> -       "test atomic inc/dec of unsigned word",
>>> -       "test atomic add/sub of unsigned word",
>>> -       "test atomic inc/dec of unsigned double word",
>>> -       "test atomic add/sub of unsigned double word"
>>> -};
>>> -
>>> -static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
>>> -
>>> -static void usage(void)
>>> -{
>>> -       printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
>>> -              "\t<testcase> is\n"
>>> -              "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
>>> -              "\t\t2 - Test inc dec of signed word\n"
>>> -              "\t\t3 - Test add sub of signed word\n"
>>> -              "\t\t4 - Test inc dec of unsigned word\n"
>>> -              "\t\t5 - Test add sub of unsigned word\n"
>>> -              "\t\t6 - Test inc dec of double word\n"
>>> -              "\t\t7 - Test add sub of double word\n"
>>> -              "\t<num of pthread> is optional\n"
>>> -              "\t\t<1 - 31> - no of pthreads to start\n"
>>> -              "\t\tif user doesn't specify this option, then\n"
>>> -              "\t\tno of pthreads created is equivalent to no of
>>> cores\n"
>>> -              "\t\tavailable in the system\n"
>>> -              "\tExample usage:\n"
>>> -              "\t\t./odp_atomic -t 2\n"
>>> -              "\t\t./odp_atomic -t 3 -n 12\n");
>>> -}
>>> -
>>> -void test_atomic_inc_32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < CNT; i++)
>>> -               odp_atomic_inc_int(&a32);
>>> -}
>>> -
>>> -void test_atomic_inc_u32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < CNT; i++)
>>> -               odp_atomic_inc_u32(&a32u);
>>> -}
>>> -
>>> -void test_atomic_inc_64(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < CNT; i++)
>>> -               odp_atomic_inc_u64(&a64u);
>>> -}
>>> -
>>> -void test_atomic_dec_32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < CNT; i++)
>>> -               odp_atomic_dec_int(&a32);
>>> -}
>>> -
>>> -void test_atomic_dec_u32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < CNT; i++)
>>> -               odp_atomic_dec_u32(&a32u);
>>> -}
>>> -
>>> -void test_atomic_dec_64(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < CNT; i++)
>>> -               odp_atomic_dec_u64(&a64u);
>>> -}
>>> -
>>> -void test_atomic_add_32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
>>> -}
>>> -
>>> -void test_atomic_add_u32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
>>> -}
>>> -
>>> -void test_atomic_add_64(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
>>> -}
>>> -
>>> -void test_atomic_sub_32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
>>> -}
>>> -
>>> -void test_atomic_sub_u32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
>>> -}
>>> -
>>> -void test_atomic_sub_64(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
>>> -}
>>> -
>>> -void test_atomic_inc_dec_32(void)
>>> -{
>>> -       test_atomic_inc_32();
>>> -       test_atomic_dec_32();
>>> -}
>>> -
>>> -void test_atomic_add_sub_32(void)
>>> -{
>>> -       test_atomic_add_32();
>>> -       test_atomic_sub_32();
>>> -}
>>> -
>>> -void test_atomic_inc_dec_u32(void)
>>> -{
>>> -       test_atomic_inc_u32();
>>> -       test_atomic_dec_u32();
>>> -}
>>> -
>>> -void test_atomic_add_sub_u32(void)
>>> -{
>>> -       test_atomic_add_u32();
>>> -       test_atomic_sub_u32();
>>> -}
>>> -
>>> -void test_atomic_inc_dec_64(void)
>>> -{
>>> -       test_atomic_inc_64();
>>> -       test_atomic_dec_64();
>>> -}
>>> -
>>> -void test_atomic_add_sub_64(void)
>>> -{
>>> -       test_atomic_add_64();
>>> -       test_atomic_sub_64();
>>> -}
>>> -
>>> -/**
>>> - * Test basic atomic operation like
>>> - * add/sub/increment/decrement operation.
>>> - */
>>> -void test_atomic_basic(void)
>>> -{
>>> -       test_atomic_inc_32();
>>> -       test_atomic_dec_32();
>>> -       test_atomic_add_32();
>>> -       test_atomic_sub_32();
>>> -
>>> -       test_atomic_inc_u32();
>>> -       test_atomic_dec_u32();
>>> -       test_atomic_add_u32();
>>> -       test_atomic_sub_u32();
>>> -
>>> -       test_atomic_inc_64();
>>> -       test_atomic_dec_64();
>>> -       test_atomic_add_64();
>>> -       test_atomic_sub_64();
>>> -}
>>> -
>>> -void test_atomic_init(void)
>>> -{
>>> -       odp_atomic_init_int(&a32);
>>> -       odp_atomic_init_u32(&a32u);
>>> -       odp_atomic_init_u64(&a64u);
>>> -}
>>> -
>>> -void test_atomic_store(void)
>>> -{
>>> -       odp_atomic_store_int(&a32, S32_INIT_VAL);
>>> -       odp_atomic_store_u32(&a32u, U32_INIT_VAL);
>>> -       odp_atomic_store_u64(&a64u, U64_INIT_VAL);
>>> -}
>>> -
>>> -int test_atomic_validate(void)
>>> -{
>>> -       if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
>>> -               ODP_ERR("Atomic signed 32 usual functions failed\n");
>>> -               return -1;
>>> -       }
>>> -
>>> -       if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
>>> -               ODP_ERR("Atomic u32 usual functions failed\n");
>>> -               return -1;
>>> -       }
>>> -
>>> -       if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
>>> -               ODP_ERR("Atomic u64 usual functions failed\n");
>>> -               return -1;
>>> -       }
>>> -
>>> -       return 0;
>>> -}
>>> -
>>> -static void *run_thread(void *arg)
>>> -{
>>> -       pthrd_arg *parg = (pthrd_arg *)arg;
>>> -       int thr;
>>> -
>>> -       thr = odp_thread_id();
>>> -
>>> -       ODP_DBG("Thread %i starts\n", thr);
>>> -
>>> -       odp_atomic_inc_int(&numthrds);
>>> -
>>> -       /* Wait here until all pthreads are created */
>>> -       while (*(volatile int *)&numthrds < parg->numthrds)
>>> -               ;
>>> -
>>> -       gettimeofday(&tv0[thr], NULL);
>>> -
>>> -       switch (parg->testcase) {
>>> -       case TEST_MIX:
>>> -               test_atomic_basic();
>>> -               break;
>>> -       case TEST_INC_DEC_S32:
>>> -               test_atomic_inc_dec_32();
>>> -               break;
>>> -       case TEST_ADD_SUB_S32:
>>> -               test_atomic_add_sub_32();
>>> -               break;
>>> -       case TEST_INC_DEC_U32:
>>> -               test_atomic_inc_dec_u32();
>>> -               break;
>>> -       case TEST_ADD_SUB_U32:
>>> -               test_atomic_add_sub_u32();
>>> -               break;
>>> -       case TEST_INC_DEC_64:
>>> -               test_atomic_inc_dec_64();
>>> -               break;
>>> -       case TEST_ADD_SUB_64:
>>> -               test_atomic_add_sub_64();
>>> -               break;
>>> -       }
>>> -       gettimeofday(&tv1[thr], NULL);
>>> -       fflush(NULL);
>>> -
>>> -       printf("Time taken in thread %02d to complete op is %lld usec\n", thr,
>>> -              (tv1[thr].tv_sec - tv0[thr].tv_sec) * 1000000ULL +
>>> -              (tv1[thr].tv_usec - tv0[thr].tv_usec));
>>> -
>>> -       return parg;
>>> -}
>>> -
>>> -int main(int argc, char *argv[])
>>> -{
>>> -       pthrd_arg thrdarg;
>>> -       int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
>>> -       char c;
>>> -       int result;
>>> -
>>> -       if (argc == 1 || argc % 2 == 0) {
>>> -               usage();
>>> -               goto err_exit;
>>> -       }
>>> -       if (odp_test_global_init() != 0)
>>> -               goto err_exit;
>>> -       odp_print_system_info();
>>> -
>>> -       while (cnt != 0) {
>>> -               sscanf(argv[++i], "-%c", &c);
>>> -               switch (c) {
>>> -               case 't':
>>> -                       sscanf(argv[++i], "%d", &test_type);
>>> -                       break;
>>> -               case 'n':
>>> -                       sscanf(argv[++i], "%d", &pthrdnum);
>>> -                       break;
>>> -               default:
>>> -                       ODP_ERR("Invalid option %c\n", c);
>>> -                       usage();
>>> -                       goto err_exit;
>>> -               }
>>> -               if (test_type < TEST_MIX || test_type > TEST_MAX ||
>>> -                   pthrdnum > odp_sys_core_count()) {
>>> -                       usage();
>>> -                       goto err_exit;
>>> -               }
>>> -               cnt -= 2;
>>> -       }
>>> -       if (pthrdnum == 0)
>>> -               pthrdnum = odp_sys_core_count();
>>> -
>>> -       odp_atomic_init_int(&numthrds);
>>> -       test_atomic_init();
>>> -       test_atomic_store();
>>> -
>>> -       memset(&thrdarg, 0, sizeof(pthrd_arg));
>>> -       thrdarg.testcase = test_type;
>>> -       thrdarg.numthrds = pthrdnum;
>>> -
>>> -       if ((test_type > 0) && (test_type < TEST_MAX)) {
>>> -               printf("%s\n", test_name[test_type]);
>>> -       } else {
>>> -               ODP_ERR("Invalid test case [%d]\n", test_type);
>>> -               usage();
>>> -               goto err_exit;
>>> -       }
>>> -       odp_test_thread_create(run_thread, &thrdarg);
>>> -
>>> -       odp_test_thread_exit(&thrdarg);
>>> -
>>> -       result = test_atomic_validate();
>>> -
>>> -       if (result == 0) {
>>> -               printf("%s_%d_%d Result:pass\n",
>>> -                      test_name[test_type], test_type, pthrdnum);
>>> -       } else {
>>> -               printf("%s_%d_%d Result:fail\n",
>>> -                      test_name[test_type], test_type, pthrdnum);
>>> -       }
>>> -       return 0;
>>> -
>>> -err_exit:
>>> -       return -1;
>>> -}
>>> diff --git a/test/api_test/odp_atomic_test.h
>>> b/test/api_test/odp_atomic_test.h
>>> deleted file mode 100644
>>> index 7814da5..0000000
>>> --- a/test/api_test/odp_atomic_test.h
>>> +++ /dev/null
>>> @@ -1,60 +0,0 @@
>>> -/* Copyright (c) 2013, Linaro Limited
>>> - * All rights reserved.
>>> - *
>>> - * SPDX-License-Identifier:     BSD-3-Clause
>>> - */
>>> -
>>> -#ifndef ODP_ATOMIC_TEST_H_
>>> -#define ODP_ATOMIC_TEST_H_
>>> -
>>> -#include <odp.h>
>>> -#include <odph_linux.h>
>>> -
>>> -/**
>>> - * add_sub_cnt could be any valid value
>>> - * so to excercise explicit atomic_add/sub
>>> - * ops. For now using 5..
>>> - */
>>> -#define ADD_SUB_CNT    5
>>> -
>>> -#define        CNT 500000
>>> -#define        S32_INIT_VAL    (1UL << 10)
>>> -#define        U32_INIT_VAL    (1UL << 10)
>>> -#define        U64_INIT_VAL    (1ULL << 33)
>>> -
>>> -typedef enum {
>>> -       TEST_MIX = 1, /* Must be first test case num */
>>> -       TEST_INC_DEC_S32,
>>> -       TEST_ADD_SUB_S32,
>>> -       TEST_INC_DEC_U32,
>>> -       TEST_ADD_SUB_U32,
>>> -       TEST_INC_DEC_64,
>>> -       TEST_ADD_SUB_64,
>>> -       TEST_MAX,
>>> -} odp_test_atomic_t;
>>> -
>>> -
>>> -void test_atomic_inc_dec_32(void);
>>> -void test_atomic_add_sub_32(void);
>>> -void test_atomic_inc_dec_u32(void);
>>> -void test_atomic_add_sub_u32(void);
>>> -void test_atomic_inc_dec_64(void);
>>> -void test_atomic_add_sub_64(void);
>>> -void test_atomic_inc_32(void);
>>> -void test_atomic_dec_32(void);
>>> -void test_atomic_add_32(void);
>>> -void test_atomic_sub_32(void);
>>> -void test_atomic_inc_u32(void);
>>> -void test_atomic_dec_u32(void);
>>> -void test_atomic_add_u32(void);
>>> -void test_atomic_sub_u32(void);
>>> -void test_atomic_inc_64(void);
>>> -void test_atomic_dec_64(void);
>>> -void test_atomic_add_64(void);
>>> -void test_atomic_sub_64(void);
>>> -void test_atomic_init(void);
>>> -void test_atomic_basic(void);
>>> -void test_atomic_store(void);
>>> -int test_atomic_validate(void);
>>> -
>>> -#endif /* ODP_ATOMIC_TEST_H_ */
>>> diff --git a/test/api_test/odp_common.c b/test/api_test/odp_common.c
>>> index ed1fc97..198fe8f 100644
>>> --- a/test/api_test/odp_common.c
>>> +++ b/test/api_test/odp_common.c
>>> @@ -14,7 +14,6 @@
>>>  #include <odp.h>
>>>  #include <odph_linux.h>
>>>  #include <odp_common.h>
>>> -#include <odp_atomic_test.h>
>>>  #include <odp_shm_test.h>
>>>
>>>
>>> diff --git a/test/api_test/odp_counter_test.c
>>> b/test/api_test/odp_counter_test.c
>>> new file mode 100644
>>> index 0000000..c72328e
>>> --- /dev/null
>>> +++ b/test/api_test/odp_counter_test.c
>>> @@ -0,0 +1,361 @@
>>> +/* Copyright (c) 2013, Linaro Limited
>>> + * All rights reserved.
>>> + *
>>> + * SPDX-License-Identifier:     BSD-3-Clause
>>> + */
>>> +
>>> +#include <string.h>
>>> +#include <sys/time.h>
>>> +#include <odp.h>
>>> +#include <odp_debug.h>
>>> +#include <odp_common.h>
>>> +#include <odph_linux.h>
>>> +
>>> +/**
>>> + * add_sub_cnt could be any valid value
>>> + * so to excercise explicit atomic_add/sub
>>> + * ops. For now using 5..
>>> + */
>>> +#define ADD_SUB_CNT    5
>>> +
>>> +#define        CNT 500000
>>> +#define        U32_INIT_VAL    (1UL << 10)
>>> +#define        U64_INIT_VAL    (1ULL << 33)
>>> +
>>> +typedef enum {
>>> +       TEST_MIX = 1, /* Must be first test case num */
>>> +       TEST_INC_DEC_U32 = 2,
>>> +       TEST_ADD_SUB_U32 = 3,
>>> +       TEST_INC_DEC_64 = 4,
>>> +       TEST_ADD_SUB_64 = 5,
>>> +       TEST_MAX,
>>> +} odp_test_counter_t;
>>> +
>>> +
>>> +static uint32_t test_counter_inc_dec_u32(void);
>>> +static uint32_t test_counter_add_sub_u32(void);
>>> +static uint32_t test_counter_inc_dec_64(void);
>>> +static uint32_t test_counter_add_sub_64(void);
>>> +static uint32_t test_counter_inc_u32(void);
>>> +static uint32_t test_counter_dec_u32(void);
>>> +static uint32_t test_counter_add_u32(void);
>>> +static uint32_t test_counter_sub_u32(void);
>>> +static uint32_t test_counter_inc_64(void);
>>> +static uint32_t test_counter_dec_64(void);
>>> +static uint32_t test_counter_add_64(void);
>>> +static uint32_t test_counter_sub_64(void);
>>> +static void test_counter_init(void);
>>> +static uint32_t test_counter_basic(void);
>>> +static void test_counter_write(void);
>>> +static int test_counter_validate(void);
>>> +
>>> +static odp_counter32_t a32u;
>>> +static odp_counter64_t a64u;
>>> +
>>> +static odp_barrier_t barrier;
>>> +
>>> +static const char * const test_name[] = {
>>> +       "dummy",
>>> +       "test atomic counter basic ops add/sub/inc/dec",
>>> +       "test atomic inc/dec of 32-bit counter",
>>> +       "test atomic add/sub of 32-bit counter",
>>> +       "test atomic inc/dec of 64-bit counter",
>>> +       "test atomic add/sub of 64-bit counter"
>>> +};
>>> +
>>> +static uint64_t accops[MAX_WORKERS];
>>> +
>>> +static void usage(void)
>>> +{
>>> +       printf("\n./odp_counter -t <testcase> -n <num of threads>\n\n"
>>> +              "\t<testcase> is\n"
>>> +              "\t\t1 - Test mix (inc/dec/add/sub on 32- and 64-bit counters)\n"
>>> +              "\t\t2 - Test inc/dec of 32-bit counter\n"
>>> +              "\t\t3 - Test add/sub of 32-bit counter\n"
>>> +              "\t\t4 - Test inc/dec of 64-bit counter\n"
>>> +              "\t\t5 - Test add/sub of 64-bit counter\n"
>>> +              "\t<num of thread> is optional\n"
>>> +              "\t\t<1 - 31> - no of threads to start\n"
>>> +              "\t\tif user doesn't specify this option, then\n"
>>> +              "\t\tno of threads created is equivalent to no of cores\n"
>>> +              "\t\tavailable in the system\n"
>>> +              "\tExample usage:\n"
>>> +              "\t\t./odp_counter -t 2\n"
>>> +              "\t\t./odp_counter -t 3 -n 12\n");
>>> +}
>>> +
>>> +static uint32_t test_counter_inc_u32(void)
>>> +{
>>> +       int i;
>>> +
>>> +       for (i = 0; i < CNT; i++)
>>> +               odp_counter32_inc(&a32u);
>>> +       return i;
>>> +}
>>> +
>>> +static uint32_t test_counter_inc_64(void)
>>> +{
>>> +       int i;
>>> +
>>> +       for (i = 0; i < CNT; i++)
>>> +               odp_counter64_inc(&a64u);
>>> +       return i;
>>> +}
>>> +
>>> +static uint32_t test_counter_dec_u32(void)
>>> +{
>>> +       int i;
>>> +
>>> +       for (i = 0; i < CNT; i++)
>>> +               odp_counter32_add(&a32u, (uint32_t)-1);
>>> +       return i;
>>> +}
>>> +
>>> +static uint32_t test_counter_dec_64(void)
>>> +{
>>> +       int i;
>>> +
>>> +       for (i = 0; i < CNT; i++)
>>> +               odp_counter64_add(&a64u, (uint64_t)-1);
>>> +       return i;
>>> +}
>>> +
>>> +static uint32_t test_counter_add_u32(void)
>>> +{
>>> +       int i;
>>> +
>>> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> +               odp_counter32_add(&a32u, ADD_SUB_CNT);
>>> +       return i;
>>> +}
>>> +
>>> +static uint32_t test_counter_add_64(void)
>>> +{
>>> +       int i;
>>> +
>>> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> +               odp_counter64_add(&a64u, ADD_SUB_CNT);
>>> +       return i;
>>> +}
>>> +
>>> +static uint32_t test_counter_sub_u32(void)
>>> +{
>>> +       int i;
>>> +
>>> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> +               odp_counter32_add(&a32u, -ADD_SUB_CNT);
>>> +       return i;
>>> +}
>>> +
>>> +static uint32_t test_counter_sub_64(void)
>>> +{
>>> +       int i;
>>> +
>>> +       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> +               odp_counter64_add(&a64u, -ADD_SUB_CNT);
>>> +       return i;
>>> +}
>>> +
>>> +static uint32_t test_counter_inc_dec_u32(void)
>>> +{
>>> +       uint32_t nops = 0;
>>> +       nops += test_counter_inc_u32();
>>> +       nops += test_counter_dec_u32();
>>> +       return nops;
>>> +}
>>> +
>>> +static uint32_t test_counter_add_sub_u32(void)
>>> +{
>>> +       uint32_t nops = 0;
>>> +       nops += test_counter_add_u32();
>>> +       nops += test_counter_sub_u32();
>>> +       return nops;
>>> +}
>>> +
>>> +static uint32_t test_counter_inc_dec_64(void)
>>> +{
>>> +       uint32_t nops = 0;
>>> +       nops += test_counter_inc_64();
>>> +       nops += test_counter_dec_64();
>>> +       return nops;
>>> +}
>>> +
>>> +static uint32_t test_counter_add_sub_64(void)
>>> +{
>>> +       uint32_t nops = 0;
>>> +       nops += test_counter_add_64();
>>> +       nops += test_counter_sub_64();
>>> +       return nops;
>>> +}
>>> +
>>> +/**
>>> + * Test basic counter operation like
>>> + * add/sub/increment/decrement operation.
>>> + */
>>> +static uint32_t test_counter_basic(void)
>>> +{
>>> +       uint32_t nops = 0;
>>> +       nops += test_counter_inc_u32();
>>> +       nops += test_counter_dec_u32();
>>> +       nops += test_counter_add_u32();
>>> +       nops += test_counter_sub_u32();
>>> +
>>> +       nops += test_counter_inc_64();
>>> +       nops += test_counter_dec_64();
>>> +       nops += test_counter_add_64();
>>> +       nops += test_counter_sub_64();
>>> +
>>> +       return nops;
>>> +}
>>> +
>>> +static void test_counter_init(void)
>>> +{
>>> +       odp_counter32_init(&a32u, 0);
>>> +       odp_counter64_init(&a64u, 0);
>>> +}
>>> +
>>> +static void test_counter_write(void)
>>> +{
>>> +       odp_counter32_write(&a32u, U32_INIT_VAL);
>>> +       odp_counter64_write(&a64u, U64_INIT_VAL);
>>> +}
>>> +
>>> +static int test_counter_validate(void)
>>> +{
>>> +       if (odp_counter32_read(&a32u) != U32_INIT_VAL) {
>>> +               ODP_ERR("Atomic u32 usual functions failed\n");
>>> +               return -1;
>>> +       }
>>> +
>>> +       if (odp_counter64_read(&a64u) != U64_INIT_VAL) {
>>> +               ODP_ERR("Atomic u64 usual functions failed\n");
>>> +               return -1;
>>> +       }
>>> +
>>> +       return 0;
>>> +}
>>> +
>>> +static void *run_thread(void *arg)
>>> +{
>>> +       pthrd_arg *parg = (pthrd_arg *)arg;
>>> +       int thr;
>>> +       uint64_t nops = 0;
>>> +       struct timeval tv0, tv1;
>>> +
>>> +       thr = odp_thread_id();
>>> +
>>> +       ODP_DBG("Thread %i starts\n", thr);
>>> +
>>> +       /* Wait here until all threads have arrived */
>>> +       /* Use multiple barriers to verify that it handles wrap around and
>>> +        * has no race conditions which could be exposed when invoked back-
>>> +        * to-back */
>>> +       odp_barrier_sync(&barrier);
>>> +       odp_barrier_sync(&barrier);
>>> +       odp_barrier_sync(&barrier);
>>> +       odp_barrier_sync(&barrier);
>>> +
>>> +       gettimeofday(&tv0, NULL);
>>> +
>>> +       switch (parg->testcase) {
>>> +       case TEST_MIX:
>>> +               nops += test_counter_basic();
>>> +               break;
>>> +       case TEST_INC_DEC_U32:
>>> +               nops += test_counter_inc_dec_u32();
>>> +               break;
>>> +       case TEST_ADD_SUB_U32:
>>> +               nops += test_counter_add_sub_u32();
>>> +               break;
>>> +       case TEST_INC_DEC_64:
>>> +               nops += test_counter_inc_dec_64();
>>> +               break;
>>> +       case TEST_ADD_SUB_64:
>>> +               nops += test_counter_add_sub_64();
>>> +               break;
>>> +       }
>>> +       gettimeofday(&tv1, NULL);
>>> +       accops[thr] = nops;
>>> +       fflush(NULL);
>>> +
>>> +       uint64_t usecs = (tv1.tv_sec - tv0.tv_sec) * 1000000ULL +
>>> +                        tv1.tv_usec - tv0.tv_usec;
>>> +       printf("Time taken in thread %02d to complete %"PRIu64" op is "
>>> +              "%"PRIu64" usec, %"PRIu64" ns/op\n",
>>> +              thr, nops, usecs, 1000 * usecs / nops);
>>> +
>>> +       return parg;
>>> +}
>>> +
>>> +int main(int argc, char *argv[])
>>> +{
>>> +       pthrd_arg thrdarg;
>>> +       int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
>>> +       char c;
>>> +       int result;
>>> +
>>> +       if (argc == 1 || argc % 2 == 0) {
>>> +               usage();
>>> +               goto err_exit;
>>> +       }
>>> +       if (odp_test_global_init() != 0)
>>> +               goto err_exit;
>>> +       odp_print_system_info();
>>> +
>>> +       while (cnt != 0) {
>>> +               sscanf(argv[++i], "-%c", &c);
>>> +               switch (c) {
>>> +               case 't':
>>> +                       sscanf(argv[++i], "%d", &test_type);
>>> +                       break;
>>> +               case 'n':
>>> +                       sscanf(argv[++i], "%d", &pthrdnum);
>>> +                       break;
>>> +               default:
>>> +                       ODP_ERR("Invalid option %c\n", c);
>>> +                       usage();
>>> +                       goto err_exit;
>>> +               }
>>> +               if (test_type < TEST_MIX || test_type > TEST_MAX ||
>>> +                   pthrdnum > odp_sys_core_count()) {
>>> +                       usage();
>>> +                       goto err_exit;
>>> +               }
>>> +               cnt -= 2;
>>> +       }
>>> +       if (pthrdnum == 0)
>>> +               pthrdnum = odp_sys_core_count();
>>> +
>>> +       test_counter_init();
>>> +       test_counter_write();
>>> +
>>> +       memset(&thrdarg, 0, sizeof(pthrd_arg));
>>> +       thrdarg.testcase = test_type;
>>> +       thrdarg.numthrds = pthrdnum;
>>> +
>>> +       if ((test_type > 0) && (test_type < TEST_MAX)) {
>>> +               printf("%s\n", test_name[test_type]);
>>> +       } else {
>>> +               ODP_ERR("Invalid test case [%d]\n", test_type);
>>> +               usage();
>>> +               goto err_exit;
>>> +       }
>>> +       odp_barrier_init(&barrier, pthrdnum);
>>> +       odp_test_thread_create(run_thread, &thrdarg);
>>> +
>>> +       odp_test_thread_exit(&thrdarg);
>>> +
>>> +       result = test_counter_validate();
>>> +
>>> +       if (result == 0) {
>>> +               printf("%s_%d_%d Result:pass\n",
>>> +                      test_name[test_type], test_type, pthrdnum);
>>> +       } else {
>>> +               printf("%s_%d_%d Result:fail\n",
>>> +                      test_name[test_type], test_type, pthrdnum);
>>> +       }
>>> +       return 0;
>>> +
>>> +err_exit:
>>> +       return -1;
>>> +}
>>> --
>>> 1.9.1
>>>
>>>
>>
Ola Liljedahl Nov. 4, 2014, 2:33 p.m. UTC | #6
Possibly odp_atomic.h should then be internal, leaving odp_counter.h as the
only public API. The original odp_atomic.h is public, so I left it that way.

The counter API does not allow the user to specify any memory ordering;
relaxed memory order is always used, i.e. no ordering of other accesses is
guaranteed.
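
To illustrate what that means in practice, here is a minimal usage sketch of
the counter API (a hypothetical statistics example built only from the
odp_counter64_* calls visible in this patch, not code taken from it):

#include <stdio.h>
#include <inttypes.h>
#include <odp.h>

static odp_counter64_t pkts_rx;         /* shared packet counter */

static void stats_init(void)
{
        odp_counter64_init(&pkts_rx, 0);
}

/* Called concurrently from many worker threads; relaxed ordering is
 * sufficient because nothing else is synchronized through the counter. */
static void stats_packet_received(void)
{
        odp_counter64_inc(&pkts_rx);
}

static void stats_print(void)
{
        printf("rx packets: %" PRIu64 "\n", odp_counter64_read(&pkts_rx));
}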

Why does acquire/release not fit well with the far atomics? And what do you
mean specifically by "far atomics"? Just the counter updates like the ones
Cavium has?
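
To make the near/far distinction concrete, here is an illustrative
producer/consumer flag sketch (my example, not code from the patch) showing
where acquire/release ordering is needed, using the odp_atomic32_* calls and
ODP_MEMORDER_* orderings introduced in odp_atomic.h:

#include <stdio.h>
#include <stdint.h>
#include <odp.h>

static uint32_t payload;        /* ordinary shared data */
static odp_atomic32_t ready;    /* flag, set up with odp_atomic32_init(&ready, 0) */

static void producer(void)
{
        payload = 42;
        /* Release store: the write to 'payload' cannot be reordered after
         * this store, so a consumer that sees ready == 1 also sees the
         * payload. */
        odp_atomic32_store(&ready, 1, ODP_MEMORDER_RLS);
}

static void consumer(void)
{
        /* Acquire load: later reads cannot move before this load, pairing
         * with the release store in producer(). */
        while (odp_atomic32_load(&ready, ODP_MEMORDER_ACQ) == 0)
                ;               /* spin */
        printf("payload = %u\n", payload);
}

Statistics counters need none of this pairing, which is why odp_counter.h
only provides relaxed operations.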

As the Linux kernel atomics interface predates C11/C++11 atomics support, I
do not see it as a model to follow.
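
For comparison, the same flag pattern written directly against C11
<stdatomic.h>, which is the model the new odp_atomic.h mirrors
(ODP_MEMORDER_RLX/ACQ/RLS correspond to memory_order_relaxed/acquire/release):

#include <stdatomic.h>
#include <stdint.h>

static uint32_t payload;
static atomic_uint ready;       /* statically initialised to 0 */

static void producer(void)
{
        payload = 42;
        atomic_store_explicit(&ready, 1, memory_order_release);
}

static void consumer(void)
{
        while (atomic_load_explicit(&ready, memory_order_acquire) == 0)
                ;               /* spin */
        /* payload is now guaranteed to be visible as 42 */
}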

The patch summary contained a brief description of what I wanted to achieve
with the patch. What more do you want, a Google Docs design document?

-- Ola

On 4 November 2014 15:22, Savolainen, Petri (NSN - FI/Espoo) <
petri.savolainen@nsn.com> wrote:

>  There are many things I’d change in this patch. I think it’s better to
> take a step back and talk about what you are trying to achieve here, and then
> correct those step by step. E.g. the whole idea of acquire / release does
> not fit well on far atomics, and far atomics is the thing I’d abstract from
> applications with this API. Other synchronization primitives (such as
> locks) would not be implemented (too often) by applications, so it’s not
> very productive to abstract that (implementation of locks). E.g. Linux
> kernel atomics.h looks pretty much like the odp_atomic.h.
>
>
>
> -Petri
>
>
>
>
>
> *From:* lng-odp-bounces@lists.linaro.org [mailto:
> lng-odp-bounces@lists.linaro.org] *On Behalf Of *ext Ola Liljedahl
> *Sent:* Tuesday, November 04, 2014 3:49 PM
> *To:* lng-odp@lists.linaro.org
> *Subject:* Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11 memory
> model
>
>
>
> Ping!
>
>
>
> I really need this new working atomics support merged ASAP because I have
> a new lock-less implementation of the timer API which uses atomic
> operations. I haven't seen any real criticism against the content of the
> patch so there is nothing to change.
>
>
>
> -- Ola
>
>
>
>
>
> On 20 October 2014 15:07, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>
> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
> ---
> Added header file odp_counter.h with support for 32- and 64-bit atomic
> counters
> using relaxed memory order. 6 operations
> (init/read/write/add/read_inc/inc) on
> 32-bit and 64-bit counters respectively.
>
> Renamed odp_atomic_test to odp_counter_test and changed to use
> odp_counter.h
>
> Implementation of C11-based memory model for atomic operations. 10
> operations
> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
> in
> odp_atomic.h. The required memory ordering is now a parameter to each call
> just
> like in C11.
>
> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
> fall back to GCC __sync builtins which often include unnecessarily heavy
> barrier/sync operations (always sequentially consistent).
>
> Attempt to remove all explicit memory barriers (odp_sync_stores) from code
> that
> implements multithreaded synchronization primitives (e.g. locks, barriers).
> Rewrote such primitives to use the new atomic operations.
>
> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
> enqueue/dequeue
> (missing release barrier, had only compiler barrier).
>
>  .gitignore                                         |   2 +-
>  example/generator/odp_generator.c                  |  43 +-
>  example/ipsec/odp_ipsec.c                          |   2 +-
>  example/odp_example/odp_example.c                  |   2 +-
>  example/timer/odp_timer_test.c                     |   2 +-
>  helper/include/odph_ring.h                         |   8 +-
>  platform/linux-generic/include/api/odp.h           |   1 +
>  platform/linux-generic/include/api/odp_atomic.h    | 838
> +++++++++++----------
>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>  platform/linux-generic/odp_barrier.c               |  49 +-
>  platform/linux-generic/odp_buffer.c                |   3 +-
>  platform/linux-generic/odp_crypto.c                |   7 +-
>  platform/linux-generic/odp_queue.c                 |   7 +-
>  platform/linux-generic/odp_ring.c                  |  94 +--
>  platform/linux-generic/odp_rwlock.c                |  62 +-
>  platform/linux-generic/odp_thread.c                |   9 +-
>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>  platform/linux-generic/odp_timer.c                 |  22 +-
>  test/api_test/Makefile.am                          |   6 +-
>  test/api_test/odp_atomic_test.c                    | 362 ---------
>  test/api_test/odp_atomic_test.h                    |  60 --
>  test/api_test/odp_common.c                         |   1 -
>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>  delete mode 100644 test/api_test/odp_atomic_test.c
>  delete mode 100644 test/api_test/odp_atomic_test.h
>  create mode 100644 test/api_test/odp_counter_test.c
>
> diff --git a/.gitignore b/.gitignore
> index 6342e34..77db4d6 100644
> --- a/.gitignore
> +++ b/.gitignore
> @@ -35,7 +35,7 @@ build/
>  odp_example
>  odp_packet
>  odp_packet_netmap
> -odp_atomic
> +odp_counter
>  odp_shm
>  odp_ring
>  odp_timer_ping
> diff --git a/example/generator/odp_generator.c
> b/example/generator/odp_generator.c
> index eb8b340..252157d 100644
> --- a/example/generator/odp_generator.c
> +++ b/example/generator/odp_generator.c
> @@ -62,10 +62,10 @@ typedef struct {
>   * counters
>  */
>  static struct {
> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
> -       odp_atomic_u64_t ip;    /**< ip packets */
> -       odp_atomic_u64_t udp;   /**< udp packets */
> -       odp_atomic_u64_t icmp;  /**< icmp packets */
> +       odp_counter64_t seq;    /**< ip seq to be send */
> +       odp_counter64_t ip;     /**< ip packets */
> +       odp_counter64_t udp;    /**< udp packets */
> +       odp_counter64_t icmp;   /**< icmp packets */
>  } counters;
>
>  /** * Thread specific arguments
> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_UDPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_UDP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_ICMPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_ICMP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>                 }
>
>                 if (args->appl.interval != 0) {
> +                       uint64_t seq = odp_counter64_read(&counters.seq);
>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
> -                              thr, counters.seq, counters.seq%0xffff);
> +                              thr, seq, seq%0xffff);
>                         /* TODO use odp timer */
>                         usleep(args->appl.interval * 1000);
>                 }
> -               if (args->appl.number != -1 && counters.seq
> -                   >= (unsigned int)args->appl.number) {
> +               if (args->appl.number != -1 &&
> +                   odp_counter64_read(&counters.seq) >=
> +                   (unsigned int)args->appl.number) {
>                         break;
>                 }
>         }
> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>         /* receive number of reply pks until timeout */
>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>                 while (args->appl.timeout >= 0) {
> -                       if (counters.icmp >= (unsigned
> int)args->appl.number)
> +                       if (odp_counter64_read(&counters.icmp) >=
> +                           (unsigned int)args->appl.number)
>                                 break;
>                         /* TODO use odp timer */
>                         sleep(1);
> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>
>         /* print info */
>         if (args->appl.mode == APPL_MODE_UDP) {
> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
> +               printf("  [%02i] total send: %ju\n", thr,
> +                      odp_counter64_read(&counters.seq));
>         } else if (args->appl.mode == APPL_MODE_PING) {
>                 printf("  [%02i] total send: %ju total receive: %ju\n",
> -                      thr, counters.seq, counters.icmp);
> +                      thr, odp_counter64_read(&counters.seq),
> +                      odp_counter64_read(&counters.icmp));
>         }
>         return arg;
>  }
> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                 if (!odp_packet_inflag_ipv4(pkt))
>                         continue;
>
> -               odp_atomic_inc_u64(&counters.ip);
> +               odp_counter64_inc(&counters.ip);
>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>
>                 /* udp */
>                 if (ip->proto == ODPH_IPPROTO_UDP) {
> -                       odp_atomic_inc_u64(&counters.udp);
> +                       odp_counter64_inc(&counters.udp);
>                         udp = (odph_udphdr_t *)(buf + offset);
>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>                                         odp_be_to_cpu_16(udp->length) -
> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                         icmp = (odph_icmphdr_t *)(buf + offset);
>                         /* echo reply */
>                         if (icmp->type == ICMP_ECHOREPLY) {
> -                               odp_atomic_inc_u64(&counters.icmp);
> +                               odp_counter64_inc(&counters.icmp);
>                                 memcpy(&tvsend, buf + offset +
> ODPH_ICMPHDR_LEN,
>                                        sizeof(struct timeval));
>                                 /* TODO This should be changed to use an
> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>         }
>
>         /* init counters */
> -       odp_atomic_init_u64(&counters.seq);
> -       odp_atomic_init_u64(&counters.ip);
> -       odp_atomic_init_u64(&counters.udp);
> -       odp_atomic_init_u64(&counters.icmp);
> +       odp_counter64_init(&counters.seq, 0);
> +       odp_counter64_init(&counters.ip, 0);
> +       odp_counter64_init(&counters.udp, 0);
> +       odp_counter64_init(&counters.icmp, 0);
>
>         /* Reserve memory for args from shared mem */
>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
> index 2f2dc19..76c27d0 100644
> --- a/example/ipsec/odp_ipsec.c
> +++ b/example/ipsec/odp_ipsec.c
> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>         printf("Num worker threads: %i\n", num_workers);
>
>         /* Create a barrier to synchronize thread startup */
> -       odp_barrier_init_count(&sync_barrier, num_workers);
> +       odp_barrier_init(&sync_barrier, num_workers);
>
>         /*
>          * By default core #0 runs Linux kernel background tasks.
> diff --git a/example/odp_example/odp_example.c
> b/example/odp_example/odp_example.c
> index 0e9aa3d..c473395 100644
> --- a/example/odp_example/odp_example.c
> +++ b/example/odp_example/odp_example.c
> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>         odp_shm_print_all();
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&globals->barrier, num_workers);
> +       odp_barrier_init(&globals->barrier, num_workers);
>
>         if (args.proc_mode) {
>                 int ret;
> diff --git a/example/timer/odp_timer_test.c
> b/example/timer/odp_timer_test.c
> index 78b2ae2..dfbeae9 100644
> --- a/example/timer/odp_timer_test.c
> +++ b/example/timer/odp_timer_test.c
> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>         printf("\n");
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&test_barrier, num_workers);
> +       odp_barrier_init(&test_barrier, num_workers);
>
>         /* Create and launch worker threads */
>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
> index 76c1db8..5e78b34 100644
> --- a/helper/include/odph_ring.h
> +++ b/helper/include/odph_ring.h
> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>                 uint32_t sp_enqueue;     /* True, if single producer. */
>                 uint32_t size;           /* Size of ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Producer head. */
> -               uint32_t tail;          /* Producer tail. */
> +               odp_atomic32_t head;    /* Producer head. */
> +               odp_atomic32_t tail;    /* Producer tail. */
>         } prod ODP_ALIGNED_CACHE;
>
>         /** @private Consumer */
> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>                 uint32_t size;           /* Size of the ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Consumer head. */
> -               uint32_t tail;          /* Consumer tail. */
> +               odp_atomic32_t head;    /* Consumer head. */
> +               odp_atomic32_t tail;    /* Consumer tail. */
>         } cons ODP_ALIGNED_CACHE;
>
>         /** @private Memory space of ring starts here. */
> diff --git a/platform/linux-generic/include/api/odp.h
> b/platform/linux-generic/include/api/odp.h
> index 0ee3faf..d124d52 100644
> --- a/platform/linux-generic/include/api/odp.h
> +++ b/platform/linux-generic/include/api/odp.h
> @@ -32,6 +32,7 @@ extern "C" {
>  #include <odp_barrier.h>
>  #include <odp_spinlock.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>
>  #include <odp_init.h>
>  #include <odp_system_info.h>
> diff --git a/platform/linux-generic/include/api/odp_atomic.h
> b/platform/linux-generic/include/api/odp_atomic.h
>
> index 0cc4cf4..ccaad02 100644
>
> --- a/platform/linux-generic/include/api/odp_atomic.h
> +++ b/platform/linux-generic/include/api/odp_atomic.h
> @@ -4,464 +4,494 @@
>   * SPDX-License-Identifier:     BSD-3-Clause
>   */
>
> -
>  /**
>   * @file
>   *
> - * ODP atomic operations
> + * ODP atomic types and operations, semantically a subset of C11 atomics.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Atomic functions must be used to operate on atomic variables!
>   */
>
>  #ifndef ODP_ATOMIC_H_
>  #define ODP_ATOMIC_H_
>
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +#include <odp_debug.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
>
> -
> -#include <odp_std_types.h>
> -
> -
> -/**
> - * Atomic integer
> - */
> -typedef volatile int32_t odp_atomic_int_t;
> -
> -/**
> - * Atomic unsigned integer 64 bits
> - */
> -typedef volatile uint64_t odp_atomic_u64_t;
> -
> -/**
> - * Atomic unsigned integer 32 bits
> - */
> -typedef volatile uint32_t odp_atomic_u32_t;
> -
> -
> -/**
> - * Initialize atomic integer
> - *
> - * @param ptr    An integer atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic integer
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic integer value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic integer
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
> new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic integer
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic integer
> - *
> - * @param ptr    An atomic integer variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement atomic integer by 1
> - *
> - * @param ptr    An atomic int variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Initialize atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic uint32 value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic uint32
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
> -                                       uint32_t new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be sub to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
>  /**
> - * Fetch and increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __OCTEON__
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       uint32_t ret;
> -
> -       __asm__ __volatile__ ("syncws");
> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
> -                             "r" (ptr));
> -
> -       return ret;
> -}
> -
> + * 32-bit (unsigned) atomic type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the atomic variable */
> +} odp_atomic32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignement! */
> +
> +typedef enum {
> +       /** Relaxed memory order, no ordering of other accesses enforced */
> +       ODP_MEMORDER_RLX,
> +       /** Acquire memory order, later accesses cannot move before
> +        * acquire operation */
> +       ODP_MEMORDER_ACQ,
> +       /** Release memory order, earlier accesses cannot move after
> +        * release operation */
> +       ODP_MEMORDER_RLS
> +} odp_memorder_t;
> +
>
> +/*****************************************************************************
> + * Just some private helpers
>
> +*****************************************************************************/
> +
> +#ifdef __OCTEON__
> +/* OCTEON Write Memory Barrier */
> +#define COMPILER_HW_BARRIER() __asm __volatile( \
> +       /* Double syncw to work around errata */ \
> +       "syncw\n\tsyncw" : : : )
>  #else
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> +/** Compiler and hardware full memory barrier */
> +#define COMPILER_HW_BARRIER() __sync_synchronize()
> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a
> */
>  #endif
>
> -/**
> - * Increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Atomic compare and set for 32bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> +#define MEMORY "memory"
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomics
> + * odp_atomic32_init - no return value
> + * odp_atomic32_load - return current value
> + * odp_atomic32_store - no return value
> + * odp_atomic32_cmp_xchg_weak - return bool
> + * odp_atomic32_fetch_add - return old value
> + * odp_atomic32_add - no return value
> + * odp_atomic32_fetch_inc - return old value
> + * odp_atomic32_inc - no return value
> + * odp_atomic32_fetch_dec - return old value
> + * odp_atomic32_dec - no return value
> +
> *****************************************************************************/
> +
> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable,
> thus we
> +        * can avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic load of 32-bit atomic variable
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the load
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               uint32_t val;
> +               /* Read of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) read
> variable,
> +                * thus we can avoid using compiler memory barriers */
> +               val = *(__volatile const uint32_t *)&ptr->v;
> +               return val;
> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
> +#if defined __aarch64__
> +               uint32_t val;
> +               __asm __volatile("ldar %w0, [%1]"
> +                               : "=&r"(val)
> +                               : "r"(&ptr->v)
> +                               : MEMORY);
> +               return val;
> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
> +               /* Read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               /* Herb Sutter claims HW barrier not needed on x86? */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#else
> +#warning odp_atomic32_load() may not be efficiently implemented
> +               /* Assume read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic store to 32-bit atomic variable
> + *
> + * @param ptr  Pointer to a 32-bit atomic variable
> + * @param val  Value to write to the atomic variable
> + * @param memmodel Memory model associated with the store
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
> + */
> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Write of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) write
> variable,
> +                * thus we will avoid using compiler memory barriers */
> +               *(__volatile uint32_t *)&ptr->v = val;
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
> +               /* Compiler and HW barrier to prevent earlier accesses from
> +                * moving down */
> +               COMPILER_HW_BARRIER();
> +               /* Write of aligned word is atomic */
> +               ptr->v = val;
> +               /* Compiler and HW barrier to prevent this store from
> moving
> +                * down after a later load-acquire and thus create
> overlapping
> +                * critical sections. Herb Sutter thinks this is needed */
> +               COMPILER_HW_BARRIER();
> +#elif defined __aarch64__
> +               __asm __volatile("stlr %w0, [%1]"
> +                               :
> +                               : "r"(val), "r"(&ptr->v)
> +                               : MEMORY);
> +#elif defined __x86_64__
> +               /* This is actually an atomic exchange operation */
> +               /* Generates good code on x86_64 */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#else
> +#warning odp_atomic32_store_rls() may not be efficiently implemented
> +               /* This is actually an atomic exchange operation */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +
> +/**
> + * Atomic compare and exchange (swap) of 32-bit atomic variable
> + * "Weak" semantics, may fail spuriously and must be used in a loop.
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param exp_p Pointer to expected value (updated on failure)
> + * @param val   New value to write
> + * @param       memmodel Memory model associated with the compare-and-swap
> + * operation (ODP_MEMORDER_RLX only)
> + *
> + * @return 1 (true) if exchange successful, 0 (false) if not successful
> (and
> + * '*exp_p' updated with current value)
> + */
> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
> +               uint32_t *exp_p,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               int status;
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "cmp   %0, %3\t\n"
> +                                "bne   1f\t\n"
> +                                "strex %1, %4, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "=&r"(status)
> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Clear exclusive access monitor */
> +                       __asm __volatile("clrex");
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* strex returns 0 on success */
> +               if (odp_unlikely(status != 0)) {
> +                       /* strex failed, reservation was disturbed */
> +                       /* Return potentially changed value */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __mips64__
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               uint32_t status = val;
> +               __asm __volatile("llw %0, [%2]\t\n"
> +                                "bne %0, %3, 1f\t\n"
> +                                "scw %1, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "+&r"(status)
> +                               : "r"(&ptr->v), "r"(exp)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* scw returns 1 on success, 0 on failure */
> +               if (odp_unlikely(status == 0)) {
> +                       /* scw failed, reservation was disturbed */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __x86_64__
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#else
> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic fetch and add to 32-bit atomic variable
> + * @note A - B <=> A + (-B)
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param memmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
> + *
> + * @return Value of the atomic variable before the addition
> + */
> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old_val, tmp;
> +               int status;
> +               do {
> +                       __asm __volatile("ldrex %0, [%3]\t\n"
> +                                        "add   %1, %0, %4\t\n"
> +                                        "strex %2, %1, [%3]\t\n"
>
> +                                       : "=&r"(old_val), "=&r"(tmp),
>
> +                                         "=&r"(status)
> +                                       : "r"(&ptr->v), "r"(incr)
> +                                       : MEMORY);
> +               } while (odp_unlikely(status != 0));
> +               return old_val;
> +#elif defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               return old_val;
> +#elif defined __x86_64__
> +               /* Generates good code on x86_64 */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#else
> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +               /* __sync_fetch_and_add() will give us barriers before and
> +                * after, we are fine with this for release operations */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
>  }
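>
> The @note above is the reason there is no separate subtract operation: as an
> illustrative sketch (not part of the patch), a relaxed "fetch and sub" can be
> expressed through the same call by adding the two's complement of the
> decrement:
>
>     static inline uint32_t atomic32_fetch_sub_rlx(odp_atomic32_t *ptr,
>                                                    uint32_t decr)
>     {
>             /* Unsigned negation wraps, so adding -decr subtracts decr */
>             return odp_atomic32_fetch_add(ptr, (uint32_t)-decr,
>                                           ODP_MEMORDER_RLX);
>     }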
>
>  /**
> - * Initialize atomic uint64
> + * Atomic add to 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param memmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
>  {
> -       *ptr = 0;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               return;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>  }
>
>  /**
> - * Load value of atomic uint64
> - *
> - * @param ptr    An atomic variable
> + * Atomic fetch and increment of 32-bit atomic variable
>   *
> - * @return atomic uint64 value
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @note The operation is not synchronized with other threads
> + * @return Value of the atomic variable before the increment
>   */
> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return *ptr;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic increment instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>  }
>
>  /**
> - * Store value to atomic uint64
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> + * Atomic increment of 32-bit atomic variable
>   *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
> -                                       uint64_t new_value)
> -{
> -       *ptr = new_value;
> -}
> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>
> -/**
> - * Add atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - */
> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
>  {
> -       __sync_fetch_and_add(ptr, value);
> +       /* Default to using odp_atomic32_fetch_inc() */
> +       /* Platforms that support atomic increment instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>  }
>
>  /**
> - * Fetch and add atomic uint64
> + * Atomic fetch and decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @return Value of the variable before the operation
> + * @return Value of the atomic variable before the decrement
>   */
> -
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic decrement instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
>  #endif
> -/**
> - * Subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - */
> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
> -{
> -       __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
>  #endif
> -/**
> - * Fetch and increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u64(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       odp_atomic_fetch_add_u64(ptr, 1);
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>  }
>
>  /**
> - * Fetch and decrement atomic uint64 by 1
> + * Atomic decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u64(ptr, 1);
> -}
> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t memorder)
>
> -/**
> - * Decrement atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>  {
> -       odp_atomic_fetch_sub_u64(ptr, 1);
> +       /* Default to using odp_atomic32_fetch_dec() */
> +       /* Platforms that support atomic decrement instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>  }
>
> -/**
> - * Atomic compare and set for 64bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> -}
> +/* We are not exporting these macros */
> +#undef COMPILER_HW_BARRIER
> +#undef MEMORY
>
>  #ifdef __cplusplus
>  }
> diff --git a/platform/linux-generic/include/api/odp_barrier.h
> b/platform/linux-generic/include/api/odp_barrier.h
> index a7b3215..69b1eb8 100644
> --- a/platform/linux-generic/include/api/odp_barrier.h
> +++ b/platform/linux-generic/include/api/odp_barrier.h
> @@ -27,18 +27,18 @@ extern "C" {
>   * ODP execution barrier
>   */
>  typedef struct odp_barrier_t {
> -       int              count;  /**< @private Thread count */
> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
> +       uint32_t       num_threads;  /**< @private Thread count (constant)
> */
> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>  } odp_barrier_t;
>
>
>  /**
>   * Init barrier with thread count
>   *
> - * @param barrier    Barrier
> - * @param count      Thread count
> + * @param barrier     Barrier
> + * @param num_threads Number of threads which share the barrier
>   */
> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
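>
> As an illustrative sketch (not part of the patch), the renamed init function
> pairs with odp_barrier_sync() roughly as follows; the exact signature of the
> sync call is assumed here rather than taken from this hunk:
>
>     #define NUM_WORKERS 4u
>     static odp_barrier_t start_barrier;
>
>     static void *worker(void *arg)
>     {
>             (void)arg;
>             /* All NUM_WORKERS threads rendezvous here before proceeding */
>             odp_barrier_sync(&start_barrier);
>             /* ... actual work ... */
>             return NULL;
>     }
>
>     /* Called once, before the worker threads are created */
>     void setup(void)
>     {
>             odp_barrier_init(&start_barrier, NUM_WORKERS);
>     }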
>
>
>  /**
> diff --git a/platform/linux-generic/include/api/odp_counter.h
> b/platform/linux-generic/include/api/odp_counter.h
> new file mode 100644
>
> index 0000000..f937d27
>
> --- /dev/null
> +++ b/platform/linux-generic/include/api/odp_counter.h
> @@ -0,0 +1,363 @@
> +/* Copyright (c) 2013, Linaro Limited
> + * All rights reserved.
> + *
> + * SPDX-License-Identifier:     BSD-3-Clause
> + */
> +
> +/**
> + * @file
> + *
> + * ODP atomic counter types and operations, suitable for e.g. shared
> statistics.
> + * Relaxed memory model assumed for lowest overhead.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Counter functions must be used to operate on counter variables!
> + */
> +
> +#ifndef ODP_COUNTER_H_
> +#define ODP_COUNTER_H_
> +
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * 32-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the counter variable */
> +} odp_counter32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
> +
> +/**
> + * 64-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint64_t v; /**< Actual storage for the counter variable */
> +       /* Room for other data structures (e.g. spin lock) that might be
> +        * needed to ensure atomicity on some architectures */
> +} odp_counter64_t
> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomic counters
> + * odp_counter32_init - returns no value
> + * odp_counter32_read - returns current value
> + * odp_counter32_write - returns no value
> + * odp_counter32_add - returns no value
> + * odp_counter32_read_inc - returns old value
> + * odp_counter32_inc - returns no value
> +
> *****************************************************************************/
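>
> As an illustrative usage sketch (not part of the patch), shared statistics
> updated concurrently by many threads would look roughly like this:
>
>     static odp_counter32_t rx_pkts;
>
>     void stats_init(void)
>     {
>             odp_counter32_init(&rx_pkts, 0);
>     }
>
>     void on_packet(void)
>     {
>             odp_counter32_inc(&rx_pkts);   /* relaxed, no ordering implied */
>     }
>
>     uint32_t stats_snapshot(void)
>     {
>             /* May lag concurrent increments, which is fine for statistics */
>             return odp_counter32_read(&rx_pkts);
>     }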
> +
> +/**
> + * Initialize 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Read 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
> +{
> +       uint32_t val;
> +       /* Read of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) read variable, thus we
> +        * will avoid using compiler memory barriers */
> +       val = *(__volatile const uint32_t *)&ptr->v;
> +       return val;
> +}
> +
> +/**
> + * Write 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Value to write to the variable
> + */
> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable, thus we
> +        * will avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic add to 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "add   %0, %0, %3\t\n"
> +                                "strex %1, %0, [%2]"
> +                                : "=&r"(result), "=&r"(status)
> +                                : "r"(&ptr->v), "Ir" (incr)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +#elif defined __OCTEON__
> +       __asm __volatile("saa %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +       /* Warning odp_counter32_add() may not be efficiently implemented
> */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) of 32-bit counter variable, return original value
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%3]\t\n"
> +                                "add   %1, %0, #1\t\n"
> +                                "strex %2, %1, [%3]"
> +                                : "=&r"(result), "=&r"(tmp), "=&r"(status)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +       return result;
> +#elif defined __OCTEON__
> +       uint32_t old_val;
> +       __asm __volatile("lai %0,(%2)"
> +                        : "=r" (old_val), "+m" (ptr)
> +                        : "r" (ptr)
> +                        : );
> +       return old_val;
> +#elif defined __x86_64__
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter32_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + */
> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
> +{
> +#if defined __OCTEON__
> +       odp_counter32_add(ptr, 1);
> +#else
> +       (void)odp_counter32_read_inc(ptr);
> +#endif
> +}
> +
>
> +/*****************************************************************************
> + * Operations on 64-bit atomic counters
> + * odp_counter64_init
> + * odp_counter64_read
> + * odp_counter64_write
> + * odp_counter64_add
> + * odp_counter64_read_inc
> + * odp_counter64_inc
> +
> *****************************************************************************/
> +
> +/**
> + * Read 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Value of the counter variable
> + */
> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t val;
> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
> +                        "clrex" /* Clear exclusive access monitor */
> +                        : "=&r"(val)
> +                        : "r"(&ptr->v)
> +                        : );
> +       return val;
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Read of aligned quad/double word is atomic */
> +       return ptr->v;
> +#else
> +/* Warning odp_counter64_read() may not be efficiently implemented */
> +       return __sync_fetch_and_or(&ptr->v, 0);
> +#endif
> +}
> +
> +/**
> + * Write 64-bit counter variable
> + *
> + * @param ptr  Pointer to a 64-bit counter variable
> + * @param val  Value to write to the counter variable
> + */
> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t val)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               /* Read counter variable exclusively so we can write to it
> +                * later */
> +               /* Attempt to write the new value */
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "strexd %1, %3, %H3, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(val)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Write of aligned quad/double word is atomic */
> +       ptr->v = val;
> +#else
> +/* Warning odp_counter64_write() may not be efficiently implemented */
> +       /* This is actually an atomic exchange operation */
> +       (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +}
> +
> +/**
> + * Initialize 64-bit counter variable
> + * Perform implementation specific initializations, assign initial value.
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       odp_counter64_write(ptr, val);
> +}
> +
> +/**
> + * Atomic add to 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, %0, %3\t\n"
> +                                "adc    %H0, %H3\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(incr)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __OCTEON__
> +       __asm __volatile("saad %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +/* Warning odp_counter64_add() may not be efficiently implemented */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable and return original value
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
> +                                "adds   %2, %0, #1\t\n"
> +                                "adc    %H2, %H0, #0\t\n"
> +                                "strexd %1, %2, %H2, [%3]"
> +                                : "=&r"(old_val), "=&r"(status),
> "=&r"(tmp)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +       return old_val;
> +#elif defined __OCTEON__
> +       uint64_t old_val;
> +       __asm __volatile("laid %0,(%2)"
> +                       : "=r" (old_val), "+m" (ptr)
> +                       : "r" (ptr)
> +                       : );
> +       return old_val;
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter64_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + */
> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, #1\t\n"
> +                                "adc    %H0, #0\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#else
> +       (void)odp_counter64_read_inc(ptr);
> +#endif
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
> b/platform/linux-generic/include/api/odp_rwlock.h
> index 252ebb2..ff8a9a2 100644
> --- a/platform/linux-generic/include/api/odp_rwlock.h
> +++ b/platform/linux-generic/include/api/odp_rwlock.h
> @@ -10,26 +10,30 @@
>  /**
>   * @file
>   *
> - * ODP RW Locks
> + * ODP read/write lock
> + * RW lock support mu
> ...
>
> [Message clipped]
Anders Roxell Nov. 4, 2014, 3:03 p.m. UTC | #7
As Petri wrote in his first email, this patch should be broken up into
multiple patches...

Cheers,
Anders
On 4 Nov 2014 15:34, "Ola Liljedahl" <ola.liljedahl@linaro.org> wrote:

> Possibly odp_atomic.h should then be internal, leaving odp_counter.h as the
> only public API. The original odp_atomic.h is public so I left it that way.
>
> The counter API does not allow the user to specify any memory ordering,
> relaxed memory order is expected, i.e. no ordering is guaranteed.
>
> Why does acquire/release not fit well with the far atomics? And what do
> you mean specifically with "far atomics"? Just the counter updates like
> Cavium has?
>
> As the Linux kernel atomics interface predates C11/C++11 atomics support, I
> do not see it as a model to follow.
>
> The patch summary contained a brief description of what I wanted to
> achieve with the patch. What do you want more, a Google Docs design
> document?
>
> -- Ola
>
> On 4 November 2014 15:22, Savolainen, Petri (NSN - FI/Espoo) <
> petri.savolainen@nsn.com> wrote:
>
>>  There are many things I’d change in this patch. I think it’s better to
>> take a step back and talk about what you are trying to achieve here, and
>> then correct them step by step. E.g. the whole idea of acquire/release does
>> not fit well with far atomics, and far atomics is the thing I'd abstract from
>> applications with this API. Other synchronization primitives (such as
>> locks) would not be implemented (too often) by applications, so it’s not
>> very productive to abstract that (implementation of locks). E.g. Linux
>> kernel atomics.h looks pretty much like the odp_atomic.h.
>>
>>
>>
>> -Petri
>>
>>
>>
>>
>>
>> From: lng-odp-bounces@lists.linaro.org [mailto:lng-odp-bounces@lists.linaro.org]
>> On Behalf Of ext Ola Liljedahl
>> Sent: Tuesday, November 04, 2014 3:49 PM
>> To: lng-odp@lists.linaro.org
>> Subject: Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11 memory model
>>
>>
>>
>> Ping!
>>
>>
>>
>> I really need this new working atomics support merged ASAP because I have
>> a new lock-less implementation of the timer API which uses atomic
>> operations. I haven't seen any real criticism against the content of the
>> patch so there is nothing to change.
>>
>>
>>
>> -- Ola
>>
>>
>>
>>
>>
>> On 20 October 2014 15:07, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>>
>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>> ---
>> Added header file odp_counter.h with support for 32- and 64-bit atomic
>> counters
>> using relaxed memory order. 6 operations
>> (init/read/write/add/read_inc/inc) on
>> 32-bit and 64-bit counters respectively.
>>
>> Renamed odp_atomic_test to odp_counter_test and changed to use
>> odp_counter.h
>>
>> Implementation of C11-based memory model for atomic operations. 10
>> operations
>> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
>> in
>> odp_atomic.h. The required memory ordering is now a parameter to each
>> call just
>> like in C11.
>>
>> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
>> fall back to GCC __sync builtins which often include unnecessarily heavy
>> barrier/sync operations (always sequentially consistent).
>>
>> Attempt to remove all explicit memory barriers (odp_sync_stores) from
>> code that
>> implements multithreaded synchronization primitives (e.g. locks,
>> barriers).
>> Rewrote such primitives to use the new atomic operations.
>>
>> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
>> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
>> enqueue/dequeue
>> (missing release barrier, had only compiler barrier).
>>
>>  .gitignore                                         |   2 +-
>>  example/generator/odp_generator.c                  |  43 +-
>>  example/ipsec/odp_ipsec.c                          |   2 +-
>>  example/odp_example/odp_example.c                  |   2 +-
>>  example/timer/odp_timer_test.c                     |   2 +-
>>  helper/include/odph_ring.h                         |   8 +-
>>  platform/linux-generic/include/api/odp.h           |   1 +
>>  platform/linux-generic/include/api/odp_atomic.h    | 838
>> +++++++++++----------
>>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>>  platform/linux-generic/odp_barrier.c               |  49 +-
>>  platform/linux-generic/odp_buffer.c                |   3 +-
>>  platform/linux-generic/odp_crypto.c                |   7 +-
>>  platform/linux-generic/odp_queue.c                 |   7 +-
>>  platform/linux-generic/odp_ring.c                  |  94 +--
>>  platform/linux-generic/odp_rwlock.c                |  62 +-
>>  platform/linux-generic/odp_thread.c                |   9 +-
>>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>>  platform/linux-generic/odp_timer.c                 |  22 +-
>>  test/api_test/Makefile.am                          |   6 +-
>>  test/api_test/odp_atomic_test.c                    | 362 ---------
>>  test/api_test/odp_atomic_test.h                    |  60 --
>>  test/api_test/odp_common.c                         |   1 -
>>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>>  delete mode 100644 test/api_test/odp_atomic_test.c
>>  delete mode 100644 test/api_test/odp_atomic_test.h
>>  create mode 100644 test/api_test/odp_counter_test.c
>>
>> diff --git a/.gitignore b/.gitignore
>> index 6342e34..77db4d6 100644
>> --- a/.gitignore
>> +++ b/.gitignore
>> @@ -35,7 +35,7 @@ build/
>>  odp_example
>>  odp_packet
>>  odp_packet_netmap
>> -odp_atomic
>> +odp_counter
>>  odp_shm
>>  odp_ring
>>  odp_timer_ping
>> diff --git a/example/generator/odp_generator.c
>> b/example/generator/odp_generator.c
>> index eb8b340..252157d 100644
>> --- a/example/generator/odp_generator.c
>> +++ b/example/generator/odp_generator.c
>> @@ -62,10 +62,10 @@ typedef struct {
>>   * counters
>>  */
>>  static struct {
>> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
>> -       odp_atomic_u64_t ip;    /**< ip packets */
>> -       odp_atomic_u64_t udp;   /**< udp packets */
>> -       odp_atomic_u64_t icmp;  /**< icmp packets */
>> +       odp_counter64_t seq;    /**< ip seq to be send */
>> +       odp_counter64_t ip;     /**< ip packets */
>> +       odp_counter64_t udp;    /**< udp packets */
>> +       odp_counter64_t icmp;   /**< icmp packets */
>>  } counters;
>>
>>  /** * Thread specific arguments
>> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>> ODPH_UDPHDR_LEN +
>>                                        ODPH_IPV4HDR_LEN);
>>         ip->proto = ODPH_IPPROTO_UDP;
>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>>         ip->id = odp_cpu_to_be_16(seq);
>>         ip->chksum = 0;
>>         odph_ipv4_csum_update(pkt);
>> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>> ODPH_ICMPHDR_LEN +
>>                                        ODPH_IPV4HDR_LEN);
>>         ip->proto = ODPH_IPPROTO_ICMP;
>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>>         ip->id = odp_cpu_to_be_16(seq);
>>         ip->chksum = 0;
>>         odph_ipv4_csum_update(pkt);
>> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>>                 }
>>
>>                 if (args->appl.interval != 0) {
>> +                       uint64_t seq = odp_counter64_read(&counters.seq);
>>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
>> -                              thr, counters.seq, counters.seq%0xffff);
>> +                              thr, seq, seq%0xffff);
>>                         /* TODO use odp timer */
>>                         usleep(args->appl.interval * 1000);
>>                 }
>> -               if (args->appl.number != -1 && counters.seq
>> -                   >= (unsigned int)args->appl.number) {
>> +               if (args->appl.number != -1 &&
>> +                   odp_counter64_read(&counters.seq) >=
>> +                   (unsigned int)args->appl.number) {
>>                         break;
>>                 }
>>         }
>> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>>         /* receive number of reply pks until timeout */
>>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>>                 while (args->appl.timeout >= 0) {
>> -                       if (counters.icmp >= (unsigned
>> int)args->appl.number)
>> +                       if (odp_counter64_read(&counters.icmp) >=
>> +                           (unsigned int)args->appl.number)
>>                                 break;
>>                         /* TODO use odp timer */
>>                         sleep(1);
>> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>>
>>         /* print info */
>>         if (args->appl.mode == APPL_MODE_UDP) {
>> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
>> +               printf("  [%02i] total send: %ju\n", thr,
>> +                      odp_counter64_read(&counters.seq));
>>         } else if (args->appl.mode == APPL_MODE_PING) {
>>                 printf("  [%02i] total send: %ju total receive: %ju\n",
>> -                      thr, counters.seq, counters.icmp);
>> +                      thr, odp_counter64_read(&counters.seq),
>> +                      odp_counter64_read(&counters.icmp));
>>         }
>>         return arg;
>>  }
>> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>                 if (!odp_packet_inflag_ipv4(pkt))
>>                         continue;
>>
>> -               odp_atomic_inc_u64(&counters.ip);
>> +               odp_counter64_inc(&counters.ip);
>>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
>> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>
>>                 /* udp */
>>                 if (ip->proto == ODPH_IPPROTO_UDP) {
>> -                       odp_atomic_inc_u64(&counters.udp);
>> +                       odp_counter64_inc(&counters.udp);
>>                         udp = (odph_udphdr_t *)(buf + offset);
>>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>>                                         odp_be_to_cpu_16(udp->length) -
>> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>                         icmp = (odph_icmphdr_t *)(buf + offset);
>>                         /* echo reply */
>>                         if (icmp->type == ICMP_ECHOREPLY) {
>> -                               odp_atomic_inc_u64(&counters.icmp);
>> +                               odp_counter64_inc(&counters.icmp);
>>                                 memcpy(&tvsend, buf + offset +
>> ODPH_ICMPHDR_LEN,
>>                                        sizeof(struct timeval));
>>                                 /* TODO This should be changed to use an
>> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>>         }
>>
>>         /* init counters */
>> -       odp_atomic_init_u64(&counters.seq);
>> -       odp_atomic_init_u64(&counters.ip);
>> -       odp_atomic_init_u64(&counters.udp);
>> -       odp_atomic_init_u64(&counters.icmp);
>> +       odp_counter64_init(&counters.seq, 0);
>> +       odp_counter64_init(&counters.ip, 0);
>> +       odp_counter64_init(&counters.udp, 0);
>> +       odp_counter64_init(&counters.icmp, 0);
>>
>>         /* Reserve memory for args from shared mem */
>>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
>> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
>> index 2f2dc19..76c27d0 100644
>> --- a/example/ipsec/odp_ipsec.c
>> +++ b/example/ipsec/odp_ipsec.c
>> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>>         printf("Num worker threads: %i\n", num_workers);
>>
>>         /* Create a barrier to synchronize thread startup */
>> -       odp_barrier_init_count(&sync_barrier, num_workers);
>> +       odp_barrier_init(&sync_barrier, num_workers);
>>
>>         /*
>>          * By default core #0 runs Linux kernel background tasks.
>> diff --git a/example/odp_example/odp_example.c
>> b/example/odp_example/odp_example.c
>> index 0e9aa3d..c473395 100644
>> --- a/example/odp_example/odp_example.c
>> +++ b/example/odp_example/odp_example.c
>> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>>         odp_shm_print_all();
>>
>>         /* Barrier to sync test case execution */
>> -       odp_barrier_init_count(&globals->barrier, num_workers);
>> +       odp_barrier_init(&globals->barrier, num_workers);
>>
>>         if (args.proc_mode) {
>>                 int ret;
>> diff --git a/example/timer/odp_timer_test.c
>> b/example/timer/odp_timer_test.c
>> index 78b2ae2..dfbeae9 100644
>> --- a/example/timer/odp_timer_test.c
>> +++ b/example/timer/odp_timer_test.c
>> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>>         printf("\n");
>>
>>         /* Barrier to sync test case execution */
>> -       odp_barrier_init_count(&test_barrier, num_workers);
>> +       odp_barrier_init(&test_barrier, num_workers);
>>
>>         /* Create and launch worker threads */
>>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
>> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
>> index 76c1db8..5e78b34 100644
>> --- a/helper/include/odph_ring.h
>> +++ b/helper/include/odph_ring.h
>> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>>                 uint32_t sp_enqueue;     /* True, if single producer. */
>>                 uint32_t size;           /* Size of ring. */
>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>> -               uint32_t head;          /* Producer head. */
>> -               uint32_t tail;          /* Producer tail. */
>> +               odp_atomic32_t head;    /* Producer head. */
>> +               odp_atomic32_t tail;    /* Producer tail. */
>>         } prod ODP_ALIGNED_CACHE;
>>
>>         /** @private Consumer */
>> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>>                 uint32_t size;           /* Size of the ring. */
>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>> -               uint32_t head;          /* Consumer head. */
>> -               uint32_t tail;          /* Consumer tail. */
>> +               odp_atomic32_t head;    /* Consumer head. */
>> +               odp_atomic32_t tail;    /* Consumer tail. */
>>         } cons ODP_ALIGNED_CACHE;
>>
>>         /** @private Memory space of ring starts here. */
>> diff --git a/platform/linux-generic/include/api/odp.h
>> b/platform/linux-generic/include/api/odp.h
>> index 0ee3faf..d124d52 100644
>> --- a/platform/linux-generic/include/api/odp.h
>> +++ b/platform/linux-generic/include/api/odp.h
>> @@ -32,6 +32,7 @@ extern "C" {
>>  #include <odp_barrier.h>
>>  #include <odp_spinlock.h>
>>  #include <odp_atomic.h>
>> +#include <odp_counter.h>
>>
>>  #include <odp_init.h>
>>  #include <odp_system_info.h>
>> diff --git a/platform/linux-generic/include/api/odp_atomic.h
>> b/platform/linux-generic/include/api/odp_atomic.h
>>
>> index 0cc4cf4..ccaad02 100644
>>
>> --- a/platform/linux-generic/include/api/odp_atomic.h
>> +++ b/platform/linux-generic/include/api/odp_atomic.h
>> @@ -4,464 +4,494 @@
>>   * SPDX-License-Identifier:     BSD-3-Clause
>>   */
>>
>> -
>>  /**
>>   * @file
>>   *
>> - * ODP atomic operations
>> + * ODP atomic types and operations, semantically a subset of C11 atomics.
>> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
>> + * without using the required access functions.
>> + * Atomic functions must be used to operate on atomic variables!
>>   */
>>
>>  #ifndef ODP_ATOMIC_H_
>>  #define ODP_ATOMIC_H_
>>
>> +#include <stdint.h>
>> +#include <odp_align.h>
>> +#include <odp_hints.h>
>> +#include <odp_debug.h>
>> +
>>  #ifdef __cplusplus
>>  extern "C" {
>>  #endif
>>
>> -
>> -#include <odp_std_types.h>
>> -
>> -
>> -/**
>> - * Atomic integer
>> - */
>> -typedef volatile int32_t odp_atomic_int_t;
>> -
>> -/**
>> - * Atomic unsigned integer 64 bits
>> - */
>> -typedef volatile uint64_t odp_atomic_u64_t;
>> -
>> -/**
>> - * Atomic unsigned integer 32 bits
>> - */
>> -typedef volatile uint32_t odp_atomic_u32_t;
>> -
>> -
>> -/**
>> - * Initialize atomic integer
>> - *
>> - * @param ptr    An integer atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
>> -{
>> -       *ptr = 0;
>> -}
>> -
>> -/**
>> - * Load value of atomic integer
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return atomic integer value
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
>> -{
>> -       return *ptr;
>> -}
>> -
>> -/**
>> - * Store value to atomic integer
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
>> new_value)
>> -{
>> -       *ptr = new_value;
>> -}
>> -
>> -/**
>> - * Fetch and add atomic integer
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
>> value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and subtract atomic integer
>> - *
>> - * @param ptr    An atomic integer variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
>> value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and increment atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Increment atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Fetch and decrement atomic integer by 1
>> - *
>> - * @param ptr    An atomic int variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Decrement atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Initialize atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       *ptr = 0;
>> -}
>> -
>> -/**
>> - * Load value of atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return atomic uint32 value
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return *ptr;
>> -}
>> -
>> -/**
>> - * Store value to atomic uint32
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
>> -                                       uint32_t new_value)
>> -{
>> -       *ptr = new_value;
>> -}
>> -
>> -/**
>> - * Fetch and add atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>> -                                               uint32_t value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and subtract uint32
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be sub to the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
>> -                                               uint32_t value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> -
>>  /**
>> - * Fetch and increment atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -#if defined __OCTEON__
>> -
>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       uint32_t ret;
>> -
>> -       __asm__ __volatile__ ("syncws");
>> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
>> -                             "r" (ptr));
>> -
>> -       return ret;
>> -}
>> -
>> + * 32-bit (unsigned) atomic type
>> + */
>> +typedef struct {
>> +       uint32_t v; /**< Actual storage for the atomic variable */
>> +} odp_atomic32_t
>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignement! */
>> +
>> +typedef enum {
>> +       /** Relaxed memory order, no ordering of other accesses enforced
>> */
>> +       ODP_MEMORDER_RLX,
>> +       /** Acquire memory order, later accesses cannot move before
>> +        * acquire operation */
>> +       ODP_MEMORDER_ACQ,
>> +       /** Release memory order, earlier accesses cannot move after
>> +        * release operation */
>> +       ODP_MEMORDER_RLS
>> +} odp_memorder_t;
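>>
>> As an illustrative sketch (not part of the patch), the intended
>> acquire/release pairing with the load/store operations further down in this
>> header is the classic message-passing pattern:
>>
>>     static uint32_t data;          /* ordinary shared data */
>>     static odp_atomic32_t ready;   /* flag, initialized to 0 */
>>
>>     /* Producer: the release store keeps the data write before the flag */
>>     void produce(void)
>>     {
>>             data = 42;
>>             odp_atomic32_store(&ready, 1, ODP_MEMORDER_RLS);
>>     }
>>
>>     /* Consumer: the acquire load keeps the data read after the flag */
>>     uint32_t consume(void)
>>     {
>>             while (odp_atomic32_load(&ready, ODP_MEMORDER_ACQ) == 0)
>>                     ;  /* spin until the producer publishes */
>>             return data;
>>     }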
>> +
>>
>> +/*****************************************************************************
>> + * Just some private helpers
>>
>> +*****************************************************************************/
>> +
>> +#ifdef __OCTEON__
>> +/* OCTEON Write Memory Barrier */
>> +#define COMPILER_HW_BARRIER() __asm __volatile( \
>> +       /* Double syncw to work around errata */ \
>> +       "syncw\n\tsyncw" : : : )
>>  #else
>> -
>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_u32(ptr, 1);
>> -}
>> -
>> +/** Compiler and hardware full memory barrier */
>> +#define COMPILER_HW_BARRIER() __sync_synchronize()
>> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a
>> */
>>  #endif
>>
>> -/**
>> - * Increment atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_u32(ptr, 1);
>> -}
>> -
>> -/**
>> - * Fetch and decrement uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_u32(ptr, 1);
>> -}
>> -
>> -/**
>> - * Decrement atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_u32(ptr, 1);
>> -}
>> -
>> -/**
>> - * Atomic compare and set for 32bit
>> - *
>> - * @param dst destination location into which the value will be written.
>> - * @param exp expected value.
>> - * @param src new value.
>> - * @return Non-zero on success; 0 on failure.
>> - */
>> -static inline int
>> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
>> -{
>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>> +#define MEMORY "memory"
>> +
>>
>> +/*****************************************************************************
>> + * Operations on 32-bit atomics
>> + * odp_atomic32_init - no return value
>> + * odp_atomic32_load - return current value
>> + * odp_atomic32_store - no return value
>> + * odp_atomic32_cmp_xchg_weak - return bool
>> + * odp_atomic32_fetch_add - return old value
>> + * odp_atomic32_add - no return value
>> + * odp_atomic32_fetch_inc - return old value
>> + * odp_atomic32_inc - no return value
>> + * odp_atomic32_fetch_dec - return old value
>> + * odp_atomic32_dec - no return value
>> +
>> *****************************************************************************/
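>>
>> As an illustrative sketch (not part of the patch, and not necessarily what
>> odp_ticketlock.c does), these operations compose into a ticket lock without
>> any explicit barriers:
>>
>>     typedef struct {
>>             odp_atomic32_t next;   /* next ticket to hand out */
>>             odp_atomic32_t cur;    /* ticket currently served */
>>     } ticketlock_t;
>>
>>     void lock(ticketlock_t *lk)
>>     {
>>             uint32_t ticket = odp_atomic32_fetch_inc(&lk->next,
>>                                                      ODP_MEMORDER_RLX);
>>             /* Acquire load orders the critical section after lock entry */
>>             while (odp_atomic32_load(&lk->cur, ODP_MEMORDER_ACQ) != ticket)
>>                     ;  /* spin */
>>     }
>>
>>     void unlock(ticketlock_t *lk)
>>     {
>>             /* Release add orders the critical section before lock exit */
>>             odp_atomic32_add(&lk->cur, 1, ODP_MEMORDER_RLS);
>>     }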
>> +
>> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
>> +{
>> +       /* Write of aligned word is atomic */
>> +       /* Cast to volatile to force compiler to (re-) write variable,
>> thus we
>> +        * can avoid using compiler memory barriers */
>> +       *(__volatile uint32_t *)&ptr->v = val;
>> +}
>> +
>> +/**
>> + * Atomic load of 32-bit atomic variable
>> + *
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the load
>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
>> + *
>> + * @return Value of the variable
>> + */
>> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               uint32_t val;
>> +               /* Read of aligned word is atomic */
>> +               /* Cast to volatile to force compiler to (re-) read
>> variable,
>> +                * thus we can avoid using compiler memory barriers */
>> +               val = *(__volatile const uint32_t *)&ptr->v;
>> +               return val;
>> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
>> +#if defined __aarch64__
>> +               uint32_t val;
>> +               __asm __volatile("ldar %w0, [%1]"
>> +                               : "=&r"(val)
>> +                               : "r"(&ptr->v)
>> +                               : MEMORY);
>> +               return val;
>> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
>> +               /* Read of aligned word is atomic */
>> +               uint32_t val = ptr->v;
>> +               /* To prevent later accesses from moving up */
>> +               /* Herb Sutter claims HW barrier not needed on x86? */
>> +               COMPILER_HW_BARRIER();
>> +               return val;
>> +#else
>> +#warning odp_atomic32_load() may not be efficiently implemented
>> +               /* Assume read of aligned word is atomic */
>> +               uint32_t val = ptr->v;
>> +               /* To prevent later accesses from moving up */
>> +               COMPILER_HW_BARRIER();
>> +               return val;
>> +#endif
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>> +}
>> +
>> +/**
>> + * Atomic store to 32-bit atomic variable
>> + *
>> + * @param ptr  Pointer to a 32-bit atomic variable
>> + * @param val  Value to write to the atomic variable
>> + * @param memmodel Memory model associated with the store
>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
>> + */
>> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
>> +               uint32_t val,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Write of aligned word is atomic */
>> +               /* Cast to volatile to force compiler to (re-) write
>> variable,
>> +                * thus we will avoid using compiler memory barriers */
>> +               *(__volatile uint32_t *)&ptr->v = val;
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
>> +               /* Compiler and HW barrier to prevent earlier accesses
>> from
>> +                * moving down */
>> +               COMPILER_HW_BARRIER();
>> +               /* Write of aligned word is atomic */
>> +               ptr->v = val;
>> +               /* Compiler and HW barrier to prevent this store from
>> moving
>> +                * down after a later load-acquire and thus create
>> overlapping
>> +                * critical sections. Herb Sutter thinks this is needed */
>> +               COMPILER_HW_BARRIER();
>> +#elif defined __aarch64__
>> +               __asm __volatile("stlr %w0, [%1]"
>> +                               :
>> +                               : "r"(val), "r"(&ptr->v)
>> +                               : MEMORY);
>> +#elif defined __x86_64__
>> +               /* This is actually an atomic exchange operation */
>> +               /* Generates good code on x86_64 */
>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>> +#else
>> +#warning odp_atomic32_store_rls() may not be efficiently implemented
>> +               /* This is actually an atomic exchange operation */
>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>> +#endif
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>> +}
>> +
>> +
>> +/**
>> + * Atomic compare and exchange (swap) of 32-bit atomic variable
>> + * "Weak" semantics, may fail spuriously and must be used in a loop.
>> + *
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param exp_p Pointer to expected value (updated on failure)
>> + * @param val   New value to write
>> + * @param       memmodel Memory model associated with the
>> compare-and-swap
>> + * operation (ODP_MEMORDER_RLX only)
>> + *
>> + * @return 1 (true) if exchange successful, 0 (false) if not successful
>> (and
>> + * '*exp_p' updated with current value)
>> + */
>> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
>> +               uint32_t *exp_p,
>> +               uint32_t val,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +#if defined __arm__ /* A32/T32 ISA */
>> +               uint32_t old;
>> +               uint32_t exp = *exp_p;
>> +               int status;
>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>> +                                "cmp   %0, %3\t\n"
>> +                                "bne   1f\t\n"
>> +                                "strex %1, %4, [%2]\t\n"
>> +                                "1:\t\n"
>> +                               : "=&r"(old), "=&r"(status)
>> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
>> +                               : MEMORY);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       /* Clear exclusive access monitor */
>> +                       __asm __volatile("clrex");
>> +                       /* Return current value */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               }
>> +               /* strex returns 0 on success */
>> +               if (odp_unlikely(status != 0)) {
>> +                       /* strex failed, reservation was disturbed */
>> +                       /* Return potentially changed value */
>> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
>> +                       return 0;
>> +               }
>> +               return 1;
>> +#elif defined __mips64__
>> +               uint32_t old;
>> +               uint32_t exp = *exp_p;
>> +               uint32_t status = val;
>> +               __asm __volatile("llw %0, [%2]\t\n"
>> +                                "bne %0, %3, 1f\t\n"
>> +                                "scw %1, [%2]\t\n"
>> +                                "1:\t\n"
>> +                               : "=&r"(old), "+&r"(status)
>> +                               : "r"(&ptr->v), "r"(exp)
>> +                               : MEMORY);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       /* Return current value */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               }
>> +               /* scw returns 1 on success, 0 on failure */
>> +               if (odp_unlikely(status == 0)) {
>> +                       /* scw failed, reservation was disturbed */
>> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
>> +                       return 0;
>> +               }
>> +               return 1;
>> +#elif defined __x86_64__
>> +               uint32_t exp = *exp_p;
>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp, val);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Return the unexpected content of '*ptr' */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               } else {
>> +                       return 1;
>> +               }
>> +#else
>> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
>> +               uint32_t exp = *exp_p;
>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp, val);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Return the unexpected content of '*ptr' */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               } else {
>> +                       return 1;
>> +               }
>> +#endif
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>> +}
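A usage sketch (illustrative only, not from the patch): since the weak
compare-and-swap may fail spuriously and refreshes *exp_p with the current
value on failure, the intended pattern is a retry loop. For example, a
hypothetical saturating increment helper:

#include <odp_atomic.h>

/* Increment 'cnt' atomically but never beyond 'max' (invented helper) */
static uint32_t saturating_inc(odp_atomic32_t *cnt, uint32_t max)
{
        uint32_t old = odp_atomic32_load(cnt, ODP_MEMORDER_RLX);

        do {
                if (old == max)
                        return old;     /* already at the limit */
                /* On failure 'old' is updated with the current value */
        } while (!odp_atomic32_cmp_xchg_weak(cnt, &old, old + 1,
                                             ODP_MEMORDER_RLX));
        return old;     /* value before our successful increment */
}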
>> +
>> +/**
>> + * Atomic fetch and add to 32-bit atomic variable
>> + * @note A - B <=> A + (-B)
>> + *
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>> + * @param memmodel Memory model associated with the add
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>> + *
>> + * @return Value of the atomic variable before the addition
>> + */
>> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
>> +               uint32_t incr,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +#if defined __arm__ /* A32/T32 ISA */
>> +               uint32_t old_val, tmp;
>> +               int status;
>> +               do {
>> +                       __asm __volatile("ldrex %0, [%3]\t\n"
>> +                                        "add   %1, %0, %4\t\n"
>> +                                        "strex %2, %1, [%3]\t\n"
>> +                                       : "=&r"(old_val), "=&r"(tmp),
>> +                                         "=&r"(status)
>> +                                       : "r"(&ptr->v), "r"(incr)
>> +                                       : MEMORY);
>> +               } while (odp_unlikely(status != 0));
>> +               return old_val;
>> +#elif defined __OCTEON__
>> +               uint32_t old_val;
>> +               __asm __volatile("laa %0,(%2),%3"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr), "r" (incr)
>> +                               : MEMORY);
>> +               return old_val;
>> +#elif defined __x86_64__
>> +               /* Generates good code on x86_64 */
>> +               return __sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
>> +               return __sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("laa %0,(%2),%3"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr), "r" (incr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return old_val;
>> +#endif
>> +               /* __sync_fetch_and_add() will give us barriers before and
>> +                * after, we are fine with this for release operations */
>> +               return __sync_fetch_and_add(&ptr->v, incr);
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>>  }
>>
>>  /**
>> - * Initialize atomic uint64
>> + * Atomic add to 32-bit atomic variable
>>   *
>> - * @param ptr    An atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>> + * @param memmodel Memory model associated with the add
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   */
>> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
>> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
>> +               uint32_t incr,
>> +               odp_memorder_t mmodel)
>>  {
>> -       *ptr = 0;
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Platforms that support atomic add instructions can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               __asm __volatile("saa %[inc], (%[base])"
>> +                               : "+m" (*ptr)
>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>> +                               : MEMORY);
>> +               return;
>> +#endif
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +               /* Platforms that support atomic add instructions can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("saa %[inc], (%[base])"
>> +                               : "+m" (*ptr)
>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return;
>> +#endif
>> +       }
>> +       /* Default to using odp_atomic32_fetch_add() */
>> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>>  }
>>
>>  /**
>> - * Load value of atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> + * Atomic fetch and increment of 32-bit atomic variable
>>   *
>> - * @return atomic uint64 value
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the increment
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @return Value of the atomic variable before the increment
>>   */
>> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
>> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>>  {
>> -       return *ptr;
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Platforms that support atomic increment instructions can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               __asm __volatile("lai %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               return old_val;
>> +#endif
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("lai %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return old_val;
>> +#endif
>> +       }
>> +       /* Default to using odp_atomic32_fetch_add() */
>> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>>  }
>>
>>  /**
>> - * Store value to atomic uint64
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> + * Atomic increment of 32-bit atomic variable
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the increment
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   */
>> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
>> -                                       uint64_t new_value)
>> -{
>> -       *ptr = new_value;
>> -}
>> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>>
>> -/**
>> - * Add atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - */
>> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
>> value)
>>  {
>> -       __sync_fetch_and_add(ptr, value);
>> +       /* Default to using odp_atomic32_fetch_inc() */
>> +       /* Platforms that support atomic increment instructions can add
>> +        * their implementations here */
>> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>>  }
>>
>>  /**
>> - * Fetch and add atomic uint64
>> + * Atomic fetch and decrement of 32-bit atomic variable
>>   *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the decrement
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   *
>> - * @return Value of the variable before the operation
>> + * @return Value of the atomic variable before the decrement
>>   */
>> -
>> -#if defined __powerpc__ && !defined __powerpc64__
>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>>  {
>> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
>> -                                   (uint32_t)value);
>> -}
>> -#else
>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Platforms that support atomic decrement instructions can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               __asm __volatile("lad %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               return old_val;
>>  #endif
>> -/**
>> - * Subtract atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - */
>> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
>> value)
>> -{
>> -       __sync_fetch_and_sub(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and subtract atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -#if defined __powerpc__ && !defined __powerpc64__
>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
>> -                                   (uint32_t)value);
>> -}
>> -#else
>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("lad %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return old_val;
>>  #endif
>> -/**
>> - * Fetch and increment atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_u64(ptr, 1);
>> -}
>> -
>> -/**
>> - * Increment atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_u64(ptr, 1);
>> +       }
>> +       /* Default to using odp_atomic32_fetch_add() */
>> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>>  }
>>
>>  /**
>> - * Fetch and decrement atomic uint64 by 1
>> + * Atomic decrement of 32-bit atomic variable
>>   *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the decrement
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   */
>> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>> -}
>> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
>> +               odp_memorder_t memorder)
>>
>> -/**
>> - * Decrement atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>>  {
>> -       odp_atomic_fetch_sub_u64(ptr, 1);
>> +       /* Default to using odp_atomic32_fetch_dec() */
>> +       /* Platforms that support atomic decrement instructions can add
>> +        * their implementations here */
>> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>>  }
>>
>> -/**
>> - * Atomic compare and set for 64bit
>> - *
>> - * @param dst destination location into which the value will be written.
>> - * @param exp expected value.
>> - * @param src new value.
>> - * @return Non-zero on success; 0 on failure.
>> - */
>> -static inline int
>> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
>> -{
>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>> -}
>> +/* We are not exporting this macro */
>> +#undef COMPILER_HW_BARRIER
>> +#undef MEMORY
>>
>>  #ifdef __cplusplus
>>  }
>> diff --git a/platform/linux-generic/include/api/odp_barrier.h b/platform/linux-generic/include/api/odp_barrier.h
>> index a7b3215..69b1eb8 100644
>> --- a/platform/linux-generic/include/api/odp_barrier.h
>> +++ b/platform/linux-generic/include/api/odp_barrier.h
>> @@ -27,18 +27,18 @@ extern "C" {
>>   * ODP execution barrier
>>   */
>>  typedef struct odp_barrier_t {
>> -       int              count;  /**< @private Thread count */
>> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
>> +       uint32_t       num_threads;  /**< @private Thread count (constant) */
>> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>>  } odp_barrier_t;
>>
>>
>>  /**
>>   * Init barrier with thread count
>>   *
>> - * @param barrier    Barrier
>> - * @param count      Thread count
>> + * @param barrier     Barrier
>> + * @param num_threads Number of threads which share the barrier
>>   */
>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>>
>>
>>  /**
>> diff --git a/platform/linux-generic/include/api/odp_counter.h b/platform/linux-generic/include/api/odp_counter.h
>> new file mode 100644
>>
>> index 0000000..f937d27
>>
>> --- /dev/null
>> +++ b/platform/linux-generic/include/api/odp_counter.h
>> @@ -0,0 +1,363 @@
>> +/* Copyright (c) 2013, Linaro Limited
>> + * All rights reserved.
>> + *
>> + * SPDX-License-Identifier:     BSD-3-Clause
>> + */
>> +
>> +/**
>> + * @file
>> + *
>> + * ODP atomic counter types and operations, suitable for e.g. shared statistics.
>> + * Relaxed memory model assumed for lowest overhead.
>> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
>> + * without using the required access functions.
>> + * Counter functions must be used to operate on counter variables!
>> + */
>> +
>> +#ifndef ODP_COUNTER_H_
>> +#define ODP_COUNTER_H_
>> +
>> +#include <stdint.h>
>> +#include <odp_align.h>
>> +#include <odp_hints.h>
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +/**
>> + * 32-bit (unsigned) atomic counter type
>> + */
>> +typedef struct {
>> +       uint32_t v; /**< Actual storage for the counter variable */
>> +} odp_counter32_t
>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>> +
>> +/**
>> + * 64-bit (unsigned) atomic counter type
>> + */
>> +typedef struct {
>> +       uint64_t v; /**< Actual storage for the counter variable */
>> +       /* Room for other data structures (e.g. spin lock) that might be
>> +        * needed to ensure atomicity on some architectures */
>> +} odp_counter64_t
>> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
>> +
>>
>> +/*****************************************************************************
>> + * Operations on 32-bit atomic counters
>> + * odp_counter32_init - returns no value
>> + * odp_counter32_read - returns current value
>> + * odp_counter32_write - returns no value
>> + * odp_counter32_add - returns no value
>> + * odp_counter32_read_inc - returns old value
>> + * odp_counter32_inc - returns no value
>> + *****************************************************************************/
>> +
>> +/**
>> + * Initialize 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + * @param val   Initial value
>> + */
>> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t val)
>> +{
>> +       /* No implementation requires any other type of initialization */
>> +       *(__volatile uint32_t *)&ptr->v = val;
>> +}
>> +
>> +/**
>> + * Read 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + *
>> + * @return Value of the variable
>> + */
>> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
>> +{
>> +       uint32_t val;
>> +       /* Read of aligned word is atomic */
>> +       /* Cast to volatile to force compiler to (re-) read variable, thus we
>> +        * will avoid using compiler memory barriers */
>> +       val = *(__volatile const uint32_t *)&ptr->v;
>> +       return val;
>> +}
>> +
>> +/**
>> + * Write 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + * @param val   Value to write to the variable
>> + */
>> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t val)
>> +{
>> +       /* Write of aligned word is atomic */
>> +       /* Cast to volatile to force compiler to (re-) write variable, thus we
>> +        * will avoid using compiler memory barriers */
>> +       *(__volatile uint32_t *)&ptr->v = val;
>> +}
>> +
>> +/**
>> + * Atomic add to 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + * @param incr  The value to be added to the counter variable
>> + */
>> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t incr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint32_t result;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>> +                                "add   %0, %0, %3\t\n"
>> +                                "strex %1, %0, [%2]"
>> +                                : "=&r"(result), "=&r"(status)
>> +                                : "r"(&ptr->v), "Ir" (incr)
>> +                                : );
>> +       } while (odp_unlikely(status != 0));
>> +#elif defined __OCTEON__
>> +       __asm __volatile("saa %[inc], (%[base])"
>> +                        : "+m" (*ptr)
>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>> +                        : );
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +       /* Warning odp_counter32_add() may not be efficiently implemented */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>> +}
>> +
>> +/**
>> + * Atomic increment (+1) of 32-bit counter variable, return original value
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + *
>> + * @return Original value of counter
>> + */
>> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint32_t result, tmp;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrex %0, [%3]\t\n"
>> +                                "add   %1, %0, #1\t\n"
>> +                                "strex %2, %1, [%3]"
>> +                                : "=&r"(result), "=&r"(tmp), "=&r"(status)
>> +                                : "r"(&ptr->v)
>> +                                : );
>> +       } while (odp_unlikely(status != 0));
>> +       return result;
>> +#elif defined __OCTEON__
>> +       uint32_t old_val;
>> +       __asm __volatile("lai %0,(%2)"
>> +                        : "=r" (old_val), "+m" (ptr)
>> +                        : "r" (ptr)
>> +                        : );
>> +       return old_val;
>> +#elif defined __x86_64__
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#else
>> +/* Warning odp_counter32_read_inc() may not be efficiently implemented */
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#endif
>> +}
>> +
>> +/**
>> + * Atomic increment (+1) 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + */
>> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
>> +{
>> +#if defined __OCTEON__
>> +       odp_counter32_add(ptr, 1);
>> +#else
>> +       (void)odp_counter32_read_inc(ptr);
>> +#endif
>> +}
>> +
>>
>> +/*****************************************************************************
>> + * Operations on 64-bit atomic counters
>> + * odp_counter64_init
>> + * odp_counter64_read
>> + * odp_counter64_write
>> + * odp_counter64_add
>> + * odp_counter64_read_inc
>> + * odp_counter64_inc
>> + *****************************************************************************/
>> +
>> +/**
>> + * Read 64-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + *
>> + * @return Value of the counter variable
>> + */
>> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t val;
>> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
>> +                        "clrex" /* Clear exclusive access monitor */
>> +                        : "=&r"(val)
>> +                        : "r"(&ptr->v)
>> +                        : );
>> +       return val;
>> +#elif defined __x86_64__ || defined __aarch64__
>> +       /* Read of aligned quad/double word is atomic */
>> +       return ptr->v;
>> +#else
>> +/* Warning odp_counter64_read() may not be efficiently implemented */
>> +       return __sync_fetch_and_or(&ptr->v, 0);
>> +#endif
>> +}
>> +
>> +/**
>> + * Write 64-bit counter variable
>> + *
>> + * @param ptr  Pointer to a 64-bit counter variable
>> + * @param val  Value to write to the counter variable
>> + */
>> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t val)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               /* Read counter variable exclusively so we can write to it
>> +                * later */
>> +               /* Attempt to write the new value */
>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>> +                                "strexd %1, %3, %H3, [%2]"
>> +                                : "=&r"(old_val), "=&r"(status)
>> +                                : "r"(&ptr->v), "r"(val)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>> +#elif defined __x86_64__ || defined __aarch64__
>> +       /* Write of aligned quad/double word is atomic */
>> +       ptr->v = val;
>> +#else
>> +/* Warning odp_counter64_write() may not be efficiently implemented */
>> +       /* This is actually a counter exchange operation */
>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>> +#endif
>> +}
>> +
>> +/**
>> + * Initialize 64-bit counter variable
>> + * Perform implementation specific initializations, assign initial value.
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + * @param val   Initial value
>> + */
>> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t val)
>> +{
>> +       /* No implementation requires any other type of initialization */
>> +       odp_counter64_write(ptr, val);
>> +}
>> +
>> +/**
>> + * Atomic add to 64-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + * @param incr  The value to be added to the counter variable
>> + */
>> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>> +                                "adds   %0, %0, %3\t\n"
>> +                                "adc    %H0, %H3\t\n"
>> +                                "strexd %1, %0, %H0, [%2]"
>> +                                : "=&r"(old_val), "=&r"(status)
>> +                                : "r"(&ptr->v), "r"(incr)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>> +#elif defined __OCTEON__
>> +       __asm __volatile("saad %[inc], (%[base])"
>> +                        : "+m" (*ptr)
>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>> +                        : );
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +/* Warning odp_counter64_add() may not be efficiently implemented */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>> +}
>> +
>> +
>> +/**
>> + * Atomic increment (+1) 64-bit counter variable and return original value
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + *
>> + * @return Original value of counter
>> + */
>> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val, tmp;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
>> +                                "adds   %2, %0, #1\t\n"
>> +                                "adc    %H2, %H0, #0\t\n"
>> +                                "strexd %1, %2, %H2, [%3]"
>> +                                : "=&r"(old_val), "=&r"(status), "=&r"(tmp)
>> +                                : "r"(&ptr->v)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>> +       return old_val;
>> +#elif defined __OCTEON__
>> +       uint64_t old_val;
>> +       __asm __volatile("laid %0,(%2)"
>> +                       : "=r" (old_val), "+m" (ptr)
>> +                       : "r" (ptr)
>> +                       : );
>> +       return old_val;
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#else
>> +/* Warning odp_counter64_read_inc() may not be efficiently implemented */
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#endif
>> +}
>> +
>> +/**
>> + * Atomic increment (+1) 64-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + */
>> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>> +                                "adds   %0, #1\t\n"
>> +                                "adc    %H0, #0\t\n"
>> +                                "strexd %1, %0, %H0, [%2]"
>> +                                : "=&r"(old_val), "=&r"(status)
>> +                                : "r"(&ptr->v)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>> +#else
>> +       (void)odp_counter64_read_inc(ptr);
>> +#endif
>> +}
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif
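A usage sketch of the counter API above (illustration only, not part of the
patch; the statistics names are invented): counters are intended for
statistics-style updates where atomicity is needed but ordering is not.

#include <odp_counter.h>

static odp_counter64_t rx_pkts;
static odp_counter64_t rx_bytes;

static void stats_init(void)
{
        odp_counter64_init(&rx_pkts, 0);
        odp_counter64_init(&rx_bytes, 0);
}

static void stats_update(uint32_t pkt_len)
{
        /* Relaxed atomic updates, safe to call from any thread */
        odp_counter64_inc(&rx_pkts);
        odp_counter64_add(&rx_bytes, pkt_len);
}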
>> diff --git a/platform/linux-generic/include/api/odp_rwlock.h b/platform/linux-generic/include/api/odp_rwlock.h
>> index 252ebb2..ff8a9a2 100644
>> --- a/platform/linux-generic/include/api/odp_rwlock.h
>> +++ b/platform/linux-generic/include/api/odp_rwlock.h
>> @@ -10,26 +10,30 @@
>>  /**
>>   * @file
>>   *
>> - * ODP RW Locks
>> + * ODP read/write lock
>> + * RW lock support mu
>> ...
>>
>> [Message clipped]
Ola Liljedahl Nov. 4, 2014, 3:06 p.m. UTC | #8
And what should be in each patch?

On 4 November 2014 16:03, Anders Roxell <anders.roxell@linaro.org> wrote:

> As Petri wrote in his first email, this patch should be broken up into
> multiple patches...
>
> Cheers,
> Anders
> On 4 Nov 2014 15:34, "Ola Liljedahl" <ola.liljedahl@linaro.org> wrote:
>
>> Possibly odp_atomic.h should then be internal, leaving odp_counter.h as
>> the only public API. The original odp_atomic.h is public so I left it
>> that way.
>>
>> The counter API does not allow the user to specify any memory ordering;
>> relaxed memory order is assumed, i.e. no ordering is guaranteed.
>>
>> Why does acquire/release not fit well with far atomics? And what do you
>> mean specifically by "far atomics"? Just the counter updates like Cavium
>> has?
>>
>> As the Linux kernel atomics interface predates C11/C++11 atomics support,
>> I do not see it as a model to follow.
>>
>> The patch summary contained a brief description of what I wanted to
>> achieve with the patch. What more do you want, a Google Docs design
>> document?
>>
>> -- Ola
>>
>> On 4 November 2014 15:22, Savolainen, Petri (NSN - FI/Espoo) <
>> petri.savolainen@nsn.com> wrote:
>>
>>>  There are many things I’d change in this patch. I think it’s better to
>>> take a step back and talk what you are trying to achieve here, and then
>>> correct those step by step. E.g. the whole idea of acquire / release does
>>> not fit well on far atomics, and far atomics is the thing I’d abstract from
>>> applications with this API. Other synchronization primitives (such as
>>> locks) would not be implemented (too often) by applications, so it’s not
>>> very productive to abstract that (implementation of locks). E.g. Linux
>>> kernel atomics.h looks pretty much like the odp_atomic.h.
>>>
>>>
>>>
>>> -Petri
>>>
>>>
>>>
>>>
>>>
>>> *From:* lng-odp-bounces@lists.linaro.org [mailto:
>>> lng-odp-bounces@lists.linaro.org] *On Behalf Of *ext Ola Liljedahl
>>> *Sent:* Tuesday, November 04, 2014 3:49 PM
>>> *To:* lng-odp@lists.linaro.org
>>> *Subject:* Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11
>>> memory model
>>>
>>>
>>>
>>> Ping!
>>>
>>>
>>>
>>> I really need this new working atomics support merged ASAP because I
>>> have a new lock-less implementation of the timer API which uses atomic
>>> operations. I haven't seen any real criticism of the content of the
>>> patch, so there is nothing to change.
>>>
>>>
>>>
>>> -- Ola
>>>
>>>
>>>
>>>
>>>
>>> On 20 October 2014 15:07, Ola Liljedahl <ola.liljedahl@linaro.org>
>>> wrote:
>>>
>>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>>> ---
>>> Added header file odp_counter.h with support for 32- and 64-bit atomic
>>> counters
>>> using relaxed memory order. 6 operations
>>> (init/read/write/add/read_inc/inc) on
>>> 32-bit and 64-bit counters respectively.
>>>
>>> Renamed odp_atomic_test to odp_counter_test and changed to use
>>> odp_counter.h
>>>
>>> Implementation of C11-based memory model for atomic operations. 10
>>> operations
>>> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
>>> in
>>> odp_atomic.h. The required memory ordering is now a parameter to each
>>> call just
>>> like in C11.
>>>
>>> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
>>> fall back to GCC __sync builtins which often include unnecessarily heavy
>>> barrier/sync operations (always sequentially consistent).
>>>
>>> Attempt to remove all explicit memory barriers (odp_sync_stores) from
>>> code that
>>> implements multithreaded synchronization primitives (e.g. locks,
>>> barriers).
>>> Rewrote such primitives to use the new atomic operations.
>>>
>>> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
>>> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
>>> enqueue/dequeue
>>> (missing release barrier, had only compiler barrier).
>>>
>>>  .gitignore                                         |   2 +-
>>>  example/generator/odp_generator.c                  |  43 +-
>>>  example/ipsec/odp_ipsec.c                          |   2 +-
>>>  example/odp_example/odp_example.c                  |   2 +-
>>>  example/timer/odp_timer_test.c                     |   2 +-
>>>  helper/include/odph_ring.h                         |   8 +-
>>>  platform/linux-generic/include/api/odp.h           |   1 +
>>>  platform/linux-generic/include/api/odp_atomic.h    | 838
>>> +++++++++++----------
>>>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>>>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>>>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>>>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>>>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>>>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>>>  platform/linux-generic/odp_barrier.c               |  49 +-
>>>  platform/linux-generic/odp_buffer.c                |   3 +-
>>>  platform/linux-generic/odp_crypto.c                |   7 +-
>>>  platform/linux-generic/odp_queue.c                 |   7 +-
>>>  platform/linux-generic/odp_ring.c                  |  94 +--
>>>  platform/linux-generic/odp_rwlock.c                |  62 +-
>>>  platform/linux-generic/odp_thread.c                |   9 +-
>>>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>>>  platform/linux-generic/odp_timer.c                 |  22 +-
>>>  test/api_test/Makefile.am                          |   6 +-
>>>  test/api_test/odp_atomic_test.c                    | 362 ---------
>>>  test/api_test/odp_atomic_test.h                    |  60 --
>>>  test/api_test/odp_common.c                         |   1 -
>>>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>>>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>>>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>>>  delete mode 100644 test/api_test/odp_atomic_test.c
>>>  delete mode 100644 test/api_test/odp_atomic_test.h
>>>  create mode 100644 test/api_test/odp_counter_test.c
>>>
>>> diff --git a/.gitignore b/.gitignore
>>> index 6342e34..77db4d6 100644
>>> --- a/.gitignore
>>> +++ b/.gitignore
>>> @@ -35,7 +35,7 @@ build/
>>>  odp_example
>>>  odp_packet
>>>  odp_packet_netmap
>>> -odp_atomic
>>> +odp_counter
>>>  odp_shm
>>>  odp_ring
>>>  odp_timer_ping
>>> diff --git a/example/generator/odp_generator.c
>>> b/example/generator/odp_generator.c
>>> index eb8b340..252157d 100644
>>> --- a/example/generator/odp_generator.c
>>> +++ b/example/generator/odp_generator.c
>>> @@ -62,10 +62,10 @@ typedef struct {
>>>   * counters
>>>  */
>>>  static struct {
>>> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
>>> -       odp_atomic_u64_t ip;    /**< ip packets */
>>> -       odp_atomic_u64_t udp;   /**< udp packets */
>>> -       odp_atomic_u64_t icmp;  /**< icmp packets */
>>> +       odp_counter64_t seq;    /**< ip seq to be send */
>>> +       odp_counter64_t ip;     /**< ip packets */
>>> +       odp_counter64_t udp;    /**< udp packets */
>>> +       odp_counter64_t icmp;   /**< icmp packets */
>>>  } counters;
>>>
>>>  /** * Thread specific arguments
>>> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>>> ODPH_UDPHDR_LEN +
>>>                                        ODPH_IPV4HDR_LEN);
>>>         ip->proto = ODPH_IPPROTO_UDP;
>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
>>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>>>         ip->id = odp_cpu_to_be_16(seq);
>>>         ip->chksum = 0;
>>>         odph_ipv4_csum_update(pkt);
>>> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>>> ODPH_ICMPHDR_LEN +
>>>                                        ODPH_IPV4HDR_LEN);
>>>         ip->proto = ODPH_IPPROTO_ICMP;
>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
>>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>>>         ip->id = odp_cpu_to_be_16(seq);
>>>         ip->chksum = 0;
>>>         odph_ipv4_csum_update(pkt);
>>> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>>>                 }
>>>
>>>                 if (args->appl.interval != 0) {
>>> +                       uint64_t seq = odp_counter64_read(&counters.seq);
>>>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
>>> -                              thr, counters.seq, counters.seq%0xffff);
>>> +                              thr, seq, seq%0xffff);
>>>                         /* TODO use odp timer */
>>>                         usleep(args->appl.interval * 1000);
>>>                 }
>>> -               if (args->appl.number != -1 && counters.seq
>>> -                   >= (unsigned int)args->appl.number) {
>>> +               if (args->appl.number != -1 &&
>>> +                   odp_counter64_read(&counters.seq) >=
>>> +                   (unsigned int)args->appl.number) {
>>>                         break;
>>>                 }
>>>         }
>>> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>>>         /* receive number of reply pks until timeout */
>>>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>>>                 while (args->appl.timeout >= 0) {
>>> -                       if (counters.icmp >= (unsigned
>>> int)args->appl.number)
>>> +                       if (odp_counter64_read(&counters.icmp) >=
>>> +                           (unsigned int)args->appl.number)
>>>                                 break;
>>>                         /* TODO use odp timer */
>>>                         sleep(1);
>>> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>>>
>>>         /* print info */
>>>         if (args->appl.mode == APPL_MODE_UDP) {
>>> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
>>> +               printf("  [%02i] total send: %ju\n", thr,
>>> +                      odp_counter64_read(&counters.seq));
>>>         } else if (args->appl.mode == APPL_MODE_PING) {
>>>                 printf("  [%02i] total send: %ju total receive: %ju\n",
>>> -                      thr, counters.seq, counters.icmp);
>>> +                      thr, odp_counter64_read(&counters.seq),
>>> +                      odp_counter64_read(&counters.icmp));
>>>         }
>>>         return arg;
>>>  }
>>> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
>>> pkt_tbl[], unsigned len)
>>>                 if (!odp_packet_inflag_ipv4(pkt))
>>>                         continue;
>>>
>>> -               odp_atomic_inc_u64(&counters.ip);
>>> +               odp_counter64_inc(&counters.ip);
>>>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>>>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>>>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
>>> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
>>> pkt_tbl[], unsigned len)
>>>
>>>                 /* udp */
>>>                 if (ip->proto == ODPH_IPPROTO_UDP) {
>>> -                       odp_atomic_inc_u64(&counters.udp);
>>> +                       odp_counter64_inc(&counters.udp);
>>>                         udp = (odph_udphdr_t *)(buf + offset);
>>>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>>>                                         odp_be_to_cpu_16(udp->length) -
>>> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
>>> pkt_tbl[], unsigned len)
>>>                         icmp = (odph_icmphdr_t *)(buf + offset);
>>>                         /* echo reply */
>>>                         if (icmp->type == ICMP_ECHOREPLY) {
>>> -                               odp_atomic_inc_u64(&counters.icmp);
>>> +                               odp_counter64_inc(&counters.icmp);
>>>                                 memcpy(&tvsend, buf + offset +
>>> ODPH_ICMPHDR_LEN,
>>>                                        sizeof(struct timeval));
>>>                                 /* TODO This should be changed to use an
>>> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>>>         }
>>>
>>>         /* init counters */
>>> -       odp_atomic_init_u64(&counters.seq);
>>> -       odp_atomic_init_u64(&counters.ip);
>>> -       odp_atomic_init_u64(&counters.udp);
>>> -       odp_atomic_init_u64(&counters.icmp);
>>> +       odp_counter64_init(&counters.seq, 0);
>>> +       odp_counter64_init(&counters.ip, 0);
>>> +       odp_counter64_init(&counters.udp, 0);
>>> +       odp_counter64_init(&counters.icmp, 0);
>>>
>>>         /* Reserve memory for args from shared mem */
>>>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
>>> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
>>> index 2f2dc19..76c27d0 100644
>>> --- a/example/ipsec/odp_ipsec.c
>>> +++ b/example/ipsec/odp_ipsec.c
>>> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>>>         printf("Num worker threads: %i\n", num_workers);
>>>
>>>         /* Create a barrier to synchronize thread startup */
>>> -       odp_barrier_init_count(&sync_barrier, num_workers);
>>> +       odp_barrier_init(&sync_barrier, num_workers);
>>>
>>>         /*
>>>          * By default core #0 runs Linux kernel background tasks.
>>> diff --git a/example/odp_example/odp_example.c
>>> b/example/odp_example/odp_example.c
>>> index 0e9aa3d..c473395 100644
>>> --- a/example/odp_example/odp_example.c
>>> +++ b/example/odp_example/odp_example.c
>>> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>>>         odp_shm_print_all();
>>>
>>>         /* Barrier to sync test case execution */
>>> -       odp_barrier_init_count(&globals->barrier, num_workers);
>>> +       odp_barrier_init(&globals->barrier, num_workers);
>>>
>>>         if (args.proc_mode) {
>>>                 int ret;
>>> diff --git a/example/timer/odp_timer_test.c
>>> b/example/timer/odp_timer_test.c
>>> index 78b2ae2..dfbeae9 100644
>>> --- a/example/timer/odp_timer_test.c
>>> +++ b/example/timer/odp_timer_test.c
>>> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>>>         printf("\n");
>>>
>>>         /* Barrier to sync test case execution */
>>> -       odp_barrier_init_count(&test_barrier, num_workers);
>>> +       odp_barrier_init(&test_barrier, num_workers);
>>>
>>>         /* Create and launch worker threads */
>>>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
>>> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
>>> index 76c1db8..5e78b34 100644
>>> --- a/helper/include/odph_ring.h
>>> +++ b/helper/include/odph_ring.h
>>> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>>>                 uint32_t sp_enqueue;     /* True, if single producer. */
>>>                 uint32_t size;           /* Size of ring. */
>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>> -               uint32_t head;          /* Producer head. */
>>> -               uint32_t tail;          /* Producer tail. */
>>> +               odp_atomic32_t head;    /* Producer head. */
>>> +               odp_atomic32_t tail;    /* Producer tail. */
>>>         } prod ODP_ALIGNED_CACHE;
>>>
>>>         /** @private Consumer */
>>> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>>>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>>>                 uint32_t size;           /* Size of the ring. */
>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>> -               uint32_t head;          /* Consumer head. */
>>> -               uint32_t tail;          /* Consumer tail. */
>>> +               odp_atomic32_t head;    /* Consumer head. */
>>> +               odp_atomic32_t tail;    /* Consumer tail. */
>>>         } cons ODP_ALIGNED_CACHE;
>>>
>>>         /** @private Memory space of ring starts here. */
>>> diff --git a/platform/linux-generic/include/api/odp.h
>>> b/platform/linux-generic/include/api/odp.h
>>> index 0ee3faf..d124d52 100644
>>> --- a/platform/linux-generic/include/api/odp.h
>>> +++ b/platform/linux-generic/include/api/odp.h
>>> @@ -32,6 +32,7 @@ extern "C" {
>>>  #include <odp_barrier.h>
>>>  #include <odp_spinlock.h>
>>>  #include <odp_atomic.h>
>>> +#include <odp_counter.h>
>>>
>>>  #include <odp_init.h>
>>>  #include <odp_system_info.h>
>>> diff --git a/platform/linux-generic/include/api/odp_atomic.h
>>> b/platform/linux-generic/include/api/odp_atomic.h
>>>
>>> index 0cc4cf4..ccaad02 100644
>>>
>>> --- a/platform/linux-generic/include/api/odp_atomic.h
>>> +++ b/platform/linux-generic/include/api/odp_atomic.h
>>> @@ -4,464 +4,494 @@
>>>   * SPDX-License-Identifier:     BSD-3-Clause
>>>   */
>>>
>>> -
>>>  /**
>>>   * @file
>>>   *
>>> - * ODP atomic operations
>>> + * ODP atomic types and operations, semantically a subset of C11 atomics.
>>> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
>>> + * without using the required access functions.
>>> + * Atomic functions must be used to operate on atomic variables!
>>>   */
>>>
>>>  #ifndef ODP_ATOMIC_H_
>>>  #define ODP_ATOMIC_H_
>>>
>>> +#include <stdint.h>
>>> +#include <odp_align.h>
>>> +#include <odp_hints.h>
>>> +#include <odp_debug.h>
>>> +
>>>  #ifdef __cplusplus
>>>  extern "C" {
>>>  #endif
>>>
>>> -
>>> -#include <odp_std_types.h>
>>> -
>>> -
>>> -/**
>>> - * Atomic integer
>>> - */
>>> -typedef volatile int32_t odp_atomic_int_t;
>>> -
>>> -/**
>>> - * Atomic unsigned integer 64 bits
>>> - */
>>> -typedef volatile uint64_t odp_atomic_u64_t;
>>> -
>>> -/**
>>> - * Atomic unsigned integer 32 bits
>>> - */
>>> -typedef volatile uint32_t odp_atomic_u32_t;
>>> -
>>> -
>>> -/**
>>> - * Initialize atomic integer
>>> - *
>>> - * @param ptr    An integer atomic variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       *ptr = 0;
>>> -}
>>> -
>>> -/**
>>> - * Load value of atomic integer
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return atomic integer value
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       return *ptr;
>>> -}
>>> -
>>> -/**
>>> - * Store value to atomic integer
>>> - *
>>> - * @param ptr        An atomic variable
>>> - * @param new_value  Store new_value to a variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
>>> new_value)
>>> -{
>>> -       *ptr = new_value;
>>> -}
>>> -
>>> -/**
>>> - * Fetch and add atomic integer
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
>>> value)
>>> -{
>>> -       return __sync_fetch_and_add(ptr, value);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and subtract atomic integer
>>> - *
>>> - * @param ptr    An atomic integer variable
>>> - * @param value  A value to be subtracted from the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
>>> value)
>>> -{
>>> -       return __sync_fetch_and_sub(ptr, value);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and increment atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_add_int(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Increment atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_add_int(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and decrement atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic int variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_sub_int(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Decrement atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_sub_int(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Initialize atomic uint32
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       *ptr = 0;
>>> -}
>>> -
>>> -/**
>>> - * Load value of atomic uint32
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return atomic uint32 value
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       return *ptr;
>>> -}
>>> -
>>> -/**
>>> - * Store value to atomic uint32
>>> - *
>>> - * @param ptr        An atomic variable
>>> - * @param new_value  Store new_value to a variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
>>> -                                       uint32_t new_value)
>>> -{
>>> -       *ptr = new_value;
>>> -}
>>> -
>>> -/**
>>> - * Fetch and add atomic uint32
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>>> -                                               uint32_t value)
>>> -{
>>> -       return __sync_fetch_and_add(ptr, value);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and subtract uint32
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be sub to the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
>>> -                                               uint32_t value)
>>> -{
>>> -       return __sync_fetch_and_sub(ptr, value);
>>> -}
>>> -
>>>  /**
>>> - * Fetch and increment atomic uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -#if defined __OCTEON__
>>> -
>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       uint32_t ret;
>>> -
>>> -       __asm__ __volatile__ ("syncws");
>>> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
>>> -                             "r" (ptr));
>>> -
>>> -       return ret;
>>> -}
>>> -
>>> + * 32-bit (unsigned) atomic type
>>> + */
>>> +typedef struct {
>>> +       uint32_t v; /**< Actual storage for the atomic variable */
>>> +} odp_atomic32_t
>>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>> +
>>> +typedef enum {
>>> +       /** Relaxed memory order, no ordering of other accesses enforced */
>>> +       ODP_MEMORDER_RLX,
>>> +       /** Acquire memory order, later accesses cannot move before
>>> +        * acquire operation */
>>> +       ODP_MEMORDER_ACQ,
>>> +       /** Release memory order, earlier accesses cannot move after
>>> +        * release operation */
>>> +       ODP_MEMORDER_RLS
>>> +} odp_memorder_t;
>>> +
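To illustrate how these orderings are intended to be combined (a sketch only;
the type and function names below are invented, but the pattern corresponds
to how the patch reworks the ticketlock): lock acquisition ends with an
acquire load and lock release uses a release operation, so no explicit
odp_sync_stores() barriers are needed.

#include <odp_atomic.h>

typedef struct {
        odp_atomic32_t next_ticket;     /* next ticket to hand out */
        odp_atomic32_t cur_ticket;      /* ticket currently being served */
} demo_ticketlock_t;

static void demo_lock(demo_ticketlock_t *tl)
{
        /* Taking a ticket needs no ordering by itself */
        uint32_t ticket = odp_atomic32_fetch_inc(&tl->next_ticket,
                                                 ODP_MEMORDER_RLX);
        /* Acquire: the critical section cannot move above this load */
        while (odp_atomic32_load(&tl->cur_ticket, ODP_MEMORDER_ACQ) != ticket)
                ;       /* spin */
}

static void demo_unlock(demo_ticketlock_t *tl)
{
        /* Release: the critical section cannot move below this add */
        odp_atomic32_add(&tl->cur_ticket, 1, ODP_MEMORDER_RLS);
}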
>>>
>>> +/*****************************************************************************
>>> + * Just some private helpers
>>>
>>> +*****************************************************************************/
>>> +
>>> +#ifdef __OCTEON__
>>> +/* OCTEON Write Memory Barrier */
>>> +#define COMPILER_HW_BARRIER() __asm __volatile( \
>>> +       /* Double syncw to work around errata */ \
>>> +       "syncw\n\tsyncw" : : : )
>>>  #else
>>> -
>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_add_u32(ptr, 1);
>>> -}
>>> -
>>> +/** Compiler and hardware full memory barrier */
>>> +#define COMPILER_HW_BARRIER() __sync_synchronize()
>>> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a */
>>>  #endif
>>>
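For comparison only (not something the patch itself uses): on GCC 4.7 or
newer, the same per-access orderings could be expressed with the C11-style
__atomic builtins instead of the full-barrier __sync builtins. A minimal
sketch, with invented helper names:

static inline uint32_t demo_load_acq(const uint32_t *p)
{
        return __atomic_load_n(p, __ATOMIC_ACQUIRE);    /* acquire load */
}

static inline void demo_store_rls(uint32_t *p, uint32_t val)
{
        __atomic_store_n(p, val, __ATOMIC_RELEASE);     /* release store */
}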
>>> -/**
>>> - * Increment atomic uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_add_u32(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and decrement uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_sub_u32(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Decrement atomic uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_sub_u32(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Atomic compare and set for 32bit
>>> - *
>>> - * @param dst destination location into which the value will be written.
>>> - * @param exp expected value.
>>> - * @param src new value.
>>> - * @return Non-zero on success; 0 on failure.
>>> - */
>>> -static inline int
>>> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
>>> -{
>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>> +#define MEMORY "memory"
>>> +
>>>
>>> +/*****************************************************************************
>>> + * Operations on 32-bit atomics
>>> + * odp_atomic32_init - no return value
>>> + * odp_atomic32_load - return current value
>>> + * odp_atomic32_store - no return value
>>> + * odp_atomic32_cmp_xchg_weak - return bool
>>> + * odp_atomic32_fetch_add - return old value
>>> + * odp_atomic32_add - no return value
>>> + * odp_atomic32_fetch_inc - return old value
>>> + * odp_atomic32_inc - no return value
>>> + * odp_atomic32_fetch_dec - return old value
>>> + * odp_atomic32_dec - no return value
>>> +
>>> *****************************************************************************/
>>> +
>>> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
>>> +{
>>> +       /* Write of aligned word is atomic */
>>> +       /* Cast to volatile to force compiler to (re-) write variable,
>>> thus we
>>> +        * can avoid using compiler memory barriers */
>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>> +}
>>> +
>>> +/**
>>> + * Atomic load of 32-bit atomic variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param mmodel Memory model associated with the load
>>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
>>> + *
>>> + * @return Value of the variable
>>> + */
>>> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
>>> +               odp_memorder_t mmodel)
>>> +{
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +               uint32_t val;
>>> +               /* Read of aligned word is atomic */
>>> +               /* Cast to volatile to force compiler to (re-) read
>>> variable,
>>> +                * thus we can avoid using compiler memory barriers */
>>> +               val = *(__volatile const uint32_t *)&ptr->v;
>>> +               return val;
>>> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
>>> +#if defined __aarch64__
>>> +               uint32_t val;
>>> +               __asm __volatile("ldar %w0, [%1]"
>>> +                               : "=&r"(val)
>>> +                               : "r"(&ptr->v)
>>> +                               : MEMORY);
>>> +               return val;
>>> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
>>> +               /* Read of aligned word is atomic */
>>> +               uint32_t val = ptr->v;
>>> +               /* To prevent later accesses from moving up */
>>> +               /* Herb Sutter claims HW barrier not needed on x86? */
>>> +               COMPILER_HW_BARRIER();
>>> +               return val;
>>> +#else
>>> +#warning odp_atomic32_load() may not be efficiently implemented
>>> +               /* Assume read of aligned word is atomic */
>>> +               uint32_t val = ptr->v;
>>> +               /* To prevent later accesses from moving up */
>>> +               COMPILER_HW_BARRIER();
>>> +               return val;
>>> +#endif
>>> +       } else {
>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>> +       }
>>> +}
>>> +
>>> +/**
>>> + * Atomic store to 32-bit atomic variable
>>> + *
>>> + * @param ptr  Pointer to a 32-bit atomic variable
>>> + * @param val  Value to write to the atomic variable
>>> + * @param mmodel Memory model associated with the store
>>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
>>> + */
>>> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
>>> +               uint32_t val,
>>> +               odp_memorder_t mmodel)
>>> +{
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +               /* Write of aligned word is atomic */
>>> +               /* Cast to volatile to force compiler to (re-) write
>>> variable,
>>> +                * thus we will avoid using compiler memory barriers */
>>> +               *(__volatile uint32_t *)&ptr->v = val;
>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
>>> +               /* Compiler and HW barrier to prevent earlier accesses
>>> from
>>> +                * moving down */
>>> +               COMPILER_HW_BARRIER();
>>> +               /* Write of aligned word is atomic */
>>> +               ptr->v = val;
>>> +               /* Compiler and HW barrier to prevent this store from
>>> moving
>>> +                * down after a later load-acquire and thus create
>>> overlapping
>>> +                * critical sections. Herb Sutter thinks this is needed
>>> */
>>> +               COMPILER_HW_BARRIER();
>>> +#elif defined __aarch64__
>>> +               __asm __volatile("stlr %w0, [%1]"
>>> +                               :
>>> +                               : "r"(val), "r"(&ptr->v)
>>> +                               : MEMORY);
>>> +#elif defined __x86_64__
>>> +               /* This is actually an atomic exchange operation */
>>> +               /* Generates good code on x86_64 */
>>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>>> +#else
>>> +#warning odp_atomic32_store() with release order may not be efficiently implemented
>>> +               /* This is actually an atomic exchange operation */
>>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>>> +#endif
>>> +       } else {
>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>> +       }
>>> +}
>>> +
>>> +
>>> +/**
>>> + * Atomic compare and exchange (swap) of 32-bit atomic variable
>>> + * "Weak" semantics, may fail spuriously and must be used in a loop.
>>> + *
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param exp_p Pointer to expected value (updated on failure)
>>> + * @param val   New value to write
>>> + * @param mmodel Memory model associated with the compare-and-swap
>>> + * operation (ODP_MEMORDER_RLX only)
>>> + *
>>> + * @return 1 (true) if exchange successful, 0 (false) if not successful
>>> (and
>>> + * '*exp_p' updated with current value)
>>> + */
>>> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
>>> +               uint32_t *exp_p,
>>> +               uint32_t val,
>>> +               odp_memorder_t mmodel)
>>> +{
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +               uint32_t old;
>>> +               uint32_t exp = *exp_p;
>>> +               int status;
>>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>>> +                                "cmp   %0, %3\t\n"
>>> +                                "bne   1f\t\n"
>>> +                                "strex %1, %4, [%2]\t\n"
>>> +                                "1:\t\n"
>>> +                               : "=&r"(old), "=&r"(status)
>>> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
>>> +                               : MEMORY);
>>> +               if (odp_unlikely(old != exp)) {
>>> +                       /* Value has changed, can't proceed */
>>> +                       /* Clear exclusive access monitor */
>>> +                       __asm __volatile("clrex");
>>> +                       /* Return current value */
>>> +                       *exp_p = old;
>>> +                       return 0;
>>> +               }
>>> +               /* strex returns 0 on success */
>>> +               if (odp_unlikely(status != 0)) {
>>> +                       /* strex failed, reservation was disturbed */
>>> +                       /* Return potentially changed value */
>>> +                       *exp_p = odp_atomic32_load(ptr,
>>> ODP_MEMORDER_RLX);
>>> +                       return 0;
>>> +               }
>>> +               return 1;
>>> +#elif defined __mips64__
>>> +               uint32_t old;
>>> +               uint32_t exp = *exp_p;
>>> +               uint32_t status = val;
>>> +               __asm __volatile("llw %0, [%2]\t\n"
>>> +                                "bne %0, %3, 1f\t\n"
>>> +                                "scw %1, [%2]\t\n"
>>> +                                "1:\t\n"
>>> +                               : "=&r"(old), "+&r"(status)
>>> +                               : "r"(&ptr->v), "r"(exp)
>>> +                               : MEMORY);
>>> +               if (odp_unlikely(old != exp)) {
>>> +                       /* Value has changed, can't proceed */
>>> +                       /* Return current value */
>>> +                       *exp_p = old;
>>> +                       return 0;
>>> +               }
>>> +               /* scw returns 1 on success, 0 on failure */
>>> +               if (odp_unlikely(status == 0)) {
>>> +                       /* scw failed, reservation was disturbed */
>>> +                       *exp_p = odp_atomic32_load(ptr,
>>> ODP_MEMORDER_RLX);
>>> +                       return 0;
>>> +               }
>>> +               return 1;
>>> +#elif defined __x86_64__
>>> +               uint32_t exp = *exp_p;
>>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
>>> val);
>>> +               if (odp_unlikely(old != exp)) {
>>> +                       /* Return the unexpected content of '*ptr' */
>>> +                       *exp_p = old;
>>> +                       return 0;
>>> +               } else {
>>> +                       return 1;
>>> +               }
>>> +#else
>>> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
>>> +               uint32_t exp = *exp_p;
>>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
>>> val);
>>> +               if (odp_unlikely(old != exp)) {
>>> +                       /* Return the unexpected content of '*ptr' */
>>> +                       *exp_p = old;
>>> +                       return 0;
>>> +               } else {
>>> +                       return 1;
>>> +               }
>>> +#endif
>>> +       } else {
>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>> +       }
>>> +}
>>> +
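Since the weak CAS may fail spuriously, every use is a retry loop. A small
sketch of the intended pattern (my_atomic32_max is a made-up name, not part
of the API):

static void my_atomic32_max(odp_atomic32_t *ptr, uint32_t val)
{
	uint32_t old = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
	do {
		if (old >= val)
			return; /* Current value is already large enough */
		/* On failure 'old' is refreshed with the current value and
		 * the loop retries */
	} while (!odp_atomic32_cmp_xchg_weak(ptr, &old, val,
					     ODP_MEMORDER_RLX));
}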
>>> +/**
>>> + * Atomic fetch and add to 32-bit atomic variable
>>> + * @note A - B <=> A + (-B)
>>> + *
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param incr  The value to be added to the atomic variable
>>> + * @param mmodel Memory model associated with the add
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>> + *
>>> + * @return Value of the atomic variable before the addition
>>> + */
>>> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
>>> +               uint32_t incr,
>>> +               odp_memorder_t mmodel)
>>> +{
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +               uint32_t old_val, tmp;
>>> +               int status;
>>> +               do {
>>> +                       __asm __volatile("ldrex %0, [%3]\t\n"
>>> +                                        "add   %1, %0, %4\t\n"
>>> +                                        "strex %2, %1, [%3]\t\n"
>>>
>>> +                                       : "=&r"(old_val), "=&r"(tmp),
>>>
>>> +                                         "=&r"(status)
>>> +                                       : "r"(&ptr->v), "r"(incr)
>>> +                                       : MEMORY);
>>> +               } while (odp_unlikely(status != 0));
>>> +               return old_val;
>>> +#elif defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               __asm __volatile("laa %0,(%2),%3"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr), "r" (incr)
>>> +                               : MEMORY);
>>> +               return old_val;
>>> +#elif defined __x86_64__
>>> +               /* Generates good code on x86_64 */
>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>> +#else
>>> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>> +#endif
>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>> +#if defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               COMPILER_HW_BARRIER();
>>> +               __asm __volatile("laa %0,(%2),%3"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr), "r" (incr)
>>> +                               : MEMORY);
>>> +               COMPILER_HW_BARRIER();
>>> +               return old_val;
>>> +#endif
>>> +               /* __sync_fetch_and_add() will give us barriers before
>>> and
>>> +                * after, we are fine with this for release operations */
>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>> +       } else {
>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>> +       }
>>>  }
>>>
>>>  /**
>>> - * Initialize atomic uint64
>>> + * Atomic add to 32-bit atomic variable
>>>   *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param incr  The value to be added to the atomic variable
>>> + * @param mmodel Memory model associated with the add
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>   */
>>> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
>>> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
>>> +               uint32_t incr,
>>> +               odp_memorder_t mmodel)
>>>  {
>>> -       *ptr = 0;
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +               /* Platforms that support atomic add instructions can add
>>> +                * their implementations here */
>>> +#if defined __OCTEON__
>>> +               __asm __volatile("saa %[inc], (%[base])"
>>> +                               : "+m" (*ptr)
>>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>>> +                               : MEMORY);
>>> +               return;
>>> +#endif
>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>> +               /* Platforms that support atomic add instructions can add
>>> +                * their implementations here */
>>> +#if defined __OCTEON__
>>> +               COMPILER_HW_BARRIER();
>>> +               __asm __volatile("saa %[inc], (%[base])"
>>> +                               : "+m" (*ptr)
>>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>>> +                               : MEMORY);
>>> +               COMPILER_HW_BARRIER();
>>> +               return;
>>> +#endif
>>> +       }
>>> +       /* Default to using odp_atomic32_fetch_add() */
>>> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>>>  }
>>>
>>>  /**
>>> - * Load value of atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> + * Atomic fetch and increment of 32-bit atomic variable
>>>   *
>>> - * @return atomic uint64 value
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param mmodel Memory model associated with the increment
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>   *
>>> - * @note The operation is not synchronized with other threads
>>> + * @return Value of the atomic variable before the increment
>>>   */
>>> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
>>> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
>>> +               odp_memorder_t mmodel)
>>>  {
>>> -       return *ptr;
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +               /* Platforms that support atomic increment instructions
>>> can add
>>> +                * their implementations here */
>>> +#if defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               __asm __volatile("lai %0,(%2)"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr)
>>> +                               : MEMORY);
>>> +               return old_val;
>>> +#endif
>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>> +#if defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               COMPILER_HW_BARRIER();
>>> +               __asm __volatile("lai %0,(%2)"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr)
>>> +                               : MEMORY);
>>> +               COMPILER_HW_BARRIER();
>>> +               return old_val;
>>> +#endif
>>> +       }
>>> +       /* Default to using odp_atomic32_fetch_add() */
>>> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>>>  }
>>>
>>>  /**
>>> - * Store value to atomic uint64
>>> - *
>>> - * @param ptr        An atomic variable
>>> - * @param new_value  Store new_value to a variable
>>> + * Atomic increment of 32-bit atomic variable
>>>   *
>>> - * @note The operation is not synchronized with other threads
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param mmodel Memory model associated with the increment
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>   */
>>> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
>>> -                                       uint64_t new_value)
>>> -{
>>> -       *ptr = new_value;
>>> -}
>>> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
>>> +               odp_memorder_t mmodel)
>>>
>>> -/**
>>> - * Add atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
>>> value)
>>>  {
>>> -       __sync_fetch_and_add(ptr, value);
>>> +       /* Default to using odp_atomic32_fetch_inc() */
>>> +       /* Platforms that support atomic increment instructions can add
>>> +        * their implementations here */
>>> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>>>  }
>>>
>>>  /**
>>> - * Fetch and add atomic uint64
>>> + * Atomic fetch and decrement of 32-bit atomic variable
>>>   *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param mmodel Memory model associated with the decrement
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>   *
>>> - * @return Value of the variable before the operation
>>> + * @return Value of the atomic variable before the decrement
>>>   */
>>> -
>>> -#if defined __powerpc__ && !defined __powerpc64__
>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
>>> +               odp_memorder_t mmodel)
>>>  {
>>> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
>>> -                                   (uint32_t)value);
>>> -}
>>> -#else
>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> -{
>>> -       return __sync_fetch_and_add(ptr, value);
>>> -}
>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>> +               /* Platforms that support atomic decrement instructions
>>> can add
>>> +                * their implementations here */
>>> +#if defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               __asm __volatile("lad %0,(%2)"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr)
>>> +                               : MEMORY);
>>> +               return old_val;
>>>  #endif
>>> -/**
>>> - * Subtract atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be subtracted from the variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
>>> value)
>>> -{
>>> -       __sync_fetch_and_sub(ptr, value);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and subtract atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be subtracted from the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -#if defined __powerpc__ && !defined __powerpc64__
>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> -{
>>> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
>>> -                                   (uint32_t)value);
>>> -}
>>> -#else
>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> -{
>>> -       return __sync_fetch_and_sub(ptr, value);
>>> -}
>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>> +#if defined __OCTEON__
>>> +               uint32_t old_val;
>>> +               COMPILER_HW_BARRIER();
>>> +               __asm __volatile("lad %0,(%2)"
>>> +                               : "=r" (old_val), "+m" (ptr)
>>> +                               : "r" (ptr)
>>> +                               : MEMORY);
>>> +               COMPILER_HW_BARRIER();
>>> +               return old_val;
>>>  #endif
>>> -/**
>>> - * Fetch and increment atomic uint64 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_add_u64(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Increment atomic uint64 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_add_u64(ptr, 1);
>>> +       }
>>> +       /* Default to using odp_atomic32_fetch_add() */
>>> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>>>  }
>>>
>>>  /**
>>> - * Fetch and decrement atomic uint64 by 1
>>> + * Atomic decrement of 32-bit atomic variable
>>>   *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param memorder Memory model associated with the decrement
>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>   */
>>> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>>> -}
>>> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
>>> +               odp_memorder_t memorder)
>>>
>>> -/**
>>> - * Decrement atomic uint64 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>>>  {
>>> -       odp_atomic_fetch_sub_u64(ptr, 1);
>>> +       /* Default to using odp_atomic32_fetch_dec() */
>>> +       /* Platforms that support atomic decrement instructions can add
>>> +        * their implementations here */
>>> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>>>  }
>>>
>>> -/**
>>> - * Atomic compare and set for 64bit
>>> - *
>>> - * @param dst destination location into which the value will be written.
>>> - * @param exp expected value.
>>> - * @param src new value.
>>> - * @return Non-zero on success; 0 on failure.
>>> - */
>>> -static inline int
>>> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
>>> -{
>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>> -}
>>> +/* We are not exporting this macro */
>>> +#undef COMPILER_HW_BARRIER
>>> +#undef MEMORY
>>>
>>>  #ifdef __cplusplus
>>>  }
>>> diff --git a/platform/linux-generic/include/api/odp_barrier.h
>>> b/platform/linux-generic/include/api/odp_barrier.h
>>> index a7b3215..69b1eb8 100644
>>> --- a/platform/linux-generic/include/api/odp_barrier.h
>>> +++ b/platform/linux-generic/include/api/odp_barrier.h
>>> @@ -27,18 +27,18 @@ extern "C" {
>>>   * ODP execution barrier
>>>   */
>>>  typedef struct odp_barrier_t {
>>> -       int              count;  /**< @private Thread count */
>>> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
>>> +       uint32_t       num_threads;  /**< @private Thread count
>>> (constant) */
>>> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>>>  } odp_barrier_t;
>>>
>>>
>>>  /**
>>>   * Init barrier with thread count
>>>   *
>>> - * @param barrier    Barrier
>>> - * @param count      Thread count
>>> + * @param barrier     Barrier
>>> + * @param num_threads Number of threads which share the barrier
>>>   */
>>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
>>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>>>
>>>
>>>  /**
>>> diff --git a/platform/linux-generic/include/api/odp_counter.h
>>> b/platform/linux-generic/include/api/odp_counter.h
>>> new file mode 100644
>>>
>>> index 0000000..f937d27
>>>
>>> --- /dev/null
>>> +++ b/platform/linux-generic/include/api/odp_counter.h
>>> @@ -0,0 +1,363 @@
>>> +/* Copyright (c) 2013, Linaro Limited
>>> + * All rights reserved.
>>> + *
>>> + * SPDX-License-Identifier:     BSD-3-Clause
>>> + */
>>> +
>>> +/**
>>> + * @file
>>> + *
>>> + * ODP atomic counter types and operations, suitable for e.g. shared
>>> statistics.
>>> + * Relaxed memory model assumed for lowest overhead.
>>> + * Scalar variable wrapped in a struct to avoid accessing scalar
>>> directly
>>> + * without using the required access functions.
>>> + * Counter functions must be used to operate on counter variables!
>>> + */
>>> +
>>> +#ifndef ODP_COUNTER_H_
>>> +#define ODP_COUNTER_H_
>>> +
>>> +#include <stdint.h>
>>> +#include <odp_align.h>
>>> +#include <odp_hints.h>
>>> +
>>> +#ifdef __cplusplus
>>> +extern "C" {
>>> +#endif
>>> +
>>> +/**
>>> + * 32-bit (unsigned) atomic counter type
>>> + */
>>> +typedef struct {
>>> +       uint32_t v; /**< Actual storage for the counter variable */
>>> +} odp_counter32_t
>>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>> +
>>> +/**
>>> + * 64-bit (unsigned) atomic counter type
>>> + */
>>> +typedef struct {
>>> +       uint64_t v; /**< Actual storage for the counter variable */
>>> +       /* Room for other data structures (e.g. spin lock) that might be
>>> +        * needed to ensure atomicity on some architectures */
>>> +} odp_counter64_t
>>> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
>>> +
>>>
>>> +/*****************************************************************************
>>> + * Operations on 32-bit atomic counters
>>> + * odp_counter32_init - returns no value
>>> + * odp_counter32_read - returns current value
>>> + * odp_counter32_write - returns no value
>>> + * odp_counter32_add - returns no value
>>> + * odp_counter32_read_inc - returns old value
>>> + * odp_counter32_inc - returns no value
>>> +
>>> *****************************************************************************/
>>> +
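A short usage sketch for shared statistics (the names rx_drops, my_stats_init,
my_on_drop and my_stats_read are illustrative): one counter per statistic,
bumped from any thread, read by a monitor thread; no ordering with other
memory accesses is implied.

static odp_counter32_t rx_drops;

static void my_stats_init(void)
{
	odp_counter32_init(&rx_drops, 0);
}

static void my_on_drop(void)
{
	odp_counter32_inc(&rx_drops);  /* Relaxed, cheap on all targets */
}

static uint32_t my_stats_read(void)
{
	return odp_counter32_read(&rx_drops);  /* Plain snapshot */
}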
>>> +/**
>>> + * Initialize 32-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + * @param val   Initial value
>>> + */
>>> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t
>>> val)
>>> +{
>>> +       /* No implementation requires any other type of initialization */
>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>> +}
>>> +
>>> +/**
>>> + * Read 32-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + *
>>> + * @return Value of the variable
>>> + */
>>> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
>>> +{
>>> +       uint32_t val;
>>> +       /* Read of aligned word is atomic */
>>> +       /* Cast to volatile to force compiler to (re-) read variable,
>>> thus we
>>> +        * will avoid using compiler memory barriers */
>>> +       val = *(__volatile const uint32_t *)&ptr->v;
>>> +       return val;
>>> +}
>>> +
>>> +/**
>>> + * Write 32-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + * @param val   Value to write to the variable
>>> + */
>>> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t
>>> val)
>>> +{
>>> +       /* Write of aligned word is atomic */
>>> +       /* Cast to volatile to force compiler to (re-) write variable,
>>> thus we
>>> +        * will avoid using compiler memory barriers */
>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>> +}
>>> +
>>> +/**
>>> + * Atomic add to 32-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + * @param incr  The value to be added to the counter variable
>>> + */
>>> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t
>>> incr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint32_t result;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>>> +                                "add   %0, %0, %3\t\n"
>>> +                                "strex %1, %0, [%2]"
>>> +                                : "=&r"(result), "=&r"(status)
>>> +                                : "r"(&ptr->v), "Ir" (incr)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0));
>>> +#elif defined __OCTEON__
>>> +       __asm __volatile("saa %[inc], (%[base])"
>>> +                        : "+m" (*ptr)
>>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>>> +                        : );
>>> +#elif defined __x86_64__
>>> +       /* Generates good code on x86_64 */
>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>> +#else
>>> +       /* Warning odp_counter32_add() may not be efficiently
>>> implemented */
>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>> +#endif
>>> +}
>>> +
>>> +/**
>>> + * Atomic increment (+1) of 32-bit counter variable, return original
>>> value
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + *
>>> + * @return Original value of counter
>>> + */
>>> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint32_t result, tmp;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrex %0, [%3]\t\n"
>>> +                                "add   %1, %0, #1\t\n"
>>> +                                "strex %2, %1, [%3]"
>>>
>>> +                                : "=&r"(result), "=&r"(tmp),
>>> "=&r"(status)
>>>
>>> +                                : "r"(&ptr->v)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0));
>>> +       return result;
>>> +#elif defined __OCTEON__
>>> +       uint32_t old_val;
>>> +       __asm __volatile("lai %0,(%2)"
>>> +                        : "=r" (old_val), "+m" (ptr)
>>> +                        : "r" (ptr)
>>> +                        : );
>>> +       return old_val;
>>> +#elif defined __x86_64__
>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>> +#else
>>> +/* Warning odp_counter32_read_inc() may not be efficiently implemented
>>> */
>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>> +#endif
>>> +}
>>> +
>>> +/**
>>> + * Atomic increment (+1) 32-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 32-bit counter variable
>>> + */
>>> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
>>> +{
>>> +#if defined __OCTEON__
>>> +       odp_counter32_add(ptr, 1);
>>> +#else
>>> +       (void)odp_counter32_read_inc(ptr);
>>> +#endif
>>> +}
>>> +
>>>
>>> +/*****************************************************************************
>>> + * Operations on 64-bit atomic counters
>>> + * odp_counter64_init
>>> + * odp_counter64_read
>>> + * odp_counter64_write
>>> + * odp_counter64_add
>>> + * odp_counter64_read_inc
>>> + * odp_counter64_inc
>>> +
>>> *****************************************************************************/
>>> +
>>> +/**
>>> + * Read 64-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 64-bit counter variable
>>> + *
>>> + * @return Value of the counter variable
>>> + */
>>> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t val;
>>> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
>>> +                        "clrex" /* Clear exclusive access monitor */
>>> +                        : "=&r"(val)
>>> +                        : "r"(&ptr->v)
>>> +                        : );
>>> +       return val;
>>> +#elif defined __x86_64__ || defined __aarch64__
>>> +       /* Read of aligned quad/double word is atomic */
>>> +       return ptr->v;
>>> +#else
>>> +/* Warning odp_counter64_read() may not be efficiently implemented */
>>> +       return __sync_fetch_and_or(&ptr->v, 0);
>>> +#endif
>>> +}
>>> +
>>> +/**
>>> + * Write 64-bit counter variable
>>> + *
>>> + * @param ptr  Pointer to a 64-bit counter variable
>>> + * @param val  Value to write to the counter variable
>>> + */
>>> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t
>>> val)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t old_val;
>>> +       int status;
>>> +       do {
>>> +               /* Read counter variable exclusively so we can write to
>>> it
>>> +                * later */
>>> +               /* Attempt to write the new value */
>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>> +                                "strexd %1, %3, %H3, [%2]"
>>> +                                : "=&r"(old_val), "=&r"(status)
>>> +                                : "r"(&ptr->v), "r"(val)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>> succeeds */
>>> +#elif defined __x86_64__ || defined __aarch64__
>>> +       /* Write of aligned quad/double word is atomic */
>>> +       ptr->v = val;
>>> +#else
>>> +/* Warning odp_counter64_write() may not be efficiently implemented */
>>> +       /* This is actually an atomic exchange operation */
>>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>> +#endif
>>> +}
>>> +
>>> +/**
>>> + * Initialize 64-bit counter variable
>>> + * Perform implementation specific initializations, assign initial
>>> value.
>>> + *
>>> + * @param ptr   Pointer to a 64-bit counter variable
>>> + * @param val   Initial value
>>> + */
>>> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t
>>> val)
>>> +{
>>> +       /* No implementation requires any other type of initialization */
>>> +       odp_counter64_write(ptr, val);
>>> +}
>>> +
>>> +/**
>>> + * Atomic add to 64-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 64-bit counter variable
>>> + * @param incr  The value to be added to the counter variable
>>> + */
>>> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t
>>> incr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t old_val;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>> +                                "adds   %0, %0, %3\t\n"
>>> +                                "adc    %H0, %H3\t\n"
>>> +                                "strexd %1, %0, %H0, [%2]"
>>> +                                : "=&r"(old_val), "=&r"(status)
>>> +                                : "r"(&ptr->v), "r"(incr)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>> succeeds */
>>> +#elif defined __OCTEON__
>>> +       __asm __volatile("saad %[inc], (%[base])"
>>> +                        : "+m" (*ptr)
>>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>>> +                        : );
>>> +#elif defined __x86_64__
>>> +       /* Generates good code on x86_64 */
>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>> +#else
>>> +/* Warning odp_counter64_add() may not be efficiently implemented */
>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>> +#endif
>>> +}
>>> +
>>> +
>>> +/**
>>> + * Atomic increment (+1) 64-bit counter variable and return original
>>> value
>>> + *
>>> + * @param ptr   Pointer to a 64-bit counter variable
>>> + *
>>> + * @return Original value of counter
>>> + */
>>> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t old_val, tmp;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
>>> +                                "adds   %2, %0, #1\t\n"
>>> +                                "adc    %H2, %H0, #0\t\n"
>>> +                                "strexd %1, %2, %H2, [%3]"
>>> +                                : "=&r"(old_val), "=&r"(status),
>>> "=&r"(tmp)
>>> +                                : "r"(&ptr->v)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>> succeeds */
>>> +       return old_val;
>>> +#elif defined __OCTEON__
>>> +       uint64_t old_val;
>>> +       __asm __volatile("laid %0,(%2)"
>>> +                       : "=r" (old_val), "+m" (ptr)
>>> +                       : "r" (ptr)
>>> +                       : );
>>> +       return old_val;
>>> +#elif defined __x86_64__
>>> +       /* Generates good code on x86_64 */
>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>> +#else
>>> +/* Warning odp_counter64_read_inc() may not be efficiently implemented
>>> */
>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>> +#endif
>>> +}
>>> +
>>> +/**
>>> + * Atomic increment (+1) 64-bit counter variable
>>> + *
>>> + * @param ptr   Pointer to a 64-bit counter variable
>>> + */
>>> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t old_val;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>> +                                "adds   %0, #1\t\n"
>>> +                                "adc    %H0, #0\t\n"
>>> +                                "strexd %1, %0, %H0, [%2]"
>>> +                                : "=&r"(old_val), "=&r"(status)
>>> +                                : "r"(&ptr->v)
>>> +                                : );
>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>> succeeds */
>>> +#else
>>> +       (void)odp_counter64_read_inc(ptr);
>>> +#endif
>>> +}
>>> +
>>> +#ifdef __cplusplus
>>> +}
>>> +#endif
>>> +
>>> +#endif
>>> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
>>> b/platform/linux-generic/include/api/odp_rwlock.h
>>> index 252ebb2..ff8a9a2 100644
>>> --- a/platform/linux-generic/include/api/odp_rwlock.h
>>> +++ b/platform/linux-generic/include/api/odp_rwlock.h
>>> @@ -10,26 +10,30 @@
>>>  /**
>>>   * @file
>>>   *
>>> - * ODP RW Locks
>>> + * ODP read/write lock
>>> + * RW lock support mu
>>> ...
>>>
>>> [Message clipped]
>>
>>
>>
Mike Holmes Nov. 4, 2014, 3:58 p.m. UTC | #9
Generally, fix one problem per patch; solving several things at once makes it
hard to reason about the implications. Can these be separated?

Added header file odp_counter.h with support for 32- and 64-bit atomic
counters
using relaxed memory order. 6 operations (init/read/write/add/read_inc/inc)
on
32-bit and 64-bit counters respectively.
Renamed odp_atomic_test to odp_counter_test and changed to use odp_counter.h

Implementation of C11-based memory model for atomic operations. 10
operations
(init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec) in
odp_atomic.h. The required memory ordering is now a parameter to each call
just
like in C11.

Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
fall back to GCC __sync builtins which often include unnecessarily heavy
barrier/sync operations (always sequentially consistent).

Attempt to remove all explicit memory barriers (odp_sync_stores) from code
that
implements multithreaded synchronization primitives (e.g. locks, barriers).
Rewrote such primitives to use the new atomic operations.

Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
odp_ticketlock_lock() (missing acquire barrier) and odp_ring enqueue/dequeue

On 4 November 2014 10:06, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:

> And what should be in each patch?
>
> On 4 November 2014 16:03, Anders Roxell <anders.roxell@linaro.org> wrote:
>
>> As Petri wrote in his first email, this patch should be broken up into
>> multiple patches...
>>
>> Cheers,
>> Anders
>> On 4 Nov 2014 15:34, "Ola Liljedahl" <ola.liljedahl@linaro.org> wrote:
>>
>>> Possibly odp_atomic.h should then be internal, leaving odp_counter.h as
>>> the only public API. The original odp_atomic.h is public so I left it
>>> that way.
>>>
>>> The counter API does not allow the user to specify any memory ordering,
>>> relaxed memory order is expected, i.e. no ordering is guaranteed.
>>>
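As a concrete illustration of "no ordering is guaranteed" (made-up names, only
a sketch): a relaxed counter must not be used to publish data to another
thread, whereas a release-ordered atomic from odp_atomic.h can be.

static uint32_t        shared_data;
static odp_counter32_t stat_counter;  /* Statistics only, relaxed      */
static odp_atomic32_t  publish_flag;  /* Hand-off to a reader, release */

static void my_producer(void)
{
	shared_data = 42;
	/* The counter update may become visible before the store above */
	odp_counter32_inc(&stat_counter);
	/* The release store may not: a reader that loads the flag with
	 * ODP_MEMORDER_ACQ and sees 1 will also see shared_data == 42 */
	odp_atomic32_store(&publish_flag, 1, ODP_MEMORDER_RLS);
}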
>>> Why does acquire/release not fit well with the far atomics? And what do
>>> you mean specifically by "far atomics"? Just the counter updates like
>>> Cavium has?
>>>
>>> As the Linux kernel atomics interface predates C11/C++11 atomics support,
>>> I do not see it as a model to follow.
>>>
>>> The patch summary contained a brief description of what I wanted to
>>> achieve with the patch. What more do you want, a Google Docs design
>>> document?
>>>
>>> -- Ola
>>>
>>> On 4 November 2014 15:22, Savolainen, Petri (NSN - FI/Espoo) <
>>> petri.savolainen@nsn.com> wrote:
>>>
>>>>  There are many things I’d change in this patch. I think it’s better
>>>> to take a step back and talk about what you are trying to achieve here,
>>>> and then correct those step by step. E.g. the whole idea of acquire/release
>>>> does not fit well with far atomics, and far atomics is the thing I’d abstract from
>>>> applications with this API. Other synchronization primitives (such as
>>>> locks) would not be implemented (too often) by applications, so it’s not
>>>> very productive to abstract that (implementation of locks). E.g. Linux
>>>> kernel atomics.h looks pretty much like the odp_atomic.h.
>>>>
>>>>
>>>>
>>>> -Petri
>>>>
>>>>
>>>>
>>>>
>>>>
>>>> *From:* lng-odp-bounces@lists.linaro.org [mailto:
>>>> lng-odp-bounces@lists.linaro.org] *On Behalf Of *ext Ola Liljedahl
>>>> *Sent:* Tuesday, November 04, 2014 3:49 PM
>>>> *To:* lng-odp@lists.linaro.org
>>>> *Subject:* Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11
>>>> memory model
>>>>
>>>>
>>>>
>>>> Ping!
>>>>
>>>>
>>>>
>>>> I really need this new working atomics support merged ASAP because I
>>>> have a new lock-less implementation of the timer API which uses atomic
>>>> operations. I haven't seen any real criticism against the content of the
>>>> patch so there is nothing to change.
>>>>
>>>>
>>>>
>>>> -- Ola
>>>>
>>>>
>>>>
>>>>
>>>>
>>>> On 20 October 2014 15:07, Ola Liljedahl <ola.liljedahl@linaro.org>
>>>> wrote:
>>>>
>>>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>>>> ---
>>>> Added header file odp_counter.h with support for 32- and 64-bit atomic
>>>> counters
>>>> using relaxed memory order. 6 operations
>>>> (init/read/write/add/read_inc/inc) on
>>>> 32-bit and 64-bit counters respectively.
>>>>
>>>> Renamed odp_atomic_test to odp_counter_test and changed to use
>>>> odp_counter.h
>>>>
>>>> Implementation of C11-based memory model for atomic operations. 10
>>>> operations
>>>> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
>>>> in
>>>> odp_atomic.h. The required memory ordering is now a parameter to each
>>>> call just
>>>> like in C11.
>>>>
>>>> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
>>>> fall back to GCC __sync builtins which often include unnecessarily heavy
>>>> barrier/sync operations (always sequentially consistent).
>>>>
>>>> Attempt to remove all explicit memory barriers (odp_sync_stores) from
>>>> code that
>>>> implements multithreaded synchronization primitives (e.g. locks,
>>>> barriers).
>>>> Rewrote such primitives to use the new atomic operations.
>>>>
>>>> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of
>>>> counter),
>>>> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
>>>> enqueue/dequeue
>>>> (missing release barrier, had only compiler barrier).
>>>>
>>>>  .gitignore                                         |   2 +-
>>>>  example/generator/odp_generator.c                  |  43 +-
>>>>  example/ipsec/odp_ipsec.c                          |   2 +-
>>>>  example/odp_example/odp_example.c                  |   2 +-
>>>>  example/timer/odp_timer_test.c                     |   2 +-
>>>>  helper/include/odph_ring.h                         |   8 +-
>>>>  platform/linux-generic/include/api/odp.h           |   1 +
>>>>  platform/linux-generic/include/api/odp_atomic.h    | 838
>>>> +++++++++++----------
>>>>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>>>>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>>>>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>>>>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>>>>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>>>>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>>>>  platform/linux-generic/odp_barrier.c               |  49 +-
>>>>  platform/linux-generic/odp_buffer.c                |   3 +-
>>>>  platform/linux-generic/odp_crypto.c                |   7 +-
>>>>  platform/linux-generic/odp_queue.c                 |   7 +-
>>>>  platform/linux-generic/odp_ring.c                  |  94 +--
>>>>  platform/linux-generic/odp_rwlock.c                |  62 +-
>>>>  platform/linux-generic/odp_thread.c                |   9 +-
>>>>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>>>>  platform/linux-generic/odp_timer.c                 |  22 +-
>>>>  test/api_test/Makefile.am                          |   6 +-
>>>>  test/api_test/odp_atomic_test.c                    | 362 ---------
>>>>  test/api_test/odp_atomic_test.h                    |  60 --
>>>>  test/api_test/odp_common.c                         |   1 -
>>>>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>>>>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>>>>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>>>>  delete mode 100644 test/api_test/odp_atomic_test.c
>>>>  delete mode 100644 test/api_test/odp_atomic_test.h
>>>>  create mode 100644 test/api_test/odp_counter_test.c
>>>>
>>>> diff --git a/.gitignore b/.gitignore
>>>> index 6342e34..77db4d6 100644
>>>> --- a/.gitignore
>>>> +++ b/.gitignore
>>>> @@ -35,7 +35,7 @@ build/
>>>>  odp_example
>>>>  odp_packet
>>>>  odp_packet_netmap
>>>> -odp_atomic
>>>> +odp_counter
>>>>  odp_shm
>>>>  odp_ring
>>>>  odp_timer_ping
>>>> diff --git a/example/generator/odp_generator.c
>>>> b/example/generator/odp_generator.c
>>>> index eb8b340..252157d 100644
>>>> --- a/example/generator/odp_generator.c
>>>> +++ b/example/generator/odp_generator.c
>>>> @@ -62,10 +62,10 @@ typedef struct {
>>>>   * counters
>>>>  */
>>>>  static struct {
>>>> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
>>>> -       odp_atomic_u64_t ip;    /**< ip packets */
>>>> -       odp_atomic_u64_t udp;   /**< udp packets */
>>>> -       odp_atomic_u64_t icmp;  /**< icmp packets */
>>>> +       odp_counter64_t seq;    /**< ip seq to be send */
>>>> +       odp_counter64_t ip;     /**< ip packets */
>>>> +       odp_counter64_t udp;    /**< udp packets */
>>>> +       odp_counter64_t icmp;   /**< icmp packets */
>>>>  } counters;
>>>>
>>>>  /** * Thread specific arguments
>>>> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>>>> ODPH_UDPHDR_LEN +
>>>>                                        ODPH_IPV4HDR_LEN);
>>>>         ip->proto = ODPH_IPPROTO_UDP;
>>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
>>>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>>>>         ip->id = odp_cpu_to_be_16(seq);
>>>>         ip->chksum = 0;
>>>>         odph_ipv4_csum_update(pkt);
>>>> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>>>> ODPH_ICMPHDR_LEN +
>>>>                                        ODPH_IPV4HDR_LEN);
>>>>         ip->proto = ODPH_IPPROTO_ICMP;
>>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
>>>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>>>>         ip->id = odp_cpu_to_be_16(seq);
>>>>         ip->chksum = 0;
>>>>         odph_ipv4_csum_update(pkt);
>>>> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>>>>                 }
>>>>
>>>>                 if (args->appl.interval != 0) {
>>>> +                       uint64_t seq =
>>>> odp_counter64_read(&counters.seq);
>>>>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
>>>> -                              thr, counters.seq, counters.seq%0xffff);
>>>> +                              thr, seq, seq%0xffff);
>>>>                         /* TODO use odp timer */
>>>>                         usleep(args->appl.interval * 1000);
>>>>                 }
>>>> -               if (args->appl.number != -1 && counters.seq
>>>> -                   >= (unsigned int)args->appl.number) {
>>>> +               if (args->appl.number != -1 &&
>>>> +                   odp_counter64_read(&counters.seq) >=
>>>> +                   (unsigned int)args->appl.number) {
>>>>                         break;
>>>>                 }
>>>>         }
>>>> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>>>>         /* receive number of reply pks until timeout */
>>>>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0)
>>>> {
>>>>                 while (args->appl.timeout >= 0) {
>>>> -                       if (counters.icmp >= (unsigned
>>>> int)args->appl.number)
>>>> +                       if (odp_counter64_read(&counters.icmp) >=
>>>> +                           (unsigned int)args->appl.number)
>>>>                                 break;
>>>>                         /* TODO use odp timer */
>>>>                         sleep(1);
>>>> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>>>>
>>>>         /* print info */
>>>>         if (args->appl.mode == APPL_MODE_UDP) {
>>>> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
>>>> +               printf("  [%02i] total send: %ju\n", thr,
>>>> +                      odp_counter64_read(&counters.seq));
>>>>         } else if (args->appl.mode == APPL_MODE_PING) {
>>>>                 printf("  [%02i] total send: %ju total receive: %ju\n",
>>>> -                      thr, counters.seq, counters.icmp);
>>>> +                      thr, odp_counter64_read(&counters.seq),
>>>> +                      odp_counter64_read(&counters.icmp));
>>>>         }
>>>>         return arg;
>>>>  }
>>>> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
>>>> pkt_tbl[], unsigned len)
>>>>                 if (!odp_packet_inflag_ipv4(pkt))
>>>>                         continue;
>>>>
>>>> -               odp_atomic_inc_u64(&counters.ip);
>>>> +               odp_counter64_inc(&counters.ip);
>>>>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>>>>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>>>>                 ip = (odph_ipv4hdr_t *)(buf +
>>>> odp_packet_l3_offset(pkt));
>>>> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
>>>> pkt_tbl[], unsigned len)
>>>>
>>>>                 /* udp */
>>>>                 if (ip->proto == ODPH_IPPROTO_UDP) {
>>>> -                       odp_atomic_inc_u64(&counters.udp);
>>>> +                       odp_counter64_inc(&counters.udp);
>>>>                         udp = (odph_udphdr_t *)(buf + offset);
>>>>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>>>>                                         odp_be_to_cpu_16(udp->length) -
>>>> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
>>>> pkt_tbl[], unsigned len)
>>>>                         icmp = (odph_icmphdr_t *)(buf + offset);
>>>>                         /* echo reply */
>>>>                         if (icmp->type == ICMP_ECHOREPLY) {
>>>> -                               odp_atomic_inc_u64(&counters.icmp);
>>>> +                               odp_counter64_inc(&counters.icmp);
>>>>                                 memcpy(&tvsend, buf + offset +
>>>> ODPH_ICMPHDR_LEN,
>>>>                                        sizeof(struct timeval));
>>>>                                 /* TODO This should be changed to use an
>>>> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>>>>         }
>>>>
>>>>         /* init counters */
>>>> -       odp_atomic_init_u64(&counters.seq);
>>>> -       odp_atomic_init_u64(&counters.ip);
>>>> -       odp_atomic_init_u64(&counters.udp);
>>>> -       odp_atomic_init_u64(&counters.icmp);
>>>> +       odp_counter64_init(&counters.seq, 0);
>>>> +       odp_counter64_init(&counters.ip, 0);
>>>> +       odp_counter64_init(&counters.udp, 0);
>>>> +       odp_counter64_init(&counters.icmp, 0);
>>>>
>>>>         /* Reserve memory for args from shared mem */
>>>>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
>>>> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
>>>> index 2f2dc19..76c27d0 100644
>>>> --- a/example/ipsec/odp_ipsec.c
>>>> +++ b/example/ipsec/odp_ipsec.c
>>>> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>>>>         printf("Num worker threads: %i\n", num_workers);
>>>>
>>>>         /* Create a barrier to synchronize thread startup */
>>>> -       odp_barrier_init_count(&sync_barrier, num_workers);
>>>> +       odp_barrier_init(&sync_barrier, num_workers);
>>>>
>>>>         /*
>>>>          * By default core #0 runs Linux kernel background tasks.
>>>> diff --git a/example/odp_example/odp_example.c
>>>> b/example/odp_example/odp_example.c
>>>> index 0e9aa3d..c473395 100644
>>>> --- a/example/odp_example/odp_example.c
>>>> +++ b/example/odp_example/odp_example.c
>>>> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>>>>         odp_shm_print_all();
>>>>
>>>>         /* Barrier to sync test case execution */
>>>> -       odp_barrier_init_count(&globals->barrier, num_workers);
>>>> +       odp_barrier_init(&globals->barrier, num_workers);
>>>>
>>>>         if (args.proc_mode) {
>>>>                 int ret;
>>>> diff --git a/example/timer/odp_timer_test.c
>>>> b/example/timer/odp_timer_test.c
>>>> index 78b2ae2..dfbeae9 100644
>>>> --- a/example/timer/odp_timer_test.c
>>>> +++ b/example/timer/odp_timer_test.c
>>>> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>>>>         printf("\n");
>>>>
>>>>         /* Barrier to sync test case execution */
>>>> -       odp_barrier_init_count(&test_barrier, num_workers);
>>>> +       odp_barrier_init(&test_barrier, num_workers);
>>>>
>>>>         /* Create and launch worker threads */
>>>>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
>>>> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
>>>> index 76c1db8..5e78b34 100644
>>>> --- a/helper/include/odph_ring.h
>>>> +++ b/helper/include/odph_ring.h
>>>> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>>>>                 uint32_t sp_enqueue;     /* True, if single producer. */
>>>>                 uint32_t size;           /* Size of ring. */
>>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>>> -               uint32_t head;          /* Producer head. */
>>>> -               uint32_t tail;          /* Producer tail. */
>>>> +               odp_atomic32_t head;    /* Producer head. */
>>>> +               odp_atomic32_t tail;    /* Producer tail. */
>>>>         } prod ODP_ALIGNED_CACHE;
>>>>
>>>>         /** @private Consumer */
>>>> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>>>>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>>>>                 uint32_t size;           /* Size of the ring. */
>>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>>> -               uint32_t head;          /* Consumer head. */
>>>> -               uint32_t tail;          /* Consumer tail. */
>>>> +               odp_atomic32_t head;    /* Consumer head. */
>>>> +               odp_atomic32_t tail;    /* Consumer tail. */
>>>>         } cons ODP_ALIGNED_CACHE;
>>>>
>>>>         /** @private Memory space of ring starts here. */
>>>> diff --git a/platform/linux-generic/include/api/odp.h
>>>> b/platform/linux-generic/include/api/odp.h
>>>> index 0ee3faf..d124d52 100644
>>>> --- a/platform/linux-generic/include/api/odp.h
>>>> +++ b/platform/linux-generic/include/api/odp.h
>>>> @@ -32,6 +32,7 @@ extern "C" {
>>>>  #include <odp_barrier.h>
>>>>  #include <odp_spinlock.h>
>>>>  #include <odp_atomic.h>
>>>> +#include <odp_counter.h>
>>>>
>>>>  #include <odp_init.h>
>>>>  #include <odp_system_info.h>
>>>> diff --git a/platform/linux-generic/include/api/odp_atomic.h
>>>> b/platform/linux-generic/include/api/odp_atomic.h
>>>>
>>>> index 0cc4cf4..ccaad02 100644
>>>>
>>>> --- a/platform/linux-generic/include/api/odp_atomic.h
>>>> +++ b/platform/linux-generic/include/api/odp_atomic.h
>>>> @@ -4,464 +4,494 @@
>>>>   * SPDX-License-Identifier:     BSD-3-Clause
>>>>   */
>>>>
>>>> -
>>>>  /**
>>>>   * @file
>>>>   *
>>>> - * ODP atomic operations
>>>> + * ODP atomic types and operations, semantically a subset of C11
>>>> atomics.
>>>> + * Scalar variable wrapped in a struct to avoid accessing scalar
>>>> directly
>>>> + * without using the required access functions.
>>>> + * Atomic functions must be used to operate on atomic variables!
>>>>   */
>>>>
>>>>  #ifndef ODP_ATOMIC_H_
>>>>  #define ODP_ATOMIC_H_
>>>>
>>>> +#include <stdint.h>
>>>> +#include <odp_align.h>
>>>> +#include <odp_hints.h>
>>>> +#include <odp_debug.h>
>>>> +
>>>>  #ifdef __cplusplus
>>>>  extern "C" {
>>>>  #endif
>>>>
>>>> -
>>>> -#include <odp_std_types.h>
>>>> -
>>>> -
>>>> -/**
>>>> - * Atomic integer
>>>> - */
>>>> -typedef volatile int32_t odp_atomic_int_t;
>>>> -
>>>> -/**
>>>> - * Atomic unsigned integer 64 bits
>>>> - */
>>>> -typedef volatile uint64_t odp_atomic_u64_t;
>>>> -
>>>> -/**
>>>> - * Atomic unsigned integer 32 bits
>>>> - */
>>>> -typedef volatile uint32_t odp_atomic_u32_t;
>>>> -
>>>> -
>>>> -/**
>>>> - * Initialize atomic integer
>>>> - *
>>>> - * @param ptr    An integer atomic variable
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> - */
>>>> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       *ptr = 0;
>>>> -}
>>>> -
>>>> -/**
>>>> - * Load value of atomic integer
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return atomic integer value
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> - */
>>>> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       return *ptr;
>>>> -}
>>>> -
>>>> -/**
>>>> - * Store value to atomic integer
>>>> - *
>>>> - * @param ptr        An atomic variable
>>>> - * @param new_value  Store new_value to a variable
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> - */
>>>> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
>>>> new_value)
>>>> -{
>>>> -       *ptr = new_value;
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and add atomic integer
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be added to the variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
>>>> value)
>>>> -{
>>>> -       return __sync_fetch_and_add(ptr, value);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and subtract atomic integer
>>>> - *
>>>> - * @param ptr    An atomic integer variable
>>>> - * @param value  A value to be subtracted from the variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
>>>> value)
>>>> -{
>>>> -       return __sync_fetch_and_sub(ptr, value);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and increment atomic integer by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_add_int(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Increment atomic integer by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       odp_atomic_fetch_add_int(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and decrement atomic integer by 1
>>>> - *
>>>> - * @param ptr    An atomic int variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_sub_int(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Decrement atomic integer by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       odp_atomic_fetch_sub_int(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Initialize atomic uint32
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> - */
>>>> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       *ptr = 0;
>>>> -}
>>>> -
>>>> -/**
>>>> - * Load value of atomic uint32
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return atomic uint32 value
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> - */
>>>> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       return *ptr;
>>>> -}
>>>> -
>>>> -/**
>>>> - * Store value to atomic uint32
>>>> - *
>>>> - * @param ptr        An atomic variable
>>>> - * @param new_value  Store new_value to a variable
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> - */
>>>> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
>>>> -                                       uint32_t new_value)
>>>> -{
>>>> -       *ptr = new_value;
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and add atomic uint32
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be added to the variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>>>> -                                               uint32_t value)
>>>> -{
>>>> -       return __sync_fetch_and_add(ptr, value);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and subtract uint32
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be sub to the variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
>>>> -                                               uint32_t value)
>>>> -{
>>>> -       return __sync_fetch_and_sub(ptr, value);
>>>> -}
>>>> -
>>>>  /**
>>>> - * Fetch and increment atomic uint32 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -#if defined __OCTEON__
>>>> -
>>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       uint32_t ret;
>>>> -
>>>> -       __asm__ __volatile__ ("syncws");
>>>> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
>>>> -                             "r" (ptr));
>>>> -
>>>> -       return ret;
>>>> -}
>>>> -
>>>> + * 32-bit (unsigned) atomic type
>>>> + */
>>>> +typedef struct {
>>>> +       uint32_t v; /**< Actual storage for the atomic variable */
>>>> +} odp_atomic32_t
>>>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>>> +
>>>> +typedef enum {
>>>> +       /** Relaxed memory order, no ordering of other accesses
>>>> enforced */
>>>> +       ODP_MEMORDER_RLX,
>>>> +       /** Acquire memory order, later accesses cannot move before
>>>> +        * acquire operation */
>>>> +       ODP_MEMORDER_ACQ,
>>>> +       /** Release memory order, earlier accesses cannot move after
>>>> +        * release operation */
>>>> +       ODP_MEMORDER_RLS
>>>> +} odp_memorder_t;
>>>> +
>>>>
>>>> +/*****************************************************************************
>>>> + * Just some private helpers
>>>>
>>>> +*****************************************************************************/
>>>> +
>>>> +#ifdef __OCTEON__
>>>> +/* OCTEON Write Memory Barrier */
>>>> +#define COMPILER_HW_BARRIER() __asm __volatile( \
>>>> +       /* Double syncw to work around errata */ \
>>>> +       "syncw\n\tsyncw" : : : )
>>>>  #else
>>>> -
>>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_add_u32(ptr, 1);
>>>> -}
>>>> -
>>>> +/** Compiler and hardware full memory barrier */
>>>> +#define COMPILER_HW_BARRIER() __sync_synchronize()
>>>> +/* __sync_synchronize() generates the right insn for ARMv6t2 and
>>>> ARMv7-a */
>>>>  #endif
>>>>
>>>> -/**
>>>> - * Increment atomic uint32 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       odp_atomic_fetch_add_u32(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and decrement uint32 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_sub_u32(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Decrement atomic uint32 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       odp_atomic_fetch_sub_u32(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Atomic compare and set for 32bit
>>>> - *
>>>> - * @param dst destination location into which the value will be
>>>> written.
>>>> - * @param exp expected value.
>>>> - * @param src new value.
>>>> - * @return Non-zero on success; 0 on failure.
>>>> - */
>>>> -static inline int
>>>> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t
>>>> src)
>>>> -{
>>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>>> +#define MEMORY "memory"
>>>> +
>>>>
>>>> +/*****************************************************************************
>>>> + * Operations on 32-bit atomics
>>>> + * odp_atomic32_init - no return value
>>>> + * odp_atomic32_load - return current value
>>>> + * odp_atomic32_store - no return value
>>>> + * odp_atomic32_cmp_xchg_weak - return bool
>>>> + * odp_atomic32_fetch_add - return old value
>>>> + * odp_atomic32_add - no return value
>>>> + * odp_atomic32_fetch_inc - return old value
>>>> + * odp_atomic32_inc - no return value
>>>> + * odp_atomic32_fetch_dec - return old value
>>>> + * odp_atomic32_dec - no return value
>>>> +
>>>> *****************************************************************************/
>>>> +
>>>> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
>>>> +{
>>>> +       /* Write of aligned word is atomic */
>>>> +       /* Cast to volatile to force compiler to (re-) write variable,
>>>> thus we
>>>> +        * can avoid using compiler memory barriers */
>>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>>> +}
>>>> +
>>>> +/**
>>>> + * Atomic load of 32-bit atomic variable
>>>> + *
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param memmodel Memory model associated with the load
>>>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
>>>> + *
>>>> + * @return Value of the variable
>>>> + */
>>>> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
>>>> +               odp_memorder_t mmodel)
>>>> +{
>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>> +               uint32_t val;
>>>> +               /* Read of aligned word is atomic */
>>>> +               /* Cast to volatile to force compiler to (re-) read
>>>> variable,
>>>> +                * thus we can avoid using compiler memory barriers */
>>>> +               val = *(__volatile const uint32_t *)&ptr->v;
>>>> +               return val;
>>>> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
>>>> +#if defined __aarch64__
>>>> +               uint32_t val;
>>>> +               __asm __volatile("ldar %w0, [%1]"
>>>> +                               : "=&r"(val)
>>>> +                               : "r"(&ptr->v)
>>>> +                               : MEMORY);
>>>> +               return val;
>>>> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
>>>> +               /* Read of aligned word is atomic */
>>>> +               uint32_t val = ptr->v;
>>>> +               /* To prevent later accesses from moving up */
>>>> +               /* Herb Sutter claims HW barrier not needed on x86? */
>>>> +               COMPILER_HW_BARRIER();
>>>> +               return val;
>>>> +#else
>>>> +#warning odp_atomic32_load() may not be efficiently implemented
>>>> +               /* Assume read of aligned word is atomic */
>>>> +               uint32_t val = ptr->v;
>>>> +               /* To prevent later accesses from moving up */
>>>> +               COMPILER_HW_BARRIER();
>>>> +               return val;
>>>> +#endif
>>>> +       } else {
>>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>>> +       }
>>>> +}
>>>> +
>>>> +/**
>>>> + * Atomic store to 32-bit atomic variable
>>>> + *
>>>> + * @param ptr  Pointer to a 32-bit atomic variable
>>>> + * @param val  Value to write to the atomic variable
>>>> + * @param memmodel Memory model associated with the store
>>>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
>>>> + */
>>>> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
>>>> +               uint32_t val,
>>>> +               odp_memorder_t mmodel)
>>>> +{
>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>> +               /* Write of aligned word is atomic */
>>>> +               /* Cast to volatile to force compiler to (re-) write
>>>> variable,
>>>> +                * thus we will avoid using compiler memory barriers */
>>>> +               *(__volatile uint32_t *)&ptr->v = val;
>>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>>> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
>>>> +               /* Compiler and HW barrier to prevent earlier accesses
>>>> from
>>>> +                * moving down */
>>>> +               COMPILER_HW_BARRIER();
>>>> +               /* Write of aligned word is atomic */
>>>> +               ptr->v = val;
>>>> +               /* Compiler and HW barrier to prevent this store from
>>>> moving
>>>> +                * down after a later load-acquire and thus create
>>>> overlapping
>>>> +                * critical sections. Herb Sutter thinks this is needed
>>>> */
>>>> +               COMPILER_HW_BARRIER();
>>>> +#elif defined __aarch64__
>>>> +               __asm __volatile("stlr %w0, [%1]"
>>>> +                               :
>>>> +                               : "r"(val), "r"(&ptr->v)
>>>> +                               : MEMORY);
>>>> +#elif defined __x86_64__
>>>> +               /* This is actually an atomic exchange operation */
>>>> +               /* Generates good code on x86_64 */
>>>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>>>> +#else
>>>> +#warning odp_atomic32_store_rls() may not be efficiently implemented
>>>> +               /* This is actually an atomic exchange operation */
>>>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>>>> +#endif
>>>> +       } else {
>>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>>> +       }
>>>> +}
>>>> +
>>>> +
>>>> +/**
>>>> + * Atomic compare and exchange (swap) of 32-bit atomic variable
>>>> + * "Weak" semantics, may fail spuriously and must be used in a loop.
>>>> + *
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param exp_p Pointer to expected value (updated on failure)
>>>> + * @param val   New value to write
>>>> + * @param       memmodel Memory model associated with the
>>>> compare-and-swap
>>>> + * operation (ODP_MEMORDER_RLX only)
>>>> + *
>>>> + * @return 1 (true) if exchange successful, 0 (false) if not
>>>> successful (and
>>>> + * '*exp_p' updated with current value)
>>>> + */
>>>> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
>>>> +               uint32_t *exp_p,
>>>> +               uint32_t val,
>>>> +               odp_memorder_t mmodel)
>>>> +{
>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +               uint32_t old;
>>>> +               uint32_t exp = *exp_p;
>>>> +               int status;
>>>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>>>> +                                "cmp   %0, %3\t\n"
>>>> +                                "bne   1f\t\n"
>>>> +                                "strex %1, %4, [%2]\t\n"
>>>> +                                "1:\t\n"
>>>> +                               : "=&r"(old), "=&r"(status)
>>>> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
>>>> +                               : MEMORY);
>>>> +               if (odp_unlikely(old != exp)) {
>>>> +                       /* Value has changed, can't proceed */
>>>> +                       /* Clear exclusive access monitor */
>>>> +                       __asm __volatile("clrex");
>>>> +                       /* Return current value */
>>>> +                       *exp_p = old;
>>>> +                       return 0;
>>>> +               }
>>>> +               /* strex returns 0 on success */
>>>> +               if (odp_unlikely(status != 0)) {
>>>> +                       /* strex failed, reservation was disturbed */
>>>> +                       /* Return potentially changed value */
>>>> +                       *exp_p = odp_atomic32_load(ptr,
>>>> ODP_MEMORDER_RLX);
>>>> +                       return 0;
>>>> +               }
>>>> +               return 1;
>>>> +#elif defined __mips64__
>>>> +               uint32_t old;
>>>> +               uint32_t exp = *exp_p;
>>>> +               uint32_t status = val;
>>>> +               __asm __volatile("llw %0, [%2]\t\n"
>>>> +                                "bne %0, %3, 1f\t\n"
>>>> +                                "scw %1, [%2]\t\n"
>>>> +                                "1:\t\n"
>>>> +                               : "=&r"(old), "+&r"(status)
>>>> +                               : "r"(&ptr->v), "r"(exp)
>>>> +                               : MEMORY);
>>>> +               if (odp_unlikely(old != exp)) {
>>>> +                       /* Value has changed, can't proceed */
>>>> +                       /* Return current value */
>>>> +                       *exp_p = old;
>>>> +                       return 0;
>>>> +               }
>>>> +               /* scw returns 1 on success, 0 on failure */
>>>> +               if (odp_unlikely(status == 0)) {
>>>> +                       /* scw failed, reservation was disturbed */
>>>> +                       *exp_p = odp_atomic32_load(ptr,
>>>> ODP_MEMORDER_RLX);
>>>> +                       return 0;
>>>> +               }
>>>> +               return 1;
>>>> +#elif defined __x86_64__
>>>> +               uint32_t exp = *exp_p;
>>>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v,
>>>> exp, val);
>>>> +               if (odp_unlikely(old != exp)) {
>>>> +                       /* Return the unexpected content of '*ptr' */
>>>> +                       *exp_p = old;
>>>> +                       return 0;
>>>> +               } else {
>>>> +                       return 1;
>>>> +               }
>>>> +#else
>>>> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently
>>>> implemented
>>>> +               uint32_t exp = *exp_p;
>>>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v,
>>>> exp, val);
>>>> +               if (odp_unlikely(old != exp)) {
>>>> +                       /* Return the unexpected content of '*ptr' */
>>>> +                       *exp_p = old;
>>>> +                       return 0;
>>>> +               } else {
>>>> +                       return 1;
>>>> +               }
>>>> +#endif
>>>> +       } else {
>>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>>> +       }
>>>> +}
>>>> +
>>>> +/**
>>>> + * Atomic fetch and add to 32-bit atomic variable
>>>> + * @note A - B <=> A + (-B)
>>>> + *
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param incr  The value to be added to the atomic variable
>>>> + * @param memmodel Memory model associated with the add
>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>> + *
>>>> + * @return Value of the atomic variable before the addition
>>>> + */
>>>> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
>>>> +               uint32_t incr,
>>>> +               odp_memorder_t mmodel)
>>>> +{
>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +               uint32_t old_val, tmp;
>>>> +               int status;
>>>> +               do {
>>>> +                       __asm __volatile("ldrex %0, [%3]\t\n"
>>>> +                                        "add   %1, %0, %4\t\n"
>>>> +                                        "strex %2, %1, [%3]\t\n"
>>>>
>>>> +                                       : "=&r"(old_val), "=&r"(tmp),
>>>>
>>>> +                                         "=&r"(status)
>>>> +                                       : "r"(&ptr->v), "r"(incr)
>>>> +                                       : MEMORY);
>>>> +               } while (odp_unlikely(status != 0));
>>>> +               return old_val;
>>>> +#elif defined __OCTEON__
>>>> +               uint32_t old_val;
>>>> +               __asm __volatile("laa %0,(%2),%3"
>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>> +                               : "r" (ptr), "r" (incr)
>>>> +                               : MEMORY);
>>>> +               return old_val;
>>>> +#elif defined __x86_64__
>>>> +               /* Generates good code on x86_64 */
>>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>>> +#else
>>>> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
>>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>>> +#endif
>>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>>> +#if defined __OCTEON__
>>>> +               uint32_t old_val;
>>>> +               COMPILER_HW_BARRIER();
>>>> +               __asm __volatile("laa %0,(%2),%3"
>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>> +                               : "r" (ptr), "r" (incr)
>>>> +                               : MEMORY);
>>>> +               COMPILER_HW_BARRIER();
>>>> +               return old_val;
>>>> +#endif
>>>> +               /* __sync_fetch_and_add() will give us barriers before
>>>> and
>>>> +                * after, we are fine with this for release operations
>>>> */
>>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>>> +       } else {
>>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>>> +       }
>>>>  }
>>>>
>>>>  /**
>>>> - * Initialize atomic uint64
>>>> + * Atomic add to 32-bit atomic variable
>>>>   *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param incr  The value to be added to the atomic variable
>>>> + * @param memmodel Memory model associated with the add
>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>>   */
>>>> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
>>>> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
>>>> +               uint32_t incr,
>>>> +               odp_memorder_t mmodel)
>>>>  {
>>>> -       *ptr = 0;
>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>> +               /* Platforms that support atomic add instructions can
>>>> add
>>>> +                * their implementations here */
>>>> +#if defined __OCTEON__
>>>> +               __asm __volatile("saa %[inc], (%[base])"
>>>> +                               : "+m" (*ptr)
>>>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>>>> +                               : MEMORY);
>>>> +               return;
>>>> +#endif
>>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>>> +               /* Platforms that support atomic add instructions can
>>>> add
>>>> +                * their implementations here */
>>>> +#if defined __OCTEON__
>>>> +               COMPILER_HW_BARRIER();
>>>> +               __asm __volatile("saa %[inc], (%[base])"
>>>> +                               : "+m" (*ptr)
>>>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>>>> +                               : MEMORY);
>>>> +               COMPILER_HW_BARRIER();
>>>> +               return;
>>>> +#endif
>>>> +       }
>>>> +       /* Default to using odp_atomic32_fetch_add() */
>>>> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>>>>  }
>>>>
>>>>  /**
>>>> - * Load value of atomic uint64
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> + * Atomic fetch and increment of 32-bit atomic variable
>>>>   *
>>>> - * @return atomic uint64 value
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param memmodel Memory model associated with the increment
>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>>   *
>>>> - * @note The operation is not synchronized with other threads
>>>> + * @return Value of the atomic variable before the increment
>>>>   */
>>>> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
>>>> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
>>>> +               odp_memorder_t mmodel)
>>>>  {
>>>> -       return *ptr;
>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>> +               /* Platforms that support atomic increment instructions
>>>> can add
>>>> +                * their implementations here */
>>>> +#if defined __OCTEON__
>>>> +               uint32_t old_val;
>>>> +               __asm __volatile("lai %0,(%2)"
>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>> +                               : "r" (ptr)
>>>> +                               : MEMORY);
>>>> +               return old_val;
>>>> +#endif
>>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>>> +#if defined __OCTEON__
>>>> +               uint32_t old_val;
>>>> +               COMPILER_HW_BARRIER();
>>>> +               __asm __volatile("lai %0,(%2)"
>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>> +                               : "r" (ptr)
>>>> +                               : MEMORY);
>>>> +               COMPILER_HW_BARRIER();
>>>> +               return old_val;
>>>> +#endif
>>>> +       }
>>>> +       /* Default to using odp_atomic32_fetch_add() */
>>>> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>>>>  }
>>>>
>>>>  /**
>>>> - * Store value to atomic uint64
>>>> - *
>>>> - * @param ptr        An atomic variable
>>>> - * @param new_value  Store new_value to a variable
>>>> + * Atomic increment of 32-bit atomic variable
>>>>   *
>>>> - * @note The operation is not synchronized with other threads
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param memmodel Memory model associated with the increment
>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>>   */
>>>> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
>>>> -                                       uint64_t new_value)
>>>> -{
>>>> -       *ptr = new_value;
>>>> -}
>>>> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
>>>> +               odp_memorder_t mmodel)
>>>>
>>>> -/**
>>>> - * Add atomic uint64
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be added to the variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
>>>> value)
>>>>  {
>>>> -       __sync_fetch_and_add(ptr, value);
>>>> +       /* Default to using odp_atomic32_fetch_inc() */
>>>> +       /* Platforms that support atomic increment instructions can add
>>>> +        * their implementations here */
>>>> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>>>>  }
>>>>
>>>>  /**
>>>> - * Fetch and add atomic uint64
>>>> + * Atomic fetch and decrement of 32-bit atomic variable
>>>>   *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be added to the variable
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param memmodel Memory model associated with the decrement
>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>>   *
>>>> - * @return Value of the variable before the operation
>>>> + * @return Value of the atomic variable before the decrement
>>>>   */
>>>> -
>>>> -#if defined __powerpc__ && !defined __powerpc64__
>>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>>> -                                               uint64_t value)
>>>> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
>>>> +               odp_memorder_t mmodel)
>>>>  {
>>>> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
>>>> -                                   (uint32_t)value);
>>>> -}
>>>> -#else
>>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>>> -                                               uint64_t value)
>>>> -{
>>>> -       return __sync_fetch_and_add(ptr, value);
>>>> -}
>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>> +               /* Platforms that support atomic decrement instructions
>>>> can add
>>>> +                * their implementations here */
>>>> +#if defined __OCTEON__
>>>> +               uint32_t old_val;
>>>> +               __asm __volatile("lad %0,(%2)"
>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>> +                               : "r" (ptr)
>>>> +                               : MEMORY);
>>>> +               return old_val;
>>>>  #endif
>>>> -/**
>>>> - * Subtract atomic uint64
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be subtracted from the variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
>>>> value)
>>>> -{
>>>> -       __sync_fetch_and_sub(ptr, value);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and subtract atomic uint64
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be subtracted from the variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -#if defined __powerpc__ && !defined __powerpc64__
>>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>>> -                                               uint64_t value)
>>>> -{
>>>> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
>>>> -                                   (uint32_t)value);
>>>> -}
>>>> -#else
>>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>>> -                                               uint64_t value)
>>>> -{
>>>> -       return __sync_fetch_and_sub(ptr, value);
>>>> -}
>>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>>> +#if defined __OCTEON__
>>>> +               uint32_t old_val;
>>>> +               COMPILER_HW_BARRIER();
>>>> +               __asm __volatile("lad %0,(%2)"
>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>> +                               : "r" (ptr)
>>>> +                               : MEMORY);
>>>> +               COMPILER_HW_BARRIER();
>>>> +               return old_val;
>>>>  #endif
>>>> -/**
>>>> - * Fetch and increment atomic uint64 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_add_u64(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Increment atomic uint64 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
>>>> -{
>>>> -       odp_atomic_fetch_add_u64(ptr, 1);
>>>> +       }
>>>> +       /* Default to using odp_atomic32_fetch_add() */
>>>> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>>>>  }
>>>>
>>>>  /**
>>>> - * Fetch and decrement atomic uint64 by 1
>>>> + * Atomic decrement of 32-bit atomic variable
>>>>   *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param memmodel Memory model associated with the decrement
>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>>   */
>>>> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>>>> -}
>>>> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
>>>> +               odp_memorder_t memorder)
>>>>
>>>> -/**
>>>> - * Decrement atomic uint64 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>>>>  {
>>>> -       odp_atomic_fetch_sub_u64(ptr, 1);
>>>> +       /* Default to using odp_atomic32_fetch_dec() */
>>>> +       /* Platforms that support atomic decrement instructions can add
>>>> +        * their implementations here */
>>>> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>>>>  }
>>>>
>>>> -/**
>>>> - * Atomic compare and set for 64bit
>>>> - *
>>>> - * @param dst destination location into which the value will be
>>>> written.
>>>> - * @param exp expected value.
>>>> - * @param src new value.
>>>> - * @return Non-zero on success; 0 on failure.
>>>> - */
>>>> -static inline int
>>>> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t
>>>> src)
>>>> -{
>>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>>> -}
>>>> +/* We are not exporting this macro */
>>>> +#undef COMPILER_HW_BARRIER
>>>> +#undef MEMORY
>>>>
>>>>  #ifdef __cplusplus
>>>>  }
>>>> diff --git a/platform/linux-generic/include/api/odp_barrier.h
>>>> b/platform/linux-generic/include/api/odp_barrier.h
>>>> index a7b3215..69b1eb8 100644
>>>> --- a/platform/linux-generic/include/api/odp_barrier.h
>>>> +++ b/platform/linux-generic/include/api/odp_barrier.h
>>>> @@ -27,18 +27,18 @@ extern "C" {
>>>>   * ODP execution barrier
>>>>   */
>>>>  typedef struct odp_barrier_t {
>>>> -       int              count;  /**< @private Thread count */
>>>> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
>>>> +       uint32_t       num_threads;  /**< @private Thread count
>>>> (constant) */
>>>> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>>>>  } odp_barrier_t;
>>>>
>>>>
>>>>  /**
>>>>   * Init barrier with thread count
>>>>   *
>>>> - * @param barrier    Barrier
>>>> - * @param count      Thread count
>>>> + * @param barrier     Barrier
>>>> + * @param num_threads Number of threads which share the barrier
>>>>   */
>>>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
>>>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>>>>
>>>>
>>>>  /**
>>>> diff --git a/platform/linux-generic/include/api/odp_counter.h
>>>> b/platform/linux-generic/include/api/odp_counter.h
>>>> new file mode 100644
>>>>
>>>> index 0000000..f937d27
>>>>
>>>> --- /dev/null
>>>> +++ b/platform/linux-generic/include/api/odp_counter.h
>>>> @@ -0,0 +1,363 @@
>>>> +/* Copyright (c) 2013, Linaro Limited
>>>> + * All rights reserved.
>>>> + *
>>>> + * SPDX-License-Identifier:     BSD-3-Clause
>>>> + */
>>>> +
>>>> +/**
>>>> + * @file
>>>> + *
>>>> + * ODP atomic counter types and operations, suitable for e.g. shared
>>>> statistics.
>>>> + * Relaxed memory model assumed for lowest overhead.
>>>> + * Scalar variable wrapped in a struct to avoid accessing scalar
>>>> directly
>>>> + * without using the required access functions.
>>>> + * Counter functions must be used to operate on counter variables!
>>>> + */
>>>> +
>>>> +#ifndef ODP_COUNTER_H_
>>>> +#define ODP_COUNTER_H_
>>>> +
>>>> +#include <stdint.h>
>>>> +#include <odp_align.h>
>>>> +#include <odp_hints.h>
>>>> +
>>>> +#ifdef __cplusplus
>>>> +extern "C" {
>>>> +#endif
>>>> +
>>>> +/**
>>>> + * 32-bit (unsigned) atomic counter type
>>>> + */
>>>> +typedef struct {
>>>> +       uint32_t v; /**< Actual storage for the counter variable */
>>>> +} odp_counter32_t
>>>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>>> +
>>>> +/**
>>>> + * 64-bit (unsigned) atomic counter type
>>>> + */
>>>> +typedef struct {
>>>> +       uint64_t v; /**< Actual storage for the counter variable */
>>>> +       /* Room for other data structures (e.g. spin lock) that might be
>>>> +        * needed to ensure atomicity on some architectures */
>>>> +} odp_counter64_t
>>>> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
>>>> +
>>>>
>>>> +/*****************************************************************************
>>>> + * Operations on 32-bit atomic counters
>>>> + * odp_counter32_init - returns no value
>>>> + * odp_counter32_read - returns current value
>>>> + * odp_counter32_write - returns no value
>>>> + * odp_counter32_add - returns no value
>>>> + * odp_counter32_read_inc - returns old value
>>>> + * odp_counter32_inc - returns no value
>>>> +
>>>> *****************************************************************************/
>>>> +
>>>> +/**
>>>> + * Initialize 32-bit counter variable
>>>> + *
>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>> + * @param val   Initial value
>>>> + */
>>>> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t
>>>> val)
>>>> +{
>>>> +       /* No implementation requires any other type of initialization
>>>> */
>>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>>> +}
>>>> +
>>>> +/**
>>>> + * Read 32-bit counter variable
>>>> + *
>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>> + *
>>>> + * @return Value of the variable
>>>> + */
>>>> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
>>>> +{
>>>> +       uint32_t val;
>>>> +       /* Read of aligned word is atomic */
>>>> +       /* Cast to volatile to force compiler to (re-) read variable,
>>>> thus we
>>>> +        * will avoid using compiler memory barriers */
>>>> +       val = *(__volatile const uint32_t *)&ptr->v;
>>>> +       return val;
>>>> +}
>>>> +
>>>> +/**
>>>> + * Write 32-bit counter variable
>>>> + *
>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>> + * @param val   Value to write to the variable
>>>> + */
>>>> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t
>>>> val)
>>>> +{
>>>> +       /* Write of aligned word is atomic */
>>>> +       /* Cast to volatile to force compiler to (re-) write variable,
>>>> thus we
>>>> +        * will avoid using compiler memory barriers */
>>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>>> +}
>>>> +
>>>> +/**
>>>> + * Atomic add to 32-bit counter variable
>>>> + *
>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>> + * @param incr  The value to be added to the counter variable
>>>> + */
>>>> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t
>>>> incr)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint32_t result;
>>>> +       int status;
>>>> +       do {
>>>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>>>> +                                "add   %0, %0, %3\t\n"
>>>> +                                "strex %1, %0, [%2]"
>>>> +                                : "=&r"(result), "=&r"(status)
>>>> +                                : "r"(&ptr->v), "Ir" (incr)
>>>> +                                : );
>>>> +       } while (odp_unlikely(status != 0));
>>>> +#elif defined __OCTEON__
>>>> +       __asm __volatile("saa %[inc], (%[base])"
>>>> +                        : "+m" (*ptr)
>>>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>>>> +                        : );
>>>> +#elif defined __x86_64__
>>>> +       /* Generates good code on x86_64 */
>>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>>> +#else
>>>> +       /* Warning odp_counter32_add() may not be efficiently
>>>> implemented */
>>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>>> +#endif
>>>> +}
>>>> +
>>>> +/**
>>>> + * Atomic increment (+1) of 32-bit counter variable, return original
>>>> value
>>>> + *
>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>> + *
>>>> + * @return Original value of counter
>>>> + */
>>>> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint32_t result, tmp;
>>>> +       int status;
>>>> +       do {
>>>> +               __asm __volatile("ldrex %0, [%3]\t\n"
>>>> +                                "add   %1, %0, #1\t\n"
>>>> +                                "strex %2, %1, [%3]"
>>>>
>>>> +                                : "=&r"(result), "=&r"(tmp),
>>>> "=&r"(status)
>>>>
>>>> +                                : "r"(&ptr->v)
>>>> +                                : );
>>>> +       } while (odp_unlikely(status != 0));
>>>> +       return result;
>>>> +#elif defined __OCTEON__
>>>> +       uint32_t old_val;
>>>> +       __asm __volatile("lai %0,(%2)"
>>>> +                        : "=r" (old_val), "+m" (ptr)
>>>> +                        : "r" (ptr)
>>>> +                        : );
>>>> +       return old_val;
>>>> +#elif defined __x86_64__
>>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>>> +#else
>>>> +/* Warning odp_counter32_read_inc() may not be efficiently implemented
>>>> */
>>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>>> +#endif
>>>> +}
>>>> +
>>>> +/**
>>>> + * Atomic increment (+1) 32-bit counter variable
>>>> + *
>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>> + */
>>>> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
>>>> +{
>>>> +#if defined __OCTEON__
>>>> +       odp_counter32_add(ptr, 1);
>>>> +#else
>>>> +       (void)odp_counter32_read_inc(ptr);
>>>> +#endif
>>>> +}
>>>> +
>>>>
>>>> +/*****************************************************************************
>>>> + * Operations on 64-bit atomic counters
>>>> + * odp_counter64_init
>>>> + * odp_counter64_read
>>>> + * odp_counter64_write
>>>> + * odp_counter64_add
>>>> + * odp_counter64_read_inc
>>>> + * odp_counter64_inc
>>>> +
>>>> *****************************************************************************/
>>>> +
>>>> +/**
>>>> + * Read 64-bit counter variable
>>>> + *
>>>> + * @param ptr   Pointer to a 64-bit counter variable
>>>> + *
>>>> + * @return Value of the counter variable
>>>> + */
>>>> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint64_t val;
>>>> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
>>>> +                        "clrex" /* Clear exclusive access monitor */
>>>> +                        : "=&r"(val)
>>>> +                        : "r"(&ptr->v)
>>>> +                        : );
>>>> +       return val;
>>>> +#elif defined __x86_64__ || defined __aarch64__
>>>> +       /* Read of aligned quad/double word is atomic */
>>>> +       return ptr->v;
>>>> +#else
>>>> +/* Warning odp_counter64_read() may not be efficiently implemented */
>>>> +       return __sync_fetch_and_or(&ptr->v, 0);
>>>> +#endif
>>>> +}
>>>> +
>>>> +/**
>>>> + * Write 64-bit counter variable
>>>> + *
>>>> + * @param ptr  Pointer to a 64-bit counter variable
>>>> + * @param val  Value to write to the counter variable
>>>> + */
>>>> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t
>>>> val)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint64_t old_val;
>>>> +       int status;
>>>> +       do {
>>>> +               /* Read counter variable exclusively so we can write to
>>>> it
>>>> +                * later */
>>>> +               /* Attempt to write the new value */
>>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>>> +                                "strexd %1, %3, %H3, [%2]"
>>>> +                                : "=&r"(old_val), "=&r"(status)
>>>> +                                : "r"(&ptr->v), "r"(val)
>>>> +                                : );
>>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>>> succeeds */
>>>> +#elif defined __x86_64__ || defined __aarch64__
>>>> +       /* Write of aligned quad/double word is atomic */
>>>> +       ptr->v = val;
>>>> +#else
>>>> +/* Warning odp_counter64_write() may not be efficiently implemented */
>>>> +       /* This is actually an atomic exchange operation */
>>>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>>> +#endif
>>>> +}
>>>> +
>>>> +/**
>>>> + * Initialize 64-bit counter variable
>>>> + * Perform implementation specific initializations, assign initial
>>>> value.
>>>> + *
>>>> + * @param ptr   Pointer to a 64-bit counter variable
>>>> + * @param val   Initial value
>>>> + */
>>>> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t
>>>> val)
>>>> +{
>>>> +       /* No implementation requires any other type of initialization
>>>> */
>>>> +       odp_counter64_write(ptr, val);
>>>> +}
>>>> +
>>>> +/**
>>>> + * Atomic add to 64-bit counter variable
>>>> + *
>>>> + * @param ptr   Pointer to a 64-bit counter variable
>>>> + * @param incr  The value to be added to the counter variable
>>>> + */
>>>> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t
>>>> incr)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint64_t old_val;
>>>> +       int status;
>>>> +       do {
>>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>>> +                                "adds   %0, %0, %3\t\n"
>>>> +                                "adc    %H0, %H3\t\n"
>>>> +                                "strexd %1, %0, %H0, [%2]"
>>>> +                                : "=&r"(old_val), "=&r"(status)
>>>> +                                : "r"(&ptr->v), "r"(incr)
>>>> +                                : );
>>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>>> succeeds */
>>>> +#elif defined __OCTEON__
>>>> +       __asm __volatile("saad %[inc], (%[base])"
>>>> +                        : "+m" (*ptr)
>>>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>>>> +                        : );
>>>> +#elif defined __x86_64__
>>>> +       /* Generates good code on x86_64 */
>>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>>> +#else
>>>> +/* Warning odp_counter64_add() may not be efficiently implemented */
>>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>>> +#endif
>>>> +}
>>>> +
>>>> +
>>>> +/**
>>>> + * Atomic increment (+1) 64-bit counter variable and return original
>>>> value
>>>> + *
>>>> + * @param ptr   Pointer to a 64-bit counter variable
>>>> + *
>>>> + * @return Original value of counter
>>>> + */
>>>> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint64_t old_val, tmp;
>>>> +       int status;
>>>> +       do {
>>>> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
>>>> +                                "adds   %2, %0, #1\t\n"
>>>> +                                "adc    %H2, %H0, #0\t\n"
>>>> +                                "strexd %1, %2, %H2, [%3]"
>>>> +                                : "=&r"(old_val), "=&r"(status),
>>>> "=&r"(tmp)
>>>> +                                : "r"(&ptr->v)
>>>> +                                : );
>>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>>> succeeds */
>>>> +       return old_val;
>>>> +#elif defined __OCTEON__
>>>> +       uint64_t old_val;
>>>> +       __asm __volatile("laid %0,(%2)"
>>>> +                       : "=r" (old_val), "+m" (ptr)
>>>> +                       : "r" (ptr)
>>>> +                       : );
>>>> +       return old_val;
>>>> +#elif defined __x86_64__
>>>> +       /* Generates good code on x86_64 */
>>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>>> +#else
>>>> +/* Warning odp_counter64_read_inc() may not be efficiently implemented
>>>> */
>>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>>> +#endif
>>>> +}
>>>> +
>>>> +/**
>>>> + * Atomic increment (+1) 64-bit counter variable
>>>> + *
>>>> + * @param ptr   Pointer to a 64-bit counter variable
>>>> + */
>>>> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint64_t old_val;
>>>> +       int status;
>>>> +       do {
>>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>>> +                                "adds   %0, #1\t\n"
>>>> +                                "adc    %H0, #0\t\n"
>>>> +                                "strexd %1, %0, %H0, [%2]"
>>>> +                                : "=&r"(old_val), "=&r"(status)
>>>> +                                : "r"(&ptr->v)
>>>> +                                : );
>>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>>> succeeds */
>>>> +#else
>>>> +       (void)odp_counter64_read_inc(ptr);
>>>> +#endif
>>>> +}
>>>> +
>>>> +#ifdef __cplusplus
>>>> +}
>>>> +#endif
>>>> +
>>>> +#endif
>>>> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
>>>> b/platform/linux-generic/include/api/odp_rwlock.h
>>>> index 252ebb2..ff8a9a2 100644
>>>> --- a/platform/linux-generic/include/api/odp_rwlock.h
>>>> +++ b/platform/linux-generic/include/api/odp_rwlock.h
>>>> @@ -10,26 +10,30 @@
>>>>  /**
>>>>   * @file
>>>>   *
>>>> - * ODP RW Locks
>>>> + * ODP read/write lock
>>>> + * RW lock support mu
>>>> ...
>>>>
>>>> [Message clipped]
>>>
>>>
>>>
>>>
>>>
>
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/lng-odp
>
>
Bill Fischofer Nov. 4, 2014, 4:29 p.m. UTC | #10
So rather than folks spending the time to review the current patch Ola can
spend a lot of time to break it up and then people can start looking at
it?  That doesn't seem to be a very efficient way of working this.  We're
at a point now where some APIs are being replaced rather than incrementally
patched.  It's easier to do that as an (ahem) atomic function rather than
breaking things into multiple patches that are all mutually interdependent.


Multiple patches make sense if things are truly orthogonal.  But that's not
always the case and some patches will be large.

Bill

On Tue, Nov 4, 2014 at 9:58 AM, Mike Holmes <mike.holmes@linaro.org> wrote:

> Generally we fix one problem per patch; here you solve several things at
> once, making it hard to think about the implications. Can these be separated?
>
> Added header file odp_counter.h with support for 32- and 64-bit atomic
> counters
> using relaxed memory order. 6 operations (init/read/write/add/read_inc/inc)
> on
> 32-bit and 64-bit counters respectively.
> Renamed odp_atomic_test to odp_counter_test and changed to use
> odp_counter.h
>
> Implementation of C11-based memory model for atomic operations. 10
> operations
> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
> in
> odp_atomic.h. The required memory ordering is now a parameter to each call
> just
> like in C11.
>
> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
> fall back to GCC __sync builtins which often include unnecessarily heavy
> barrier/sync operations (always sequentially consistent).
>
> Attempt to remove all explicit memory barriers (odp_sync_stores) from code
> that
> implements multithreaded synchronization primitives (e.g. locks, barriers).
> Rewrote such primitives to use the new atomic operations.
>
> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
> enqueue/dequeue
>
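> To make the ordering parameter concrete, a minimal ticketlock built only on
> the odp_atomic32_* signatures quoted above could look like the sketch below
> (the my_ticketlock_* names are hypothetical; this is not the patch's own
> odp_ticketlock implementation):
>
>     #include <odp_atomic.h>
>
>     typedef struct {
>             odp_atomic32_t next_ticket; /* odp_atomic32_init(ptr, 0) before use */
>             odp_atomic32_t cur_ticket;  /* odp_atomic32_init(ptr, 0) before use */
>     } my_ticketlock_t;
>
>     static void my_ticketlock_lock(my_ticketlock_t *lock)
>     {
>             /* Taking a ticket needs no ordering of surrounding accesses */
>             uint32_t t = odp_atomic32_fetch_inc(&lock->next_ticket,
>                                                 ODP_MEMORDER_RLX);
>             /* Load-acquire: the critical section cannot move above this */
>             while (odp_atomic32_load(&lock->cur_ticket,
>                                      ODP_MEMORDER_ACQ) != t)
>                     ; /* spin */
>     }
>
>     static void my_ticketlock_unlock(my_ticketlock_t *lock)
>     {
>             /* Only the lock owner writes cur_ticket, so a relaxed read is fine */
>             uint32_t t = odp_atomic32_load(&lock->cur_ticket, ODP_MEMORDER_RLX);
>             /* Store-release: the critical section cannot move below this */
>             odp_atomic32_store(&lock->cur_ticket, t + 1, ODP_MEMORDER_RLS);
>     }
>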
> On 4 November 2014 10:06, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>
>> And what should be in each patch?
>>
>> On 4 November 2014 16:03, Anders Roxell <anders.roxell@linaro.org> wrote:
>>
>>> As Petri wrote in his first email, this patch should be broken up into
>>> multiple patches...
>>>
>>> Cheers,
>>> Anders
>>> On 4 Nov 2014 15:34, "Ola Liljedahl" <ola.liljedahl@linaro.org> wrote:
>>>
>>>> Possibly odp_atomic.h should then be internal, leaving odp_counter.h as
>>>> the only public API. The original odp_atomic.h is public, so I left it
>>>> that way.
>>>>
>>>> The counter API does not allow the user to specify any memory ordering;
>>>> relaxed memory order is assumed, i.e. no ordering is guaranteed.
>>>>
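>>>> For example, shared packet statistics need nothing more than the sketch
>>>> below, using only the odp_counter.h functions quoted earlier in the thread
>>>> (the stats_* helpers and burst_len are hypothetical):
>>>>
>>>>     #include <odp_counter.h>
>>>>
>>>>     static struct {
>>>>             odp_counter64_t ip;
>>>>             odp_counter64_t udp;
>>>>     } stats;
>>>>
>>>>     static void stats_init(void)
>>>>     {
>>>>             odp_counter64_init(&stats.ip, 0);
>>>>             odp_counter64_init(&stats.udp, 0);
>>>>     }
>>>>
>>>>     /* Called from any worker thread; no ordering is implied */
>>>>     static void stats_account(int is_udp, uint64_t burst_len)
>>>>     {
>>>>             odp_counter64_inc(&stats.ip);
>>>>             if (is_udp)
>>>>                     odp_counter64_add(&stats.udp, burst_len);
>>>>     }
>>>>
>>>>     /* A reader sees some recent value; nothing more is promised */
>>>>     static uint64_t stats_read_ip(void)
>>>>     {
>>>>             return odp_counter64_read(&stats.ip);
>>>>     }
>>>>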
>>>> Why does acquire/release not fit well with the far atomics? And what do
>>>> you mean specifically with "far atomics"? Just the counter updates like
>>>> Cavium has?
>>>>
>>>> As the Linux kernel atomics interface predates C11/C++11 atomics support,
>>>> I do not see it as a model to follow.
>>>>
>>>> The patch summary contained a brief description of what I wanted to
>>>> achieve with the patch. What more do you want, a Google Docs design
>>>> document?
>>>>
>>>> -- Ola
>>>>
>>>> On 4 November 2014 15:22, Savolainen, Petri (NSN - FI/Espoo) <
>>>> petri.savolainen@nsn.com> wrote:
>>>>
>>>>>  There are many things I’d change in this patch. I think it’s better
>>>>> to take a step back and discuss what you are trying to achieve here, and
>>>>> then correct those things step by step. E.g. the whole idea of
>>>>> acquire/release does not fit far atomics well, and far atomics are the
>>>>> thing I’d abstract from applications with this API. Other synchronization
>>>>> primitives (such as locks) would not (too often) be implemented by
>>>>> applications, so it’s not very productive to abstract that (the
>>>>> implementation of locks). E.g. Linux kernel atomics.h looks pretty much
>>>>> like odp_atomic.h.
>>>>>
>>>>>
>>>>>
>>>>> -Petri
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> *From:* lng-odp-bounces@lists.linaro.org [mailto:
>>>>> lng-odp-bounces@lists.linaro.org] *On Behalf Of *ext Ola Liljedahl
>>>>> *Sent:* Tuesday, November 04, 2014 3:49 PM
>>>>> *To:* lng-odp@lists.linaro.org
>>>>> *Subject:* Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11
>>>>> memory model
>>>>>
>>>>>
>>>>>
>>>>> Ping!
>>>>>
>>>>>
>>>>>
>>>>> I really need this new working atomics support merged ASAP because I
>>>>> have a new lock-less implementation of the timer API which uses atomic
>>>>> operations. I haven't seen any real criticism against the content of the
>>>>> patch so there is nothing to change.
>>>>>
>>>>>
>>>>>
>>>>> -- Ola
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> On 20 October 2014 15:07, Ola Liljedahl <ola.liljedahl@linaro.org>
>>>>> wrote:
>>>>>
>>>>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>>>>> ---
>>>>> Added header file odp_counter.h with support for 32- and 64-bit atomic
>>>>> counters
>>>>> using relaxed memory order. 6 operations
>>>>> (init/read/write/add/read_inc/inc) on
>>>>> 32-bit and 64-bit counters respectively.
>>>>>
>>>>> Renamed odp_atomic_test to odp_counter_test and changed to use
>>>>> odp_counter.h
>>>>>
>>>>> Implementation of C11-based memory model for atomic operations. 10
>>>>> operations
>>>>> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
>>>>> in
>>>>> odp_atomic.h. The required memory ordering is now a parameter to each
>>>>> call just
>>>>> like in C11.
>>>>>
>>>>> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures
>>>>> will
>>>>> fall back to GCC __sync builtins which often include unnecessarily
>>>>> heavy
>>>>> barrier/sync operations (always sequentially consistent).
>>>>>
>>>>> Attempt to remove all explicit memory barriers (odp_sync_stores) from
>>>>> code that
>>>>> implements multithreaded synchronization primitives (e.g. locks,
>>>>> barriers).
>>>>> Rewrote such primitives to use the new atomic operations.
>>>>>
>>>>> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of
>>>>> counter),
>>>>> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
>>>>> enqueue/dequeue
>>>>> (missing release barrier, had only compiler barrier).
>>>>>
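(To make the ticketlock fix concrete: acquiring the lock must end with an
acquire load, so the critical section cannot start executing before the lock
is actually held. A rough sketch of that pattern; 'next_ticket' and
'cur_ticket' are illustrative names rather than the real odp_ticketlock_t
fields:)

#include <odp_atomic.h>

typedef struct {
        odp_atomic32_t next_ticket;
        odp_atomic32_t cur_ticket;
} my_ticketlock_t;

static void my_ticketlock_lock(my_ticketlock_t *tkl)
{
        /* Taking a ticket needs no ordering of other accesses */
        uint32_t ticket = odp_atomic32_fetch_inc(&tkl->next_ticket,
                                                 ODP_MEMORDER_RLX);
        /* The ACQ load is the barrier that was missing: it keeps accesses to
         * the protected data from moving above the point where the lock is
         * actually acquired */
        while (odp_atomic32_load(&tkl->cur_ticket, ODP_MEMORDER_ACQ) != ticket)
                ;       /* spin */
}
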
>>>>>  .gitignore                                         |   2 +-
>>>>>  example/generator/odp_generator.c                  |  43 +-
>>>>>  example/ipsec/odp_ipsec.c                          |   2 +-
>>>>>  example/odp_example/odp_example.c                  |   2 +-
>>>>>  example/timer/odp_timer_test.c                     |   2 +-
>>>>>  helper/include/odph_ring.h                         |   8 +-
>>>>>  platform/linux-generic/include/api/odp.h           |   1 +
>>>>>  platform/linux-generic/include/api/odp_atomic.h    | 838
>>>>> +++++++++++----------
>>>>>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>>>>>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>>>>>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>>>>>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>>>>>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>>>>>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>>>>>  platform/linux-generic/odp_barrier.c               |  49 +-
>>>>>  platform/linux-generic/odp_buffer.c                |   3 +-
>>>>>  platform/linux-generic/odp_crypto.c                |   7 +-
>>>>>  platform/linux-generic/odp_queue.c                 |   7 +-
>>>>>  platform/linux-generic/odp_ring.c                  |  94 +--
>>>>>  platform/linux-generic/odp_rwlock.c                |  62 +-
>>>>>  platform/linux-generic/odp_thread.c                |   9 +-
>>>>>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>>>>>  platform/linux-generic/odp_timer.c                 |  22 +-
>>>>>  test/api_test/Makefile.am                          |   6 +-
>>>>>  test/api_test/odp_atomic_test.c                    | 362 ---------
>>>>>  test/api_test/odp_atomic_test.h                    |  60 --
>>>>>  test/api_test/odp_common.c                         |   1 -
>>>>>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>>>>>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>>>>>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>>>>>  delete mode 100644 test/api_test/odp_atomic_test.c
>>>>>  delete mode 100644 test/api_test/odp_atomic_test.h
>>>>>  create mode 100644 test/api_test/odp_counter_test.c
>>>>>
>>>>> diff --git a/.gitignore b/.gitignore
>>>>> index 6342e34..77db4d6 100644
>>>>> --- a/.gitignore
>>>>> +++ b/.gitignore
>>>>> @@ -35,7 +35,7 @@ build/
>>>>>  odp_example
>>>>>  odp_packet
>>>>>  odp_packet_netmap
>>>>> -odp_atomic
>>>>> +odp_counter
>>>>>  odp_shm
>>>>>  odp_ring
>>>>>  odp_timer_ping
>>>>> diff --git a/example/generator/odp_generator.c
>>>>> b/example/generator/odp_generator.c
>>>>> index eb8b340..252157d 100644
>>>>> --- a/example/generator/odp_generator.c
>>>>> +++ b/example/generator/odp_generator.c
>>>>> @@ -62,10 +62,10 @@ typedef struct {
>>>>>   * counters
>>>>>  */
>>>>>  static struct {
>>>>> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
>>>>> -       odp_atomic_u64_t ip;    /**< ip packets */
>>>>> -       odp_atomic_u64_t udp;   /**< udp packets */
>>>>> -       odp_atomic_u64_t icmp;  /**< icmp packets */
>>>>> +       odp_counter64_t seq;    /**< ip seq to be send */
>>>>> +       odp_counter64_t ip;     /**< ip packets */
>>>>> +       odp_counter64_t udp;    /**< udp packets */
>>>>> +       odp_counter64_t icmp;   /**< icmp packets */
>>>>>  } counters;
>>>>>
>>>>>  /** * Thread specific arguments
>>>>> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>>>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>>>>> ODPH_UDPHDR_LEN +
>>>>>                                        ODPH_IPV4HDR_LEN);
>>>>>         ip->proto = ODPH_IPPROTO_UDP;
>>>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
>>>>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>>>>>         ip->id = odp_cpu_to_be_16(seq);
>>>>>         ip->chksum = 0;
>>>>>         odph_ipv4_csum_update(pkt);
>>>>> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>>>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>>>>> ODPH_ICMPHDR_LEN +
>>>>>                                        ODPH_IPV4HDR_LEN);
>>>>>         ip->proto = ODPH_IPPROTO_ICMP;
>>>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
>>>>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>>>>>         ip->id = odp_cpu_to_be_16(seq);
>>>>>         ip->chksum = 0;
>>>>>         odph_ipv4_csum_update(pkt);
>>>>> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>>>>>                 }
>>>>>
>>>>>                 if (args->appl.interval != 0) {
>>>>> +                       uint64_t seq =
>>>>> odp_counter64_read(&counters.seq);
>>>>>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
>>>>> -                              thr, counters.seq, counters.seq%0xffff);
>>>>> +                              thr, seq, seq%0xffff);
>>>>>                         /* TODO use odp timer */
>>>>>                         usleep(args->appl.interval * 1000);
>>>>>                 }
>>>>> -               if (args->appl.number != -1 && counters.seq
>>>>> -                   >= (unsigned int)args->appl.number) {
>>>>> +               if (args->appl.number != -1 &&
>>>>> +                   odp_counter64_read(&counters.seq) >=
>>>>> +                   (unsigned int)args->appl.number) {
>>>>>                         break;
>>>>>                 }
>>>>>         }
>>>>> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>>>>>         /* receive number of reply pks until timeout */
>>>>>         if (args->appl.mode == APPL_MODE_PING && args->appl.number >
>>>>> 0) {
>>>>>                 while (args->appl.timeout >= 0) {
>>>>> -                       if (counters.icmp >= (unsigned
>>>>> int)args->appl.number)
>>>>> +                       if (odp_counter64_read(&counters.icmp) >=
>>>>> +                           (unsigned int)args->appl.number)
>>>>>                                 break;
>>>>>                         /* TODO use odp timer */
>>>>>                         sleep(1);
>>>>> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>>>>>
>>>>>         /* print info */
>>>>>         if (args->appl.mode == APPL_MODE_UDP) {
>>>>> -               printf("  [%02i] total send: %ju\n", thr,
>>>>> counters.seq);
>>>>> +               printf("  [%02i] total send: %ju\n", thr,
>>>>> +                      odp_counter64_read(&counters.seq));
>>>>>         } else if (args->appl.mode == APPL_MODE_PING) {
>>>>>                 printf("  [%02i] total send: %ju total receive: %ju\n",
>>>>> -                      thr, counters.seq, counters.icmp);
>>>>> +                      thr, odp_counter64_read(&counters.seq),
>>>>> +                      odp_counter64_read(&counters.icmp));
>>>>>         }
>>>>>         return arg;
>>>>>  }
>>>>> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
>>>>> pkt_tbl[], unsigned len)
>>>>>                 if (!odp_packet_inflag_ipv4(pkt))
>>>>>                         continue;
>>>>>
>>>>> -               odp_atomic_inc_u64(&counters.ip);
>>>>> +               odp_counter64_inc(&counters.ip);
>>>>>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>>>>>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>>>>>                 ip = (odph_ipv4hdr_t *)(buf +
>>>>> odp_packet_l3_offset(pkt));
>>>>> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
>>>>> pkt_tbl[], unsigned len)
>>>>>
>>>>>                 /* udp */
>>>>>                 if (ip->proto == ODPH_IPPROTO_UDP) {
>>>>> -                       odp_atomic_inc_u64(&counters.udp);
>>>>> +                       odp_counter64_inc(&counters.udp);
>>>>>                         udp = (odph_udphdr_t *)(buf + offset);
>>>>>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>>>>>                                         odp_be_to_cpu_16(udp->length) -
>>>>> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
>>>>> pkt_tbl[], unsigned len)
>>>>>                         icmp = (odph_icmphdr_t *)(buf + offset);
>>>>>                         /* echo reply */
>>>>>                         if (icmp->type == ICMP_ECHOREPLY) {
>>>>> -                               odp_atomic_inc_u64(&counters.icmp);
>>>>> +                               odp_counter64_inc(&counters.icmp);
>>>>>                                 memcpy(&tvsend, buf + offset +
>>>>> ODPH_ICMPHDR_LEN,
>>>>>                                        sizeof(struct timeval));
>>>>>                                 /* TODO This should be changed to use
>>>>> an
>>>>> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>>>>>         }
>>>>>
>>>>>         /* init counters */
>>>>> -       odp_atomic_init_u64(&counters.seq);
>>>>> -       odp_atomic_init_u64(&counters.ip);
>>>>> -       odp_atomic_init_u64(&counters.udp);
>>>>> -       odp_atomic_init_u64(&counters.icmp);
>>>>> +       odp_counter64_init(&counters.seq, 0);
>>>>> +       odp_counter64_init(&counters.ip, 0);
>>>>> +       odp_counter64_init(&counters.udp, 0);
>>>>> +       odp_counter64_init(&counters.icmp, 0);
>>>>>
>>>>>         /* Reserve memory for args from shared mem */
>>>>>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
>>>>> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
>>>>> index 2f2dc19..76c27d0 100644
>>>>> --- a/example/ipsec/odp_ipsec.c
>>>>> +++ b/example/ipsec/odp_ipsec.c
>>>>> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>>>>>         printf("Num worker threads: %i\n", num_workers);
>>>>>
>>>>>         /* Create a barrier to synchronize thread startup */
>>>>> -       odp_barrier_init_count(&sync_barrier, num_workers);
>>>>> +       odp_barrier_init(&sync_barrier, num_workers);
>>>>>
>>>>>         /*
>>>>>          * By default core #0 runs Linux kernel background tasks.
>>>>> diff --git a/example/odp_example/odp_example.c
>>>>> b/example/odp_example/odp_example.c
>>>>> index 0e9aa3d..c473395 100644
>>>>> --- a/example/odp_example/odp_example.c
>>>>> +++ b/example/odp_example/odp_example.c
>>>>> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>>>>>         odp_shm_print_all();
>>>>>
>>>>>         /* Barrier to sync test case execution */
>>>>> -       odp_barrier_init_count(&globals->barrier, num_workers);
>>>>> +       odp_barrier_init(&globals->barrier, num_workers);
>>>>>
>>>>>         if (args.proc_mode) {
>>>>>                 int ret;
>>>>> diff --git a/example/timer/odp_timer_test.c
>>>>> b/example/timer/odp_timer_test.c
>>>>> index 78b2ae2..dfbeae9 100644
>>>>> --- a/example/timer/odp_timer_test.c
>>>>> +++ b/example/timer/odp_timer_test.c
>>>>> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>>>>>         printf("\n");
>>>>>
>>>>>         /* Barrier to sync test case execution */
>>>>> -       odp_barrier_init_count(&test_barrier, num_workers);
>>>>> +       odp_barrier_init(&test_barrier, num_workers);
>>>>>
>>>>>         /* Create and launch worker threads */
>>>>>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
>>>>> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
>>>>> index 76c1db8..5e78b34 100644
>>>>> --- a/helper/include/odph_ring.h
>>>>> +++ b/helper/include/odph_ring.h
>>>>> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>>>>>                 uint32_t sp_enqueue;     /* True, if single producer.
>>>>> */
>>>>>                 uint32_t size;           /* Size of ring. */
>>>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>>>> -               uint32_t head;          /* Producer head. */
>>>>> -               uint32_t tail;          /* Producer tail. */
>>>>> +               odp_atomic32_t head;    /* Producer head. */
>>>>> +               odp_atomic32_t tail;    /* Producer tail. */
>>>>>         } prod ODP_ALIGNED_CACHE;
>>>>>
>>>>>         /** @private Consumer */
>>>>> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>>>>>                 uint32_t sc_dequeue;     /* True, if single consumer.
>>>>> */
>>>>>                 uint32_t size;           /* Size of the ring. */
>>>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>>>> -               uint32_t head;          /* Consumer head. */
>>>>> -               uint32_t tail;          /* Consumer tail. */
>>>>> +               odp_atomic32_t head;    /* Consumer head. */
>>>>> +               odp_atomic32_t tail;    /* Consumer tail. */
>>>>>         } cons ODP_ALIGNED_CACHE;
>>>>>
>>>>>         /** @private Memory space of ring starts here. */
>>>>> diff --git a/platform/linux-generic/include/api/odp.h
>>>>> b/platform/linux-generic/include/api/odp.h
>>>>> index 0ee3faf..d124d52 100644
>>>>> --- a/platform/linux-generic/include/api/odp.h
>>>>> +++ b/platform/linux-generic/include/api/odp.h
>>>>> @@ -32,6 +32,7 @@ extern "C" {
>>>>>  #include <odp_barrier.h>
>>>>>  #include <odp_spinlock.h>
>>>>>  #include <odp_atomic.h>
>>>>> +#include <odp_counter.h>
>>>>>
>>>>>  #include <odp_init.h>
>>>>>  #include <odp_system_info.h>
>>>>> diff --git a/platform/linux-generic/include/api/odp_atomic.h
>>>>> b/platform/linux-generic/include/api/odp_atomic.h
>>>>>
>>>>> index 0cc4cf4..ccaad02 100644
>>>>>
>>>>> --- a/platform/linux-generic/include/api/odp_atomic.h
>>>>> +++ b/platform/linux-generic/include/api/odp_atomic.h
>>>>> @@ -4,464 +4,494 @@
>>>>>   * SPDX-License-Identifier:     BSD-3-Clause
>>>>>   */
>>>>>
>>>>> -
>>>>>  /**
>>>>>   * @file
>>>>>   *
>>>>> - * ODP atomic operations
>>>>> + * ODP atomic types and operations, semantically a subset of C11
>>>>> atomics.
>>>>> + * Scalar variable wrapped in a struct to avoid accessing scalar
>>>>> directly
>>>>> + * without using the required access functions.
>>>>> + * Atomic functions must be used to operate on atomic variables!
>>>>>   */
>>>>>
>>>>>  #ifndef ODP_ATOMIC_H_
>>>>>  #define ODP_ATOMIC_H_
>>>>>
>>>>> +#include <stdint.h>
>>>>> +#include <odp_align.h>
>>>>> +#include <odp_hints.h>
>>>>> +#include <odp_debug.h>
>>>>> +
>>>>>  #ifdef __cplusplus
>>>>>  extern "C" {
>>>>>  #endif
>>>>>
>>>>> -
>>>>> -#include <odp_std_types.h>
>>>>> -
>>>>> -
>>>>> -/**
>>>>> - * Atomic integer
>>>>> - */
>>>>> -typedef volatile int32_t odp_atomic_int_t;
>>>>> -
>>>>> -/**
>>>>> - * Atomic unsigned integer 64 bits
>>>>> - */
>>>>> -typedef volatile uint64_t odp_atomic_u64_t;
>>>>> -
>>>>> -/**
>>>>> - * Atomic unsigned integer 32 bits
>>>>> - */
>>>>> -typedef volatile uint32_t odp_atomic_u32_t;
>>>>> -
>>>>> -
>>>>> -/**
>>>>> - * Initialize atomic integer
>>>>> - *
>>>>> - * @param ptr    An integer atomic variable
>>>>> - *
>>>>> - * @note The operation is not synchronized with other threads
>>>>> - */
>>>>> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
>>>>> -{
>>>>> -       *ptr = 0;
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Load value of atomic integer
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - * @return atomic integer value
>>>>> - *
>>>>> - * @note The operation is not synchronized with other threads
>>>>> - */
>>>>> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
>>>>> -{
>>>>> -       return *ptr;
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Store value to atomic integer
>>>>> - *
>>>>> - * @param ptr        An atomic variable
>>>>> - * @param new_value  Store new_value to a variable
>>>>> - *
>>>>> - * @note The operation is not synchronized with other threads
>>>>> - */
>>>>> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
>>>>> new_value)
>>>>> -{
>>>>> -       *ptr = new_value;
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Fetch and add atomic integer
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - * @param value  A value to be added to the variable
>>>>> - *
>>>>> - * @return Value of the variable before the operation
>>>>> - */
>>>>> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
>>>>> value)
>>>>> -{
>>>>> -       return __sync_fetch_and_add(ptr, value);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Fetch and subtract atomic integer
>>>>> - *
>>>>> - * @param ptr    An atomic integer variable
>>>>> - * @param value  A value to be subtracted from the variable
>>>>> - *
>>>>> - * @return Value of the variable before the operation
>>>>> - */
>>>>> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
>>>>> value)
>>>>> -{
>>>>> -       return __sync_fetch_and_sub(ptr, value);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Fetch and increment atomic integer by 1
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - * @return Value of the variable before the operation
>>>>> - */
>>>>> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
>>>>> -{
>>>>> -       return odp_atomic_fetch_add_int(ptr, 1);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Increment atomic integer by 1
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - */
>>>>> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
>>>>> -{
>>>>> -       odp_atomic_fetch_add_int(ptr, 1);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Fetch and decrement atomic integer by 1
>>>>> - *
>>>>> - * @param ptr    An atomic int variable
>>>>> - *
>>>>> - * @return Value of the variable before the operation
>>>>> - */
>>>>> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
>>>>> -{
>>>>> -       return odp_atomic_fetch_sub_int(ptr, 1);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Decrement atomic integer by 1
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - */
>>>>> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
>>>>> -{
>>>>> -       odp_atomic_fetch_sub_int(ptr, 1);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Initialize atomic uint32
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - * @note The operation is not synchronized with other threads
>>>>> - */
>>>>> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
>>>>> -{
>>>>> -       *ptr = 0;
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Load value of atomic uint32
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - * @return atomic uint32 value
>>>>> - *
>>>>> - * @note The operation is not synchronized with other threads
>>>>> - */
>>>>> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
>>>>> -{
>>>>> -       return *ptr;
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Store value to atomic uint32
>>>>> - *
>>>>> - * @param ptr        An atomic variable
>>>>> - * @param new_value  Store new_value to a variable
>>>>> - *
>>>>> - * @note The operation is not synchronized with other threads
>>>>> - */
>>>>> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
>>>>> -                                       uint32_t new_value)
>>>>> -{
>>>>> -       *ptr = new_value;
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Fetch and add atomic uint32
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - * @param value  A value to be added to the variable
>>>>> - *
>>>>> - * @return Value of the variable before the operation
>>>>> - */
>>>>> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>>>>> -                                               uint32_t value)
>>>>> -{
>>>>> -       return __sync_fetch_and_add(ptr, value);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Fetch and subtract uint32
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - * @param value  A value to be sub to the variable
>>>>> - *
>>>>> - * @return Value of the variable before the operation
>>>>> - */
>>>>> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
>>>>> -                                               uint32_t value)
>>>>> -{
>>>>> -       return __sync_fetch_and_sub(ptr, value);
>>>>> -}
>>>>> -
>>>>>  /**
>>>>> - * Fetch and increment atomic uint32 by 1
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - * @return Value of the variable before the operation
>>>>> - */
>>>>> -#if defined __OCTEON__
>>>>> -
>>>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>>>> -{
>>>>> -       uint32_t ret;
>>>>> -
>>>>> -       __asm__ __volatile__ ("syncws");
>>>>> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
>>>>> -                             "r" (ptr));
>>>>> -
>>>>> -       return ret;
>>>>> -}
>>>>> -
>>>>> + * 32-bit (unsigned) atomic type
>>>>> + */
>>>>> +typedef struct {
>>>>> +       uint32_t v; /**< Actual storage for the atomic variable */
>>>>> +} odp_atomic32_t
>>>>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>>>> +
>>>>> +typedef enum {
>>>>> +       /** Relaxed memory order, no ordering of other accesses
>>>>> enforced */
>>>>> +       ODP_MEMORDER_RLX,
>>>>> +       /** Acquire memory order, later accesses cannot move before
>>>>> +        * acquire operation */
>>>>> +       ODP_MEMORDER_ACQ,
>>>>> +       /** Release memory order, earlier accesses cannot move after
>>>>> +        * release operation */
>>>>> +       ODP_MEMORDER_RLS
>>>>> +} odp_memorder_t;
>>>>> +
>>>>>
>>>>> +/*****************************************************************************
>>>>> + * Just some private helpers
>>>>>
>>>>> +*****************************************************************************/
>>>>> +
>>>>> +#ifdef __OCTEON__
>>>>> +/* OCTEON Write Memory Barrier */
>>>>> +#define COMPILER_HW_BARRIER() __asm __volatile( \
>>>>> +       /* Double syncw to work around errata */ \
>>>>> +       "syncw\n\tsyncw" : : : )
>>>>>  #else
>>>>> -
>>>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>>>> -{
>>>>> -       return odp_atomic_fetch_add_u32(ptr, 1);
>>>>> -}
>>>>> -
>>>>> +/** Compiler and hardware full memory barrier */
>>>>> +#define COMPILER_HW_BARRIER() __sync_synchronize()
>>>>> +/* __sync_synchronize() generates the right insn for ARMv6t2 and
>>>>> ARMv7-a */
>>>>>  #endif
>>>>>
>>>>> -/**
>>>>> - * Increment atomic uint32 by 1
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - */
>>>>> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
>>>>> -{
>>>>> -       odp_atomic_fetch_add_u32(ptr, 1);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Fetch and decrement uint32 by 1
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - * @return Value of the variable before the operation
>>>>> - */
>>>>> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
>>>>> -{
>>>>> -       return odp_atomic_fetch_sub_u32(ptr, 1);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Decrement atomic uint32 by 1
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - */
>>>>> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
>>>>> -{
>>>>> -       odp_atomic_fetch_sub_u32(ptr, 1);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Atomic compare and set for 32bit
>>>>> - *
>>>>> - * @param dst destination location into which the value will be
>>>>> written.
>>>>> - * @param exp expected value.
>>>>> - * @param src new value.
>>>>> - * @return Non-zero on success; 0 on failure.
>>>>> - */
>>>>> -static inline int
>>>>> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t
>>>>> src)
>>>>> -{
>>>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>>>> +#define MEMORY "memory"
>>>>> +
>>>>>
>>>>> +/*****************************************************************************
>>>>> + * Operations on 32-bit atomics
>>>>> + * odp_atomic32_init - no return value
>>>>> + * odp_atomic32_load - return current value
>>>>> + * odp_atomic32_store - no return value
>>>>> + * odp_atomic32_cmp_xchg_weak - return bool
>>>>> + * odp_atomic32_fetch_add - return old value
>>>>> + * odp_atomic32_add - no return value
>>>>> + * odp_atomic32_fetch_inc - return old value
>>>>> + * odp_atomic32_inc - no return value
>>>>> + * odp_atomic32_fetch_dec - return old value
>>>>> + * odp_atomic32_dec - no return value
>>>>> +
>>>>> *****************************************************************************/
>>>>> +
>>>>> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t
>>>>> val)
>>>>> +{
>>>>> +       /* Write of aligned word is atomic */
>>>>> +       /* Cast to volatile to force compiler to (re-) write variable,
>>>>> thus we
>>>>> +        * can avoid using compiler memory barriers */
>>>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Atomic load of 32-bit atomic variable
>>>>> + *
>>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>>> + * @param mmodel Memory model associated with the load
>>>>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
>>>>> + *
>>>>> + * @return Value of the variable
>>>>> + */
>>>>> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
>>>>> +               odp_memorder_t mmodel)
>>>>> +{
>>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>>> +               uint32_t val;
>>>>> +               /* Read of aligned word is atomic */
>>>>> +               /* Cast to volatile to force compiler to (re-) read
>>>>> variable,
>>>>> +                * thus we can avoid using compiler memory barriers */
>>>>> +               val = *(__volatile const uint32_t *)&ptr->v;
>>>>> +               return val;
>>>>> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
>>>>> +#if defined __aarch64__
>>>>> +               uint32_t val;
>>>>> +               __asm __volatile("ldar %w0, [%1]"
>>>>> +                               : "=&r"(val)
>>>>> +                               : "r"(&ptr->v)
>>>>> +                               : MEMORY);
>>>>> +               return val;
>>>>> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
>>>>> +               /* Read of aligned word is atomic */
>>>>> +               uint32_t val = ptr->v;
>>>>> +               /* To prevent later accesses from moving up */
>>>>> +               /* Herb Sutter claims HW barrier not needed on x86? */
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +               return val;
>>>>> +#else
>>>>> +#warning odp_atomic32_load() may not be efficiently implemented
>>>>> +               /* Assume read of aligned word is atomic */
>>>>> +               uint32_t val = ptr->v;
>>>>> +               /* To prevent later accesses from moving up */
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +               return val;
>>>>> +#endif
>>>>> +       } else {
>>>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>>>> +       }
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Atomic store to 32-bit atomic variable
>>>>> + *
>>>>> + * @param ptr  Pointer to a 32-bit atomic variable
>>>>> + * @param val  Value to write to the atomic variable
>>>>> + * @param mmodel Memory model associated with the store
>>>>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
>>>>> + */
>>>>> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
>>>>> +               uint32_t val,
>>>>> +               odp_memorder_t mmodel)
>>>>> +{
>>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>>> +               /* Write of aligned word is atomic */
>>>>> +               /* Cast to volatile to force compiler to (re-) write
>>>>> variable,
>>>>> +                * thus we will avoid using compiler memory barriers */
>>>>> +               *(__volatile uint32_t *)&ptr->v = val;
>>>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>>>> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
>>>>> +               /* Compiler and HW barrier to prevent earlier accesses
>>>>> from
>>>>> +                * moving down */
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +               /* Write of aligned word is atomic */
>>>>> +               ptr->v = val;
>>>>> +               /* Compiler and HW barrier to prevent this store from
>>>>> moving
>>>>> +                * down after a later load-acquire and thus create
>>>>> overlapping
>>>>> +                * critical sections. Herb Sutter thinks this is
>>>>> needed */
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +#elif defined __aarch64__
>>>>> +               __asm __volatile("stlr %w0, [%1]"
>>>>> +                               :
>>>>> +                               : "r"(val), "r"(&ptr->v)
>>>>> +                               : MEMORY);
>>>>> +#elif defined __x86_64__
>>>>> +               /* This is actually an atomic exchange operation */
>>>>> +               /* Generates good code on x86_64 */
>>>>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>>>>> +#else
>>>>> +#warning odp_atomic32_store_rls() may not be efficiently implemented
>>>>> +               /* This is actually an atomic exchange operation */
>>>>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>>>>> +#endif
>>>>> +       } else {
>>>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>>>> +       }
>>>>> +}
>>>>> +
>>>>> +
>>>>> +/**
>>>>> + * Atomic compare and exchange (swap) of 32-bit atomic variable
>>>>> + * "Weak" semantics, may fail spuriously and must be used in a loop.
>>>>> + *
>>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>>> + * @param exp_p Pointer to expected value (updated on failure)
>>>>> + * @param val   New value to write
>>>>> + * @param mmodel Memory model associated with the compare-and-swap
>>>>> + * operation (ODP_MEMORDER_RLX only)
>>>>> + *
>>>>> + * @return 1 (true) if exchange successful, 0 (false) if not
>>>>> successful (and
>>>>> + * '*exp_p' updated with current value)
>>>>> + */
>>>>> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
>>>>> +               uint32_t *exp_p,
>>>>> +               uint32_t val,
>>>>> +               odp_memorder_t mmodel)
>>>>> +{
>>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>>> +               uint32_t old;
>>>>> +               uint32_t exp = *exp_p;
>>>>> +               int status;
>>>>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>>>>> +                                "cmp   %0, %3\t\n"
>>>>> +                                "bne   1f\t\n"
>>>>> +                                "strex %1, %4, [%2]\t\n"
>>>>> +                                "1:\t\n"
>>>>> +                               : "=&r"(old), "=&r"(status)
>>>>> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
>>>>> +                               : MEMORY);
>>>>> +               if (odp_unlikely(old != exp)) {
>>>>> +                       /* Value has changed, can't proceed */
>>>>> +                       /* Clear exclusive access monitor */
>>>>> +                       __asm __volatile("clrex");
>>>>> +                       /* Return current value */
>>>>> +                       *exp_p = old;
>>>>> +                       return 0;
>>>>> +               }
>>>>> +               /* strex returns 0 on success */
>>>>> +               if (odp_unlikely(status != 0)) {
>>>>> +                       /* strex failed, reservation was disturbed */
>>>>> +                       /* Return potentially changed value */
>>>>> +                       *exp_p = odp_atomic32_load(ptr,
>>>>> ODP_MEMORDER_RLX);
>>>>> +                       return 0;
>>>>> +               }
>>>>> +               return 1;
>>>>> +#elif defined __mips64__
>>>>> +               uint32_t old;
>>>>> +               uint32_t exp = *exp_p;
>>>>> +               uint32_t status = val;
>>>>> +               __asm __volatile("llw %0, [%2]\t\n"
>>>>> +                                "bne %0, %3, 1f\t\n"
>>>>> +                                "scw %1, [%2]\t\n"
>>>>> +                                "1:\t\n"
>>>>> +                               : "=&r"(old), "+&r"(status)
>>>>> +                               : "r"(&ptr->v), "r"(exp)
>>>>> +                               : MEMORY);
>>>>> +               if (odp_unlikely(old != exp)) {
>>>>> +                       /* Value has changed, can't proceed */
>>>>> +                       /* Return current value */
>>>>> +                       *exp_p = old;
>>>>> +                       return 0;
>>>>> +               }
>>>>> +               /* scw returns 1 on success, 0 on failure */
>>>>> +               if (odp_unlikely(status == 0)) {
>>>>> +                       /* scw failed, reservation was disturbed */
>>>>> +                       *exp_p = odp_atomic32_load(ptr,
>>>>> ODP_MEMORDER_RLX);
>>>>> +                       return 0;
>>>>> +               }
>>>>> +               return 1;
>>>>> +#elif defined __x86_64__
>>>>> +               uint32_t exp = *exp_p;
>>>>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v,
>>>>> exp, val);
>>>>> +               if (odp_unlikely(old != exp)) {
>>>>> +                       /* Return the unexpected content of '*ptr' */
>>>>> +                       *exp_p = old;
>>>>> +                       return 0;
>>>>> +               } else {
>>>>> +                       return 1;
>>>>> +               }
>>>>> +#else
>>>>> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently
>>>>> implemented
>>>>> +               uint32_t exp = *exp_p;
>>>>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v,
>>>>> exp, val);
>>>>> +               if (odp_unlikely(old != exp)) {
>>>>> +                       /* Return the unexpected content of '*ptr' */
>>>>> +                       *exp_p = old;
>>>>> +                       return 0;
>>>>> +               } else {
>>>>> +                       return 1;
>>>>> +               }
>>>>> +#endif
>>>>> +       } else {
>>>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>>>> +       }
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Atomic fetch and add to 32-bit atomic variable
>>>>> + * @note A - B <=> A + (-B)
>>>>> + *
>>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>>> + * @param incr  The value to be added to the atomic variable
>>>>> + * @param mmodel Memory model associated with the add
>>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>>> + *
>>>>> + * @return Value of the atomic variable before the addition
>>>>> + */
>>>>> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
>>>>> +               uint32_t incr,
>>>>> +               odp_memorder_t mmodel)
>>>>> +{
>>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>>> +               uint32_t old_val, tmp;
>>>>> +               int status;
>>>>> +               do {
>>>>> +                       __asm __volatile("ldrex %0, [%3]\t\n"
>>>>> +                                        "add   %1, %0, %4\t\n"
>>>>> +                                        "strex %2, %1, [%3]\t\n"
>>>>>
>>>>> +                                       : "=&r"(old_val), "=&r"(tmp),
>>>>>
>>>>> +                                         "=&r"(status)
>>>>> +                                       : "r"(&ptr->v), "r"(incr)
>>>>> +                                       : MEMORY);
>>>>> +               } while (odp_unlikely(status != 0));
>>>>> +               return old_val;
>>>>> +#elif defined __OCTEON__
>>>>> +               uint32_t old_val;
>>>>> +               __asm __volatile("laa %0,(%2),%3"
>>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>>> +                               : "r" (ptr), "r" (incr)
>>>>> +                               : MEMORY);
>>>>> +               return old_val;
>>>>> +#elif defined __x86_64__
>>>>> +               /* Generates good code on x86_64 */
>>>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>>>> +#else
>>>>> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
>>>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>>>> +#endif
>>>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>>>> +#if defined __OCTEON__
>>>>> +               uint32_t old_val;
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +               __asm __volatile("laa %0,(%2),%3"
>>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>>> +                               : "r" (ptr), "r" (incr)
>>>>> +                               : MEMORY);
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +               return old_val;
>>>>> +#endif
>>>>> +               /* __sync_fetch_and_add() will give us barriers before
>>>>> and
>>>>> +                * after, we are fine with this for release operations
>>>>> */
>>>>> +               return __sync_fetch_and_add(&ptr->v, incr);
>>>>> +       } else {
>>>>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>>>>> +       }
>>>>>  }
>>>>>
>>>>>  /**
>>>>> - * Initialize atomic uint64
>>>>> + * Atomic add to 32-bit atomic variable
>>>>>   *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - * @note The operation is not synchronized with other threads
>>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>>> + * @param incr  The value to be added to the atomic variable
>>>>> + * @param mmodel Memory model associated with the add
>>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>>>   */
>>>>> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
>>>>> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
>>>>> +               uint32_t incr,
>>>>> +               odp_memorder_t mmodel)
>>>>>  {
>>>>> -       *ptr = 0;
>>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>>> +               /* Platforms that support atomic add instructions can
>>>>> add
>>>>> +                * their implementations here */
>>>>> +#if defined __OCTEON__
>>>>> +               __asm __volatile("saa %[inc], (%[base])"
>>>>> +                               : "+m" (*ptr)
>>>>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>>>>> +                               : MEMORY);
>>>>> +               return;
>>>>> +#endif
>>>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>>>> +               /* Platforms that support atomic add instructions can
>>>>> add
>>>>> +                * their implementations here */
>>>>> +#if defined __OCTEON__
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +               __asm __volatile("saa %[inc], (%[base])"
>>>>> +                               : "+m" (*ptr)
>>>>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>>>>> +                               : MEMORY);
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +               return;
>>>>> +#endif
>>>>> +       }
>>>>> +       /* Default to using odp_atomic32_fetch_add() */
>>>>> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>>>>>  }
>>>>>
>>>>>  /**
>>>>> - * Load value of atomic uint64
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> + * Atomic fetch and increment of 32-bit atomic variable
>>>>>   *
>>>>> - * @return atomic uint64 value
>>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>>> + * @param mmodel Memory model associated with the increment
>>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>>>   *
>>>>> - * @note The operation is not synchronized with other threads
>>>>> + * @return Value of the atomic variable before the increment
>>>>>   */
>>>>> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
>>>>> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
>>>>> +               odp_memorder_t mmodel)
>>>>>  {
>>>>> -       return *ptr;
>>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>>> +               /* Platforms that support atomic increment
>>>>> instructions can add
>>>>> +                * their implementations here */
>>>>> +#if defined __OCTEON__
>>>>> +               uint32_t old_val;
>>>>> +               __asm __volatile("lai %0,(%2)"
>>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>>> +                               : "r" (ptr)
>>>>> +                               : MEMORY);
>>>>> +               return old_val;
>>>>> +#endif
>>>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>>>> +#if defined __OCTEON__
>>>>> +               uint32_t old_val;
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +               __asm __volatile("lai %0,(%2)"
>>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>>> +                               : "r" (ptr)
>>>>> +                               : MEMORY);
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +               return old_val;
>>>>> +#endif
>>>>> +       }
>>>>> +       /* Default to using odp_atomic32_fetch_add() */
>>>>> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>>>>>  }
>>>>>
>>>>>  /**
>>>>> - * Store value to atomic uint64
>>>>> - *
>>>>> - * @param ptr        An atomic variable
>>>>> - * @param new_value  Store new_value to a variable
>>>>> + * Atomic increment of 32-bit atomic variable
>>>>>   *
>>>>> - * @note The operation is not synchronized with other threads
>>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>>> + * @param mmodel Memory model associated with the increment
>>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>>>   */
>>>>> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
>>>>> -                                       uint64_t new_value)
>>>>> -{
>>>>> -       *ptr = new_value;
>>>>> -}
>>>>> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
>>>>> +               odp_memorder_t mmodel)
>>>>>
>>>>> -/**
>>>>> - * Add atomic uint64
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - * @param value  A value to be added to the variable
>>>>> - *
>>>>> - */
>>>>> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
>>>>> value)
>>>>>  {
>>>>> -       __sync_fetch_and_add(ptr, value);
>>>>> +       /* Default to using odp_atomic32_fetch_inc() */
>>>>> +       /* Platforms that support atomic increment instructions can add
>>>>> +        * their implementations here */
>>>>> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>>>>>  }
>>>>>
>>>>>  /**
>>>>> - * Fetch and add atomic uint64
>>>>> + * Atomic fetch and decrement of 32-bit atomic variable
>>>>>   *
>>>>> - * @param ptr    An atomic variable
>>>>> - * @param value  A value to be added to the variable
>>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>>> + * @param mmodel Memory model associated with the decrement
>>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>>>   *
>>>>> - * @return Value of the variable before the operation
>>>>> + * @return Value of the atomic variable before the decrement
>>>>>   */
>>>>> -
>>>>> -#if defined __powerpc__ && !defined __powerpc64__
>>>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>>>> -                                               uint64_t value)
>>>>> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
>>>>> +               odp_memorder_t mmodel)
>>>>>  {
>>>>> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
>>>>> -                                   (uint32_t)value);
>>>>> -}
>>>>> -#else
>>>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>>>> -                                               uint64_t value)
>>>>> -{
>>>>> -       return __sync_fetch_and_add(ptr, value);
>>>>> -}
>>>>> +       if (mmodel == ODP_MEMORDER_RLX) {
>>>>> +               /* Platforms that support atomic decrement
>>>>> instructions can add
>>>>> +                * their implementations here */
>>>>> +#if defined __OCTEON__
>>>>> +               uint32_t old_val;
>>>>> +               __asm __volatile("lad %0,(%2)"
>>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>>> +                               : "r" (ptr)
>>>>> +                               : MEMORY);
>>>>> +               return old_val;
>>>>>  #endif
>>>>> -/**
>>>>> - * Subtract atomic uint64
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - * @param value  A value to be subtracted from the variable
>>>>> - *
>>>>> - */
>>>>> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
>>>>> value)
>>>>> -{
>>>>> -       __sync_fetch_and_sub(ptr, value);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Fetch and subtract atomic uint64
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - * @param value  A value to be subtracted from the variable
>>>>> - *
>>>>> - * @return Value of the variable before the operation
>>>>> - */
>>>>> -#if defined __powerpc__ && !defined __powerpc64__
>>>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>>>> -                                               uint64_t value)
>>>>> -{
>>>>> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
>>>>> -                                   (uint32_t)value);
>>>>> -}
>>>>> -#else
>>>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>>>> -                                               uint64_t value)
>>>>> -{
>>>>> -       return __sync_fetch_and_sub(ptr, value);
>>>>> -}
>>>>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>>>>> +#if defined __OCTEON__
>>>>> +               uint32_t old_val;
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +               __asm __volatile("lad %0,(%2)"
>>>>> +                               : "=r" (old_val), "+m" (ptr)
>>>>> +                               : "r" (ptr)
>>>>> +                               : MEMORY);
>>>>> +               COMPILER_HW_BARRIER();
>>>>> +               return old_val;
>>>>>  #endif
>>>>> -/**
>>>>> - * Fetch and increment atomic uint64 by 1
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - * @return Value of the variable before the operation
>>>>> - */
>>>>> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
>>>>> -{
>>>>> -       return odp_atomic_fetch_add_u64(ptr, 1);
>>>>> -}
>>>>> -
>>>>> -/**
>>>>> - * Increment atomic uint64 by 1
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - */
>>>>> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
>>>>> -{
>>>>> -       odp_atomic_fetch_add_u64(ptr, 1);
>>>>> +       }
>>>>> +       /* Default to using odp_atomic32_fetch_add() */
>>>>> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>>>>>  }
>>>>>
>>>>>  /**
>>>>> - * Fetch and decrement atomic uint64 by 1
>>>>> + * Atomic decrement of 32-bit atomic variable
>>>>>   *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - * @return Value of the variable before the operation
>>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>>> + * @param memorder Memory model associated with the decrement
>>>>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>>>>   */
>>>>> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
>>>>> -{
>>>>> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>>>>> -}
>>>>> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
>>>>> +               odp_memorder_t memorder)
>>>>>
>>>>> -/**
>>>>> - * Decrement atomic uint64 by 1
>>>>> - *
>>>>> - * @param ptr    An atomic variable
>>>>> - *
>>>>> - */
>>>>> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>>>>>  {
>>>>> -       odp_atomic_fetch_sub_u64(ptr, 1);
>>>>> +       /* Default to using odp_atomic32_fetch_dec() */
>>>>> +       /* Platforms that support atomic decrement instructions can add
>>>>> +        * their implementations here */
>>>>> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>>>>>  }
>>>>>
>>>>> -/**
>>>>> - * Atomic compare and set for 64bit
>>>>> - *
>>>>> - * @param dst destination location into which the value will be
>>>>> written.
>>>>> - * @param exp expected value.
>>>>> - * @param src new value.
>>>>> - * @return Non-zero on success; 0 on failure.
>>>>> - */
>>>>> -static inline int
>>>>> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t
>>>>> src)
>>>>> -{
>>>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>>>> -}
>>>>> +/* We are not exporting this macro */
>>>>> +#undef COMPILER_HW_BARRIER
>>>>> +#undef MEMORY
>>>>>
>>>>>  #ifdef __cplusplus
>>>>>  }
>>>>> diff --git a/platform/linux-generic/include/api/odp_barrier.h
>>>>> b/platform/linux-generic/include/api/odp_barrier.h
>>>>> index a7b3215..69b1eb8 100644
>>>>> --- a/platform/linux-generic/include/api/odp_barrier.h
>>>>> +++ b/platform/linux-generic/include/api/odp_barrier.h
>>>>> @@ -27,18 +27,18 @@ extern "C" {
>>>>>   * ODP execution barrier
>>>>>   */
>>>>>  typedef struct odp_barrier_t {
>>>>> -       int              count;  /**< @private Thread count */
>>>>> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
>>>>> +       uint32_t       num_threads;  /**< @private Thread count
>>>>> (constant) */
>>>>> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier
>>>>> */
>>>>>  } odp_barrier_t;
>>>>>
>>>>>
>>>>>  /**
>>>>>   * Init barrier with thread count
>>>>>   *
>>>>> - * @param barrier    Barrier
>>>>> - * @param count      Thread count
>>>>> + * @param barrier     Barrier
>>>>> + * @param num_threads Number of threads which share the barrier
>>>>>   */
>>>>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
>>>>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>>>>>
>>>>>
>>>>>  /**
>>>>> diff --git a/platform/linux-generic/include/api/odp_counter.h
>>>>> b/platform/linux-generic/include/api/odp_counter.h
>>>>> new file mode 100644
>>>>>
>>>>> index 0000000..f937d27
>>>>>
>>>>> --- /dev/null
>>>>> +++ b/platform/linux-generic/include/api/odp_counter.h
>>>>> @@ -0,0 +1,363 @@
>>>>> +/* Copyright (c) 2013, Linaro Limited
>>>>> + * All rights reserved.
>>>>> + *
>>>>> + * SPDX-License-Identifier:     BSD-3-Clause
>>>>> + */
>>>>> +
>>>>> +/**
>>>>> + * @file
>>>>> + *
>>>>> + * ODP atomic counter types and operations, suitable for e.g. shared
>>>>> statistics.
>>>>> + * Relaxed memory model assumed for lowest overhead.
>>>>> + * Scalar variable wrapped in a struct to avoid accessing scalar
>>>>> directly
>>>>> + * without using the required access functions.
>>>>> + * Counter functions must be used to operate on counter variables!
>>>>> + */
>>>>> +
>>>>> +#ifndef ODP_COUNTER_H_
>>>>> +#define ODP_COUNTER_H_
>>>>> +
>>>>> +#include <stdint.h>
>>>>> +#include <odp_align.h>
>>>>> +#include <odp_hints.h>
>>>>> +
>>>>> +#ifdef __cplusplus
>>>>> +extern "C" {
>>>>> +#endif
>>>>> +
>>>>> +/**
>>>>> + * 32-bit (unsigned) atomic counter type
>>>>> + */
>>>>> +typedef struct {
>>>>> +       uint32_t v; /**< Actual storage for the counter variable */
>>>>> +} odp_counter32_t
>>>>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>>>> +
>>>>> +/**
>>>>> + * 64-bit (unsigned) atomic counter type
>>>>> + */
>>>>> +typedef struct {
>>>>> +       uint64_t v; /**< Actual storage for the counter variable */
>>>>> +       /* Room for other data structures (e.g. spin lock) that might
>>>>> be
>>>>> +        * needed to ensure atomicity on some architectures */
>>>>> +} odp_counter64_t
>>>>> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
>>>>> +
>>>>>
>>>>> +/*****************************************************************************
>>>>> + * Operations on 32-bit atomic counters
>>>>> + * odp_counter32_init - returns no value
>>>>> + * odp_counter32_read - returns current value
>>>>> + * odp_counter32_write - returns no value
>>>>> + * odp_counter32_add - returns no value
>>>>> + * odp_counter32_read_inc - returns old value
>>>>> + * odp_counter32_inc - returns no value
>>>>> +
>>>>> *****************************************************************************/
>>>>> +
>>>>> +/**
>>>>> + * Initialize 32-bit counter variable
>>>>> + *
>>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>>> + * @param val   Initial value
>>>>> + */
>>>>> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t
>>>>> val)
>>>>> +{
>>>>> +       /* No implementation requires any other type of initialization
>>>>> */
>>>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Read 32-bit counter variable
>>>>> + *
>>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>>> + *
>>>>> + * @return Value of the variable
>>>>> + */
>>>>> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
>>>>> +{
>>>>> +       uint32_t val;
>>>>> +       /* Read of aligned word is atomic */
>>>>> +       /* Cast to volatile to force compiler to (re-) read variable,
>>>>> thus we
>>>>> +        * will avoid using compiler memory barriers */
>>>>> +       val = *(__volatile const uint32_t *)&ptr->v;
>>>>> +       return val;
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Write 32-bit counter variable
>>>>> + *
>>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>>> + * @param val   Value to write to the variable
>>>>> + */
>>>>> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t
>>>>> val)
>>>>> +{
>>>>> +       /* Write of aligned word is atomic */
>>>>> +       /* Cast to volatile to force compiler to (re-) write variable,
>>>>> thus we
>>>>> +        * will avoid using compiler memory barriers */
>>>>> +       *(__volatile uint32_t *)&ptr->v = val;
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Atomic add to 32-bit counter variable
>>>>> + *
>>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>>> + * @param incr  The value to be added to the counter variable
>>>>> + */
>>>>> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t
>>>>> incr)
>>>>> +{
>>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>>> +       uint32_t result;
>>>>> +       int status;
>>>>> +       do {
>>>>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>>>>> +                                "add   %0, %0, %3\t\n"
>>>>> +                                "strex %1, %0, [%2]"
>>>>> +                                : "=&r"(result), "=&r"(status)
>>>>> +                                : "r"(&ptr->v), "Ir" (incr)
>>>>> +                                : );
>>>>> +       } while (odp_unlikely(status != 0));
>>>>> +#elif defined __OCTEON__
>>>>> +       __asm __volatile("saa %[inc], (%[base])"
>>>>> +                        : "+m" (*ptr)
>>>>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>>>>> +                        : );
>>>>> +#elif defined __x86_64__
>>>>> +       /* Generates good code on x86_64 */
>>>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>>>> +#else
>>>>> +       /* Warning odp_counter32_add() may not be efficiently
>>>>> implemented */
>>>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>>>> +#endif
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Atomic increment (+1) of 32-bit counter variable, return original
>>>>> value
>>>>> + *
>>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>>> + *
>>>>> + * @return Original value of counter
>>>>> + */
>>>>> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
>>>>> +{
>>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>>> +       uint32_t result, tmp;
>>>>> +       int status;
>>>>> +       do {
>>>>> +               __asm __volatile("ldrex %0, [%3]\t\n"
>>>>> +                                "add   %1, %0, #1\t\n"
>>>>> +                                "strex %2, %1, [%3]"
>>>>>
>>>>> +                                : "=&r"(result), "=&r"(tmp),
>>>>> "=&r"(status)
>>>>>
>>>>> +                                : "r"(&ptr->v)
>>>>> +                                : );
>>>>> +       } while (odp_unlikely(status != 0));
>>>>> +       return result;
>>>>> +#elif defined __OCTEON__
>>>>> +       uint32_t old_val;
>>>>> +       __asm __volatile("lai %0,(%2)"
>>>>> +                        : "=r" (old_val), "+m" (ptr)
>>>>> +                        : "r" (ptr)
>>>>> +                        : );
>>>>> +       return old_val;
>>>>> +#elif defined __x86_64__
>>>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>>>> +#else
>>>>> +/* Warning odp_counter32_read_inc() may not be efficiently
>>>>> implemented */
>>>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>>>> +#endif
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Atomic increment (+1) 32-bit counter variable
>>>>> + *
>>>>> + * @param ptr   Pointer to a 32-bit counter variable
>>>>> + */
>>>>> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
>>>>> +{
>>>>> +#if defined __OCTEON__
>>>>> +       odp_counter32_add(ptr, 1);
>>>>> +#else
>>>>> +       (void)odp_counter32_read_inc(ptr);
>>>>> +#endif
>>>>> +}
>>>>> +
>>>>>
>>>>> +/*****************************************************************************
>>>>> + * Operations on 64-bit atomic counters
>>>>> + * odp_counter64_init
>>>>> + * odp_counter64_read
>>>>> + * odp_counter64_write
>>>>> + * odp_counter64_add
>>>>> + * odp_counter64_read_inc
>>>>> + * odp_counter64_inc
>>>>> +
>>>>> *****************************************************************************/
>>>>> +
>>>>> +/**
>>>>> + * Read 64-bit counter variable
>>>>> + *
>>>>> + * @param ptr   Pointer to a 64-bit counter variable
>>>>> + *
>>>>> + * @return Value of the counter variable
>>>>> + */
>>>>> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
>>>>> +{
>>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>>> +       uint64_t val;
>>>>> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
>>>>> +                        "clrex" /* Clear exclusive access monitor */
>>>>> +                        : "=&r"(val)
>>>>> +                        : "r"(&ptr->v)
>>>>> +                        : );
>>>>> +       return val;
>>>>> +#elif defined __x86_64__ || defined __aarch64__
>>>>> +       /* Read of aligned quad/double word is atomic */
>>>>> +       return ptr->v;
>>>>> +#else
>>>>> +/* Warning odp_counter64_read() may not be efficiently implemented */
>>>>> +       return __sync_fetch_and_or(&ptr->v, 0);
>>>>> +#endif
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Write 64-bit counter variable
>>>>> + *
>>>>> + * @param ptr  Pointer to a 64-bit counter variable
>>>>> + * @param val  Value to write to the counter variable
>>>>> + */
>>>>> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t
>>>>> val)
>>>>> +{
>>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>>> +       uint64_t old_val;
>>>>> +       int status;
>>>>> +       do {
>>>>> +               /* Read counter variable exclusively so we can write
>>>>> to it
>>>>> +                * later */
>>>>> +               /* Attempt to write the new value */
>>>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>>>> +                                "strexd %1, %3, %H3, [%2]"
>>>>> +                                : "=&r"(old_val), "=&r"(status)
>>>>> +                                : "r"(&ptr->v), "r"(val)
>>>>> +                                : );
>>>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>>>> succeeds */
>>>>> +#elif defined __x86_64__ || defined __aarch64__
>>>>> +       /* Write of aligned quad/double word is atomic */
>>>>> +       ptr->v = val;
>>>>> +#else
>>>>> +/* Warning odp_counter64_write() may not be efficiently implemented */
>>>>> +       /* This is actually a counter exchange operation */
>>>>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>>>> +#endif
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Initialize 64-bit counter variable
>>>>> + * Perform implementation specific initializations, assign initial
>>>>> value.
>>>>> + *
>>>>> + * @param ptr   Pointer to a 64-bit counter variable
>>>>> + * @param val   Initial value
>>>>> + */
>>>>> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t
>>>>> val)
>>>>> +{
>>>>> +       /* No implementation requires any other type of initialization
>>>>> */
>>>>> +       odp_counter64_write(ptr, val);
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Atomic add to 64-bit counter variable
>>>>> + *
>>>>> + * @param ptr   Pointer to a 64-bit counter variable
>>>>> + * @param incr  The value to be added to the counter variable
>>>>> + */
>>>>> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t
>>>>> incr)
>>>>> +{
>>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>>> +       uint64_t old_val;
>>>>> +       int status;
>>>>> +       do {
>>>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>>>> +                                "adds   %0, %0, %3\t\n"
>>>>> +                                "adc    %H0, %H3\t\n"
>>>>> +                                "strexd %1, %0, %H0, [%2]"
>>>>> +                                : "=&r"(old_val), "=&r"(status)
>>>>> +                                : "r"(&ptr->v), "r"(incr)
>>>>> +                                : );
>>>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>>>> succeeds */
>>>>> +#elif defined __OCTEON__
>>>>> +       __asm __volatile("saad %[inc], (%[base])"
>>>>> +                        : "+m" (*ptr)
>>>>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>>>>> +                        : );
>>>>> +#elif defined __x86_64__
>>>>> +       /* Generates good code on x86_64 */
>>>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>>>> +#else
>>>>> +/* Warning odp_counter64_add() may not be efficiently implemented */
>>>>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>>>>> +#endif
>>>>> +}
>>>>> +
>>>>> +
>>>>> +/**
>>>>> + * Atomic increment (+1) 64-bit counter variable and return original
>>>>> value
>>>>> + *
>>>>> + * @param ptr   Pointer to a 64-bit counter variable
>>>>> + *
>>>>> + * @return Original value of counter
>>>>> + */
>>>>> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
>>>>> +{
>>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>>> +       uint64_t old_val, tmp;
>>>>> +       int status;
>>>>> +       do {
>>>>> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
>>>>> +                                "adds   %2, %0, #1\t\n"
>>>>> +                                "adc    %H2, %H0, #0\t\n"
>>>>> +                                "strexd %1, %2, %H2, [%3]"
>>>>> +                                : "=&r"(old_val), "=&r"(status),
>>>>> "=&r"(tmp)
>>>>> +                                : "r"(&ptr->v)
>>>>> +                                : );
>>>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>>>> succeeds */
>>>>> +       return old_val;
>>>>> +#elif defined __OCTEON__
>>>>> +       uint64_t old_val;
>>>>> +       __asm __volatile("laid %0,(%2)"
>>>>> +                       : "=r" (old_val), "+m" (ptr)
>>>>> +                       : "r" (ptr)
>>>>> +                       : );
>>>>> +       return old_val;
>>>>> +#elif defined __x86_64__
>>>>> +       /* Generates good code on x86_64 */
>>>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>>>> +#else
>>>>> +/* Warning odp_counter64_read_inc() may not be efficiently
>>>>> implemented */
>>>>> +       return __sync_fetch_and_add(&ptr->v, 1);
>>>>> +#endif
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Atomic increment (+1) 64-bit counter variable
>>>>> + *
>>>>> + * @param ptr   Pointer to a 64-bit counter variable
>>>>> + */
>>>>> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
>>>>> +{
>>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>>> +       uint64_t old_val;
>>>>> +       int status;
>>>>> +       do {
>>>>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>>>>> +                                "adds   %0, #1\t\n"
>>>>> +                                "adc    %H0, #0\t\n"
>>>>> +                                "strexd %1, %0, %H0, [%2]"
>>>>> +                                : "=&r"(old_val), "=&r"(status)
>>>>> +                                : "r"(&ptr->v)
>>>>> +                                : );
>>>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>>>> succeeds */
>>>>> +#else
>>>>> +       (void)odp_counter64_read_inc(ptr);
>>>>> +#endif
>>>>> +}
>>>>> +
>>>>> +#ifdef __cplusplus
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>> +#endif
>>>>> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
>>>>> b/platform/linux-generic/include/api/odp_rwlock.h
>>>>> index 252ebb2..ff8a9a2 100644
>>>>> --- a/platform/linux-generic/include/api/odp_rwlock.h
>>>>> +++ b/platform/linux-generic/include/api/odp_rwlock.h
>>>>> @@ -10,26 +10,30 @@
>>>>>  /**
>>>>>   * @file
>>>>>   *
>>>>> - * ODP RW Locks
>>>>> + * ODP read/write lock
>>>>> + * RW lock support mu
>>>>> ...
>>>>>
>>>>> [Message clipped]
>>>>
>>>>
>>>>
>>>> _______________________________________________
>>>> lng-odp mailing list
>>>> lng-odp@lists.linaro.org
>>>> http://lists.linaro.org/mailman/listinfo/lng-odp
>>>>
>>>>
>>
>> _______________________________________________
>> lng-odp mailing list
>> lng-odp@lists.linaro.org
>> http://lists.linaro.org/mailman/listinfo/lng-odp
>>
>>
>
>
> --
> *Mike Holmes*
> Linaro  Sr Technical Manager
> LNG - ODP
>
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/lng-odp
>
>
Ola Liljedahl Nov. 5, 2014, 3:57 p.m. UTC | #11
Fixing the bugs in the current implementation was just a side effect of
implementing and then using the new atomics API. I didn't actually go into
ticketlock_lock and fix the missing sync bug; it just disappeared when
using the new C11-inspired atomic operations. My primary interest is better
counter and atomics APIs and their corresponding implementations in ODP.
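
To illustrate (a sketch only, not the actual odp_ticketlock.c rewrite from the
patch; the struct layout and the my_ticketlock_* names are hypothetical): with
an acquire load on the ticket being served and a release store when passing the
lock on, the ordering is carried by the operations themselves, so no explicit
odp_sync_stores() is needed.

#include <odp.h>

typedef struct {
        odp_atomic32_t next_ticket; /* next ticket to hand out */
        odp_atomic32_t cur_ticket;  /* ticket currently being served */
} my_ticketlock_t;

static void my_ticketlock_init(my_ticketlock_t *lock)
{
        odp_atomic32_init(&lock->next_ticket, 0);
        odp_atomic32_init(&lock->cur_ticket, 0);
}

static void my_ticketlock_lock(my_ticketlock_t *lock)
{
        /* Grab a ticket; no ordering needed for the ticket counter itself */
        uint32_t ticket = odp_atomic32_fetch_inc(&lock->next_ticket,
                                                 ODP_MEMORDER_RLX);
        /* Acquire load: the critical section cannot move above this point */
        while (odp_atomic32_load(&lock->cur_ticket, ODP_MEMORDER_ACQ) != ticket)
                ; /* busy wait */
}

static void my_ticketlock_unlock(my_ticketlock_t *lock)
{
        /* Only the lock owner writes cur_ticket, so a relaxed read is enough */
        uint32_t next = odp_atomic32_load(&lock->cur_ticket,
                                          ODP_MEMORDER_RLX) + 1;
        /* Release store: writes in the critical section become visible
         * before the next owner sees its ticket */
        odp_atomic32_store(&lock->cur_ticket, next, ODP_MEMORDER_RLS);
}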

I can remove all references to the bugs that were thus fixed in the current
implementation and you will never know it was broken. Better?

I can provide separate patches for the 32/64-bit counter API and for the
atomics API (that is used to implement different higher level lock and
synchronization primitives). I actually need to add a few more atomic
operations for the lock-less timer implementation, and also need proper
64-bit atomics (not counter) support.
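
To make the split concrete, here is roughly how the two APIs are meant to be
used (a sketch only; the function and variable names in it are for
illustration, and in real code the variables would live in shared memory):

#include <odp.h>

static void api_example(void)
{
        /* Counter API: always relaxed, no memory order parameter -
         * intended for statistics where only the final sum matters */
        odp_counter64_t pkts;
        odp_counter64_init(&pkts, 0);
        odp_counter64_inc(&pkts);
        (void)odp_counter64_read(&pkts);

        /* Atomics API: the memory order is an explicit parameter, as in
         * C11 - intended for building synchronization primitives */
        odp_atomic32_t flag;
        odp_atomic32_init(&flag, 0);
        odp_atomic32_store(&flag, 1, ODP_MEMORDER_RLS);
        (void)odp_atomic32_load(&flag, ODP_MEMORDER_ACQ);
}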

I can't see any point in patches that are dependent on each other. A patch
for the new atomics API will include the header file, the implementation,
all usages of this header file in ODP components, example apps and test
programs.

-- Ola


On 5 November 2014 12:38, Savolainen, Petri (NSN - FI/Espoo) <
petri.savolainen@nsn.com> wrote:

>  We achieve more predictable progress if problems are identified, fixes
> suggested and approved in multiple, manageable patches rather than in large
> lumps of implementation/API rewrite. E.g. if Ola’s problem is the missing
> ticket_lock release sync in the implementation (for ARM), then we’ll fix
> that rather than rewrite the atomics API for applications, etc.
>
>
>
> A large rewrite is just more likely to consume many review rounds and
> waste time on both sides.
>
>
>
> -Petri
>
>
>
>
>
> *From:* ext Bill Fischofer [mailto:bill.fischofer@linaro.org]
> *Sent:* Tuesday, November 04, 2014 6:30 PM
> *To:* Mike Holmes
> *Cc:* Ola Liljedahl; Savolainen, Petri (NSN - FI/Espoo); lng-odp-forward
> *Subject:* Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11 memory
> model
>
>
>
> So rather than folks spending the time to review the current patch, Ola can
> spend a lot of time breaking it up and then people can start looking at
> it?  That doesn't seem to be a very efficient way of working this.  We're
> at a point now where some APIs are being replaced rather than incrementally
> patched.  It's easier to do that as an (ahem) atomic function rather than
> breaking things into multiple patches that are all mutually interdependent.
>
>
>
>
> Multiple patches make sense if things are truly orthogonal.  But that's
> not always the case and some patches will be large.
>
>
>
> Bill
>
>
>
> On Tue, Nov 4, 2014 at 9:58 AM, Mike Holmes <mike.holmes@linaro.org>
> wrote:
>
> Generally we fix one problem per patch; here you solve several things at
> once, making it hard to think about the implications. Can these be separated?
>
>
>
> Added header file odp_counter.h with support for 32- and 64-bit atomic
> counters
> using relaxed memory order. 6 operations
> (init/read/write/add/read_inc/inc) on
> 32-bit and 64-bit counters respectively.
> Renamed odp_atomic_test to odp_counter_test and changed to use
> odp_counter.h
>
> Implementation of C11-based memory model for atomic operations. 10
> operations
> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
> in
> odp_atomic.h. The required memory ordering is now a parameter to each call
> just
> like in C11.
>
> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
> fall back to GCC __sync builtins which often include unnecessarily heavy
> barrier/sync operations (always sequentially consistent).
>
> Attempt to remove all explicit memory barriers (odp_sync_stores) from code
> that
> implements multithreaded synchronization primitives (e.g. locks, barriers).
> Rewrote such primitives to use the new atomic operations.
>
> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
> enqueue/dequeue
>
>
>
> On 4 November 2014 10:06, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>
> And what should be in each patch?
>
>
>
> On 4 November 2014 16:03, Anders Roxell <anders.roxell@linaro.org> wrote:
>
> As Petri wrote in his first email, this patch should be broken up into
> multiple patches...
>
> Cheers,
> Anders
>
> On 4 Nov 2014 15:34, "Ola Liljedahl" <ola.liljedahl@linaro.org> wrote:
>
>   Possibly odp_atomic.h should then be internal, leaving odp_counter.h as
> the only public API. The original odp_atomic.h is public so I left it that
> way.
>
>
>
> The counter API does not allow the user to specify any memory ordering;
> relaxed memory order is expected, i.e. no ordering is guaranteed.
>
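> To spell that out (a sketch only; rx_pkts, data_ready, shared_data and
> produce() are made-up names and init calls are omitted): a relaxed counter is
> fine for statistics but must not be used to publish data to another thread,
> since the increment carries no release semantics:
>
> static odp_counter64_t rx_pkts;    /* statistics counter */
> static odp_counter64_t data_ready; /* misused here as a publication flag */
> static uint32_t shared_data;
>
> static void produce(void)
> {
>         /* OK: independent statistic, no ordering needed or given */
>         odp_counter64_inc(&rx_pkts);
>
>         /* NOT OK: another thread may see the counter bumped before it
>          * sees shared_data; publishing needs the atomics API with
>          * ODP_MEMORDER_RLS */
>         shared_data = 42;
>         odp_counter64_inc(&data_ready);
> }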
>
>
> Why does acquire/release not fit well with the far atomics? And what do
> you mean specifically by "far atomics"? Just the counter updates like
> Cavium has?
>
>
>
> As the Linux kernel atomics interface predates C11/C++11 atomics support, I
> do not see it as a model to follow.
>
>
>
> The patch summary contained a brief description of what I wanted to
> achieve with the patch. What more do you want, a Google Docs design
> document?
>
>
>
> -- Ola
>
>
>
> On 4 November 2014 15:22, Savolainen, Petri (NSN - FI/Espoo) <
> petri.savolainen@nsn.com> wrote:
>
> There are many things I’d change in this patch. I think it’s better to
> take a step back and talk about what you are trying to achieve here, and
> then address those points step by step. E.g. the whole idea of acquire /
> release does not fit well with far atomics, and far atomics is the thing I’d
> abstract from applications with this API. Other synchronization primitives
> (such as locks) would not be implemented (too often) by applications, so
> it’s not very productive to abstract that (the implementation of locks).
> E.g. the Linux kernel atomics.h looks pretty much like odp_atomic.h.
>
>
>
> -Petri
>
>
>
>
>
> *From:* lng-odp-bounces@lists.linaro.org [mailto:
> lng-odp-bounces@lists.linaro.org] *On Behalf Of *ext Ola Liljedahl
> *Sent:* Tuesday, November 04, 2014 3:49 PM
> *To:* lng-odp@lists.linaro.org
> *Subject:* Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11 memory
> model
>
>
>
> Ping!
>
>
>
> I really need this new working atomics support merged ASAP because I have
> a new lock-less implementation of the timer API which uses atomic
> operations. I haven't seen any real criticism of the content of the
> patch, so there is nothing to change.
>
>
>
> -- Ola
>
>
>
>
>
> On 20 October 2014 15:07, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>
> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
> ---
> Added header file odp_counter.h with support for 32- and 64-bit atomic
> counters
> using relaxed memory order. 6 operations
> (init/read/write/add/read_inc/inc) on
> 32-bit and 64-bit counters respectively.
>
> Renamed odp_atomic_test to odp_counter_test and changed to use
> odp_counter.h
>
> Implementation of C11-based memory model for atomic operations. 10
> operations
> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
> in
> odp_atomic.h. The required memory ordering is now a parameter to each call
> just
> like in C11.
>
> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
> fall back to GCC __sync builtins which often include unnecessarily heavy
> barrier/sync operations (always sequentially consistent).
>
> Attempt to remove all explicit memory barriers (odp_sync_stores) from code
> that
> implements multithreaded synchronization primitives (e.g. locks, barriers).
> Rewrote such primitives to use the new atomic operations.
>
> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
> enqueue/dequeue
> (missing release barrier, had only compiler barrier).
>
>  .gitignore                                         |   2 +-
>  example/generator/odp_generator.c                  |  43 +-
>  example/ipsec/odp_ipsec.c                          |   2 +-
>  example/odp_example/odp_example.c                  |   2 +-
>  example/timer/odp_timer_test.c                     |   2 +-
>  helper/include/odph_ring.h                         |   8 +-
>  platform/linux-generic/include/api/odp.h           |   1 +
>  platform/linux-generic/include/api/odp_atomic.h    | 838
> +++++++++++----------
>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>  platform/linux-generic/odp_barrier.c               |  49 +-
>  platform/linux-generic/odp_buffer.c                |   3 +-
>  platform/linux-generic/odp_crypto.c                |   7 +-
>  platform/linux-generic/odp_queue.c                 |   7 +-
>  platform/linux-generic/odp_ring.c                  |  94 +--
>  platform/linux-generic/odp_rwlock.c                |  62 +-
>  platform/linux-generic/odp_thread.c                |   9 +-
>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>  platform/linux-generic/odp_timer.c                 |  22 +-
>  test/api_test/Makefile.am                          |   6 +-
>  test/api_test/odp_atomic_test.c                    | 362 ---------
>  test/api_test/odp_atomic_test.h                    |  60 --
>  test/api_test/odp_common.c                         |   1 -
>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>  delete mode 100644 test/api_test/odp_atomic_test.c
>  delete mode 100644 test/api_test/odp_atomic_test.h
>  create mode 100644 test/api_test/odp_counter_test.c
>
> diff --git a/.gitignore b/.gitignore
> index 6342e34..77db4d6 100644
> --- a/.gitignore
> +++ b/.gitignore
> @@ -35,7 +35,7 @@ build/
>  odp_example
>  odp_packet
>  odp_packet_netmap
> -odp_atomic
> +odp_counter
>  odp_shm
>  odp_ring
>  odp_timer_ping
> diff --git a/example/generator/odp_generator.c
> b/example/generator/odp_generator.c
> index eb8b340..252157d 100644
> --- a/example/generator/odp_generator.c
> +++ b/example/generator/odp_generator.c
> @@ -62,10 +62,10 @@ typedef struct {
>   * counters
>  */
>  static struct {
> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
> -       odp_atomic_u64_t ip;    /**< ip packets */
> -       odp_atomic_u64_t udp;   /**< udp packets */
> -       odp_atomic_u64_t icmp;  /**< icmp packets */
> +       odp_counter64_t seq;    /**< ip seq to be send */
> +       odp_counter64_t ip;     /**< ip packets */
> +       odp_counter64_t udp;    /**< udp packets */
> +       odp_counter64_t icmp;   /**< icmp packets */
>  } counters;
>
>  /** * Thread specific arguments
> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_UDPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_UDP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_ICMPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_ICMP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>                 }
>
>                 if (args->appl.interval != 0) {
> +                       uint64_t seq = odp_counter64_read(&counters.seq);
>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
> -                              thr, counters.seq, counters.seq%0xffff);
> +                              thr, seq, seq%0xffff);
>                         /* TODO use odp timer */
>                         usleep(args->appl.interval * 1000);
>                 }
> -               if (args->appl.number != -1 && counters.seq
> -                   >= (unsigned int)args->appl.number) {
> +               if (args->appl.number != -1 &&
> +                   odp_counter64_read(&counters.seq) >=
> +                   (unsigned int)args->appl.number) {
>                         break;
>                 }
>         }
> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>         /* receive number of reply pks until timeout */
>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>                 while (args->appl.timeout >= 0) {
> -                       if (counters.icmp >= (unsigned
> int)args->appl.number)
> +                       if (odp_counter64_read(&counters.icmp) >=
> +                           (unsigned int)args->appl.number)
>                                 break;
>                         /* TODO use odp timer */
>                         sleep(1);
> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>
>         /* print info */
>         if (args->appl.mode == APPL_MODE_UDP) {
> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
> +               printf("  [%02i] total send: %ju\n", thr,
> +                      odp_counter64_read(&counters.seq));
>         } else if (args->appl.mode == APPL_MODE_PING) {
>                 printf("  [%02i] total send: %ju total receive: %ju\n",
> -                      thr, counters.seq, counters.icmp);
> +                      thr, odp_counter64_read(&counters.seq),
> +                      odp_counter64_read(&counters.icmp));
>         }
>         return arg;
>  }
> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                 if (!odp_packet_inflag_ipv4(pkt))
>                         continue;
>
> -               odp_atomic_inc_u64(&counters.ip);
> +               odp_counter64_inc(&counters.ip);
>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>
>                 /* udp */
>                 if (ip->proto == ODPH_IPPROTO_UDP) {
> -                       odp_atomic_inc_u64(&counters.udp);
> +                       odp_counter64_inc(&counters.udp);
>                         udp = (odph_udphdr_t *)(buf + offset);
>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>                                         odp_be_to_cpu_16(udp->length) -
> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                         icmp = (odph_icmphdr_t *)(buf + offset);
>                         /* echo reply */
>                         if (icmp->type == ICMP_ECHOREPLY) {
> -                               odp_atomic_inc_u64(&counters.icmp);
> +                               odp_counter64_inc(&counters.icmp);
>                                 memcpy(&tvsend, buf + offset +
> ODPH_ICMPHDR_LEN,
>                                        sizeof(struct timeval));
>                                 /* TODO This should be changed to use an
> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>         }
>
>         /* init counters */
> -       odp_atomic_init_u64(&counters.seq);
> -       odp_atomic_init_u64(&counters.ip);
> -       odp_atomic_init_u64(&counters.udp);
> -       odp_atomic_init_u64(&counters.icmp);
> +       odp_counter64_init(&counters.seq, 0);
> +       odp_counter64_init(&counters.ip, 0);
> +       odp_counter64_init(&counters.udp, 0);
> +       odp_counter64_init(&counters.icmp, 0);
>
>         /* Reserve memory for args from shared mem */
>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
> index 2f2dc19..76c27d0 100644
> --- a/example/ipsec/odp_ipsec.c
> +++ b/example/ipsec/odp_ipsec.c
> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>         printf("Num worker threads: %i\n", num_workers);
>
>         /* Create a barrier to synchronize thread startup */
> -       odp_barrier_init_count(&sync_barrier, num_workers);
> +       odp_barrier_init(&sync_barrier, num_workers);
>
>         /*
>          * By default core #0 runs Linux kernel background tasks.
> diff --git a/example/odp_example/odp_example.c
> b/example/odp_example/odp_example.c
> index 0e9aa3d..c473395 100644
> --- a/example/odp_example/odp_example.c
> +++ b/example/odp_example/odp_example.c
> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>         odp_shm_print_all();
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&globals->barrier, num_workers);
> +       odp_barrier_init(&globals->barrier, num_workers);
>
>         if (args.proc_mode) {
>                 int ret;
> diff --git a/example/timer/odp_timer_test.c
> b/example/timer/odp_timer_test.c
> index 78b2ae2..dfbeae9 100644
> --- a/example/timer/odp_timer_test.c
> +++ b/example/timer/odp_timer_test.c
> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>         printf("\n");
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&test_barrier, num_workers);
> +       odp_barrier_init(&test_barrier, num_workers);
>
>         /* Create and launch worker threads */
>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
> index 76c1db8..5e78b34 100644
> --- a/helper/include/odph_ring.h
> +++ b/helper/include/odph_ring.h
> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>                 uint32_t sp_enqueue;     /* True, if single producer. */
>                 uint32_t size;           /* Size of ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Producer head. */
> -               uint32_t tail;          /* Producer tail. */
> +               odp_atomic32_t head;    /* Producer head. */
> +               odp_atomic32_t tail;    /* Producer tail. */
>         } prod ODP_ALIGNED_CACHE;
>
>         /** @private Consumer */
> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>                 uint32_t size;           /* Size of the ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Consumer head. */
> -               uint32_t tail;          /* Consumer tail. */
> +               odp_atomic32_t head;    /* Consumer head. */
> +               odp_atomic32_t tail;    /* Consumer tail. */
>         } cons ODP_ALIGNED_CACHE;
>
>         /** @private Memory space of ring starts here. */
> diff --git a/platform/linux-generic/include/api/odp.h
> b/platform/linux-generic/include/api/odp.h
> index 0ee3faf..d124d52 100644
> --- a/platform/linux-generic/include/api/odp.h
> +++ b/platform/linux-generic/include/api/odp.h
> @@ -32,6 +32,7 @@ extern "C" {
>  #include <odp_barrier.h>
>  #include <odp_spinlock.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>
>  #include <odp_init.h>
>  #include <odp_system_info.h>
> diff --git a/platform/linux-generic/include/api/odp_atomic.h
> b/platform/linux-generic/include/api/odp_atomic.h
>
> index 0cc4cf4..ccaad02 100644
>
> --- a/platform/linux-generic/include/api/odp_atomic.h
> +++ b/platform/linux-generic/include/api/odp_atomic.h
> @@ -4,464 +4,494 @@
>   * SPDX-License-Identifier:     BSD-3-Clause
>   */
>
> -
>  /**
>   * @file
>   *
> - * ODP atomic operations
> + * ODP atomic types and operations, semantically a subset of C11 atomics.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Atomic functions must be used to operate on atomic variables!
>   */
>
>  #ifndef ODP_ATOMIC_H_
>  #define ODP_ATOMIC_H_
>
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +#include <odp_debug.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
>
> -
> -#include <odp_std_types.h>
> -
> -
> -/**
> - * Atomic integer
> - */
> -typedef volatile int32_t odp_atomic_int_t;
> -
> -/**
> - * Atomic unsigned integer 64 bits
> - */
> -typedef volatile uint64_t odp_atomic_u64_t;
> -
> -/**
> - * Atomic unsigned integer 32 bits
> - */
> -typedef volatile uint32_t odp_atomic_u32_t;
> -
> -
> -/**
> - * Initialize atomic integer
> - *
> - * @param ptr    An integer atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic integer
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic integer value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic integer
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
> new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic integer
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic integer
> - *
> - * @param ptr    An atomic integer variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement atomic integer by 1
> - *
> - * @param ptr    An atomic int variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Initialize atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic uint32 value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic uint32
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
> -                                       uint32_t new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be sub to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
>  /**
> - * Fetch and increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __OCTEON__
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       uint32_t ret;
> -
> -       __asm__ __volatile__ ("syncws");
> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
> -                             "r" (ptr));
> -
> -       return ret;
> -}
> -
> + * 32-bit (unsigned) atomic type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the atomic variable */
> +} odp_atomic32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
> +
> +typedef enum {
> +       /** Relaxed memory order, no ordering of other accesses enforced */
> +       ODP_MEMORDER_RLX,
> +       /** Acquire memory order, later accesses cannot move before
> +        * acquire operation */
> +       ODP_MEMORDER_ACQ,
> +       /** Release memory order, earlier accesses cannot move after
> +        * release operation */
> +       ODP_MEMORDER_RLS
> +} odp_memorder_t;
> +
>
> +/*****************************************************************************
> + * Just some private helpers
>
> +*****************************************************************************/
> +
> +#ifdef __OCTEON__
> +/* OCTEON Write Memory Barrier */
> +#define COMPILER_HW_BARRIER() __asm __volatile( \
> +       /* Double syncw to work around errata */ \
> +       "syncw\n\tsyncw" : : : )
>  #else
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> +/** Compiler and hardware full memory barrier */
> +#define COMPILER_HW_BARRIER() __sync_synchronize()
> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a
> */
>  #endif
>
> -/**
> - * Increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Atomic compare and set for 32bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> +#define MEMORY "memory"
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomics
> + * odp_atomic32_init - no return value
> + * odp_atomic32_load - return current value
> + * odp_atomic32_store - no return value
> + * odp_atomic32_cmp_xchg_weak - return bool
> + * odp_atomic32_fetch_add - return old value
> + * odp_atomic32_add - no return value
> + * odp_atomic32_fetch_inc - return old value
> + * odp_atomic32_inc - no return value
> + * odp_atomic32_fetch_dec - return old value
> + * odp_atomic32_dec - no return value
> +
> *****************************************************************************/
> +
> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable,
> thus we
> +        * can avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic load of 32-bit atomic variable
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the load
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               uint32_t val;
> +               /* Read of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) read
> variable,
> +                * thus we can avoid using compiler memory barriers */
> +               val = *(__volatile const uint32_t *)&ptr->v;
> +               return val;
> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
> +#if defined __aarch64__
> +               uint32_t val;
> +               __asm __volatile("ldar %w0, [%1]"
> +                               : "=&r"(val)
> +                               : "r"(&ptr->v)
> +                               : MEMORY);
> +               return val;
> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
> +               /* Read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               /* Herb Sutter claims HW barrier not needed on x86? */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#else
> +#warning odp_atomic32_load() may not be efficiently implemented
> +               /* Assume read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic store to 32-bit atomic variable
> + *
> + * @param ptr  Pointer to a 32-bit atomic variable
> + * @param val  Value to write to the atomic variable
> + * @param memmodel Memory model associated with the store
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
> + */
> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Write of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) write
> variable,
> +                * thus we will avoid using compiler memory barriers */
> +               *(__volatile uint32_t *)&ptr->v = val;
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
> +               /* Compiler and HW barrier to prevent earlier accesses from
> +                * moving down */
> +               COMPILER_HW_BARRIER();
> +               /* Write of aligned word is atomic */
> +               ptr->v = val;
> +               /* Compiler and HW barrier to prevent this store from
> moving
> +                * down after a later load-acquire and thus create
> overlapping
> +                * critical sections. Herb Sutter thinks this is needed */
> +               COMPILER_HW_BARRIER();
> +#elif defined __aarch64__
> +               __asm __volatile("stlr %w0, [%1]"
> +                               :
> +                               : "r"(val), "r"(&ptr->v)
> +                               : MEMORY);
> +#elif defined __x86_64__
> +               /* This is actually an atomic exchange operation */
> +               /* Generates good code on x86_64 */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#else
> +#warning odp_atomic32_store_rls() may not be efficiently implemented
> +               /* This is actually an atomic exchange operation */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +
> +/**
> + * Atomic compare and exchange (swap) of 32-bit atomic variable
> + * "Weak" semantics, may fail spuriously and must be used in a loop.
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param exp_p Pointer to expected value (updated on failure)
> + * @param val   New value to write
> + * @param       memmodel Memory model associated with the compare-and-swap
> + * operation (ODP_MEMORDER_RLX only)
> + *
> + * @return 1 (true) if exchange successful, 0 (false) if not successful
> (and
> + * '*exp_p' updated with current value)
> + */
> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
> +               uint32_t *exp_p,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               int status;
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "cmp   %0, %3\t\n"
> +                                "bne   1f\t\n"
> +                                "strex %1, %4, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "=&r"(status)
> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Clear exclusive access monitor */
> +                       __asm __volatile("clrex");
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* strex returns 0 on success */
> +               if (odp_unlikely(status != 0)) {
> +                       /* strex failed, reservation was disturbed */
> +                       /* Return potentially changed value */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __mips64__
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               uint32_t status = val;
> +               __asm __volatile("llw %0, [%2]\t\n"
> +                                "bne %0, %3, 1f\t\n"
> +                                "scw %1, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "+&r"(status)
> +                               : "r"(&ptr->v), "r"(exp)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* scw returns 1 on success, 0 on failure */
> +               if (odp_unlikely(status == 0)) {
> +                       /* scw failed, reservation was disturbed */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __x86_64__
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#else
> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic fetch and add to 32-bit atomic variable
> + * @note A - B <=> A + (-B)
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param memmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
> + *
> + * @return Value of the atomic variable before the addition
> + */
> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old_val, tmp;
> +               int status;
> +               do {
> +                       __asm __volatile("ldrex %0, [%3]\t\n"
> +                                        "add   %1, %0, %4\t\n"
> +                                        "strex %2, %1, [%3]\t\n"
> +                                       : "=&r"(old_val), "=&r"(tmp), "=&r"(status)
> +                                       : "r"(&ptr->v), "r"(incr)
> +                                       : MEMORY);
> +               } while (odp_unlikely(status != 0));
> +               return old_val;
> +#elif defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               return old_val;
> +#elif defined __x86_64__
> +               /* Generates good code on x86_64 */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#else
> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +               /* __sync_fetch_and_add() will give us barriers before and
> +                * after, we are fine with this for release operations */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
>  }
>
>  /**
> - * Initialize atomic uint64
> + * Atomic add to 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param memmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
>  {
> -       *ptr = 0;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               return;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>  }
>
>  /**
> - * Load value of atomic uint64
> - *
> - * @param ptr    An atomic variable
> + * Atomic fetch and increment of 32-bit atomic variable
>   *
> - * @return atomic uint64 value
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @note The operation is not synchronized with other threads
> + * @return Value of the atomic variable before the increment
>   */
> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return *ptr;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic increment instructions
> can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>  }
>
>  /**
> - * Store value to atomic uint64
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> + * Atomic increment of 32-bit atomic variable
>   *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
> -                                       uint64_t new_value)
> -{
> -       *ptr = new_value;
> -}
> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>
> -/**
> - * Add atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - */
> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
>  {
> -       __sync_fetch_and_add(ptr, value);
> +       /* Default to using odp_atomic32_fetch_inc() */
> +       /* Platforms that support atomic increment instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>  }
>
>  /**
> - * Fetch and add atomic uint64
> + * Atomic fetch and decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @return Value of the variable before the operation
> + * @return Value of the atomic variable before the decrement
>   */
> -
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic decrement instructions
> can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
>  #endif
> -/**
> - * Subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - */
> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
> -{
> -       __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
>  #endif
> -/**
> - * Fetch and increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u64(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       odp_atomic_fetch_add_u64(ptr, 1);
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>  }
>
>  /**
> - * Fetch and decrement atomic uint64 by 1
> + * Atomic decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u64(ptr, 1);
> -}
> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t memorder)
>
> -/**
> - * Decrement atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>  {
> -       odp_atomic_fetch_sub_u64(ptr, 1);
> +       /* Default to using odp_atomic32_fetch_dec() */
> +       /* Platforms that support atomic decrement instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>  }
>
> -/**
> - * Atomic compare and set for 64bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> -}
> +/* We are not exporting this macro */
> +#undef COMPILER_HW_BARRIER
> +#undef MEMORY
>
>  #ifdef __cplusplus
>  }
> diff --git a/platform/linux-generic/include/api/odp_barrier.h
> b/platform/linux-generic/include/api/odp_barrier.h
> index a7b3215..69b1eb8 100644
> --- a/platform/linux-generic/include/api/odp_barrier.h
> +++ b/platform/linux-generic/include/api/odp_barrier.h
> @@ -27,18 +27,18 @@ extern "C" {
>   * ODP execution barrier
>   */
>  typedef struct odp_barrier_t {
> -       int              count;  /**< @private Thread count */
> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
> +       uint32_t       num_threads;  /**< @private Thread count (constant)
> */
> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>  } odp_barrier_t;
>
>
>  /**
>   * Init barrier with thread count
>   *
> - * @param barrier    Barrier
> - * @param count      Thread count
> + * @param barrier     Barrier
> + * @param num_threads Number of threads which share the barrier
>   */
> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>
>
>  /**
> diff --git a/platform/linux-generic/include/api/odp_counter.h
> b/platform/linux-generic/include/api/odp_counter.h
> new file mode 100644
>
> index 0000000..f937d27
>
> --- /dev/null
> +++ b/platform/linux-generic/include/api/odp_counter.h
> @@ -0,0 +1,363 @@
> +/* Copyright (c) 2013, Linaro Limited
> + * All rights reserved.
> + *
> + * SPDX-License-Identifier:     BSD-3-Clause
> + */
> +
> +/**
> + * @file
> + *
> + * ODP atomic counter types and operations, suitable for e.g. shared
> statistics.
> + * Relaxed memory model assumed for lowest overhead.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Counter functions must be used to operate on counter variables!
> + */
> +
> +#ifndef ODP_COUNTER_H_
> +#define ODP_COUNTER_H_
> +
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * 32-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the counter variable */
> +} odp_counter32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
> +
> +/**
> + * 64-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint64_t v; /**< Actual storage for the counter variable */
> +       /* Room for other data structures (e.g. spin lock) that might be
> +        * needed to ensure atomicity on some architectures */
> +} odp_counter64_t
> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomic counters
> + * odp_counter32_init - returns no value
> + * odp_counter32_read - returns current value
> + * odp_counter32_write - returns no value
> + * odp_counter32_add - returns no value
> + * odp_counter32_read_inc - returns old value
> + * odp_counter32_inc - returns no value
> +
> *****************************************************************************/
> +
> +/**
> + * Initialize 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Read 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
> +{
> +       uint32_t val;
> +       /* Read of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) read variable, thus
> we
> +        * will avoid using compiler memory barriers */
> +       val = *(__volatile const uint32_t *)&ptr->v;
> +       return val;
> +}
> +
> +/**
> + * Write 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Value to write to the variable
> + */
> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable,
> thus we
> +        * will avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic add to 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "add   %0, %0, %3\t\n"
> +                                "strex %1, %0, [%2]"
> +                                : "=&r"(result), "=&r"(status)
> +                                : "r"(&ptr->v), "Ir" (incr)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +#elif defined __OCTEON__
> +       __asm __volatile("saa %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +       /* Warning odp_counter32_add() may not be efficiently implemented
> */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) of 32-bit counter variable, return original value
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%3]\t\n"
> +                                "add   %1, %0, #1\t\n"
> +                                "strex %2, %1, [%3]"
>
> +                                : "=&r"(result), "=&r"(tmp), "=&r"(status)
>
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +       return result;
> +#elif defined __OCTEON__
> +       uint32_t old_val;
> +       __asm __volatile("lai %0,(%2)"
> +                        : "=r" (old_val), "+m" (ptr)
> +                        : "r" (ptr)
> +                        : );
> +       return old_val;
> +#elif defined __x86_64__
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter32_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + */
> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
> +{
> +#if defined __OCTEON__
> +       odp_counter32_add(ptr, 1);
> +#else
> +       (void)odp_counter32_read_inc(ptr);
> +#endif
> +}
> +
>
> +/*****************************************************************************
> + * Operations on 64-bit atomic counters
> + * odp_counter64_init
> + * odp_counter64_read
> + * odp_counter64_write
> + * odp_counter64_add
> + * odp_counter64_read_inc
> + * odp_counter64_inc
> +
> *****************************************************************************/
> +
> +/**
> + * Read 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Value of the counter variable
> + */
> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t val;
> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
> +                        "clrex" /* Clear exclusive access monitor */
> +                        : "=&r"(val)
> +                        : "r"(&ptr->v)
> +                        : );
> +       return val;
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Read of aligned quad/double word is atomic */
> +       return ptr->v;
> +#else
> +/* Warning odp_counter64_read() may not be efficiently implemented */
> +       return __sync_fetch_and_or(&ptr->v, 0);
> +#endif
> +}
> +
> +/**
> + * Write 64-bit counter variable
> + *
> + * @param ptr  Pointer to a 64-bit counter variable
> + * @param val  Value to write to the counter variable
> + */
> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t val)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               /* Read counter variable exclusively so we can write to it
> +                * later */
> +               /* Attempt to write the new value */
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "strexd %1, %3, %H3, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(val)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Write of aligned quad/double word is atomic */
> +       ptr->v = val;
> +#else
> +/* Warning odp_counter64_write() may not be efficiently implemented */
> +       /* This is actually an atomic exchange operation */
> +       (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +}
> +
> +/**
> + * Initialize 64-bit counter variable
> + * Perform implementation specific initializations, assign initial value.
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       odp_counter64_write(ptr, val);
> +}
> +
> +/**
> + * Atomic add to 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, %0, %3\t\n"
> +                                "adc    %H0, %H3\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(incr)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __OCTEON__
> +       __asm __volatile("saad %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +/* Warning odp_counter64_add() may not be efficiently implemented */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable and return original value
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
> +                                "adds   %2, %0, #1\t\n"
> +                                "adc    %H2, %H0, #0\t\n"
> +                                "strexd %1, %2, %H2, [%3]"
> +                                : "=&r"(old_val), "=&r"(status),
> "=&r"(tmp)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +       return old_val;
> +#elif defined __OCTEON__
> +       uint64_t old_val;
> +       __asm __volatile("laid %0,(%2)"
> +                       : "=r" (old_val), "+m" (ptr)
> +                       : "r" (ptr)
> +                       : );
> +       return old_val;
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter64_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + */
> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, #1\t\n"
> +                                "adc    %H0, #0\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#else
> +       (void)odp_counter64_read_inc(ptr);
> +#endif
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
> b/platform/linux-generic/include/api/odp_rwlock.h
> index 252ebb2..ff8a9a2 100644
> --- a/platform/linux-generic/include/api/odp_rwlock.h
> +++ b/platform/linux-generic/include/api/odp_rwlock.h
> @@ -10,26 +10,30 @@
>  /**
>   * @file
>   *
> - * ODP RW Locks
> + * ODP read/write lock
> + * RW lock support mu
>
> ...
>
> [Message clipped]
Ola Liljedahl Nov. 5, 2014, 5:51 p.m. UTC | #12
Those are your opinions but I don't agree (except that odp_atomics.h
could and maybe should be an internal header file). You are not the one
person who decides the ODP architecture. It is supposed to be a
collaborative effort.

You are still not providing any basis for your claim that acquire/release
semantics are not meaningful for "far" atomic operations, just repeating
the same statement. To counter your baseless opinion, I claim that a far
atomic increment of the ticket lock counter (which is done when releasing a
ticket lock) would benefit from release ordering. The ticket lock next and
current variables would also benefit from not being fetched into the L1
caches of the different lock clients (and they need to be in a cache line of
their own, separate from the data).
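
To make this concrete, here is a minimal sketch of a ticket lock on top of
the new atomics API (the struct layout and function names are illustrative
only, not the actual odp_ticketlock.c code):

typedef struct {
        odp_atomic32_t next_ticket; /* Next ticket to hand out */
        odp_atomic32_t cur_ticket;  /* Ticket currently being served */
} my_ticketlock_t;

static void my_lock(my_ticketlock_t *tl)
{
        /* Taking a ticket needs no ordering of the surrounding accesses */
        uint32_t ticket = odp_atomic32_fetch_inc(&tl->next_ticket,
                                                 ODP_MEMORDER_RLX);
        /* The acquire load keeps the critical section from starting before
         * the lock is actually ours - no separate barrier needed */
        while (odp_atomic32_load(&tl->cur_ticket, ODP_MEMORDER_ACQ) != ticket)
                ; /* spin */
}

static void my_unlock(my_ticketlock_t *tl)
{
        /* The release increment makes all stores from the critical section
         * visible before the next owner sees the updated ticket */
        odp_atomic32_inc(&tl->cur_ticket, ODP_MEMORDER_RLS);
}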

As we have seen, explicit use of barriers is already creating problems.
When are barriers needed and when are they not (e.g. the multiple cases of
redundant barriers in ODP)? Both programming models may have their issues;
multithreaded programming is not trivial. But I can't see why C11/C++11-style
atomics should be more difficult to use than old-style explicit barriers.
Probably the reverse, given how many missing and redundant barriers I found
when looking at the code through my acquire/release glasses. So that is just
another unsubstantiated opinion.
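
A small illustration of the difference (the variables and functions are
illustrative; odp_sync_stores() is the existing explicit store barrier
mentioned in the patch description):

static uint32_t data;         /* some shared data */
static odp_atomic32_t ready;  /* publication flag, initialised elsewhere */

static void publish_old_style(void)
{
        data = 42;
        odp_sync_stores(); /* free-standing barrier, easy to forget/misplace */
        odp_atomic32_store(&ready, 1, ODP_MEMORDER_RLX);
}

static void publish_new_style(void)
{
        data = 42;
        /* The required ordering is part of the store operation itself */
        odp_atomic32_store(&ready, 1, ODP_MEMORDER_RLS);
}

static uint32_t consume(void)
{
        /* Pairs with the release store, keeps the read of 'data' after it */
        while (odp_atomic32_load(&ready, ODP_MEMORDER_ACQ) == 0)
                ; /* spin */
        return data;
}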

I also claim that a C11-inspired atomics API will be useful for ARMv8
implementations, which support the C11/C++11 memory model natively without
the use of explicit barriers. In a few years, programmers of multithreaded
applications will likely be used to the C11/C++11 way of doing things; this
is probably what will be taught in colleges etc. Explicit use of barriers is
architecture (and memory model) specific and thus a fragile method if you
want to write portable code.
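
For reference, the same acquire/release pattern in plain C11, which this API
is modelled on and which maps onto the ARMv8 load-acquire/store-release
instructions without any extra barriers (names are illustrative):

#include <stdatomic.h>

static atomic_uint cur_ticket;

static void unlock_c11(void)
{
        /* C11 counterpart of the ODP_MEMORDER_RLS increment */
        atomic_fetch_add_explicit(&cur_ticket, 1, memory_order_release);
}

static void wait_c11(unsigned int ticket)
{
        /* C11 counterpart of the ODP_MEMORDER_ACQ load */
        while (atomic_load_explicit(&cur_ticket, memory_order_acquire) != ticket)
                ; /* spin */
}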

Petri, kill your darlings.

-- Ola


On 5 November 2014 17:21, Savolainen, Petri (NSN - FI/Espoo) <
petri.savolainen@nsn.com> wrote:

>  The larger question is whether ODP needs to provide a lock
> _*implementation*_ abstraction - I think it does not. It should be
> sufficient for an application to use ODP synchronization services (to
> enable HW acceleration of those). If an application is really going to
> implement a lock (or lockless data structure) by itself, it can very well
> use GCC __atomics or direct assembly for that. That should be a very rare
> case and a low number of application code lines => no point in abstracting
> it through the ODP API.
>
>
>
> The current atomics API is in line with the similar definitions in the
> Linux kernel and DPDK, and fits far atomics (in-memory atomic operations)
> better than the acq/rel semantics of C11. A separate counter API could be
> reserved for HW (not ISA) based counters, which may have limitations in
> usage. The current atomics would work (in practice) anywhere in main memory.
>
>
>
> I think a C11-style API would not have benefits over the current
> (Linux/DPDK style) definition; instead it would create problems like
> deciding which combinations of the operations are valid/legal/optimal in
> each case. Also, as said, ODP should provide ready-made synchronization
> primitives (that are at a higher abstraction layer) instead of introducing
> a bunch of lock-implementation-level API calls.
>
>
>
> So, rather fix the bugs in the lock implementations (with assembly if
> needed) than introduce new APIs.
>
>
>
> -Petri
>
>
>
>
>
>
>
> *From:* ext Ola Liljedahl [mailto:ola.liljedahl@linaro.org]
> *Sent:* Wednesday, November 05, 2014 5:57 PM
> *To:* Savolainen, Petri (NSN - FI/Espoo)
> *Cc:* ext Bill Fischofer; Mike Holmes; lng-odp-forward
> *Subject:* Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11 memory
> model
>
>
>
> Fixing the bugs in the current implementation was just a side effect of
> implementing and then using the new atomics API. I didn't actually go into
> ticketlock_lock and fix the missing sync bug; it just disappeared when
> using the new C11-inspired atomic operations. My primary interest is better
> counter and atomics APIs and their corresponding implementations in ODP.
>
>
>
> I can remove all references to bugs that thus were fixed in the current
> implementation and you will never know that it was broken. Better?
>
>
>
> I can provide separate patches for the 32/64-bit counter API and for the
> atomics API (that is used to implement different higher level lock and
> synchronization primitives). I actually need to add a few more atomic
> operations for the lock-less timer implementation, and I also need proper
> 64-bit atomics (not counter) support.
>
>
>
> I can't see the point of patches that are dependent on each other. A patch
> for a new atomics API will include the header file, the implementation,
> all usages of this header file in ODP components, example apps and test
> programs.
>
>
>
> -- Ola
>
>
>
>
>
> On 5 November 2014 12:38, Savolainen, Petri (NSN - FI/Espoo) <
> petri.savolainen@nsn.com> wrote:
>
> We achieve more predictable progress if problems are identified and fixes
> suggested and approved in multiple, manageable patches rather than in large
> lumps of implementation/API rewrite. E.g. if Ola's problem is a missing
> ticket_lock release sync in the implementation (for ARM), then we'll fix
> that and not rewrite the atomics API for applications, etc.
>
>
>
> A large rewrite is just more likely to consume many review rounds and
> waste time on both sides.
>
>
>
> -Petri
>
>
>
>
>
> *From:* ext Bill Fischofer [mailto:bill.fischofer@linaro.org]
> *Sent:* Tuesday, November 04, 2014 6:30 PM
> *To:* Mike Holmes
> *Cc:* Ola Liljedahl; Savolainen, Petri (NSN - FI/Espoo); lng-odp-forward
> *Subject:* Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11 memory
> model
>
>
>
> So rather than folks spending the time to review the current patch Ola can
> spend a lot of time to break it up and then people can start looking at
> it?  That doesn't seem to be a very efficient way of working this.  We're
> at a point now where some APIs are being replaced rather than incrementally
> patched.  It's easier to do that as an (ahem) atomic function rather than
> breaking things into multiple patches that are all mutually interdependent.
>
>
>
>
> Multiple patches make sense if things are truly orthogonal.  But that's
> not always the case and some patches will be large.
>
>
>
> Bill
>
>
>
> On Tue, Nov 4, 2014 at 9:58 AM, Mike Holmes <mike.holmes@linaro.org>
> wrote:
>
> Generally we fix one problem per patch; here you solve several things at
> once, making it hard to think about the implications. Can these be
> separated?
>
>
>
> Added header file odp_counter.h with support for 32- and 64-bit atomic
> counters
> using relaxed memory order. 6 operations
> (init/read/write/add/read_inc/inc) on
> 32-bit and 64-bit counters respectively.
> Renamed odp_atomic_test to odp_counter_test and changed to use
> odp_counter.h
>
> Implementation of C11-based memory model for atomic operations. 10
> operations
> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
> in
> odp_atomic.h. The required memory ordering is now a parameter to each call
> just
> like in C11.
>
> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
> fall back to GCC __sync builtins which often include unnecessarily heavy
> barrier/sync operations (always sequentially consistent).
>
> Attempt to remove all explicit memory barriers (odp_sync_stores) from code
> that
> implements multithreaded synchronization primitives (e.g. locks, barriers).
> Rewrote such primitives to use the new atomic operations.
>
> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
> enqueue/dequeue
>
>
>
> On 4 November 2014 10:06, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>
> And what should be in each patch?
>
>
>
> On 4 November 2014 16:03, Anders Roxell <anders.roxell@linaro.org> wrote:
>
> As Petri wrote in his first email, this patch should be broken up into
> multiple patches...
>
> Cheers,
> Anders
>
> On 4 Nov 2014 15:34, "Ola Liljedahl" <ola.liljedahl@linaro.org> wrote:
>
>   Possibly odp_atomics.h should then be internal leaving only
> odp_counter.h as the only public API. The original odp_atomics.h is public
> so I left it that way.
>
>
>
> The counter API does not allow the user to specify any memory ordering,
> relaxed memory order is expected, i.e. no ordering is guaranteed.
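>
> A minimal usage sketch of the intended relaxed-order counters (the stats
> struct and the functions below are illustrative only, not part of the
> patch):
>
> static struct {
>         odp_counter64_t rx_pkts;
>         odp_counter64_t rx_drops;
> } stats;
>
> static void stats_init(void)
> {
>         odp_counter64_init(&stats.rx_pkts, 0);
>         odp_counter64_init(&stats.rx_drops, 0);
> }
>
> static void stats_update(uint64_t pkts, uint64_t drops)
> {
>         /* No ordering with other memory accesses is implied or needed */
>         odp_counter64_add(&stats.rx_pkts, pkts);
>         odp_counter64_add(&stats.rx_drops, drops);
> }
>
> static uint64_t stats_rx_total(void)
> {
>         /* Plain atomic read, no synchronization with the writers */
>         return odp_counter64_read(&stats.rx_pkts);
> }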
>
>
>
> Why does acquire/release not fit well with the far atomics? And what do
> you mean specifically by "far atomics"? Just the counter updates like
> Cavium has?
>
>
>
> As Linux kernel atomics interface predates C11/C++11 atomics support, I do
> not see it as model to follow.
>
>
>
> The patch summary contained a brief description of what I wanted to
> achieve with the patch. What more do you want, a Google Docs design
> document?
>
>
>
> -- Ola
>
>
>
> On 4 November 2014 15:22, Savolainen, Petri (NSN - FI/Espoo) <
> petri.savolainen@nsn.com> wrote:
>
> There are many things I'd change in this patch. I think it's better to
> take a step back and talk about what you are trying to achieve here, and
> then correct those things step by step. E.g. the whole idea of acquire /
> release does not fit far atomics well, and far atomics is the thing I'd
> abstract from applications with this API. Other synchronization primitives
> (such as locks) would not be implemented (too often) by applications, so
> it's not very productive to abstract that (the implementation of locks).
> E.g. the Linux kernel atomics.h looks pretty much like odp_atomic.h.
>
>
>
> -Petri
>
>
>
>
>
> *From:* lng-odp-bounces@lists.linaro.org [mailto:
> lng-odp-bounces@lists.linaro.org] *On Behalf Of *ext Ola Liljedahl
> *Sent:* Tuesday, November 04, 2014 3:49 PM
> *To:* lng-odp@lists.linaro.org
> *Subject:* Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11 memory
> model
>
>
>
> Ping!
>
>
>
> I really need this new working atomics support merged ASAP because I have
> a new lock-less implementation of the timer API which uses atomic
> operations. I haven't seen any real criticism against the content of the
> patch so there is nothing to change.
>
>
>
> -- Ola
>
>
>
>
>
> On 20 October 2014 15:07, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>
> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
> ---
> Added header file odp_counter.h with support for 32- and 64-bit atomic
> counters
> using relaxed memory order. 6 operations
> (init/read/write/add/read_inc/inc) on
> 32-bit and 64-bit counters respectively.
>
> Renamed odp_atomic_test to odp_counter_test and changed to use
> odp_counter.h
>
> Implementation of C11-based memory model for atomic operations. 10
> operations
> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
> in
> odp_atomic.h. The required memory ordering is now a parameter to each call
> just
> like in C11.
>
> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
> fall back to GCC __sync builtins which often include unnecessarily heavy
> barrier/sync operations (always sequentially consistent).
>
> Attempt to remove all explicit memory barriers (odp_sync_stores) from code
> that
> implements multithreaded synchronization primitives (e.g. locks, barriers).
> Rewrote such primitives to use the new atomic operations.
>
> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
> enqueue/dequeue
> (missing release barrier, had only compiler barrier).
>
>  .gitignore                                         |   2 +-
>  example/generator/odp_generator.c                  |  43 +-
>  example/ipsec/odp_ipsec.c                          |   2 +-
>  example/odp_example/odp_example.c                  |   2 +-
>  example/timer/odp_timer_test.c                     |   2 +-
>  helper/include/odph_ring.h                         |   8 +-
>  platform/linux-generic/include/api/odp.h           |   1 +
>  platform/linux-generic/include/api/odp_atomic.h    | 838
> +++++++++++----------
>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>  platform/linux-generic/odp_barrier.c               |  49 +-
>  platform/linux-generic/odp_buffer.c                |   3 +-
>  platform/linux-generic/odp_crypto.c                |   7 +-
>  platform/linux-generic/odp_queue.c                 |   7 +-
>  platform/linux-generic/odp_ring.c                  |  94 +--
>  platform/linux-generic/odp_rwlock.c                |  62 +-
>  platform/linux-generic/odp_thread.c                |   9 +-
>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>  platform/linux-generic/odp_timer.c                 |  22 +-
>  test/api_test/Makefile.am                          |   6 +-
>  test/api_test/odp_atomic_test.c                    | 362 ---------
>  test/api_test/odp_atomic_test.h                    |  60 --
>  test/api_test/odp_common.c                         |   1 -
>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>  delete mode 100644 test/api_test/odp_atomic_test.c
>  delete mode 100644 test/api_test/odp_atomic_test.h
>  create mode 100644 test/api_test/odp_counter_test.c
>
> diff --git a/.gitignore b/.gitignore
> index 6342e34..77db4d6 100644
> --- a/.gitignore
> +++ b/.gitignore
> @@ -35,7 +35,7 @@ build/
>  odp_example
>  odp_packet
>  odp_packet_netmap
> -odp_atomic
> +odp_counter
>  odp_shm
>  odp_ring
>  odp_timer_ping
> diff --git a/example/generator/odp_generator.c
> b/example/generator/odp_generator.c
> index eb8b340..252157d 100644
> --- a/example/generator/odp_generator.c
> +++ b/example/generator/odp_generator.c
> @@ -62,10 +62,10 @@ typedef struct {
>   * counters
>  */
>  static struct {
> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
> -       odp_atomic_u64_t ip;    /**< ip packets */
> -       odp_atomic_u64_t udp;   /**< udp packets */
> -       odp_atomic_u64_t icmp;  /**< icmp packets */
> +       odp_counter64_t seq;    /**< ip seq to be send */
> +       odp_counter64_t ip;     /**< ip packets */
> +       odp_counter64_t udp;    /**< udp packets */
> +       odp_counter64_t icmp;   /**< icmp packets */
>  } counters;
>
>  /** * Thread specific arguments
> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_UDPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_UDP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
> ODPH_ICMPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_ICMP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>                 }
>
>                 if (args->appl.interval != 0) {
> +                       uint64_t seq = odp_counter64_read(&counters.seq);
>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
> -                              thr, counters.seq, counters.seq%0xffff);
> +                              thr, seq, seq%0xffff);
>                         /* TODO use odp timer */
>                         usleep(args->appl.interval * 1000);
>                 }
> -               if (args->appl.number != -1 && counters.seq
> -                   >= (unsigned int)args->appl.number) {
> +               if (args->appl.number != -1 &&
> +                   odp_counter64_read(&counters.seq) >=
> +                   (unsigned int)args->appl.number) {
>                         break;
>                 }
>         }
> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>         /* receive number of reply pks until timeout */
>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>                 while (args->appl.timeout >= 0) {
> -                       if (counters.icmp >= (unsigned
> int)args->appl.number)
> +                       if (odp_counter64_read(&counters.icmp) >=
> +                           (unsigned int)args->appl.number)
>                                 break;
>                         /* TODO use odp timer */
>                         sleep(1);
> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>
>         /* print info */
>         if (args->appl.mode == APPL_MODE_UDP) {
> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
> +               printf("  [%02i] total send: %ju\n", thr,
> +                      odp_counter64_read(&counters.seq));
>         } else if (args->appl.mode == APPL_MODE_PING) {
>                 printf("  [%02i] total send: %ju total receive: %ju\n",
> -                      thr, counters.seq, counters.icmp);
> +                      thr, odp_counter64_read(&counters.seq),
> +                      odp_counter64_read(&counters.icmp));
>         }
>         return arg;
>  }
> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                 if (!odp_packet_inflag_ipv4(pkt))
>                         continue;
>
> -               odp_atomic_inc_u64(&counters.ip);
> +               odp_counter64_inc(&counters.ip);
>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>
>                 /* udp */
>                 if (ip->proto == ODPH_IPPROTO_UDP) {
> -                       odp_atomic_inc_u64(&counters.udp);
> +                       odp_counter64_inc(&counters.udp);
>                         udp = (odph_udphdr_t *)(buf + offset);
>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>                                         odp_be_to_cpu_16(udp->length) -
> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
> pkt_tbl[], unsigned len)
>                         icmp = (odph_icmphdr_t *)(buf + offset);
>                         /* echo reply */
>                         if (icmp->type == ICMP_ECHOREPLY) {
> -                               odp_atomic_inc_u64(&counters.icmp);
> +                               odp_counter64_inc(&counters.icmp);
>                                 memcpy(&tvsend, buf + offset +
> ODPH_ICMPHDR_LEN,
>                                        sizeof(struct timeval));
>                                 /* TODO This should be changed to use an
> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>         }
>
>         /* init counters */
> -       odp_atomic_init_u64(&counters.seq);
> -       odp_atomic_init_u64(&counters.ip);
> -       odp_atomic_init_u64(&counters.udp);
> -       odp_atomic_init_u64(&counters.icmp);
> +       odp_counter64_init(&counters.seq, 0);
> +       odp_counter64_init(&counters.ip, 0);
> +       odp_counter64_init(&counters.udp, 0);
> +       odp_counter64_init(&counters.icmp, 0);
>
>         /* Reserve memory for args from shared mem */
>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
> index 2f2dc19..76c27d0 100644
> --- a/example/ipsec/odp_ipsec.c
> +++ b/example/ipsec/odp_ipsec.c
> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>         printf("Num worker threads: %i\n", num_workers);
>
>         /* Create a barrier to synchronize thread startup */
> -       odp_barrier_init_count(&sync_barrier, num_workers);
> +       odp_barrier_init(&sync_barrier, num_workers);
>
>         /*
>          * By default core #0 runs Linux kernel background tasks.
> diff --git a/example/odp_example/odp_example.c
> b/example/odp_example/odp_example.c
> index 0e9aa3d..c473395 100644
> --- a/example/odp_example/odp_example.c
> +++ b/example/odp_example/odp_example.c
> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>         odp_shm_print_all();
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&globals->barrier, num_workers);
> +       odp_barrier_init(&globals->barrier, num_workers);
>
>         if (args.proc_mode) {
>                 int ret;
> diff --git a/example/timer/odp_timer_test.c
> b/example/timer/odp_timer_test.c
> index 78b2ae2..dfbeae9 100644
> --- a/example/timer/odp_timer_test.c
> +++ b/example/timer/odp_timer_test.c
> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>         printf("\n");
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&test_barrier, num_workers);
> +       odp_barrier_init(&test_barrier, num_workers);
>
>         /* Create and launch worker threads */
>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
> index 76c1db8..5e78b34 100644
> --- a/helper/include/odph_ring.h
> +++ b/helper/include/odph_ring.h
> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>                 uint32_t sp_enqueue;     /* True, if single producer. */
>                 uint32_t size;           /* Size of ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Producer head. */
> -               uint32_t tail;          /* Producer tail. */
> +               odp_atomic32_t head;    /* Producer head. */
> +               odp_atomic32_t tail;    /* Producer tail. */
>         } prod ODP_ALIGNED_CACHE;
>
>         /** @private Consumer */
> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>                 uint32_t size;           /* Size of the ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Consumer head. */
> -               uint32_t tail;          /* Consumer tail. */
> +               odp_atomic32_t head;    /* Consumer head. */
> +               odp_atomic32_t tail;    /* Consumer tail. */
>         } cons ODP_ALIGNED_CACHE;
>
>         /** @private Memory space of ring starts here. */
> diff --git a/platform/linux-generic/include/api/odp.h
> b/platform/linux-generic/include/api/odp.h
> index 0ee3faf..d124d52 100644
> --- a/platform/linux-generic/include/api/odp.h
> +++ b/platform/linux-generic/include/api/odp.h
> @@ -32,6 +32,7 @@ extern "C" {
>  #include <odp_barrier.h>
>  #include <odp_spinlock.h>
>  #include <odp_atomic.h>
> +#include <odp_counter.h>
>
>  #include <odp_init.h>
>  #include <odp_system_info.h>
> diff --git a/platform/linux-generic/include/api/odp_atomic.h
> b/platform/linux-generic/include/api/odp_atomic.h
>
> index 0cc4cf4..ccaad02 100644
>
> --- a/platform/linux-generic/include/api/odp_atomic.h
> +++ b/platform/linux-generic/include/api/odp_atomic.h
> @@ -4,464 +4,494 @@
>   * SPDX-License-Identifier:     BSD-3-Clause
>   */
>
> -
>  /**
>   * @file
>   *
> - * ODP atomic operations
> + * ODP atomic types and operations, semantically a subset of C11 atomics.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Atomic functions must be used to operate on atomic variables!
>   */
>
>  #ifndef ODP_ATOMIC_H_
>  #define ODP_ATOMIC_H_
>
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +#include <odp_debug.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
>
> -
> -#include <odp_std_types.h>
> -
> -
> -/**
> - * Atomic integer
> - */
> -typedef volatile int32_t odp_atomic_int_t;
> -
> -/**
> - * Atomic unsigned integer 64 bits
> - */
> -typedef volatile uint64_t odp_atomic_u64_t;
> -
> -/**
> - * Atomic unsigned integer 32 bits
> - */
> -typedef volatile uint32_t odp_atomic_u32_t;
> -
> -
> -/**
> - * Initialize atomic integer
> - *
> - * @param ptr    An integer atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic integer
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic integer value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic integer
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
> new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic integer
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic integer
> - *
> - * @param ptr    An atomic integer variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
> value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement atomic integer by 1
> - *
> - * @param ptr    An atomic int variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Initialize atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic uint32 value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
> -{
> -       return *ptr;
> -}
> -
> -/**
> - * Store value to atomic uint32
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
> -                                       uint32_t new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be sub to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
>  /**
> - * Fetch and increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __OCTEON__
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       uint32_t ret;
> -
> -       __asm__ __volatile__ ("syncws");
> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
> -                             "r" (ptr));
> -
> -       return ret;
> -}
> -
> + * 32-bit (unsigned) atomic type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the atomic variable */
> +} odp_atomic32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
> +
> +typedef enum {
> +       /** Relaxed memory order, no ordering of other accesses enforced */
> +       ODP_MEMORDER_RLX,
> +       /** Acquire memory order, later accesses cannot move before
> +        * acquire operation */
> +       ODP_MEMORDER_ACQ,
> +       /** Release memory order, earlier accesses cannot move after
> +        * release operation */
> +       ODP_MEMORDER_RLS
> +} odp_memorder_t;
> +
>
> +/*****************************************************************************
> + * Just some private helpers
>
> +*****************************************************************************/
> +
> +#ifdef __OCTEON__
> +/* OCTEON Write Memory Barrier */
> +#define COMPILER_HW_BARRIER() __asm __volatile( \
> +       /* Double syncw to work around errata */ \
> +       "syncw\n\tsyncw" : : : )
>  #else
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> +/** Compiler and hardware full memory barrier */
> +#define COMPILER_HW_BARRIER() __sync_synchronize()
> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a
> */
>  #endif
>
> -/**
> - * Increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_sub_u32(ptr, 1);
> -}
> -
> -/**
> - * Atomic compare and set for 32bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> +#define MEMORY "memory"
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomics
> + * odp_atomic32_init - no return value
> + * odp_atomic32_load - return current value
> + * odp_atomic32_store - no return value
> + * odp_atomic32_cmp_xchg_weak - return bool
> + * odp_atomic32_fetch_add - return old value
> + * odp_atomic32_add - no return value
> + * odp_atomic32_fetch_inc - return old value
> + * odp_atomic32_inc - no return value
> + * odp_atomic32_fetch_dec - return old value
> + * odp_atomic32_dec - no return value
> +
> *****************************************************************************/
> +
> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable,
> thus we
> +        * can avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic load of 32-bit atomic variable
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the load
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               uint32_t val;
> +               /* Read of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) read
> variable,
> +                * thus we can avoid using compiler memory barriers */
> +               val = *(__volatile const uint32_t *)&ptr->v;
> +               return val;
> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
> +#if defined __aarch64__
> +               uint32_t val;
> +               __asm __volatile("ldar %w0, [%1]"
> +                               : "=&r"(val)
> +                               : "r"(&ptr->v)
> +                               : MEMORY);
> +               return val;
> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
> +               /* Read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               /* Herb Sutter claims HW barrier not needed on x86? */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#else
> +#warning odp_atomic32_load() may not be efficiently implemented
> +               /* Assume read of aligned word is atomic */
> +               uint32_t val = ptr->v;
> +               /* To prevent later accesses from moving up */
> +               COMPILER_HW_BARRIER();
> +               return val;
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic store to 32-bit atomic variable
> + *
> + * @param ptr  Pointer to a 32-bit atomic variable
> + * @param val  Value to write to the atomic variable
> + * @param memmodel Memory model associated with the store
> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
> + */
> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Write of aligned word is atomic */
> +               /* Cast to volatile to force compiler to (re-) write
> variable,
> +                * thus we will avoid using compiler memory barriers */
> +               *(__volatile uint32_t *)&ptr->v = val;
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
> +               /* Compiler and HW barrier to prevent earlier accesses from
> +                * moving down */
> +               COMPILER_HW_BARRIER();
> +               /* Write of aligned word is atomic */
> +               ptr->v = val;
> +               /* Compiler and HW barrier to prevent this store from
> moving
> +                * down after a later load-acquire and thus create
> overlapping
> +                * critical sections. Herb Sutter thinks this is needed */
> +               COMPILER_HW_BARRIER();
> +#elif defined __aarch64__
> +               __asm __volatile("stlr %w0, [%1]"
> +                               :
> +                               : "r"(val), "r"(&ptr->v)
> +                               : MEMORY);
> +#elif defined __x86_64__
> +               /* This is actually an atomic exchange operation */
> +               /* Generates good code on x86_64 */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#else
> +#warning odp_atomic32_store_rls() may not be efficiently implemented
> +               /* This is actually an atomic exchange operation */
> +               (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +
> +/**
> + * Atomic compare and exchange (swap) of 32-bit atomic variable
> + * "Weak" semantics, may fail spuriously and must be used in a loop.
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param exp_p Pointer to expected value (updated on failure)
> + * @param val   New value to write
> + * @param       memmodel Memory model associated with the compare-and-swap
> + * operation (ODP_MEMORDER_RLX only)
> + *
> + * @return 1 (true) if exchange successful, 0 (false) if not successful
> (and
> + * '*exp_p' updated with current value)
> + */
> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
> +               uint32_t *exp_p,
> +               uint32_t val,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               int status;
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "cmp   %0, %3\t\n"
> +                                "bne   1f\t\n"
> +                                "strex %1, %4, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "=&r"(status)
> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Clear exclusive access monitor */
> +                       __asm __volatile("clrex");
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* strex returns 0 on success */
> +               if (odp_unlikely(status != 0)) {
> +                       /* strex failed, reservation was disturbed */
> +                       /* Return potentially changed value */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __mips64__
> +               uint32_t old;
> +               uint32_t exp = *exp_p;
> +               uint32_t status = val;
> +               __asm __volatile("llw %0, [%2]\t\n"
> +                                "bne %0, %3, 1f\t\n"
> +                                "scw %1, [%2]\t\n"
> +                                "1:\t\n"
> +                               : "=&r"(old), "+&r"(status)
> +                               : "r"(&ptr->v), "r"(exp)
> +                               : MEMORY);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Return current value */
> +                       *exp_p = old;
> +                       return 0;
> +               }
> +               /* scw returns 1 on success, 0 on failure */
> +               if (odp_unlikely(status == 0)) {
> +                       /* scw failed, reservation was disturbed */
> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
> +                       return 0;
> +               }
> +               return 1;
> +#elif defined __x86_64__
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#else
> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
> +               uint32_t exp = *exp_p;
> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
> val);
> +               if (odp_unlikely(old != exp)) {
> +                       /* Return the unexpected content of '*ptr' */
> +                       *exp_p = old;
> +                       return 0;
> +               } else {
> +                       return 1;
> +               }
> +#endif
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
> +}
> +
> +/**
> + * Atomic fetch and add to 32-bit atomic variable
> + * @note A - B <=> A + (-B)
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param memmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
> + *
> + * @return Value of the atomic variable before the addition
> + */
> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
> +{
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +#if defined __arm__ /* A32/T32 ISA */
> +               uint32_t old_val, tmp;
> +               int status;
> +               do {
> +                       __asm __volatile("ldrex %0, [%3]\t\n"
> +                                        "add   %1, %0, %4\t\n"
> +                                        "strex %2, %1, [%3]\t\n"
> +                                       : "=&r"(old_val), "=&r"(tmp),
> +                                         "=&r"(status)
> +                                       : "r"(&ptr->v), "r"(incr)
> +                                       : MEMORY);
> +               } while (odp_unlikely(status != 0));
> +               return old_val;
> +#elif defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               return old_val;
> +#elif defined __x86_64__
> +               /* Generates good code on x86_64 */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#else
> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("laa %0,(%2),%3"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr), "r" (incr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +               /* __sync_fetch_and_add() will give us barriers before and
> +                * after, we are fine with this for release operations */
> +               return __sync_fetch_and_add(&ptr->v, incr);
> +       } else {
> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
> +       }
>  }
>
>  /**
> - * Initialize atomic uint64
> + * Atomic add to 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + * @param memmodel Memory model associated with the add
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
> +               uint32_t incr,
> +               odp_memorder_t mmodel)
>  {
> -       *ptr = 0;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               return;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +               /* Platforms that support atomic add instructions can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("saa %[inc], (%[base])"
> +                               : "+m" (*ptr)
> +                               : [inc] "r" (incr), [base] "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>  }
>
>  /**
> - * Load value of atomic uint64
> - *
> - * @param ptr    An atomic variable
> + * Atomic fetch and increment of 32-bit atomic variable
>   *
> - * @return atomic uint64 value
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @note The operation is not synchronized with other threads
> + * @return Value of the atomic variable before the increment
>   */
> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return *ptr;
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic increment instructions
> can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
> +#endif
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lai %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
> +#endif
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>  }
>
>  /**
> - * Store value to atomic uint64
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> + * Atomic increment of 32-bit atomic variable
>   *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the increment
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
> -                                       uint64_t new_value)
> -{
> -       *ptr = new_value;
> -}
> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>
> -/**
> - * Add atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - */
> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
>  {
> -       __sync_fetch_and_add(ptr, value);
> +       /* Default to using odp_atomic32_fetch_inc() */
> +       /* Platforms that support atomic increment instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>  }
>
>  /**
> - * Fetch and add atomic uint64
> + * Atomic fetch and decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   *
> - * @return Value of the variable before the operation
> + * @return Value of the atomic variable before the decrement
>   */
> -
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t mmodel)
>  {
> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> +       if (mmodel == ODP_MEMORDER_RLX) {
> +               /* Platforms that support atomic decrement instructions
> can add
> +                * their implementations here */
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               return old_val;
>  #endif
> -/**
> - * Subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - */
> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
> value)
> -{
> -       __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> -#else
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> +       } else if (mmodel == ODP_MEMORDER_RLS) {
> +#if defined __OCTEON__
> +               uint32_t old_val;
> +               COMPILER_HW_BARRIER();
> +               __asm __volatile("lad %0,(%2)"
> +                               : "=r" (old_val), "+m" (ptr)
> +                               : "r" (ptr)
> +                               : MEMORY);
> +               COMPILER_HW_BARRIER();
> +               return old_val;
>  #endif
> -/**
> - * Fetch and increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u64(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       odp_atomic_fetch_add_u64(ptr, 1);
> +       }
> +       /* Default to using odp_atomic32_fetch_add() */
> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>  }
>
>  /**
> - * Fetch and decrement atomic uint64 by 1
> + * Atomic decrement of 32-bit atomic variable
>   *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param memmodel Memory model associated with the decrement
> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>   */
> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u64(ptr, 1);
> -}
> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
> +               odp_memorder_t memorder)
>
> -/**
> - * Decrement atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>  {
> -       odp_atomic_fetch_sub_u64(ptr, 1);
> +       /* Default to using odp_atomic32_fetch_dec() */
> +       /* Platforms that support atomic decrement instructions can add
> +        * their implementations here */
> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>  }
>
> -/**
> - * Atomic compare and set for 64bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> -}
> +/* We are not exporting this macro */
> +#undef COMPILER_HW_BARRIER
> +#undef MEMORY
>
>  #ifdef __cplusplus
>  }
> diff --git a/platform/linux-generic/include/api/odp_barrier.h
> b/platform/linux-generic/include/api/odp_barrier.h
> index a7b3215..69b1eb8 100644
> --- a/platform/linux-generic/include/api/odp_barrier.h
> +++ b/platform/linux-generic/include/api/odp_barrier.h
> @@ -27,18 +27,18 @@ extern "C" {
>   * ODP execution barrier
>   */
>  typedef struct odp_barrier_t {
> -       int              count;  /**< @private Thread count */
> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
> +       uint32_t       num_threads;  /**< @private Thread count (constant)
> */
> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>  } odp_barrier_t;
>
>
>  /**
>   * Init barrier with thread count
>   *
> - * @param barrier    Barrier
> - * @param count      Thread count
> + * @param barrier     Barrier
> + * @param num_threads Number of threads which share the barrier
>   */
> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>
>
>  /**
> diff --git a/platform/linux-generic/include/api/odp_counter.h
> b/platform/linux-generic/include/api/odp_counter.h
> new file mode 100644
> index 0000000..f937d27
> --- /dev/null
> +++ b/platform/linux-generic/include/api/odp_counter.h
> @@ -0,0 +1,363 @@
> +/* Copyright (c) 2013, Linaro Limited
> + * All rights reserved.
> + *
> + * SPDX-License-Identifier:     BSD-3-Clause
> + */
> +
> +/**
> + * @file
> + *
> + * ODP atomic counter types and operations, suitable for e.g. shared
> statistics.
> + * Relaxed memory model assumed for lowest overhead.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Counter functions must be used to operate on counter variables!
> + */
> +
> +#ifndef ODP_COUNTER_H_
> +#define ODP_COUNTER_H_
> +
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * 32-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the counter variable */
> +} odp_counter32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
> +
> +/**
> + * 64-bit (unsigned) atomic counter type
> + */
> +typedef struct {
> +       uint64_t v; /**< Actual storage for the counter variable */
> +       /* Room for other data structures (e.g. spin lock) that might be
> +        * needed to ensure atomicity on some architectures */
> +} odp_counter64_t
> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
> +
>
> +/*****************************************************************************
> + * Operations on 32-bit atomic counters
> + * odp_counter32_init - returns no value
> + * odp_counter32_read - returns current value
> + * odp_counter32_write - returns no value
> + * odp_counter32_add - returns no value
> + * odp_counter32_read_inc - returns old value
> + * odp_counter32_inc - returns no value
> +
> *****************************************************************************/
> +
> +/**
> + * Initialize 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Read 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Value of the variable
> + */
> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
> +{
> +       uint32_t val;
> +       /* Read of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) read variable, thus
> we
> +        * will avoid using compiler memory barriers */
> +       val = *(__volatile const uint32_t *)&ptr->v;
> +       return val;
> +}
> +
> +/**
> + * Write 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param val   Value to write to the variable
> + */
> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t val)
> +{
> +       /* Write of aligned word is atomic */
> +       /* Cast to volatile to force compiler to (re-) write variable,
> thus we
> +        * will avoid using compiler memory barriers */
> +       *(__volatile uint32_t *)&ptr->v = val;
> +}
> +
> +/**
> + * Atomic add to 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%2]\t\n"
> +                                "add   %0, %0, %3\t\n"
> +                                "strex %1, %0, [%2]"
> +                                : "=&r"(result), "=&r"(status)
> +                                : "r"(&ptr->v), "Ir" (incr)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +#elif defined __OCTEON__
> +       __asm __volatile("saa %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +       /* Warning odp_counter32_add() may not be efficiently implemented
> */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) of 32-bit counter variable, return original value
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t result, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%3]\t\n"
> +                                "add   %1, %0, #1\t\n"
> +                                "strex %2, %1, [%3]"
> +                                : "=&r"(result), "=&r"(tmp), "=&r"(status)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0));
> +       return result;
> +#elif defined __OCTEON__
> +       uint32_t old_val;
> +       __asm __volatile("lai %0,(%2)"
> +                        : "=r" (old_val), "+m" (ptr)
> +                        : "r" (ptr)
> +                        : );
> +       return old_val;
> +#elif defined __x86_64__
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter32_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 32-bit counter variable
> + *
> + * @param ptr   Pointer to a 32-bit counter variable
> + */
> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
> +{
> +#if defined __OCTEON__
> +       odp_counter32_add(ptr, 1);
> +#else
> +       (void)odp_counter32_read_inc(ptr);
> +#endif
> +}
> +
>
> +/*****************************************************************************
> + * Operations on 64-bit atomic counters
> + * odp_counter64_init
> + * odp_counter64_read
> + * odp_counter64_write
> + * odp_counter64_add
> + * odp_counter64_read_inc
> + * odp_counter64_inc
> +
> *****************************************************************************/
> +
> +/**
> + * Read 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Value of the counter variable
> + */
> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t val;
> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
> +                        "clrex" /* Clear exclusive access monitor */
> +                        : "=&r"(val)
> +                        : "r"(&ptr->v)
> +                        : );
> +       return val;
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Read of aligned quad/double word is atomic */
> +       return ptr->v;
> +#else
> +/* Warning odp_counter64_read() may not be efficiently implemented */
> +       return __sync_fetch_and_or(&ptr->v, 0);
> +#endif
> +}
> +
> +/**
> + * Write 64-bit counter variable
> + *
> + * @param ptr  Pointer to a 64-bit counter variable
> + * @param val  Value to write to the counter variable
> + */
> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t val)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               /* Read counter variable exclusively so we can write to it
> +                * later */
> +               /* Attempt to write the new value */
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "strexd %1, %3, %H3, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(val)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __x86_64__ || defined __aarch64__
> +       /* Write of aligned quad/double word is atomic */
> +       ptr->v = val;
> +#else
> +/* Warning odp_counter64_write() may not be efficiently implemented */
> +       /* This is actually an atomic exchange operation */
> +       (void)__sync_lock_test_and_set(&ptr->v, val);
> +#endif
> +}
> +
> +/**
> + * Initialize 64-bit counter variable
> + * Perform implementation specific initializations, assign initial value.
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param val   Initial value
> + */
> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t val)
> +{
> +       /* No implementation requires any other type of initialization */
> +       odp_counter64_write(ptr, val);
> +}
> +
> +/**
> + * Atomic add to 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + * @param incr  The value to be added to the counter variable
> + */
> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, %0, %3\t\n"
> +                                "adc    %H0, %H3\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v), "r"(incr)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#elif defined __OCTEON__
> +       __asm __volatile("saad %[inc], (%[base])"
> +                        : "+m" (*ptr)
> +                        : [inc] "r" (incr), [base] "r" (ptr)
> +                        : );
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#else
> +/* Warning odp_counter64_add() may not be efficiently implemented */
> +       (void)__sync_fetch_and_add(&ptr->v, incr);
> +#endif
> +}
> +
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable and return original value
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + *
> + * @return Original value of counter
> + */
> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val, tmp;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
> +                                "adds   %2, %0, #1\t\n"
> +                                "adc    %H2, %H0, #0\t\n"
> +                                "strexd %1, %2, %H2, [%3]"
> +                                : "=&r"(old_val), "=&r"(status),
> "=&r"(tmp)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +       return old_val;
> +#elif defined __OCTEON__
> +       uint64_t old_val;
> +       __asm __volatile("laid %0,(%2)"
> +                       : "=r" (old_val), "+m" (ptr)
> +                       : "r" (ptr)
> +                       : );
> +       return old_val;
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#else
> +/* Warning odp_counter64_read_inc() may not be efficiently implemented */
> +       return __sync_fetch_and_add(&ptr->v, 1);
> +#endif
> +}
> +
> +/**
> + * Atomic increment (+1) 64-bit counter variable
> + *
> + * @param ptr   Pointer to a 64-bit counter variable
> + */
> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
> +                                "adds   %0, #1\t\n"
> +                                "adc    %H0, #0\t\n"
> +                                "strexd %1, %0, %H0, [%2]"
> +                                : "=&r"(old_val), "=&r"(status)
> +                                : "r"(&ptr->v)
> +                                : );
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
> */
> +#else
> +       (void)odp_counter64_read_inc(ptr);
> +#endif
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
> b/platform/linux-generic/include/api/odp_rwlock.h
> index 252ebb2..ff8a9a2 100644
> --- a/platform/linux-generic/include/api/odp_rwlock.h
> +++ b/platform/linux-generic/include/api/odp_rwlock.h
> @@ -10,26 +10,30 @@
>  /**
>   * @file
>   *
> - * ODP RW Locks
> + * ODP read/write lock
> + * RW lock support mu
>
> ...
>
> [Message clipped]
>
>
>
>
>
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/lng-odp
>
vkamensky Nov. 5, 2014, 6:46 p.m. UTC | #13
Hi Ola,

I tried to look at your patch the first time you posted it, and I just
tried to look at it again. Sorry, it is too big and too many
things happen in the patch. I cannot hold that many things
in my mind.

If you really want it to be reviewed it has to be split into pieces,
as many people have repeatedly asked already.

Here are suggestions for how one could potentially split the patch:

o introduce counter API in header file with doxygen description
o introduce default implementation (I guess armv8 currently falls
into default)
o for each CPU type, counter API implementation in separate patch
o test for counters in separate patch
o replace atomics with counters in all places where relevant
o introduce new atomic APIs with memory model parameter
o default implementation of atomic APIs
o for each CPU type, atomic with memory model implementation in
separate patch
o test for new atomics (seems missing from the patch)
o change code where new atomic APIs are used
o ...

Thanks,
Victor


On 5 November 2014 09:51, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
> Those are your opinions but I don't agree (except that odp_atomics.h
> could and maybe should be an internal header file). You are not the one
> person who decides the ODP architecture. It is supposed to be a
> collaborative effort.
>
> You are still not providing any basis for your claim the acquire/release
> semantics are not meaningful for "far" atomic operations, just repeating the
> same statement. To counter your baseless opinion, I claim that a far atomic
> increment of the ticket lock counter (which is done when releasing a ticket
> lock) would benefit from release ordering. The ticket lock next and current
> variables could benefit from not being fetched into L1 cache of the
> different lock clients (and they need to be in a cache line of their own,
> separate from the data).
>
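> To make this concrete, here is a minimal sketch of a ticket lock release
> built on the odp_atomic32_add() from this patch (the function and field
> names below are invented for illustration, this is not the linux-generic
> code):
>
>     /* Hypothetical ticket lock release; 'cur' is the "now serving" counter */
>     static inline void my_ticketlock_unlock(odp_atomic32_t *cur)
>     {
>             /* Release ordering: all stores made inside the critical section
>              * become visible before the updated counter can be observed,
>              * so no separate odp_sync_stores() barrier is needed */
>             odp_atomic32_add(cur, 1, ODP_MEMORDER_RLS);
>     }
>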
> As we have seen, explicit use of barriers is already creating problems. When
> are barriers needed and when are they not needed (e.g. multiple cases of
> redundant barriers in ODP)? Both programming models may have their issues;
> multithreaded programming is not trivial. I can't see why C11/C++11 style
> atomics should be more difficult to use than old-style use of barriers.
> Probably the reverse, as I detected so many missing and redundant barriers
> when looking at the code using my acquire/release glasses. So just another
> unsubstantiated opinion.
>
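> For example, with explicit barriers a lock release typically looks like
> the first variant below, and correctness depends on remembering to place
> the barrier; with C11-style (GCC __atomic) builtins the ordering is a
> property of the store itself. This is only a sketch, 'lock->owner' and
> 'FREE' are made-up names:
>
>     /* Old style: a separate write barrier before the unlocking store */
>     odp_sync_stores();
>     lock->owner = FREE;         /* the unlocking store itself */
>
>     /* C11 style: release ordering attached to the store */
>     __atomic_store_n(&lock->owner, FREE, __ATOMIC_RELEASE);
>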
> I also claim that a C11 atomics inspired API will be useful for ARMv8
> implementations that support C11/C++11 memory models natively without the
> use of explicit barriers. In a few years, programmers of multithreaded
> applications will likely become used to the C11/C++11 way of doing things;
> this is probably what they will teach in colleges etc. Explicit use of
> barriers is architecture (and memory model) specific and thus a fragile
> method if you want to write portable code.
>
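> As a small illustration, a C11-style release store such as
>
>     __atomic_store_n(&ptr->v, val, __ATOMIC_RELEASE);
>
> compiles on ARMv8 to a single stlr instruction - the same instruction
> this patch emits by hand for __aarch64__ - with no separate dmb barrier.
>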
> Petri, kill your darlings.
>
> -- Ola
>
>
> On 5 November 2014 17:21, Savolainen, Petri (NSN - FI/Espoo)
> <petri.savolainen@nsn.com> wrote:
>>
>> The larger question is whether ODP needs to provide a lock _implementation_
>> abstraction - I think it does not. It should be sufficient for an application
>> to use ODP synchronization services (to enable HW acceleration of those). If
>> an application really is going to implement a lock (or lockless data structure)
>> by itself, it can very well use GCC __atomics or direct assembly for that.
>> That should be a very rare case with a low number of application code lines,
>> so there is no point in abstracting it through the ODP API.
>>
>>
>>
>> The current atomics API is in line with similar definitions in the Linux
>> kernel and DPDK, and fits far atomics (in-memory atomic operations) better
>> than the acq/rel semantics of C11. A separate counter API could be reserved
>> for HW (not ISA) based counters, which may have limitations in usage. The
>> current atomics would work (in practice) anywhere in main memory.
>>
>>
>>
>> I think a C11-style API would not have benefits over the current (Linux/DPDK
>> style) definition; instead it would create problems such as which
>> combinations of the operations are valid/legal/optimal in each case. Also, as
>> said, ODP should provide ready-made synchronization primitives (that are on a
>> higher abstraction layer) instead of introducing a bunch of lock
>> implementation level API calls.
>>
>>
>>
>> So, rather fix the bugs in the lock implementation (with assembly if needed)
>> than introduce new APIs.
>>
>>
>>
>> -Petri
>>
>>
>>
>>
>>
>>
>>
>> From: ext Ola Liljedahl [mailto:ola.liljedahl@linaro.org]
>> Sent: Wednesday, November 05, 2014 5:57 PM
>> To: Savolainen, Petri (NSN - FI/Espoo)
>> Cc: ext Bill Fischofer; Mike Holmes; lng-odp-forward
>> Subject: Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11 memory
>> model
>>
>>
>>
>> Fixing the bugs in the current implementation was just a side effect of
>> implementing and then using the new atomics API. I didn't actually go into
>> ticketlock_lock and fix the missing sync bug; it just disappeared when using
>> the new C11-inspired atomic operations. My primary interest is better
>> counter and atomics APIs and their corresponding implementations in ODP.
>>
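>> For reference, the shape of the code where the missing acquire shows up is
>> the wait loop of the lock operation: with C11-style operations the acquire
>> is attached to the load that ends the spin. A sketch using the GCC __atomic
>> builtins (not the actual patch code; 'next', 'cur' and odp_spin() are just
>> illustrative names):
>>
>>     uint32_t ticket = __atomic_fetch_add(&lock->next, 1, __ATOMIC_RELAXED);
>>     /* The load that observes "now serving" must be an acquire, otherwise
>>      * loads inside the critical section could be hoisted above it */
>>     while (__atomic_load_n(&lock->cur, __ATOMIC_ACQUIRE) != ticket)
>>             odp_spin();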
>>
>>
>> I can remove all references to bugs that thus were fixed in the current
>> implementation and you will never know that it was broken. Better?
>>
>>
>>
>> I can provide separate patches for the 32/64-bit counter API and for the
>> atomics API (that is used to implement different higher level lock and
>> synchronization primitives). I actually need to add a few more atomic
>> operations for the lock-less timer implementation, and I also need proper
>> 64-bit atomics (not counter) support.
>>
>>
>>
>> I can't see any point in patches that are dependent on each other. A
>> patch for the new atomics API will include the header file, the implementation,
>> and all usages of this header file in ODP components, example apps and test
>> programs.
>>
>>
>>
>> -- Ola
>>
>>
>>
>>
>>
>> On 5 November 2014 12:38, Savolainen, Petri (NSN - FI/Espoo)
>> <petri.savolainen@nsn.com> wrote:
>>
>> We achieve more predictable progress if problems are identified and fixes
>> suggested and approved in multiple, manageable patches rather than in large
>> lumps of implementation/API rewrite. E.g. if Ola's problem is a missing
>> ticket_lock release sync in the implementation (for ARM), then we'll fix
>> that and not rewrite the atomics API for applications, etc.
>>
>>
>>
>> A large rewrite is just more likely to consume many review rounds and
>> waste time on both sides.
>>
>>
>>
>> -Petri
>>
>>
>>
>>
>>
>> From: ext Bill Fischofer [mailto:bill.fischofer@linaro.org]
>> Sent: Tuesday, November 04, 2014 6:30 PM
>> To: Mike Holmes
>> Cc: Ola Liljedahl; Savolainen, Petri (NSN - FI/Espoo); lng-odp-forward
>> Subject: Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11 memory
>> model
>>
>>
>>
>> So rather than folks spending the time to review the current patch, Ola
>> should spend a lot of time breaking it up, and then people can start looking
>> at it? That doesn't seem to be a very efficient way of working this. We're at
>> a point now where some APIs are being replaced rather than incrementally
>> patched. It's easier to do that as an (ahem) atomic operation rather than
>> breaking things into multiple patches that are all mutually interdependent.
>>
>>
>>
>> Multiple patches make sense if things are truly orthogonal.  But that's
>> not always the case and some patches will be large.
>>
>>
>>
>> Bill
>>
>>
>>
>> On Tue, Nov 4, 2014 at 9:58 AM, Mike Holmes <mike.holmes@linaro.org>
>> wrote:
>>
>> Generally we fix one problem per patch; here you solve several things at
>> once, making it hard to think about the implications. Can these be separated?
>>
>>
>>
>> Added header file odp_counter.h with support for 32- and 64-bit atomic
>> counters
>> using relaxed memory order. 6 operations
>> (init/read/write/add/read_inc/inc) on
>> 32-bit and 64-bit counters respectively.
>> Renamed odp_atomic_test to odp_counter_test and changed to use
>> odp_counter.h
>>
>> Implementation of C11-based memory model for atomic operations. 10
>> operations
>> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
>> in
>> odp_atomic.h. The required memory ordering is now a parameter to each call
>> just
>> like in C11.
>>
>> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
>> fall back to GCC __sync builtins which often include unnecessarily heavy
>> barrier/sync operations (always sequentially consistent).
>>
>> Attempt to remove all explicit memory barriers (odp_sync_stores) from code
>> that
>> implements multithreaded synchronization primitives (e.g. locks,
>> barriers).
>> Rewrote such primitives to use the new atomic operations.
>>
>> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
>> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
>> enqueue/dequeue
>>
>>
>>
>> On 4 November 2014 10:06, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>>
>> And what should be in each patch?
>>
>>
>>
>> On 4 November 2014 16:03, Anders Roxell <anders.roxell@linaro.org> wrote:
>>
>> As Petri wrote in his first email, this patch should be broken up into
>> multiple patches...
>>
>> Cheers,
>> Anders
>>
>> On 4 Nov 2014 15:34, "Ola Liljedahl" <ola.liljedahl@linaro.org> wrote:
>>
>> Possibly odp_atomics.h should then be internal, leaving odp_counter.h
>> as the only public API. The original odp_atomics.h is public so I left it
>> that way.
>>
>>
>>
>> The counter API does not allow the user to specify any memory ordering;
>> relaxed memory order is expected, i.e. no ordering is guaranteed.
>>
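>> In other words the intended use is plain shared statistics. A minimal
>> usage sketch with the new API (the surrounding function names are
>> invented):
>>
>>     #include <odp_counter.h>
>>
>>     static odp_counter64_t rx_packets; /* shared by all worker threads */
>>
>>     static void stats_init(void)
>>     {
>>             odp_counter64_init(&rx_packets, 0); /* before workers start */
>>     }
>>
>>     static void on_packet_received(void)
>>     {
>>             odp_counter64_inc(&rx_packets); /* relaxed, no ordering implied */
>>     }
>>
>>     static uint64_t stats_read(void)
>>     {
>>             return odp_counter64_read(&rx_packets); /* may lag concurrent incs */
>>     }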
>>
>>
>> Why does acquire/release not fit well with the far atomics? And what do
>> you mean specifically by "far atomics"? Just the counter updates like
>> Cavium has?
>>
>>
>>
>> As the Linux kernel atomics interface predates C11/C++11 atomics support, I
>> do not see it as a model to follow.
>>
>>
>>
>> The patch summary contained a brief description of what I wanted to
>> achieve with the patch. What do you want more, a Google Docs design
>> document?
>>
>>
>>
>> -- Ola
>>
>>
>>
>> On 4 November 2014 15:22, Savolainen, Petri (NSN - FI/Espoo)
>> <petri.savolainen@nsn.com> wrote:
>>
>> There are many things I’d change in this patch. I think it’s better to
>> take a step back and talk about what you are trying to achieve here, and
>> then correct things step by step. E.g. the whole idea of acquire/release does
>> not fit well with far atomics, and far atomics are the thing I'd abstract from
>> applications with this API. Other synchronization primitives (such as locks)
>> would not be implemented (too often) by applications, so it's not very
>> productive to abstract that (the implementation of locks). E.g. the Linux
>> kernel atomics.h looks pretty much like odp_atomic.h.
>>
>>
>>
>> -Petri
>>
>>
>>
>>
>>
>> From: lng-odp-bounces@lists.linaro.org
>> [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl
>> Sent: Tuesday, November 04, 2014 3:49 PM
>> To: lng-odp@lists.linaro.org
>> Subject: Re: [lng-odp] [ODP/PATCH v3] Look ma, no barriers! C11 memory
>> model
>>
>>
>>
>> Ping!
>>
>>
>>
>> I really need this new working atomics support merged ASAP because I have
>> a new lock-less implementation of the timer API which uses atomic
>> operations. I haven't seen any real criticism against the content of the
>> patch so there is nothing to change.
>>
>>
>>
>> -- Ola
>>
>>
>>
>>
>>
>> On 20 October 2014 15:07, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>>
>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>> ---
>> Added header file odp_counter.h with support for 32- and 64-bit atomic
>> counters
>> using relaxed memory order. 6 operations
>> (init/read/write/add/read_inc/inc) on
>> 32-bit and 64-bit counters respectively.
>>
>> Renamed odp_atomic_test to odp_counter_test and changed to use
>> odp_counter.h
>>
>> Implementation of C11-based memory model for atomic operations. 10
>> operations
>> (init/load/store/cmp_xchg_weak/fetch_add/add/fetch_inc/inc/fetch_dec/dec)
>> in
>> odp_atomic.h. The required memory ordering is now a parameter to each call
>> just
>> like in C11.
>>
>> Optimized support for ARMv6/v7, x86_64, OCTEON. Other architectures will
>> fall back to GCC __sync builtins which often include unnecessarily heavy
>> barrier/sync operations (always sequentially consistent).
>>
>> Attempt to remove all explicit memory barriers (odp_sync_stores) from code
>> that
>> implements multithreaded synchronization primitives (e.g. locks,
>> barriers).
>> Rewrote such primitives to use the new atomic operations.
>>
>> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter),
>> odp_ticketlock_lock() (missing acquire barrier) and odp_ring
>> enqueue/dequeue
>> (missing release barrier, had only compiler barrier).
>>
>>  .gitignore                                         |   2 +-
>>  example/generator/odp_generator.c                  |  43 +-
>>  example/ipsec/odp_ipsec.c                          |   2 +-
>>  example/odp_example/odp_example.c                  |   2 +-
>>  example/timer/odp_timer_test.c                     |   2 +-
>>  helper/include/odph_ring.h                         |   8 +-
>>  platform/linux-generic/include/api/odp.h           |   1 +
>>  platform/linux-generic/include/api/odp_atomic.h    | 838
>> +++++++++++----------
>>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>>  platform/linux-generic/include/api/odp_counter.h   | 363 +++++++++
>>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>>  .../linux-generic/include/api/odp_ticketlock.h     |   5 +-
>>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>>  platform/linux-generic/include/odp_spin_internal.h |   9 -
>>  platform/linux-generic/odp_barrier.c               |  49 +-
>>  platform/linux-generic/odp_buffer.c                |   3 +-
>>  platform/linux-generic/odp_crypto.c                |   7 +-
>>  platform/linux-generic/odp_queue.c                 |   7 +-
>>  platform/linux-generic/odp_ring.c                  |  94 +--
>>  platform/linux-generic/odp_rwlock.c                |  62 +-
>>  platform/linux-generic/odp_thread.c                |   9 +-
>>  platform/linux-generic/odp_ticketlock.c            |  29 +-
>>  platform/linux-generic/odp_timer.c                 |  22 +-
>>  test/api_test/Makefile.am                          |   6 +-
>>  test/api_test/odp_atomic_test.c                    | 362 ---------
>>  test/api_test/odp_atomic_test.h                    |  60 --
>>  test/api_test/odp_common.c                         |   1 -
>>  test/api_test/odp_counter_test.c                   | 361 +++++++++
>>  28 files changed, 1365 insertions(+), 1014 deletions(-)
>>  create mode 100644 platform/linux-generic/include/api/odp_counter.h
>>  delete mode 100644 test/api_test/odp_atomic_test.c
>>  delete mode 100644 test/api_test/odp_atomic_test.h
>>  create mode 100644 test/api_test/odp_counter_test.c
>>
>> diff --git a/.gitignore b/.gitignore
>> index 6342e34..77db4d6 100644
>> --- a/.gitignore
>> +++ b/.gitignore
>> @@ -35,7 +35,7 @@ build/
>>  odp_example
>>  odp_packet
>>  odp_packet_netmap
>> -odp_atomic
>> +odp_counter
>>  odp_shm
>>  odp_ring
>>  odp_timer_ping
>> diff --git a/example/generator/odp_generator.c
>> b/example/generator/odp_generator.c
>> index eb8b340..252157d 100644
>> --- a/example/generator/odp_generator.c
>> +++ b/example/generator/odp_generator.c
>> @@ -62,10 +62,10 @@ typedef struct {
>>   * counters
>>  */
>>  static struct {
>> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
>> -       odp_atomic_u64_t ip;    /**< ip packets */
>> -       odp_atomic_u64_t udp;   /**< udp packets */
>> -       odp_atomic_u64_t icmp;  /**< icmp packets */
>> +       odp_counter64_t seq;    /**< ip seq to be send */
>> +       odp_counter64_t ip;     /**< ip packets */
>> +       odp_counter64_t udp;    /**< udp packets */
>> +       odp_counter64_t icmp;   /**< icmp packets */
>>  } counters;
>>
>>  /** * Thread specific arguments
>> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>> ODPH_UDPHDR_LEN +
>>                                        ODPH_IPV4HDR_LEN);
>>         ip->proto = ODPH_IPPROTO_UDP;
>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
>>         ip->id = odp_cpu_to_be_16(seq);
>>         ip->chksum = 0;
>>         odph_ipv4_csum_update(pkt);
>> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>> ODPH_ICMPHDR_LEN +
>>                                        ODPH_IPV4HDR_LEN);
>>         ip->proto = ODPH_IPPROTO_ICMP;
>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
>> +       seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
>>         ip->id = odp_cpu_to_be_16(seq);
>>         ip->chksum = 0;
>>         odph_ipv4_csum_update(pkt);
>> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>>                 }
>>
>>                 if (args->appl.interval != 0) {
>> +                       uint64_t seq = odp_counter64_read(&counters.seq);
>>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
>> -                              thr, counters.seq, counters.seq%0xffff);
>> +                              thr, seq, seq%0xffff);
>>                         /* TODO use odp timer */
>>                         usleep(args->appl.interval * 1000);
>>                 }
>> -               if (args->appl.number != -1 && counters.seq
>> -                   >= (unsigned int)args->appl.number) {
>> +               if (args->appl.number != -1 &&
>> +                   odp_counter64_read(&counters.seq) >=
>> +                   (unsigned int)args->appl.number) {
>>                         break;
>>                 }
>>         }
>> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>>         /* receive number of reply pks until timeout */
>>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>>                 while (args->appl.timeout >= 0) {
>> -                       if (counters.icmp >= (unsigned
>> int)args->appl.number)
>> +                       if (odp_counter64_read(&counters.icmp) >=
>> +                           (unsigned int)args->appl.number)
>>                                 break;
>>                         /* TODO use odp timer */
>>                         sleep(1);
>> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>>
>>         /* print info */
>>         if (args->appl.mode == APPL_MODE_UDP) {
>> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
>> +               printf("  [%02i] total send: %ju\n", thr,
>> +                      odp_counter64_read(&counters.seq));
>>         } else if (args->appl.mode == APPL_MODE_PING) {
>>                 printf("  [%02i] total send: %ju total receive: %ju\n",
>> -                      thr, counters.seq, counters.icmp);
>> +                      thr, odp_counter64_read(&counters.seq),
>> +                      odp_counter64_read(&counters.icmp));
>>         }
>>         return arg;
>>  }
>> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>                 if (!odp_packet_inflag_ipv4(pkt))
>>                         continue;
>>
>> -               odp_atomic_inc_u64(&counters.ip);
>> +               odp_counter64_inc(&counters.ip);
>>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
>> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>
>>                 /* udp */
>>                 if (ip->proto == ODPH_IPPROTO_UDP) {
>> -                       odp_atomic_inc_u64(&counters.udp);
>> +                       odp_counter64_inc(&counters.udp);
>>                         udp = (odph_udphdr_t *)(buf + offset);
>>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>>                                         odp_be_to_cpu_16(udp->length) -
>> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>                         icmp = (odph_icmphdr_t *)(buf + offset);
>>                         /* echo reply */
>>                         if (icmp->type == ICMP_ECHOREPLY) {
>> -                               odp_atomic_inc_u64(&counters.icmp);
>> +                               odp_counter64_inc(&counters.icmp);
>>                                 memcpy(&tvsend, buf + offset +
>> ODPH_ICMPHDR_LEN,
>>                                        sizeof(struct timeval));
>>                                 /* TODO This should be changed to use an
>> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>>         }
>>
>>         /* init counters */
>> -       odp_atomic_init_u64(&counters.seq);
>> -       odp_atomic_init_u64(&counters.ip);
>> -       odp_atomic_init_u64(&counters.udp);
>> -       odp_atomic_init_u64(&counters.icmp);
>> +       odp_counter64_init(&counters.seq, 0);
>> +       odp_counter64_init(&counters.ip, 0);
>> +       odp_counter64_init(&counters.udp, 0);
>> +       odp_counter64_init(&counters.icmp, 0);
>>
>>         /* Reserve memory for args from shared mem */
>>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
>> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
>> index 2f2dc19..76c27d0 100644
>> --- a/example/ipsec/odp_ipsec.c
>> +++ b/example/ipsec/odp_ipsec.c
>> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>>         printf("Num worker threads: %i\n", num_workers);
>>
>>         /* Create a barrier to synchronize thread startup */
>> -       odp_barrier_init_count(&sync_barrier, num_workers);
>> +       odp_barrier_init(&sync_barrier, num_workers);
>>
>>         /*
>>          * By default core #0 runs Linux kernel background tasks.
>> diff --git a/example/odp_example/odp_example.c
>> b/example/odp_example/odp_example.c
>> index 0e9aa3d..c473395 100644
>> --- a/example/odp_example/odp_example.c
>> +++ b/example/odp_example/odp_example.c
>> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>>         odp_shm_print_all();
>>
>>         /* Barrier to sync test case execution */
>> -       odp_barrier_init_count(&globals->barrier, num_workers);
>> +       odp_barrier_init(&globals->barrier, num_workers);
>>
>>         if (args.proc_mode) {
>>                 int ret;
>> diff --git a/example/timer/odp_timer_test.c
>> b/example/timer/odp_timer_test.c
>> index 78b2ae2..dfbeae9 100644
>> --- a/example/timer/odp_timer_test.c
>> +++ b/example/timer/odp_timer_test.c
>> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>>         printf("\n");
>>
>>         /* Barrier to sync test case execution */
>> -       odp_barrier_init_count(&test_barrier, num_workers);
>> +       odp_barrier_init(&test_barrier, num_workers);
>>
>>         /* Create and launch worker threads */
>>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
>> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
>> index 76c1db8..5e78b34 100644
>> --- a/helper/include/odph_ring.h
>> +++ b/helper/include/odph_ring.h
>> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>>                 uint32_t sp_enqueue;     /* True, if single producer. */
>>                 uint32_t size;           /* Size of ring. */
>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>> -               uint32_t head;          /* Producer head. */
>> -               uint32_t tail;          /* Producer tail. */
>> +               odp_atomic32_t head;    /* Producer head. */
>> +               odp_atomic32_t tail;    /* Producer tail. */
>>         } prod ODP_ALIGNED_CACHE;
>>
>>         /** @private Consumer */
>> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>>                 uint32_t size;           /* Size of the ring. */
>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>> -               uint32_t head;          /* Consumer head. */
>> -               uint32_t tail;          /* Consumer tail. */
>> +               odp_atomic32_t head;    /* Consumer head. */
>> +               odp_atomic32_t tail;    /* Consumer tail. */
>>         } cons ODP_ALIGNED_CACHE;
>>
>>         /** @private Memory space of ring starts here. */
>> diff --git a/platform/linux-generic/include/api/odp.h
>> b/platform/linux-generic/include/api/odp.h
>> index 0ee3faf..d124d52 100644
>> --- a/platform/linux-generic/include/api/odp.h
>> +++ b/platform/linux-generic/include/api/odp.h
>> @@ -32,6 +32,7 @@ extern "C" {
>>  #include <odp_barrier.h>
>>  #include <odp_spinlock.h>
>>  #include <odp_atomic.h>
>> +#include <odp_counter.h>
>>
>>  #include <odp_init.h>
>>  #include <odp_system_info.h>
>> diff --git a/platform/linux-generic/include/api/odp_atomic.h
>> b/platform/linux-generic/include/api/odp_atomic.h
>> index 0cc4cf4..ccaad02 100644
>> --- a/platform/linux-generic/include/api/odp_atomic.h
>> +++ b/platform/linux-generic/include/api/odp_atomic.h
>> @@ -4,464 +4,494 @@
>>   * SPDX-License-Identifier:     BSD-3-Clause
>>   */
>>
>> -
>>  /**
>>   * @file
>>   *
>> - * ODP atomic operations
>> + * ODP atomic types and operations, semantically a subset of C11 atomics.
>> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
>> + * without using the required access functions.
>> + * Atomic functions must be used to operate on atomic variables!
>>   */
>>
>>  #ifndef ODP_ATOMIC_H_
>>  #define ODP_ATOMIC_H_
>>
>> +#include <stdint.h>
>> +#include <odp_align.h>
>> +#include <odp_hints.h>
>> +#include <odp_debug.h>
>> +
>>  #ifdef __cplusplus
>>  extern "C" {
>>  #endif
>>
>> -
>> -#include <odp_std_types.h>
>> -
>> -
>> -/**
>> - * Atomic integer
>> - */
>> -typedef volatile int32_t odp_atomic_int_t;
>> -
>> -/**
>> - * Atomic unsigned integer 64 bits
>> - */
>> -typedef volatile uint64_t odp_atomic_u64_t;
>> -
>> -/**
>> - * Atomic unsigned integer 32 bits
>> - */
>> -typedef volatile uint32_t odp_atomic_u32_t;
>> -
>> -
>> -/**
>> - * Initialize atomic integer
>> - *
>> - * @param ptr    An integer atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
>> -{
>> -       *ptr = 0;
>> -}
>> -
>> -/**
>> - * Load value of atomic integer
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return atomic integer value
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
>> -{
>> -       return *ptr;
>> -}
>> -
>> -/**
>> - * Store value to atomic integer
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
>> new_value)
>> -{
>> -       *ptr = new_value;
>> -}
>> -
>> -/**
>> - * Fetch and add atomic integer
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
>> value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and subtract atomic integer
>> - *
>> - * @param ptr    An atomic integer variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
>> value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and increment atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Increment atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Fetch and decrement atomic integer by 1
>> - *
>> - * @param ptr    An atomic int variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Decrement atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Initialize atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       *ptr = 0;
>> -}
>> -
>> -/**
>> - * Load value of atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return atomic uint32 value
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return *ptr;
>> -}
>> -
>> -/**
>> - * Store value to atomic uint32
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
>> -                                       uint32_t new_value)
>> -{
>> -       *ptr = new_value;
>> -}
>> -
>> -/**
>> - * Fetch and add atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>> -                                               uint32_t value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and subtract uint32
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be sub to the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
>> -                                               uint32_t value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> -
>>  /**
>> - * Fetch and increment atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -#if defined __OCTEON__
>> -
>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       uint32_t ret;
>> -
>> -       __asm__ __volatile__ ("syncws");
>> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
>> -                             "r" (ptr));
>> -
>> -       return ret;
>> -}
>> -
>> + * 32-bit (unsigned) atomic type
>> + */
>> +typedef struct {
>> +       uint32_t v; /**< Actual storage for the atomic variable */
>> +} odp_atomic32_t
>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>> +
>> +typedef enum {
>> +       /** Relaxed memory order, no ordering of other accesses enforced
>> */
>> +       ODP_MEMORDER_RLX,
>> +       /** Acquire memory order, later accesses cannot move before
>> +        * acquire operation */
>> +       ODP_MEMORDER_ACQ,
>> +       /** Release memory order, earlier accesses cannot move after
>> +        * release operation */
>> +       ODP_MEMORDER_RLS
>> +} odp_memorder_t;
>> +
>>
>> +/*****************************************************************************
>> + * Just some private helpers
>>
>> +*****************************************************************************/
>> +
>> +#ifdef __OCTEON__
>> +/* OCTEON Write Memory Barrier */
>> +#define COMPILER_HW_BARRIER() __asm __volatile( \
>> +       /* Double syncw to work around errata */ \
>> +       "syncw\n\tsyncw" : : : )
>>  #else
>> -
>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_u32(ptr, 1);
>> -}
>> -
>> +/** Compiler and hardware full memory barrier */
>> +#define COMPILER_HW_BARRIER() __sync_synchronize()
>> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a
>> */
>>  #endif
>>
>> -/**
>> - * Increment atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_u32(ptr, 1);
>> -}
>> -
>> -/**
>> - * Fetch and decrement uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_u32(ptr, 1);
>> -}
>> -
>> -/**
>> - * Decrement atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_u32(ptr, 1);
>> -}
>> -
>> -/**
>> - * Atomic compare and set for 32bit
>> - *
>> - * @param dst destination location into which the value will be written.
>> - * @param exp expected value.
>> - * @param src new value.
>> - * @return Non-zero on success; 0 on failure.
>> - */
>> -static inline int
>> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
>> -{
>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>> +#define MEMORY "memory"
>> +
>>
>> +/*****************************************************************************
>> + * Operations on 32-bit atomics
>> + * odp_atomic32_init - no return value
>> + * odp_atomic32_load - return current value
>> + * odp_atomic32_store - no return value
>> + * odp_atomic32_cmp_xchg_weak - return bool
>> + * odp_atomic32_fetch_add - return old value
>> + * odp_atomic32_add - no return value
>> + * odp_atomic32_fetch_inc - return old value
>> + * odp_atomic32_inc - no return value
>> + * odp_atomic32_fetch_dec - return old value
>> + * odp_atomic32_dec - no return value
>> +
>> *****************************************************************************/
>> +
>> +static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
>> +{
>> +       /* Write of aligned word is atomic */
>> +       /* Cast to volatile to force compiler to (re-) write variable,
>> thus we
>> +        * can avoid using compiler memory barriers */
>> +       *(__volatile uint32_t *)&ptr->v = val;
>> +}
>> +
>> +/**
>> + * Atomic load of 32-bit atomic variable
>> + *
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the load
>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
>> + *
>> + * @return Value of the variable
>> + */
>> +static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               uint32_t val;
>> +               /* Read of aligned word is atomic */
>> +               /* Cast to volatile to force compiler to (re-) read
>> variable,
>> +                * thus we can avoid using compiler memory barriers */
>> +               val = *(__volatile const uint32_t *)&ptr->v;
>> +               return val;
>> +       } else if (mmodel == ODP_MEMORDER_ACQ) {
>> +#if defined __aarch64__
>> +               uint32_t val;
>> +               __asm __volatile("ldar %w0, [%1]"
>> +                               : "=&r"(val)
>> +                               : "r"(&ptr->v)
>> +                               : MEMORY);
>> +               return val;
>> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
>> +               /* Read of aligned word is atomic */
>> +               uint32_t val = ptr->v;
>> +               /* To prevent later accesses from moving up */
>> +               /* Herb Sutter claims HW barrier not needed on x86? */
>> +               COMPILER_HW_BARRIER();
>> +               return val;
>> +#else
>> +#warning odp_atomic32_load() may not be efficiently implemented
>> +               /* Assume read of aligned word is atomic */
>> +               uint32_t val = ptr->v;
>> +               /* To prevent later accesses from moving up */
>> +               COMPILER_HW_BARRIER();
>> +               return val;
>> +#endif
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>> +}
>> +
>> +/**
>> + * Atomic store to 32-bit atomic variable
>> + *
>> + * @param ptr  Pointer to a 32-bit atomic variable
>> + * @param val  Value to write to the atomic variable
>> + * @param memmodel Memory model associated with the store
>> + * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
>> + */
>> +static inline void odp_atomic32_store(odp_atomic32_t *ptr,
>> +               uint32_t val,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Write of aligned word is atomic */
>> +               /* Cast to volatile to force compiler to (re-) write
>> variable,
>> +                * thus we will avoid using compiler memory barriers */
>> +               *(__volatile uint32_t *)&ptr->v = val;
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
>> +               /* Compiler and HW barrier to prevent earlier accesses
>> from
>> +                * moving down */
>> +               COMPILER_HW_BARRIER();
>> +               /* Write of aligned word is atomic */
>> +               ptr->v = val;
>> +               /* Compiler and HW barrier to prevent this store from
>> moving
>> +                * down after a later load-acquire and thus create
>> overlapping
>> +                * critical sections. Herb Sutter thinks this is needed */
>> +               COMPILER_HW_BARRIER();
>> +#elif defined __aarch64__
>> +               __asm __volatile("stlr %w0, [%1]"
>> +                               :
>> +                               : "r"(val), "r"(&ptr->v)
>> +                               : MEMORY);
>> +#elif defined __x86_64__
>> +               /* This is actually an atomic exchange operation */
>> +               /* Generates good code on x86_64 */
>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>> +#else
>> +#warning odp_atomic32_store_rls() may not be efficiently implemented
>> +               /* This is actually an atomic exchange operation */
>> +               (void)__sync_lock_test_and_set(&ptr->v, val);
>> +#endif
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>> +}
>> +
>> +
>> +/**
>> + * Atomic compare and exchange (swap) of 32-bit atomic variable
>> + * "Weak" semantics, may fail spuriously and must be used in a loop.
>> + *
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param exp_p Pointer to expected value (updated on failure)
>> + * @param val   New value to write
>> + * @param       memmodel Memory model associated with the
>> compare-and-swap
>> + * operation (ODP_MEMORDER_RLX only)
>> + *
>> + * @return 1 (true) if exchange successful, 0 (false) if not successful
>> (and
>> + * '*exp_p' updated with current value)
>> + */
>> +static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
>> +               uint32_t *exp_p,
>> +               uint32_t val,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +#if defined __arm__ /* A32/T32 ISA */
>> +               uint32_t old;
>> +               uint32_t exp = *exp_p;
>> +               int status;
>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>> +                                "cmp   %0, %3\t\n"
>> +                                "bne   1f\t\n"
>> +                                "strex %1, %4, [%2]\t\n"
>> +                                "1:\t\n"
>> +                               : "=&r"(old), "=&r"(status)
>> +                               : "r"(&ptr->v), "r"(exp), "r"(val)
>> +                               : MEMORY);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       /* Clear exclusive access monitor */
>> +                       __asm __volatile("clrex");
>> +                       /* Return current value */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               }
>> +               /* strex returns 0 on success */
>> +               if (odp_unlikely(status != 0)) {
>> +                       /* strex failed, reservation was disturbed */
>> +                       /* Return potentially changed value */
>> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
>> +                       return 0;
>> +               }
>> +               return 1;
>> +#elif defined __mips64__
>> +               uint32_t old;
>> +               uint32_t exp = *exp_p;
>> +               uint32_t status = val;
>> +               __asm __volatile("llw %0, [%2]\t\n"
>> +                                "bne %0, %3, 1f\t\n"
>> +                                "scw %1, [%2]\t\n"
>> +                                "1:\t\n"
>> +                               : "=&r"(old), "+&r"(status)
>> +                               : "r"(&ptr->v), "r"(exp)
>> +                               : MEMORY);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       /* Return current value */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               }
>> +               /* scw returns 1 on success, 0 on failure */
>> +               if (odp_unlikely(status == 0)) {
>> +                       /* scw failed, reservation was disturbed */
>> +                       *exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
>> +                       return 0;
>> +               }
>> +               return 1;
>> +#elif defined __x86_64__
>> +               uint32_t exp = *exp_p;
>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
>> val);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Return the unexpected content of '*ptr' */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               } else {
>> +                       return 1;
>> +               }
>> +#else
>> +#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
>> +               uint32_t exp = *exp_p;
>> +               uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp,
>> val);
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Return the unexpected content of '*ptr' */
>> +                       *exp_p = old;
>> +                       return 0;
>> +               } else {
>> +                       return 1;
>> +               }
>> +#endif
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>> +}
>> +
>> +/**
>> + * Atomic fetch and add to 32-bit atomic variable
>> + * @note A - B <=> A + (-B)
>> + *
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>> + * @param memmodel Memory model associated with the add
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>> + *
>> + * @return Value of the atomic variable before the addition
>> + */
>> +static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
>> +               uint32_t incr,
>> +               odp_memorder_t mmodel)
>> +{
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +#if defined __arm__ /* A32/T32 ISA */
>> +               uint32_t old_val, tmp;
>> +               int status;
>> +               do {
>> +                       __asm __volatile("ldrex %0, [%3]\t\n"
>> +                                        "add   %1, %0, %4\t\n"
>> +                                        "strex %2, %1, [%3]\t\n"
>>
>> +                                       : "=&r"(old_val), "=&r"(tmp),
>>
>> +                                         "=&r"(status)
>> +                                       : "r"(&ptr->v), "r"(incr)
>> +                                       : MEMORY);
>> +               } while (odp_unlikely(status != 0));
>> +               return old_val;
>> +#elif defined __OCTEON__
>> +               uint32_t old_val;
>> +               __asm __volatile("laa %0,(%2),%3"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr), "r" (incr)
>> +                               : MEMORY);
>> +               return old_val;
>> +#elif defined __x86_64__
>> +               /* Generates good code on x86_64 */
>> +               return __sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +#warning odp_atomic32_fetch_add() may not be efficiently implemented
>> +               return __sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("laa %0,(%2),%3"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr), "r" (incr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return old_val;
>> +#endif
>> +               /* __sync_fetch_and_add() will give us barriers before and
>> +                * after, we are fine with this for release operations */
>> +               return __sync_fetch_and_add(&ptr->v, incr);
>> +       } else {
>> +               ODP_ABORT("Invalid memory model %u\n", mmodel);
>> +       }
>>  }
>>
>>  /**
>> - * Initialize atomic uint64
>> + * Atomic add to 32-bit atomic variable
>>   *
>> - * @param ptr    An atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>> + * @param memmodel Memory model associated with the add
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   */
>> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
>> +static inline void odp_atomic32_add(odp_atomic32_t *ptr,
>> +               uint32_t incr,
>> +               odp_memorder_t mmodel)
>>  {
>> -       *ptr = 0;
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Platforms that support atomic add instructions can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               __asm __volatile("saa %[inc], (%[base])"
>> +                               : "+m" (*ptr)
>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>> +                               : MEMORY);
>> +               return;
>> +#endif
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +               /* Platforms that support atomic add instructions can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("saa %[inc], (%[base])"
>> +                               : "+m" (*ptr)
>> +                               : [inc] "r" (incr), [base] "r" (ptr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return;
>> +#endif
>> +       }
>> +       /* Default to using odp_atomic32_fetch_add() */
>> +       (void)odp_atomic32_fetch_add(ptr, incr, mmodel);
>>  }
>>
>>  /**
>> - * Load value of atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> + * Atomic fetch and increment of 32-bit atomic variable
>>   *
>> - * @return atomic uint64 value
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the increment
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @return Value of the atomic variable before the increment
>>   */
>> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
>> +static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>>  {
>> -       return *ptr;
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Platforms that support atomic increment instructions
>> can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               __asm __volatile("lai %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               return old_val;
>> +#endif
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("lai %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return old_val;
>> +#endif
>> +       }
>> +       /* Default to using odp_atomic32_fetch_add() */
>> +       return odp_atomic32_fetch_add(ptr, 1, mmodel);
>>  }
>>
>>  /**
>> - * Store value to atomic uint64
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> + * Atomic increment of 32-bit atomic variable
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the increment
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   */
>> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
>> -                                       uint64_t new_value)
>> -{
>> -       *ptr = new_value;
>> -}
>> +static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>>
>> -/**
>> - * Add atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - */
>> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
>> value)
>>  {
>> -       __sync_fetch_and_add(ptr, value);
>> +       /* Default to using odp_atomic32_fetch_inc() */
>> +       /* Platforms that support atomic increment instructions can add
>> +        * their implementations here */
>> +       (void)odp_atomic32_fetch_inc(ptr, mmodel);
>>  }
>>
>>  /**
>> - * Fetch and add atomic uint64
>> + * Atomic fetch and decrement of 32-bit atomic variable
>>   *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the decrement
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   *
>> - * @return Value of the variable before the operation
>> + * @return Value of the atomic variable before the decrement
>>   */
>> -
>> -#if defined __powerpc__ && !defined __powerpc64__
>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> +static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
>> +               odp_memorder_t mmodel)
>>  {
>> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
>> -                                   (uint32_t)value);
>> -}
>> -#else
>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> +       if (mmodel == ODP_MEMORDER_RLX) {
>> +               /* Platforms that support atomic decrement instructions
>> can add
>> +                * their implementations here */
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               __asm __volatile("lad %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               return old_val;
>>  #endif
>> -/**
>> - * Subtract atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - */
>> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
>> value)
>> -{
>> -       __sync_fetch_and_sub(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and subtract atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -#if defined __powerpc__ && !defined __powerpc64__
>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
>> -                                   (uint32_t)value);
>> -}
>> -#else
>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> +       } else if (mmodel == ODP_MEMORDER_RLS) {
>> +#if defined __OCTEON__
>> +               uint32_t old_val;
>> +               COMPILER_HW_BARRIER();
>> +               __asm __volatile("lad %0,(%2)"
>> +                               : "=r" (old_val), "+m" (ptr)
>> +                               : "r" (ptr)
>> +                               : MEMORY);
>> +               COMPILER_HW_BARRIER();
>> +               return old_val;
>>  #endif
>> -/**
>> - * Fetch and increment atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_u64(ptr, 1);
>> -}
>> -
>> -/**
>> - * Increment atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_u64(ptr, 1);
>> +       }
>> +       /* Default to using odp_atomic32_fetch_add() */
>> +       return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
>>  }
>>
>>  /**
>> - * Fetch and decrement atomic uint64 by 1
>> + * Atomic decrement of 32-bit atomic variable
>>   *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param memmodel Memory model associated with the decrement
>> + * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
>>   */
>> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>> -}
>> +static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
>> +               odp_memorder_t memorder)
>>
>> -/**
>> - * Decrement atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>>  {
>> -       odp_atomic_fetch_sub_u64(ptr, 1);
>> +       /* Default to using odp_atomic32_fetch_dec() */
>> +       /* Platforms that support atomic decrement instructions can add
>> +        * their implementations here */
>> +       (void)odp_atomic32_fetch_dec(ptr, memorder);
>>  }
>>
>> -/**
>> - * Atomic compare and set for 64bit
>> - *
>> - * @param dst destination location into which the value will be written.
>> - * @param exp expected value.
>> - * @param src new value.
>> - * @return Non-zero on success; 0 on failure.
>> - */
>> -static inline int
>> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
>> -{
>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>> -}
>> +/* We are not exporting this macro */
>> +#undef COMPILER_HW_BARRIER
>> +#undef MEMORY
>>
>>  #ifdef __cplusplus
>>  }
>> diff --git a/platform/linux-generic/include/api/odp_barrier.h
>> b/platform/linux-generic/include/api/odp_barrier.h
>> index a7b3215..69b1eb8 100644
>> --- a/platform/linux-generic/include/api/odp_barrier.h
>> +++ b/platform/linux-generic/include/api/odp_barrier.h
>> @@ -27,18 +27,18 @@ extern "C" {
>>   * ODP execution barrier
>>   */
>>  typedef struct odp_barrier_t {
>> -       int              count;  /**< @private Thread count */
>> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
>> +       uint32_t       num_threads;  /**< @private Thread count (constant)
>> */
>> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>>  } odp_barrier_t;
>>
>>
>>  /**
>>   * Init barrier with thread count
>>   *
>> - * @param barrier    Barrier
>> - * @param count      Thread count
>> + * @param barrier     Barrier
>> + * @param num_threads Number of threads which share the barrier
>>   */
>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>>
>>
>>  /**
>> diff --git a/platform/linux-generic/include/api/odp_counter.h
>> b/platform/linux-generic/include/api/odp_counter.h
>> new file mode 100644
>>
>> index 0000000..f937d27
>>
>> --- /dev/null
>> +++ b/platform/linux-generic/include/api/odp_counter.h
>> @@ -0,0 +1,363 @@
>> +/* Copyright (c) 2013, Linaro Limited
>> + * All rights reserved.
>> + *
>> + * SPDX-License-Identifier:     BSD-3-Clause
>> + */
>> +
>> +/**
>> + * @file
>> + *
>> + * ODP atomic counter types and operations, suitable for e.g. shared
>> statistics.
>> + * Relaxed memory model assumed for lowest overhead.
>> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
>> + * without using the required access functions.
>> + * Counter functions must be used to operate on counter variables!
>> + */
>> +
>> +#ifndef ODP_COUNTER_H_
>> +#define ODP_COUNTER_H_
>> +
>> +#include <stdint.h>
>> +#include <odp_align.h>
>> +#include <odp_hints.h>
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +/**
>> + * 32-bit (unsigned) atomic counter type
>> + */
>> +typedef struct {
>> +       uint32_t v; /**< Actual storage for the counter variable */
>> +} odp_counter32_t
>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>> +
>> +/**
>> + * 64-bit (unsigned) atomic counter type
>> + */
>> +typedef struct {
>> +       uint64_t v; /**< Actual storage for the counter variable */
>> +       /* Room for other data structures (e.g. spin lock) that might be
>> +        * needed to ensure atomicity on some architectures */
>> +} odp_counter64_t
>> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
>> +
>>
>> +/*****************************************************************************
>> + * Operations on 32-bit atomic counters
>> + * odp_counter32_init - returns no value
>> + * odp_counter32_read - returns current value
>> + * odp_counter32_write - returns no value
>> + * odp_counter32_add - returns no value
>> + * odp_counter32_read_inc - returns old value
>> + * odp_counter32_inc - returns no value
>> +
>> *****************************************************************************/
>> +
>> +/**
>> + * Initialize 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + * @param val   Initial value
>> + */
>> +static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t val)
>> +{
>> +       /* No implementation requires any other type of initialization */
>> +       *(__volatile uint32_t *)&ptr->v = val;
>> +}
>> +
>> +/**
>> + * Read 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + *
>> + * @return Value of the variable
>> + */
>> +static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
>> +{
>> +       uint32_t val;
>> +       /* Read of aligned word is atomic */
>> +       /* Cast to volatile to force compiler to (re-) read variable, thus
>> we
>> +        * will avoid using compiler memory barriers */
>> +       val = *(__volatile const uint32_t *)&ptr->v;
>> +       return val;
>> +}
>> +
>> +/**
>> + * Write 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + * @param val   Value to write to the variable
>> + */
>> +static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t
>> val)
>> +{
>> +       /* Write of aligned word is atomic */
>> +       /* Cast to volatile to force compiler to (re-) write variable,
>> thus we
>> +        * will avoid using compiler memory barriers */
>> +       *(__volatile uint32_t *)&ptr->v = val;
>> +}
>> +
>> +/**
>> + * Atomic add to 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + * @param incr  The value to be added to the counter variable
>> + */
>> +static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t incr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint32_t result;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrex %0, [%2]\t\n"
>> +                                "add   %0, %0, %3\t\n"
>> +                                "strex %1, %0, [%2]"
>> +                                : "=&r"(result), "=&r"(status)
>> +                                : "r"(&ptr->v), "Ir" (incr)
>> +                                : );
>> +       } while (odp_unlikely(status != 0));
>> +#elif defined __OCTEON__
>> +       __asm __volatile("saa %[inc], (%[base])"
>> +                        : "+m" (*ptr)
>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>> +                        : );
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +       /* Warning odp_counter32_add() may not be efficiently implemented
>> */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>> +}
>> +
>> +/**
>> + * Atomic increment (+1) of 32-bit counter variable, return original
>> value
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + *
>> + * @return Original value of counter
>> + */
>> +static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint32_t result, tmp;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrex %0, [%3]\t\n"
>> +                                "add   %1, %0, #1\t\n"
>> +                                "strex %2, %1, [%3]"
>>
>> +                                : "=&r"(result), "=&r"(tmp),
>> "=&r"(status)
>>
>> +                                : "r"(&ptr->v)
>> +                                : );
>> +       } while (odp_unlikely(status != 0));
>> +       return result;
>> +#elif defined __OCTEON__
>> +       uint32_t old_val;
>> +       __asm __volatile("lai %0,(%2)"
>> +                        : "=r" (old_val), "+m" (ptr)
>> +                        : "r" (ptr)
>> +                        : );
>> +       return old_val;
>> +#elif defined __x86_64__
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#else
>> +/* Warning odp_counter32_read_inc() may not be efficiently implemented */
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#endif
>> +}
>> +
>> +/**
>> + * Atomic increment (+1) 32-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 32-bit counter variable
>> + */
>> +static inline void odp_counter32_inc(odp_counter32_t *ptr)
>> +{
>> +#if defined __OCTEON__
>> +       odp_counter32_add(ptr, 1);
>> +#else
>> +       (void)odp_counter32_read_inc(ptr);
>> +#endif
>> +}
>> +
>>
>> +/*****************************************************************************
>> + * Operations on 64-bit atomic counters
>> + * odp_counter64_init
>> + * odp_counter64_read
>> + * odp_counter64_write
>> + * odp_counter64_add
>> + * odp_counter64_read_inc
>> + * odp_counter64_inc
>> +
>> *****************************************************************************/
>> +
>> +/**
>> + * Read 64-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + *
>> + * @return Value of the counter variable
>> + */
>> +static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t val;
>> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
>> +                        "clrex" /* Clear exclusive access monitor */
>> +                        : "=&r"(val)
>> +                        : "r"(&ptr->v)
>> +                        : );
>> +       return val;
>> +#elif defined __x86_64__ || defined __aarch64__
>> +       /* Read of aligned quad/double word is atomic */
>> +       return ptr->v;
>> +#else
>> +/* Warning odp_counter64_read() may not be efficiently implemented */
>> +       return __sync_fetch_and_or(&ptr->v, 0);
>> +#endif
>> +}
>> +
>> +/**
>> + * Write 64-bit counter variable
>> + *
>> + * @param ptr  Pointer to a 64-bit counter variable
>> + * @param val  Value to write to the counter variable
>> + */
>> +static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t
>> val)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               /* Read counter variable exclusively so we can write to it
>> +                * later */
>> +               /* Attempt to write the new value */
>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>> +                                "strexd %1, %3, %H3, [%2]"
>> +                                : "=&r"(old_val), "=&r"(status)
>> +                                : "r"(&ptr->v), "r"(val)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
>> */
>> +#elif defined __x86_64__ || defined __aarch64__
>> +       /* Write of aligned quad/double word is atomic */
>> +       ptr->v = val;
>> +#else
>> +/* Warning odp_counter64_write() may not be efficiently implemented */
>> +       /* This is actually an atomic exchange operation */
>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>> +#endif
>> +}
>> +
>> +/**
>> + * Initialize 64-bit counter variable
>> + * Perform implementation specific initializations, assign initial value.
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + * @param val   Initial value
>> + */
>> +static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t val)
>> +{
>> +       /* No implementation requires any other type of initialization */
>> +       odp_counter64_write(ptr, val);
>> +}
>> +
>> +/**
>> + * Atomic add to 64-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + * @param incr  The value to be added to the counter variable
>> + */
>> +static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>> +                                "adds   %0, %0, %3\t\n"
>> +                                "adc    %H0, %H3\t\n"
>> +                                "strexd %1, %0, %H0, [%2]"
>> +                                : "=&r"(old_val), "=&r"(status)
>> +                                : "r"(&ptr->v), "r"(incr)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
>> */
>> +#elif defined __OCTEON__
>> +       __asm __volatile("saad %[inc], (%[base])"
>> +                        : "+m" (*ptr)
>> +                        : [inc] "r" (incr), [base] "r" (ptr)
>> +                        : );
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +/* Warning odp_counter64_add() may not be efficiently implemented */
>> +       (void)__sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>> +}
>> +
>> +
>> +/**
>> + * Atomic increment (+1) 64-bit counter variable and return original
>> value
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + *
>> + * @return Original value of counter
>> + */
>> +static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val, tmp;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%3]\t\n"
>> +                                "adds   %2, %0, #1\t\n"
>> +                                "adc    %H2, %H0, #0\t\n"
>> +                                "strexd %1, %2, %H2, [%3]"
>> +                                : "=&r"(old_val), "=&r"(status),
>> "=&r"(tmp)
>> +                                : "r"(&ptr->v)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
>> */
>> +       return old_val;
>> +#elif defined __OCTEON__
>> +       uint64_t old_val;
>> +       __asm __volatile("laid %0,(%2)"
>> +                       : "=r" (old_val), "+m" (ptr)
>> +                       : "r" (ptr)
>> +                       : );
>> +       return old_val;
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#else
>> +/* Warning odp_counter64_read_inc() may not be efficiently implemented */
>> +       return __sync_fetch_and_add(&ptr->v, 1);
>> +#endif
>> +}
>> +
>> +/**
>> + * Atomic increment (+1) 64-bit counter variable
>> + *
>> + * @param ptr   Pointer to a 64-bit counter variable
>> + */
>> +static inline void odp_counter64_inc(odp_counter64_t *ptr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%2]\t\n"
>> +                                "adds   %0, #1\t\n"
>> +                                "adc    %H0, #0\t\n"
>> +                                "strexd %1, %0, %H0, [%2]"
>> +                                : "=&r"(old_val), "=&r"(status)
>> +                                : "r"(&ptr->v)
>> +                                : );
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds
>> */
>> +#else
>> +       (void)odp_counter64_read_inc(ptr);
>> +#endif
>> +}
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif
>> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
>> b/platform/linux-generic/include/api/odp_rwlock.h
>> index 252ebb2..ff8a9a2 100644
>> --- a/platform/linux-generic/include/api/odp_rwlock.h
>> +++ b/platform/linux-generic/include/api/odp_rwlock.h
>> @@ -10,26 +10,30 @@
>>  /**
>>   * @file
>>   *
>> - * ODP RW Locks
>> + * ODP read/write lock
>> + * RW lock support mu
>>
>> ...
>>
>> [Message clipped]
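
For illustration, here is a minimal usage sketch of the acquire/release
semantics described by odp_memorder_t above: a producer publishes plain data
with a release store and a consumer waits with an acquire load. This sketch is
not part of the patch; the 'payload'/'ready' variables and the helper
functions are made up, only the odp_atomic32_* calls and the ODP_MEMORDER_*
constants come from odp_atomic.h as proposed here.

#include <stdint.h>
#include <odp_atomic.h>

static uint32_t payload;	/* plain data published via the handoff */
static odp_atomic32_t ready;	/* 0 = not published, 1 = published */

static void handoff_init(void)
{
	odp_atomic32_init(&ready, 0);
}

/* Producer: write the data, then publish it with a release store so that
 * the write to 'payload' cannot be reordered after the store to 'ready'. */
static void producer(uint32_t val)
{
	payload = val;
	odp_atomic32_store(&ready, 1, ODP_MEMORDER_RLS);
}

/* Consumer: spin with an acquire load; once 'ready' is observed as 1, the
 * read of 'payload' below cannot have been moved before that load. */
static uint32_t consumer(void)
{
	while (odp_atomic32_load(&ready, ODP_MEMORDER_ACQ) == 0)
		;	/* busy-wait; a real caller would back off */
	return payload;
}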
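Similarly, a small sketch (again illustrative, not from the patch) of the
relaxed-order counter API in odp_counter.h, e.g. for shared packet statistics;
the stats struct and the function names below are made up. It mirrors the way
odp_generator.c is converted to odp_counter64_t in the diff below.

#include <inttypes.h>
#include <stdio.h>
#include <odp_counter.h>

/* Shared statistics block, updated concurrently by worker threads.
 * Relaxed memory order is sufficient for plain statistics. */
static struct {
	odp_counter64_t rx_pkts;	/* total packets seen */
	odp_counter32_t rx_errors;	/* packets dropped due to errors */
} stats;

static void stats_init(void)
{
	odp_counter64_init(&stats.rx_pkts, 0);
	odp_counter32_init(&stats.rx_errors, 0);
}

/* Called from any worker thread */
static void stats_update(int had_error)
{
	odp_counter64_inc(&stats.rx_pkts);
	if (had_error)
		odp_counter32_inc(&stats.rx_errors);
}

static void stats_print(void)
{
	printf("packets: %" PRIu64 " errors: %" PRIu32 "\n",
	       odp_counter64_read(&stats.rx_pkts),
	       odp_counter32_read(&stats.rx_errors));
}
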
diff mbox

Patch

diff --git a/.gitignore b/.gitignore
index 6342e34..77db4d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,7 +35,7 @@  build/
 odp_example
 odp_packet
 odp_packet_netmap
-odp_atomic
+odp_counter
 odp_shm
 odp_ring
 odp_timer_ping
diff --git a/example/generator/odp_generator.c b/example/generator/odp_generator.c
index eb8b340..252157d 100644
--- a/example/generator/odp_generator.c
+++ b/example/generator/odp_generator.c
@@ -62,10 +62,10 @@  typedef struct {
  * counters
 */
 static struct {
-	odp_atomic_u64_t seq;	/**< ip seq to be send */
-	odp_atomic_u64_t ip;	/**< ip packets */
-	odp_atomic_u64_t udp;	/**< udp packets */
-	odp_atomic_u64_t icmp;	/**< icmp packets */
+	odp_counter64_t seq;	/**< ip seq to be send */
+	odp_counter64_t ip;	/**< ip packets */
+	odp_counter64_t udp;	/**< udp packets */
+	odp_counter64_t icmp;	/**< icmp packets */
 } counters;
 
 /** * Thread specific arguments
@@ -201,7 +201,7 @@  static void pack_udp_pkt(odp_buffer_t obuf)
 	ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_UDPHDR_LEN +
 				       ODPH_IPV4HDR_LEN);
 	ip->proto = ODPH_IPPROTO_UDP;
-	seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
+	seq = odp_counter64_read_inc(&counters.seq) % 0xFFFF;
 	ip->id = odp_cpu_to_be_16(seq);
 	ip->chksum = 0;
 	odph_ipv4_csum_update(pkt);
@@ -258,7 +258,7 @@  static void pack_icmp_pkt(odp_buffer_t obuf)
 	ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_ICMPHDR_LEN +
 				       ODPH_IPV4HDR_LEN);
 	ip->proto = ODPH_IPPROTO_ICMP;
-	seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
+	seq = odp_counter64_read_inc(&counters.seq) % 0xffff;
 	ip->id = odp_cpu_to_be_16(seq);
 	ip->chksum = 0;
 	odph_ipv4_csum_update(pkt);
@@ -334,13 +334,15 @@  static void *gen_send_thread(void *arg)
 		}
 
 		if (args->appl.interval != 0) {
+			uint64_t seq = odp_counter64_read(&counters.seq);
 			printf("  [%02i] send pkt no:%ju seq %ju\n",
-			       thr, counters.seq, counters.seq%0xffff);
+			       thr, seq, seq%0xffff);
 			/* TODO use odp timer */
 			usleep(args->appl.interval * 1000);
 		}
-		if (args->appl.number != -1 && counters.seq
-		    >= (unsigned int)args->appl.number) {
+		if (args->appl.number != -1 &&
+		    odp_counter64_read(&counters.seq) >=
+		    (unsigned int)args->appl.number) {
 			break;
 		}
 	}
@@ -348,7 +350,8 @@  static void *gen_send_thread(void *arg)
 	/* receive number of reply pks until timeout */
 	if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
 		while (args->appl.timeout >= 0) {
-			if (counters.icmp >= (unsigned int)args->appl.number)
+			if (odp_counter64_read(&counters.icmp) >=
+			    (unsigned int)args->appl.number)
 				break;
 			/* TODO use odp timer */
 			sleep(1);
@@ -358,10 +361,12 @@  static void *gen_send_thread(void *arg)
 
 	/* print info */
 	if (args->appl.mode == APPL_MODE_UDP) {
-		printf("  [%02i] total send: %ju\n", thr, counters.seq);
+		printf("  [%02i] total send: %ju\n", thr,
+		       odp_counter64_read(&counters.seq));
 	} else if (args->appl.mode == APPL_MODE_PING) {
 		printf("  [%02i] total send: %ju total receive: %ju\n",
-		       thr, counters.seq, counters.icmp);
+		       thr, odp_counter64_read(&counters.seq),
+		       odp_counter64_read(&counters.icmp));
 	}
 	return arg;
 }
@@ -395,7 +400,7 @@  static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
 		if (!odp_packet_inflag_ipv4(pkt))
 			continue;
 
-		odp_atomic_inc_u64(&counters.ip);
+		odp_counter64_inc(&counters.ip);
 		rlen += sprintf(msg, "receive Packet proto:IP ");
 		buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
 		ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
@@ -405,7 +410,7 @@  static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
 
 		/* udp */
 		if (ip->proto == ODPH_IPPROTO_UDP) {
-			odp_atomic_inc_u64(&counters.udp);
+			odp_counter64_inc(&counters.udp);
 			udp = (odph_udphdr_t *)(buf + offset);
 			rlen += sprintf(msg + rlen, "UDP payload %d ",
 					odp_be_to_cpu_16(udp->length) -
@@ -417,7 +422,7 @@  static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
 			icmp = (odph_icmphdr_t *)(buf + offset);
 			/* echo reply */
 			if (icmp->type == ICMP_ECHOREPLY) {
-				odp_atomic_inc_u64(&counters.icmp);
+				odp_counter64_inc(&counters.icmp);
 				memcpy(&tvsend, buf + offset + ODPH_ICMPHDR_LEN,
 				       sizeof(struct timeval));
 				/* TODO This should be changed to use an
@@ -530,10 +535,10 @@  int main(int argc, char *argv[])
 	}
 
 	/* init counters */
-	odp_atomic_init_u64(&counters.seq);
-	odp_atomic_init_u64(&counters.ip);
-	odp_atomic_init_u64(&counters.udp);
-	odp_atomic_init_u64(&counters.icmp);
+	odp_counter64_init(&counters.seq, 0);
+	odp_counter64_init(&counters.ip, 0);
+	odp_counter64_init(&counters.udp, 0);
+	odp_counter64_init(&counters.icmp, 0);
 
 	/* Reserve memory for args from shared mem */
 	shm = odp_shm_reserve("shm_args", sizeof(args_t),
diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
index 2f2dc19..76c27d0 100644
--- a/example/ipsec/odp_ipsec.c
+++ b/example/ipsec/odp_ipsec.c
@@ -1223,7 +1223,7 @@  main(int argc, char *argv[])
 	printf("Num worker threads: %i\n", num_workers);
 
 	/* Create a barrier to synchronize thread startup */
-	odp_barrier_init_count(&sync_barrier, num_workers);
+	odp_barrier_init(&sync_barrier, num_workers);
 
 	/*
 	 * By default core #0 runs Linux kernel background tasks.
diff --git a/example/odp_example/odp_example.c b/example/odp_example/odp_example.c
index 0e9aa3d..c473395 100644
--- a/example/odp_example/odp_example.c
+++ b/example/odp_example/odp_example.c
@@ -1120,7 +1120,7 @@  int main(int argc, char *argv[])
 	odp_shm_print_all();
 
 	/* Barrier to sync test case execution */
-	odp_barrier_init_count(&globals->barrier, num_workers);
+	odp_barrier_init(&globals->barrier, num_workers);
 
 	if (args.proc_mode) {
 		int ret;
diff --git a/example/timer/odp_timer_test.c b/example/timer/odp_timer_test.c
index 78b2ae2..dfbeae9 100644
--- a/example/timer/odp_timer_test.c
+++ b/example/timer/odp_timer_test.c
@@ -372,7 +372,7 @@  int main(int argc, char *argv[])
 	printf("\n");
 
 	/* Barrier to sync test case execution */
-	odp_barrier_init_count(&test_barrier, num_workers);
+	odp_barrier_init(&test_barrier, num_workers);
 
 	/* Create and launch worker threads */
 	odph_linux_pthread_create(thread_tbl, num_workers, first_core,
diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
index 76c1db8..5e78b34 100644
--- a/helper/include/odph_ring.h
+++ b/helper/include/odph_ring.h
@@ -138,8 +138,8 @@  typedef struct odph_ring {
 		uint32_t sp_enqueue;     /* True, if single producer. */
 		uint32_t size;           /* Size of ring. */
 		uint32_t mask;           /* Mask (size-1) of ring. */
-		uint32_t head;		/* Producer head. */
-		uint32_t tail;		/* Producer tail. */
+		odp_atomic32_t head;	/* Producer head. */
+		odp_atomic32_t tail;	/* Producer tail. */
 	} prod ODP_ALIGNED_CACHE;
 
 	/** @private Consumer */
@@ -147,8 +147,8 @@  typedef struct odph_ring {
 		uint32_t sc_dequeue;     /* True, if single consumer. */
 		uint32_t size;           /* Size of the ring. */
 		uint32_t mask;           /* Mask (size-1) of ring. */
-		uint32_t head;		/* Consumer head. */
-		uint32_t tail;		/* Consumer tail. */
+		odp_atomic32_t head;	/* Consumer head. */
+		odp_atomic32_t tail;	/* Consumer tail. */
 	} cons ODP_ALIGNED_CACHE;
 
 	/** @private Memory space of ring starts here. */
diff --git a/platform/linux-generic/include/api/odp.h b/platform/linux-generic/include/api/odp.h
index 0ee3faf..d124d52 100644
--- a/platform/linux-generic/include/api/odp.h
+++ b/platform/linux-generic/include/api/odp.h
@@ -32,6 +32,7 @@  extern "C" {
 #include <odp_barrier.h>
 #include <odp_spinlock.h>
 #include <odp_atomic.h>
+#include <odp_counter.h>
 
 #include <odp_init.h>
 #include <odp_system_info.h>
diff --git a/platform/linux-generic/include/api/odp_atomic.h b/platform/linux-generic/include/api/odp_atomic.h
index 0cc4cf4..ccaad02 100644
--- a/platform/linux-generic/include/api/odp_atomic.h
+++ b/platform/linux-generic/include/api/odp_atomic.h
@@ -4,464 +4,494 @@ 
  * SPDX-License-Identifier:     BSD-3-Clause
  */
 
-
 /**
  * @file
  *
- * ODP atomic operations
+ * ODP atomic types and operations, semantically a subset of C11 atomics.
+ * Scalar variable wrapped in a struct to avoid accessing scalar directly
+ * without using the required access functions.
+ * Atomic functions must be used to operate on atomic variables!
  */
 
 #ifndef ODP_ATOMIC_H_
 #define ODP_ATOMIC_H_
 
+#include <stdint.h>
+#include <odp_align.h>
+#include <odp_hints.h>
+#include <odp_debug.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-
-#include <odp_std_types.h>
-
-
-/**
- * Atomic integer
- */
-typedef volatile int32_t odp_atomic_int_t;
-
-/**
- * Atomic unsigned integer 64 bits
- */
-typedef volatile uint64_t odp_atomic_u64_t;
-
-/**
- * Atomic unsigned integer 32 bits
- */
-typedef volatile uint32_t odp_atomic_u32_t;
-
-
-/**
- * Initialize atomic integer
- *
- * @param ptr    An integer atomic variable
- *
- * @note The operation is not synchronized with other threads
- */
-static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
-{
-	*ptr = 0;
-}
-
-/**
- * Load value of atomic integer
- *
- * @param ptr    An atomic variable
- *
- * @return atomic integer value
- *
- * @note The operation is not synchronized with other threads
- */
-static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
-{
-	return *ptr;
-}
-
-/**
- * Store value to atomic integer
- *
- * @param ptr        An atomic variable
- * @param new_value  Store new_value to a variable
- *
- * @note The operation is not synchronized with other threads
- */
-static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int new_value)
-{
-	*ptr = new_value;
-}
-
-/**
- * Fetch and add atomic integer
- *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int value)
-{
-	return __sync_fetch_and_add(ptr, value);
-}
-
-/**
- * Fetch and subtract atomic integer
- *
- * @param ptr    An atomic integer variable
- * @param value  A value to be subtracted from the variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int value)
-{
-	return __sync_fetch_and_sub(ptr, value);
-}
-
-/**
- * Fetch and increment atomic integer by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
-{
-	return odp_atomic_fetch_add_int(ptr, 1);
-}
-
-/**
- * Increment atomic integer by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
-{
-	odp_atomic_fetch_add_int(ptr, 1);
-}
-
-/**
- * Fetch and decrement atomic integer by 1
- *
- * @param ptr    An atomic int variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
-{
-	return odp_atomic_fetch_sub_int(ptr, 1);
-}
-
-/**
- * Decrement atomic integer by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
-{
-	odp_atomic_fetch_sub_int(ptr, 1);
-}
-
-/**
- * Initialize atomic uint32
- *
- * @param ptr    An atomic variable
- *
- * @note The operation is not synchronized with other threads
- */
-static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
-{
-	*ptr = 0;
-}
-
-/**
- * Load value of atomic uint32
- *
- * @param ptr    An atomic variable
- *
- * @return atomic uint32 value
- *
- * @note The operation is not synchronized with other threads
- */
-static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
-{
-	return *ptr;
-}
-
-/**
- * Store value to atomic uint32
- *
- * @param ptr        An atomic variable
- * @param new_value  Store new_value to a variable
- *
- * @note The operation is not synchronized with other threads
- */
-static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
-					uint32_t new_value)
-{
-	*ptr = new_value;
-}
-
-/**
- * Fetch and add atomic uint32
- *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
- *
- * @return Value of the variable before the operation
- */
-static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
-						uint32_t value)
-{
-	return __sync_fetch_and_add(ptr, value);
-}
-
-/**
- * Fetch and subtract uint32
- *
- * @param ptr    An atomic variable
- * @param value  A value to be sub to the variable
- *
- * @return Value of the variable before the operation
- */
-static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
-						uint32_t value)
-{
-	return __sync_fetch_and_sub(ptr, value);
-}
-
 /**
- * Fetch and increment atomic uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-#if defined __OCTEON__
-
-static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
-{
-	uint32_t ret;
-
-	__asm__ __volatile__ ("syncws");
-	__asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
-			      "r" (ptr));
-
-	return ret;
-}
-
+ * 32-bit (unsigned) atomic type
+ */
+typedef struct {
+	uint32_t v; /**< Actual storage for the atomic variable */
+} odp_atomic32_t
+ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
+
+typedef enum {
+	/** Relaxed memory order, no ordering of other accesses enforced */
+	ODP_MEMORDER_RLX,
+	/** Acquire memory order, later accesses cannot move before
+	 * acquire operation */
+	ODP_MEMORDER_ACQ,
+	/** Release memory order, earlier accesses cannot move after
+	 * release operation */
+	ODP_MEMORDER_RLS
+} odp_memorder_t;
+
+/*****************************************************************************
+ * Just some private helpers
+ *****************************************************************************/
+
+#ifdef __OCTEON__
+/* OCTEON Write Memory Barrier */
+#define COMPILER_HW_BARRIER() __asm __volatile( \
+	/* Double syncw to work around errata */ \
+	"syncw\n\tsyncw" : : : )
 #else
-
-static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
-{
-	return odp_atomic_fetch_add_u32(ptr, 1);
-}
-
+/** Compiler and hardware full memory barrier */
+#define COMPILER_HW_BARRIER() __sync_synchronize()
+/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a */
 #endif
 
-/**
- * Increment atomic uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
-{
-	odp_atomic_fetch_add_u32(ptr, 1);
-}
-
-/**
- * Fetch and decrement uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
-{
-	return odp_atomic_fetch_sub_u32(ptr, 1);
-}
-
-/**
- * Decrement atomic uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
-{
-	odp_atomic_fetch_sub_u32(ptr, 1);
-}
-
-/**
- * Atomic compare and set for 32bit
- *
- * @param dst destination location into which the value will be written.
- * @param exp expected value.
- * @param src new value.
- * @return Non-zero on success; 0 on failure.
- */
-static inline int
-odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
-{
-	return __sync_bool_compare_and_swap(dst, exp, src);
+#define MEMORY "memory"
+
+/*****************************************************************************
+ * Operations on 32-bit atomics
+ * odp_atomic32_init - no return value
+ * odp_atomic32_load - return current value
+ * odp_atomic32_store - no return value
+ * odp_atomic32_cmp_xchg_weak - return bool
+ * odp_atomic32_fetch_add - return old value
+ * odp_atomic32_add - no return value
+ * odp_atomic32_fetch_inc - return old value
+ * odp_atomic32_inc - no return value
+ * odp_atomic32_fetch_dec - return old value
+ * odp_atomic32_dec - no return value
+ *****************************************************************************/
+
+static inline void odp_atomic32_init(odp_atomic32_t *ptr, uint32_t val)
+{
+	/* Write of aligned word is atomic */
+	/* Cast to volatile to force compiler to (re-) write variable, thus we
+	 * can avoid using compiler memory barriers */
+	*(__volatile uint32_t *)&ptr->v = val;
+}
+
+/**
+ * Atomic load of 32-bit atomic variable
+ *
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the load
+ * (ODP_MEMORDER_RLX or ODP_MEMORDER_ACQ)
+ *
+ * @return Value of the variable
+ */
+static inline uint32_t odp_atomic32_load(const odp_atomic32_t *ptr,
+		odp_memorder_t mmodel)
+{
+	if (mmodel == ODP_MEMORDER_RLX) {
+		uint32_t val;
+		/* Read of aligned word is atomic */
+		/* Cast to volatile to force compiler to (re-) read variable,
+		 * thus we can avoid using compiler memory barriers */
+		val = *(__volatile const uint32_t *)&ptr->v;
+		return val;
+	} else if (mmodel == ODP_MEMORDER_ACQ) {
+#if defined __aarch64__
+		uint32_t val;
+		__asm __volatile("ldar %w0, [%1]"
+				: "=&r"(val)
+				: "r"(&ptr->v)
+				: MEMORY);
+		return val;
+#elif defined __arm__  || defined __mips64__ || defined __x86_64__
+		/* Read of aligned word is atomic */
+		uint32_t val = ptr->v;
+		/* To prevent later accesses from moving up */
+		/* Herb Sutter claims HW barrier not needed on x86? */
+		COMPILER_HW_BARRIER();
+		return val;
+#else
+#warning odp_atomic32_load() may not be efficiently implemented
+		/* Assume read of aligned word is atomic */
+		uint32_t val = ptr->v;
+		/* To prevent later accesses from moving up */
+		COMPILER_HW_BARRIER();
+		return val;
+#endif
+	} else {
+		ODP_ABORT("Invalid memory model %u\n", mmodel);
+	}
+}
+
+/**
+ * Atomic store to 32-bit atomic variable
+ *
+ * @param ptr  Pointer to a 32-bit atomic variable
+ * @param val  Value to write to the atomic variable
+ * @param mmodel Memory model associated with the store
+ * (ODP_MEMORDER_RLX or ODP_MEMORDER_RLS)
+ */
+static inline void odp_atomic32_store(odp_atomic32_t *ptr,
+		uint32_t val,
+		odp_memorder_t mmodel)
+{
+	if (mmodel == ODP_MEMORDER_RLX) {
+		/* Write of aligned word is atomic */
+		/* Cast to volatile to force compiler to (re-) write variable,
+		 * thus we will avoid using compiler memory barriers */
+		*(__volatile uint32_t *)&ptr->v = val;
+	} else if (mmodel == ODP_MEMORDER_RLS) {
+#if defined __arm__ /* A32/T32 ISA */ || defined __mips64__
+		/* Compiler and HW barrier to prevent earlier accesses from
+		 * moving down */
+		COMPILER_HW_BARRIER();
+		/* Write of aligned word is atomic */
+		ptr->v = val;
+		/* Compiler and HW barrier to prevent this store from moving
+		 * down after a later load-acquire and thus create overlapping
+		 * critical sections. Herb Sutter thinks this is needed */
+		COMPILER_HW_BARRIER();
+#elif defined __aarch64__
+		__asm __volatile("stlr %w0, [%1]"
+				:
+				: "r"(val), "r"(&ptr->v)
+				: MEMORY);
+#elif defined __x86_64__
+		/* This is actually an atomic exchange operation */
+		/* Generates good code on x86_64 */
+		(void)__sync_lock_test_and_set(&ptr->v, val);
+#else
+#warning odp_atomic32_store_rls() may not be efficiently implemented
+		/* This is actually an atomic exchange operation */
+		(void)__sync_lock_test_and_set(&ptr->v, val);
+#endif
+	} else {
+		ODP_ABORT("Invalid memory model %u\n", mmodel);
+	}
+}
+
+
+/**
+ * Atomic compare and exchange (swap) of 32-bit atomic variable
+ * "Weak" semantics, may fail spuriously and must be used in a loop.
+ *
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param exp_p Pointer to expected value (updated on failure)
+ * @param val   New value to write
+ * @param mmodel Memory model associated with the compare-and-swap
+ * operation (ODP_MEMORDER_RLX only)
+ *
+ * @return 1 (true) if exchange successful, 0 (false) if not successful (and
+ * '*exp_p' updated with current value)
+ */
+static inline int odp_atomic32_cmp_xchg_weak(odp_atomic32_t *ptr,
+		uint32_t *exp_p,
+		uint32_t val,
+		odp_memorder_t mmodel)
+{
+	if (mmodel == ODP_MEMORDER_RLX) {
+#if defined __arm__ /* A32/T32 ISA */
+		uint32_t old;
+		uint32_t exp = *exp_p;
+		int status;
+		__asm __volatile("ldrex %0, [%2]\t\n"
+				 "cmp   %0, %3\t\n"
+				 "bne   1f\t\n"
+				 "strex %1, %4, [%2]\t\n"
+				 "1:\t\n"
+				: "=&r"(old), "=&r"(status)
+				: "r"(&ptr->v), "r"(exp), "r"(val)
+				: MEMORY);
+		if (odp_unlikely(old != exp)) {
+			/* Value has changed, can't proceed */
+			/* Clear exclusive access monitor */
+			__asm __volatile("clrex");
+			/* Return current value */
+			*exp_p = old;
+			return 0;
+		}
+		/* strex returns 0 on success */
+		if (odp_unlikely(status != 0)) {
+			/* strex failed, reservation was disturbed */
+			/* Return potentially changed value */
+			*exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
+			return 0;
+		}
+		return 1;
+#elif defined __mips64__
+		uint32_t old;
+		uint32_t exp = *exp_p;
+		uint32_t status = val;
+		__asm __volatile("llw %0, [%2]\t\n"
+				 "bne %0, %3, 1f\t\n"
+				 "scw %1, [%2]\t\n"
+				 "1:\t\n"
+				: "=&r"(old), "+&r"(status)
+				: "r"(&ptr->v), "r"(exp)
+				: MEMORY);
+		if (odp_unlikely(old != exp)) {
+			/* Value has changed, can't proceed */
+			/* Return current value */
+			*exp_p = old;
+			return 0;
+		}
+		/* scw returns 1 on success, 0 on failure */
+		if (odp_unlikely(status == 0)) {
+			/* scw failed, reservation was disturbed */
+			*exp_p = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
+			return 0;
+		}
+		return 1;
+#elif defined __x86_64__
+		uint32_t exp = *exp_p;
+		uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp, val);
+		if (odp_unlikely(old != exp)) {
+			/* Return the unexpected content of '*ptr' */
+			*exp_p = old;
+			return 0;
+		} else {
+			return 1;
+		}
+#else
+#warning odp_atomic32_cmp_xchg_weak() may not be efficiently implemented
+		uint32_t exp = *exp_p;
+		uint32_t old = __sync_val_compare_and_swap(&ptr->v, exp, val);
+		if (odp_unlikely(old != exp)) {
+			/* Return the unexpected content of '*ptr' */
+			*exp_p = old;
+			return 0;
+		} else {
+			return 1;
+		}
+#endif
+	} else {
+		ODP_ABORT("Invalid memory model %u\n", mmodel);
+	}
+}
+
+/**
+ * Atomic fetch and add to 32-bit atomic variable
+ * @note A - B <=> A + (-B)
+ *
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
+ * @param mmodel Memory model associated with the add
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
+ *
+ * @return Value of the atomic variable before the addition
+ */
+static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
+		uint32_t incr,
+		odp_memorder_t mmodel)
+{
+	if (mmodel == ODP_MEMORDER_RLX) {
+#if defined __arm__ /* A32/T32 ISA */
+		uint32_t old_val, tmp;
+		int status;
+		do {
+			__asm __volatile("ldrex %0, [%3]\t\n"
+					 "add   %1, %0, %4\t\n"
+					 "strex %2, %1, [%3]\t\n"
+					: "=&r"(old_val), "=&r"(tmp),
+					  "=&r"(status)
+					: "r"(&ptr->v), "r"(incr)
+					: MEMORY);
+		} while (odp_unlikely(status != 0));
+		return old_val;
+#elif defined __OCTEON__
+		uint32_t old_val;
+		__asm __volatile("laa %0,(%2),%3"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr), "r" (incr)
+				: MEMORY);
+		return old_val;
+#elif defined __x86_64__
+		/* Generates good code on x86_64 */
+		return __sync_fetch_and_add(&ptr->v, incr);
+#else
+#warning odp_atomic32_fetch_add() may not be efficiently implemented
+		return __sync_fetch_and_add(&ptr->v, incr);
+#endif
+	} else if (mmodel == ODP_MEMORDER_RLS) {
+#if defined __OCTEON__
+		uint32_t old_val;
+		COMPILER_HW_BARRIER();
+		__asm __volatile("laa %0,(%2),%3"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr), "r" (incr)
+				: MEMORY);
+		COMPILER_HW_BARRIER();
+		return old_val;
+#endif
+		/* __sync_fetch_and_add() will give us barriers before and
+		 * after, we are fine with this for release operations */
+		return __sync_fetch_and_add(&ptr->v, incr);
+	} else {
+		ODP_ABORT("Invalid memory model %u\n", mmodel);
+	}
 }
 
 /**
- * Initialize atomic uint64
+ * Atomic add to 32-bit atomic variable
  *
- * @param ptr    An atomic variable
- *
- * @note The operation is not synchronized with other threads
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
+ * @param mmodel Memory model associated with the add
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
  */
-static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
+static inline void odp_atomic32_add(odp_atomic32_t *ptr,
+		uint32_t incr,
+		odp_memorder_t mmodel)
 {
-	*ptr = 0;
+	if (mmodel == ODP_MEMORDER_RLX) {
+		/* Platforms that support atomic add instructions can add
+		 * their implementations here */
+#if defined __OCTEON__
+		__asm __volatile("saa %[inc], (%[base])"
+				: "+m" (*ptr)
+				: [inc] "r" (incr), [base] "r" (ptr)
+				: MEMORY);
+		return;
+#endif
+	} else if (mmodel == ODP_MEMORDER_RLS) {
+		/* Platforms that support atomic add instructions can add
+		 * their implementations here */
+#if defined __OCTEON__
+		COMPILER_HW_BARRIER();
+		__asm __volatile("saa %[inc], (%[base])"
+				: "+m" (*ptr)
+				: [inc] "r" (incr), [base] "r" (ptr)
+				: MEMORY);
+		COMPILER_HW_BARRIER();
+		return;
+#endif
+	}
+	/* Default to using odp_atomic32_fetch_add() */
+	(void)odp_atomic32_fetch_add(ptr, incr, mmodel);
 }
 
 /**
- * Load value of atomic uint64
- *
- * @param ptr    An atomic variable
+ * Atomic fetch and increment of 32-bit atomic variable
  *
- * @return atomic uint64 value
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the increment
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
  *
- * @note The operation is not synchronized with other threads
+ * @return Value of the atomic variable before the increment
  */
-static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
+static inline uint32_t odp_atomic32_fetch_inc(odp_atomic32_t *ptr,
+		odp_memorder_t mmodel)
 {
-	return *ptr;
+	if (mmodel == ODP_MEMORDER_RLX) {
+		/* Platforms that support atomic increment instructions can add
+		 * their implementations here */
+#if defined __OCTEON__
+		uint32_t old_val;
+		__asm __volatile("lai %0,(%2)"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr)
+				: MEMORY);
+		return old_val;
+#endif
+	} else if (mmodel == ODP_MEMORDER_RLS) {
+#if defined __OCTEON__
+		uint32_t old_val;
+		COMPILER_HW_BARRIER();
+		__asm __volatile("lai %0,(%2)"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr)
+				: MEMORY);
+		COMPILER_HW_BARRIER();
+		return old_val;
+#endif
+	}
+	/* Default to using odp_atomic32_fetch_add() */
+	return odp_atomic32_fetch_add(ptr, 1, mmodel);
 }
 
 /**
- * Store value to atomic uint64
- *
- * @param ptr        An atomic variable
- * @param new_value  Store new_value to a variable
+ * Atomic increment of 32-bit atomic variable
  *
- * @note The operation is not synchronized with other threads
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the increment
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
  */
-static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
-					uint64_t new_value)
-{
-	*ptr = new_value;
-}
+static inline void odp_atomic32_inc(odp_atomic32_t *ptr,
+		odp_memorder_t mmodel)
 
-/**
- * Add atomic uint64
- *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
- *
- */
-static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t value)
 {
-	__sync_fetch_and_add(ptr, value);
+	/* Default to using odp_atomic32_fetch_inc() */
+	/* Platforms that support atomic increment instructions can add
+	 * their implementations here */
+	(void)odp_atomic32_fetch_inc(ptr, mmodel);
 }
 
 /**
- * Fetch and add atomic uint64
+ * Atomic fetch and decrement of 32-bit atomic variable
  *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param mmodel Memory model associated with the decrement
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
  *
- * @return Value of the variable before the operation
+ * @return Value of the atomic variable before the decrement
  */
-
-#if defined __powerpc__ && !defined __powerpc64__
-static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
+static inline uint32_t odp_atomic32_fetch_dec(odp_atomic32_t *ptr,
+		odp_memorder_t mmodel)
 {
-	return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
-				    (uint32_t)value);
-}
-#else
-static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
-{
-	return __sync_fetch_and_add(ptr, value);
-}
+	if (mmodel == ODP_MEMORDER_RLX) {
+		/* Platforms that support atomic decrement instructions can add
+		 * their implementations here */
+#if defined __OCTEON__
+		uint32_t old_val;
+		__asm __volatile("lad %0,(%2)"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr)
+				: MEMORY);
+		return old_val;
 #endif
-/**
- * Subtract atomic uint64
- *
- * @param ptr    An atomic variable
- * @param value  A value to be subtracted from the variable
- *
- */
-static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t value)
-{
-	__sync_fetch_and_sub(ptr, value);
-}
-
-/**
- * Fetch and subtract atomic uint64
- *
- * @param ptr    An atomic variable
- * @param value  A value to be subtracted from the variable
- *
- * @return Value of the variable before the operation
- */
-#if defined __powerpc__ && !defined __powerpc64__
-static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
-{
-	return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
-				    (uint32_t)value);
-}
-#else
-static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
-{
-	return __sync_fetch_and_sub(ptr, value);
-}
+	} else if (mmodel == ODP_MEMORDER_RLS) {
+#if defined __OCTEON__
+		uint32_t old_val;
+		COMPILER_HW_BARRIER();
+		__asm __volatile("lad %0,(%2)"
+				: "=r" (old_val), "+m" (ptr)
+				: "r" (ptr)
+				: MEMORY);
+		COMPILER_HW_BARRIER();
+		return old_val;
 #endif
-/**
- * Fetch and increment atomic uint64 by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
-{
-	return odp_atomic_fetch_add_u64(ptr, 1);
-}
-
-/**
- * Increment atomic uint64 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
-{
-	odp_atomic_fetch_add_u64(ptr, 1);
+	}
+	/* Default to using odp_atomic32_fetch_add() */
+	return odp_atomic32_fetch_add(ptr, (uint32_t)-1, mmodel);
 }
 
 /**
- * Fetch and decrement atomic uint64 by 1
+ * Atomic decrement of 32-bit atomic variable
  *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
+ * @param ptr      Pointer to a 32-bit atomic variable
+ * @param memorder Memory model associated with the decrement
+ * operation (ODP_MEMORDER_RLX, ODP_MEMORDER_RLS).
  */
-static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
-{
-	return odp_atomic_fetch_sub_u64(ptr, 1);
-}
+static inline void odp_atomic32_dec(odp_atomic32_t *ptr,
+		odp_memorder_t memorder)
 
-/**
- * Decrement atomic uint64 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
 {
-	odp_atomic_fetch_sub_u64(ptr, 1);
+	/* Default to using odp_atomic32_fetch_dec() */
+	/* Platforms that support atomic decrement instructions can add
+	 * their implementations here */
+	(void)odp_atomic32_fetch_dec(ptr, memorder);
 }
 
-/**
- * Atomic compare and set for 64bit
- *
- * @param dst destination location into which the value will be written.
- * @param exp expected value.
- * @param src new value.
- * @return Non-zero on success; 0 on failure.
- */
-static inline int
-odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
-{
-	return __sync_bool_compare_and_swap(dst, exp, src);
-}
+/* We are not exporting this macro */
+#undef COMPILER_HW_BARRIER
+#undef MEMORY
 
 #ifdef __cplusplus
 }
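
For reference, a minimal sketch of how the new compare-and-exchange API is meant
to be used: odp_atomic32_cmp_xchg_weak() may fail spuriously and refreshes
'*exp_p' on failure, so callers retry in a loop. The atomic32_max_rlx() helper
below is hypothetical and not part of this patch.

	/* Hypothetical helper: atomically record a new maximum, relaxed order */
	static inline void atomic32_max_rlx(odp_atomic32_t *ptr, uint32_t val)
	{
		uint32_t old = odp_atomic32_load(ptr, ODP_MEMORDER_RLX);
		/* On failure 'old' is refreshed with the current value, retry */
		while (old < val &&
		       !odp_atomic32_cmp_xchg_weak(ptr, &old, val,
						   ODP_MEMORDER_RLX))
			;
	}
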
diff --git a/platform/linux-generic/include/api/odp_barrier.h b/platform/linux-generic/include/api/odp_barrier.h
index a7b3215..69b1eb8 100644
--- a/platform/linux-generic/include/api/odp_barrier.h
+++ b/platform/linux-generic/include/api/odp_barrier.h
@@ -27,18 +27,18 @@  extern "C" {
  * ODP execution barrier
  */
 typedef struct odp_barrier_t {
-	int              count;  /**< @private Thread count */
-	odp_atomic_int_t bar;    /**< @private Barrier counter */
+	uint32_t       num_threads;  /**< @private Thread count (constant) */
+	odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
 } odp_barrier_t;
 
 
 /**
  * Init barrier with thread count
  *
- * @param barrier    Barrier
- * @param count      Thread count
+ * @param barrier     Barrier
+ * @param num_threads Number of threads which share the barrier
  */
-void odp_barrier_init_count(odp_barrier_t *barrier, int count);
+void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
 
 
 /**
diff --git a/platform/linux-generic/include/api/odp_counter.h b/platform/linux-generic/include/api/odp_counter.h
new file mode 100644
index 0000000..f937d27
--- /dev/null
+++ b/platform/linux-generic/include/api/odp_counter.h
@@ -0,0 +1,363 @@ 
+/* Copyright (c) 2013, Linaro Limited
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier:     BSD-3-Clause
+ */
+
+/**
+ * @file
+ *
+ * ODP atomic counter types and operations, suitable for e.g. shared statistics.
+ * Relaxed memory model assumed for lowest overhead.
+ * The scalar value is wrapped in a struct so that it cannot be accessed
+ * directly, bypassing the required access functions.
+ * Counter variables must only be operated on through these functions!
+ */
+
+#ifndef ODP_COUNTER_H_
+#define ODP_COUNTER_H_
+
+#include <stdint.h>
+#include <odp_align.h>
+#include <odp_hints.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * 32-bit (unsigned) atomic counter type
+ */
+typedef struct {
+	uint32_t v; /**< Actual storage for the counter variable */
+} odp_counter32_t
+ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
+
+/**
+ * 64-bit (unsigned) atomic counter type
+ */
+typedef struct {
+	uint64_t v; /**< Actual storage for the counter variable */
+	/* Room for other data structures (e.g. spin lock) that might be
+	 * needed to ensure atomicity on some architectures */
+} odp_counter64_t
+ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
+
+/*****************************************************************************
+ * Operations on 32-bit atomic counters
+ * odp_counter32_init - returns no value
+ * odp_counter32_read - returns current value
+ * odp_counter32_write - returns no value
+ * odp_counter32_add - returns no value
+ * odp_counter32_read_inc - returns old value
+ * odp_counter32_inc - returns no value
+ *****************************************************************************/
+
+/**
+ * Initialize 32-bit counter variable
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ * @param val   Initial value
+ */
+static inline void odp_counter32_init(odp_counter32_t *ptr, uint32_t val)
+{
+	/* No implementation requires any other type of initialization */
+	*(__volatile uint32_t *)&ptr->v = val;
+}
+
+/**
+ * Read 32-bit counter variable
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ *
+ * @return Value of the variable
+ */
+static inline uint32_t odp_counter32_read(const odp_counter32_t *ptr)
+{
+	uint32_t val;
+	/* Read of aligned word is atomic */
+	/* Cast to volatile to force compiler to (re-) read variable, thus we
+	 * will avoid using compiler memory barriers */
+	val = *(__volatile const uint32_t *)&ptr->v;
+	return val;
+}
+
+/**
+ * Write 32-bit counter variable
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ * @param val   Value to write to the variable
+ */
+static inline void odp_counter32_write(odp_counter32_t *ptr, uint32_t val)
+{
+	/* Write of aligned word is atomic */
+	/* Cast to volatile to force compiler to (re-) write variable, thus we
+	 * will avoid using compiler memory barriers */
+	*(__volatile uint32_t *)&ptr->v = val;
+}
+
+/**
+ * Atomic add to 32-bit counter variable
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ * @param incr  The value to be added to the counter variable
+ */
+static inline void odp_counter32_add(odp_counter32_t *ptr, uint32_t incr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint32_t result;
+	int status;
+	do {
+		__asm __volatile("ldrex %0, [%2]\t\n"
+				 "add   %0, %0, %3\t\n"
+				 "strex %1, %0, [%2]"
+				 : "=&r"(result), "=&r"(status)
+				 : "r"(&ptr->v), "Ir" (incr)
+				 : );
+	} while (odp_unlikely(status != 0));
+#elif defined __OCTEON__
+	__asm __volatile("saa %[inc], (%[base])"
+			 : "+m" (*ptr)
+			 : [inc] "r" (incr), [base] "r" (ptr)
+			 : );
+#elif defined __x86_64__
+	/* Generates good code on x86_64 */
+	(void)__sync_fetch_and_add(&ptr->v, incr);
+#else
+	/* Warning odp_counter32_add() may not be efficiently implemented */
+	(void)__sync_fetch_and_add(&ptr->v, incr);
+#endif
+}
+
+/**
+ * Atomic increment (+1) of 32-bit counter variable, return original value
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ *
+ * @return Original value of counter
+ */
+static inline uint32_t odp_counter32_read_inc(odp_counter32_t *ptr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint32_t result, tmp;
+	int status;
+	do {
+		__asm __volatile("ldrex %0, [%3]\t\n"
+				 "add   %1, %0, #1\t\n"
+				 "strex %2, %1, [%3]"
+				 : "=&r"(result), "=&r"(tmp), "=&r"(status)
+				 : "r"(&ptr->v)
+				 : );
+	} while (odp_unlikely(status != 0));
+	return result;
+#elif defined __OCTEON__
+	uint32_t old_val;
+	__asm __volatile("lai %0,(%2)"
+			 : "=r" (old_val), "+m" (ptr)
+			 : "r" (ptr)
+			 : );
+	return old_val;
+#elif defined __x86_64__
+	return __sync_fetch_and_add(&ptr->v, 1);
+#else
+/* Warning odp_counter32_read_inc() may not be efficiently implemented */
+	return __sync_fetch_and_add(&ptr->v, 1);
+#endif
+}
+
+/**
+ * Atomic increment (+1) of 32-bit counter variable
+ *
+ * @param ptr   Pointer to a 32-bit counter variable
+ */
+static inline void odp_counter32_inc(odp_counter32_t *ptr)
+{
+#if defined __OCTEON__
+	odp_counter32_add(ptr, 1);
+#else
+	(void)odp_counter32_read_inc(ptr);
+#endif
+}
+
+/*****************************************************************************
+ * Operations on 64-bit atomic counters
+ * odp_counter64_init
+ * odp_counter64_read
+ * odp_counter64_write
+ * odp_counter64_add
+ * odp_counter64_read_inc
+ * odp_counter64_inc
+ *****************************************************************************/
+
+/**
+ * Read 64-bit counter variable
+ *
+ * @param ptr   Pointer to a 64-bit counter variable
+ *
+ * @return Value of the counter variable
+ */
+static inline uint64_t odp_counter64_read(const odp_counter64_t *ptr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t val;
+	__asm __volatile("ldrexd %0, %H0, [%1]\n\t"
+			 "clrex" /* Clear exclusive access monitor */
+			 : "=&r"(val)
+			 : "r"(&ptr->v)
+			 : );
+	return val;
+#elif defined __x86_64__ || defined __aarch64__
+	/* Read of aligned quad/double word is atomic */
+	return ptr->v;
+#else
+/* Warning odp_counter64_read() may not be efficiently implemented */
+	return __sync_fetch_and_or(&ptr->v, 0);
+#endif
+}
+
+/**
+ * Write 64-bit counter variable
+ *
+ * @param ptr  Pointer to a 64-bit counter variable
+ * @param val  Value to write to the counter variable
+ */
+static inline void odp_counter64_write(odp_counter64_t *ptr, uint64_t val)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t old_val;
+	int status;
+	do {
+		/* Read counter variable exclusively so we can write to it
+		 * later */
+		/* Attempt to write the new value */
+		__asm __volatile("ldrexd %0, %H0, [%2]\t\n"
+				 "strexd %1, %3, %H3, [%2]"
+				 : "=&r"(old_val), "=&r"(status)
+				 : "r"(&ptr->v), "r"(val)
+				 : );
+	} while (odp_unlikely(status != 0)); /* Retry until write succeeds */
+#elif defined __x86_64__ || defined __aarch64__
+	/* Write of aligned quad/double word is atomic */
+	ptr->v = val;
+#else
+/* Warning odp_counter64_write() may not be efficiently implemented */
+	/* This is actually an atomic exchange operation */
+	(void)__sync_lock_test_and_set(&ptr->v, val);
+#endif
+}
+
+/**
+ * Initialize 64-bit counter variable
+ * Perform implementation specific initializations, assign initial value.
+ *
+ * @param ptr   Pointer to a 64-bit counter variable
+ * @param val   Initial value
+ */
+static inline void odp_counter64_init(odp_counter64_t *ptr, uint64_t val)
+{
+	/* No implementation requires any other type of initialization */
+	odp_counter64_write(ptr, val);
+}
+
+/**
+ * Atomic add to 64-bit counter variable
+ *
+ * @param ptr   Pointer to a 64-bit counter variable
+ * @param incr  The value to be added to the counter variable
+ */
+static inline void odp_counter64_add(odp_counter64_t *ptr, uint64_t incr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t old_val;
+	int status;
+	do {
+		__asm __volatile("ldrexd %0, %H0, [%2]\t\n"
+				 "adds   %0, %0, %3\t\n"
+				 "adc    %H0, %H3\t\n"
+				 "strexd %1, %0, %H0, [%2]"
+				 : "=&r"(old_val), "=&r"(status)
+				 : "r"(&ptr->v), "r"(incr)
+				 : );
+	} while (odp_unlikely(status != 0)); /* Retry until write succeeds */
+#elif defined __OCTEON__
+	__asm __volatile("saad %[inc], (%[base])"
+			 : "+m" (*ptr)
+			 : [inc] "r" (incr), [base] "r" (ptr)
+			 : );
+#elif defined __x86_64__
+	/* Generates good code on x86_64 */
+	(void)__sync_fetch_and_add(&ptr->v, incr);
+#else
+/* Warning odp_counter64_add() may not be efficiently implemented */
+	(void)__sync_fetch_and_add(&ptr->v, incr);
+#endif
+}
+
+
+/**
+ * Atomic increment (+1) of 64-bit counter variable, return original value
+ *
+ * @param ptr   Pointer to a 64-bit counter variable
+ *
+ * @return Original value of counter
+ */
+static inline uint64_t odp_counter64_read_inc(odp_counter64_t *ptr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t old_val, tmp;
+	int status;
+	do {
+		__asm __volatile("ldrexd %0, %H0, [%3]\t\n"
+				 "adds   %2, %0, #1\t\n"
+				 "adc    %H2, %H0, #0\t\n"
+				 "strexd %1, %2, %H2, [%3]"
+				 : "=&r"(old_val), "=&r"(status), "=&r"(tmp)
+				 : "r"(&ptr->v)
+				 : );
+	} while (odp_unlikely(status != 0)); /* Retry until write succeeds */
+	return old_val;
+#elif defined __OCTEON__
+	uint64_t old_val;
+	__asm __volatile("laid %0,(%2)"
+			: "=r" (old_val), "+m" (ptr)
+			: "r" (ptr)
+			: );
+	return old_val;
+#elif defined __x86_64__
+	/* Generates good code on x86_64 */
+	return __sync_fetch_and_add(&ptr->v, 1);
+#else
+/* Warning odp_counter64_read_inc() may not be efficiently implemented */
+	return __sync_fetch_and_add(&ptr->v, 1);
+#endif
+}
+
+/**
+ * Atomic increment (+1) of 64-bit counter variable
+ *
+ * @param ptr   Pointer to a 64-bit counter variable
+ */
+static inline void odp_counter64_inc(odp_counter64_t *ptr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t old_val;
+	int status;
+	do {
+		__asm __volatile("ldrexd %0, %H0, [%2]\t\n"
+				 "adds   %0, #1\t\n"
+				 "adc    %H0, #0\t\n"
+				 "strexd %1, %0, %H0, [%2]"
+				 : "=&r"(old_val), "=&r"(status)
+				 : "r"(&ptr->v)
+				 : );
+	} while (odp_unlikely(status != 0)); /* Retry until write succeeds */
+#else
+	(void)odp_counter64_read_inc(ptr);
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
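
To illustrate the intended use of the relaxed counters for shared statistics
(the port_stats structure and its functions are hypothetical, not part of this
patch):

	struct port_stats {
		odp_counter64_t rx_octets;  /* total bytes received */
		odp_counter32_t rx_frames;  /* total frames received */
	};

	static void stats_init(struct port_stats *s)
	{
		odp_counter64_init(&s->rx_octets, 0);
		odp_counter32_init(&s->rx_frames, 0);
	}

	static void stats_update(struct port_stats *s, uint32_t len)
	{
		/* Relaxed counters: atomic updates, no memory ordering */
		odp_counter64_add(&s->rx_octets, len);
		odp_counter32_inc(&s->rx_frames);
	}
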
diff --git a/platform/linux-generic/include/api/odp_rwlock.h b/platform/linux-generic/include/api/odp_rwlock.h
index 252ebb2..ff8a9a2 100644
--- a/platform/linux-generic/include/api/odp_rwlock.h
+++ b/platform/linux-generic/include/api/odp_rwlock.h
@@ -10,26 +10,30 @@ 
 /**
  * @file
  *
- * ODP RW Locks
+ * ODP read/write lock
+ * RW locks support multiple concurrent readers but only one (exclusive) writer.
  */
 
+#include <odp_atomic.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /**
  * The odp_rwlock_t type.
- * write lock count is -1,
- * read lock count > 0
+ * write lock is ~0U
+ * read lock count >0 && <~0U
  */
 typedef struct {
-	volatile int32_t cnt; /**< -1 Write lock,
-				> 0 for Read lock. */
+	odp_atomic32_t cnt; /**< == 0: unlocked,
+				 == ~0: locked for write,
+				 > 0 number of concurrent read locks */
 } odp_rwlock_t;
 
 
 /**
- * Initialize the rwlock to an unlocked state.
+ * Initialize the rwlock to the unlocked state.
  *
  * @param rwlock pointer to the RW Lock.
  */
@@ -50,14 +54,14 @@  void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
 void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
 
 /**
- * Aquire a write lock.
+ * Acquire the write lock.
  *
  * @param rwlock pointer to a RW Lock.
  */
 void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
 
 /**
- * Release a write lock.
+ * Release the write lock.
  *
  * @param rwlock pointer to a RW Lock.
  */
diff --git a/platform/linux-generic/include/api/odp_ticketlock.h b/platform/linux-generic/include/api/odp_ticketlock.h
index 6277a18..5933f85 100644
--- a/platform/linux-generic/include/api/odp_ticketlock.h
+++ b/platform/linux-generic/include/api/odp_ticketlock.h
@@ -21,14 +21,15 @@  extern "C" {
 
 #include <odp_std_types.h>
 #include <odp_atomic.h>
+#include <odp_counter.h>
 
 
 /**
  * ODP ticketlock
  */
 typedef struct odp_ticketlock_t {
-	odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
-	volatile uint32_t cur_ticket;  /**< @private Current ticket */
+	odp_counter32_t next_ticket; /**< @private Next ticket */
+	odp_atomic32_t cur_ticket;  /**< @private Current ticket */
 } odp_ticketlock_t;
 
 
diff --git a/platform/linux-generic/include/odp_buffer_internal.h b/platform/linux-generic/include/odp_buffer_internal.h
index 2002b51..530ab96 100644
--- a/platform/linux-generic/include/odp_buffer_internal.h
+++ b/platform/linux-generic/include/odp_buffer_internal.h
@@ -88,7 +88,7 @@  typedef struct odp_buffer_hdr_t {
 	uint32_t                 index;	     /* buf index in the pool */
 	size_t                   size;       /* max data size */
 	size_t                   cur_offset; /* current offset */
-	odp_atomic_int_t         ref_count;  /* reference count */
+	odp_atomic32_t           ref_count;  /* reference count */
 	odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
 	int                      type;       /* type of next header */
 	odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
diff --git a/platform/linux-generic/include/odp_spin_internal.h b/platform/linux-generic/include/odp_spin_internal.h
index b7e2071..29c524f 100644
--- a/platform/linux-generic/include/odp_spin_internal.h
+++ b/platform/linux-generic/include/odp_spin_internal.h
@@ -15,15 +15,6 @@  extern "C" {
 
 
 /**
- * GCC memory barrier for ODP internal use
- */
-static inline void odp_mem_barrier(void)
-{
-	__asm__ __volatile__ ("" : : : "memory");
-}
-
-
-/**
  * Spin loop for ODP internal use
  */
 static inline void odp_spin(void)
diff --git a/platform/linux-generic/odp_barrier.c b/platform/linux-generic/odp_barrier.c
index a82b294..10368b5 100644
--- a/platform/linux-generic/odp_barrier.c
+++ b/platform/linux-generic/odp_barrier.c
@@ -8,41 +8,52 @@ 
 #include <odp_sync.h>
 #include <odp_spin_internal.h>
 
-void odp_barrier_init_count(odp_barrier_t *barrier, int count)
+void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
 {
-	barrier->count = count;
-	barrier->bar = 0;
-	odp_sync_stores();
+	barrier->num_threads = num_threads; /* Constant after initialisation */
+	odp_atomic32_init(&barrier->in_barrier, 0);
 }
 
 /*
  * Efficient barrier_sync -
  *
  *   Barriers are initialized with a count of the number of callers
- *   that must sync on the barrier before any may proceed.
+ *   that must sync on (enter) the barrier before any may proceed (exit).
  *
  *   To avoid race conditions and to permit the barrier to be fully
- *   reusable, the barrier value cycles between 0..2*count-1. When
- *   synchronizing the wasless variable simply tracks which half of
+ *   reusable, the barrier value cycles between 0..2*count-1 (temporarily
+ *   hitting 2*count before being wrapped). When
+ *   synchronizing, the waslow variable simply tracks which half of
  *   the cycle the barrier was in upon entry.  Exit is when the
  *   barrier crosses to the other half of the cycle.
  */
 
 void odp_barrier_sync(odp_barrier_t *barrier)
 {
-	int count;
-	int wasless;
+	uint32_t count;
+	bool waslow;
 
-	odp_sync_stores();
-	wasless = barrier->bar < barrier->count;
-	count = odp_atomic_fetch_inc_int(&barrier->bar);
+	/* We need release semantics on entry (publish our earlier stores) and
+	 * acquire semantics on exit (later loads cannot move up). */
 
-	if (count == 2*barrier->count-1) {
-		barrier->bar = 0;
-	} else {
-		while ((barrier->bar < barrier->count) == wasless)
-			odp_spin();
-	}
+	/* Increase threads in_barrier count, this will automatically release
+	 * the other threads when lower/upper range is switched */
+	count = odp_atomic32_fetch_add(&barrier->in_barrier, 1,
+				       ODP_MEMORDER_RLS);
+	/* Compute lower or higher range indicator */
+	waslow = count < barrier->num_threads;
 
-	odp_mem_barrier();
+	/* Check if in_barrier count should wrap */
+	if (count == 2 * barrier->num_threads - 1) {
+		/* Manually wrap the counter */
+		odp_atomic32_add(&barrier->in_barrier,
+				 -2 * barrier->num_threads,
+				 ODP_MEMORDER_RLX);
+		/* Fall-through the final part for the acquire barrier */
+	}
+	/* Wait for counter to change half */
+	while ((odp_atomic32_load(&barrier->in_barrier, ODP_MEMORDER_ACQ) <
+	       barrier->num_threads) == waslow) {
+		odp_spin();
+	}
 }
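
A minimal usage sketch of the reworked barrier (worker_main() and NUM_WORKERS
are hypothetical): every one of the NUM_WORKERS threads must enter
odp_barrier_sync() before any of them may leave it, and the same barrier object
can be reused for the next synchronization round.

	static odp_barrier_t barrier;	/* main thread calls
					 * odp_barrier_init(&barrier, NUM_WORKERS) once */

	static void worker_main(void)
	{
		/* ... per-thread setup ... */
		odp_barrier_sync(&barrier);	/* wait until all workers arrive */
		/* ... parallel work; the barrier may be reused next round ... */
	}
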
diff --git a/platform/linux-generic/odp_buffer.c b/platform/linux-generic/odp_buffer.c
index e54e0e7..fc3506b 100644
--- a/platform/linux-generic/odp_buffer.c
+++ b/platform/linux-generic/odp_buffer.c
@@ -73,7 +73,8 @@  int odp_buffer_snprint(char *str, size_t n, odp_buffer_t buf)
 	len += snprintf(&str[len], n-len,
 			"  cur_offset   %zu\n",       hdr->cur_offset);
 	len += snprintf(&str[len], n-len,
-			"  ref_count    %i\n",        hdr->ref_count);
+			"  ref_count    %u\n",
+			odp_atomic32_load(&hdr->ref_count, ODP_MEMORDER_RLX));
 	len += snprintf(&str[len], n-len,
 			"  type         %i\n",        hdr->type);
 	len += snprintf(&str[len], n-len,
diff --git a/platform/linux-generic/odp_crypto.c b/platform/linux-generic/odp_crypto.c
index b37ad6b..75b4ce0 100644
--- a/platform/linux-generic/odp_crypto.c
+++ b/platform/linux-generic/odp_crypto.c
@@ -6,7 +6,7 @@ 
 
 #include <odp_crypto.h>
 #include <odp_internal.h>
-#include <odp_atomic.h>
+#include <odp_counter.h>
 #include <odp_spinlock.h>
 #include <odp_sync.h>
 #include <odp_debug.h>
@@ -26,7 +26,7 @@ 
 #define MAX_SESSIONS 32
 
 typedef struct {
-	odp_atomic_u32_t next;
+	odp_counter32_t   next;
 	uint32_t         max;
 	odp_crypto_generic_session_t sessions[0];
 } odp_crypto_global_t;
@@ -58,7 +58,7 @@  odp_crypto_generic_session_t *alloc_session(void)
 	uint32_t idx;
 	odp_crypto_generic_session_t *session = NULL;
 
-	idx = odp_atomic_fetch_inc_u32(&global->next);
+	idx = odp_counter32_read_inc(&global->next);
 	if (idx < global->max) {
 		session = &global->sessions[idx];
 		session->index = idx;
@@ -420,6 +420,7 @@  odp_crypto_init_global(void)
 
 	/* Initialize it */
 	global->max = MAX_SESSIONS;
+	odp_counter32_init(&global->next, 0);
 
 	return 0;
 }
diff --git a/platform/linux-generic/odp_queue.c b/platform/linux-generic/odp_queue.c
index 1318bcd..08c0d29 100644
--- a/platform/linux-generic/odp_queue.c
+++ b/platform/linux-generic/odp_queue.c
@@ -214,8 +214,13 @@  int odp_queue_set_context(odp_queue_t handle, void *context)
 {
 	queue_entry_t *queue;
 	queue = queue_to_qentry(handle);
+	/* Setting a new queue context can be viewed as a release operation,
+	 * all writes to the context must be observable before the context
+	 * is made observable */
 	odp_sync_stores();
-	queue->s.param.context = context;
+	queue->s.param.context = context; /* Store-release */
+	/* Ensure queue modification is globally visible before we return
+	 * and the application might cause the queue to be scheduled */
 	odp_sync_stores();
 	return 0;
 }
diff --git a/platform/linux-generic/odp_ring.c b/platform/linux-generic/odp_ring.c
index 632aa66..e5b9c23 100644
--- a/platform/linux-generic/odp_ring.c
+++ b/platform/linux-generic/odp_ring.c
@@ -187,10 +187,10 @@  odph_ring_create(const char *name, unsigned count, unsigned flags)
 		r->cons.size = count;
 		r->prod.mask = count-1;
 		r->cons.mask = count-1;
-		r->prod.head = 0;
-		r->cons.head = 0;
-		r->prod.tail = 0;
-		r->cons.tail = 0;
+		odp_atomic32_init(&r->prod.head, 0);
+		odp_atomic32_init(&r->cons.head, 0);
+		odp_atomic32_init(&r->prod.tail, 0);
+		odp_atomic32_init(&r->cons.tail, 0);
 
 		TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
 	} else {
@@ -227,7 +227,7 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	uint32_t prod_head, prod_next;
 	uint32_t cons_tail, free_entries;
 	const unsigned max = n;
-	int success;
+	bool success;
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 	int ret;
@@ -237,8 +237,8 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		/* Reset n to the initial burst count */
 		n = max;
 
-		prod_head = r->prod.head;
-		cons_tail = r->cons.tail;
+		prod_head = odp_atomic32_load(&r->prod.head, ODP_MEMORDER_RLX);
+		cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_ACQ);
 		/* The subtraction is done between two unsigned 32bits value
 		 * (the result is always modulo 32 bits even if we have
 		 * prod_head > cons_tail). So 'free_entries' is always between 0
@@ -259,13 +259,14 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		}
 
 		prod_next = prod_head + n;
-		success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
-					      prod_next);
-	} while (odp_unlikely(success == 0));
+		success = odp_atomic32_cmp_xchg_weak(&r->prod.head,
+						     &prod_head,
+						     prod_next,
+						     ODP_MEMORDER_RLX);
+	} while (odp_unlikely(!success));
 
 	/* write entries in ring */
 	ENQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/* if we exceed the watermark */
 	if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
@@ -279,10 +280,11 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	 * If there are other enqueues in progress that preceeded us,
 	 * we need to wait for them to complete
 	 */
-	while (odp_unlikely(r->prod.tail != prod_head))
+	while (odp_unlikely(odp_atomic32_load(&r->prod.tail,
+					      ODP_MEMORDER_RLX) != prod_head))
 		odp_spin();
 
-	r->prod.tail = prod_next;
+	odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
 	return ret;
 }
 
@@ -298,8 +300,8 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	uint32_t mask = r->prod.mask;
 	int ret;
 
-	prod_head = r->prod.head;
-	cons_tail = r->cons.tail;
+	prod_head = odp_atomic32_load(&r->prod.head, ODP_MEMORDER_RLX);
+	cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_ACQ);
 	/* The subtraction is done between two unsigned 32bits value
 	 * (the result is always modulo 32 bits even if we have
 	 * prod_head > cons_tail). So 'free_entries' is always between 0
@@ -320,11 +322,10 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	}
 
 	prod_next = prod_head + n;
-	r->prod.head = prod_next;
+	odp_atomic32_store(&r->prod.head, prod_next, ODP_MEMORDER_RLX);
 
 	/* write entries in ring */
 	ENQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/* if we exceed the watermark */
 	if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
@@ -334,7 +335,7 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
 	}
 
-	r->prod.tail = prod_next;
+	odp_atomic32_store(&r->prod.tail, prod_next, ODP_MEMORDER_RLS);
 	return ret;
 }
 
@@ -348,7 +349,7 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 	uint32_t cons_head, prod_tail;
 	uint32_t cons_next, entries;
 	const unsigned max = n;
-	int success;
+	bool success;
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 
@@ -357,8 +358,8 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 		/* Restore n as it may change every loop */
 		n = max;
 
-		cons_head = r->cons.head;
-		prod_tail = r->prod.tail;
+		cons_head = odp_atomic32_load(&r->cons.head, ODP_MEMORDER_RLX);
+		prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_ACQ);
 		/* The subtraction is done between two unsigned 32bits value
 		 * (the result is always modulo 32 bits even if we have
 		 * cons_head > prod_tail). So 'entries' is always between 0
@@ -378,22 +379,24 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 		}
 
 		cons_next = cons_head + n;
-		success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
-					      cons_next);
-	} while (odp_unlikely(success == 0));
+		success = odp_atomic32_cmp_xchg_weak(&r->cons.head,
+						     &cons_head,
+						     cons_next,
+						     ODP_MEMORDER_RLX);
+	} while (odp_unlikely(!success));
 
 	/* copy in table */
 	DEQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/*
 	 * If there are other dequeues in progress that preceded us,
 	 * we need to wait for them to complete
 	 */
-	while (odp_unlikely(r->cons.tail != cons_head))
+	while (odp_unlikely(odp_atomic32_load(&r->cons.tail,
+					      ODP_MEMORDER_RLX) != cons_head))
 		odp_spin();
 
-	r->cons.tail = cons_next;
+	odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
 
 	return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
 }
@@ -409,8 +412,8 @@  int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 
-	cons_head = r->cons.head;
-	prod_tail = r->prod.tail;
+	cons_head = odp_atomic32_load(&r->cons.head, ODP_MEMORDER_RLX);
+	prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_ACQ);
 	/* The subtraction is done between two unsigned 32bits value
 	 * (the result is always modulo 32 bits even if we have
 	 * cons_head > prod_tail). So 'entries' is always between 0
@@ -429,13 +432,12 @@  int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
 	}
 
 	cons_next = cons_head + n;
-	r->cons.head = cons_next;
+	odp_atomic32_store(&r->cons.head, cons_next, ODP_MEMORDER_RLX);
 
 	/* copy in table */
 	DEQUEUE_PTRS();
-	odp_mem_barrier();
 
-	r->cons.tail = cons_next;
+	odp_atomic32_store(&r->cons.tail, cons_next, ODP_MEMORDER_RLS);
 	return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
 }
 
@@ -482,8 +484,8 @@  int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void **obj_table, unsigned n)
  */
 int odph_ring_full(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
+	uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
 	return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
 }
 
@@ -492,8 +494,8 @@  int odph_ring_full(const odph_ring_t *r)
  */
 int odph_ring_empty(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
+	uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
 	return !!(cons_tail == prod_tail);
 }
 
@@ -502,8 +504,8 @@  int odph_ring_empty(const odph_ring_t *r)
  */
 unsigned odph_ring_count(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
+	uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
 	return (prod_tail - cons_tail) & r->prod.mask;
 }
 
@@ -512,8 +514,8 @@  unsigned odph_ring_count(const odph_ring_t *r)
  */
 unsigned odph_ring_free_count(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load(&r->prod.tail, ODP_MEMORDER_RLX);
+	uint32_t cons_tail = odp_atomic32_load(&r->cons.tail, ODP_MEMORDER_RLX);
 	return (cons_tail - prod_tail - 1) & r->prod.mask;
 }
 
@@ -523,10 +525,14 @@  void odph_ring_dump(const odph_ring_t *r)
 	ODP_DBG("ring <%s>@%p\n", r->name, r);
 	ODP_DBG("  flags=%x\n", r->flags);
 	ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
-	ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
-	ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
-	ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
-	ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
+	ODP_DBG("  ct=%"PRIu32"\n", odp_atomic32_load(&r->cons.tail,
+						      ODP_MEMORDER_RLX));
+	ODP_DBG("  ch=%"PRIu32"\n", odp_atomic32_load(&r->cons.head,
+						      ODP_MEMORDER_RLX));
+	ODP_DBG("  pt=%"PRIu32"\n", odp_atomic32_load(&r->prod.tail,
+						      ODP_MEMORDER_RLX));
+	ODP_DBG("  ph=%"PRIu32"\n", odp_atomic32_load(&r->prod.head,
+						      ODP_MEMORDER_RLX));
 	ODP_DBG("  used=%u\n", odph_ring_count(r));
 	ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
 	if (r->prod.watermark == r->prod.size)
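
A condensed sketch of the acquire/release pairing the ring code now relies on;
the single-producer/single-consumer 'struct spsc' below is illustrative only and
not part of this patch:

	struct spsc {
		uint32_t slot[16];	/* payload */
		odp_atomic32_t tail;	/* published index */
	};

	static void spsc_put(struct spsc *q, uint32_t idx, uint32_t obj)
	{
		q->slot[idx & 15] = obj;	/* write payload */
		/* Publish: payload writes cannot move below this store */
		odp_atomic32_store(&q->tail, idx + 1, ODP_MEMORDER_RLS);
	}

	static uint32_t spsc_get(const struct spsc *q, uint32_t idx)
	{
		/* Observe: payload reads cannot move above this load */
		while (odp_atomic32_load(&q->tail, ODP_MEMORDER_ACQ) == idx)
			;	/* ring empty, spin */
		return q->slot[idx & 15];	/* read payload */
	}

The release store cannot become visible before the payload writes, and the
acquire load keeps the payload reads from moving up, so a consumer never reads
stale entries.
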
diff --git a/platform/linux-generic/odp_rwlock.c b/platform/linux-generic/odp_rwlock.c
index 11c8dd7..a5fae4d 100644
--- a/platform/linux-generic/odp_rwlock.c
+++ b/platform/linux-generic/odp_rwlock.c
@@ -4,58 +4,64 @@ 
  * SPDX-License-Identifier:     BSD-3-Clause
  */
 
+#include <stdbool.h>
 #include <odp_atomic.h>
 #include <odp_rwlock.h>
-
 #include <odp_spin_internal.h>
 
 void odp_rwlock_init(odp_rwlock_t *rwlock)
 {
-	rwlock->cnt = 0;
+	odp_atomic32_init(&rwlock->cnt, 0);
 }
 
 void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
 {
-	int32_t cnt;
-	int  is_locked = 0;
-
-	while (is_locked == 0) {
-		cnt = rwlock->cnt;
-		/* waiting for read lock */
-		if (cnt < 0) {
+	bool gotit;
+	uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
+	do {
+		/* Wait for any writer to release lock */
+		while ((int32_t)cnt < 0) {
 			odp_spin();
-			continue;
+			cnt = odp_atomic32_load(&rwlock->cnt,
+						ODP_MEMORDER_RLX);
 		}
-		is_locked = odp_atomic_cmpset_u32(
-					(volatile uint32_t *)&rwlock->cnt,
-					      cnt, cnt + 1);
-	}
+		/* Attempt to take another read lock */
+		gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
+						   &cnt, cnt + 1,
+						   ODP_MEMORDER_RLX);
+		/* If operation fails, 'cnt' will contain current value */
+	} while (!gotit);
 }
 
 void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
 {
-	odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
+	/* Release one read lock by subtracting 1 */
+	odp_atomic32_dec(&rwlock->cnt, ODP_MEMORDER_RLS);
 }
 
 void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
 {
-	int32_t cnt;
-	int is_locked = 0;
-
-	while (is_locked == 0) {
-		cnt = rwlock->cnt;
-		/* lock aquired, wait */
-		if (cnt != 0) {
+	bool gotit;
+	uint32_t cnt = odp_atomic32_load(&rwlock->cnt, ODP_MEMORDER_ACQ);
+	do {
+		/* Wait for all lock holders to release lock */
+		while (cnt != 0) {
+			/* Lock is busy */
 			odp_spin();
-			continue;
+			cnt = odp_atomic32_load(&rwlock->cnt,
+						ODP_MEMORDER_RLX);
 		}
-		is_locked = odp_atomic_cmpset_u32(
-					(volatile uint32_t *)&rwlock->cnt,
-					      0, -1);
-	}
+		/* Attempt to take write lock */
+		gotit = odp_atomic32_cmp_xchg_weak(&rwlock->cnt,
+						   &cnt,
+						   (uint32_t)-1,
+						   ODP_MEMORDER_RLX);
+		/* If operation fails, 'cnt' will contain current value */
+	} while (!gotit);
 }
 
 void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
 {
-	odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
+	/* Release the write lock by adding 1 */
+	odp_atomic32_inc(&rwlock->cnt, ODP_MEMORDER_RLS);
 }
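
A minimal usage sketch (the table and its accessors are hypothetical): readers
may hold the lock concurrently, a writer excludes everyone.

	static odp_rwlock_t tbl_lock;	/* odp_rwlock_init(&tbl_lock) once */
	static uint32_t tbl[64];

	static uint32_t tbl_lookup(uint32_t idx)
	{
		uint32_t val;
		odp_rwlock_read_lock(&tbl_lock);	/* shared */
		val = tbl[idx];
		odp_rwlock_read_unlock(&tbl_lock);
		return val;
	}

	static void tbl_update(uint32_t idx, uint32_t val)
	{
		odp_rwlock_write_lock(&tbl_lock);	/* exclusive */
		tbl[idx] = val;
		odp_rwlock_write_unlock(&tbl_lock);
	}
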
diff --git a/platform/linux-generic/odp_thread.c b/platform/linux-generic/odp_thread.c
index b869b27..652d317 100644
--- a/platform/linux-generic/odp_thread.c
+++ b/platform/linux-generic/odp_thread.c
@@ -11,7 +11,7 @@ 
 
 #include <odp_thread.h>
 #include <odp_internal.h>
-#include <odp_atomic.h>
+#include <odp_counter.h>
 #include <odp_config.h>
 #include <odp_debug.h>
 #include <odp_shared_memory.h>
@@ -31,7 +31,7 @@  typedef struct {
 
 typedef struct {
 	thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
-	odp_atomic_int_t num;
+	odp_counter32_t   num;
 
 } thread_globals_t;
 
@@ -58,6 +58,7 @@  int odp_thread_init_global(void)
 		return -1;
 
 	memset(thread_globals, 0, sizeof(thread_globals_t));
+	odp_counter32_init(&thread_globals->num, 0);
 	return 0;
 }
 
@@ -67,7 +68,7 @@  static int thread_id(void)
 	int id;
 	int cpu;
 
-	id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
+	id = (int)odp_counter32_read_inc(&thread_globals->num);
 
 	if (id >= ODP_CONFIG_MAX_THREADS) {
 		ODP_ERR("Too many threads\n");
@@ -77,7 +78,7 @@  static int thread_id(void)
 	cpu = sched_getcpu();
 
 	if (cpu < 0) {
-		ODP_ERR("getcpu failed\n");
+		ODP_ERR("sched_getcpu failed\n");
 		return -1;
 	}
 
diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-generic/odp_ticketlock.c
index be5b885..510aa9f 100644
--- a/platform/linux-generic/odp_ticketlock.c
+++ b/platform/linux-generic/odp_ticketlock.c
@@ -6,15 +6,15 @@ 
 
 #include <odp_ticketlock.h>
 #include <odp_atomic.h>
+#include <odp_counter.h>
 #include <odp_sync.h>
 #include <odp_spin_internal.h>
 
 
 void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
 {
-	ticketlock->next_ticket = 0;
-	ticketlock->cur_ticket  = 0;
-	odp_sync_stores();
+	odp_counter32_init(&ticketlock->next_ticket, 0);
+	odp_atomic32_init(&ticketlock->cur_ticket, 0);
 }
 
 
@@ -22,30 +22,15 @@  void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
 {
 	uint32_t ticket;
 
-	ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
+	ticket = odp_counter32_read_inc(&ticketlock->next_ticket);
 
-	while (ticket != ticketlock->cur_ticket)
+	while (ticket != odp_atomic32_load(&ticketlock->cur_ticket,
+					   ODP_MEMORDER_ACQ))
 		odp_spin();
-
-	odp_mem_barrier();
 }
 
 
 void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
 {
-	odp_sync_stores();
-
-	ticketlock->cur_ticket++;
-
-#if defined __OCTEON__
-	odp_sync_stores();
-#else
-	odp_mem_barrier();
-#endif
-}
-
-
-int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
-{
-	return ticketlock->cur_ticket != ticketlock->next_ticket;
+	odp_atomic32_inc(&ticketlock->cur_ticket, ODP_MEMORDER_RLS);
 }
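
A minimal critical-section sketch (shared_value and critical_update() are
hypothetical): the lock takes a ticket with a relaxed fetch-and-increment, spins
with load-acquire until its ticket comes up, and the unlock's release increment
publishes the critical-section writes to the next ticket holder.

	static odp_ticketlock_t lock;	/* odp_ticketlock_init(&lock) once */
	static uint64_t shared_value;	/* protected by 'lock' */

	static void critical_update(uint64_t v)
	{
		odp_ticketlock_lock(&lock);	/* FIFO: tickets served in order */
		shared_value += v;		/* critical section */
		odp_ticketlock_unlock(&lock);	/* release: update visible to
						 * the next ticket holder */
	}
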
diff --git a/platform/linux-generic/odp_timer.c b/platform/linux-generic/odp_timer.c
index 313c713..fffaa44 100644
--- a/platform/linux-generic/odp_timer.c
+++ b/platform/linux-generic/odp_timer.c
@@ -10,6 +10,7 @@ 
 #include <odp_buffer_pool_internal.h>
 #include <odp_internal.h>
 #include <odp_atomic.h>
+#include <odp_counter.h>
 #include <odp_spinlock.h>
 #include <odp_sync.h>
 #include <odp_debug.h>
@@ -32,8 +33,8 @@  typedef struct {
 
 typedef struct {
 	int               allocated;
-	volatile int      active;
-	volatile uint64_t cur_tick;
+	odp_atomic32_t    active;
+	odp_counter64_t   cur_tick;
 	timer_t           timerid;
 	odp_timer_t       timer_hdl;
 	odp_buffer_pool_t pool;
@@ -150,16 +151,16 @@  static void notify_function(union sigval sigval)
 
 	timer = sigval.sival_ptr;
 
-	if (timer->active == 0) {
+	if (odp_atomic32_load(&timer->active, ODP_MEMORDER_RLX) == 0) {
 		ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
 		return;
 	}
 
 	/* ODP_DBG("Tick\n"); */
 
-	cur_tick = timer->cur_tick++;
-
-	odp_sync_stores();
+	/* Increment and read are not atomic but we are the only writer */
+	odp_counter64_inc(&timer->cur_tick);
+	cur_tick = odp_counter64_read(&timer->cur_tick);
 
 	tick = &timer->tick[cur_tick % MAX_TICKS];
 
@@ -308,6 +309,8 @@  odp_timer_t odp_timer_create(const char *name, odp_buffer_pool_t pool,
 
 	timer_hdl = id + 1;
 
+	odp_atomic32_init(&timer->active, 0);
+	odp_counter64_init(&timer->cur_tick, 0);
 	timer->timer_hdl     = timer_hdl;
 	timer->pool          = pool;
 	timer->resolution_ns = resolution_ns;
@@ -318,8 +321,7 @@  odp_timer_t odp_timer_create(const char *name, odp_buffer_pool_t pool,
 		timer->tick[i].list = NULL;
 	}
 
-	timer->active = 1;
-	odp_sync_stores();
+	odp_atomic32_store(&timer->active, 1, ODP_MEMORDER_RLS);
 
 	timer_start(timer);
 
@@ -340,7 +342,7 @@  odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t timer_hdl, uint64_t tmo_tick,
 	id = (int)timer_hdl - 1;
 	timer = &odp_timer.timer[id];
 
-	cur_tick = timer->cur_tick;
+	cur_tick = odp_counter64_read(&timer->cur_tick);
 	if (tmo_tick <= cur_tick) {
 		ODP_DBG("timeout too close\n");
 		return ODP_TIMER_TMO_INVALID;
@@ -416,7 +418,7 @@  uint64_t odp_timer_current_tick(odp_timer_t timer_hdl)
 	uint32_t id;
 
 	id = timer_hdl - 1;
-	return odp_timer.timer[id].cur_tick;
+	return odp_counter64_read(&odp_timer.timer[id].cur_tick);
 }
 
 odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
diff --git a/test/api_test/Makefile.am b/test/api_test/Makefile.am
index 5104454..478aa6c 100644
--- a/test/api_test/Makefile.am
+++ b/test/api_test/Makefile.am
@@ -1,12 +1,12 @@ 
 include $(top_srcdir)/test/Makefile.inc
 
-bin_PROGRAMS = odp_atomic odp_shm odp_ring odp_timer_ping
-odp_atomic_LDFLAGS = $(AM_LDFLAGS) -static
+bin_PROGRAMS = odp_counter odp_shm odp_ring odp_timer_ping
+odp_counter_LDFLAGS = $(AM_LDFLAGS) -static
 odp_shm_LDFLAGS = $(AM_LDFLAGS) -static
 odp_ring_LDFLAGS = $(AM_LDFLAGS) -static
 odp_timer_ping_LDFLAGS = $(AM_LDFLAGS) -static
 
-dist_odp_atomic_SOURCES = odp_atomic_test.c odp_common.c
+dist_odp_counter_SOURCES = odp_counter_test.c odp_common.c
 dist_odp_shm_SOURCES = odp_shm_test.c odp_common.c
 dist_odp_ring_SOURCES = odp_ring_test.c odp_common.c
 dist_odp_timer_ping_SOURCES = odp_timer_ping.c odp_common.c
diff --git a/test/api_test/odp_atomic_test.c b/test/api_test/odp_atomic_test.c
deleted file mode 100644
index 9019d4f..0000000
--- a/test/api_test/odp_atomic_test.c
+++ /dev/null
@@ -1,362 +0,0 @@ 
-/* Copyright (c) 2013, Linaro Limited
- * All rights reserved.
- *
- * SPDX-License-Identifier:     BSD-3-Clause
- */
-
-#include <string.h>
-#include <sys/time.h>
-#include <odp_debug.h>
-#include <odp_common.h>
-#include <odp_atomic_test.h>
-
-static odp_atomic_int_t a32;
-static odp_atomic_u32_t a32u;
-static odp_atomic_u64_t a64u;
-
-static odp_atomic_int_t numthrds;
-
-static const char * const test_name[] = {
-	"dummy",
-	"test atomic basic ops add/sub/inc/dec",
-	"test atomic inc/dec of signed word",
-	"test atomic add/sub of signed word",
-	"test atomic inc/dec of unsigned word",
-	"test atomic add/sub of unsigned word",
-	"test atomic inc/dec of unsigned double word",
-	"test atomic add/sub of unsigned double word"
-};
-
-static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
-
-static void usage(void)
-{
-	printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
-	       "\t<testcase> is\n"
-	       "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
-	       "\t\t2 - Test inc dec of signed word\n"
-	       "\t\t3 - Test add sub of signed word\n"
-	       "\t\t4 - Test inc dec of unsigned word\n"
-	       "\t\t5 - Test add sub of unsigned word\n"
-	       "\t\t6 - Test inc dec of double word\n"
-	       "\t\t7 - Test add sub of double word\n"
-	       "\t<num of pthread> is optional\n"
-	       "\t\t<1 - 31> - no of pthreads to start\n"
-	       "\t\tif user doesn't specify this option, then\n"
-	       "\t\tno of pthreads created is equivalent to no of cores\n"
-	       "\t\tavailable in the system\n"
-	       "\tExample usage:\n"
-	       "\t\t./odp_atomic -t 2\n"
-	       "\t\t./odp_atomic -t 3 -n 12\n");
-}
-
-void test_atomic_inc_32(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_inc_int(&a32);
-}
-
-void test_atomic_inc_u32(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_inc_u32(&a32u);
-}
-
-void test_atomic_inc_64(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_inc_u64(&a64u);
-}
-
-void test_atomic_dec_32(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_dec_int(&a32);
-}
-
-void test_atomic_dec_u32(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_dec_u32(&a32u);
-}
-
-void test_atomic_dec_64(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_dec_u64(&a64u);
-}
-
-void test_atomic_add_32(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
-}
-
-void test_atomic_add_u32(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
-}
-
-void test_atomic_add_64(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
-}
-
-void test_atomic_sub_32(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
-}
-
-void test_atomic_sub_u32(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
-}
-
-void test_atomic_sub_64(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
-}
-
-void test_atomic_inc_dec_32(void)
-{
-	test_atomic_inc_32();
-	test_atomic_dec_32();
-}
-
-void test_atomic_add_sub_32(void)
-{
-	test_atomic_add_32();
-	test_atomic_sub_32();
-}
-
-void test_atomic_inc_dec_u32(void)
-{
-	test_atomic_inc_u32();
-	test_atomic_dec_u32();
-}
-
-void test_atomic_add_sub_u32(void)
-{
-	test_atomic_add_u32();
-	test_atomic_sub_u32();
-}
-
-void test_atomic_inc_dec_64(void)
-{
-	test_atomic_inc_64();
-	test_atomic_dec_64();
-}
-
-void test_atomic_add_sub_64(void)
-{
-	test_atomic_add_64();
-	test_atomic_sub_64();
-}
-
-/**
- * Test basic atomic operation like
- * add/sub/increment/decrement operation.
- */
-void test_atomic_basic(void)
-{
-	test_atomic_inc_32();
-	test_atomic_dec_32();
-	test_atomic_add_32();
-	test_atomic_sub_32();
-
-	test_atomic_inc_u32();
-	test_atomic_dec_u32();
-	test_atomic_add_u32();
-	test_atomic_sub_u32();
-
-	test_atomic_inc_64();
-	test_atomic_dec_64();
-	test_atomic_add_64();
-	test_atomic_sub_64();
-}
-
-void test_atomic_init(void)
-{
-	odp_atomic_init_int(&a32);
-	odp_atomic_init_u32(&a32u);
-	odp_atomic_init_u64(&a64u);
-}
-
-void test_atomic_store(void)
-{
-	odp_atomic_store_int(&a32, S32_INIT_VAL);
-	odp_atomic_store_u32(&a32u, U32_INIT_VAL);
-	odp_atomic_store_u64(&a64u, U64_INIT_VAL);
-}
-
-int test_atomic_validate(void)
-{
-	if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
-		ODP_ERR("Atomic signed 32 usual functions failed\n");
-		return -1;
-	}
-
-	if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
-		ODP_ERR("Atomic u32 usual functions failed\n");
-		return -1;
-	}
-
-	if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
-		ODP_ERR("Atomic u64 usual functions failed\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-static void *run_thread(void *arg)
-{
-	pthrd_arg *parg = (pthrd_arg *)arg;
-	int thr;
-
-	thr = odp_thread_id();
-
-	ODP_DBG("Thread %i starts\n", thr);
-
-	odp_atomic_inc_int(&numthrds);
-
-	/* Wait here until all pthreads are created */
-	while (*(volatile int *)&numthrds < parg->numthrds)
-		;
-
-	gettimeofday(&tv0[thr], NULL);
-
-	switch (parg->testcase) {
-	case TEST_MIX:
-		test_atomic_basic();
-		break;
-	case TEST_INC_DEC_S32:
-		test_atomic_inc_dec_32();
-		break;
-	case TEST_ADD_SUB_S32:
-		test_atomic_add_sub_32();
-		break;
-	case TEST_INC_DEC_U32:
-		test_atomic_inc_dec_u32();
-		break;
-	case TEST_ADD_SUB_U32:
-		test_atomic_add_sub_u32();
-		break;
-	case TEST_INC_DEC_64:
-		test_atomic_inc_dec_64();
-		break;
-	case TEST_ADD_SUB_64:
-		test_atomic_add_sub_64();
-		break;
-	}
-	gettimeofday(&tv1[thr], NULL);
-	fflush(NULL);
-
-	printf("Time taken in thread %02d to complete op is %lld usec\n", thr,
-	       (tv1[thr].tv_sec - tv0[thr].tv_sec) * 1000000ULL +
-	       (tv1[thr].tv_usec - tv0[thr].tv_usec));
-
-	return parg;
-}
-
-int main(int argc, char *argv[])
-{
-	pthrd_arg thrdarg;
-	int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
-	char c;
-	int result;
-
-	if (argc == 1 || argc % 2 == 0) {
-		usage();
-		goto err_exit;
-	}
-	if (odp_test_global_init() != 0)
-		goto err_exit;
-	odp_print_system_info();
-
-	while (cnt != 0) {
-		sscanf(argv[++i], "-%c", &c);
-		switch (c) {
-		case 't':
-			sscanf(argv[++i], "%d", &test_type);
-			break;
-		case 'n':
-			sscanf(argv[++i], "%d", &pthrdnum);
-			break;
-		default:
-			ODP_ERR("Invalid option %c\n", c);
-			usage();
-			goto err_exit;
-		}
-		if (test_type < TEST_MIX || test_type > TEST_MAX ||
-		    pthrdnum > odp_sys_core_count()) {
-			usage();
-			goto err_exit;
-		}
-		cnt -= 2;
-	}
-	if (pthrdnum == 0)
-		pthrdnum = odp_sys_core_count();
-
-	odp_atomic_init_int(&numthrds);
-	test_atomic_init();
-	test_atomic_store();
-
-	memset(&thrdarg, 0, sizeof(pthrd_arg));
-	thrdarg.testcase = test_type;
-	thrdarg.numthrds = pthrdnum;
-
-	if ((test_type > 0) && (test_type < TEST_MAX)) {
-		printf("%s\n", test_name[test_type]);
-	} else {
-		ODP_ERR("Invalid test case [%d]\n", test_type);
-		usage();
-		goto err_exit;
-	}
-	odp_test_thread_create(run_thread, &thrdarg);
-
-	odp_test_thread_exit(&thrdarg);
-
-	result = test_atomic_validate();
-
-	if (result == 0) {
-		printf("%s_%d_%d Result:pass\n",
-		       test_name[test_type], test_type, pthrdnum);
-	} else {
-		printf("%s_%d_%d Result:fail\n",
-		       test_name[test_type], test_type, pthrdnum);
-	}
-	return 0;
-
-err_exit:
-	return -1;
-}
diff --git a/test/api_test/odp_atomic_test.h b/test/api_test/odp_atomic_test.h
deleted file mode 100644
index 7814da5..0000000
--- a/test/api_test/odp_atomic_test.h
+++ /dev/null
@@ -1,60 +0,0 @@ 
-/* Copyright (c) 2013, Linaro Limited
- * All rights reserved.
- *
- * SPDX-License-Identifier:     BSD-3-Clause
- */
-
-#ifndef ODP_ATOMIC_TEST_H_
-#define ODP_ATOMIC_TEST_H_
-
-#include <odp.h>
-#include <odph_linux.h>
-
-/**
- * add_sub_cnt could be any valid value
- * so to excercise explicit atomic_add/sub
- * ops. For now using 5..
- */
-#define ADD_SUB_CNT	5
-
-#define	CNT 500000
-#define	S32_INIT_VAL	(1UL << 10)
-#define	U32_INIT_VAL	(1UL << 10)
-#define	U64_INIT_VAL	(1ULL << 33)
-
-typedef enum {
-	TEST_MIX = 1, /* Must be first test case num */
-	TEST_INC_DEC_S32,
-	TEST_ADD_SUB_S32,
-	TEST_INC_DEC_U32,
-	TEST_ADD_SUB_U32,
-	TEST_INC_DEC_64,
-	TEST_ADD_SUB_64,
-	TEST_MAX,
-} odp_test_atomic_t;
-
-
-void test_atomic_inc_dec_32(void);
-void test_atomic_add_sub_32(void);
-void test_atomic_inc_dec_u32(void);
-void test_atomic_add_sub_u32(void);
-void test_atomic_inc_dec_64(void);
-void test_atomic_add_sub_64(void);
-void test_atomic_inc_32(void);
-void test_atomic_dec_32(void);
-void test_atomic_add_32(void);
-void test_atomic_sub_32(void);
-void test_atomic_inc_u32(void);
-void test_atomic_dec_u32(void);
-void test_atomic_add_u32(void);
-void test_atomic_sub_u32(void);
-void test_atomic_inc_64(void);
-void test_atomic_dec_64(void);
-void test_atomic_add_64(void);
-void test_atomic_sub_64(void);
-void test_atomic_init(void);
-void test_atomic_basic(void);
-void test_atomic_store(void);
-int test_atomic_validate(void);
-
-#endif /* ODP_ATOMIC_TEST_H_ */
diff --git a/test/api_test/odp_common.c b/test/api_test/odp_common.c
index ed1fc97..198fe8f 100644
--- a/test/api_test/odp_common.c
+++ b/test/api_test/odp_common.c
@@ -14,7 +14,6 @@ 
 #include <odp.h>
 #include <odph_linux.h>
 #include <odp_common.h>
-#include <odp_atomic_test.h>
 #include <odp_shm_test.h>
 
 
diff --git a/test/api_test/odp_counter_test.c b/test/api_test/odp_counter_test.c
new file mode 100644
index 0000000..c72328e
--- /dev/null
+++ b/test/api_test/odp_counter_test.c
@@ -0,0 +1,361 @@ 
+/* Copyright (c) 2013, Linaro Limited
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier:     BSD-3-Clause
+ */
+
+#include <string.h>
+#include <sys/time.h>
+#include <inttypes.h>
+#include <odp.h>
+#include <odp_debug.h>
+#include <odp_common.h>
+#include <odph_linux.h>
+
+/**
+ * ADD_SUB_CNT can be any valid value;
+ * it is used to exercise the explicit
+ * counter add/sub operations. 5 is used for now.
+ */
+#define ADD_SUB_CNT	5
+
+#define	CNT 500000
+#define	U32_INIT_VAL	(1UL << 10)
+#define	U64_INIT_VAL	(1ULL << 33)
+
+typedef enum {
+	TEST_MIX = 1, /* Must be first test case num */
+	TEST_INC_DEC_U32 = 2,
+	TEST_ADD_SUB_U32 = 3,
+	TEST_INC_DEC_64 = 4,
+	TEST_ADD_SUB_64 = 5,
+	TEST_MAX,
+} odp_test_counter_t;
+
+
+static uint32_t test_counter_inc_dec_u32(void);
+static uint32_t test_counter_add_sub_u32(void);
+static uint32_t test_counter_inc_dec_64(void);
+static uint32_t test_counter_add_sub_64(void);
+static uint32_t test_counter_inc_u32(void);
+static uint32_t test_counter_dec_u32(void);
+static uint32_t test_counter_add_u32(void);
+static uint32_t test_counter_sub_u32(void);
+static uint32_t test_counter_inc_64(void);
+static uint32_t test_counter_dec_64(void);
+static uint32_t test_counter_add_64(void);
+static uint32_t test_counter_sub_64(void);
+static void test_counter_init(void);
+static uint32_t test_counter_basic(void);
+static void test_counter_write(void);
+static int test_counter_validate(void);
+
+static odp_counter32_t a32u;
+static odp_counter64_t a64u;
+
+static odp_barrier_t barrier;
+
+static const char * const test_name[] = {
+	"dummy",
+	"test atomic counter basic ops add/sub/inc/dec",
+	"test atomic inc/dec of 32-bit counter",
+	"test atomic add/sub of 32-bit counter",
+	"test atomic inc/dec of 64-bit counter",
+	"test atomic add/sub of 64-bit counter"
+};
+
+static uint64_t accops[MAX_WORKERS];
+
+static void usage(void)
+{
+	printf("\n./odp_counter -t <testcase> -n <num of threads>\n\n"
+	       "\t<testcase> is\n"
+	       "\t\t1 - Test mix (inc/dec/add/sub on 32- and 64-bit counters)\n"
+	       "\t\t2 - Test inc/dec of 32-bit counter\n"
+	       "\t\t3 - Test add/sub of 32-bit counter\n"
+	       "\t\t4 - Test inc/dec of 64-bit counter\n"
+	       "\t\t5 - Test add/sub of 64-bit counter\n"
+	       "\t<num of threads> is optional\n"
+	       "\t\t<1 - 31> - number of threads to start\n"
+	       "\t\tif not specified, the number of threads created\n"
+	       "\t\tequals the number of cores in the system\n"
+	       "\tExample usage:\n"
+	       "\t\t./odp_counter -t 2\n"
+	       "\t\t./odp_counter -t 3 -n 12\n");
+}
+
+static uint32_t test_counter_inc_u32(void)
+{
+	int i;
+
+	for (i = 0; i < CNT; i++)
+		odp_counter32_inc(&a32u);
+	return i;
+}
+
+static uint32_t test_counter_inc_64(void)
+{
+	int i;
+
+	for (i = 0; i < CNT; i++)
+		odp_counter64_inc(&a64u);
+	return i;
+}
+
+static uint32_t test_counter_dec_u32(void)
+{
+	int i;
+
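+	/* Decrement is expressed as adding (uint32_t)-1 and relying on unsigned
+	 * wrap-around, presumably because the counter API provides add but no
+	 * dedicated dec/sub operation. */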
+	for (i = 0; i < CNT; i++)
+		odp_counter32_add(&a32u, (uint32_t)-1);
+	return i;
+}
+
+static uint32_t test_counter_dec_64(void)
+{
+	int i;
+
+	for (i = 0; i < CNT; i++)
+		odp_counter64_add(&a64u, (uint64_t)-1);
+	return i;
+}
+
+static uint32_t test_counter_add_u32(void)
+{
+	int i;
+
+	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
+		odp_counter32_add(&a32u, ADD_SUB_CNT);
+	return i;
+}
+
+static uint32_t test_counter_add_64(void)
+{
+	int i;
+
+	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
+		odp_counter64_add(&a64u, ADD_SUB_CNT);
+	return i;
+}
+
+static uint32_t test_counter_sub_u32(void)
+{
+	int i;
+
+	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
+		odp_counter32_add(&a32u, -ADD_SUB_CNT);
+	return i;
+}
+
+static uint32_t test_counter_sub_64(void)
+{
+	int i;
+
+	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
+		odp_counter64_add(&a64u, -ADD_SUB_CNT);
+	return i;
+}
+
+static uint32_t test_counter_inc_dec_u32(void)
+{
+	uint32_t nops = 0;
+	nops += test_counter_inc_u32();
+	nops += test_counter_dec_u32();
+	return nops;
+}
+
+static uint32_t test_counter_add_sub_u32(void)
+{
+	uint32_t nops = 0;
+	nops += test_counter_add_u32();
+	nops += test_counter_sub_u32();
+	return nops;
+}
+
+static uint32_t test_counter_inc_dec_64(void)
+{
+	uint32_t nops = 0;
+	nops += test_counter_inc_64();
+	nops += test_counter_dec_64();
+	return nops;
+}
+
+static uint32_t test_counter_add_sub_64(void)
+{
+	uint32_t nops = 0;
+	nops += test_counter_add_64();
+	nops += test_counter_sub_64();
+	return nops;
+}
+
+/**
+ * Test the basic counter operations:
+ * add/sub/increment/decrement.
+ */
+static uint32_t test_counter_basic(void)
+{
+	uint32_t nops = 0;
+	nops += test_counter_inc_u32();
+	nops += test_counter_dec_u32();
+	nops += test_counter_add_u32();
+	nops += test_counter_sub_u32();
+
+	nops += test_counter_inc_64();
+	nops += test_counter_dec_64();
+	nops += test_counter_add_64();
+	nops += test_counter_sub_64();
+
+	return nops;
+}
+
+static void test_counter_init(void)
+{
+	odp_counter32_init(&a32u, 0);
+	odp_counter64_init(&a64u, 0);
+}
+
+static void test_counter_write(void)
+{
+	odp_counter32_write(&a32u, U32_INIT_VAL);
+	odp_counter64_write(&a64u, U64_INIT_VAL);
+}
+
+static int test_counter_validate(void)
+{
+	if (odp_counter32_read(&a32u) != U32_INIT_VAL) {
+		ODP_ERR("32-bit counter operations failed\n");
+		return -1;
+	}
+
+	if (odp_counter64_read(&a64u) != U64_INIT_VAL) {
+		ODP_ERR("64-bit counter operations failed\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static void *run_thread(void *arg)
+{
+	pthrd_arg *parg = (pthrd_arg *)arg;
+	int thr;
+	uint64_t nops = 0;
+	struct timeval tv0, tv1;
+
+	thr = odp_thread_id();
+
+	ODP_DBG("Thread %i starts\n", thr);
+
+	/* Wait here until all threads have arrived */
+	/* Use multiple barriers to verify that the barrier handles wrap-around
+	 * and has no race conditions that could be exposed by back-to-back
+	 * invocations */
+	odp_barrier_sync(&barrier);
+	odp_barrier_sync(&barrier);
+	odp_barrier_sync(&barrier);
+	odp_barrier_sync(&barrier);
+
+	gettimeofday(&tv0, NULL);
+
+	switch (parg->testcase) {
+	case TEST_MIX:
+		nops += test_counter_basic();
+		break;
+	case TEST_INC_DEC_U32:
+		nops += test_counter_inc_dec_u32();
+		break;
+	case TEST_ADD_SUB_U32:
+		nops += test_counter_add_sub_u32();
+		break;
+	case TEST_INC_DEC_64:
+		nops += test_counter_inc_dec_64();
+		break;
+	case TEST_ADD_SUB_64:
+		nops += test_counter_add_sub_64();
+		break;
+	}
+	gettimeofday(&tv1, NULL);
+	accops[thr] = nops;
+	fflush(NULL);
+
+	uint64_t usecs = (tv1.tv_sec - tv0.tv_sec) * 1000000ULL +
+			 tv1.tv_usec - tv0.tv_usec;
+	printf("Time taken in thread %02d to complete %"PRIu64" ops is "
+	       "%"PRIu64" usec, %"PRIu64" ns/op\n",
+	       thr, nops, usecs, 1000 * usecs / nops);
+
+	return parg;
+}
+
+int main(int argc, char *argv[])
+{
+	pthrd_arg thrdarg;
+	int test_type = 0, pthrdnum = 0, i = 0, cnt = argc - 1;
+	char c;
+	int result;
+
+	if (argc == 1 || argc % 2 == 0) {
+		usage();
+		goto err_exit;
+	}
+	if (odp_test_global_init() != 0)
+		goto err_exit;
+	odp_print_system_info();
+
+	while (cnt != 0) {
+		sscanf(argv[++i], "-%c", &c);
+		switch (c) {
+		case 't':
+			sscanf(argv[++i], "%d", &test_type);
+			break;
+		case 'n':
+			sscanf(argv[++i], "%d", &pthrdnum);
+			break;
+		default:
+			ODP_ERR("Invalid option %c\n", c);
+			usage();
+			goto err_exit;
+		}
+		if (test_type < TEST_MIX || test_type > TEST_MAX ||
+		    pthrdnum > odp_sys_core_count()) {
+			usage();
+			goto err_exit;
+		}
+		cnt -= 2;
+	}
+	if (pthrdnum == 0)
+		pthrdnum = odp_sys_core_count();
+
+	test_counter_init();
+	test_counter_write();
+
+	memset(&thrdarg, 0, sizeof(pthrd_arg));
+	thrdarg.testcase = test_type;
+	thrdarg.numthrds = pthrdnum;
+
+	if ((test_type > 0) && (test_type < TEST_MAX)) {
+		printf("%s\n", test_name[test_type]);
+	} else {
+		ODP_ERR("Invalid test case [%d]\n", test_type);
+		usage();
+		goto err_exit;
+	}
+	odp_barrier_init(&barrier, pthrdnum);
+	odp_test_thread_create(run_thread, &thrdarg);
+
+	odp_test_thread_exit(&thrdarg);
+
+	result = test_counter_validate();
+
+	if (result == 0) {
+		printf("%s_%d_%d Result:pass\n",
+		       test_name[test_type], test_type, pthrdnum);
+	} else {
+		printf("%s_%d_%d Result:fail\n",
+		       test_name[test_type], test_type, pthrdnum);
+	}
+	return 0;
+
+err_exit:
+	return -1;
+}