diff mbox

[PATCHv2,5/5] linux-generic: odp_ticketlock.[ch]: use odp_atomic_internal.h

Message ID 1417101181-13500-6-git-send-email-ola.liljedahl@linaro.org
State Accepted
Commit 1c6711ed1d2c58c131fa10a858ac498c8f591553
Headers show

Commit Message

Ola Liljedahl Nov. 27, 2014, 3:13 p.m. UTC
Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
---
(This document/code contribution attached is provided under the terms of
agreement LES-LTM-21309)

Use definitions from odp_atomic_internal.h.

 platform/linux-generic/include/api/odp_ticketlock.h |  2 +-
 platform/linux-generic/odp_ticketlock.c             | 14 ++++++--------
 2 files changed, 7 insertions(+), 9 deletions(-)

Comments

Bill Fischofer Nov. 28, 2014, 12:53 p.m. UTC | #1
On Thu, Nov 27, 2014 at 10:13 AM, Ola Liljedahl <ola.liljedahl@linaro.org>
wrote:

> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>

Reviewed-by: Bill Fischofer <bill.fischofer@linaro.org>


> ---
> (This document/code contribution attached is provided under the terms of
> agreement LES-LTM-21309)
>
> Use definitions from odp_atomic_internal.h.
>
>  platform/linux-generic/include/api/odp_ticketlock.h |  2 +-
>  platform/linux-generic/odp_ticketlock.c             | 14 ++++++--------
>  2 files changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
> b/platform/linux-generic/include/api/odp_ticketlock.h
> index a3b3ab5..619816e 100644
> --- a/platform/linux-generic/include/api/odp_ticketlock.h
> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
> @@ -31,7 +31,7 @@ extern "C" {
>   */
>  typedef struct odp_ticketlock_t {
>         odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
> -       volatile uint32_t cur_ticket;  /**< @private Current ticket */
> +       odp_atomic_u32_t  cur_ticket;  /**< @private Current ticket */
>  } odp_ticketlock_t;
>
>
> diff --git a/platform/linux-generic/odp_ticketlock.c
> b/platform/linux-generic/odp_ticketlock.c
> index e60f526..6c5e74e 100644
> --- a/platform/linux-generic/odp_ticketlock.c
> +++ b/platform/linux-generic/odp_ticketlock.c
> @@ -6,6 +6,7 @@
>
>  #include <odp_ticketlock.h>
>  #include <odp_atomic.h>
> +#include <odp_atomic_internal.h>
>  #include <odp_sync.h>
>  #include <odp_spin_internal.h>
>
> @@ -13,7 +14,7 @@
>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>  {
>         odp_atomic_init_u32(&ticketlock->next_ticket, 0);
> -       ticketlock->cur_ticket  = 0;
> +       odp_atomic_init_u32(&ticketlock->cur_ticket, 0);
>  }
>
>
> @@ -23,18 +24,15 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
>
>         ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
>
> -       while (ticket != ticketlock->cur_ticket)
> +       while (ticket != _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
> +                                                _ODP_MEMMODEL_ACQ))
>                 odp_spin();
> -
> -       __atomic_thread_fence(__ATOMIC_ACQUIRE);
>  }
>
>
>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>  {
> -       __atomic_thread_fence(__ATOMIC_RELEASE);
> -
> -       ticketlock->cur_ticket++;
> +       _odp_atomic_u32_add_mm(&ticketlock->cur_ticket, 1,
> _ODP_MEMMODEL_RLS);
>
>  #if defined __OCTEON__
>         odp_sync_stores(); /* SYNCW to flush write buffer */
> @@ -44,6 +42,6 @@ void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>
>  int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
>  {
> -       return ticketlock->cur_ticket !=
> +       return odp_atomic_load_u32(&ticketlock->cur_ticket) !=
>                 odp_atomic_load_u32(&ticketlock->next_ticket);
>  }
> --
> 1.9.1
>
>
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/lng-odp
>
Ola Liljedahl Dec. 1, 2014, 12:55 p.m. UTC | #2
On 1 December 2014 at 13:30, Savolainen, Petri (NSN - FI/Espoo)
<petri.savolainen@nsn.com> wrote:
> Hi,
>
> This change degrades ticketlock performance. The cur_ticket variable does not need atomic add functionality, since it's written only from within the lock. Only correct barrier enforcement is needed.
cur_ticket needs atomic *store* but yes it does not need the whole RMW
add operation to be atomic. This was a bit excessive of me.

A better way to implement this would be
uint32_t cur = _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
_ODP_MEMMODEL_RLX);
_odp_atomic_u32_store_mm(&ticketlock->cur_ticket, cur + 1, _ODP_MEMMODEL_RLS);

0000000000000030 <odp_ticketlock_unlock>:
  30:   8b 47 04                mov    0x4(%rdi),%eax
  33:   83 c0 01                add    $0x1,%eax
  36:   89 47 04                mov    %eax,0x4(%rdi)
  39:   c3                          retq

I was saving performance optimizations for later. On ARM there is a
lot you can do with WFE/SEV (wait for event/signal event) which will
avoid unnecessary spinning on atomic variables and avoiding the
waiters interfering (loading a flag -> want shared ownership of cache
line) with the lock owner (which likely want exclusive ownership of
cache line in order to update it, this will often stall if other
threads keep loading the lock variable). Moving the lock variables to
another cache line avoids this problem but introduces another
potential cache miss (when accessing the protected data) so is not
necessarily better in all cases.

-- Ola

>
> Here's odp_example cycle counts from the tip of repo (single thread, but multi-thread figures follow the same pattern).
>
> Thread 1 starts on core 1
>   [1] alloc_sng alloc+free   61 cycles, 18 ns
>   [1] alloc_multi alloc+free 38 cycles, 11 ns
>   [1] poll_queue enq+deq     139 cycles, 41 ns
>   [1] sched_one_s_lo enq+deq 626 cycles, 184 ns
>   [1] sched_____s_lo enq+deq 625 cycles, 184 ns
>   [1] sched_one_m_lo enq+deq 620 cycles, 182 ns
>   [1] sched_____m_lo enq+deq 619 cycles, 182 ns
>   [1] sched_multi_lo enq+deq 165 cycles, 48 ns
>   [1] sched_one_s_hi enq+deq 331 cycles, 97 ns
>   [1] sched_____s_hi enq+deq 331 cycles, 97 ns
>   [1] sched_one_m_hi enq+deq 325 cycles, 96 ns
>   [1] sched_____m_hi enq+deq 325 cycles, 95 ns
>   [1] sched_multi_hi enq+deq 90 cycles, 26 ns
> Thread 1 exits
> ODP example complete
>
>
> And here's the same after removing this patch from the tip.
>
>
> Thread 1 starts on core 1
>   [1] alloc_sng alloc+free   61 cycles, 18 ns
>   [1] alloc_multi alloc+free 35 cycles, 10 ns
>   [1] poll_queue enq+deq     86 cycles, 25 ns
>   [1] sched_one_s_lo enq+deq 404 cycles, 119 ns
>   [1] sched_____s_lo enq+deq 404 cycles, 119 ns
>   [1] sched_one_m_lo enq+deq 403 cycles, 119 ns
>   [1] sched_____m_lo enq+deq 405 cycles, 119 ns
>   [1] sched_multi_lo enq+deq 112 cycles, 33 ns
>   [1] sched_one_s_hi enq+deq 216 cycles, 63 ns
>   [1] sched_____s_hi enq+deq 218 cycles, 64 ns
>   [1] sched_one_m_hi enq+deq 211 cycles, 62 ns
>   [1] sched_____m_hi enq+deq 212 cycles, 62 ns
>   [1] sched_multi_hi enq+deq 67 cycles, 19 ns
> Thread 1 exits
> ODP example complete
>
>
> Both ways are functionally correct, but atomic add functionality is not needed here and degrades  performance at least in my x86 system.
>
>
> -Petri
>
>
>
>
>> -----Original Message-----
>> From: lng-odp-bounces@lists.linaro.org [mailto:lng-odp-
>> bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl
>> Sent: Thursday, November 27, 2014 5:13 PM
>> To: lng-odp@lists.linaro.org
>> Subject: [lng-odp] [PATCHv2 5/5] linux-generic: odp_ticketlock.[ch]: use
>> odp_atomic_internal.h
>>
>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>> ---
>> (This document/code contribution attached is provided under the terms of
>> agreement LES-LTM-21309)
>>
>> Use definitions from odp_atomic_internal.h.
>>
>>  platform/linux-generic/include/api/odp_ticketlock.h |  2 +-
>>  platform/linux-generic/odp_ticketlock.c             | 14 ++++++--------
>>  2 files changed, 7 insertions(+), 9 deletions(-)
>>
>> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
>> b/platform/linux-generic/include/api/odp_ticketlock.h
>> index a3b3ab5..619816e 100644
>> --- a/platform/linux-generic/include/api/odp_ticketlock.h
>> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
>> @@ -31,7 +31,7 @@ extern "C" {
>>   */
>>  typedef struct odp_ticketlock_t {
>>       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
>> -     volatile uint32_t cur_ticket;  /**< @private Current ticket */
>> +     odp_atomic_u32_t  cur_ticket;  /**< @private Current ticket */
>>  } odp_ticketlock_t;
>>
>>
>> diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-
>> generic/odp_ticketlock.c
>> index e60f526..6c5e74e 100644
>> --- a/platform/linux-generic/odp_ticketlock.c
>> +++ b/platform/linux-generic/odp_ticketlock.c
>> @@ -6,6 +6,7 @@
>>
>>  #include <odp_ticketlock.h>
>>  #include <odp_atomic.h>
>> +#include <odp_atomic_internal.h>
>>  #include <odp_sync.h>
>>  #include <odp_spin_internal.h>
>>
>> @@ -13,7 +14,7 @@
>>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>>  {
>>       odp_atomic_init_u32(&ticketlock->next_ticket, 0);
>> -     ticketlock->cur_ticket  = 0;
>> +     odp_atomic_init_u32(&ticketlock->cur_ticket, 0);
>>  }
>>
>>
>> @@ -23,18 +24,15 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
>>
>>       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
>>
>> -     while (ticket != ticketlock->cur_ticket)
>> +     while (ticket != _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
>> +                                              _ODP_MEMMODEL_ACQ))
>>               odp_spin();
>> -
>> -     __atomic_thread_fence(__ATOMIC_ACQUIRE);
>>  }
>>
>>
>>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>>  {
>> -     __atomic_thread_fence(__ATOMIC_RELEASE);
>> -
>> -     ticketlock->cur_ticket++;
>> +     _odp_atomic_u32_add_mm(&ticketlock->cur_ticket, 1,
>> _ODP_MEMMODEL_RLS);
>>
>>  #if defined __OCTEON__
>>       odp_sync_stores(); /* SYNCW to flush write buffer */
>> @@ -44,6 +42,6 @@ void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>>
>>  int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
>>  {
>> -     return ticketlock->cur_ticket !=
>> +     return odp_atomic_load_u32(&ticketlock->cur_ticket) !=
>>               odp_atomic_load_u32(&ticketlock->next_ticket);
>>  }
>> --
>> 1.9.1
>>
>>
>> _______________________________________________
>> lng-odp mailing list
>> lng-odp@lists.linaro.org
>> http://lists.linaro.org/mailman/listinfo/lng-odp
Bill Fischofer Dec. 1, 2014, 1:17 p.m. UTC | #3
Isn't that a question of implementation rather than API definition?  The
APIs are useful.  It's the responsibility for each implementation to
realize those APIs optimally for its platform.  The alternative is to push
that responsibility onto the application, which defeats the portability
goals of ODP.

On Mon, Dec 1, 2014 at 8:14 AM, Savolainen, Petri (NSN - FI/Espoo) <
petri.savolainen@nsn.com> wrote:

> Hi,
>
> This is why lock (and lock free algorithm) implementation abstraction has
> low value. It's hard to find a single abstraction that is optimal for all
> HW architectures - a few C lines arranged differently may cause major
> performance impact. E.g. the kernel ticketlock for x86 packs both those
> variables into the same word so that they can load both with single xadd
> command ... how to abstract that.
>
> -Petri
>
>
> > -----Original Message-----
> > From: ext Ola Liljedahl [mailto:ola.liljedahl@linaro.org]
> > Sent: Monday, December 01, 2014 2:55 PM
> > To: Savolainen, Petri (NSN - FI/Espoo)
> > Cc: lng-odp@lists.linaro.org
> > Subject: Re: [lng-odp] [PATCHv2 5/5] linux-generic: odp_ticketlock.[ch]:
> > use odp_atomic_internal.h
> >
> > On 1 December 2014 at 13:30, Savolainen, Petri (NSN - FI/Espoo)
> > <petri.savolainen@nsn.com> wrote:
> > > Hi,
> > >
> > > This change degrades ticketlock performance. The cur_ticket variable
> > does not need atomic add functionality, since it's written only from
> > within the lock. Only correct barrier enforcement is needed.
> > cur_ticket needs atomic *store* but yes it does not need the whole RMW
> > add operation to be atomic. This was a bit excessive of me.
> >
> > A better way to implemented this would be
> > uint32_t cur = _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
> > _ODP_MEMMODEL_RLX);
> > _odp_atomic_u32_store_mm(&ticketlock->cur_ticket, cur + 1,
> > _ODP_MEMMODEL_RLS);
> >
> > 0000000000000030 <odp_ticketlock_unlock>:
> >   30:   8b 47 04                mov    0x4(%rdi),%eax
> >   33:   83 c0 01                add    $0x1,%eax
> >   36:   89 47 04                mov    %eax,0x4(%rdi)
> >   39:   c3                          retq
> >
> > I was saving performance optimizations for later. On ARM there is a
> > lot you can do with WFE/SEV (wait for event/signal event) which will
> > avoid unnecessary spinning on atomic variables and avoiding the
> > waiters interfering (loading a flag -> want shared ownership of cache
> > line) with the lock owner (which likely want exclusive ownership of
> > cache line in order to update it, this will often stall if other
> > threads keep loading the lock variable). Moving the lock variables to
> > another cache line avoids this problem but introduces another
> > potential cache miss (when accessing the protected data) so is not
> > necessarily better in all cases.
> >
> > -- Ola
> >
> > >
> > > Here's odp_example cycle counts from the tip of repo (single thread,
> but
> > multi-thread figures follow the same pattern).
> > >
> > > Thread 1 starts on core 1
> > >   [1] alloc_sng alloc+free   61 cycles, 18 ns
> > >   [1] alloc_multi alloc+free 38 cycles, 11 ns
> > >   [1] poll_queue enq+deq     139 cycles, 41 ns
> > >   [1] sched_one_s_lo enq+deq 626 cycles, 184 ns
> > >   [1] sched_____s_lo enq+deq 625 cycles, 184 ns
> > >   [1] sched_one_m_lo enq+deq 620 cycles, 182 ns
> > >   [1] sched_____m_lo enq+deq 619 cycles, 182 ns
> > >   [1] sched_multi_lo enq+deq 165 cycles, 48 ns
> > >   [1] sched_one_s_hi enq+deq 331 cycles, 97 ns
> > >   [1] sched_____s_hi enq+deq 331 cycles, 97 ns
> > >   [1] sched_one_m_hi enq+deq 325 cycles, 96 ns
> > >   [1] sched_____m_hi enq+deq 325 cycles, 95 ns
> > >   [1] sched_multi_hi enq+deq 90 cycles, 26 ns
> > > Thread 1 exits
> > > ODP example complete
> > >
> > >
> > > And here's the same after removing this patch from the tip.
> > >
> > >
> > > Thread 1 starts on core 1
> > >   [1] alloc_sng alloc+free   61 cycles, 18 ns
> > >   [1] alloc_multi alloc+free 35 cycles, 10 ns
> > >   [1] poll_queue enq+deq     86 cycles, 25 ns
> > >   [1] sched_one_s_lo enq+deq 404 cycles, 119 ns
> > >   [1] sched_____s_lo enq+deq 404 cycles, 119 ns
> > >   [1] sched_one_m_lo enq+deq 403 cycles, 119 ns
> > >   [1] sched_____m_lo enq+deq 405 cycles, 119 ns
> > >   [1] sched_multi_lo enq+deq 112 cycles, 33 ns
> > >   [1] sched_one_s_hi enq+deq 216 cycles, 63 ns
> > >   [1] sched_____s_hi enq+deq 218 cycles, 64 ns
> > >   [1] sched_one_m_hi enq+deq 211 cycles, 62 ns
> > >   [1] sched_____m_hi enq+deq 212 cycles, 62 ns
> > >   [1] sched_multi_hi enq+deq 67 cycles, 19 ns
> > > Thread 1 exits
> > > ODP example complete
> > >
> > >
> > > Both ways are functionally correct, but atomic add functionality is not
> > needed here and degrades  performance at least in my x86 system.
> > >
> > >
> > > -Petri
> > >
> > >
> > >
> > >
> > >> -----Original Message-----
> > >> From: lng-odp-bounces@lists.linaro.org [mailto:lng-odp-
> > >> bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl
> > >> Sent: Thursday, November 27, 2014 5:13 PM
> > >> To: lng-odp@lists.linaro.org
> > >> Subject: [lng-odp] [PATCHv2 5/5] linux-generic: odp_ticketlock.[ch]:
> > use
> > >> odp_atomic_internal.h
> > >>
> > >> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
> > >> ---
> > >> (This document/code contribution attached is provided under the terms
> > of
> > >> agreement LES-LTM-21309)
> > >>
> > >> Use definitions from odp_atomic_internal.h.
> > >>
> > >>  platform/linux-generic/include/api/odp_ticketlock.h |  2 +-
> > >>  platform/linux-generic/odp_ticketlock.c             | 14
> ++++++-------
> > -
> > >>  2 files changed, 7 insertions(+), 9 deletions(-)
> > >>
> > >> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
> > >> b/platform/linux-generic/include/api/odp_ticketlock.h
> > >> index a3b3ab5..619816e 100644
> > >> --- a/platform/linux-generic/include/api/odp_ticketlock.h
> > >> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
> > >> @@ -31,7 +31,7 @@ extern "C" {
> > >>   */
> > >>  typedef struct odp_ticketlock_t {
> > >>       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
> > >> -     volatile uint32_t cur_ticket;  /**< @private Current ticket */
> > >> +     odp_atomic_u32_t  cur_ticket;  /**< @private Current ticket */
> > >>  } odp_ticketlock_t;
> > >>
> > >>
> > >> diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-
> > >> generic/odp_ticketlock.c
> > >> index e60f526..6c5e74e 100644
> > >> --- a/platform/linux-generic/odp_ticketlock.c
> > >> +++ b/platform/linux-generic/odp_ticketlock.c
> > >> @@ -6,6 +6,7 @@
> > >>
> > >>  #include <odp_ticketlock.h>
> > >>  #include <odp_atomic.h>
> > >> +#include <odp_atomic_internal.h>
> > >>  #include <odp_sync.h>
> > >>  #include <odp_spin_internal.h>
> > >>
> > >> @@ -13,7 +14,7 @@
> > >>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
> > >>  {
> > >>       odp_atomic_init_u32(&ticketlock->next_ticket, 0);
> > >> -     ticketlock->cur_ticket  = 0;
> > >> +     odp_atomic_init_u32(&ticketlock->cur_ticket, 0);
> > >>  }
> > >>
> > >>
> > >> @@ -23,18 +24,15 @@ void odp_ticketlock_lock(odp_ticketlock_t
> > *ticketlock)
> > >>
> > >>       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
> > >>
> > >> -     while (ticket != ticketlock->cur_ticket)
> > >> +     while (ticket !=
> _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
> > >> +                                              _ODP_MEMMODEL_ACQ))
> > >>               odp_spin();
> > >> -
> > >> -     __atomic_thread_fence(__ATOMIC_ACQUIRE);
> > >>  }
> > >>
> > >>
> > >>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
> > >>  {
> > >> -     __atomic_thread_fence(__ATOMIC_RELEASE);
> > >> -
> > >> -     ticketlock->cur_ticket++;
> > >> +     _odp_atomic_u32_add_mm(&ticketlock->cur_ticket, 1,
> > >> _ODP_MEMMODEL_RLS);
> > >>
> > >>  #if defined __OCTEON__
> > >>       odp_sync_stores(); /* SYNCW to flush write buffer */
> > >> @@ -44,6 +42,6 @@ void odp_ticketlock_unlock(odp_ticketlock_t
> > *ticketlock)
> > >>
> > >>  int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
> > >>  {
> > >> -     return ticketlock->cur_ticket !=
> > >> +     return odp_atomic_load_u32(&ticketlock->cur_ticket) !=
> > >>               odp_atomic_load_u32(&ticketlock->next_ticket);
> > >>  }
> > >> --
> > >> 1.9.1
> > >>
> > >>
> > >> _______________________________________________
> > >> lng-odp mailing list
> > >> lng-odp@lists.linaro.org
> > >> http://lists.linaro.org/mailman/listinfo/lng-odp
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/lng-odp
>
Bill Fischofer Dec. 1, 2014, 1:25 p.m. UTC | #4
odp_atomic_internal is using the GCC primitives (for now) for
linux-generic.  Those can be changed/refined as needed. The functions
themselves are useful.

We need to distinguish between APIs and implementations.  We shouldn't be
arguing for or against an API based on the virtues or deficiencies of a
specific implementation of that API.  If the API is good then optimizing
its implementation is a separate issue that should be handled as a bug fix.

On Mon, Dec 1, 2014 at 8:20 AM, Savolainen, Petri (NSN - FI/Espoo) <
petri.savolainen@nsn.com> wrote:

>  Implementation abstraction == usage of odp_atomic_internal.h in the
> implementation.
>
>
>
> -Petri
>
>
>
> *From:* ext Bill Fischofer [mailto:bill.fischofer@linaro.org]
> *Sent:* Monday, December 01, 2014 3:18 PM
> *To:* Savolainen, Petri (NSN - FI/Espoo)
> *Cc:* ext Ola Liljedahl; lng-odp@lists.linaro.org
>
> *Subject:* Re: [lng-odp] [PATCHv2 5/5] linux-generic:
> odp_ticketlock.[ch]: use odp_atomic_internal.h
>
>
>
> Isn't that a question of implementation rather than API definition?  The
> APIs are useful.  It's the responsibility for each implementation to
> realize those APIs optimally for its platform.  The alternative is to push
> that responsibility onto the application, which defeats the portability
> goals of ODP.
>
>
>
> On Mon, Dec 1, 2014 at 8:14 AM, Savolainen, Petri (NSN - FI/Espoo) <
> petri.savolainen@nsn.com> wrote:
>
> Hi,
>
> This is why lock (and lock free algorithm) implementation abstraction has
> low value. It's hard to find a single abstraction that is optimal for all
> HW architectures - a few C lines arranged differently may cause major
> performance impact. E.g. the kernel ticketlock for x86 packs both those
> variables into the same word so that they can load both with single xadd
> command ... how to abstract that.
>
> -Petri
>
>
>
> > -----Original Message-----
> > From: ext Ola Liljedahl [mailto:ola.liljedahl@linaro.org]
> > Sent: Monday, December 01, 2014 2:55 PM
> > To: Savolainen, Petri (NSN - FI/Espoo)
> > Cc: lng-odp@lists.linaro.org
> > Subject: Re: [lng-odp] [PATCHv2 5/5] linux-generic: odp_ticketlock.[ch]:
> > use odp_atomic_internal.h
> >
> > On 1 December 2014 at 13:30, Savolainen, Petri (NSN - FI/Espoo)
> > <petri.savolainen@nsn.com> wrote:
> > > Hi,
> > >
> > > This change degrades ticketlock performance. The cur_ticket variable
> > does not need atomic add functionality, since it's written only from
> > within the lock. Only correct barrier enforcement is needed.
> > cur_ticket needs atomic *store* but yes it does not need the whole RMW
> > add operation to be atomic. This was a bit excessive of me.
> >
> > A better way to implemented this would be
> > uint32_t cur = _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
> > _ODP_MEMMODEL_RLX);
> > _odp_atomic_u32_store_mm(&ticketlock->cur_ticket, cur + 1,
> > _ODP_MEMMODEL_RLS);
> >
> > 0000000000000030 <odp_ticketlock_unlock>:
> >   30:   8b 47 04                mov    0x4(%rdi),%eax
> >   33:   83 c0 01                add    $0x1,%eax
> >   36:   89 47 04                mov    %eax,0x4(%rdi)
> >   39:   c3                          retq
> >
> > I was saving performance optimizations for later. On ARM there is a
> > lot you can do with WFE/SEV (wait for event/signal event) which will
> > avoid unnecessary spinning on atomic variables and avoiding the
> > waiters interfering (loading a flag -> want shared ownership of cache
> > line) with the lock owner (which likely want exclusive ownership of
> > cache line in order to update it, this will often stall if other
> > threads keep loading the lock variable). Moving the lock variables to
> > another cache line avoids this problem but introduces another
> > potential cache miss (when accessing the protected data) so is not
> > necessarily better in all cases.
> >
> > -- Ola
> >
> > >
> > > Here's odp_example cycle counts from the tip of repo (single thread,
> but
> > multi-thread figures follow the same pattern).
> > >
> > > Thread 1 starts on core 1
> > >   [1] alloc_sng alloc+free   61 cycles, 18 ns
> > >   [1] alloc_multi alloc+free 38 cycles, 11 ns
> > >   [1] poll_queue enq+deq     139 cycles, 41 ns
> > >   [1] sched_one_s_lo enq+deq 626 cycles, 184 ns
> > >   [1] sched_____s_lo enq+deq 625 cycles, 184 ns
> > >   [1] sched_one_m_lo enq+deq 620 cycles, 182 ns
> > >   [1] sched_____m_lo enq+deq 619 cycles, 182 ns
> > >   [1] sched_multi_lo enq+deq 165 cycles, 48 ns
> > >   [1] sched_one_s_hi enq+deq 331 cycles, 97 ns
> > >   [1] sched_____s_hi enq+deq 331 cycles, 97 ns
> > >   [1] sched_one_m_hi enq+deq 325 cycles, 96 ns
> > >   [1] sched_____m_hi enq+deq 325 cycles, 95 ns
> > >   [1] sched_multi_hi enq+deq 90 cycles, 26 ns
> > > Thread 1 exits
> > > ODP example complete
> > >
> > >
> > > And here's the same after removing this patch from the tip.
> > >
> > >
> > > Thread 1 starts on core 1
> > >   [1] alloc_sng alloc+free   61 cycles, 18 ns
> > >   [1] alloc_multi alloc+free 35 cycles, 10 ns
> > >   [1] poll_queue enq+deq     86 cycles, 25 ns
> > >   [1] sched_one_s_lo enq+deq 404 cycles, 119 ns
> > >   [1] sched_____s_lo enq+deq 404 cycles, 119 ns
> > >   [1] sched_one_m_lo enq+deq 403 cycles, 119 ns
> > >   [1] sched_____m_lo enq+deq 405 cycles, 119 ns
> > >   [1] sched_multi_lo enq+deq 112 cycles, 33 ns
> > >   [1] sched_one_s_hi enq+deq 216 cycles, 63 ns
> > >   [1] sched_____s_hi enq+deq 218 cycles, 64 ns
> > >   [1] sched_one_m_hi enq+deq 211 cycles, 62 ns
> > >   [1] sched_____m_hi enq+deq 212 cycles, 62 ns
> > >   [1] sched_multi_hi enq+deq 67 cycles, 19 ns
> > > Thread 1 exits
> > > ODP example complete
> > >
> > >
> > > Both ways are functionally correct, but atomic add functionality is not
> > needed here and degrades  performance at least in my x86 system.
> > >
> > >
> > > -Petri
> > >
> > >
> > >
> > >
> > >> -----Original Message-----
> > >> From: lng-odp-bounces@lists.linaro.org [mailto:lng-odp-
> > >> bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl
> > >> Sent: Thursday, November 27, 2014 5:13 PM
> > >> To: lng-odp@lists.linaro.org
> > >> Subject: [lng-odp] [PATCHv2 5/5] linux-generic: odp_ticketlock.[ch]:
> > use
> > >> odp_atomic_internal.h
> > >>
> > >> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
> > >> ---
> > >> (This document/code contribution attached is provided under the terms
> > of
> > >> agreement LES-LTM-21309)
> > >>
> > >> Use definitions from odp_atomic_internal.h.
> > >>
> > >>  platform/linux-generic/include/api/odp_ticketlock.h |  2 +-
> > >>  platform/linux-generic/odp_ticketlock.c             | 14
> ++++++-------
> > -
> > >>  2 files changed, 7 insertions(+), 9 deletions(-)
> > >>
> > >> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
> > >> b/platform/linux-generic/include/api/odp_ticketlock.h
> > >> index a3b3ab5..619816e 100644
> > >> --- a/platform/linux-generic/include/api/odp_ticketlock.h
> > >> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
> > >> @@ -31,7 +31,7 @@ extern "C" {
> > >>   */
> > >>  typedef struct odp_ticketlock_t {
> > >>       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
> > >> -     volatile uint32_t cur_ticket;  /**< @private Current ticket */
> > >> +     odp_atomic_u32_t  cur_ticket;  /**< @private Current ticket */
> > >>  } odp_ticketlock_t;
> > >>
> > >>
> > >> diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-
> > >> generic/odp_ticketlock.c
> > >> index e60f526..6c5e74e 100644
> > >> --- a/platform/linux-generic/odp_ticketlock.c
> > >> +++ b/platform/linux-generic/odp_ticketlock.c
> > >> @@ -6,6 +6,7 @@
> > >>
> > >>  #include <odp_ticketlock.h>
> > >>  #include <odp_atomic.h>
> > >> +#include <odp_atomic_internal.h>
> > >>  #include <odp_sync.h>
> > >>  #include <odp_spin_internal.h>
> > >>
> > >> @@ -13,7 +14,7 @@
> > >>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
> > >>  {
> > >>       odp_atomic_init_u32(&ticketlock->next_ticket, 0);
> > >> -     ticketlock->cur_ticket  = 0;
> > >> +     odp_atomic_init_u32(&ticketlock->cur_ticket, 0);
> > >>  }
> > >>
> > >>
> > >> @@ -23,18 +24,15 @@ void odp_ticketlock_lock(odp_ticketlock_t
> > *ticketlock)
> > >>
> > >>       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
> > >>
> > >> -     while (ticket != ticketlock->cur_ticket)
> > >> +     while (ticket !=
> _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
> > >> +                                              _ODP_MEMMODEL_ACQ))
> > >>               odp_spin();
> > >> -
> > >> -     __atomic_thread_fence(__ATOMIC_ACQUIRE);
> > >>  }
> > >>
> > >>
> > >>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
> > >>  {
> > >> -     __atomic_thread_fence(__ATOMIC_RELEASE);
> > >> -
> > >> -     ticketlock->cur_ticket++;
> > >> +     _odp_atomic_u32_add_mm(&ticketlock->cur_ticket, 1,
> > >> _ODP_MEMMODEL_RLS);
> > >>
> > >>  #if defined __OCTEON__
> > >>       odp_sync_stores(); /* SYNCW to flush write buffer */
> > >> @@ -44,6 +42,6 @@ void odp_ticketlock_unlock(odp_ticketlock_t
> > *ticketlock)
> > >>
> > >>  int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
> > >>  {
> > >> -     return ticketlock->cur_ticket !=
> > >> +     return odp_atomic_load_u32(&ticketlock->cur_ticket) !=
> > >>               odp_atomic_load_u32(&ticketlock->next_ticket);
> > >>  }
> > >> --
> > >> 1.9.1
> > >>
> > >>
> > >> _______________________________________________
> > >> lng-odp mailing list
> > >> lng-odp@lists.linaro.org
> > >> http://lists.linaro.org/mailman/listinfo/lng-odp
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/lng-odp
>
>
>
Ola Liljedahl Dec. 1, 2014, 1:30 p.m. UTC | #5
On 1 December 2014 at 14:14, Savolainen, Petri (NSN - FI/Espoo)
<petri.savolainen@nsn.com> wrote:
> Hi,
>
> This is why lock (and lock free algorithm) implementation abstraction has low value.
I can't agree with that. Using higher level abstraction and
programming model makes it much easier to write correct code. Only
when you have correct function should you start optimization.

> It's hard to find a single abstraction that is optimal for all HW architectures - a few C lines arranged differently may cause major performance impact. E.g. the kernel ticketlock for x86 packs both those variables into the same word so that they can load both with single xadd command ... how to abstract that.
I have looked at this implementation as well and was considering
benchmarking it later. You challenge me to find an abstraction for
that that makes it possible to implement it efficiently using
C11-style atomics. Challenge accepted.

I think the bigger challenge is to find implementations that scale
nicely and avoid congesting the interconnect with loads and
invalidates because a number of threads are competing for the same
lock.

-- Ola


>
> -Petri
>
>
>> -----Original Message-----
>> From: ext Ola Liljedahl [mailto:ola.liljedahl@linaro.org]
>> Sent: Monday, December 01, 2014 2:55 PM
>> To: Savolainen, Petri (NSN - FI/Espoo)
>> Cc: lng-odp@lists.linaro.org
>> Subject: Re: [lng-odp] [PATCHv2 5/5] linux-generic: odp_ticketlock.[ch]:
>> use odp_atomic_internal.h
>>
>> On 1 December 2014 at 13:30, Savolainen, Petri (NSN - FI/Espoo)
>> <petri.savolainen@nsn.com> wrote:
>> > Hi,
>> >
>> > This change degrades ticketlock performance. The cur_ticket variable
>> does not need atomic add functionality, since it's written only from
>> within the lock. Only correct barrier enforcement is needed.
>> cur_ticket needs atomic *store* but yes it does not need the whole RMW
>> add operation to be atomic. This was a bit excessive of me.
>>
>> A better way to implemented this would be
>> uint32_t cur = _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
>> _ODP_MEMMODEL_RLX);
>> _odp_atomic_u32_store_mm(&ticketlock->cur_ticket, cur + 1,
>> _ODP_MEMMODEL_RLS);
>>
>> 0000000000000030 <odp_ticketlock_unlock>:
>>   30:   8b 47 04                mov    0x4(%rdi),%eax
>>   33:   83 c0 01                add    $0x1,%eax
>>   36:   89 47 04                mov    %eax,0x4(%rdi)
>>   39:   c3                          retq
>>
>> I was saving performance optimizations for later. On ARM there is a
>> lot you can do with WFE/SEV (wait for event/signal event) which will
>> avoid unnecessary spinning on atomic variables and avoiding the
>> waiters interfering (loading a flag -> want shared ownership of cache
>> line) with the lock owner (which likely want exclusive ownership of
>> cache line in order to update it, this will often stall if other
>> threads keep loading the lock variable). Moving the lock variables to
>> another cache line avoids this problem but introduces another
>> potential cache miss (when accessing the protected data) so is not
>> necessarily better in all cases.
>>
>> -- Ola
>>
>> >
>> > Here's odp_example cycle counts from the tip of repo (single thread, but
>> multi-thread figures follow the same pattern).
>> >
>> > Thread 1 starts on core 1
>> >   [1] alloc_sng alloc+free   61 cycles, 18 ns
>> >   [1] alloc_multi alloc+free 38 cycles, 11 ns
>> >   [1] poll_queue enq+deq     139 cycles, 41 ns
>> >   [1] sched_one_s_lo enq+deq 626 cycles, 184 ns
>> >   [1] sched_____s_lo enq+deq 625 cycles, 184 ns
>> >   [1] sched_one_m_lo enq+deq 620 cycles, 182 ns
>> >   [1] sched_____m_lo enq+deq 619 cycles, 182 ns
>> >   [1] sched_multi_lo enq+deq 165 cycles, 48 ns
>> >   [1] sched_one_s_hi enq+deq 331 cycles, 97 ns
>> >   [1] sched_____s_hi enq+deq 331 cycles, 97 ns
>> >   [1] sched_one_m_hi enq+deq 325 cycles, 96 ns
>> >   [1] sched_____m_hi enq+deq 325 cycles, 95 ns
>> >   [1] sched_multi_hi enq+deq 90 cycles, 26 ns
>> > Thread 1 exits
>> > ODP example complete
>> >
>> >
>> > And here's the same after removing this patch from the tip.
>> >
>> >
>> > Thread 1 starts on core 1
>> >   [1] alloc_sng alloc+free   61 cycles, 18 ns
>> >   [1] alloc_multi alloc+free 35 cycles, 10 ns
>> >   [1] poll_queue enq+deq     86 cycles, 25 ns
>> >   [1] sched_one_s_lo enq+deq 404 cycles, 119 ns
>> >   [1] sched_____s_lo enq+deq 404 cycles, 119 ns
>> >   [1] sched_one_m_lo enq+deq 403 cycles, 119 ns
>> >   [1] sched_____m_lo enq+deq 405 cycles, 119 ns
>> >   [1] sched_multi_lo enq+deq 112 cycles, 33 ns
>> >   [1] sched_one_s_hi enq+deq 216 cycles, 63 ns
>> >   [1] sched_____s_hi enq+deq 218 cycles, 64 ns
>> >   [1] sched_one_m_hi enq+deq 211 cycles, 62 ns
>> >   [1] sched_____m_hi enq+deq 212 cycles, 62 ns
>> >   [1] sched_multi_hi enq+deq 67 cycles, 19 ns
>> > Thread 1 exits
>> > ODP example complete
>> >
>> >
>> > Both ways are functionally correct, but atomic add functionality is not
>> needed here and degrades performance at least in my x86 system.
>> >
>> >
>> > -Petri
>> >
>> >
>> >
>> >
>> >> -----Original Message-----
>> >> From: lng-odp-bounces@lists.linaro.org [mailto:lng-odp-
>> >> bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl
>> >> Sent: Thursday, November 27, 2014 5:13 PM
>> >> To: lng-odp@lists.linaro.org
>> >> Subject: [lng-odp] [PATCHv2 5/5] linux-generic: odp_ticketlock.[ch]:
>> use
>> >> odp_atomic_internal.h
>> >>
>> >> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>> >> ---
>> >> (This document/code contribution attached is provided under the terms
>> of
>> >> agreement LES-LTM-21309)
>> >>
>> >> Use definitions from odp_atomic_internal.h.
>> >>
>> >>  platform/linux-generic/include/api/odp_ticketlock.h |  2 +-
>> >>  platform/linux-generic/odp_ticketlock.c             | 14 ++++++-------
>> -
>> >>  2 files changed, 7 insertions(+), 9 deletions(-)
>> >>
>> >> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
>> >> b/platform/linux-generic/include/api/odp_ticketlock.h
>> >> index a3b3ab5..619816e 100644
>> >> --- a/platform/linux-generic/include/api/odp_ticketlock.h
>> >> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
>> >> @@ -31,7 +31,7 @@ extern "C" {
>> >>   */
>> >>  typedef struct odp_ticketlock_t {
>> >>       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
>> >> -     volatile uint32_t cur_ticket;  /**< @private Current ticket */
>> >> +     odp_atomic_u32_t  cur_ticket;  /**< @private Current ticket */
>> >>  } odp_ticketlock_t;
>> >>
>> >>
>> >> diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-
>> >> generic/odp_ticketlock.c
>> >> index e60f526..6c5e74e 100644
>> >> --- a/platform/linux-generic/odp_ticketlock.c
>> >> +++ b/platform/linux-generic/odp_ticketlock.c
>> >> @@ -6,6 +6,7 @@
>> >>
>> >>  #include <odp_ticketlock.h>
>> >>  #include <odp_atomic.h>
>> >> +#include <odp_atomic_internal.h>
>> >>  #include <odp_sync.h>
>> >>  #include <odp_spin_internal.h>
>> >>
>> >> @@ -13,7 +14,7 @@
>> >>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>> >>  {
>> >>       odp_atomic_init_u32(&ticketlock->next_ticket, 0);
>> >> -     ticketlock->cur_ticket  = 0;
>> >> +     odp_atomic_init_u32(&ticketlock->cur_ticket, 0);
>> >>  }
>> >>
>> >>
>> >> @@ -23,18 +24,15 @@ void odp_ticketlock_lock(odp_ticketlock_t
>> *ticketlock)
>> >>
>> >>       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
>> >>
>> >> -     while (ticket != ticketlock->cur_ticket)
>> >> +     while (ticket != _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
>> >> +                                              _ODP_MEMMODEL_ACQ))
>> >>               odp_spin();
>> >> -
>> >> -     __atomic_thread_fence(__ATOMIC_ACQUIRE);
>> >>  }
>> >>
>> >>
>> >>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>> >>  {
>> >> -     __atomic_thread_fence(__ATOMIC_RELEASE);
>> >> -
>> >> -     ticketlock->cur_ticket++;
>> >> +     _odp_atomic_u32_add_mm(&ticketlock->cur_ticket, 1,
>> >> _ODP_MEMMODEL_RLS);
>> >>
>> >>  #if defined __OCTEON__
>> >>       odp_sync_stores(); /* SYNCW to flush write buffer */
>> >> @@ -44,6 +42,6 @@ void odp_ticketlock_unlock(odp_ticketlock_t
>> *ticketlock)
>> >>
>> >>  int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
>> >>  {
>> >> -     return ticketlock->cur_ticket !=
>> >> +     return odp_atomic_load_u32(&ticketlock->cur_ticket) !=
>> >>               odp_atomic_load_u32(&ticketlock->next_ticket);
>> >>  }
>> >> --
>> >> 1.9.1
>> >>
>> >>
>> >> _______________________________________________
>> >> lng-odp mailing list
>> >> lng-odp@lists.linaro.org
>> >> http://lists.linaro.org/mailman/listinfo/lng-odp
Ola Liljedahl Dec. 1, 2014, 1:43 p.m. UTC | #6
On 1 December 2014 at 14:25, Bill Fischofer <bill.fischofer@linaro.org> wrote:
> odp_atomic_internal is using the GCC primitives (for now) for linux-generic.
> Those can be changed/refined as needed. The functions themselves are useful.
>
> We need to distinguish between APIs and implementations.  We shouldn't be
> arguing for or against an API based on the virtues or deficiencies of a
> specific implementation of that API.  If the API is good then optimizing its
> implementation is a separate issue that should be handled as a bug fix.
The problem was not even the implementation of this API. I used an
atomic-RMW operation where none was needed (only atomic store needed)
and this is not optimal for some architectures (e.g. x86). But I
wasn't attempting to optimize for performance, just optimizing for
correctness.

-- Ola

>
> On Mon, Dec 1, 2014 at 8:20 AM, Savolainen, Petri (NSN - FI/Espoo)
> <petri.savolainen@nsn.com> wrote:
>>
>> Implementation abstraction == usage of odp_atomic_internal.h in the
>> implementation.
>>
>>
>>
>> -Petri
>>
>>
>>
>> From: ext Bill Fischofer [mailto:bill.fischofer@linaro.org]
>> Sent: Monday, December 01, 2014 3:18 PM
>> To: Savolainen, Petri (NSN - FI/Espoo)
>> Cc: ext Ola Liljedahl; lng-odp@lists.linaro.org
>>
>>
>> Subject: Re: [lng-odp] [PATCHv2 5/5] linux-generic: odp_ticketlock.[ch]:
>> use odp_atomic_internal.h
>>
>>
>>
>> Isn't that a question of implementation rather than API definition?  The
>> APIs are useful.  It's the responsibility for each implementation to realize
>> those APIs optimally for its platform.  The alternative is to push that
>> responsibility onto the application, which defeats the portability goals of
>> ODP.
>>
>>
>>
>> On Mon, Dec 1, 2014 at 8:14 AM, Savolainen, Petri (NSN - FI/Espoo)
>> <petri.savolainen@nsn.com> wrote:
>>
>> Hi,
>>
>> This is why lock (and lock free algorithm) implementation abstraction has
>> low value. It's hard to find a single abstraction that is optimal for all HW
>> architectures - a few C lines arranged differently may cause major
>> performance impact. E.g. the kernel ticketlock for x86 packs both those
>> variables into the same word so that they can load both with single xadd
>> command ... how to abstract that.
>>
>> -Petri
>>
>>
>>
>> > -----Original Message-----
>> > From: ext Ola Liljedahl [mailto:ola.liljedahl@linaro.org]
>> > Sent: Monday, December 01, 2014 2:55 PM
>> > To: Savolainen, Petri (NSN - FI/Espoo)
>> > Cc: lng-odp@lists.linaro.org
>> > Subject: Re: [lng-odp] [PATCHv2 5/5] linux-generic: odp_ticketlock.[ch]:
>> > use odp_atomic_internal.h
>> >
>> > On 1 December 2014 at 13:30, Savolainen, Petri (NSN - FI/Espoo)
>> > <petri.savolainen@nsn.com> wrote:
>> > > Hi,
>> > >
>> > > This change degrades ticketlock performance. The cur_ticket variable
>> > does not need atomic add functionality, since it's written only from
>> > within the lock. Only correct barrier enforcement is needed.
>> > cur_ticket needs atomic *store* but yes it does not need the whole RMW
>> > add operation to be atomic. This was a bit excessive of me.
>> >
>> > A better way to implement this would be
>> > uint32_t cur = _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
>> > _ODP_MEMMODEL_RLX);
>> > _odp_atomic_u32_store_mm(&ticketlock->cur_ticket, cur + 1,
>> > _ODP_MEMMODEL_RLS);
>> >
>> > 0000000000000030 <odp_ticketlock_unlock>:
>> >   30:   8b 47 04                mov    0x4(%rdi),%eax
>> >   33:   83 c0 01                add    $0x1,%eax
>> >   36:   89 47 04                mov    %eax,0x4(%rdi)
>> >   39:   c3                          retq
>> >
>> > I was saving performance optimizations for later. On ARM there is a
>> > lot you can do with WFE/SEV (wait for event/signal event) which will
>> > avoid unnecessary spinning on atomic variables and avoiding the
>> > waiters interfering (loading a flag -> want shared ownership of cache
>> > line) with the lock owner (which likely want exclusive ownership of
>> > cache line in order to update it, this will often stall if other
>> > threads keep loading the lock variable). Moving the lock variables to
>> > another cache line avoids this problem but introduces another
>> > potential cache miss (when accessing the protected data) so is not
>> > necessarily better in all cases.
>> >
>> > -- Ola
>> >
>> > >
>> > > Here's odp_example cycle counts from the tip of repo (single thread,
>> > > but
>> > multi-thread figures follow the same pattern).
>> > >
>> > > Thread 1 starts on core 1
>> > >   [1] alloc_sng alloc+free   61 cycles, 18 ns
>> > >   [1] alloc_multi alloc+free 38 cycles, 11 ns
>> > >   [1] poll_queue enq+deq     139 cycles, 41 ns
>> > >   [1] sched_one_s_lo enq+deq 626 cycles, 184 ns
>> > >   [1] sched_____s_lo enq+deq 625 cycles, 184 ns
>> > >   [1] sched_one_m_lo enq+deq 620 cycles, 182 ns
>> > >   [1] sched_____m_lo enq+deq 619 cycles, 182 ns
>> > >   [1] sched_multi_lo enq+deq 165 cycles, 48 ns
>> > >   [1] sched_one_s_hi enq+deq 331 cycles, 97 ns
>> > >   [1] sched_____s_hi enq+deq 331 cycles, 97 ns
>> > >   [1] sched_one_m_hi enq+deq 325 cycles, 96 ns
>> > >   [1] sched_____m_hi enq+deq 325 cycles, 95 ns
>> > >   [1] sched_multi_hi enq+deq 90 cycles, 26 ns
>> > > Thread 1 exits
>> > > ODP example complete
>> > >
>> > >
>> > > And here's the same after removing this patch from the tip.
>> > >
>> > >
>> > > Thread 1 starts on core 1
>> > >   [1] alloc_sng alloc+free   61 cycles, 18 ns
>> > >   [1] alloc_multi alloc+free 35 cycles, 10 ns
>> > >   [1] poll_queue enq+deq     86 cycles, 25 ns
>> > >   [1] sched_one_s_lo enq+deq 404 cycles, 119 ns
>> > >   [1] sched_____s_lo enq+deq 404 cycles, 119 ns
>> > >   [1] sched_one_m_lo enq+deq 403 cycles, 119 ns
>> > >   [1] sched_____m_lo enq+deq 405 cycles, 119 ns
>> > >   [1] sched_multi_lo enq+deq 112 cycles, 33 ns
>> > >   [1] sched_one_s_hi enq+deq 216 cycles, 63 ns
>> > >   [1] sched_____s_hi enq+deq 218 cycles, 64 ns
>> > >   [1] sched_one_m_hi enq+deq 211 cycles, 62 ns
>> > >   [1] sched_____m_hi enq+deq 212 cycles, 62 ns
>> > >   [1] sched_multi_hi enq+deq 67 cycles, 19 ns
>> > > Thread 1 exits
>> > > ODP example complete
>> > >
>> > >
>> > > Both ways are functionally correct, but atomic add functionality is
>> > > not
> >> > needed here and degrades performance at least in my x86 system.
>> > >
>> > >
>> > > -Petri
>> > >
>> > >
>> > >
>> > >
>> > >> -----Original Message-----
>> > >> From: lng-odp-bounces@lists.linaro.org [mailto:lng-odp-
>> > >> bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl
>> > >> Sent: Thursday, November 27, 2014 5:13 PM
>> > >> To: lng-odp@lists.linaro.org
>> > >> Subject: [lng-odp] [PATCHv2 5/5] linux-generic: odp_ticketlock.[ch]:
>> > use
>> > >> odp_atomic_internal.h
>> > >>
>> > >> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>> > >> ---
>> > >> (This document/code contribution attached is provided under the terms
>> > of
>> > >> agreement LES-LTM-21309)
>> > >>
>> > >> Use definitions from odp_atomic_internal.h.
>> > >>
>> > >>  platform/linux-generic/include/api/odp_ticketlock.h |  2 +-
>> > >>  platform/linux-generic/odp_ticketlock.c             | 14
>> > >> ++++++-------
>> > -
>> > >>  2 files changed, 7 insertions(+), 9 deletions(-)
>> > >>
>> > >> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
>> > >> b/platform/linux-generic/include/api/odp_ticketlock.h
>> > >> index a3b3ab5..619816e 100644
>> > >> --- a/platform/linux-generic/include/api/odp_ticketlock.h
>> > >> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
>> > >> @@ -31,7 +31,7 @@ extern "C" {
>> > >>   */
>> > >>  typedef struct odp_ticketlock_t {
>> > >>       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
>> > >> -     volatile uint32_t cur_ticket;  /**< @private Current ticket */
>> > >> +     odp_atomic_u32_t  cur_ticket;  /**< @private Current ticket */
>> > >>  } odp_ticketlock_t;
>> > >>
>> > >>
>> > >> diff --git a/platform/linux-generic/odp_ticketlock.c
>> > >> b/platform/linux-
>> > >> generic/odp_ticketlock.c
>> > >> index e60f526..6c5e74e 100644
>> > >> --- a/platform/linux-generic/odp_ticketlock.c
>> > >> +++ b/platform/linux-generic/odp_ticketlock.c
>> > >> @@ -6,6 +6,7 @@
>> > >>
>> > >>  #include <odp_ticketlock.h>
>> > >>  #include <odp_atomic.h>
>> > >> +#include <odp_atomic_internal.h>
>> > >>  #include <odp_sync.h>
>> > >>  #include <odp_spin_internal.h>
>> > >>
>> > >> @@ -13,7 +14,7 @@
>> > >>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>> > >>  {
>> > >>       odp_atomic_init_u32(&ticketlock->next_ticket, 0);
>> > >> -     ticketlock->cur_ticket  = 0;
>> > >> +     odp_atomic_init_u32(&ticketlock->cur_ticket, 0);
>> > >>  }
>> > >>
>> > >>
>> > >> @@ -23,18 +24,15 @@ void odp_ticketlock_lock(odp_ticketlock_t
>> > *ticketlock)
>> > >>
>> > >>       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
>> > >>
>> > >> -     while (ticket != ticketlock->cur_ticket)
>> > >> +     while (ticket !=
>> > >> _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
>> > >> +                                              _ODP_MEMMODEL_ACQ))
>> > >>               odp_spin();
>> > >> -
>> > >> -     __atomic_thread_fence(__ATOMIC_ACQUIRE);
>> > >>  }
>> > >>
>> > >>
>> > >>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>> > >>  {
>> > >> -     __atomic_thread_fence(__ATOMIC_RELEASE);
>> > >> -
>> > >> -     ticketlock->cur_ticket++;
>> > >> +     _odp_atomic_u32_add_mm(&ticketlock->cur_ticket, 1,
>> > >> _ODP_MEMMODEL_RLS);
>> > >>
>> > >>  #if defined __OCTEON__
>> > >>       odp_sync_stores(); /* SYNCW to flush write buffer */
>> > >> @@ -44,6 +42,6 @@ void odp_ticketlock_unlock(odp_ticketlock_t
>> > *ticketlock)
>> > >>
>> > >>  int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
>> > >>  {
>> > >> -     return ticketlock->cur_ticket !=
>> > >> +     return odp_atomic_load_u32(&ticketlock->cur_ticket) !=
>> > >>               odp_atomic_load_u32(&ticketlock->next_ticket);
>> > >>  }
>> > >> --
>> > >> 1.9.1
>> > >>
>> > >>
>> > >> _______________________________________________
>> > >> lng-odp mailing list
>> > >> lng-odp@lists.linaro.org
>> > >> http://lists.linaro.org/mailman/listinfo/lng-odp
>> _______________________________________________
>> lng-odp mailing list
>> lng-odp@lists.linaro.org
>> http://lists.linaro.org/mailman/listinfo/lng-odp
>>
>>
>
>
Bill Fischofer Dec. 1, 2014, 1:51 p.m. UTC | #7
I agree with that approach.  Getting things correct first is the most
important.  Getting the wrong answer quickly isn't terribly useful.  :)

On Mon, Dec 1, 2014 at 8:43 AM, Ola Liljedahl <ola.liljedahl@linaro.org>
wrote:

> On 1 December 2014 at 14:25, Bill Fischofer <bill.fischofer@linaro.org>
> wrote:
> > odp_atomic_internal is using the GCC primitives (for now) for
> linux-generic.
> > Those can be changed/refined as needed. The functions themselves are
> useful.
> >
> > We need to distinguish between APIs and implementations.  We shouldn't be
> > arguing for or against an API based on the virtues or deficiencies of a
> > specific implementation of that API.  If the API is good then optimizing
> its
> > implementation is a separate issue that should be handled as a bug fix.
> The problem was not even the implementation of this API. I used an
> atomic-RMW operation where none was needed (only atomic store needed)
> and this is not optimal for some architectures (e.g. x86). But I
> wasn't attempting to optimize for performance, just optimizing for
> correctness.
>
> -- Ola
>
> >
> > On Mon, Dec 1, 2014 at 8:20 AM, Savolainen, Petri (NSN - FI/Espoo)
> > <petri.savolainen@nsn.com> wrote:
> >>
> >> Implementation abstraction == usage of odp_atomic_internal.h in the
> >> implementation.
> >>
> >>
> >>
> >> -Petri
> >>
> >>
> >>
> >> From: ext Bill Fischofer [mailto:bill.fischofer@linaro.org]
> >> Sent: Monday, December 01, 2014 3:18 PM
> >> To: Savolainen, Petri (NSN - FI/Espoo)
> >> Cc: ext Ola Liljedahl; lng-odp@lists.linaro.org
> >>
> >>
> >> Subject: Re: [lng-odp] [PATCHv2 5/5] linux-generic: odp_ticketlock.[ch]:
> >> use odp_atomic_internal.h
> >>
> >>
> >>
> >> Isn't that a question of implementation rather than API definition?  The
> >> APIs are useful.  It's the responsibility for each implementation to
> realize
> >> those APIs optimally for its platform.  The alternative is to push that
> >> responsibility onto the application, which defeats the portability
> goals of
> >> ODP.
> >>
> >>
> >>
> >> On Mon, Dec 1, 2014 at 8:14 AM, Savolainen, Petri (NSN - FI/Espoo)
> >> <petri.savolainen@nsn.com> wrote:
> >>
> >> Hi,
> >>
> >> This is why lock (and lock free algorithm) implementation abstraction
> has
> >> low value. It's hard to find a single abstraction that is optimal for
> all HW
> >> architectures - a few C lines arranged differently may cause major
> >> performance impact. E.g. the kernel ticketlock for x86 packs both those
> >> variables into the same word so that they can load both with single xadd
> >> command ... how to abstract that.
> >>
> >> -Petri
> >>
> >>
> >>
> >> > -----Original Message-----
> >> > From: ext Ola Liljedahl [mailto:ola.liljedahl@linaro.org]
> >> > Sent: Monday, December 01, 2014 2:55 PM
> >> > To: Savolainen, Petri (NSN - FI/Espoo)
> >> > Cc: lng-odp@lists.linaro.org
> >> > Subject: Re: [lng-odp] [PATCHv2 5/5] linux-generic:
> odp_ticketlock.[ch]:
> >> > use odp_atomic_internal.h
> >> >
> >> > On 1 December 2014 at 13:30, Savolainen, Petri (NSN - FI/Espoo)
> >> > <petri.savolainen@nsn.com> wrote:
> >> > > Hi,
> >> > >
> >> > > This change degrades ticketlock performance. The cur_ticket variable
> >> > does not need atomic add functionality, since it's written only from
> >> > within the lock. Only correct barrier enforcement is needed.
> >> > cur_ticket needs atomic *store* but yes it does not need the whole RMW
> >> > add operation to be atomic. This was a bit excessive of me.
> >> >
> >> > A better way to implement this would be
> >> > uint32_t cur = _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
> >> > _ODP_MEMMODEL_RLX);
> >> > _odp_atomic_u32_store_mm(&ticketlock->cur_ticket, cur + 1,
> >> > _ODP_MEMMODEL_RLS);
> >> >
> >> > 0000000000000030 <odp_ticketlock_unlock>:
> >> >   30:   8b 47 04                mov    0x4(%rdi),%eax
> >> >   33:   83 c0 01                add    $0x1,%eax
> >> >   36:   89 47 04                mov    %eax,0x4(%rdi)
> >> >   39:   c3                          retq
> >> >
> >> > I was saving performance optimizations for later. On ARM there is a
> >> > lot you can do with WFE/SEV (wait for event/signal event) which will
> >> > avoid unnecessary spinning on atomic variables and avoiding the
> >> > waiters interfering (loading a flag -> want shared ownership of cache
> >> > line) with the lock owner (which likely want exclusive ownership of
> >> > cache line in order to update it, this will often stall if other
> >> > threads keep loading the lock variable). Moving the lock variables to
> >> > another cache line avoids this problem but introduces another
> >> > potential cache miss (when accessing the protected data) so is not
> >> > necessarily better in all cases.
> >> >
> >> > -- Ola
> >> >
> >> > >
> >> > > Here's odp_example cycle counts from the tip of repo (single thread,
> >> > > but
> >> > multi-thread figures follow the same pattern).
> >> > >
> >> > > Thread 1 starts on core 1
> >> > >   [1] alloc_sng alloc+free   61 cycles, 18 ns
> >> > >   [1] alloc_multi alloc+free 38 cycles, 11 ns
> >> > >   [1] poll_queue enq+deq     139 cycles, 41 ns
> >> > >   [1] sched_one_s_lo enq+deq 626 cycles, 184 ns
> >> > >   [1] sched_____s_lo enq+deq 625 cycles, 184 ns
> >> > >   [1] sched_one_m_lo enq+deq 620 cycles, 182 ns
> >> > >   [1] sched_____m_lo enq+deq 619 cycles, 182 ns
> >> > >   [1] sched_multi_lo enq+deq 165 cycles, 48 ns
> >> > >   [1] sched_one_s_hi enq+deq 331 cycles, 97 ns
> >> > >   [1] sched_____s_hi enq+deq 331 cycles, 97 ns
> >> > >   [1] sched_one_m_hi enq+deq 325 cycles, 96 ns
> >> > >   [1] sched_____m_hi enq+deq 325 cycles, 95 ns
> >> > >   [1] sched_multi_hi enq+deq 90 cycles, 26 ns
> >> > > Thread 1 exits
> >> > > ODP example complete
> >> > >
> >> > >
> >> > > And here's the same after removing this patch from the tip.
> >> > >
> >> > >
> >> > > Thread 1 starts on core 1
> >> > >   [1] alloc_sng alloc+free   61 cycles, 18 ns
> >> > >   [1] alloc_multi alloc+free 35 cycles, 10 ns
> >> > >   [1] poll_queue enq+deq     86 cycles, 25 ns
> >> > >   [1] sched_one_s_lo enq+deq 404 cycles, 119 ns
> >> > >   [1] sched_____s_lo enq+deq 404 cycles, 119 ns
> >> > >   [1] sched_one_m_lo enq+deq 403 cycles, 119 ns
> >> > >   [1] sched_____m_lo enq+deq 405 cycles, 119 ns
> >> > >   [1] sched_multi_lo enq+deq 112 cycles, 33 ns
> >> > >   [1] sched_one_s_hi enq+deq 216 cycles, 63 ns
> >> > >   [1] sched_____s_hi enq+deq 218 cycles, 64 ns
> >> > >   [1] sched_one_m_hi enq+deq 211 cycles, 62 ns
> >> > >   [1] sched_____m_hi enq+deq 212 cycles, 62 ns
> >> > >   [1] sched_multi_hi enq+deq 67 cycles, 19 ns
> >> > > Thread 1 exits
> >> > > ODP example complete
> >> > >
> >> > >
> >> > > Both ways are functionally correct, but atomic add functionality is
> >> > > not
> >> > needed here and degrades  performance at least in my x86 system.
> >> > >
> >> > >
> >> > > -Petri
> >> > >
> >> > >
> >> > >
> >> > >
> >> > >> -----Original Message-----
> >> > >> From: lng-odp-bounces@lists.linaro.org [mailto:lng-odp-
> >> > >> bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl
> >> > >> Sent: Thursday, November 27, 2014 5:13 PM
> >> > >> To: lng-odp@lists.linaro.org
> >> > >> Subject: [lng-odp] [PATCHv2 5/5] linux-generic:
> odp_ticketlock.[ch]:
> >> > use
> >> > >> odp_atomic_internal.h
> >> > >>
> >> > >> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
> >> > >> ---
> >> > >> (This document/code contribution attached is provided under the
> terms
> >> > of
> >> > >> agreement LES-LTM-21309)
> >> > >>
> >> > >> Use definitions from odp_atomic_internal.h.
> >> > >>
> >> > >>  platform/linux-generic/include/api/odp_ticketlock.h |  2 +-
> >> > >>  platform/linux-generic/odp_ticketlock.c             | 14
> >> > >> ++++++-------
> >> > -
> >> > >>  2 files changed, 7 insertions(+), 9 deletions(-)
> >> > >>
> >> > >> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
> >> > >> b/platform/linux-generic/include/api/odp_ticketlock.h
> >> > >> index a3b3ab5..619816e 100644
> >> > >> --- a/platform/linux-generic/include/api/odp_ticketlock.h
> >> > >> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
> >> > >> @@ -31,7 +31,7 @@ extern "C" {
> >> > >>   */
> >> > >>  typedef struct odp_ticketlock_t {
> >> > >>       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
> >> > >> -     volatile uint32_t cur_ticket;  /**< @private Current ticket
> */
> >> > >> +     odp_atomic_u32_t  cur_ticket;  /**< @private Current ticket
> */
> >> > >>  } odp_ticketlock_t;
> >> > >>
> >> > >>
> >> > >> diff --git a/platform/linux-generic/odp_ticketlock.c
> >> > >> b/platform/linux-
> >> > >> generic/odp_ticketlock.c
> >> > >> index e60f526..6c5e74e 100644
> >> > >> --- a/platform/linux-generic/odp_ticketlock.c
> >> > >> +++ b/platform/linux-generic/odp_ticketlock.c
> >> > >> @@ -6,6 +6,7 @@
> >> > >>
> >> > >>  #include <odp_ticketlock.h>
> >> > >>  #include <odp_atomic.h>
> >> > >> +#include <odp_atomic_internal.h>
> >> > >>  #include <odp_sync.h>
> >> > >>  #include <odp_spin_internal.h>
> >> > >>
> >> > >> @@ -13,7 +14,7 @@
> >> > >>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
> >> > >>  {
> >> > >>       odp_atomic_init_u32(&ticketlock->next_ticket, 0);
> >> > >> -     ticketlock->cur_ticket  = 0;
> >> > >> +     odp_atomic_init_u32(&ticketlock->cur_ticket, 0);
> >> > >>  }
> >> > >>
> >> > >>
> >> > >> @@ -23,18 +24,15 @@ void odp_ticketlock_lock(odp_ticketlock_t
> >> > *ticketlock)
> >> > >>
> >> > >>       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
> >> > >>
> >> > >> -     while (ticket != ticketlock->cur_ticket)
> >> > >> +     while (ticket !=
> >> > >> _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
> >> > >> +                                              _ODP_MEMMODEL_ACQ))
> >> > >>               odp_spin();
> >> > >> -
> >> > >> -     __atomic_thread_fence(__ATOMIC_ACQUIRE);
> >> > >>  }
> >> > >>
> >> > >>
> >> > >>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
> >> > >>  {
> >> > >> -     __atomic_thread_fence(__ATOMIC_RELEASE);
> >> > >> -
> >> > >> -     ticketlock->cur_ticket++;
> >> > >> +     _odp_atomic_u32_add_mm(&ticketlock->cur_ticket, 1,
> >> > >> _ODP_MEMMODEL_RLS);
> >> > >>
> >> > >>  #if defined __OCTEON__
> >> > >>       odp_sync_stores(); /* SYNCW to flush write buffer */
> >> > >> @@ -44,6 +42,6 @@ void odp_ticketlock_unlock(odp_ticketlock_t
> >> > *ticketlock)
> >> > >>
> >> > >>  int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
> >> > >>  {
> >> > >> -     return ticketlock->cur_ticket !=
> >> > >> +     return odp_atomic_load_u32(&ticketlock->cur_ticket) !=
> >> > >>               odp_atomic_load_u32(&ticketlock->next_ticket);
> >> > >>  }
> >> > >> --
> >> > >> 1.9.1
> >> > >>
> >> > >>
> >> > >> _______________________________________________
> >> > >> lng-odp mailing list
> >> > >> lng-odp@lists.linaro.org
> >> > >> http://lists.linaro.org/mailman/listinfo/lng-odp
> >> _______________________________________________
> >> lng-odp mailing list
> >> lng-odp@lists.linaro.org
> >> http://lists.linaro.org/mailman/listinfo/lng-odp
> >>
> >>
> >
> >
>
diff mbox

Patch

diff --git a/platform/linux-generic/include/api/odp_ticketlock.h b/platform/linux-generic/include/api/odp_ticketlock.h
index a3b3ab5..619816e 100644
--- a/platform/linux-generic/include/api/odp_ticketlock.h
+++ b/platform/linux-generic/include/api/odp_ticketlock.h
@@ -31,7 +31,7 @@  extern "C" {
  */
 typedef struct odp_ticketlock_t {
 	odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
-	volatile uint32_t cur_ticket;  /**< @private Current ticket */
+	odp_atomic_u32_t  cur_ticket;  /**< @private Current ticket */
 } odp_ticketlock_t;
 
 
diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-generic/odp_ticketlock.c
index e60f526..6c5e74e 100644
--- a/platform/linux-generic/odp_ticketlock.c
+++ b/platform/linux-generic/odp_ticketlock.c
@@ -6,6 +6,7 @@ 
 
 #include <odp_ticketlock.h>
 #include <odp_atomic.h>
+#include <odp_atomic_internal.h>
 #include <odp_sync.h>
 #include <odp_spin_internal.h>
 
@@ -13,7 +14,7 @@ 
 void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
 {
 	odp_atomic_init_u32(&ticketlock->next_ticket, 0);
-	ticketlock->cur_ticket  = 0;
+	odp_atomic_init_u32(&ticketlock->cur_ticket, 0);
 }
 
 
@@ -23,18 +24,15 @@  void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
 
 	ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
 
-	while (ticket != ticketlock->cur_ticket)
+	while (ticket != _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
+						 _ODP_MEMMODEL_ACQ))
 		odp_spin();
-
-	__atomic_thread_fence(__ATOMIC_ACQUIRE);
 }
 
 
 void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
 {
-	__atomic_thread_fence(__ATOMIC_RELEASE);
-
-	ticketlock->cur_ticket++;
+	_odp_atomic_u32_add_mm(&ticketlock->cur_ticket, 1, _ODP_MEMMODEL_RLS);
 
 #if defined __OCTEON__
 	odp_sync_stores(); /* SYNCW to flush write buffer */
@@ -44,6 +42,6 @@  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
 
 int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
 {
-	return ticketlock->cur_ticket !=
+	return odp_atomic_load_u32(&ticketlock->cur_ticket) !=
 		odp_atomic_load_u32(&ticketlock->next_ticket);
 }