diff mbox

Fix memory ordering in ring dequeue

Message ID 20170322045417.13555-1-brian.brooks@linaro.org
State Accepted
Commit 7ba232f77c60f707d279478a798f6ea14fe9c143
Headers show

Commit Message

Brian Brooks March 22, 2017, 4:54 a.m. UTC
Acquire ordering is needed to maintain proper release consistency with
the ring enqueue operation. This issue presented itself as deadlock when
running on an ARM-based chip.

Signed-off-by: Brian Brooks <brian.brooks@linaro.org>

---
 platform/linux-generic/include/odp_ring_internal.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

-- 
2.12.0

Comments

Bill Fischofer March 22, 2017, 12:36 p.m. UTC | #1
On Tue, Mar 21, 2017 at 11:54 PM, Brian Brooks <brian.brooks@linaro.org> wrote:
>

> Acquire ordering is needed to maintain proper release consistency with

> the ring enqueue operation. This issue presented itself as deadlock when

> running on an ARM-based chip.

>

> Signed-off-by: Brian Brooks <brian.brooks@linaro.org>


Reviewed-by: Bill Fischofer <bill.fischofer@linaro.org>


> ---

>  platform/linux-generic/include/odp_ring_internal.h | 4 ++--

>  1 file changed, 2 insertions(+), 2 deletions(-)

>

> diff --git a/platform/linux-generic/include/odp_ring_internal.h b/platform/linux-generic/include/odp_ring_internal.h

> index 55fedeb3..44b83c60 100644

> --- a/platform/linux-generic/include/odp_ring_internal.h

> +++ b/platform/linux-generic/include/odp_ring_internal.h

> @@ -57,7 +57,7 @@ static inline uint32_t ring_deq(ring_t *ring, uint32_t mask)

>

>         /* Move reader head. This thread owns data at the new head. */

>         do {

> -               tail = odp_atomic_load_u32(&ring->w_tail);

> +               tail = odp_atomic_load_acq_u32(&ring->w_tail);

>

>                 if (head == tail)

>                         return RING_EMPTY;

> @@ -90,7 +90,7 @@ static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,

>

>         /* Move reader head. This thread owns data at the new head. */

>         do {

> -               tail = odp_atomic_load_u32(&ring->w_tail);

> +               tail = odp_atomic_load_acq_u32(&ring->w_tail);

>

>                 /* Ring is empty */

>                 if (head == tail)

> --

> 2.12.0

>
Maxim Uvarov March 22, 2017, 3:37 p.m. UTC | #2
On 03/22/17 07:54, Brian Brooks wrote:
> Acquire ordering is needed to maintain proper release consistency with

> the ring enqueue operation. This issue presented itself as deadlock when

> running on an ARM-based chip.

> 

> Signed-off-by: Brian Brooks <brian.brooks@linaro.org>

> ---

>  platform/linux-generic/include/odp_ring_internal.h | 4 ++--

>  1 file changed, 2 insertions(+), 2 deletions(-)

> 

> diff --git a/platform/linux-generic/include/odp_ring_internal.h b/platform/linux-generic/include/odp_ring_internal.h

> index 55fedeb3..44b83c60 100644

> --- a/platform/linux-generic/include/odp_ring_internal.h

> +++ b/platform/linux-generic/include/odp_ring_internal.h

> @@ -57,7 +57,7 @@ static inline uint32_t ring_deq(ring_t *ring, uint32_t mask)

>  

>  	/* Move reader head. This thread owns data at the new head. */

>  	do {

> -		tail = odp_atomic_load_u32(&ring->w_tail);

> +		tail = odp_atomic_load_acq_u32(&ring->w_tail);

>  

>  		if (head == tail)

>  			return RING_EMPTY;

> @@ -90,7 +90,7 @@ static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,

>  

>  	/* Move reader head. This thread owns data at the new head. */

>  	do {

> -		tail = odp_atomic_load_u32(&ring->w_tail);

> +		tail = odp_atomic_load_acq_u32(&ring->w_tail);

>  

>  		/* Ring is empty */

>  		if (head == tail)

> 



Brian, can you please write few words why acq (i.e. barriers) is not
used in first load and in final store?

static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,
				      uint32_t data[], uint32_t num)
{
	uint32_t head, tail, new_head, i;

	head = odp_atomic_load_u32(&ring->r_head);

	/* Move reader head. This thread owns data at the new head. */
	do {
		tail = odp_atomic_load_acq_u32(&ring->w_tail);

		/* Ring is empty */
		if (head == tail)
			return 0;

		/* Try to take all available */
		if ((tail - head) < num)
			num = tail - head;

		new_head = head + num;

	} while (odp_unlikely(odp_atomic_cas_acq_u32(&ring->r_head, &head,
			      new_head) == 0));

	/* Read queue index */
	for (i = 0; i < num; i++)
		data[i] = ring->data[(head + 1 + i) & mask];

	/* Wait until other readers have updated the tail */
	while (odp_unlikely(odp_atomic_load_acq_u32(&ring->r_tail) != head))
		odp_cpu_pause();

	/* Now update the reader tail */
	odp_atomic_store_rel_u32(&ring->r_tail, new_head);

	return num;
}
Maxim Uvarov March 22, 2017, 3:37 p.m. UTC | #3
On 03/22/17 07:54, Brian Brooks wrote:
> Acquire ordering is needed to maintain proper release consistency with

> the ring enqueue operation. This issue presented itself as deadlock when

> running on an ARM-based chip.

> 

> Signed-off-by: Brian Brooks <brian.brooks@linaro.org>

> ---

>  platform/linux-generic/include/odp_ring_internal.h | 4 ++--

>  1 file changed, 2 insertions(+), 2 deletions(-)

> 

> diff --git a/platform/linux-generic/include/odp_ring_internal.h b/platform/linux-generic/include/odp_ring_internal.h

> index 55fedeb3..44b83c60 100644

> --- a/platform/linux-generic/include/odp_ring_internal.h

> +++ b/platform/linux-generic/include/odp_ring_internal.h

> @@ -57,7 +57,7 @@ static inline uint32_t ring_deq(ring_t *ring, uint32_t mask)

>  

>  	/* Move reader head. This thread owns data at the new head. */

>  	do {

> -		tail = odp_atomic_load_u32(&ring->w_tail);

> +		tail = odp_atomic_load_acq_u32(&ring->w_tail);

>  

>  		if (head == tail)

>  			return RING_EMPTY;

> @@ -90,7 +90,7 @@ static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,

>  

>  	/* Move reader head. This thread owns data at the new head. */

>  	do {

> -		tail = odp_atomic_load_u32(&ring->w_tail);

> +		tail = odp_atomic_load_acq_u32(&ring->w_tail);

>  

>  		/* Ring is empty */

>  		if (head == tail)

> 



Brian, can you please write few words why acq (i.e. barriers) is not
used in first load and in final store?

static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,
				      uint32_t data[], uint32_t num)
{
	uint32_t head, tail, new_head, i;

	head = odp_atomic_load_u32(&ring->r_head);

	/* Move reader head. This thread owns data at the new head. */
	do {
		tail = odp_atomic_load_acq_u32(&ring->w_tail);

		/* Ring is empty */
		if (head == tail)
			return 0;

		/* Try to take all available */
		if ((tail - head) < num)
			num = tail - head;

		new_head = head + num;

	} while (odp_unlikely(odp_atomic_cas_acq_u32(&ring->r_head, &head,
			      new_head) == 0));

	/* Read queue index */
	for (i = 0; i < num; i++)
		data[i] = ring->data[(head + 1 + i) & mask];

	/* Wait until other readers have updated the tail */
	while (odp_unlikely(odp_atomic_load_acq_u32(&ring->r_tail) != head))
		odp_cpu_pause();

	/* Now update the reader tail */
	odp_atomic_store_rel_u32(&ring->r_tail, new_head);

	return num;
}
Bill Fischofer March 22, 2017, 3:47 p.m. UTC | #4
On Wed, Mar 22, 2017 at 10:37 AM, Maxim Uvarov <maxim.uvarov@linaro.org> wrote:
> On 03/22/17 07:54, Brian Brooks wrote:

>> Acquire ordering is needed to maintain proper release consistency with

>> the ring enqueue operation. This issue presented itself as deadlock when

>> running on an ARM-based chip.

>>

>> Signed-off-by: Brian Brooks <brian.brooks@linaro.org>

>> ---

>>  platform/linux-generic/include/odp_ring_internal.h | 4 ++--

>>  1 file changed, 2 insertions(+), 2 deletions(-)

>>

>> diff --git a/platform/linux-generic/include/odp_ring_internal.h b/platform/linux-generic/include/odp_ring_internal.h

>> index 55fedeb3..44b83c60 100644

>> --- a/platform/linux-generic/include/odp_ring_internal.h

>> +++ b/platform/linux-generic/include/odp_ring_internal.h

>> @@ -57,7 +57,7 @@ static inline uint32_t ring_deq(ring_t *ring, uint32_t mask)

>>

>>       /* Move reader head. This thread owns data at the new head. */

>>       do {

>> -             tail = odp_atomic_load_u32(&ring->w_tail);

>> +             tail = odp_atomic_load_acq_u32(&ring->w_tail);

>>

>>               if (head == tail)

>>                       return RING_EMPTY;

>> @@ -90,7 +90,7 @@ static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,

>>

>>       /* Move reader head. This thread owns data at the new head. */

>>       do {

>> -             tail = odp_atomic_load_u32(&ring->w_tail);

>> +             tail = odp_atomic_load_acq_u32(&ring->w_tail);

>>

>>               /* Ring is empty */

>>               if (head == tail)

>>

>

>

> Brian, can you please write few words why acq (i.e. barriers) is not

> used in first load and in final store?


acq is not needed for the initial fetch of head here because head is
updated by cas_acq below, which handles the sync in a self-contained
manner. The update of tail uses store_rel  as a counterpart to the
previous load_acq. load_acq and store_rel operate as pairs.

>

> static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,

>                                       uint32_t data[], uint32_t num)

> {

>         uint32_t head, tail, new_head, i;

>

>         head = odp_atomic_load_u32(&ring->r_head);

>

>         /* Move reader head. This thread owns data at the new head. */

>         do {

>                 tail = odp_atomic_load_acq_u32(&ring->w_tail);

>

>                 /* Ring is empty */

>                 if (head == tail)

>                         return 0;

>

>                 /* Try to take all available */

>                 if ((tail - head) < num)

>                         num = tail - head;

>

>                 new_head = head + num;

>

>         } while (odp_unlikely(odp_atomic_cas_acq_u32(&ring->r_head, &head,

>                               new_head) == 0));

>

>         /* Read queue index */

>         for (i = 0; i < num; i++)

>                 data[i] = ring->data[(head + 1 + i) & mask];

>

>         /* Wait until other readers have updated the tail */

>         while (odp_unlikely(odp_atomic_load_acq_u32(&ring->r_tail) != head))

>                 odp_cpu_pause();

>

>         /* Now update the reader tail */

>         odp_atomic_store_rel_u32(&ring->r_tail, new_head);

>

>         return num;

> }
Maxim Uvarov March 22, 2017, 3:56 p.m. UTC | #5
On 03/22/17 18:47, Bill Fischofer wrote:
> On Wed, Mar 22, 2017 at 10:37 AM, Maxim Uvarov <maxim.uvarov@linaro.org> wrote:

>> On 03/22/17 07:54, Brian Brooks wrote:

>>> Acquire ordering is needed to maintain proper release consistency with

>>> the ring enqueue operation. This issue presented itself as deadlock when

>>> running on an ARM-based chip.

>>>

>>> Signed-off-by: Brian Brooks <brian.brooks@linaro.org>

>>> ---

>>>  platform/linux-generic/include/odp_ring_internal.h | 4 ++--

>>>  1 file changed, 2 insertions(+), 2 deletions(-)

>>>

>>> diff --git a/platform/linux-generic/include/odp_ring_internal.h b/platform/linux-generic/include/odp_ring_internal.h

>>> index 55fedeb3..44b83c60 100644

>>> --- a/platform/linux-generic/include/odp_ring_internal.h

>>> +++ b/platform/linux-generic/include/odp_ring_internal.h

>>> @@ -57,7 +57,7 @@ static inline uint32_t ring_deq(ring_t *ring, uint32_t mask)

>>>

>>>       /* Move reader head. This thread owns data at the new head. */

>>>       do {

>>> -             tail = odp_atomic_load_u32(&ring->w_tail);

>>> +             tail = odp_atomic_load_acq_u32(&ring->w_tail);

>>>

>>>               if (head == tail)

>>>                       return RING_EMPTY;

>>> @@ -90,7 +90,7 @@ static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,

>>>

>>>       /* Move reader head. This thread owns data at the new head. */

>>>       do {

>>> -             tail = odp_atomic_load_u32(&ring->w_tail);

>>> +             tail = odp_atomic_load_acq_u32(&ring->w_tail);

>>>

>>>               /* Ring is empty */

>>>               if (head == tail)

>>>

>>

>>

>> Brian, can you please write few words why acq (i.e. barriers) is not

>> used in first load and in final store?

> 

> acq is not needed for the initial fetch of head here because head is

> updated by cas_acq below, which handles the sync in a self-contained

> manner. The update of tail uses store_rel  as a counterpart to the

> previous load_acq. load_acq and store_rel operate as pairs.

> 


Thank you, Bill.

Merged with short message:
"linux-gen: ring: fix memory ordering in ring dequeue"

Maxim.

>>

>> static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,

>>                                       uint32_t data[], uint32_t num)

>> {

>>         uint32_t head, tail, new_head, i;

>>

>>         head = odp_atomic_load_u32(&ring->r_head);

>>

>>         /* Move reader head. This thread owns data at the new head. */

>>         do {

>>                 tail = odp_atomic_load_acq_u32(&ring->w_tail);

>>

>>                 /* Ring is empty */

>>                 if (head == tail)

>>                         return 0;

>>

>>                 /* Try to take all available */

>>                 if ((tail - head) < num)

>>                         num = tail - head;

>>

>>                 new_head = head + num;

>>

>>         } while (odp_unlikely(odp_atomic_cas_acq_u32(&ring->r_head, &head,

>>                               new_head) == 0));

>>

>>         /* Read queue index */

>>         for (i = 0; i < num; i++)

>>                 data[i] = ring->data[(head + 1 + i) & mask];

>>

>>         /* Wait until other readers have updated the tail */

>>         while (odp_unlikely(odp_atomic_load_acq_u32(&ring->r_tail) != head))

>>                 odp_cpu_pause();

>>

>>         /* Now update the reader tail */

>>         odp_atomic_store_rel_u32(&ring->r_tail, new_head);

>>

>>         return num;

>> }
Brian Brooks March 22, 2017, 4:27 p.m. UTC | #6
On Wed, Mar 22, 2017 at 10:37 AM, Maxim Uvarov <maxim.uvarov@linaro.org> wrote:
> On 03/22/17 07:54, Brian Brooks wrote:

>> Acquire ordering is needed to maintain proper release consistency with

>> the ring enqueue operation. This issue presented itself as deadlock when

>> running on an ARM-based chip.

>>

>> Signed-off-by: Brian Brooks <brian.brooks@linaro.org>

>> ---

>>  platform/linux-generic/include/odp_ring_internal.h | 4 ++--

>>  1 file changed, 2 insertions(+), 2 deletions(-)

>>

>> diff --git a/platform/linux-generic/include/odp_ring_internal.h b/platform/linux-generic/include/odp_ring_internal.h

>> index 55fedeb3..44b83c60 100644

>> --- a/platform/linux-generic/include/odp_ring_internal.h

>> +++ b/platform/linux-generic/include/odp_ring_internal.h

>> @@ -57,7 +57,7 @@ static inline uint32_t ring_deq(ring_t *ring, uint32_t mask)

>>

>>       /* Move reader head. This thread owns data at the new head. */

>>       do {

>> -             tail = odp_atomic_load_u32(&ring->w_tail);

>> +             tail = odp_atomic_load_acq_u32(&ring->w_tail);

>>

>>               if (head == tail)

>>                       return RING_EMPTY;

>> @@ -90,7 +90,7 @@ static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,

>>

>>       /* Move reader head. This thread owns data at the new head. */

>>       do {

>> -             tail = odp_atomic_load_u32(&ring->w_tail);

>> +             tail = odp_atomic_load_acq_u32(&ring->w_tail);

>>

>>               /* Ring is empty */

>>               if (head == tail)

>>

>

>

> Brian, can you please write few words why acq (i.e. barriers) is not

> used in first load and in final store?


Memory ordering on these compiler built-ins can lower to either a
sequence of instructions with an explicit memory barrier instruction
(e.g. a dmb variant on ARMv8) or a single instruction with implied
memory ordering (e.g. ldar (a load instruction with acquire memory
ordering) on ARMv8).

Acquire ordering could be used on the initial load of r_head instead.
An explicit acquire fence could be placed before the do-while loop
too.
It is just a bit more symmetrical to have acquire ordering on the
load of w_tail.

What is needed is an ordering to prevent the load of w_tail (in the
do-while loop) to be reordered before the store of w_tail in the enq
operation... on the same core.

Think of the situation where the thread running on one core does
an enq() followed by a deq().  The final store release in enq() prevents
memory accesses from being reordered to after the store.  But, memory
accesses can still be reordered to before that store.  We need acquire
ordering in deq() to prevent subsequent memory accesses from being
reordered into the enq() operation.

> static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,

>                                       uint32_t data[], uint32_t num)

> {

>         uint32_t head, tail, new_head, i;

>

>         head = odp_atomic_load_u32(&ring->r_head);

>

>         /* Move reader head. This thread owns data at the new head. */

>         do {

>                 tail = odp_atomic_load_acq_u32(&ring->w_tail);

>

>                 /* Ring is empty */

>                 if (head == tail)

>                         return 0;

>

>                 /* Try to take all available */

>                 if ((tail - head) < num)

>                         num = tail - head;

>

>                 new_head = head + num;

>

>         } while (odp_unlikely(odp_atomic_cas_acq_u32(&ring->r_head, &head,

>                               new_head) == 0));

>

>         /* Read queue index */

>         for (i = 0; i < num; i++)

>                 data[i] = ring->data[(head + 1 + i) & mask];

>

>         /* Wait until other readers have updated the tail */

>         while (odp_unlikely(odp_atomic_load_acq_u32(&ring->r_tail) != head))

>                 odp_cpu_pause();

>

>         /* Now update the reader tail */

>         odp_atomic_store_rel_u32(&ring->r_tail, new_head);

>

>         return num;

> }
Maxim Uvarov March 23, 2017, 9:28 a.m. UTC | #7
ok, thanks.

On 22 March 2017 at 19:27, Brian Brooks <brian.brooks@linaro.org> wrote:

> On Wed, Mar 22, 2017 at 10:37 AM, Maxim Uvarov <maxim.uvarov@linaro.org>

> wrote:

> > On 03/22/17 07:54, Brian Brooks wrote:

> >> Acquire ordering is needed to maintain proper release consistency with

> >> the ring enqueue operation. This issue presented itself as deadlock when

> >> running on an ARM-based chip.

> >>

> >> Signed-off-by: Brian Brooks <brian.brooks@linaro.org>

> >> ---

> >>  platform/linux-generic/include/odp_ring_internal.h | 4 ++--

> >>  1 file changed, 2 insertions(+), 2 deletions(-)

> >>

> >> diff --git a/platform/linux-generic/include/odp_ring_internal.h

> b/platform/linux-generic/include/odp_ring_internal.h

> >> index 55fedeb3..44b83c60 100644

> >> --- a/platform/linux-generic/include/odp_ring_internal.h

> >> +++ b/platform/linux-generic/include/odp_ring_internal.h

> >> @@ -57,7 +57,7 @@ static inline uint32_t ring_deq(ring_t *ring,

> uint32_t mask)

> >>

> >>       /* Move reader head. This thread owns data at the new head. */

> >>       do {

> >> -             tail = odp_atomic_load_u32(&ring->w_tail);

> >> +             tail = odp_atomic_load_acq_u32(&ring->w_tail);

> >>

> >>               if (head == tail)

> >>                       return RING_EMPTY;

> >> @@ -90,7 +90,7 @@ static inline uint32_t ring_deq_multi(ring_t *ring,

> uint32_t mask,

> >>

> >>       /* Move reader head. This thread owns data at the new head. */

> >>       do {

> >> -             tail = odp_atomic_load_u32(&ring->w_tail);

> >> +             tail = odp_atomic_load_acq_u32(&ring->w_tail);

> >>

> >>               /* Ring is empty */

> >>               if (head == tail)

> >>

> >

> >

> > Brian, can you please write few words why acq (i.e. barriers) is not

> > used in first load and in final store?

>

> Memory ordering on these compiler built-ins can lower to either a

> sequence of instructions with an explicit memory barrier instruction

> (e.g. a dmb variant on ARMv8) or a single instruction with implied

> memory ordering (e.g. ldar (a load instruction with acquire memory

> ordering) on ARMv8).

>

> Acquire ordering could be used on the initial load of r_head instead.

> An explicit acquire fence could be placed before the do-while loop

> too.

> It is just a bit more symmetrical to have acquire ordering on the

> load of w_tail.

>

> What is needed is an ordering to prevent the load of w_tail (in the

> do-while loop) to be reordered before the store of w_tail in the enq

> operation... on the same core.

>

> Think of the situation where the thread running on one core does

> an enq() followed by a deq().  The final store release in enq() prevents

> memory accesses from being reordered to after the store.  But, memory

> accesses can still be reordered to before that store.  We need acquire

> ordering in deq() to prevent subsequent memory accesses from being

> reordered into the enq() operation.

>

> > static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,

> >                                       uint32_t data[], uint32_t num)

> > {

> >         uint32_t head, tail, new_head, i;

> >

> >         head = odp_atomic_load_u32(&ring->r_head);

> >

> >         /* Move reader head. This thread owns data at the new head. */

> >         do {

> >                 tail = odp_atomic_load_acq_u32(&ring->w_tail);

> >

> >                 /* Ring is empty */

> >                 if (head == tail)

> >                         return 0;

> >

> >                 /* Try to take all available */

> >                 if ((tail - head) < num)

> >                         num = tail - head;

> >

> >                 new_head = head + num;

> >

> >         } while (odp_unlikely(odp_atomic_cas_acq_u32(&ring->r_head,

> &head,

> >                               new_head) == 0));

> >

> >         /* Read queue index */

> >         for (i = 0; i < num; i++)

> >                 data[i] = ring->data[(head + 1 + i) & mask];

> >

> >         /* Wait until other readers have updated the tail */

> >         while (odp_unlikely(odp_atomic_load_acq_u32(&ring->r_tail) !=

> head))

> >                 odp_cpu_pause();

> >

> >         /* Now update the reader tail */

> >         odp_atomic_store_rel_u32(&ring->r_tail, new_head);

> >

> >         return num;

> > }

>
diff mbox

Patch

diff --git a/platform/linux-generic/include/odp_ring_internal.h b/platform/linux-generic/include/odp_ring_internal.h
index 55fedeb3..44b83c60 100644
--- a/platform/linux-generic/include/odp_ring_internal.h
+++ b/platform/linux-generic/include/odp_ring_internal.h
@@ -57,7 +57,7 @@  static inline uint32_t ring_deq(ring_t *ring, uint32_t mask)
 
 	/* Move reader head. This thread owns data at the new head. */
 	do {
-		tail = odp_atomic_load_u32(&ring->w_tail);
+		tail = odp_atomic_load_acq_u32(&ring->w_tail);
 
 		if (head == tail)
 			return RING_EMPTY;
@@ -90,7 +90,7 @@  static inline uint32_t ring_deq_multi(ring_t *ring, uint32_t mask,
 
 	/* Move reader head. This thread owns data at the new head. */
 	do {
-		tail = odp_atomic_load_u32(&ring->w_tail);
+		tail = odp_atomic_load_acq_u32(&ring->w_tail);
 
 		/* Ring is empty */
 		if (head == tail)