diff mbox

linux-generic: timer: generalize arch-specific code path selection

Message ID 1463780713-29267-1-git-send-email-ola.liljedahl@linaro.org
State Accepted
Commit 0ef0069863276d186bd66885bf6fbb253d34c6a5
Headers show

Commit Message

Ola Liljedahl May 20, 2016, 9:45 p.m. UTC
Make architecture-specific code path selection generic, controlled
directly by compiler feature predefines.
Replace macro PREFETCH with intrinsic __builtin_prefetch.
Fixes https://bugs.linaro.org/show_bug.cgi?id=2235

Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
---
(This document/code contribution attached is provided under the terms of
agreement LES-LTM-21309)

 platform/linux-generic/odp_timer.c | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

Comments

Bill Fischofer May 22, 2016, 4:02 p.m. UTC | #1
On Fri, May 20, 2016 at 4:45 PM, Ola Liljedahl <ola.liljedahl@linaro.org>
wrote:

> Make architecture-specific code path selection generic, controlled
> directly by compiler feature predefines.
> Replace macro PREFETCH with intrinsic __builtin_prefetch.
> Fixes https://bugs.linaro.org/show_bug.cgi?id=2235
>
> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>

Reviewed-and-tested-by: Bill Fischofer <bill.fischofer@linaro.org>


> ---
> (This document/code contribution attached is provided under the terms of
> agreement LES-LTM-21309)
>
>  platform/linux-generic/odp_timer.c | 28 +++++++++-------------------
>  1 file changed, 9 insertions(+), 19 deletions(-)
>
> diff --git a/platform/linux-generic/odp_timer.c
> b/platform/linux-generic/odp_timer.c
> index a6d3332..4e56fb0 100644
> --- a/platform/linux-generic/odp_timer.c
> +++ b/platform/linux-generic/odp_timer.c
> @@ -58,12 +58,6 @@
>   * for checking the freshness of received timeouts */
>  #define TMO_INACTIVE ((uint64_t)0x8000000000000000)
>
> -#ifdef __ARM_ARCH
> -#define PREFETCH(ptr) __builtin_prefetch((ptr), 0, 0)
> -#else
> -#define PREFETCH(ptr) (void)(ptr)
> -#endif
> -
>
>  /******************************************************************************
>   * Mutual exclusion in the absence of CAS16
>
> *****************************************************************************/
> @@ -210,7 +204,7 @@ static inline uint32_t handle_to_idx(odp_timer_t hdl,
>                 struct odp_timer_pool_s *tp)
>  {
>         uint32_t idx = _odp_typeval(hdl) & ((1U << INDEX_BITS) - 1U);
> -       PREFETCH(&tp->tick_buf[idx]);
> +       __builtin_prefetch(&tp->tick_buf[idx], 0, 0);
>         if (odp_likely(idx < odp_atomic_load_u32(&tp->high_wm)))
>                 return idx;
>         ODP_ABORT("Invalid timer handle %#x\n", hdl);
> @@ -395,7 +389,7 @@ static bool timer_reset(uint32_t idx,
>         tick_buf_t *tb = &tp->tick_buf[idx];
>
>         if (tmo_buf == NULL || *tmo_buf == ODP_BUFFER_INVALID) {
> -#ifdef ODP_ATOMIC_U128
> +#ifdef ODP_ATOMIC_U128 /* Target supports 128-bit atomic operations */
>                 tick_buf_t new, old;
>                 do {
>                         /* Relaxed and non-atomic read of current values */
> @@ -422,9 +416,10 @@ static bool timer_reset(uint32_t idx,
>                                         (_uint128_t *)&new,
>                                         _ODP_MEMMODEL_RLS,
>                                         _ODP_MEMMODEL_RLX));
> -#else
> -#ifdef __ARM_ARCH
> -               /* Since barriers are not good for C-A15, we take an
> +#elif __GCC_ATOMIC_LLONG_LOCK_FREE >= 2 && \
> +       defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
> +       /* Target supports lock-free 64-bit CAS (and probably exchange) */
> +               /* Since locks/barriers are not good for C-A15, we take an
>                  * alternative approach using relaxed memory model */
>                 uint64_t old;
>                 /* Swap in new expiration tick, get back old tick which
> @@ -450,7 +445,7 @@ static bool timer_reset(uint32_t idx,
>                                         _ODP_MEMMODEL_RLX);
>                         success = false;
>                 }
> -#else
> +#else /* Target supports neither 128-bit nor 64-bit CAS => use lock */
>                 /* Take a related lock */
>                 while (_odp_atomic_flag_tas(IDX2LOCK(idx)))
>                         /* While lock is taken, spin using relaxed loads */
> @@ -470,7 +465,6 @@ static bool timer_reset(uint32_t idx,
>                 /* Release the lock */
>                 _odp_atomic_flag_clear(IDX2LOCK(idx));
>  #endif
> -#endif
>         } else {
>                 /* We have a new timeout buffer which replaces any old one
> */
>                 /* Fill in some (constant) header fields for timeout
> events */
> @@ -655,13 +649,11 @@ static unsigned
> odp_timer_pool_expire(odp_timer_pool_t tpid, uint64_t tick)
>
>         ODP_ASSERT(high_wm <= tpid->param.num_timers);
>         for (i = 0; i < high_wm;) {
> -#ifdef __ARM_ARCH
>                 /* As a rare occurrence, we can outsmart the HW prefetcher
>                  * and the compiler (GCC -fprefetch-loop-arrays) with some
>                  * tuned manual prefetching (32x16=512B ahead), seems to
>                  * give 30% better performance on ARM C-A15 */
> -               PREFETCH(&array[i + 32]);
> -#endif
> +               __builtin_prefetch(&array[i + 32], 0, 0);
>                 /* Non-atomic read for speed */
>                 uint64_t exp_tck = array[i++].exp_tck.v;
>                 if (odp_unlikely(exp_tck <= tick)) {
> @@ -691,13 +683,11 @@ static void timer_notify(odp_timer_pool *tp)
>                 }
>         }
>
> -#ifdef __ARM_ARCH
>         odp_timer *array = &tp->timers[0];
>         uint32_t i;
>         /* Prefetch initial cache lines (match 32 above) */
>         for (i = 0; i < 32; i += ODP_CACHE_LINE_SIZE / sizeof(array[0]))
> -               PREFETCH(&array[i]);
> -#endif
> +               __builtin_prefetch(&array[i], 0, 0);
>         prev_tick = odp_atomic_fetch_inc_u64(&tp->cur_tick);
>
>         /* Scan timer array, looking for timers to expire */
> --
> 2.5.0
>
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> https://lists.linaro.org/mailman/listinfo/lng-odp
>
Maxim Uvarov May 23, 2016, 3:30 p.m. UTC | #2
Merged,
Maxim.

On 05/22/16 19:02, Bill Fischofer wrote:
> On Fri, May 20, 2016 at 4:45 PM, Ola Liljedahl <ola.liljedahl@linaro.org>
> wrote:
>
>> Make architecture-specific code path selection generic, controlled
>> directly by compiler feature predefines.
>> Replace macro PREFETCH with intrinsic __builtin_prefetch.
>> Fixes https://bugs.linaro.org/show_bug.cgi?id=2235
>>
>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>>
> Reviewed-and-tested-by: Bill Fischofer <bill.fischofer@linaro.org>
>
>
>> ---
>> (This document/code contribution attached is provided under the terms of
>> agreement LES-LTM-21309)
>>
>>   platform/linux-generic/odp_timer.c | 28 +++++++++-------------------
>>   1 file changed, 9 insertions(+), 19 deletions(-)
>>
>> diff --git a/platform/linux-generic/odp_timer.c
>> b/platform/linux-generic/odp_timer.c
>> index a6d3332..4e56fb0 100644
>> --- a/platform/linux-generic/odp_timer.c
>> +++ b/platform/linux-generic/odp_timer.c
>> @@ -58,12 +58,6 @@
>>    * for checking the freshness of received timeouts */
>>   #define TMO_INACTIVE ((uint64_t)0x8000000000000000)
>>
>> -#ifdef __ARM_ARCH
>> -#define PREFETCH(ptr) __builtin_prefetch((ptr), 0, 0)
>> -#else
>> -#define PREFETCH(ptr) (void)(ptr)
>> -#endif
>> -
>>
>>   /******************************************************************************
>>    * Mutual exclusion in the absence of CAS16
>>
>> *****************************************************************************/
>> @@ -210,7 +204,7 @@ static inline uint32_t handle_to_idx(odp_timer_t hdl,
>>                  struct odp_timer_pool_s *tp)
>>   {
>>          uint32_t idx = _odp_typeval(hdl) & ((1U << INDEX_BITS) - 1U);
>> -       PREFETCH(&tp->tick_buf[idx]);
>> +       __builtin_prefetch(&tp->tick_buf[idx], 0, 0);
>>          if (odp_likely(idx < odp_atomic_load_u32(&tp->high_wm)))
>>                  return idx;
>>          ODP_ABORT("Invalid timer handle %#x\n", hdl);
>> @@ -395,7 +389,7 @@ static bool timer_reset(uint32_t idx,
>>          tick_buf_t *tb = &tp->tick_buf[idx];
>>
>>          if (tmo_buf == NULL || *tmo_buf == ODP_BUFFER_INVALID) {
>> -#ifdef ODP_ATOMIC_U128
>> +#ifdef ODP_ATOMIC_U128 /* Target supports 128-bit atomic operations */
>>                  tick_buf_t new, old;
>>                  do {
>>                          /* Relaxed and non-atomic read of current values */
>> @@ -422,9 +416,10 @@ static bool timer_reset(uint32_t idx,
>>                                          (_uint128_t *)&new,
>>                                          _ODP_MEMMODEL_RLS,
>>                                          _ODP_MEMMODEL_RLX));
>> -#else
>> -#ifdef __ARM_ARCH
>> -               /* Since barriers are not good for C-A15, we take an
>> +#elif __GCC_ATOMIC_LLONG_LOCK_FREE >= 2 && \
>> +       defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
>> +       /* Target supports lock-free 64-bit CAS (and probably exchange) */
>> +               /* Since locks/barriers are not good for C-A15, we take an
>>                   * alternative approach using relaxed memory model */
>>                  uint64_t old;
>>                  /* Swap in new expiration tick, get back old tick which
>> @@ -450,7 +445,7 @@ static bool timer_reset(uint32_t idx,
>>                                          _ODP_MEMMODEL_RLX);
>>                          success = false;
>>                  }
>> -#else
>> +#else /* Target supports neither 128-bit nor 64-bit CAS => use lock */
>>                  /* Take a related lock */
>>                  while (_odp_atomic_flag_tas(IDX2LOCK(idx)))
>>                          /* While lock is taken, spin using relaxed loads */
>> @@ -470,7 +465,6 @@ static bool timer_reset(uint32_t idx,
>>                  /* Release the lock */
>>                  _odp_atomic_flag_clear(IDX2LOCK(idx));
>>   #endif
>> -#endif
>>          } else {
>>                  /* We have a new timeout buffer which replaces any old one
>> */
>>                  /* Fill in some (constant) header fields for timeout
>> events */
>> @@ -655,13 +649,11 @@ static unsigned
>> odp_timer_pool_expire(odp_timer_pool_t tpid, uint64_t tick)
>>
>>          ODP_ASSERT(high_wm <= tpid->param.num_timers);
>>          for (i = 0; i < high_wm;) {
>> -#ifdef __ARM_ARCH
>>                  /* As a rare occurrence, we can outsmart the HW prefetcher
>>                   * and the compiler (GCC -fprefetch-loop-arrays) with some
>>                   * tuned manual prefetching (32x16=512B ahead), seems to
>>                   * give 30% better performance on ARM C-A15 */
>> -               PREFETCH(&array[i + 32]);
>> -#endif
>> +               __builtin_prefetch(&array[i + 32], 0, 0);
>>                  /* Non-atomic read for speed */
>>                  uint64_t exp_tck = array[i++].exp_tck.v;
>>                  if (odp_unlikely(exp_tck <= tick)) {
>> @@ -691,13 +683,11 @@ static void timer_notify(odp_timer_pool *tp)
>>                  }
>>          }
>>
>> -#ifdef __ARM_ARCH
>>          odp_timer *array = &tp->timers[0];
>>          uint32_t i;
>>          /* Prefetch initial cache lines (match 32 above) */
>>          for (i = 0; i < 32; i += ODP_CACHE_LINE_SIZE / sizeof(array[0]))
>> -               PREFETCH(&array[i]);
>> -#endif
>> +               __builtin_prefetch(&array[i], 0, 0);
>>          prev_tick = odp_atomic_fetch_inc_u64(&tp->cur_tick);
>>
>>          /* Scan timer array, looking for timers to expire */
>> --
>> 2.5.0
>>
>> _______________________________________________
>> lng-odp mailing list
>> lng-odp@lists.linaro.org
>> https://lists.linaro.org/mailman/listinfo/lng-odp
>>
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> https://lists.linaro.org/mailman/listinfo/lng-odp
diff mbox

Patch

diff --git a/platform/linux-generic/odp_timer.c b/platform/linux-generic/odp_timer.c
index a6d3332..4e56fb0 100644
--- a/platform/linux-generic/odp_timer.c
+++ b/platform/linux-generic/odp_timer.c
@@ -58,12 +58,6 @@ 
  * for checking the freshness of received timeouts */
 #define TMO_INACTIVE ((uint64_t)0x8000000000000000)
 
-#ifdef __ARM_ARCH
-#define PREFETCH(ptr) __builtin_prefetch((ptr), 0, 0)
-#else
-#define PREFETCH(ptr) (void)(ptr)
-#endif
-
 /******************************************************************************
  * Mutual exclusion in the absence of CAS16
  *****************************************************************************/
@@ -210,7 +204,7 @@  static inline uint32_t handle_to_idx(odp_timer_t hdl,
 		struct odp_timer_pool_s *tp)
 {
 	uint32_t idx = _odp_typeval(hdl) & ((1U << INDEX_BITS) - 1U);
-	PREFETCH(&tp->tick_buf[idx]);
+	__builtin_prefetch(&tp->tick_buf[idx], 0, 0);
 	if (odp_likely(idx < odp_atomic_load_u32(&tp->high_wm)))
 		return idx;
 	ODP_ABORT("Invalid timer handle %#x\n", hdl);
@@ -395,7 +389,7 @@  static bool timer_reset(uint32_t idx,
 	tick_buf_t *tb = &tp->tick_buf[idx];
 
 	if (tmo_buf == NULL || *tmo_buf == ODP_BUFFER_INVALID) {
-#ifdef ODP_ATOMIC_U128
+#ifdef ODP_ATOMIC_U128 /* Target supports 128-bit atomic operations */
 		tick_buf_t new, old;
 		do {
 			/* Relaxed and non-atomic read of current values */
@@ -422,9 +416,10 @@  static bool timer_reset(uint32_t idx,
 					(_uint128_t *)&new,
 					_ODP_MEMMODEL_RLS,
 					_ODP_MEMMODEL_RLX));
-#else
-#ifdef __ARM_ARCH
-		/* Since barriers are not good for C-A15, we take an
+#elif __GCC_ATOMIC_LLONG_LOCK_FREE >= 2 && \
+	defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+	/* Target supports lock-free 64-bit CAS (and probably exchange) */
+		/* Since locks/barriers are not good for C-A15, we take an
 		 * alternative approach using relaxed memory model */
 		uint64_t old;
 		/* Swap in new expiration tick, get back old tick which
@@ -450,7 +445,7 @@  static bool timer_reset(uint32_t idx,
 					_ODP_MEMMODEL_RLX);
 			success = false;
 		}
-#else
+#else /* Target supports neither 128-bit nor 64-bit CAS => use lock */
 		/* Take a related lock */
 		while (_odp_atomic_flag_tas(IDX2LOCK(idx)))
 			/* While lock is taken, spin using relaxed loads */
@@ -470,7 +465,6 @@  static bool timer_reset(uint32_t idx,
 		/* Release the lock */
 		_odp_atomic_flag_clear(IDX2LOCK(idx));
 #endif
-#endif
 	} else {
 		/* We have a new timeout buffer which replaces any old one */
 		/* Fill in some (constant) header fields for timeout events */
@@ -655,13 +649,11 @@  static unsigned odp_timer_pool_expire(odp_timer_pool_t tpid, uint64_t tick)
 
 	ODP_ASSERT(high_wm <= tpid->param.num_timers);
 	for (i = 0; i < high_wm;) {
-#ifdef __ARM_ARCH
 		/* As a rare occurrence, we can outsmart the HW prefetcher
 		 * and the compiler (GCC -fprefetch-loop-arrays) with some
 		 * tuned manual prefetching (32x16=512B ahead), seems to
 		 * give 30% better performance on ARM C-A15 */
-		PREFETCH(&array[i + 32]);
-#endif
+		__builtin_prefetch(&array[i + 32], 0, 0);
 		/* Non-atomic read for speed */
 		uint64_t exp_tck = array[i++].exp_tck.v;
 		if (odp_unlikely(exp_tck <= tick)) {
@@ -691,13 +683,11 @@  static void timer_notify(odp_timer_pool *tp)
 		}
 	}
 
-#ifdef __ARM_ARCH
 	odp_timer *array = &tp->timers[0];
 	uint32_t i;
 	/* Prefetch initial cache lines (match 32 above) */
 	for (i = 0; i < 32; i += ODP_CACHE_LINE_SIZE / sizeof(array[0]))
-		PREFETCH(&array[i]);
-#endif
+		__builtin_prefetch(&array[i], 0, 0);
 	prev_tick = odp_atomic_fetch_inc_u64(&tp->cur_tick);
 
 	/* Scan timer array, looking for timers to expire */