eal: generic counter based loop for CPU freq calculation

Message ID 20200608213417.9764-1-honnappa.nagarahalli@arm.com
State New
Headers show
Series
  • eal: generic counter based loop for CPU freq calculation
Related show

Commit Message

Honnappa Nagarahalli June 8, 2020, 9:34 p.m.
get_tsc_freq uses the 'nanosleep' system call to calculate the CPU
frequency. However, 'nanosleep' results in the process getting
un-scheduled. The kernel saves and restores the PMU state. This
ensures that the PMU cycles are not counted towards a sleeping
process. When RTE_ARM_EAL_RDTSC_USE_PMU is defined, this results
in an incorrect CPU frequency calculation. This logic is replaced
with a generic counter based loop.

Bugzilla ID: 450
Fixes: af75078fece3 ("first public release")
Cc: stable@dpdk.org

Signed-off-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>

Reviewed-by: Dharmik Thakkar <dharmik.thakkar@arm.com>

Reviewed-by: Phil Yang <phil.yang@arm.com>


---
 lib/librte_eal/arm/include/rte_cycles_64.h | 45 +++++++++++++++++++---
 lib/librte_eal/arm/rte_cycles.c            | 24 ++++++++++--
 2 files changed, 61 insertions(+), 8 deletions(-)

-- 
2.17.1

Comments

Jerin Jacob June 24, 2020, 12:50 p.m. | #1
On Tue, Jun 9, 2020 at 3:04 AM Honnappa Nagarahalli
<honnappa.nagarahalli@arm.com> wrote:
>

> get_tsc_freq uses 'nanosleep' system call to calculate the CPU

> frequency. However, 'nanosleep' results in the process getting

> un-scheduled. The kernel saves and restores the PMU state. This

> ensures that the PMU cycles are not counted towards a sleeping

> process. When RTE_ARM_EAL_RDTSC_USE_PMU is defined, this results

> in incorrect CPU frequency calculation. This logic is replaced

> with generic counter based loop.

>

> Bugzilla ID: 450

> Fixes: af75078fece3 ("first public release")


The Fix looks good to me.

The Fixes tag is not correct. It should reference the patch where
RTE_ARM_EAL_RDTSC_USE_PMU got introduced.


> Cc: stable@dpdk.org

>

> Signed-off-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>

> Reviewed-by: Dharmik Thakkar <dharmik.thakkar@arm.com>

> Reviewed-by: Phil Yang <phil.yang@arm.com>

>

> ---

>  lib/librte_eal/arm/include/rte_cycles_64.h | 45 +++++++++++++++++++---

>  lib/librte_eal/arm/rte_cycles.c            | 24 ++++++++++--

>  2 files changed, 61 insertions(+), 8 deletions(-)

>

> diff --git a/lib/librte_eal/arm/include/rte_cycles_64.h b/lib/librte_eal/arm/include/rte_cycles_64.h

> index da557b6a1..6fc352036 100644

> --- a/lib/librte_eal/arm/include/rte_cycles_64.h

> +++ b/lib/librte_eal/arm/include/rte_cycles_64.h

> @@ -11,6 +11,36 @@ extern "C" {

>

>  #include "generic/rte_cycles.h"

>

> +/** Read generic counter frequency */

> +static inline uint64_t


I prefer to have __rte_always_inline

> +__rte_rd_generic_cntr_freq(void)


I think the name "generic counter" is confusing. Since the symbol is
exposed by being placed in a header file, it is better to rename it
to __rte_arm64_cntfrq()

> +{

> +       uint64_t freq;

> +

> +       asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));

> +       return freq;

> +}

> +

> +/** Read generic counter */

> +static inline uint64_t


Likewise, __rte_arm64_cntvct()


> +__rte_rd_generic_cntr(void)

> +{

> +       uint64_t tsc;

> +

> +       asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));

> +       return tsc;

> +}

> +

> +static inline uint64_t

> +__rte_rd_generic_cntr_precise(void)


__rte_arm64_cntvct_precise()

> +{

> +       uint64_t tsc;

> +

> +       asm volatile("isb" : : : "memory");

> +       asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));

> +       return tsc;

> +}

> +

>  /**

>   * Read the time base register.

>   *

> @@ -25,10 +55,7 @@ extern "C" {

>  static inline uint64_t

>  rte_rdtsc(void)

>  {

> -       uint64_t tsc;

> -

> -       asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));

> -       return tsc;

> +       return __rte_rd_generic_cntr();

>  }

>  #else

>  /**

> @@ -49,14 +76,22 @@ rte_rdtsc(void)

>   * asm volatile("msr pmcr_el0, %0" : : "r" (val));

>   *

>   */

> +

> +/** Read PMU cycle counter */

>  static inline uint64_t

> -rte_rdtsc(void)

> +__rte_rd_pmu_cycle_cntr(void)

>  {

>         uint64_t tsc;

>

>         asm volatile("mrs %0, pmccntr_el0" : "=r"(tsc));

>         return tsc;

>  }

> +

> +static inline uint64_t

> +rte_rdtsc(void)

> +{

> +       return __rte_rd_pmu_cycle_cntr();

> +}

>  #endif

>

>  static inline uint64_t

> diff --git a/lib/librte_eal/arm/rte_cycles.c b/lib/librte_eal/arm/rte_cycles.c

> index 3500d523e..92c87a8a4 100644

> --- a/lib/librte_eal/arm/rte_cycles.c

> +++ b/lib/librte_eal/arm/rte_cycles.c

> @@ -3,14 +3,32 @@

>   */

>

>  #include "eal_private.h"

> +#include "rte_cycles.h"

>

>  uint64_t

>  get_tsc_freq_arch(void)

>  {

>  #if defined RTE_ARCH_ARM64 && !defined RTE_ARM_EAL_RDTSC_USE_PMU

> -       uint64_t freq;

> -       asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));

> -       return freq;

> +       return __rte_rd_generic_cntr_freq();

> +#elif defined RTE_ARCH_ARM64 && defined RTE_ARM_EAL_RDTSC_USE_PMU

> +       /* Use the generic counter ticks to calculate the PMU

> +        * cycle frequency.

> +        */

> +       uint64_t gcnt_ticks;

> +       uint64_t start_ticks, cur_ticks;

> +       uint64_t start_pmu_cycles, end_pmu_cycles;

> +

> +       /* Number of ticks for 1/10 second */

> +       gcnt_ticks = __rte_rd_generic_cntr_freq() / 10;

> +

> +       start_ticks = __rte_rd_generic_cntr_precise();

> +       start_pmu_cycles = rte_rdtsc_precise();

> +       do {

> +               cur_ticks = __rte_rd_generic_cntr();

> +       } while ((cur_ticks - start_ticks) < gcnt_ticks);

> +       end_pmu_cycles = rte_rdtsc_precise();

> +

> +       return ((end_pmu_cycles - start_pmu_cycles) * 10);


Good thought. On the plus side, it will reduce the boot time by .9 sec.

>  #else

>         return 0;


With above changes:

Acked-by: Jerin Jacob <jerinj@marvell.com>




>  #endif

> --

> 2.17.1

>
Pavan Nikhilesh Bhagavatula June 24, 2020, 3:09 p.m. | #2
>Subject: [dpdk-dev] [PATCH] eal: generic counter based loop for CPU

>freq calculation

>

>get_tsc_freq uses 'nanosleep' system call to calculate the CPU

>frequency. However, 'nanosleep' results in the process getting

>un-scheduled. The kernel saves and restores the PMU state. This

>ensures that the PMU cycles are not counted towards a sleeping

>process. When RTE_ARM_EAL_RDTSC_USE_PMU is defined, this results

>in incorrect CPU frequency calculation. This logic is replaced

>with generic counter based loop.

>

>Bugzilla ID: 450

>Fixes: af75078fece3 ("first public release")

>Cc: stable@dpdk.org

>

>Signed-off-by: Honnappa Nagarahalli

><honnappa.nagarahalli@arm.com>

>Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>

>Reviewed-by: Dharmik Thakkar <dharmik.thakkar@arm.com>

>Reviewed-by: Phil Yang <phil.yang@arm.com>

>

>---

> lib/librte_eal/arm/include/rte_cycles_64.h | 45 +++++++++++++++++++---

> lib/librte_eal/arm/rte_cycles.c            | 24 ++++++++++--

> 2 files changed, 61 insertions(+), 8 deletions(-)

>


<Snip>

>

> uint64_t

> get_tsc_freq_arch(void)

> {

> #if defined RTE_ARCH_ARM64 && !defined RTE_ARM_EAL_RDTSC_USE_PMU

>-	uint64_t freq;

>-	asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));

>-	return freq;

>+	return __rte_rd_generic_cntr_freq();

>+#elif defined RTE_ARCH_ARM64 && defined RTE_ARM_EAL_RDTSC_USE_PMU

>+	/* Use the generic counter ticks to calculate the PMU

>+	 * cycle frequency.

>+	 */

>+	uint64_t gcnt_ticks;

>+	uint64_t start_ticks, cur_ticks;

>+	uint64_t start_pmu_cycles, end_pmu_cycles;

>+

>+	/* Number of ticks for 1/10 second */

>+	gcnt_ticks = __rte_rd_generic_cntr_freq() / 10;

>+

>+	start_ticks = __rte_rd_generic_cntr_precise();

>+	start_pmu_cycles = rte_rdtsc_precise();

>+	do {

>+		cur_ticks = __rte_rd_generic_cntr();

>+	} while ((cur_ticks - start_ticks) < gcnt_ticks);

>+	end_pmu_cycles = rte_rdtsc_precise();

>+

>+	return ((end_pmu_cycles - start_pmu_cycles) * 10);


I think we need to round this off to the next multiple of 10.
Sometimes it is off by one:
EAL: TSC frequency is ~2399999 KHz

Similar to http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/eal_common_timer.c#n54

Pavan.

> #else

> 	return 0;

> #endif

>--

>2.17.1
Honnappa Nagarahalli June 26, 2020, 8:46 p.m. | #3
Hi Jerin,
	Thanks for the comments.

> -----Original Message-----

> From: Jerin Jacob <jerinjacobk@gmail.com>

> Sent: Wednesday, June 24, 2020 7:51 AM

> To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>

> Cc: dpdk-dev <dev@dpdk.org>; jerinj@marvell.com;

> hemant.agrawal@nxp.com; Akhil.goyal@nxp.com; ogerlitz@mellanox.com;

> Ajit Khaparde (ajit.khaparde@broadcom.com)

> <ajit.khaparde@broadcom.com>; ruigeng.wang@arm.com; Dharmik Thakkar

> <Dharmik.Thakkar@arm.com>; Phil Yang <Phil.Yang@arm.com>; dpdk stable

> <stable@dpdk.org>

> Subject: Re: [dpdk-dev] [PATCH] eal: generic counter based loop for CPU freq

> calculation

> 

> On Tue, Jun 9, 2020 at 3:04 AM Honnappa Nagarahalli

> <honnappa.nagarahalli@arm.com> wrote:

> >

> > get_tsc_freq uses 'nanosleep' system call to calculate the CPU

> > frequency. However, 'nanosleep' results in the process getting

> > un-scheduled. The kernel saves and restores the PMU state. This

> > ensures that the PMU cycles are not counted towards a sleeping

> > process. When RTE_ARM_EAL_RDTSC_USE_PMU is defined, this results in

> > incorrect CPU frequency calculation. This logic is replaced with

> > generic counter based loop.

> >

> > Bugzilla ID: 450

> > Fixes: af75078fece3 ("first public release")

> 

> The Fix looks good to me.

> 

> The Fixes is not correct. It should be the patch where

> RTE_ARM_EAL_RDTSC_USE_PMU got introduced.

Ok, will dig that out.

> 

> 

> > Cc: stable@dpdk.org

> >

> > Signed-off-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

> > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>

> > Reviewed-by: Dharmik Thakkar <dharmik.thakkar@arm.com>

> > Reviewed-by: Phil Yang <phil.yang@arm.com>

> >

> > ---

> >  lib/librte_eal/arm/include/rte_cycles_64.h | 45 +++++++++++++++++++---

> >  lib/librte_eal/arm/rte_cycles.c            | 24 ++++++++++--

> >  2 files changed, 61 insertions(+), 8 deletions(-)

> >

> > diff --git a/lib/librte_eal/arm/include/rte_cycles_64.h b/lib/librte_eal/arm/include/rte_cycles_64.h

> > index da557b6a1..6fc352036 100644

> > --- a/lib/librte_eal/arm/include/rte_cycles_64.h

> > +++ b/lib/librte_eal/arm/include/rte_cycles_64.h

> > @@ -11,6 +11,36 @@ extern "C" {

> >

> >  #include "generic/rte_cycles.h"

> >

> > +/** Read generic counter frequency */

> > +static inline uint64_t

> 

> I prefer to have __rte_always_inline

> 

> > +__rte_rd_generic_cntr_freq(void)

> 

> I think, the generic counter is confusing, I think, since the symbol is exposed

> due to placed in header file, it is better to change, __rte_arm64_cntfrq()

Ok, makes sense.

> 

> > +{

> > +       uint64_t freq;

> > +

> > +       asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));

> > +       return freq;

> > +}

> > +

> > +/** Read generic counter */

> > +static inline uint64_t

> 

> Likewise, __rte_arm64_cntvct()

> 

> 

> > +__rte_rd_generic_cntr(void)

> > +{

> > +       uint64_t tsc;

> > +

> > +       asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));

> > +       return tsc;

> > +}

> > +

> > +static inline uint64_t

> > +__rte_rd_generic_cntr_precise(void)

> 

> __rte_arm64_cntvct_precise()

> 

> > +{

> > +       uint64_t tsc;

> > +

> > +       asm volatile("isb" : : : "memory");

> > +       asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));

> > +       return tsc;

> > +}

> > +

> >  /**

> >   * Read the time base register.

> >   *

> > @@ -25,10 +55,7 @@ extern "C" {

> >  static inline uint64_t

> >  rte_rdtsc(void)

> >  {

> > -       uint64_t tsc;

> > -

> > -       asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));

> > -       return tsc;

> > +       return __rte_rd_generic_cntr();

> >  }

> >  #else

> >  /**

> > @@ -49,14 +76,22 @@ rte_rdtsc(void)

> >   * asm volatile("msr pmcr_el0, %0" : : "r" (val));

> >   *

> >   */

> > +

> > +/** Read PMU cycle counter */

> >  static inline uint64_t

> > -rte_rdtsc(void)

> > +__rte_rd_pmu_cycle_cntr(void)

I will change this to __rte_arm64_pmccntr

> >  {

> >         uint64_t tsc;

> >

> >         asm volatile("mrs %0, pmccntr_el0" : "=r"(tsc));

> >         return tsc;

> >  }

> > +

> > +static inline uint64_t

> > +rte_rdtsc(void)

> > +{

> > +       return __rte_rd_pmu_cycle_cntr();

> > +}

> >  #endif

> >

> >  static inline uint64_t

> > diff --git a/lib/librte_eal/arm/rte_cycles.c b/lib/librte_eal/arm/rte_cycles.c

> > index 3500d523e..92c87a8a4 100644

> > --- a/lib/librte_eal/arm/rte_cycles.c

> > +++ b/lib/librte_eal/arm/rte_cycles.c

> > @@ -3,14 +3,32 @@

> >   */

> >

> >  #include "eal_private.h"

> > +#include "rte_cycles.h"

> >

> >  uint64_t

> >  get_tsc_freq_arch(void)

> >  {

> >  #if defined RTE_ARCH_ARM64 && !defined RTE_ARM_EAL_RDTSC_USE_PMU

> > -       uint64_t freq;

> > -       asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));

> > -       return freq;

> > +       return __rte_rd_generic_cntr_freq();

> > +#elif defined RTE_ARCH_ARM64 && defined RTE_ARM_EAL_RDTSC_USE_PMU

> > +       /* Use the generic counter ticks to calculate the PMU

> > +        * cycle frequency.

> > +        */

> > +       uint64_t gcnt_ticks;

> > +       uint64_t start_ticks, cur_ticks;

> > +       uint64_t start_pmu_cycles, end_pmu_cycles;

> > +

> > +       /* Number of ticks for 1/10 second */

> > +       gcnt_ticks = __rte_rd_generic_cntr_freq() / 10;

> > +

> > +       start_ticks = __rte_rd_generic_cntr_precise();

> > +       start_pmu_cycles = rte_rdtsc_precise();

> > +       do {

> > +               cur_ticks = __rte_rd_generic_cntr();

> > +       } while ((cur_ticks - start_ticks) < gcnt_ticks);

> > +       end_pmu_cycles = rte_rdtsc_precise();

> > +

> > +       return ((end_pmu_cycles - start_pmu_cycles) * 10);

> 

> Good thought. On the plus side, it will reduce the boot time by .9 sec.

> 

> >  #else

> >         return 0;

> 

> With above changes:

> 

> Acked-by: Jerin Jacob <jerinj@marvell.com>

> 

> 

> 

> >  #endif

> > --

> > 2.17.1

> >

Patch

diff --git a/lib/librte_eal/arm/include/rte_cycles_64.h b/lib/librte_eal/arm/include/rte_cycles_64.h
index da557b6a1..6fc352036 100644
--- a/lib/librte_eal/arm/include/rte_cycles_64.h
+++ b/lib/librte_eal/arm/include/rte_cycles_64.h
@@ -11,6 +11,36 @@  extern "C" {
 
 #include "generic/rte_cycles.h"
 
+/** Read generic counter frequency */
+static inline uint64_t
+__rte_rd_generic_cntr_freq(void)
+{
+	uint64_t freq;
+
+	asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));
+	return freq;
+}
+
+/** Read generic counter */
+static inline uint64_t
+__rte_rd_generic_cntr(void)
+{
+	uint64_t tsc;
+
+	asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
+	return tsc;
+}
+
+static inline uint64_t
+__rte_rd_generic_cntr_precise(void)
+{
+	uint64_t tsc;
+
+	asm volatile("isb" : : : "memory");
+	asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
+	return tsc;
+}
+
 /**
  * Read the time base register.
  *
@@ -25,10 +55,7 @@  extern "C" {
 static inline uint64_t
 rte_rdtsc(void)
 {
-	uint64_t tsc;
-
-	asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
-	return tsc;
+	return __rte_rd_generic_cntr();
 }
 #else
 /**
@@ -49,14 +76,22 @@  rte_rdtsc(void)
  * asm volatile("msr pmcr_el0, %0" : : "r" (val));
  *
  */
+
+/** Read PMU cycle counter */
 static inline uint64_t
-rte_rdtsc(void)
+__rte_rd_pmu_cycle_cntr(void)
 {
 	uint64_t tsc;
 
 	asm volatile("mrs %0, pmccntr_el0" : "=r"(tsc));
 	return tsc;
 }
+
+static inline uint64_t
+rte_rdtsc(void)
+{
+	return __rte_rd_pmu_cycle_cntr();
+}
 #endif
 
 static inline uint64_t
diff --git a/lib/librte_eal/arm/rte_cycles.c b/lib/librte_eal/arm/rte_cycles.c
index 3500d523e..92c87a8a4 100644
--- a/lib/librte_eal/arm/rte_cycles.c
+++ b/lib/librte_eal/arm/rte_cycles.c
@@ -3,14 +3,32 @@ 
  */
 
 #include "eal_private.h"
+#include "rte_cycles.h"
 
 uint64_t
 get_tsc_freq_arch(void)
 {
 #if defined RTE_ARCH_ARM64 && !defined RTE_ARM_EAL_RDTSC_USE_PMU
-	uint64_t freq;
-	asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));
-	return freq;
+	return __rte_rd_generic_cntr_freq();
+#elif defined RTE_ARCH_ARM64 && defined RTE_ARM_EAL_RDTSC_USE_PMU
+	/* Use the generic counter ticks to calculate the PMU
+	 * cycle frequency.
+	 */
+	uint64_t gcnt_ticks;
+	uint64_t start_ticks, cur_ticks;
+	uint64_t start_pmu_cycles, end_pmu_cycles;
+
+	/* Number of ticks for 1/10 second */
+	gcnt_ticks = __rte_rd_generic_cntr_freq() / 10;
+
+	start_ticks = __rte_rd_generic_cntr_precise();
+	start_pmu_cycles = rte_rdtsc_precise();
+	do {
+		cur_ticks = __rte_rd_generic_cntr();
+	} while ((cur_ticks - start_ticks) < gcnt_ticks);
+	end_pmu_cycles = rte_rdtsc_precise();
+
+	return ((end_pmu_cycles - start_pmu_cycles) * 10);
 #else
 	return 0;
 #endif