diff mbox series

[v9,05/13] accel: adding TB_JIT_TIME and full replacing CONFIG_PROFILER

Message ID 20191007152839.30804-6-alex.bennee@linaro.org
State New
Headers show
Series TCG code quality tracking and perf integration | expand

Commit Message

Alex Bennée Oct. 7, 2019, 3:28 p.m. UTC
From: "Vanderson M. do Rosario" <vandersonmr2@gmail.com>


Replace all others CONFIG_PROFILER statistics and migrate it to
TBStatistics system. However, TCGProfiler still exists and can
be use to store global statistics and times. All TB related
statistics goes to TBStatistics.

Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>

Message-Id: <20190829173437.5926-6-vandersonmr2@gmail.com>
[AJB: fix authorship]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---
 accel/tcg/tb-stats.c      |  98 +++++++++++++++++++++---------
 accel/tcg/translate-all.c |  46 ++++++++-------
 configure                 |   3 -
 cpus.c                    |  14 ++---
 include/exec/tb-stats.h   |  15 +++++
 include/qemu/timer.h      |   5 +-
 monitor/misc.c            |  28 ++-------
 tcg/tcg.c                 | 121 ++++++++++++--------------------------
 tcg/tcg.h                 |   2 +-
 vl.c                      |   8 +--
 10 files changed, 159 insertions(+), 181 deletions(-)

-- 
2.20.1

Comments

Richard Henderson Oct. 8, 2019, 3:25 p.m. UTC | #1
On 10/7/19 11:28 AM, Alex Bennée wrote:
> From: "Vanderson M. do Rosario" <vandersonmr2@gmail.com>

> 

> Replace all others CONFIG_PROFILER statistics and migrate it to

> TBStatistics system. However, TCGProfiler still exists and can

> be use to store global statistics and times. All TB related

> statistics goes to TBStatistics.

> 

> Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>

> Message-Id: <20190829173437.5926-6-vandersonmr2@gmail.com>

> [AJB: fix authorship]

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> ---

>  accel/tcg/tb-stats.c      |  98 +++++++++++++++++++++---------

>  accel/tcg/translate-all.c |  46 ++++++++-------

>  configure                 |   3 -

>  cpus.c                    |  14 ++---

>  include/exec/tb-stats.h   |  15 +++++

>  include/qemu/timer.h      |   5 +-

>  monitor/misc.c            |  28 ++-------

>  tcg/tcg.c                 | 121 ++++++++++++--------------------------

>  tcg/tcg.h                 |   2 +-

>  vl.c                      |   8 +--

>  10 files changed, 159 insertions(+), 181 deletions(-)

> 

> diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c

> index 0e64c176b3..f431159fd2 100644

> --- a/accel/tcg/tb-stats.c

> +++ b/accel/tcg/tb-stats.c

> @@ -17,11 +17,18 @@

>  #include "exec/tb-stats.h"

>  

>  /* TBStatistic collection controls */

> -enum TBStatsStatus { TB_STATS_RUNNING, TB_STATS_PAUSED, TB_STATS_STOPPED };

> +enum TBStatsStatus {

> +    TB_STATS_DISABLED = 0,

> +    TB_STATS_RUNNING,

> +    TB_STATS_PAUSED,

> +    TB_STATS_STOPPED

> +};


This should be in patch 1, I should think.

> +uint64_t dev_time;


This patch is doing several things at once, and I think it should be split.
All of the changes to dev_time, for instance, are unrelated to TBStatistic.

> +    jpi->interm_time += stat_per_translation(tbs, time.interm);

> +    jpi->code_time += stat_per_translation(tbs, time.code);

> +    jpi->opt_time += stat_per_translation(tbs, time.opt);

> +    jpi->la_time += stat_per_translation(tbs, time.la);

> +    jpi->restore_time += tbs->time.restore;

> +    jpi->restore_count += tbs->time.restore_count;


Why are some of these "per translation" and some not?

> @@ -370,11 +371,11 @@ static int cpu_restore_state_from_tb

>      }

>      restore_state_to_opc(env, tb, data);

>  

> -#ifdef CONFIG_PROFILER

> -    atomic_set(&prof->restore_time,

> -                prof->restore_time + profile_getclock() - ti);

> -    atomic_set(&prof->restore_count, prof->restore_count + 1);

> -#endif

> +    if (tb_stats_enabled(tb, TB_JIT_TIME)) {

> +        atomic_add(&tb->tb_stats->time.restore, profile_getclock() - ti);

> +        atomic_inc(&tb->tb_stats->time.restore_count);

> +    }


Would it be better to use a the TBStatistics lock than two atomic updates?

> @@ -1826,10 +1828,10 @@ TranslationBlock *tb_gen_code(CPUState *cpu,

>          tcg_ctx->tb_jmp_target_addr = tb->jmp_target_arg;

>      }

>  

> -#ifdef CONFIG_PROFILER

> -    atomic_set(&prof->interm_time, prof->interm_time + profile_getclock() - ti);

> -    ti = profile_getclock();

> -#endif

> +    if (tb_stats_enabled(tb, TB_JIT_TIME)) {

> +        atomic_add(&tb->tb_stats->time.interm, profile_getclock() - ti);

> +        ti = profile_getclock();

> +    }


You should call profile_getclock() only once here.

Why does this need an atomic_add, while the rest of TB_JIT_STATS within
tb_gen_code do not, and in fact have a comment:

> +    /*

> +     * Collect JIT stats when enabled. We batch them all up here to

> +     * avoid spamming the cache with atomic accesses

> +     */


> @@ -1871,9 +1873,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,

>      }

>      tb->tc.size = gen_code_size;

>  

> -#ifdef CONFIG_PROFILER

> -    atomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti);

> -#endif

> +    if (tb_stats_enabled(tb, TB_JIT_TIME)) {

> +        atomic_add(&tb->tb_stats->time.code, profile_getclock() - ti);

> +    }


Likewise.

> diff --git a/configure b/configure

> index 8f8446f52b..eedeb9016e 100755

> --- a/configure

> +++ b/configure

> @@ -6566,9 +6566,6 @@ fi

>  if test "$static" = "yes" ; then

>    echo "CONFIG_STATIC=y" >> $config_host_mak

>  fi

> -if test "$profiler" = "yes" ; then

> -  echo "CONFIG_PROFILER=y" >> $config_host_mak

> -fi


Removing the define without removing --enable-profiler.

>  static int tcg_cpu_exec(CPUState *cpu)

>  {

>      int ret;

> -#ifdef CONFIG_PROFILER

> -    int64_t ti;

> -#endif

> +    uint64_t ti;

>  

>      assert(tcg_enabled());

> -#ifdef CONFIG_PROFILER

>      ti = profile_getclock();

> -#endif

> +

>      cpu_exec_start(cpu);

>      ret = cpu_exec(cpu);

>      cpu_exec_end(cpu);

> -#ifdef CONFIG_PROFILER

> -    atomic_set(&tcg_ctx->prof.cpu_exec_time,

> -               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);

> -#endif

> +

> +    atomic_add(&tcg_ctx->prof.cpu_exec_time, profile_getclock() - ti);


This is also unrelated to TBStatistics.

It's also adding an unconditional atomic_add, of which I am not fond.  Should
this be testing tb_stats_collection_enabled() or something else?

I'll also note that tcg_ctx is per-thread (for mttcg), and so perhaps this does
not require an atomic_add anyway.  Perhaps just an atomic_set to be paired with
atomic_read in the dump function that's iterating over the tcg_ctx.

Even without the atomic_add, we shouldn't make the syscall for getclock in the
normal case.


> +    /* These are accessed with atomic operations */

> +    struct {

> +        int64_t restore;

> +        uint64_t restore_count;

> +        int64_t interm;

> +        int64_t code;

> +        int64_t opt;

> +        int64_t la;

> +    } time;


Why are these signed?  We're always adding positive values.

Why is restore_count here in "time"?  Indeed, why all of these unnamed
sub-structures at all?  I don't see that "." helps organization any more than "_".

> @@ -4020,18 +3959,18 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)

>      }

>  #endif

>  

> -#ifdef CONFIG_PROFILER

> -    atomic_set(&prof->opt_time, prof->opt_time - profile_getclock());

> -#endif

> +    if (tb_stats_enabled(tb, TB_JIT_TIME)) {

> +        atomic_add(&tb->tb_stats->time.opt, -profile_getclock());

> +    }


Atomic add of a negative clock value?  That just means the intermediate value
is unusable, so why make any of this atomic?

Also, this is tb_gen_code again, wherein we already talked about not using
atomic_foo.

> @@ -4081,14 +4020,17 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)

>      s->pool_labels = NULL;

>  #endif

>  

> +    if (!tb_stats_collection_enabled()) {

> +        QTAILQ_FOREACH(op, &s->ops, link) {

> +            TCGOpcode opc = op->opc;

> +            atomic_add(&s->prof.table_op_count[opc], 1);

> +        }

> +    }


Why would you collect stats when stats collection is disabled?  Is this a
simple typo?


r~
Alex Bennée Dec. 13, 2019, 9:49 p.m. UTC | #2
Richard Henderson <richard.henderson@linaro.org> writes:

> On 10/7/19 11:28 AM, Alex Bennée wrote:

>> From: "Vanderson M. do Rosario" <vandersonmr2@gmail.com>

>> 

>> Replace all others CONFIG_PROFILER statistics and migrate it to

>> TBStatistics system. However, TCGProfiler still exists and can

>> be use to store global statistics and times. All TB related

>> statistics goes to TBStatistics.

>> 

>> Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>

>> Message-Id: <20190829173437.5926-6-vandersonmr2@gmail.com>

>> [AJB: fix authorship]

>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>> ---

>>  accel/tcg/tb-stats.c      |  98 +++++++++++++++++++++---------

>>  accel/tcg/translate-all.c |  46 ++++++++-------

>>  configure                 |   3 -

>>  cpus.c                    |  14 ++---

>>  include/exec/tb-stats.h   |  15 +++++

>>  include/qemu/timer.h      |   5 +-

>>  monitor/misc.c            |  28 ++-------

>>  tcg/tcg.c                 | 121 ++++++++++++--------------------------

>>  tcg/tcg.h                 |   2 +-

>>  vl.c                      |   8 +--

>>  10 files changed, 159 insertions(+), 181 deletions(-)

>> 

>> diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c

>> index 0e64c176b3..f431159fd2 100644

>> --- a/accel/tcg/tb-stats.c

>> +++ b/accel/tcg/tb-stats.c

>> @@ -17,11 +17,18 @@

>>  #include "exec/tb-stats.h"

>>  

>>  /* TBStatistic collection controls */

>> -enum TBStatsStatus { TB_STATS_RUNNING, TB_STATS_PAUSED, TB_STATS_STOPPED };

>> +enum TBStatsStatus {

>> +    TB_STATS_DISABLED = 0,

>> +    TB_STATS_RUNNING,

>> +    TB_STATS_PAUSED,

>> +    TB_STATS_STOPPED

>> +};

>

> This should be in patch 1, I should think.

>

>> +uint64_t dev_time;

>

> This patch is doing several things at once, and I think it should be split.

> All of the changes to dev_time, for instance, are unrelated to TBStatistic.

>

>> +    jpi->interm_time += stat_per_translation(tbs, time.interm);

>> +    jpi->code_time += stat_per_translation(tbs, time.code);

>> +    jpi->opt_time += stat_per_translation(tbs, time.opt);

>> +    jpi->la_time += stat_per_translation(tbs, time.la);

>> +    jpi->restore_time += tbs->time.restore;

>> +    jpi->restore_count += tbs->time.restore_count;

>

> Why are some of these "per translation" and some not?


The restore_time is an amortised time or all restore operations (so
dividable by restore_count rather than the number of translations).

I've added some commentary.

>

>> @@ -370,11 +371,11 @@ static int cpu_restore_state_from_tb

>>      }

>>      restore_state_to_opc(env, tb, data);

>>  

>> -#ifdef CONFIG_PROFILER

>> -    atomic_set(&prof->restore_time,

>> -                prof->restore_time + profile_getclock() - ti);

>> -    atomic_set(&prof->restore_count, prof->restore_count + 1);

>> -#endif

>> +    if (tb_stats_enabled(tb, TB_JIT_TIME)) {

>> +        atomic_add(&tb->tb_stats->time.restore, profile_getclock() - ti);

>> +        atomic_inc(&tb->tb_stats->time.restore_count);

>> +    }

>

> Would it be better to use a the TBStatistics lock than two atomic

> updates?


Yeah I think keeping the locking consistent as a mutex probably pays of
in reducing complexity as well.

>

>> @@ -1826,10 +1828,10 @@ TranslationBlock *tb_gen_code(CPUState *cpu,

>>          tcg_ctx->tb_jmp_target_addr = tb->jmp_target_arg;

>>      }

>>  

>> -#ifdef CONFIG_PROFILER

>> -    atomic_set(&prof->interm_time, prof->interm_time + profile_getclock() - ti);

>> -    ti = profile_getclock();

>> -#endif

>> +    if (tb_stats_enabled(tb, TB_JIT_TIME)) {

>> +        atomic_add(&tb->tb_stats->time.interm, profile_getclock() - ti);

>> +        ti = profile_getclock();

>> +    }

>

> You should call profile_getclock() only once here.

>

> Why does this need an atomic_add, while the rest of TB_JIT_STATS within

> tb_gen_code do not, and in fact have a comment:

>

>> +    /*

>> +     * Collect JIT stats when enabled. We batch them all up here to

>> +     * avoid spamming the cache with atomic accesses

>> +     */

>

>> @@ -1871,9 +1873,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,

>>      }

>>      tb->tc.size = gen_code_size;

>>  

>> -#ifdef CONFIG_PROFILER

>> -    atomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti);

>> -#endif

>> +    if (tb_stats_enabled(tb, TB_JIT_TIME)) {

>> +        atomic_add(&tb->tb_stats->time.code, profile_getclock() - ti);

>> +    }

>

> Likewise.

>

>> diff --git a/configure b/configure

>> index 8f8446f52b..eedeb9016e 100755

>> --- a/configure

>> +++ b/configure

>> @@ -6566,9 +6566,6 @@ fi

>>  if test "$static" = "yes" ; then

>>    echo "CONFIG_STATIC=y" >> $config_host_mak

>>  fi

>> -if test "$profiler" = "yes" ; then

>> -  echo "CONFIG_PROFILER=y" >> $config_host_mak

>> -fi

>

> Removing the define without removing --enable-profiler.

>

>>  static int tcg_cpu_exec(CPUState *cpu)

>>  {

>>      int ret;

>> -#ifdef CONFIG_PROFILER

>> -    int64_t ti;

>> -#endif

>> +    uint64_t ti;

>>  

>>      assert(tcg_enabled());

>> -#ifdef CONFIG_PROFILER

>>      ti = profile_getclock();

>> -#endif

>> +

>>      cpu_exec_start(cpu);

>>      ret = cpu_exec(cpu);

>>      cpu_exec_end(cpu);

>> -#ifdef CONFIG_PROFILER

>> -    atomic_set(&tcg_ctx->prof.cpu_exec_time,

>> -               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);

>> -#endif

>> +

>> +    atomic_add(&tcg_ctx->prof.cpu_exec_time, profile_getclock() - ti);

>

> This is also unrelated to TBStatistics.


I'm going to split this patch up into more sub-commits to do each piece
one at a time. To make it easier can I drop CONFIG_PROFILE support
before the re-factoring so I don't need to keep both modes compiling
during the transition?

> It's also adding an unconditional atomic_add, of which I am not fond.  Should

> this be testing tb_stats_collection_enabled() or something else?

>

> I'll also note that tcg_ctx is per-thread (for mttcg), and so perhaps this does

> not require an atomic_add anyway.  Perhaps just an atomic_set to be paired with

> atomic_read in the dump function that's iterating over the tcg_ctx.


I'll try an improve the commenting of TCGProf about what is just holding
per-translations values which get copied at the end and what is
persistent and may be read from outside the translation context (I think
just the OPC count array now).

>

> Even without the atomic_add, we shouldn't make the syscall for getclock in the

> normal case.

>

>

>> +    /* These are accessed with atomic operations */

>> +    struct {

>> +        int64_t restore;

>> +        uint64_t restore_count;

>> +        int64_t interm;

>> +        int64_t code;

>> +        int64_t opt;

>> +        int64_t la;

>> +    } time;

>

> Why are these signed?  We're always adding positive values.

>

> Why is restore_count here in "time"?  Indeed, why all of these unnamed

> sub-structures at all?  I don't see that "." helps organization any more than "_".

>

>> @@ -4020,18 +3959,18 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)

>>      }

>>  #endif

>>  

>> -#ifdef CONFIG_PROFILER

>> -    atomic_set(&prof->opt_time, prof->opt_time - profile_getclock());

>> -#endif

>> +    if (tb_stats_enabled(tb, TB_JIT_TIME)) {

>> +        atomic_add(&tb->tb_stats->time.opt, -profile_getclock());

>> +    }

>

> Atomic add of a negative clock value?  That just means the intermediate value

> is unusable, so why make any of this atomic?


I've rewritten all this to just grab some timestamps at various key
points and then bring it all together when we update the TBStat in
translate.

>

> Also, this is tb_gen_code again, wherein we already talked about not using

> atomic_foo.

>

>> @@ -4081,14 +4020,17 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)

>>      s->pool_labels = NULL;

>>  #endif

>>  

>> +    if (!tb_stats_collection_enabled()) {

>> +        QTAILQ_FOREACH(op, &s->ops, link) {

>> +            TCGOpcode opc = op->opc;

>> +            atomic_add(&s->prof.table_op_count[opc], 1);

>> +        }

>> +    }

>

> Why would you collect stats when stats collection is disabled?  Is this a

> simple typo?


I think if we can get away with not using atomics this count can go back
to being inline with the main opc loop. If we aren't using atomics is
there really such a big cost to just doing it unconditionally?


-- 
Alex Bennée
Richard Henderson Dec. 16, 2019, 8:34 p.m. UTC | #3
On 12/13/19 11:49 AM, Alex Bennée wrote:
> I'm going to split this patch up into more sub-commits to do each piece

> one at a time. To make it easier can I drop CONFIG_PROFILE support

> before the re-factoring so I don't need to keep both modes compiling

> during the transition?


Certainly.

> I think if we can get away with not using atomics this count can go back

> to being inline with the main opc loop. If we aren't using atomics is

> there really such a big cost to just doing it unconditionally?


I shouldn't think so.


r~
diff mbox series

Patch

diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c
index 0e64c176b3..f431159fd2 100644
--- a/accel/tcg/tb-stats.c
+++ b/accel/tcg/tb-stats.c
@@ -17,11 +17,18 @@ 
 #include "exec/tb-stats.h"
 
 /* TBStatistic collection controls */
-enum TBStatsStatus { TB_STATS_RUNNING, TB_STATS_PAUSED, TB_STATS_STOPPED };
+enum TBStatsStatus {
+    TB_STATS_DISABLED = 0,
+    TB_STATS_RUNNING,
+    TB_STATS_PAUSED,
+    TB_STATS_STOPPED
+};
 
 static enum TBStatsStatus tcg_collect_tb_stats;
 static uint32_t default_tbstats_flag;
 
+uint64_t dev_time;
+
 struct jit_profile_info {
     uint64_t translations;
     uint64_t aborted;
@@ -33,6 +40,13 @@  struct jit_profile_info {
     uint64_t host;
     uint64_t guest;
     uint64_t search_data;
+
+    uint64_t interm_time;
+    uint64_t code_time;
+    uint64_t restore_count;
+    uint64_t restore_time;
+    uint64_t opt_time;
+    uint64_t la_time;
 };
 
 /* accumulate the statistics from all TBs */
@@ -54,6 +68,29 @@  static void collect_jit_profile_info(void *p, uint32_t hash, void *userp)
     jpi->host += tbs->code.out_len;
     jpi->guest += tbs->code.in_len;
     jpi->search_data += tbs->code.search_out_len;
+
+    jpi->interm_time += stat_per_translation(tbs, time.interm);
+    jpi->code_time += stat_per_translation(tbs, time.code);
+    jpi->opt_time += stat_per_translation(tbs, time.opt);
+    jpi->la_time += stat_per_translation(tbs, time.la);
+    jpi->restore_time += tbs->time.restore;
+    jpi->restore_count += tbs->time.restore_count;
+}
+
+void dump_jit_exec_time_info(uint64_t dev_time)
+{
+    static uint64_t last_cpu_exec_time;
+    uint64_t cpu_exec_time;
+    uint64_t delta;
+
+    cpu_exec_time = tcg_cpu_exec_time();
+    delta = cpu_exec_time - last_cpu_exec_time;
+
+    qemu_printf("async time  %" PRId64 " (%0.3f)\n",
+                   dev_time, dev_time / (double) NANOSECONDS_PER_SECOND);
+    qemu_printf("qemu time   %" PRId64 " (%0.3f)\n",
+                   delta, delta / (double) NANOSECONDS_PER_SECOND);
+    last_cpu_exec_time = cpu_exec_time;
 }
 
 /* dump JIT statisticis using TCGProfile and TBStats */
@@ -80,34 +117,39 @@  void dump_jit_profile_info(TCGProfile *s)
         qemu_printf("avg search data/TB  %0.1f\n",
                 jpi->search_data / (double) jpi->translations);
 
+        uint64_t tot = jpi->interm_time + jpi->code_time;
+
+        qemu_printf("JIT cycles          %" PRId64 " (%0.3fs at 2.4 GHz)\n",
+                tot, tot / 2.4e9);
+        qemu_printf("  cycles/op           %0.1f\n",
+                jpi->ops ? (double)tot / jpi->ops : 0);
+        qemu_printf("  cycles/in byte      %0.1f\n",
+                jpi->guest ? (double)tot / jpi->guest : 0);
+        qemu_printf("  cycles/out byte     %0.1f\n",
+                jpi->host ? (double)tot / jpi->host : 0);
+        qemu_printf("  cycles/search byte  %0.1f\n",
+                jpi->search_data ? (double)tot / jpi->search_data : 0);
+        if (tot == 0) {
+            tot = 1;
+        }
+
+        qemu_printf("  gen_interm time     %0.1f%%\n",
+                (double)jpi->interm_time / tot * 100.0);
+        qemu_printf("  gen_code time       %0.1f%%\n",
+                (double)jpi->code_time / tot * 100.0);
+
+        qemu_printf("    optim./code time    %0.1f%%\n",
+                (double)jpi->opt_time / (jpi->code_time ? jpi->code_time : 1) * 100.0);
+        qemu_printf("    liveness/code time  %0.1f%%\n",
+                (double)jpi->la_time / (jpi->code_time ? jpi->code_time : 1) * 100.0);
+
+        qemu_printf("cpu_restore count   %" PRId64 "\n", jpi->restore_count);
+        qemu_printf("  avg cycles        %0.1f\n",
+                jpi->restore_count ? (double)jpi->restore_time / jpi->restore_count : 0);
+
         if (s) {
-            int64_t tot = s->interm_time + s->code_time;
-            qemu_printf("JIT cycles          %" PRId64 " (%0.3f s at 2.4 GHz)\n",
-                            tot, tot / 2.4e9);
-            qemu_printf("cycles/op           %0.1f\n",
-                        jpi->ops ? (double)tot / jpi->ops : 0);
-            qemu_printf("cycles/in byte      %0.1f\n",
-                        jpi->guest ? (double)tot / jpi->guest : 0);
-            qemu_printf("cycles/out byte     %0.1f\n",
-                        jpi->host ? (double)tot / jpi->host : 0);
-            qemu_printf("cycles/search byte     %0.1f\n",
-                        jpi->search_data ? (double)tot / jpi->search_data : 0);
-            if (tot == 0) {
-                tot = 1;
-            }
-            qemu_printf("  gen_interm time   %0.1f%%\n",
-                        (double)s->interm_time / tot * 100.0);
-            qemu_printf("  gen_code time     %0.1f%%\n",
-                        (double)s->code_time / tot * 100.0);
-            qemu_printf("optim./code time    %0.1f%%\n",
-                        (double)s->opt_time / (s->code_time ? s->code_time : 1)
-                        * 100.0);
-            qemu_printf("liveness/code time  %0.1f%%\n",
-                    (double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0);
-            qemu_printf("cpu_restore count   %" PRId64 "\n",
-                    s->restore_count);
-            qemu_printf("  avg cycles        %0.1f\n",
-                    s->restore_count ? (double)s->restore_time / s->restore_count : 0);
+            qemu_printf("cpu exec time  %" PRId64 " (%0.3fs)\n",
+                s->cpu_exec_time, s->cpu_exec_time / (double) NANOSECONDS_PER_SECOND);
         }
     }
     g_free(jpi);
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 84f759a829..396e63c3e7 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -337,10 +337,11 @@  static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
     CPUArchState *env = cpu->env_ptr;
     uint8_t *p = tb->tc.ptr + tb->tc.size;
     int i, j, num_insns = tb->icount;
-#ifdef CONFIG_PROFILER
-    TCGProfile *prof = &tcg_ctx->prof;
-    int64_t ti = profile_getclock();
-#endif
+    uint64_t ti = 0;
+
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        ti = profile_getclock();
+    }
 
     searched_pc -= GETPC_ADJ;
 
@@ -370,11 +371,11 @@  static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
     }
     restore_state_to_opc(env, tb, data);
 
-#ifdef CONFIG_PROFILER
-    atomic_set(&prof->restore_time,
-                prof->restore_time + profile_getclock() - ti);
-    atomic_set(&prof->restore_count, prof->restore_count + 1);
-#endif
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        atomic_add(&tb->tb_stats->time.restore, profile_getclock() - ti);
+        atomic_inc(&tb->tb_stats->time.restore_count);
+    }
+
     return 0;
 }
 
@@ -1731,9 +1732,8 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     tcg_insn_unit *gen_code_buf;
     int gen_code_size, search_size, max_insns;
     TCGProfile *prof = &tcg_ctx->prof;
-#ifdef CONFIG_PROFILER
-    int64_t ti;
-#endif
+    uint64_t ti = 0;
+
     assert_memory_lock();
 
     phys_pc = get_page_addr_code(env, pc);
@@ -1779,9 +1779,6 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     tcg_ctx->tb_cflags = cflags;
  tb_overflow:
 
-#ifdef CONFIG_PROFILER
-    ti = profile_getclock();
-#endif
 
     /*
      * We want to fetch the stats structure before we start code
@@ -1801,6 +1798,11 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
         if (flag & TB_JIT_STATS) {
             tb->tb_stats->stats_enabled |= TB_JIT_STATS;
         }
+
+        if (flag & TB_JIT_TIME) {
+            tb->tb_stats->stats_enabled |= TB_JIT_TIME;
+            ti = profile_getclock();
+        }
     } else {
         tb->tb_stats = NULL;
     }
@@ -1826,10 +1828,10 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
         tcg_ctx->tb_jmp_target_addr = tb->jmp_target_arg;
     }
 
-#ifdef CONFIG_PROFILER
-    atomic_set(&prof->interm_time, prof->interm_time + profile_getclock() - ti);
-    ti = profile_getclock();
-#endif
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        atomic_add(&tb->tb_stats->time.interm, profile_getclock() - ti);
+        ti = profile_getclock();
+    }
 
     gen_code_size = tcg_gen_code(tcg_ctx, tb);
     if (unlikely(gen_code_size < 0)) {
@@ -1871,9 +1873,9 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     }
     tb->tc.size = gen_code_size;
 
-#ifdef CONFIG_PROFILER
-    atomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti);
-#endif
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        atomic_add(&tb->tb_stats->time.code, profile_getclock() - ti);
+    }
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
diff --git a/configure b/configure
index 8f8446f52b..eedeb9016e 100755
--- a/configure
+++ b/configure
@@ -6566,9 +6566,6 @@  fi
 if test "$static" = "yes" ; then
   echo "CONFIG_STATIC=y" >> $config_host_mak
 fi
-if test "$profiler" = "yes" ; then
-  echo "CONFIG_PROFILER=y" >> $config_host_mak
-fi
 if test "$want_tools" = "yes" ; then
   echo "CONFIG_TOOLS=y" >> $config_host_mak
 fi
diff --git a/cpus.c b/cpus.c
index d2c61ff155..3c4bce0e56 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1442,21 +1442,17 @@  static void process_icount_data(CPUState *cpu)
 static int tcg_cpu_exec(CPUState *cpu)
 {
     int ret;
-#ifdef CONFIG_PROFILER
-    int64_t ti;
-#endif
+    uint64_t ti;
 
     assert(tcg_enabled());
-#ifdef CONFIG_PROFILER
     ti = profile_getclock();
-#endif
+
     cpu_exec_start(cpu);
     ret = cpu_exec(cpu);
     cpu_exec_end(cpu);
-#ifdef CONFIG_PROFILER
-    atomic_set(&tcg_ctx->prof.cpu_exec_time,
-               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
-#endif
+
+    atomic_add(&tcg_ctx->prof.cpu_exec_time, profile_getclock() - ti);
+
     return ret;
 }
 
diff --git a/include/exec/tb-stats.h b/include/exec/tb-stats.h
index c779d6a183..a142972960 100644
--- a/include/exec/tb-stats.h
+++ b/include/exec/tb-stats.h
@@ -90,10 +90,24 @@  struct TBStatistics {
      * this TBStats structure. Has to be reset on a tb_flush.
      */
     GPtrArray *tbs;
+
+    /* end of jit_stats_lock */
+
+    /* These are accessed with atomic operations */
+    struct {
+        int64_t restore;
+        uint64_t restore_count;
+        int64_t interm;
+        int64_t code;
+        int64_t opt;
+        int64_t la;
+    } time;
 };
 
 bool tb_stats_cmp(const void *ap, const void *bp);
 
+void dump_jit_exec_time_info(uint64_t dev_time);
+
 void init_tb_stats_htable_if_not(void);
 
 void dump_jit_profile_info(TCGProfile *s);
@@ -101,6 +115,7 @@  void dump_jit_profile_info(TCGProfile *s);
 #define TB_NOTHING    (1 << 0)
 #define TB_EXEC_STATS (1 << 1)
 #define TB_JIT_STATS  (1 << 2)
+#define TB_JIT_TIME   (1 << 3)
 
 void enable_collect_tb_stats(void);
 void disable_collect_tb_stats(void);
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 85bc6eb00b..641ca3ddd0 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -1006,13 +1006,10 @@  static inline int64_t cpu_get_host_ticks(void)
 }
 #endif
 
-#ifdef CONFIG_PROFILER
 static inline int64_t profile_getclock(void)
 {
     return get_clock();
 }
 
-extern int64_t dev_time;
-#endif
-
+extern uint64_t dev_time;
 #endif
diff --git a/monitor/misc.c b/monitor/misc.c
index aef16f6cfb..ac4ff58d96 100644
--- a/monitor/misc.c
+++ b/monitor/misc.c
@@ -472,6 +472,11 @@  static void hmp_info_jit(Monitor *mon, const QDict *qdict)
 
 static void hmp_info_opcount(Monitor *mon, const QDict *qdict)
 {
+    if (!tb_stats_collection_enabled()) {
+        error_report("TB information not being recorded.");
+        return;
+    }
+
     dump_opcount_info();
 }
 #endif
@@ -1082,32 +1087,11 @@  static void hmp_info_mtree(Monitor *mon, const QDict *qdict)
     mtree_info(flatview, dispatch_tree, owner);
 }
 
-#ifdef CONFIG_PROFILER
-
-int64_t dev_time;
-
 static void hmp_info_profile(Monitor *mon, const QDict *qdict)
 {
-    static int64_t last_cpu_exec_time;
-    int64_t cpu_exec_time;
-    int64_t delta;
-
-    cpu_exec_time = tcg_cpu_exec_time();
-    delta = cpu_exec_time - last_cpu_exec_time;
-
-    monitor_printf(mon, "async time  %" PRId64 " (%0.3f)\n",
-                   dev_time, dev_time / (double)NANOSECONDS_PER_SECOND);
-    monitor_printf(mon, "qemu time   %" PRId64 " (%0.3f)\n",
-                   delta, delta / (double)NANOSECONDS_PER_SECOND);
-    last_cpu_exec_time = cpu_exec_time;
+    dump_jit_exec_time_info(dev_time);
     dev_time = 0;
 }
-#else
-static void hmp_info_profile(Monitor *mon, const QDict *qdict)
-{
-    monitor_printf(mon, "Internal profiler not compiled\n");
-}
-#endif
 
 /* Capture support */
 static QLIST_HEAD (capture_list_head, CaptureState) capture_head;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 9c199c8a40..e1438fa0d9 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -3881,82 +3881,34 @@  static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
     }
 }
 
-#ifdef CONFIG_PROFILER
-
 /* avoid copy/paste errors */
 #define PROF_ADD(to, from, field)                       \
     do {                                                \
         (to)->field += atomic_read(&((from)->field));   \
     } while (0)
 
-#define PROF_MAX(to, from, field)                                       \
-    do {                                                                \
-        typeof((from)->field) val__ = atomic_read(&((from)->field));    \
-        if (val__ > (to)->field) {                                      \
-            (to)->field = val__;                                        \
-        }                                                               \
-    } while (0)
-
-/* Pass in a zero'ed @prof */
-static inline
-void tcg_profile_snapshot(TCGProfile *prof, bool counters, bool table)
+static void collect_tcg_profiler(TCGProfile *prof)
 {
+    int i;
     unsigned int n_ctxs = atomic_read(&n_tcg_ctxs);
-    unsigned int i;
 
     for (i = 0; i < n_ctxs; i++) {
         TCGContext *s = atomic_read(&tcg_ctxs[i]);
         const TCGProfile *orig = &s->prof;
 
-        if (counters) {
-            PROF_ADD(prof, orig, cpu_exec_time);
-            PROF_ADD(prof, orig, interm_time);
-            PROF_ADD(prof, orig, code_time);
-            PROF_ADD(prof, orig, la_time);
-            PROF_ADD(prof, orig, opt_time);
-            PROF_ADD(prof, orig, restore_count);
-            PROF_ADD(prof, orig, restore_time);
-        }
-        if (table) {
-            int i;
+        PROF_ADD(prof, orig, cpu_exec_time);
 
-            for (i = 0; i < NB_OPS; i++) {
-                PROF_ADD(prof, orig, table_op_count[i]);
-            }
+        for (i = 0; i < NB_OPS; i++) {
+            PROF_ADD(prof, orig, table_op_count[i]);
         }
     }
 }
 
-#undef PROF_ADD
-#undef PROF_MAX
-
-static void tcg_profile_snapshot_counters(TCGProfile *prof)
-{
-    tcg_profile_snapshot(prof, true, false);
-}
-
-static void tcg_profile_snapshot_table(TCGProfile *prof)
-{
-    tcg_profile_snapshot(prof, false, true);
-}
-
-void tcg_dump_op_count(void)
-{
-    TCGProfile prof = {};
-    int i;
-
-    tcg_profile_snapshot_table(&prof);
-    for (i = 0; i < NB_OPS; i++) {
-        qemu_printf("%s %" PRId64 "\n", tcg_op_defs[i].name,
-                    prof.table_op_count[i]);
-    }
-}
-
-int64_t tcg_cpu_exec_time(void)
+uint64_t tcg_cpu_exec_time(void)
 {
     unsigned int n_ctxs = atomic_read(&n_tcg_ctxs);
     unsigned int i;
-    int64_t ret = 0;
+    uint64_t ret = 0;
 
     for (i = 0; i < n_ctxs; i++) {
         const TCGContext *s = atomic_read(&tcg_ctxs[i]);
@@ -3966,19 +3918,6 @@  int64_t tcg_cpu_exec_time(void)
     }
     return ret;
 }
-#else
-void tcg_dump_op_count(void)
-{
-    qemu_printf("[TCG profiler not compiled]\n");
-}
-
-int64_t tcg_cpu_exec_time(void)
-{
-    error_report("%s: TCG profiler not compiled", __func__);
-    exit(EXIT_FAILURE);
-}
-#endif
-
 
 int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
 {
@@ -4020,18 +3959,18 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
     }
 #endif
 
-#ifdef CONFIG_PROFILER
-    atomic_set(&prof->opt_time, prof->opt_time - profile_getclock());
-#endif
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        atomic_add(&tb->tb_stats->time.opt, -profile_getclock());
+    }
 
 #ifdef USE_TCG_OPTIMIZATIONS
     tcg_optimize(s);
 #endif
 
-#ifdef CONFIG_PROFILER
-    atomic_set(&prof->opt_time, prof->opt_time + profile_getclock());
-    atomic_set(&prof->la_time, prof->la_time - profile_getclock());
-#endif
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        atomic_add(&tb->tb_stats->time.opt, profile_getclock());
+        atomic_add(&tb->tb_stats->time.la,  -profile_getclock());
+    }
 
     reachable_code_pass(s);
     liveness_pass_1(s);
@@ -4054,9 +3993,9 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
         }
     }
 
-#ifdef CONFIG_PROFILER
-    atomic_set(&prof->la_time, prof->la_time + profile_getclock());
-#endif
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        atomic_add(&tb->tb_stats->time.la, profile_getclock());
+    }
 
 #ifdef DEBUG_DISAS
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT)
@@ -4081,14 +4020,17 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
     s->pool_labels = NULL;
 #endif
 
+    if (!tb_stats_collection_enabled()) {
+        QTAILQ_FOREACH(op, &s->ops, link) {
+            TCGOpcode opc = op->opc;
+            atomic_add(&s->prof.table_op_count[opc], 1);
+        }
+    }
+
     num_insns = -1;
     QTAILQ_FOREACH(op, &s->ops, link) {
         TCGOpcode opc = op->opc;
 
-#ifdef CONFIG_PROFILER
-        atomic_set(&prof->table_op_count[opc], prof->table_op_count[opc] + 1);
-#endif
-
         switch (opc) {
         case INDEX_op_mov_i32:
         case INDEX_op_mov_i64:
@@ -4181,14 +4123,23 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
     return tcg_current_code_size(s);
 }
 
+void tcg_dump_op_count(void)
+{
+    TCGProfile prof = {};
+    int i;
+
+    collect_tcg_profiler(&prof);
+    for (i = 0; i < NB_OPS; i++) {
+        qemu_printf("%s %" PRId64 "\n", tcg_op_defs[i].name, prof.table_op_count[i]);
+    }
+}
+
 void tcg_dump_info(void)
 {
     TCGProfile *s = NULL;
-#ifdef CONFIG_PROFILER
     TCGProfile prof = {};
-    tcg_profile_snapshot_counters(&prof);
     s = &prof;
-#endif
+    collect_tcg_profiler(s);
     dump_jit_profile_info(s);
 }
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 5296f7c075..6dcbbf79e7 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -948,7 +948,7 @@  int tcg_check_temp_count(void);
 #define tcg_check_temp_count() 0
 #endif
 
-int64_t tcg_cpu_exec_time(void);
+uint64_t tcg_cpu_exec_time(void);
 void tcg_dump_info(void);
 void tcg_dump_op_count(void);
 
diff --git a/vl.c b/vl.c
index 002bf4919e..f91bc6fb4c 100644
--- a/vl.c
+++ b/vl.c
@@ -1796,17 +1796,11 @@  static bool main_loop_should_exit(void)
 
 static void main_loop(void)
 {
-#ifdef CONFIG_PROFILER
-    int64_t ti;
-#endif
+    uint64_t ti;
     while (!main_loop_should_exit()) {
-#ifdef CONFIG_PROFILER
         ti = profile_getclock();
-#endif
         main_loop_wait(false);
-#ifdef CONFIG_PROFILER
         dev_time += profile_getclock() - ti;
-#endif
     }
 }