diff mbox

[v2,3/3] trace: instrument and trace tcg tb flush activity

Message ID 1405424541-21803-4-git-send-email-alex.bennee@linaro.org
State New
Headers show

Commit Message

Alex Bennée July 15, 2014, 11:42 a.m. UTC
The tb_find_fast path is important to quickly moving from one block to
the next. However we need to flush it when tlb changes occur so it's
important to know how well we are doing with the cache.

This patch adds some basic hit/miss profiling to the tb_find_fast
tracepoint as well as a number of other tb_ related areas. I've also
added a trace_inc_counter() helper which gets inlined away when tracing
is disabled.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

Comments

Andreas Färber July 15, 2014, 12:15 p.m. UTC | #1
Hi,

Am 15.07.2014 13:42, schrieb Alex Bennée:
> The tb_find_fast path is important to quickly moving from one block to
> the next. However we need to flush it when tlb changes occur so it's
> important to know how well we are doing with the cache.
> 
> This patch adds some basic hit/miss profiling to the tb_find_fast
> tracepoint as well as a number of other tb_ related areas. I've also
> added a trace_inc_counter() helper which gets inlined away when tracing
> is disabled.
> 
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> 
> diff --git a/cpu-exec.c b/cpu-exec.c
> index 45ef77b..771272f 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -187,7 +187,10 @@ static inline TranslationBlock *tb_find_fast(CPUArchState *env)
>      tb = cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>                   tb->flags != flags)) {
> +        trace_inc_counter(&cpu->tb_jmp_cache_stats.misses);
>          tb = tb_find_slow(env, pc, cs_base, flags);
> +    } else {
> +        trace_inc_counter(&cpu->tb_jmp_cache_stats.hits);
>      }
>      return tb;
>  }
> diff --git a/cputlb.c b/cputlb.c
> index 7bd3573..672656a 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -58,7 +58,7 @@ void tlb_flush(CPUState *cpu, int flush_global)
>      cpu->current_tb = NULL;
>  
>      memset(env->tlb_table, -1, sizeof(env->tlb_table));
> -    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
> +    tb_flush_all_jmp_cache(cpu);
>  
>      env->tlb_flush_addr = -1;
>      env->tlb_flush_mask = 0;
> diff --git a/include/qom/cpu.h b/include/qom/cpu.h
> index df977c8..8376678 100644
> --- a/include/qom/cpu.h
> +++ b/include/qom/cpu.h
> @@ -243,6 +243,10 @@ struct CPUState {
>      void *env_ptr; /* CPUArchState */
>      struct TranslationBlock *current_tb;
>      struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE];
> +    struct {
> +        int     hits;
> +        int     misses;

Is anything else going to be added here? If not, the indentation can be
dropped.

> +    } tb_jmp_cache_stats;

This is lacking documentation. Should be trivial to add for this field
(not here, above the struct). To document the subfields we may need to
name the struct.

>      struct GDBRegisterState *gdb_regs;
>      int gdb_num_regs;
>      int gdb_num_g_regs;
> @@ -584,6 +588,15 @@ void cpu_exit(CPUState *cpu);
>   */
>  void cpu_resume(CPUState *cpu);
>  
> +
> +/**
> + * tb_flush_all_jmp_cache:
> + * @cpu: The CPU jmp cache to flush
> + *
> + * Flush all the entries from the cpu fast jump cache

"CPU" for consistency

> + */
> +void tb_flush_all_jmp_cache(CPUState *cpu);
> +
>  /**
>   * qemu_init_vcpu:
>   * @cpu: The vCPU to initialize.
> diff --git a/include/trace.h b/include/trace.h
> index c15f498..7a9c0dc 100644
> --- a/include/trace.h
> +++ b/include/trace.h
> @@ -3,4 +3,14 @@
>  
>  #include "trace/generated-tracers.h"
>  
> +#ifndef CONFIG_TRACE_NOP
> +static inline void trace_inc_counter(int *counter) {
> +    int cnt = *counter;
> +    cnt++;
> +    *counter = cnt;
> +}
> +#else
> +static inline void trace_inc_counter(int *counter) { /* do nothing */ }
> +#endif
> +
>  #endif  /* TRACE_H */

Coding Style issues with the first function. For simplicity just keep
the first implementation but with the proper brace placement, and then
just put the #ifdef into the function body. That avoids the signatures
getting out of sync.

> diff --git a/qom/cpu.c b/qom/cpu.c
> index fada2d4..956b36d 100644
> --- a/qom/cpu.c
> +++ b/qom/cpu.c
> @@ -244,7 +244,7 @@ static void cpu_common_reset(CPUState *cpu)
>      cpu->icount_extra = 0;
>      cpu->icount_decr.u32 = 0;
>      cpu->can_do_io = 0;
> -    memset(cpu->tb_jmp_cache, 0, TB_JMP_CACHE_SIZE * sizeof(void *));
> +    tb_flush_all_jmp_cache(cpu);
>  }
>  
>  static bool cpu_common_has_work(CPUState *cs)
> diff --git a/trace-events b/trace-events
> index f8cc35f..5a58a11 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1244,6 +1244,9 @@ exec_tb_exit(void *next_tb, unsigned int flags) "tb:%p flags=%x"
>  
>  # translate-all.c
>  translate_block(void *tb, uintptr_t pc, uint8_t *tb_code) "tb:%p, pc:0x%x, tb_code:%p"
> +tb_flush(void) ""
> +tb_flush_jump_cache(uintptr_t pc) "pc:0x%x"
> +tb_flush_all_jump_cache(int hits, int misses) "hits:%d misses:%d"
>  
>  # memory.c
>  memory_region_ops_read(void *mr, uint64_t addr, uint64_t value, unsigned size) "mr %p addr %#"PRIx64" value %#"PRIx64" size %u"
> diff --git a/translate-all.c b/translate-all.c
> index a11c083..8e7bbcc 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -714,12 +714,22 @@ static void page_flush_tb(void)
>      }
>  }
>  
> +void tb_flush_all_jmp_cache(CPUState *cpu)
> +{
> +    trace_tb_flush_all_jump_cache(cpu->tb_jmp_cache_stats.hits,
> +                                  cpu->tb_jmp_cache_stats.misses);
> +    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
> +    memset((void *) &cpu->tb_jmp_cache_stats, 0, sizeof(cpu->tb_jmp_cache_stats));
> +}
> +
>  /* flush all the translation blocks */
>  /* XXX: tb_flush is currently not thread safe */
>  void tb_flush(CPUArchState *env1)
>  {
>      CPUState *cpu = ENV_GET_CPU(env1);
>  
> +    trace_tb_flush();
> +
>  #if defined(DEBUG_FLUSH)
>      printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
>             (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
> @@ -734,7 +744,7 @@ void tb_flush(CPUArchState *env1)
>      tcg_ctx.tb_ctx.nb_tbs = 0;
>  
>      CPU_FOREACH(cpu) {
> -        memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
> +        tb_flush_all_jmp_cache(cpu);
>      }
>  
>      memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash));
> @@ -1520,6 +1530,8 @@ void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr)
>      i = tb_jmp_cache_hash_page(addr);
>      memset(&cpu->tb_jmp_cache[i], 0,
>             TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *));

Can this one be dropped, too?

> +
> +    trace_tb_flush_jump_cache(addr);
>  }
>  
>  void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)

Cheers,
Andreas
Peter Maydell July 15, 2014, 12:23 p.m. UTC | #2
On 15 July 2014 12:42, Alex Bennée <alex.bennee@linaro.org> wrote:
> +#ifndef CONFIG_TRACE_NOP
> +static inline void trace_inc_counter(int *counter) {
> +    int cnt = *counter;
> +    cnt++;
> +    *counter = cnt;
> +}

...why isn't this just "*counter++;"  ?

-- PMM
Peter Maydell July 15, 2014, 1:07 p.m. UTC | #3
On 15 July 2014 13:23, Peter Maydell <peter.maydell@linaro.org> wrote:
> On 15 July 2014 12:42, Alex Bennée <alex.bennee@linaro.org> wrote:
>> +#ifndef CONFIG_TRACE_NOP
>> +static inline void trace_inc_counter(int *counter) {
>> +    int cnt = *counter;
>> +    cnt++;
>> +    *counter = cnt;
>> +}
>
> ...why isn't this just "*counter++;"  ?

Derp.

 (*counter)++;

I leave for the reader to decide whether this constitutes an
argument in favour of the way you originally phrased it...

-- PMM
Alex Bennée July 15, 2014, 1:10 p.m. UTC | #4
Peter Maydell writes:

> On 15 July 2014 12:42, Alex Bennée <alex.bennee@linaro.org> wrote:
>> +#ifndef CONFIG_TRACE_NOP
>> +static inline void trace_inc_counter(int *counter) {
>> +    int cnt = *counter;
>> +    cnt++;
>> +    *counter = cnt;
>> +}
>
> ...why isn't this just "*counter++;"  ?

You of course mean:

    (*counter)++;

I'll fix that up for the next iteration...
Alex Bennée July 15, 2014, 1:12 p.m. UTC | #5
Andreas Färber writes:

> Hi,
>
> Am 15.07.2014 13:42, schrieb Alex Bennée:
<snip>
>> index df977c8..8376678 100644
>> --- a/include/qom/cpu.h
>> +++ b/include/qom/cpu.h
>> @@ -243,6 +243,10 @@ struct CPUState {
>>      void *env_ptr; /* CPUArchState */
>>      struct TranslationBlock *current_tb;
>>      struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE];
>> +    struct {
>> +        int     hits;
>> +        int     misses;
>
> Is anything else going to be added here? If not, the indentation can be
> dropped.

At the moment probably not.

>
>> +    } tb_jmp_cache_stats;
>
> This is lacking documentation. Should be trivial to add for this field
> (not here, above the struct). To document the subfields we may need to
> name the struct.

Would not a simple comment be enough?

>
>>      struct GDBRegisterState *gdb_regs;
>>      int gdb_num_regs;
>>      int gdb_num_g_regs;
>> @@ -584,6 +588,15 @@ void cpu_exit(CPUState *cpu);
>>   */
>>  void cpu_resume(CPUState *cpu);
>>  
>> +
>> +/**
>> + * tb_flush_all_jmp_cache:
>> + * @cpu: The CPU jmp cache to flush
>> + *
>> + * Flush all the entries from the cpu fast jump cache
>
> "CPU" for consistency

OK

>
<snip>
>> +#ifndef CONFIG_TRACE_NOP
>> +static inline void trace_inc_counter(int *counter) {
>> +    int cnt = *counter;
>> +    cnt++;
>> +    *counter = cnt;
>> +}
>> +#else
>> +static inline void trace_inc_counter(int *counter) { /* do nothing */ }
>> +#endif
>> +
>>  #endif  /* TRACE_H */
>
> Coding Style issues with the first function. For simplicity just keep
> the first implementation but with the proper brace placement, and then
> just put the #ifdef into the function body. That avoids the signatures
> getting out of sync.

mea-culpa, I forgot to run checkpatch.pl....
>>  
>>      memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash));
>> @@ -1520,6 +1530,8 @@ void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr)
>>      i = tb_jmp_cache_hash_page(addr);
>>      memset(&cpu->tb_jmp_cache[i], 0,
>>             TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *));
>
> Can this one be dropped, too?

No, this is only a partial invalidation. I did toy with instrumenting
how many entries are flushed but it didn't seem the be worth it given
with the current architecture there is not much we can do.

>
>> +
>> +    trace_tb_flush_jump_cache(addr);
>>  }
>>  
>>  void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
>
> Cheers,
> Andreas
Paolo Bonzini July 15, 2014, 1:19 p.m. UTC | #6
Il 15/07/2014 13:42, Alex Bennée ha scritto:
> +        trace_inc_counter(&cpu->tb_jmp_cache_stats.misses);
>          tb = tb_find_slow(env, pc, cs_base, flags);
> +    } else {
> +        trace_inc_counter(&cpu->tb_jmp_cache_stats.hits);
>      }

I think this is premature optimization...

Paolo
Alex Bennée July 15, 2014, 2:16 p.m. UTC | #7
Paolo Bonzini writes:

> Il 15/07/2014 13:42, Alex Bennée ha scritto:
>> +        trace_inc_counter(&cpu->tb_jmp_cache_stats.misses);
>>          tb = tb_find_slow(env, pc, cs_base, flags);
>> +    } else {
>> +        trace_inc_counter(&cpu->tb_jmp_cache_stats.hits);
>>      }
>
> I think this is premature optimization...

How do you mean? It's not really an optimization as much as an
instrumentation. It should compile away to nothing if you don't have
tracing enabled in your build.

OTOH the numbers I'm seeing are very interesting in so far as the fast
path could be a potential waste of code in a lot of cases.
Paolo Bonzini July 15, 2014, 8:11 p.m. UTC | #8
Il 15/07/2014 16:16, Alex Bennée ha scritto:
>> > I think this is premature optimization...
> How do you mean? It's not really an optimization as much as an
> instrumentation. It should compile away to nothing if you don't have
> tracing enabled in your build.

I think it's not a big deal if you always enable the counting, and 
perhaps show them in "info jit".

Paolo

> OTOH the numbers I'm seeing are very interesting in so far as the fast
> path could be a potential waste of code in a lot of cases.
Peter Maydell July 15, 2014, 8:29 p.m. UTC | #9
On 15 July 2014 21:11, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 15/07/2014 16:16, Alex Bennée ha scritto:
>
>>> > I think this is premature optimization...
>>
>> How do you mean? It's not really an optimization as much as an
>> instrumentation. It should compile away to nothing if you don't have
>> tracing enabled in your build.
>
>
> I think it's not a big deal if you always enable the counting, and perhaps
> show them in "info jit".

We don't enable any other tracepoints by default; why would
we want to enable just this one which is in a hot codepath??

-- PMM
Paolo Bonzini July 15, 2014, 8:38 p.m. UTC | #10
Il 15/07/2014 22:29, Peter Maydell ha scritto:
>> >
>> > I think it's not a big deal if you always enable the counting, and perhaps
>> > show them in "info jit".
> We don't enable any other tracepoints by default; why would
> we want to enable just this one which is in a hot codepath??

I'm not referring to the tracepoint, only to trace_inc_counter instead 
of just "x++".  The overhead is probably not measurable. There are a few 
low-hanging fruit optimizations in cpu-exec.c that probably would give 
more measurable benefit, for example trying to replace cpu_loop_exit 
with a goto whenever possible.

Paolo
diff mbox

Patch

diff --git a/cpu-exec.c b/cpu-exec.c
index 45ef77b..771272f 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -187,7 +187,10 @@  static inline TranslationBlock *tb_find_fast(CPUArchState *env)
     tb = cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
     if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
                  tb->flags != flags)) {
+        trace_inc_counter(&cpu->tb_jmp_cache_stats.misses);
         tb = tb_find_slow(env, pc, cs_base, flags);
+    } else {
+        trace_inc_counter(&cpu->tb_jmp_cache_stats.hits);
     }
     return tb;
 }
diff --git a/cputlb.c b/cputlb.c
index 7bd3573..672656a 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -58,7 +58,7 @@  void tlb_flush(CPUState *cpu, int flush_global)
     cpu->current_tb = NULL;
 
     memset(env->tlb_table, -1, sizeof(env->tlb_table));
-    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
+    tb_flush_all_jmp_cache(cpu);
 
     env->tlb_flush_addr = -1;
     env->tlb_flush_mask = 0;
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index df977c8..8376678 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -243,6 +243,10 @@  struct CPUState {
     void *env_ptr; /* CPUArchState */
     struct TranslationBlock *current_tb;
     struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE];
+    struct {
+        int     hits;
+        int     misses;
+    } tb_jmp_cache_stats;
     struct GDBRegisterState *gdb_regs;
     int gdb_num_regs;
     int gdb_num_g_regs;
@@ -584,6 +588,15 @@  void cpu_exit(CPUState *cpu);
  */
 void cpu_resume(CPUState *cpu);
 
+
+/**
+ * tb_flush_all_jmp_cache:
+ * @cpu: The CPU jmp cache to flush
+ *
+ * Flush all the entries from the cpu fast jump cache
+ */
+void tb_flush_all_jmp_cache(CPUState *cpu);
+
 /**
  * qemu_init_vcpu:
  * @cpu: The vCPU to initialize.
diff --git a/include/trace.h b/include/trace.h
index c15f498..7a9c0dc 100644
--- a/include/trace.h
+++ b/include/trace.h
@@ -3,4 +3,14 @@ 
 
 #include "trace/generated-tracers.h"
 
+#ifndef CONFIG_TRACE_NOP
+static inline void trace_inc_counter(int *counter) {
+    int cnt = *counter;
+    cnt++;
+    *counter = cnt;
+}
+#else
+static inline void trace_inc_counter(int *counter) { /* do nothing */ }
+#endif
+
 #endif  /* TRACE_H */
diff --git a/qom/cpu.c b/qom/cpu.c
index fada2d4..956b36d 100644
--- a/qom/cpu.c
+++ b/qom/cpu.c
@@ -244,7 +244,7 @@  static void cpu_common_reset(CPUState *cpu)
     cpu->icount_extra = 0;
     cpu->icount_decr.u32 = 0;
     cpu->can_do_io = 0;
-    memset(cpu->tb_jmp_cache, 0, TB_JMP_CACHE_SIZE * sizeof(void *));
+    tb_flush_all_jmp_cache(cpu);
 }
 
 static bool cpu_common_has_work(CPUState *cs)
diff --git a/trace-events b/trace-events
index f8cc35f..5a58a11 100644
--- a/trace-events
+++ b/trace-events
@@ -1244,6 +1244,9 @@  exec_tb_exit(void *next_tb, unsigned int flags) "tb:%p flags=%x"
 
 # translate-all.c
 translate_block(void *tb, uintptr_t pc, uint8_t *tb_code) "tb:%p, pc:0x%x, tb_code:%p"
+tb_flush(void) ""
+tb_flush_jump_cache(uintptr_t pc) "pc:0x%x"
+tb_flush_all_jump_cache(int hits, int misses) "hits:%d misses:%d"
 
 # memory.c
 memory_region_ops_read(void *mr, uint64_t addr, uint64_t value, unsigned size) "mr %p addr %#"PRIx64" value %#"PRIx64" size %u"
diff --git a/translate-all.c b/translate-all.c
index a11c083..8e7bbcc 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -714,12 +714,22 @@  static void page_flush_tb(void)
     }
 }
 
+void tb_flush_all_jmp_cache(CPUState *cpu)
+{
+    trace_tb_flush_all_jump_cache(cpu->tb_jmp_cache_stats.hits,
+                                  cpu->tb_jmp_cache_stats.misses);
+    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
+    memset((void *) &cpu->tb_jmp_cache_stats, 0, sizeof(cpu->tb_jmp_cache_stats));
+}
+
 /* flush all the translation blocks */
 /* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUArchState *env1)
 {
     CPUState *cpu = ENV_GET_CPU(env1);
 
+    trace_tb_flush();
+
 #if defined(DEBUG_FLUSH)
     printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
            (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
@@ -734,7 +744,7 @@  void tb_flush(CPUArchState *env1)
     tcg_ctx.tb_ctx.nb_tbs = 0;
 
     CPU_FOREACH(cpu) {
-        memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
+        tb_flush_all_jmp_cache(cpu);
     }
 
     memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash));
@@ -1520,6 +1530,8 @@  void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr)
     i = tb_jmp_cache_hash_page(addr);
     memset(&cpu->tb_jmp_cache[i], 0,
            TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *));
+
+    trace_tb_flush_jump_cache(addr);
 }
 
 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)