diff mbox

[v4,01/11] tcg: add ability to dump /tmp/perf-<pid>.map files

Message ID 1438593291-27109-2-git-send-email-alex.bennee@linaro.org
State New
Headers show

Commit Message

Alex Bennée Aug. 3, 2015, 9:14 a.m. UTC
This allows the perf tool to map samples to each individual translation
block. This could be expanded for user space but currently it gives
enough information to find any hotblocks by other means.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---

v2:
  - hoist up into translate-all.c
  - don't use pointless glib wrappers
  - use proper format types for portability
  - mark prologue/epilog area
  - rebase

v3:
  - fix bracket for perf-map
  - find an include for the tb_enable_perfmap() declaration
  - checkpatch clean-ups
---
 include/qemu-common.h |  2 ++
 qemu-options.hx       |  9 +++++++++
 translate-all.c       | 26 ++++++++++++++++++++++++++
 vl.c                  |  4 ++++
 4 files changed, 41 insertions(+)

Comments

Alex Bennée Aug. 4, 2015, 7:39 a.m. UTC | #1
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 03/08/2015 11:14, Alex Bennée wrote:
>> This allows the perf tool to map samples to each individual translation
>> block. This could be expanded for user space but currently it gives
>> enough information to find any hotblocks by other means.
>> 
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>
> What happens if you encounter a tb_flush?

At the point of a tb_flush all bets are off as we will re-generate all
the blocks at potentially different locations in the translation buffer.
However for most analysis cases you are unlikely to cause the code
buffer to overflow. Most other uses of tb_flush are the result of
debugging.

I could add a printf when --perfmap is enabled to flag when a flush
happens to signal to the user? I guess some more caveats in the flag
description wouldn't hurt.

We could consider truncating and re-starting the JIT dump at each flush?


>
> Paolo
Alex Bennée Aug. 4, 2015, 12:55 p.m. UTC | #2
Aurelien Jarno <aurelien@aurel32.net> writes:

> On 2015-08-04 08:39, Alex Bennée wrote:
>> 
>> Paolo Bonzini <pbonzini@redhat.com> writes:
>> 
>> > On 03/08/2015 11:14, Alex Bennée wrote:
>> >> This allows the perf tool to map samples to each individual translation
>> >> block. This could be expanded for user space but currently it gives
>> >> enough information to find any hotblocks by other means.
>> >> 
>> >> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> >
>> > What happens if you encounter a tb_flush?
>> 
>> At the point of a tb_flush all bets are off as we will re-generate all
>> the blocks at potentially different locations in the translation buffer.
>> However for most analysis cases you are unlikely to cause the code
>> buffer to overflow. Most other uses of tb_flush are the result of
>> debugging.
>> 
>> I could add a printf when --perfmap is enabled to flag when a flush
>> happens to signal to the user? I guess some more caveats in the flag
>> description wouldn't hurt.
>> 
>> We could consider truncating and re-starting the JIT dump at each flush?
>
> You also need to take care of TB invalidation. When the last
> generated TB is invalidated, the code pointer is rolled back to the
> end of the previous TB. In that case the last entry of the dump
> should be replaced by the new value. If the invalidated TB is not the
> last one, it is just left in the generated code.

Can we only invalidate the previous TB and not any earlier ones?

We could keep the output line until the next TB is generated but then
you would never have a mapping for the last TB generated.
diff mbox

Patch

diff --git a/include/qemu-common.h b/include/qemu-common.h
index fb3da6c..60b87d0 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -382,6 +382,8 @@  typedef struct PCIHostDeviceAddress {
 void tcg_exec_init(unsigned long tb_size);
 bool tcg_enabled(void);
 
+void tb_enable_perfmap(void);
+
 void cpu_exec_init_all(void);
 
 /* CPU save/load.  */
diff --git a/qemu-options.hx b/qemu-options.hx
index 77f5853..ae53346 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3572,6 +3572,15 @@  to the RNG daemon.
 
 ETEXI
 
+DEF("perfmap", 0, QEMU_OPTION_PERFMAP, \
+    "-perfmap        generate a /tmp/perf-${pid}.map file for perf\n",
+    QEMU_ARCH_ALL)
+STEXI
+@item -perfmap
+@findex -perfmap
+This will cause QEMU to generate a map file for Linux perf tools that will allow
+basic profiling information to be broken down into basic blocks.
+ETEXI
 
 HXCOMM This is the last statement. Insert new options before this line!
 STEXI
diff --git a/translate-all.c b/translate-all.c
index 60a3d8b..c05e2a5 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -27,6 +27,7 @@ 
 #include <stdio.h>
 #include <string.h>
 #include <inttypes.h>
+#include <glib.h>
 
 #include "config.h"
 
@@ -133,6 +134,24 @@  static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
                          tb_page_addr_t phys_page2);
 static TranslationBlock *tb_find_pc(uintptr_t tc_ptr);
 
+static FILE *tb_perfmap;
+
+void tb_enable_perfmap(void)
+{
+    gchar *map_file = g_strdup_printf("/tmp/perf-%d.map", getpid());
+    tb_perfmap = fopen(map_file, "w");
+    g_free(map_file);
+}
+
+static void tb_write_perfmap(tcg_insn_unit *start, int size, target_ulong pc)
+{
+    if (tb_perfmap) {
+        fprintf(tb_perfmap,
+                "%"PRIxPTR" %x subject-"TARGET_FMT_lx"\n",
+                (uintptr_t) start, size, pc);
+    }
+}
+
 void cpu_gen_init(void)
 {
     tcg_context_init(&tcg_ctx); 
@@ -190,6 +209,7 @@  int cpu_gen_code(CPUArchState *env, TranslationBlock *tb, int *gen_code_size_ptr
     s->code_out_len += gen_code_size;
 #endif
 
+    tb_write_perfmap(gen_code_buf, gen_code_size, tb->pc);
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) {
         qemu_log("OUT: [size=%d]\n", gen_code_size);
@@ -669,6 +689,12 @@  static inline void code_gen_alloc(size_t tb_size)
             tcg_ctx.code_gen_buffer_size - 1024;
     tcg_ctx.code_gen_buffer_size -= 1024;
 
+    if (tb_perfmap) {
+        fprintf(tb_perfmap,
+                "%"PRIxPTR" %x tcg-prologue-buffer\n",
+                (uintptr_t) tcg_ctx.code_gen_prologue, 1024);
+    }
+
     tcg_ctx.code_gen_buffer_max_size = tcg_ctx.code_gen_buffer_size -
         (TCG_MAX_OP_SIZE * OPC_BUF_SIZE);
     tcg_ctx.code_gen_max_blocks = tcg_ctx.code_gen_buffer_size /
diff --git a/vl.c b/vl.c
index 0adbbd6..1d2de4f 100644
--- a/vl.c
+++ b/vl.c
@@ -122,6 +122,7 @@  int main(int argc, char **argv)
 #include "qapi-event.h"
 #include "exec/semihost.h"
 #include "crypto/init.h"
+#include "qemu-common.h"
 
 #define MAX_VIRTIO_CONSOLES 1
 #define MAX_SCLP_CONSOLES 1
@@ -3348,6 +3349,9 @@  int main(int argc, char **argv, char **envp)
             case QEMU_OPTION_D:
                 log_file = optarg;
                 break;
+            case QEMU_OPTION_PERFMAP:
+                tb_enable_perfmap();
+                break;
             case QEMU_OPTION_s:
                 add_device_config(DEV_GDB, "tcp::" DEFAULT_GDBSTUB_PORT);
                 break;