diff mbox

arm: perf: Fix userspace call stack walking

Message ID 20151001172643.GA23147@dreric01-gentoo.localdomain
State New
Headers show

Commit Message

Drew Richardson Oct. 1, 2015, 5:26 p.m. UTC
I got some undeliverable responses the first time, sorry if you get this twice

---

The layout of stack frames has changed over time. Testing using a
arm-linux-gnueabi gcc-4.2 from 2007 the original code didn't work but
this new code does. It also works with clang as well as newer versions
of gcc.

gcc has this layout for it's stackframes

caller_fp
caller_lr <- fp

However clang has this layout

caller_fp <- fp
caller_lr

Since the layouts are not compatible use a heuristic to determine for
each stack frame which layout is used.

Signed-off-by: Drew Richardson <drew.richardson@arm.com>
---
 arch/arm/kernel/perf_callchain.c | 86 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 76 insertions(+), 10 deletions(-)

Comments

Russell King - ARM Linux Oct. 1, 2015, 7:10 p.m. UTC | #1
On Thu, Oct 01, 2015 at 10:26:47AM -0700, Drew Richardson wrote:
> The layout of stack frames has changed over time. Testing using a
> arm-linux-gnueabi gcc-4.2 from 2007 the original code didn't work but
> this new code does. It also works with clang as well as newer versions
> of gcc.

Can you point to a modern ARM distribution where perf actually works with
calltraces into userspace?
Drew Richardson Oct. 1, 2015, 7:47 p.m. UTC | #2
On Thu, Oct 01, 2015 at 08:10:41PM +0100, Russell King - ARM Linux wrote:
> On Thu, Oct 01, 2015 at 10:26:47AM -0700, Drew Richardson wrote:
> > The layout of stack frames has changed over time. Testing using a
> > arm-linux-gnueabi gcc-4.2 from 2007 the original code didn't work but
> > this new code does. It also works with clang as well as newer versions
> > of gcc.
> 
> Can you point to a modern ARM distribution where perf actually works with
> calltraces into userspace?

I am not aware of an ARM distribution where it works, that's the
problem. I optimistically said 'The layout of stack frames has changed
over time,' but I couldn't find any case where it worked (including
digging up an ARM compiler from 2007)

This is from 4.3-rc3 on Gentoo using 'perf record -ga ./dhrystone'
then 'perf report -g'.


     1.36%        dhrystone  dhrystone          [.] Func_3                               
                  |
                  --- Func_3
                     |          
                     |--85.61%-- 0x59
                     |          
                      --14.39%-- 0x7ec5d5ac


And this is after the proposed changes


     1.99%        dhrystone  dhrystone           [.] Func_3                           
                  |
                  --- Func_3
                     |          
                     |--87.45%-- cmd_report
                     |          Proc_1
                     |          main
                     |          0x0
                     |          
                      --12.55%-- Proc_1
                                main
                                0x0

The call stack unwinding isn't perfect, for example leaf functions may
not write a stack frame at all, but it's hopefully better than it was.

Drew Richardson
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Will Deacon Oct. 5, 2015, 12:32 p.m. UTC | #3
Hi Drew,

On Thu, Oct 01, 2015 at 06:26:47PM +0100, Drew Richardson wrote:
> I got some undeliverable responses the first time, sorry if you get this twice
> 
> ---
> 
> The layout of stack frames has changed over time. Testing using a
> arm-linux-gnueabi gcc-4.2 from 2007 the original code didn't work but
> this new code does. It also works with clang as well as newer versions
> of gcc.
> 
> gcc has this layout for it's stackframes
> 
> caller_fp
> caller_lr <- fp
> 
> However clang has this layout
> 
> caller_fp <- fp
> caller_lr

Do you have any buy-in from the toolchain people that this won't continue
to change over time? Adding more and more heuristics to walk the stack of
binaries compiled using GCC x.y doesn't really scale...

> Since the layouts are not compatible use a heuristic to determine for
> each stack frame which layout is used.
> 
> Signed-off-by: Drew Richardson <drew.richardson@arm.com>
> ---
>  arch/arm/kernel/perf_callchain.c | 86 +++++++++++++++++++++++++++++++++++-----
>  1 file changed, 76 insertions(+), 10 deletions(-)

Please can you update the compat stack walker in
arch/arm64/kernel/perf_callchain.c too?

Will
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
diff mbox

Patch

diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..99acfe9be9b1 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -13,16 +13,12 @@ 
 
 /*
  * The registers we're interested in are at the end of the variable
- * length saved register structure. The fp points at the end of this
- * structure so the address of this struct is:
- * (struct frame_tail *)(xxx->fp)-1
+ * length saved register structure.
  *
  * This code has been adapted from the ARM OProfile support.
  */
 struct frame_tail {
-	struct frame_tail __user *fp;
-	unsigned long sp;
-	unsigned long lr;
+	void __user *regs[3];
 } __attribute__((packed));
 
 /*
@@ -35,6 +31,8 @@  user_backtrace(struct frame_tail __user *tail,
 {
 	struct frame_tail buftail;
 	unsigned long err;
+	void __user *fp;
+	void __user *lr;
 
 	if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
 		return NULL;
@@ -46,16 +44,84 @@  user_backtrace(struct frame_tail __user *tail,
 	if (err)
 		return NULL;
 
-	perf_callchain_store(entry, buftail.lr);
+	/*
+	 * gcc style stackframes
+	 *
+	 * parent_func <- caller_lr?
+	 * ...
+	 * caller_fp  (regs[0]) <- tail
+	 * caller_lr  (regs[1]) <- fp
+	 * other_data (regs[2])
+	 * ...
+	 * caller_stackframe <- caller_fp
+	 * ...
+	 * parent_func <- caller_lr?
+	 *
+	 * clang style stackframes
+	 *
+	 * parent_func <- caller_lr?
+	 * ...
+	 * other_data (regs[0]) <- tail
+	 * caller_fp  (regs[1]) <- fp
+	 * caller_lr  (regs[2])
+	 * ...
+	 * caller_stackframe <- caller_fp
+	 * ...
+	 * parent_func <- caller_lr?
+	 *
+	 * Is buftail.regs[1] the caller_lr or the caller_fp? Assume
+	 * that the previous function does not exist before the
+	 * previous stackframe (ie, caller_lr < tail || caller_lr >
+	 * caller_fp) and that the stack grows downwards towards
+	 * smaller addresses (ie, fp < caller_fp). In the case of
+	 * ambiguity, assume gcc style stackframes.
+	 *
+	 * (caller_lr < tail < caller_fp) || (tail < caller_fp < caller_lr)
+	 *
+	 * gcc style stackframes
+	 *
+	 * (regs[1] < tail < regs[0]) || (tail < regs[0] < regs[1])
+	 * regs[2] is undefined
+	 *
+	 * regs[0] <    tail =         0
+	 * regs[1] <    tail =    0 || 1
+	 * regs[2] <    tail = undefined
+	 * regs[1] < regs[0] =    0 || 1
+	 * regs[2] < regs[0] = undefined
+	 * regs[2] < regs[1] = undefined
+	 *
+	 * clang style stackframes
+	 *
+	 * (regs[2] < tail < regs[1]) || (tail < regs[1] < regs[2])
+	 * regs[0] is undefined
+	 *
+	 * regs[0] <    tail = undefined
+	 * regs[1] <    tail =         0
+	 * regs[2] <    tail =    0 || 1
+	 * regs[1] < regs[0] = undefined
+	 * regs[2] < regs[0] = undefined
+	 * regs[2] < regs[1] =    0 || 1
+	 */
+	if (buftail.regs[0] > (void __user *)tail) {
+		/* gcc style stackframes */
+		fp = buftail.regs[0];
+		lr = buftail.regs[1];
+	} else {
+		/* clang style stackframes */
+		fp = buftail.regs[1];
+		lr = buftail.regs[2];
+	}
+
+	perf_callchain_store(entry, (uintptr_t)lr);
 
 	/*
 	 * Frame pointers should strictly progress back up the stack
 	 * (towards higher addresses).
 	 */
-	if (tail + 1 >= buftail.fp)
+	if ((void __user *)tail + 4 >= fp)
 		return NULL;
 
-	return buftail.fp - 1;
+	return fp - 4;
 }
 
 void
@@ -73,7 +139,7 @@  perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 	if (!current->mm)
 		return;
 
-	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
+	tail = (struct frame_tail __user *)(regs->ARM_fp - 4);
 
 	while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
 	       tail && !((unsigned long)tail & 0x3))