diff mbox

[perf/core,00/22] perf refcnt debugger API and fixes

Message ID 56697572.90701@huawei.com
State New
Headers show

Commit Message

Wang Nan Dec. 10, 2015, 12:52 p.m. UTC
On 2015/12/10 19:04, 平松雅巳 / HIRAMATU,MASAMI wrote:
>> From: Arnaldo Carvalho de Melo [mailto:acme@kernel.org]

>>

>> Em Wed, Dec 09, 2015 at 11:10:48AM +0900, Masami Hiramatsu escreveu:

>>> Hi Arnaldo,

>>>

>>> Here is a series of patches for perf refcnt debugger and

>>> some fixes.

>>>

>>> In this series I've replaced all atomic reference counters

>>> with the refcnt interface, including dso, map, map_groups,

>>> thread, cpu_map, comm_str, cgroup_sel, thread_map, and

>>> perf_mmap.

>>>

>>>    refcnt debugger (or refcnt leak checker)

>>>    ===============

>>>

>>> At first, note that this change doesn't affect any compiled

>>> code unless building with REFCNT_DEBUG=1 (see macros in

>>> refcnt.h). So, this feature is only enabled in the debug binary.

>>> But before releasing, we can ensure that all objects are safely

>>> reclaimed before exit in -rc phase.

>> That helps and is finding bugs and is really great stuff, thank you!

>>

>> But I wonder if we couldn't get the same results on an unmodified binary

>> by using things like 'perf probe', the BPF code we're introducing, have

>> you thought about this possibility?

> That's possible, but it will require pre-analysis of the binary, because

> refcnt interface is not fixed API like a "systemcall" (moreover, it could

> be just a non-atomic variable).

>

> Thus we need a kind of "annotation" event by source code level.

>

>> I.e. trying to use 'perf probe' to do this would help in using the same

>> technique in other code bases where we can't change the sources, etc.

>>

>> For perf we could perhaps use a 'noinline' on the __get/__put

>> operations, so that we could have the right places to hook using

>> uprobes, other codebases would have to rely in the 'perf probe'

>> infrastructure that knows where inlines were expanded, etc.

>>

>> Such a toold could work like:

>>

>>    perf dbgrefcnt ~/bin/perf thread

> This works only for the binary which is coded as you said.

> I actually doubt that this is universal solution. We'd better introduce

> librefcnt.so if you need more general solution, so that we can fix the

> refcnt API and we can also hook the interface easily.

>

> But with this way, we don't need ebpf/uprobes anymore, since we've already

> have LD_PRELOAD (like valgrind does). :(

>

> So, IMHO, using ebpf and perf probe for this issue sounds like using

> a sledge‐hammer...


But this is an interesting problem and can inspire us the direction
for eBPF improvement. I guess if we can solve this problem with eBPF
we can also solve many similar problems with much lower cost than what
you have done in first 5 patches?

This is what we have done today:

With a much simpler patch which create 4 stub functions:


And a relative complex eBPF script attached at the end of
this mail, with following cmdline:

  # ./perf record -e ./refcnt.c ./perf probe vfs_read
  # cat /sys/kernel/debug/tracing/trace
             ...
             perf-18419 [004] d... 613572.513083: : Type 0 leak 2
             perf-18419 [004] d... 613572.513084: : Type 1 leak 1

I know we have 2 dsos and 1 map get leak.

However I have to analysis full stack trace from 'perf script' to find
which one get leak, because currently my eBPF script is unable to report
which object is leak. I know I can use a hashtable with object address
as key, but currently I don't know how to enumerate keys in a hash table,
except maintaining a relationship between index and object address.
Now I'm waiting for Daniel's persistent map to be enforced for that. When
it ready we can create a tool with the following eBPF script embedded into
perf as a small subcommand, and report call stack of 'alloc' method of
leak object in 'perf report' style, so we can solve similar problem easier.
To make it genereic, we can even make it attach to '{m,c}alloc%return' and
'free', or 'mmap/munmap'.

Thank you.


-------------- eBPF script --------------

typedef int u32;
typedef unsigned long long u64;
#define NULL    ((void *)(0))

#define BPF_ANY         0 /* create new element or update existing */
#define BPF_NOEXIST     1 /* create new element if it didn't exist */
#define BPF_EXIST       2 /* update existing element */

enum bpf_map_type {
         BPF_MAP_TYPE_UNSPEC,
         BPF_MAP_TYPE_HASH,
         BPF_MAP_TYPE_ARRAY,
         BPF_MAP_TYPE_PROG_ARRAY,
         BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

struct bpf_map_def {
         unsigned int type;
         unsigned int key_size;
         unsigned int value_size;
         unsigned int max_entries;
};

#define SEC(NAME) __attribute__((section(NAME), used))
static int (*bpf_probe_read)(void *dst, int size, void *src) =
         (void *)4;
static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
         (void *)6;
static int (*bpf_get_smp_processor_id)(void) =
         (void *)8;
static int (*map_update_elem)(struct bpf_map_def *, void *, void *, 
unsigned long long flags) =
         (void *)2;
static void *(*map_lookup_elem)(struct bpf_map_def *, void *) =
         (void *)1;
static unsigned long long (*get_current_pid_tgid)(void) =
         (void *)14;
static unsigned long long (*get_current_comm)(char *buf, int size_of_buf) =
         (void *)16;

char _license[] SEC("license") = "GPL";
int _version SEC("version") = LINUX_VERSION_CODE;

enum global_var {
         G_pid,
         G_LEAK_START,
         G_dso_leak = G_LEAK_START,
         G_map_group_leak,
         G_LEAK_END,
         G_NR = G_LEAK_END,
};

struct bpf_map_def SEC("maps") global_vars = {
         .type = BPF_MAP_TYPE_ARRAY,
         .key_size = sizeof(int),
         .value_size = sizeof(u64),
         .max_entries = G_NR,
};

static inline int filter_pid(void)
{
         int key_pid = G_pid;
         unsigned long long *p_pid, pid;

         char fmt[] = "%d vs %d\n";

         p_pid = map_lookup_elem(&global_vars, &key_pid);
         if (!p_pid)
                 return 0;

         pid = get_current_pid_tgid() & 0xffffffff;

         if (*p_pid != pid)
                 return 0;
         return 1;
}

static inline void print_leak(int type)
{
         unsigned long long *p_cnt;
         char fmt[] = "Type %d leak %llu\n";

         p_cnt = map_lookup_elem(&global_vars, &type);
         if (!p_cnt)
                 return;
         bpf_trace_printk(fmt, sizeof(fmt), type - G_LEAK_START, *p_cnt);
}

SEC("execve=sys_execve")
int execve(void *ctx)
{
         char name[sizeof(u64)] = "";
         char name_perf[sizeof(u64)] = "perf";
         unsigned long long *p_pid, pid;
         int key = G_pid;

         p_pid = map_lookup_elem(&global_vars, &key);
         if (!p_pid)
                 return 0;
         pid = *p_pid;
         if (pid)
                 return 0;
         if (get_current_comm(name, sizeof(name)))
                 return 0;
         if (*(u32*)name != *(u32*)name_perf)
                 return 0;

         pid = get_current_pid_tgid() & 0xffffffff;
         map_update_elem(&global_vars, &key, &pid, BPF_ANY);
         return 0;
}

static inline int func_exit(void *ctx)
{
         if (!filter_pid())
                 return 0;
         print_leak(G_dso_leak);
         print_leak(G_map_group_leak);
         return 0;
}

SEC("exit_group=sys_exit_group")
int exit_group(void *ctx)
{
         return func_exit(ctx);
}

SEC("exit_=sys_exit")
int exit_(void *ctx)
{
         return func_exit(ctx);
}
static inline void inc_leak_from_type(int type, int n)
{
         u64 *p_cnt, cnt;

         type += G_LEAK_START;
         if (type >= G_LEAK_END)
                 return;

         p_cnt = map_lookup_elem(&global_vars, &type);
         if (!p_cnt)
                 cnt = n;
         else
                 cnt = *p_cnt + n;

         map_update_elem(&global_vars, &type, &cnt, BPF_ANY);
         return;
}

SEC("exec=/home/wangnan/perf;"
     "refcnt_init=__refcnt__init n obj type")
int refcnt_init(void *ctx, int err, int n, void *obj, int type)
{
         if (!filter_pid())
                 return 0;
         inc_leak_from_type(type, n);
         return 0;
}
SEC("exec=/home/wangnan/perf;"
     "refcnt_del=refcnt__delete obj type")
int refcnt_del(void *ctx, int err, void *obj, int type)
{
         if (!filter_pid())
                 return 0;
         return 0;
}

SEC("exec=/home/wangnan/perf;"
     "refcnt_get=__refcnt__get obj type")
int refcnt_get(void *ctx, int err, void *obj, int type)
{
         if (!filter_pid())
                 return 0;
         inc_leak_from_type(type, 1);
         return 0;
}

SEC("exec=/home/wangnan/perf;"
     "refcnt_put=__refcnt__put refcnt obj type")
int refcnt_put(void *ctx, int err, void *refcnt, void *obj, int type)
{
         int old_cnt = -1;

         if (!filter_pid())
                 return 0;
         if (bpf_probe_read(&old_cnt, sizeof(int), refcnt))
                 return 0;
         if (old_cnt)
                 inc_leak_from_type(type, -1);
         return 0;
}


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Comments

Wang Nan Dec. 11, 2015, 1:53 a.m. UTC | #1
On 2015/12/10 23:12, 'Arnaldo Carvalho de Melo' wrote:

[SNIP]
> But this requires having these special refcnt__ routines, that will make

> tools/perf/ code patterns for reference counts look different that the

> refcount patterns in the kernel :-\

>

> And would be a requirement to change the observed workload :-\

>

> Is this _strictly_ required?


No. The requirement should be:

  1. The create/get/put/delete functions are non-inline (because dwarf info
     is not as reliable as symbol);
  2. From their argument list, we can always get the variable we need (the
     pointer of objects, the value of refcnt, etc.)

We don't have to use this refcnt things.

> Can't we, for a project like perf, where we

> know where some refcount (say, the one for 'struct thread') gets

> initialized, use that (thread__new()) and then hook into thread__get and

> thread__put and then use the destructor, thread__delete() as the place

> to dump leaks?

>


I think it is possible. If we can abstract a common pattern about it,
we can provide a perf subcommand which we can deal with generic alloc/free
pattern. I'll put it on my todo-list. Currently we are focusing on perf
daemonization.

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Wang Nan Dec. 11, 2015, 2:22 a.m. UTC | #2
On 2015/12/11 10:08, 平松雅巳 / HIRAMATU,MASAMI wrote:
> From: Wangnan (F) [mailto:wangnan0@huawei.com]

>> On 2015/12/10 23:12, 'Arnaldo Carvalho de Melo' wrote:

>>

>> [SNIP]

>>> But this requires having these special refcnt__ routines, that will make

>>> tools/perf/ code patterns for reference counts look different that the

>>> refcount patterns in the kernel :-\

>>>

>>> And would be a requirement to change the observed workload :-\

>>>

>>> Is this _strictly_ required?

>> No. The requirement should be:

>>

>>   1. The create/get/put/delete functions are non-inline (because dwarf info

>>      is not as reliable as symbol);

>>   2. From their argument list, we can always get the variable we need (the

>>      pointer of objects, the value of refcnt, etc.)

> However, we have to customize it for each application. Perf itself might be OK

> but others might have different implementation.


If limited to pairwise operations ({{m,c}alloc,strdup} vs free, open vs 
close),
I think it is possible to abstract a uniformed pattern.

Thank you.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Wang Nan Dec. 11, 2015, 2:42 a.m. UTC | #3
On 2015/12/11 10:15, 平松雅巳 / HIRAMATU,MASAMI wrote:
> From: 'Arnaldo Carvalho de Melo' [mailto:acme@kernel.org]

>> But this requires having these special refcnt__ routines, that will make

>> tools/perf/ code patterns for reference counts look different that the

>> refcount patterns in the kernel :-\

> BTW, I think even without the refcnt debugger, we'd better introduce this

> kind API to unify the refcnt operation in perf code. As I said, we have many

> miscodings on current implementation. Unifying the API can enforce developers

> to avoid such miscodings.

>

> Thank you,

>


I tried this problem in another way, I'd like to share it here.

First: create two uprobes:

# ./perf probe --exec /home/wangnan/perf dso__new%return %ax
Added new event:
   probe_perf:dso__new  (on dso__new%return in /home/wangnan/perf with %ax)

You can now use it in all perf tools, such as:

     perf record -e probe_perf:dso__new -aR sleep 1

# ./perf probe --exec /home/wangnan/perf dso__delete dso
Added new event:
   probe_perf:dso__delete (on dso__delete in /home/wangnan/perf with dso)

You can now use it in all perf tools, such as:

     perf record -e probe_perf:dso__delete -aR sleep 1


Then start test:

# ./perf record -g -e probe_perf:dso__new -e probe_perf:dso__delete 
./perf probe vfs_read
Added new event:
   probe:vfs_read       (on vfs_read)

You can now use it in all perf tools, such as:

     perf record -e probe:vfs_read -aR sleep 1

[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.048 MB perf.data (178 samples) ]


 From the perf report result I know two dso objects are leak:

90 probe_perf:dso__new `
88 probe_perf:dso__delete


Then convert output to CTF:

$ ./perf data convert --to-ctf ./out.ctf


With a python script, try to find the exact leak objects (I'm not good 
at python,
I believe we can do this with much shorter script):

$ cat refcnt.py
from babeltrace import TraceCollection

tc = TraceCollection();
tc.add_trace('./out.ctf', 'ctf')
objs = {}
for event in tc.events:
     if event.name.startswith('probe_perf:dso__new'):
         if event['arg1'] not in objs:
             objs[event['arg1']] = 1
     if event.name.startswith('probe_perf:dso__delete'):
         if event['dso'] in objs:
             objs[event['dso']] = 0

for x in objs:
     if objs[x] is 0:
         continue
     print("0x%x" % x)

$ python3 ./refcnt.py
0x34cb350
0x34d4640

Then from perf script's result, search for the two address:

perf 23203 [004] 665244.170387: probe_perf:dso__new: (4aaee0 <- 50a5eb) 
arg1=0x34cb350
                   10a5eb dso__load_sym (/home/w00229757/perf)
                    af42d dso__load_vmlinux (/home/w00229757/perf)
                    af58c dso__load_vmlinux_path (/home/w00229757/perf)
                   10c40a open_debuginfo (/home/w00229757/perf)
                   111d39 convert_perf_probe_events (/home/w00229757/perf)
                    603a7 __cmd_probe.isra.3 (/home/w00229757/perf)
                    60a44 cmd_probe (/home/w00229757/perf)
                    7f6d1 run_builtin (/home/w00229757/perf)
                    33056 main (/home/w00229757/perf)
                    21bd5 __libc_start_main 
(/tmp/oxygen_root-w00229757/lib64/libc-2.18.so)

perf 23203 [004] 665244.170679: probe_perf:dso__new: (4aaee0 <- 50a5eb) 
arg1=0x34d4640
                   10a5eb dso__load_sym (/home/w00229757/perf)
                    af42d dso__load_vmlinux (/home/w00229757/perf)
                    af58c dso__load_vmlinux_path (/home/w00229757/perf)
                   10c40a open_debuginfo (/home/w00229757/perf)
                   111d39 convert_perf_probe_events (/home/w00229757/perf)
                    603a7 __cmd_probe.isra.3 (/home/w00229757/perf)
                    60a44 cmd_probe (/home/w00229757/perf)
                    7f6d1 run_builtin (/home/w00229757/perf)
                    33056 main (/home/w00229757/perf)
                    21bd5 __libc_start_main 
(/tmp/oxygen_root-w00229757/lib64/libc-2.18.so)

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Wang Nan Dec. 11, 2015, 2:53 a.m. UTC | #4
On 2015/12/11 10:42, Wangnan (F) wrote:
>

>

> On 2015/12/11 10:15, 平松雅巳 / HIRAMATU,MASAMI wrote:

>> From: 'Arnaldo Carvalho de Melo' [mailto:acme@kernel.org]

>>> But this requires having these special refcnt__ routines, that will 

>>> make

>>> tools/perf/ code patterns for reference counts look different that the

>>> refcount patterns in the kernel :-\

>> BTW, I think even without the refcnt debugger, we'd better introduce 

>> this

>> kind API to unify the refcnt operation in perf code. As I said, we 

>> have many

>> miscodings on current implementation. Unifying the API can enforce 

>> developers

>> to avoid such miscodings.

>>

>> Thank you,

>>

>

> I tried this problem in another way, I'd like to share it here.

>

> First: create two uprobes:

>

> # ./perf probe --exec /home/wangnan/perf dso__new%return %ax

> Added new event:

>   probe_perf:dso__new  (on dso__new%return in /home/wangnan/perf with 

> %ax)

>

> You can now use it in all perf tools, such as:

>

>     perf record -e probe_perf:dso__new -aR sleep 1

>

> # ./perf probe --exec /home/wangnan/perf dso__delete dso

> Added new event:

>   probe_perf:dso__delete (on dso__delete in /home/wangnan/perf with dso)

>

> You can now use it in all perf tools, such as:

>

>     perf record -e probe_perf:dso__delete -aR sleep 1

>

>

> Then start test:

>

> # ./perf record -g -e probe_perf:dso__new -e probe_perf:dso__delete 

> ./perf probe vfs_read

> Added new event:

>   probe:vfs_read       (on vfs_read)

>

> You can now use it in all perf tools, such as:

>

>     perf record -e probe:vfs_read -aR sleep 1

>

> [ perf record: Woken up 1 times to write data ]

> [ perf record: Captured and wrote 0.048 MB perf.data (178 samples) ]

>

>

> From the perf report result I know two dso objects are leak:

>

> 90 probe_perf:dso__new `

> 88 probe_perf:dso__delete

>


The above result is gotten from yesterday's perf/core. I also tried
today's perf/core and get:

90 probe_perf:dso__new `
90 probe_perf:dso__delete

So we fix these leak.

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
diff mbox

Patch

diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 65fef59..2c45478 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -87,6 +87,7 @@  libperf-$(CONFIG_AUXTRACE) += intel-bts.o
  libperf-y += parse-branch-options.o
  libperf-y += parse-regs-options.o
  libperf-y += term.o
+libperf-y += refcnt.o

  libperf-$(CONFIG_LIBBPF) += bpf-loader.o
  libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index e8e9a9d..de52ae8 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1,6 +1,7 @@ 
  #include <asm/bug.h>
  #include <sys/time.h>
  #include <sys/resource.h>
+#include "refcnt.h"
  #include "symbol.h"
  #include "dso.h"
  #include "machine.h"
diff --git a/tools/perf/util/refcnt.c b/tools/perf/util/refcnt.c
new file mode 100644
index 0000000..f5a6659
--- /dev/null
+++ b/tools/perf/util/refcnt.c
@@ -0,0 +1,29 @@ 
+#include <linux/compiler.h>
+#include "util/refcnt.h"
+
+void __attribute__ ((noinline))
+__refcnt__init(atomic_t *refcnt, int n,
+              void *obj __maybe_unused,
+              const char *name __maybe_unused)
+{
+       atomic_set(refcnt, n);
+}
+
+void __attribute__ ((noinline))
+refcnt__delete(void *obj __maybe_unused)
+{
+}
+
+void __attribute__ ((noinline))
+__refcnt__get(atomic_t *refcnt, void *obj __maybe_unused,
+             const char *name __maybe_unused)
+{
+       atomic_inc(refcnt);
+}
+
+int __attribute__ ((noinline))
+__refcnt__put(atomic_t *refcnt, void *obj __maybe_unused,
+             const char *name __maybe_unused)
+{
+       return atomic_dec_and_test(refcnt);
+}
diff --git a/tools/perf/util/refcnt.h b/tools/perf/util/refcnt.h
new file mode 100644
index 0000000..04f5390
--- /dev/null
+++ b/tools/perf/util/refcnt.h
@@ -0,0 +1,21 @@ 
+#ifndef PERF_REFCNT_H
+#define PERF_REFCNT_H
+#include <linux/atomic.h>
+
+void __refcnt__init(atomic_t *refcnt, int n, void *obj, const char *name);
+void refcnt__delete(void *obj);
+void __refcnt__get(atomic_t *refcnt, void *obj, const char *name);
+int __refcnt__put(atomic_t *refcnt, void *obj, const char *name);
+
+#define refcnt__init(obj, member, n)   \
+       __refcnt__init(&(obj)->member, n, obj, #obj)
+#define refcnt__init_as(obj, member, n, name)  \
+       __refcnt__init(&(obj)->member, n, obj, name)
+#define refcnt__exit(obj, member)      \
+       refcnt__delete(obj)
+#define refcnt__get(obj, member)       \
+       __refcnt__get(&(obj)->member, obj, #obj)
+#define refcnt__put(obj, member)       \
+       __refcnt__put(&(obj)->member, obj, #obj)
+
+#endif