Message ID: 1444640563-159175-2-git-send-email-xiakaixu@huawei.com
State: New
On Mon, Oct 12, 2015 at 09:02:42AM +0000, Kaixu Xia wrote:
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -483,6 +483,8 @@ struct perf_event {
>  	perf_overflow_handler_t overflow_handler;
>  	void *overflow_handler_context;
>  
> +	atomic_t *sample_disable;
> +
>  #ifdef CONFIG_EVENT_TRACING
>  	struct trace_event_call *tp_event;
>  	struct event_filter *filter;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index b11756f..f6ef45c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
>  		irq_work_queue(&event->pending);
>  	}
>  
> +	if ((event->sample_disable) && atomic_read(event->sample_disable))
> +		return ret;
> +
>  	if (event->overflow_handler)
>  		event->overflow_handler(event, data, regs);
>  	else

Try and guarantee sample_disable lives in the same cacheline as
overflow_handler.

I think we should at the very least replace the kzalloc() currently used
with a cacheline aligned alloc, and check the structure layout to verify
these two do in fact share a cacheline.
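For context, a minimal sketch of what a cacheline-aligned allocation could
look like, assuming the kzalloc() in question is the one in
perf_event_alloc(); the slab cache below is illustrative, not code from
this thread:

	/* set up once, e.g. at perf init time */
	static struct kmem_cache *perf_event_cache;

	perf_event_cache = kmem_cache_create("perf_event",
					     sizeof(struct perf_event),
					     cache_line_size(), /* alignment */
					     SLAB_PANIC, NULL);

	/* the allocation in perf_event_alloc() then becomes: */
	event = kmem_cache_zalloc(perf_event_cache, GFP_KERNEL);

This only guarantees the struct starts on a cacheline boundary; whether
the two fields then share a cacheline still depends on the structure
layout, which is what the layout check further down is about.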
On 2015/10/12 20:02, Peter Zijlstra wrote:
> On Mon, Oct 12, 2015 at 09:02:42AM +0000, Kaixu Xia wrote:
[...]
> Try and guarantee sample_disable lives in the same cacheline as
> overflow_handler.

Could you please explain why we need them to be in the same cacheline?

Thank you.

> I think we should at the very least replace the kzalloc() currently used
> with a cacheline aligned alloc, and check the structure layout to verify
> these two do in fact share a cacheline.
On Mon, Oct 12, 2015 at 08:05:20PM +0800, Wangnan (F) wrote:
> On 2015/10/12 20:02, Peter Zijlstra wrote:
> > On Mon, Oct 12, 2015 at 09:02:42AM +0000, Kaixu Xia wrote:
[...]
> > Try and guarantee sample_disable lives in the same cacheline as
> > overflow_handler.
>
> Could you please explain why we need them to be in the same cacheline?

Because otherwise you've just added a cacheline miss to this relatively
hot path.
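The structure-layout check Peter asks for could be a compile-time
assertion along these lines (illustrative only, using the field names from
this patch; BUILD_BUG_ON() has to sit inside a function):

	/* e.g. in an init function in kernel/events/core.c */
	BUILD_BUG_ON(offsetof(struct perf_event, sample_disable) / L1_CACHE_BYTES !=
		     offsetof(struct perf_event, overflow_handler) / L1_CACHE_BYTES);

Together with a cacheline-aligned allocation, this ensures reading
sample_disable in __perf_event_overflow() touches no cacheline beyond the
one overflow_handler already brings in.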
Hi Kaixu,

[auto build test ERROR on tip/perf/core -- if it's inappropriate base, please suggest rules for selecting the more suitable base]

url:    https://github.com/0day-ci/linux/commits/Kaixu-Xia/bpf-enable-disable-events-stored-in-PERF_EVENT_ARRAY-maps-trace-data-output-when-perf-sampling/20151012-170616
config: m68k-allyesconfig (attached as .config)
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=m68k

All errors (new ones prefixed by >>):

   kernel/bpf/arraymap.c: In function 'perf_event_fd_array_get_ptr':
>> kernel/bpf/arraymap.c:305:7: error: 'struct perf_event' has no member named 'sample_disable'
     event->sample_disable = &map->perf_sample_disable;
          ^

vim +305 kernel/bpf/arraymap.c

   299		if (attr->type != PERF_TYPE_RAW &&
   300		    attr->type != PERF_TYPE_HARDWARE) {
   301			perf_event_release_kernel(event);
   302			return ERR_PTR(-EINVAL);
   303		}
   304	
 > 305		event->sample_disable = &map->perf_sample_disable;
   306		return event;
   307	}
   308	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
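If the failure comes from configurations without CONFIG_PERF_EVENTS (an
assumption; the report does not say), the usual kernel pattern would be an
accessor with a no-op stub; the helper name here is invented for
illustration:

	/* include/linux/perf_event.h */
	#ifdef CONFIG_PERF_EVENTS
	static inline void perf_event_set_sample_disable(struct perf_event *event,
							 atomic_t *flag)
	{
		event->sample_disable = flag;
	}
	#else
	static inline void perf_event_set_sample_disable(struct perf_event *event,
							 atomic_t *flag) { }
	#endif

kernel/bpf/arraymap.c:305 would then call the helper instead of touching
the field directly.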
On 10/12/15 2:02 AM, Kaixu Xia wrote:
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f57d7fe..25e073d 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -39,6 +39,7 @@ struct bpf_map {
>  	u32 max_entries;
>  	const struct bpf_map_ops *ops;
>  	struct work_struct work;
> +	atomic_t perf_sample_disable;
>  };
>  
>  struct bpf_map_type_list {
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 092a0e8..0606d1d 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -483,6 +483,8 @@ struct perf_event {
>  	perf_overflow_handler_t overflow_handler;
>  	void *overflow_handler_context;
>  
> +	atomic_t *sample_disable;

this looks fragile and unnecessary. Why add such a field to generic
bpf_map and carry its pointer into perf_event? A single extra field in
perf_event would have been enough. Even better is to avoid adding any
fields: there is already event->state, why not use that?

Are the proper perf_event_enable/disable so heavy that another mechanism
is needed? cpu_function_call is probably too much to do from a bpf
program, but that can be simplified.

Based on the use case from the cover letter, it sounds like you want
something like soft_disable. Then extending event->state would make the
most sense. Also consider the case of re-entrant event enable/disable,
so inc/dec of a flag may be needed.
On 2015/10/13 3:20, Alexei Starovoitov wrote:
> On 10/12/15 2:02 AM, Kaixu Xia wrote:
[...]
> this looks fragile and unnecessary. Why add such a field to generic
> bpf_map and carry its pointer into perf_event? A single extra field in
> perf_event would have been enough. Even better is to avoid adding any
> fields: there is already event->state, why not use that?
>
> Are the proper perf_event_enable/disable so heavy that another mechanism
> is needed? cpu_function_call is probably too much to do from a bpf
> program, but that can be simplified.
>
> Based on the use case from the cover letter, it sounds like you want
> something like soft_disable. Then extending event->state would make the
> most sense. Also consider the case of re-entrant event enable/disable,
> so inc/dec of a flag may be needed.

Thanks for your comments!

I've tried perf_event_enable/disable, but there is a warning caused by
cpu_function_call. The main reason is as follows:

	int smp_call_function_single(...)
	{
		...
		WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
			     && !oops_in_progress);
		...
	}

So I added the extra atomic flag field in order to avoid this problem.
On 10/12/15 7:30 PM, xiakaixu wrote:
>> Are the proper perf_event_enable/disable so heavy that another mechanism
>> is needed? cpu_function_call is probably too much to do from a bpf
>> program, but that can be simplified.
>> Based on the use case from the cover letter, it sounds like you want
>> something like soft_disable. Then extending event->state would make the
>> most sense. Also consider the case of re-entrant event enable/disable,
>> so inc/dec of a flag may be needed.
> Thanks for your comments!
> I've tried perf_event_enable/disable, but there is a warning caused by
> cpu_function_call. The main reason is as follows:
>	int smp_call_function_single(...)
>	{
>		...
>		WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
>			     && !oops_in_progress);

Of course -- that's what I meant by 'cpu_function_call is too much to do
from a bpf program'. In this case it's running out of a kprobe with
interrupts disabled, so you hit the warning; but even if it were a regular
tracepoint, doing an IPI from bpf would be too much. All bpf helpers must
be deterministic, without such side effects.

> So I added the extra atomic flag field in order to avoid this problem.

That's a hammer approach. There are other ways to do it, like:
- extend event->state with this soft_disable-like functionality (also
  consider the case of re-entrant event enable/disable; inc/dec may be
  needed)
- or tap into event->attr.sample_period: maybe it can be temporarily set
  to zero to indicate soft_disabled.
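To make the soft_disable alternative concrete, a rough sketch -- the field
and helper names are invented here, this is not code from the thread:

	/* in struct perf_event: a counter rather than a pointer into the
	 * map; nonzero means sample output is suppressed */
	atomic_t soft_disable;

	static inline void perf_event_soft_disable(struct perf_event *event)
	{
		atomic_inc(&event->soft_disable);
	}

	static inline void perf_event_soft_enable(struct perf_event *event)
	{
		atomic_dec(&event->soft_disable);
	}

	/* in __perf_event_overflow(): */
	if (atomic_read(&event->soft_disable))
		return ret;

The inc/dec pair is what makes enable/disable re-entrant: nested disables
stack, and output resumes only once every disable has been matched by an
enable.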
Hello.

On 10/12/2015 12:02 PM, Kaixu Xia wrote:
> In some scenarios we don't want to output trace data when sampling, in
> order to reduce overhead. This patch adds the flag sample_disable to
> implement this function. By setting this flag and integrating it with
> eBPF, we can control the data output process and get the samples we are
> most interested in.
>
> Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
[...]
> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> index 29ace10..4ae82c9 100644
> --- a/kernel/bpf/arraymap.c
> +++ b/kernel/bpf/arraymap.c
[...]
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index b11756f..f6ef45c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
>  		irq_work_queue(&event->pending);
>  	}
>  
> +	if ((event->sample_disable) && atomic_read(event->sample_disable))

Inner parens not needed at all.

[...]

MBR, Sergei
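With the parens dropped, the test reads:

	if (event->sample_disable && atomic_read(event->sample_disable))
		return ret;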
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f57d7fe..25e073d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,7 @@ struct bpf_map {
 	u32 max_entries;
 	const struct bpf_map_ops *ops;
 	struct work_struct work;
+	atomic_t perf_sample_disable;
 };
 
 struct bpf_map_type_list {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 092a0e8..0606d1d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -483,6 +483,8 @@ struct perf_event {
 	perf_overflow_handler_t overflow_handler;
 	void *overflow_handler_context;
 
+	atomic_t *sample_disable;
+
 #ifdef CONFIG_EVENT_TRACING
 	struct trace_event_call *tp_event;
 	struct event_filter *filter;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 29ace10..4ae82c9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -51,6 +51,9 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->elem_size = elem_size;
 
+	if (attr->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+		atomic_set(&array->map.perf_sample_disable, 1);
+
 	return &array->map;
 }
 
@@ -298,6 +301,8 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
 		perf_event_release_kernel(event);
 		return ERR_PTR(-EINVAL);
 	}
+
+	event->sample_disable = &map->perf_sample_disable;
 	return event;
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b11756f..f6ef45c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
 		irq_work_queue(&event->pending);
 	}
 
+	if ((event->sample_disable) && atomic_read(event->sample_disable))
+		return ret;
+
 	if (event->overflow_handler)
 		event->overflow_handler(event, data, regs);
 	else
In some scenarios we don't want to output trace data when sampling, in
order to reduce overhead. This patch adds the flag sample_disable to
implement this function. By setting this flag and integrating it with
eBPF, we can control the data output process and get the samples we are
most interested in.

Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
---
 include/linux/bpf.h        | 1 +
 include/linux/perf_event.h | 2 ++
 kernel/bpf/arraymap.c      | 5 +++++
 kernel/events/core.c       | 3 +++
 4 files changed, 11 insertions(+)