Message ID | 20210324112503.623833-7-elver@google.com |
---|---|
State | New |
Headers | show |
Series | Add support for synchronous signals on perf events | expand |
On Wed, Mar 24, 2021 at 12:24PM +0100, Marco Elver wrote: [...] > diff --git a/kernel/events/core.c b/kernel/events/core.c > index b6434697c516..1e4c949bf75f 100644 > --- a/kernel/events/core.c > +++ b/kernel/events/core.c > @@ -6391,6 +6391,17 @@ void perf_event_wakeup(struct perf_event *event) > } > } > > +static void perf_sigtrap(struct perf_event *event) > +{ > + struct kernel_siginfo info; > + I think we need to add something like this here: diff --git a/kernel/events/core.c b/kernel/events/core.c index 4b82788fbaab..4fcd6b45ce66 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event) { struct kernel_siginfo info; + /* + * This irq_work can race with an exiting task; bail out if sighand has + * already been released in release_task(). + */ + if (!current->sighand) + return; + clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_code = TRAP_PERF; Because syzkaller was able to produce this: | general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN | KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] | CPU: 0 PID: 28393 Comm: kworker/u9:4 Not tainted 5.12.0-rc4+ #5 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 | RIP: 0010:__lock_acquire+0x87/0x5e60 kernel/locking/lockdep.c:4770 | Code: 84 c0 48 89 7c 24 78 0f 85 10 26 00 00 83 3d 53 64 59 0c 00 0f 84 84 41 00 00 83 3d 72 8a 01 0b 00 74 32 48 89 f8 48 c1 e8 03 <80> 3c 30 00 74 19 48 8b 7c 24 78 e8 79 8b 60 00 48 8b 7c 24 78 48 | RSP: 0018:ffffc90000007c00 EFLAGS: 00010006 | RAX: 0000000000000003 RBX: ffff888048058000 RCX: 0000000000000000 | RDX: 0000000000000000 RSI: dffffc0000000000 RDI: 0000000000000018 | RBP: ffffc90000007da8 R08: 0000000000000001 R09: 0000000000000001 | R10: fffffbfff1b6b27e R11: 0000000000000000 R12: 0000000000000001 | R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001 | FS: 
0000000000000000(0000) GS:ffff88802ce00000(0000) knlGS:0000000000000000 | CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 | CR2: 0000000000970004 CR3: 0000000040d91000 CR4: 0000000000750ef0 | DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 | DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600 | PKRU: 55555554 | Call Trace: | <IRQ> | lock_acquire+0x126/0x650 kernel/locking/lockdep.c:5510 | __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] | _raw_spin_lock_irqsave+0x73/0xa0 kernel/locking/spinlock.c:159 | force_sig_info_to_task+0x65/0x3f0 kernel/signal.c:1322 | perf_sigtrap kernel/events/core.c:6418 [inline] | perf_pending_event_disable kernel/events/core.c:6433 [inline] | perf_pending_event+0x46f/0x620 kernel/events/core.c:6475 | irq_work_single kernel/irq_work.c:153 [inline] | irq_work_run_list kernel/irq_work.c:175 [inline] | irq_work_run+0x1da/0x640 kernel/irq_work.c:184 | __sysvec_irq_work+0x62/0x70 arch/x86/kernel/irq_work.c:22 | sysvec_irq_work+0x8c/0xb0 arch/x86/kernel/irq_work.c:17 | </IRQ> | asm_sysvec_irq_work+0x12/0x20 arch/x86/include/asm/idtentry.h:658 | RIP: 0010:__raw_write_unlock_irq include/linux/rwlock_api_smp.h:268 [inline] | RIP: 0010:_raw_write_unlock_irq+0x25/0x40 kernel/locking/spinlock.c:343 | Code: aa fd ff 66 90 53 48 89 fb 48 83 c7 18 48 8b 74 24 08 e8 3e 34 04 f8 48 89 df e8 a6 1a 06 f8 e8 21 85 26 f8 fb bf 01 00 00 00 <e8> 56 19 fa f7 65 8b 05 77 65 a9 76 85 c0 74 02 5b c3 e8 2b c1 a7 | RSP: 0018:ffffc9000202fd68 EFLAGS: 00000286 | RAX: 2a7870700b93e400 RBX: ffffffff8c40a040 RCX: ffffffff8ff9cb03 | RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000001 | RBP: ffff888047b24790 R08: ffffffff817f0f50 R09: fffffbfff1b6b27e | R10: fffffbfff1b6b27e R11: 0000000000000000 R12: ffff888048058000 | R13: dffffc0000000000 R14: ffff888047b24701 R15: ffff888048058000 | release_task+0x10bf/0x1360 kernel/exit.c:220 | exit_notify kernel/exit.c:699 [inline] | do_exit+0x19b0/0x2290 
kernel/exit.c:845 | call_usermodehelper_exec_async+0x39c/0x3a0 kernel/umh.c:123 | ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 > + clear_siginfo(&info); > + info.si_signo = SIGTRAP; > + info.si_code = TRAP_PERF; > + info.si_errno = event->attr.type; > + force_sig_info(&info); > +} > + > static void perf_pending_event_disable(struct perf_event *event) > { > int cpu = READ_ONCE(event->pending_disable); > @@ -6400,6 +6411,13 @@ static void perf_pending_event_disable(struct perf_event *event) > > if (cpu == smp_processor_id()) { > WRITE_ONCE(event->pending_disable, -1); > + > + if (event->attr.sigtrap) { > + atomic_set(&event->event_limit, 1); /* rearm event */ > + perf_sigtrap(event); > + return; > + } > + > perf_event_disable_local(event); > return; > } [...]
On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote: > On Wed, Mar 24, 2021 at 12:24PM +0100, Marco Elver wrote: > [...] > > diff --git a/kernel/events/core.c b/kernel/events/core.c > > index b6434697c516..1e4c949bf75f 100644 > > --- a/kernel/events/core.c > > +++ b/kernel/events/core.c > > @@ -6391,6 +6391,17 @@ void perf_event_wakeup(struct perf_event *event) > > } > > } > > > > +static void perf_sigtrap(struct perf_event *event) > > +{ > > + struct kernel_siginfo info; > > + > > I think we need to add something like this here: > > diff --git a/kernel/events/core.c b/kernel/events/core.c > index 4b82788fbaab..4fcd6b45ce66 100644 > --- a/kernel/events/core.c > +++ b/kernel/events/core.c > @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event) > { > struct kernel_siginfo info; > > + /* > + * This irq_work can race with an exiting task; bail out if sighand has > + * already been released in release_task(). > + */ > + if (!current->sighand) > + return; > + > clear_siginfo(&info); > info.si_signo = SIGTRAP; > info.si_code = TRAP_PERF; > > Urgh.. I'm not entirely sure that check is correct, but I always forget the rules with signal. It could be we ought to be testing PF_EXISTING instead. But also, I think Jiri Olsa was going to poke around here because all of this is broken on PREEMPT_RT. IIRC the plan was to add yet another stage to the construct. So where today we have: <NMI> irq_work_queue() </NMI> ... <IRQ> perf_pending_event() </IRQ> (and we might already have a problem on some architectures where there can be significant time between these due to not having arch_irq_work_raise(), so ideally we ought to double check current in your case) The idea was, I think to add a task_work(), such that we get: <NMI> irq_work_queue() </NMI> ... <IRQ> perf_pending_event() task_work_add() </IRQ> <ret-to-user> run_task_work() ... kill_fasync();
On 03/29, Peter Zijlstra wrote: > > On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote: > > @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event) > > { > > struct kernel_siginfo info; > > > > + /* > > + * This irq_work can race with an exiting task; bail out if sighand has > > + * already been released in release_task(). > > + */ > > + if (!current->sighand) > > + return; This is racy. If "current" has already passed exit_notify(), current->parent can do release_task() and destroy current->sighand right after the check. > Urgh.. I'm not entirely sure that check is correct, but I always forget > the rules with signal. It could be we ought to be testing PF_EXISTING > instead. Agreed, PF_EXISTING check makes more sense in any case, the exiting task can't receive the signal anyway. Oleg.
On Mon, 29 Mar 2021 at 16:27, Oleg Nesterov <oleg@redhat.com> wrote: > On 03/29, Peter Zijlstra wrote: > > > > On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote: > > > @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event) > > > { > > > struct kernel_siginfo info; > > > > > > + /* > > > + * This irq_work can race with an exiting task; bail out if sighand has > > > + * already been released in release_task(). > > > + */ > > > + if (!current->sighand) > > > + return; > > This is racy. If "current" has already passed exit_notify(), current->parent > can do release_task() and destroy current->sighand right after the check. > > > Urgh.. I'm not entirely sure that check is correct, but I always forget > > the rules with signal. It could be we ought to be testing PF_EXISTING > > instead. > > Agreed, PF_EXISTING check makes more sense in any case, the exiting task > can't receive the signal anyway. Thanks for confirming. I'll switch to just checking PF_EXITING (PF_EXISTING does not exist :-)). Thanks, -- Marco
On Mon, 29 Mar 2021 at 16:27, Oleg Nesterov <oleg@redhat.com> wrote: > On 03/29, Peter Zijlstra wrote: > > > > On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote: > > > @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event) > > > { > > > struct kernel_siginfo info; > > > > > > + /* > > > + * This irq_work can race with an exiting task; bail out if sighand has > > > + * already been released in release_task(). > > > + */ > > > + if (!current->sighand) > > > + return; > > This is racy. If "current" has already passed exit_notify(), current->parent > can do release_task() and destroy current->sighand right after the check. > > > Urgh.. I'm not entirely sure that check is correct, but I always forget > > the rules with signal. It could be we ought to be testing PF_EXISTING > > instead. > > Agreed, PF_EXISTING check makes more sense in any case, the exiting task > can't receive the signal anyway. So, per off-list discussion, it appears that I should ask to clarify: PF_EXISTING or PF_EXITING? It appears that PF_EXISTING is what's being suggested, whereas it has not been mentioned anywhere, nor are its semantics clear. If it is not simply the negation of PF_EXITING, what are its semantics? And why do we need it in the case here (instead of something else that already exists)? Thanks, -- Marco
On 03/29, Marco Elver wrote: > > So, per off-list discussion, it appears that I should ask to clarify: > PF_EXISTING or PF_EXITING? Aaaaaaah, sorry Marco. PF_EXITING, of course. Oleg.
On Mon, Mar 29, 2021 at 04:32:18PM +0200, Marco Elver wrote: > On Mon, 29 Mar 2021 at 16:27, Oleg Nesterov <oleg@redhat.com> wrote: > > On 03/29, Peter Zijlstra wrote: > > > > > > On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote: > > > > @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event) > > > > { > > > > struct kernel_siginfo info; > > > > > > > > + /* > > > > + * This irq_work can race with an exiting task; bail out if sighand has > > > > + * already been released in release_task(). > > > > + */ > > > > + if (!current->sighand) > > > > + return; > > > > This is racy. If "current" has already passed exit_notify(), current->parent > > can do release_task() and destroy current->sighand right after the check. > > > > > Urgh.. I'm not entirely sure that check is correct, but I always forget > > > the rules with signal. It could be we ought to be testing PF_EXISTING > > > instead. > > > > Agreed, PF_EXISTING check makes more sense in any case, the exiting task > > can't receive the signal anyway. > > Thanks for confirming. I'll switch to just checking PF_EXITING > (PF_EXISTING does not exist :-)). Indeed! Typing be hard :-)
On Mon, 29 Mar 2021 at 14:07, Peter Zijlstra <peterz@infradead.org> wrote: > (and we might already have a problem on some architectures where there > can be significant time between these due to not having > arch_irq_work_raise(), so ideally we ought to double check current in > your case) I missed this bit -- just to verify: here we want to check that event->ctx->task == current, in case the irq_work runs when the current task has already been replaced. Correct? Thanks, -- Marco
On Wed, Mar 31, 2021 at 02:32:58PM +0200, Marco Elver wrote: > On Mon, 29 Mar 2021 at 14:07, Peter Zijlstra <peterz@infradead.org> wrote: > > > (and we might already have a problem on some architectures where there > > can be significant time between these due to not having > > arch_irq_work_raise(), so ideally we ought to double check current in > > your case) > > I missed this bit -- just to verify: here we want to check that > event->ctx->task == current, in case the the irq_work runs when the > current task has already been replaced. Correct? Yeah, just not sure what a decent failure would be, silent ignore seems undesired, maybe WARN and archs that can trigger it get to fix it ?
On Wed, 31 Mar 2021 at 16:51, Peter Zijlstra <peterz@infradead.org> wrote: > On Wed, Mar 31, 2021 at 02:32:58PM +0200, Marco Elver wrote: > > On Mon, 29 Mar 2021 at 14:07, Peter Zijlstra <peterz@infradead.org> wrote: > > > > > (and we might already have a problem on some architectures where there > > > can be significant time between these due to not having > > > arch_irq_work_raise(), so ideally we ought to double check current in > > > your case) > > > > I missed this bit -- just to verify: here we want to check that > > event->ctx->task == current, in case the the irq_work runs when the > > current task has already been replaced. Correct? > > Yeah, just not sure what a decent failure would be, silent ignore seems > undesired, maybe WARN and archs that can trigger it get to fix it ? I'll go with a WARN and add a comment. This also revealed there should be a requirement that sigtrap events must be associated with a task (syzkaller managed to trigger the warning for cpu events). Thanks, -- Marco
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 8c5b9f5ad63f..3a4dbb1688f0 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -391,7 +391,8 @@ struct perf_event_attr { build_id : 1, /* use build id in mmap2 events */ inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ - __reserved_1 : 27; + sigtrap : 1, /* send synchronous SIGTRAP on event */ + __reserved_1 : 26; union { __u32 wakeup_events; /* wakeup every n events */ diff --git a/kernel/events/core.c b/kernel/events/core.c index b6434697c516..1e4c949bf75f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6391,6 +6391,17 @@ void perf_event_wakeup(struct perf_event *event) } } +static void perf_sigtrap(struct perf_event *event) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_code = TRAP_PERF; + info.si_errno = event->attr.type; + force_sig_info(&info); +} + static void perf_pending_event_disable(struct perf_event *event) { int cpu = READ_ONCE(event->pending_disable); @@ -6400,6 +6411,13 @@ static void perf_pending_event_disable(struct perf_event *event) if (cpu == smp_processor_id()) { WRITE_ONCE(event->pending_disable, -1); + + if (event->attr.sigtrap) { + atomic_set(&event->event_limit, 1); /* rearm event */ + perf_sigtrap(event); + return; + } + perf_event_disable_local(event); return; } @@ -11428,6 +11446,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->state = PERF_EVENT_STATE_INACTIVE; + if (event->attr.sigtrap) + atomic_set(&event->event_limit, 1); + if (task) { event->attach_state = PERF_ATTACH_TASK; /* @@ -11706,6 +11727,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->remove_on_exec && attr->enable_on_exec) return -EINVAL; + if (attr->sigtrap && !attr->remove_on_exec) + return -EINVAL; + out: return ret; @@ -12932,7 +12956,9 @@ 
inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *child_ctx; if (!event->attr.inherit || - (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) { + (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || + /* Do not inherit if sigtrap and signal handlers were cleared. */ + (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) { *inherited_all = 0; return 0; }