
[bpf-next] bpf: use raw_spin_trylock() for pcpu_freelist_push/pop in NMI

Message ID: 20200926000756.893078-1-songliubraving@fb.com
State: New
Series: [bpf-next] bpf: use raw_spin_trylock() for pcpu_freelist_push/pop in NMI

Commit Message

Song Liu Sept. 26, 2020, 12:07 a.m. UTC
Recent improvements in LOCKDEP highlighted a potential A-A deadlock with
pcpu_freelist in NMI:

./tools/testing/selftests/bpf/test_progs -t stacktrace_build_id_nmi

[   18.984807] ================================
[   18.984807] WARNING: inconsistent lock state
[   18.984808] 5.9.0-rc6-01771-g1466de1330e1 #2967 Not tainted
[   18.984809] --------------------------------
[   18.984809] inconsistent {INITIAL USE} -> {IN-NMI} usage.
[   18.984810] test_progs/1990 [HC2[2]:SC0[0]:HE0:SE1] takes:
[   18.984810] ffffe8ffffc219c0 (&head->lock){....}-{2:2}, at:
__pcpu_freelist_pop+0xe3/0x180
[   18.984813] {INITIAL USE} state was registered at:
[   18.984814]   lock_acquire+0x175/0x7c0
[   18.984814]   _raw_spin_lock+0x2c/0x40
[   18.984815]   __pcpu_freelist_pop+0xe3/0x180
[   18.984815]   pcpu_freelist_pop+0x31/0x40
[   18.984816]   htab_map_alloc+0xbbf/0xf40
[   18.984816]   __do_sys_bpf+0x5aa/0x3ed0
[   18.984817]   do_syscall_64+0x2d/0x40
[   18.984818]   entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   18.984818] irq event stamp: 12
[ ... ]
[   18.984822] other info that might help us debug this:
[   18.984823]  Possible unsafe locking scenario:
[   18.984823]
[   18.984824]        CPU0
[   18.984824]        ----
[   18.984824]   lock(&head->lock);
[   18.984826]   <Interrupt>
[   18.984826]     lock(&head->lock);
[   18.984827]
[   18.984828]  *** DEADLOCK ***
[   18.984828]
[   18.984829] 2 locks held by test_progs/1990:
[ ... ]
[   18.984838]  <NMI>
[   18.984838]  dump_stack+0x9a/0xd0
[   18.984839]  lock_acquire+0x5c9/0x7c0
[   18.984839]  ? lock_release+0x6f0/0x6f0
[   18.984840]  ? __pcpu_freelist_pop+0xe3/0x180
[   18.984840]  _raw_spin_lock+0x2c/0x40
[   18.984841]  ? __pcpu_freelist_pop+0xe3/0x180
[   18.984841]  __pcpu_freelist_pop+0xe3/0x180
[   18.984842]  pcpu_freelist_pop+0x17/0x40
[   18.984842]  ? lock_release+0x6f0/0x6f0
[   18.984843]  __bpf_get_stackid+0x534/0xaf0
[   18.984843]  bpf_prog_1fd9e30e1438d3c5_oncpu+0x73/0x350
[   18.984844]  bpf_overflow_handler+0x12f/0x3f0

This is because pcpu_freelist_head.lock is accessed in both NMI and
non-NMI context. Fix this issue by using raw_spin_trylock() in NMI.
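
To make the failure mode concrete, here is a minimal sketch (hypothetical
demo_* names, not the patch itself) of why a plain raw_spin_lock() can
self-deadlock when the NMI lands on the CPU that already holds the lock,
and why raw_spin_trylock() sidesteps it:

/*
 * Minimal sketch of the A-A deadlock and the trylock workaround.
 * demo_head/demo_node are hypothetical stand-ins for
 * pcpu_freelist_head/pcpu_freelist_node.
 */
#include <linux/spinlock.h>

struct demo_node {
	struct demo_node *next;
};

struct demo_head {
	raw_spinlock_t lock;
	struct demo_node *first;
};

/* Task/irq context: plain lock, but an NMI can fire while it is held. */
static void demo_push(struct demo_head *head, struct demo_node *node)
{
	raw_spin_lock(&head->lock);	/* <-- NMI may interrupt here */
	node->next = head->first;
	head->first = node;
	raw_spin_unlock(&head->lock);
}

/*
 * NMI context: raw_spin_lock() would spin forever on the lock the
 * interrupted context holds on this same CPU (the A-A deadlock above).
 * raw_spin_trylock() fails fast so the caller can fall back to another
 * CPU's list or, as below, the extralist.
 */
static bool demo_push_nmi(struct demo_head *head, struct demo_node *node)
{
	if (!raw_spin_trylock(&head->lock))
		return false;	/* held by the context we interrupted */
	node->next = head->first;
	head->first = node;
	raw_spin_unlock(&head->lock);
	return true;
}

The same reasoning applies to _pop(); the difference, handled next, is
what to do when the trylock fails.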

For systems with only one cpu, there is a trickier scenario with
pcpu_freelist_push(): if the only pcpu_freelist_head.lock is already
locked before NMI, raw_spin_trylock() will never succeed. Unlike
_pop(), where we can fail over and return NULL, failing _push() will
leak memory. Fix this issue with an extra list, pcpu_freelist.extralist.
The extralist is primarily used to take the node from _push() when
raw_spin_trylock() fails on all the per cpu lists. It should be empty
most of the time.
The following table summarizes the behavior of pcpu_freelist in NMI
and non-NMI:

non-NMI pop(): 	use _lock(); check per cpu lists first;
                if all per cpu lists are empty, check extralist;
                if extralist is empty, return NULL.

non-NMI push(): use _lock(); only push to per cpu lists.

NMI pop():    use _trylock(); check per cpu lists first;
              if all per cpu lists are locked or empty, check extralist;
              if extralist is locked or empty, return NULL.

NMI push():   use _trylock(); check per cpu lists first;
              if all per cpu lists are locked, try push to extralist;
              if extralist is also locked, keep trying on per cpu lists.
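
Condensed from the patch below (same logic with the diff markers
stripped), the NMI push path is:

/* Condensed from ___pcpu_freelist_push_nmi() in the patch below. */
static void push_nmi(struct pcpu_freelist *s, struct pcpu_freelist_node *node)
{
	int cpu, orig_cpu;

	orig_cpu = cpu = raw_smp_processor_id();
	while (1) {
		struct pcpu_freelist_head *head;

		head = per_cpu_ptr(s->freelist, cpu);
		if (raw_spin_trylock(&head->lock)) {	/* never spins in NMI */
			pcpu_freelist_push_node(head, node);
			raw_spin_unlock(&head->lock);
			return;
		}
		cpu = cpumask_next(cpu, cpu_possible_mask);
		if (cpu >= nr_cpu_ids)
			cpu = 0;

		/* walked all per cpu lists without success: try extralist */
		if (cpu == orig_cpu &&
		    pcpu_freelist_try_push_extra(s, node))
			return;
	}
}

As the table above notes, this loop can keep spinning only while every
per cpu lock and the extralist lock are all held by interrupted
contexts; that corner case is discussed further down the thread.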

Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 kernel/bpf/percpu_freelist.c | 101 +++++++++++++++++++++++++++++++++--
 kernel/bpf/percpu_freelist.h |   1 +
 2 files changed, 97 insertions(+), 5 deletions(-)

Comments

Martin KaFai Lau Oct. 2, 2020, 10:40 p.m. UTC | #1
On Fri, Sep 25, 2020 at 05:07:56PM -0700, Song Liu wrote:
> Recent improvements in LOCKDEP highlighted a potential A-A deadlock with
> pcpu_freelist in NMI:
> 
> [...]
> 
> Fix this issue with an extra list, pcpu_freelist.extralist. The
> extralist is primarily used to take the node from _push() when
> raw_spin_trylock() fails on all the per cpu lists. It should be empty
> most of the time.
It is tricky.  LGTM.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Daniel Borkmann Oct. 2, 2020, 11:09 p.m. UTC | #2
On 9/26/20 2:07 AM, Song Liu wrote:
> Recent improvements in LOCKDEP highlighted a potential A-A deadlock with
> pcpu_freelist in NMI:
> 
> [...]
> 
> The following table summarizes the behavior of pcpu_freelist in NMI
> and non-NMI:
> 
> non-NMI pop():  use _lock(); check per cpu lists first;
>                 if all per cpu lists are empty, check extralist;
>                 if extralist is empty, return NULL.
> 
> non-NMI push(): use _lock(); only push to per cpu lists.
> 
> NMI pop():    use _trylock(); check per cpu lists first;
>               if all per cpu lists are locked or empty, check extralist;
>               if extralist is locked or empty, return NULL.
> 
> NMI push():   use _trylock(); check per cpu lists first;
>               if all per cpu lists are locked, try push to extralist;
>               if extralist is also locked, keep trying on per cpu lists.

Code looks reasonable to me, is there any practical benefit to keep the
extra list around for >1 CPU case (and not just compile it out)? For
example, we could choose a different back end *_freelist_push/pop()
implementation depending on CONFIG_SMP like ...

ifeq ($(CONFIG_SMP),y)
obj-$(CONFIG_BPF_SYSCALL) += percpu_freelist.o
else
obj-$(CONFIG_BPF_SYSCALL) += onecpu_freelist.o
endif

... and keep the CONFIG_SMP simplified in that we'd only do the trylock
iteration over CPUs under NMI with pop aborting with NULL in worst case
and push keep looping, whereas for the single CPU case, all the logic
resides in onecpu_freelist.c and it has a simpler two list implementation?

Thanks,
Daniel
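
For illustration only, a two-list single-CPU back end along these lines
might look like the sketch below; onecpu_freelist and everything in it
is hypothetical, not existing kernel code.

/*
 * Hypothetical onecpu_freelist sketch -- no such file exists in the
 * tree; names and layout are illustrative only.  One "main" list plus
 * one NMI side list replace the per cpu lists + extralist.  The key
 * invariant: non-NMI code never holds both locks at once, so an NMI
 * push always finds at least one of the two trylocks available.
 */
#include <linux/spinlock.h>
#include <linux/preempt.h>		/* in_nmi() */
#include "percpu_freelist.h"		/* struct pcpu_freelist_node */

struct onecpu_freelist {
	raw_spinlock_t lock;		/* protects main */
	raw_spinlock_t nmi_lock;	/* protects nmi */
	struct pcpu_freelist_node *main;
	struct pcpu_freelist_node *nmi;
};

static void onecpu_freelist_push(struct onecpu_freelist *s,
				 struct pcpu_freelist_node *node)
{
	if (!in_nmi()) {
		raw_spin_lock(&s->lock);
		node->next = s->main;
		s->main = node;
		raw_spin_unlock(&s->lock);
		return;
	}
	/* NMI: the interrupted context holds at most one of the locks. */
	while (1) {
		if (raw_spin_trylock(&s->lock)) {
			node->next = s->main;
			s->main = node;
			raw_spin_unlock(&s->lock);
			return;
		}
		if (raw_spin_trylock(&s->nmi_lock)) {
			node->next = s->nmi;
			s->nmi = node;
			raw_spin_unlock(&s->nmi_lock);
			return;
		}
	}
}

pop() would mirror this: take ->lock (or, in NMI, trylock it) for the
main list, then fall back to draining the nmi side list under
->nmi_lock. Whether that is simpler than reusing the SMP code is the
trade-off Song weighs in his reply below.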
Song Liu Oct. 3, 2020, 12:06 a.m. UTC | #3
> On Oct 2, 2020, at 4:09 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> 
> On 9/26/20 2:07 AM, Song Liu wrote:
>> [...]
> 
> Code looks reasonable to me, is there any practical benefit to keep the
> extra list around for >1 CPU case (and not just compile it out)? For
> example, we could choose a different back end *_freelist_push/pop()
> implementation depending on CONFIG_SMP like ...
> 
> ifeq ($(CONFIG_SMP),y)
> obj-$(CONFIG_BPF_SYSCALL) += percpu_freelist.o
> else
> obj-$(CONFIG_BPF_SYSCALL) += onecpu_freelist.o
> endif
> 
> ... and keep the CONFIG_SMP simplified in that we'd only do the trylock
> iteration over CPUs under NMI with pop aborting with NULL in worst case
> and push keep looping, whereas for the single CPU case, all the logic
> resides in onecpu_freelist.c and it has a simpler two list implementation?

Technically, it is possible to have a similar deadlock in SMP. For N cpus,
there could be N NMIs at the same time, and they may block on N raw
spinlocks held in non-NMI context, and then all these NMI push() would
spin forever. Of course, this is almost impossible to trigger with a
decent N.

On the other hand, I feel the current code doesn't add too much complexity
to the SMP case. Maintaining two copies may require more work down the road.
If we find the current version too complex for SMP, we can do the split in
the future.

Does this make sense?

Thanks,
Song
Daniel Borkmann Oct. 5, 2020, 2:14 p.m. UTC | #4
On 10/3/20 2:06 AM, Song Liu wrote:
>> On Oct 2, 2020, at 4:09 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 9/26/20 2:07 AM, Song Liu wrote:
>>> [...]
>>
>> Code looks reasonable to me, is there any practical benefit to keep the
>> extra list around for >1 CPU case (and not just compile it out)? [...]
> 
> Technically, it is possible to have a similar deadlock in SMP. For N cpus,
> there could be N NMIs at the same time, and they may block on N raw
> spinlocks held in non-NMI context, and then all these NMI push() would
> spin forever. Of course, this is almost impossible to trigger with a
> decent N.
> 
> On the other hand, I feel the current code doesn't add too much complexity
> to the SMP case. Maintaining two copies may require more work down the road.
> If we find the current version too complex for SMP, we can do the split in
> the future.
> 
> Does this make sense?

Hm, makes sense that technically this could happen also on SMP, though unlikely;
in that case however we'd also need to correct the commit description a bit, since
it only mentions this for the single CPU case (where it will realistically happen, but
we should state your above explanation there too so we'll later have full context
in git history on why it was done this way also for SMP).

Thanks,
Daniel
Song Liu Oct. 5, 2020, 4:33 p.m. UTC | #5
> On Oct 5, 2020, at 7:14 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> 
> On 10/3/20 2:06 AM, Song Liu wrote:
>> [...]
> 
> Hm, makes sense that technically this could happen also on SMP, though unlikely;
> in that case however we'd also need to correct the commit description a bit, since
> it only mentions this for the single CPU case (where it will realistically happen, but
> we should state your above explanation there too so we'll later have full context
> in git history on why it was done this way also for SMP).

Thanks for the feedback. I will revise the commit log and send v2. 

Song

Patch

diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index b367430e611c7..3d897de890612 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
 		raw_spin_lock_init(&head->lock);
 		head->first = NULL;
 	}
+	raw_spin_lock_init(&s->extralist.lock);
+	s->extralist.first = NULL;
 	return 0;
 }
 
@@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
 	raw_spin_unlock(&head->lock);
 }
 
-void __pcpu_freelist_push(struct pcpu_freelist *s,
+static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
+						struct pcpu_freelist_node *node)
+{
+	if (!raw_spin_trylock(&s->extralist.lock))
+		return false;
+
+	pcpu_freelist_push_node(&s->extralist, node);
+	raw_spin_unlock(&s->extralist.lock);
+	return true;
+}
+
+static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
 					     struct pcpu_freelist_node *node)
 {
-	struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
+	int cpu, orig_cpu;
+
+	orig_cpu = cpu = raw_smp_processor_id();
+	while (1) {
+		struct pcpu_freelist_head *head;
+
+		head = per_cpu_ptr(s->freelist, cpu);
+		if (raw_spin_trylock(&head->lock)) {
+			pcpu_freelist_push_node(head, node);
+			raw_spin_unlock(&head->lock);
+			return;
+		}
+		cpu = cpumask_next(cpu, cpu_possible_mask);
+		if (cpu >= nr_cpu_ids)
+			cpu = 0;
 
-	___pcpu_freelist_push(head, node);
+		/* cannot lock any per cpu lock, try extralist */
+		if (cpu == orig_cpu &&
+		    pcpu_freelist_try_push_extra(s, node))
+			return;
+	}
+}
+
+void __pcpu_freelist_push(struct pcpu_freelist *s,
+			struct pcpu_freelist_node *node)
+{
+	if (in_nmi())
+		___pcpu_freelist_push_nmi(s, node);
+	else
+		___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
 }
 
 void pcpu_freelist_push(struct pcpu_freelist *s,
@@ -81,7 +121,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 	}
 }
 
-struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 {
 	struct pcpu_freelist_head *head;
 	struct pcpu_freelist_node *node;
@@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
 		if (cpu == orig_cpu)
-			return NULL;
+			break;
+	}
+
+	/* per cpu lists are all empty, try extralist */
+	raw_spin_lock(&s->extralist.lock);
+	node = s->extralist.first;
+	if (node)
+		s->extralist.first = node->next;
+	raw_spin_unlock(&s->extralist.lock);
+	return node;
+}
+
+static struct pcpu_freelist_node *
+___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
+{
+	struct pcpu_freelist_head *head;
+	struct pcpu_freelist_node *node;
+	int orig_cpu, cpu;
+
+	orig_cpu = cpu = raw_smp_processor_id();
+	while (1) {
+		head = per_cpu_ptr(s->freelist, cpu);
+		if (raw_spin_trylock(&head->lock)) {
+			node = head->first;
+			if (node) {
+				head->first = node->next;
+				raw_spin_unlock(&head->lock);
+				return node;
 			}
+			raw_spin_unlock(&head->lock);
+		}
+		cpu = cpumask_next(cpu, cpu_possible_mask);
+		if (cpu >= nr_cpu_ids)
+			cpu = 0;
+		if (cpu == orig_cpu)
+			break;
+	}
+
+	/* cannot pop from per cpu lists, try extralist */
+	if (!raw_spin_trylock(&s->extralist.lock))
+		return NULL;
+	node = s->extralist.first;
+	if (node)
+		s->extralist.first = node->next;
+	raw_spin_unlock(&s->extralist.lock);
+	return node;
+}
+
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+	if (in_nmi())
+		return ___pcpu_freelist_pop_nmi(s);
+	return ___pcpu_freelist_pop(s);
 }
 
 struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index fbf8a8a289791..3c76553cfe571 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -13,6 +13,7 @@ struct pcpu_freelist_head {
 
 struct pcpu_freelist {
 	struct pcpu_freelist_head __percpu *freelist;
+	struct pcpu_freelist_head extralist;
 };
 
 struct pcpu_freelist_node {