
[5.9,RT] net: openvswitch: Fix using smp_processor_id() in preemptible code

Message ID 20201009124759.592550-1-juri.lelli@redhat.com
State New
Series [5.9,RT] net: openvswitch: Fix using smp_processor_id() in preemptible code

Commit Message

Juri Lelli Oct. 9, 2020, 12:47 p.m. UTC
The following BUG has been reported (slightly edited):

 BUG: using smp_processor_id() in preemptible [00000000] code: handler106/3082
 caller is flow_lookup.isra.15+0x2c/0xf0 [openvswitch]
 CPU: 46 PID: 3082 Comm: handler106 Not tainted ... #1
 Hardware name: Dell Inc. PowerEdge R640/06DKY5, BIOS 2.5.4 01/13/2020
 Call Trace:
  dump_stack+0x5c/0x80
  check_preemption_disabled+0xc4/0xd0
  flow_lookup.isra.15+0x2c/0xf0 [openvswitch]
  ovs_flow_tbl_lookup+0x3b/0x60 [openvswitch]
  ovs_flow_cmd_new+0x2d8/0x430 [openvswitch]
  ? __switch_to_asm+0x35/0x70
  ? __switch_to_asm+0x41/0x70
  ? __switch_to_asm+0x35/0x70
  genl_family_rcv_msg+0x1d7/0x410
  ? migrate_enable+0x123/0x3a0
  genl_rcv_msg+0x47/0x8c
  ? __kmalloc_node_track_caller+0xff/0x2e0
  ? genl_family_rcv_msg+0x410/0x410
  netlink_rcv_skb+0x4c/0x120
  genl_rcv+0x24/0x40
  netlink_unicast+0x197/0x230
  netlink_sendmsg+0x204/0x3d0
  sock_sendmsg+0x4c/0x50
  ___sys_sendmsg+0x29f/0x300
  ? migrate_enable+0x123/0x3a0
  ? ep_send_events_proc+0x8a/0x1f0
  ? ep_scan_ready_list.constprop.23+0x237/0x260
  ? rt_spin_unlock+0x23/0x40
  ? ep_poll+0x1b3/0x390
  ? __fget+0x72/0xa0
  __sys_sendmsg+0x57/0xa0
  do_syscall_64+0x87/0x1a0
  entry_SYSCALL_64_after_hwframe+0x65/0xca
 RIP: 0033:0x7f1ed72ccb07
 Code: ...
 RSP: 002b:00007f1ecbd9ba80 EFLAGS: 00003293 ORIG_RAX: 000000000000002e
 RAX: ffffffffffffffda RBX: 000000000000007b RCX: 00007f1ed72ccb07
 RDX: 0000000000000000 RSI: 00007f1ecbd9bb10 RDI: 000000000000007b
 RBP: 00007f1ecbd9bb10 R08: 0000000000000000 R09: 00007f1ecbd9d390
 R10: 0000000019616156 R11: 0000000000003293 R12: 0000000000000000
 R13: 00007f1ecbd9d338 R14: 00007f1ecbd9bfb0 R15: 00007f1ecbd9bb10

This happens because openvswitch/flow_table::flow_lookup() accesses
per-cpu data while being preemptible (and migratable).

Fix it by adding get/put_cpu_light(), so that, even if preempted, the
task executing this code is not migrated (operation is also guarded by
ovs_mutex mutex).

Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
---
 net/openvswitch/flow_table.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
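
For context: get_cpu_light()/put_cpu_light() are PREEMPT_RT-tree helpers, not
mainline API. A minimal sketch of their commonly used definitions there
(assumed; the exact form varies between RT tree versions) shows why they fit
here — they only pin the task to its current CPU, leaving preemption enabled,
unlike get_cpu():

	/* Sketch of the RT-tree helpers (assumed definitions, not mainline API).
	 * Migration is disabled, so this_cpu_ptr() stays stable even if the
	 * task is preempted; unlike get_cpu(), preemption itself stays enabled.
	 */
	#define get_cpu_light()		({ migrate_disable(); smp_processor_id(); })
	#define put_cpu_light()		migrate_enable()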

Comments

Juri Lelli Oct. 9, 2020, 3:01 p.m. UTC | #1
On 09/10/20 14:47, Juri Lelli wrote:
> The following BUG has been reported (slightly edited):
> 
>  BUG: using smp_processor_id() in preemptible [00000000] code: handler106/3082
>  caller is flow_lookup.isra.15+0x2c/0xf0 [openvswitch]
>  CPU: 46 PID: 3082 Comm: handler106 Not tainted ... #1
>  Hardware name: Dell Inc. PowerEdge R640/06DKY5, BIOS 2.5.4 01/13/2020
>  Call Trace:
>   dump_stack+0x5c/0x80
>   check_preemption_disabled+0xc4/0xd0
>   flow_lookup.isra.15+0x2c/0xf0 [openvswitch]
>   ovs_flow_tbl_lookup+0x3b/0x60 [openvswitch]
>   ovs_flow_cmd_new+0x2d8/0x430 [openvswitch]
>   ? __switch_to_asm+0x35/0x70
>   ? __switch_to_asm+0x41/0x70
>   ? __switch_to_asm+0x35/0x70
>   genl_family_rcv_msg+0x1d7/0x410
>   ? migrate_enable+0x123/0x3a0
>   genl_rcv_msg+0x47/0x8c
>   ? __kmalloc_node_track_caller+0xff/0x2e0
>   ? genl_family_rcv_msg+0x410/0x410
>   netlink_rcv_skb+0x4c/0x120
>   genl_rcv+0x24/0x40
>   netlink_unicast+0x197/0x230
>   netlink_sendmsg+0x204/0x3d0
>   sock_sendmsg+0x4c/0x50
>   ___sys_sendmsg+0x29f/0x300
>   ? migrate_enable+0x123/0x3a0
>   ? ep_send_events_proc+0x8a/0x1f0
>   ? ep_scan_ready_list.constprop.23+0x237/0x260
>   ? rt_spin_unlock+0x23/0x40
>   ? ep_poll+0x1b3/0x390
>   ? __fget+0x72/0xa0
>   __sys_sendmsg+0x57/0xa0
>   do_syscall_64+0x87/0x1a0
>   entry_SYSCALL_64_after_hwframe+0x65/0xca
>  RIP: 0033:0x7f1ed72ccb07
>  Code: ...
>  RSP: 002b:00007f1ecbd9ba80 EFLAGS: 00003293 ORIG_RAX: 000000000000002e
>  RAX: ffffffffffffffda RBX: 000000000000007b RCX: 00007f1ed72ccb07
>  RDX: 0000000000000000 RSI: 00007f1ecbd9bb10 RDI: 000000000000007b
>  RBP: 00007f1ecbd9bb10 R08: 0000000000000000 R09: 00007f1ecbd9d390
>  R10: 0000000019616156 R11: 0000000000003293 R12: 0000000000000000
>  R13: 00007f1ecbd9d338 R14: 00007f1ecbd9bfb0 R15: 00007f1ecbd9bb10
> 
> This happens because openvswitch/flow_table::flow_lookup() accesses
> per-cpu data while being preemptible (and migratable).
> 
> Fix it by adding get/put_cpu_light(), so that, even if preempted, the
> task executing this code is not migrated (operation is also guarded by
> ovs_mutex mutex).
> 

This actually also wants a (sorry for missing it in the first place)

Suggested-by: Daniel Bristot de Oliveira <bristot@redhat.com>

> Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
> ---
>  net/openvswitch/flow_table.c | 8 +++++++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
> index e2235849a57e..7df27ef7da09 100644
> --- a/net/openvswitch/flow_table.c
> +++ b/net/openvswitch/flow_table.c
> @@ -732,11 +732,14 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>  				   u32 *n_cache_hit,
>  				   u32 *index)
>  {
> -	u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
> +	u64 *usage_counters;
>  	struct sw_flow *flow;
>  	struct sw_flow_mask *mask;
>  	int i;
>  
> +	get_cpu_light();
> +	usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
> +
>  	if (likely(*index < ma->max)) {
>  		mask = rcu_dereference_ovsl(ma->masks[*index]);
>  		if (mask) {
> @@ -746,6 +749,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>  				usage_counters[*index]++;
>  				u64_stats_update_end(&ma->syncp);
>  				(*n_cache_hit)++;
> +				put_cpu_light();
>  				return flow;
>  			}
>  		}
> @@ -766,10 +770,12 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>  			u64_stats_update_begin(&ma->syncp);
>  			usage_counters[*index]++;
>  			u64_stats_update_end(&ma->syncp);
> +			put_cpu_light();
>  			return flow;
>  		}
>  	}
>  
> +	put_cpu_light();
>  	return NULL;
>  }
>  
> -- 
> 2.26.2
>
Sebastian Andrzej Siewior Oct. 9, 2020, 3:41 p.m. UTC | #2
On 2020-10-09 14:47:59 [+0200], Juri Lelli wrote:
> This happens because openvswitch/flow_table::flow_lookup() accesses
> per-cpu data while being preemptible (and migratable).
>
> Fix it by adding get/put_cpu_light(), so that, even if preempted, the
> task executing this code is not migrated (operation is also guarded by
> ovs_mutex mutex).

This warning is not limited to PREEMPT_RT; it is also present upstream
since commit
   eac87c413bf97 ("net: openvswitch: reorder masks array based on usage")

You should be able to reproduce it there, too.
The path ovs_flow_tbl_lookup() -> flow_lookup() is guarded by ovs_lock().
I can't say that this is true for
   ovs_vport_receive() -> ovs_dp_process_packet() ->
   ovs_flow_tbl_lookup_stats() -> flow_lookup()

(meaning I don't know, but it looks like it is coming from NAPI).

Which means u64_stats_update_begin() could have two writers. This must
not happen.
There are two readers which do u64_stats_fetch_begin_irq(). Disabling
interrupts makes no sense since they perform cross-CPU access.

-> You need to ensure that there is only one writer at a time.

If mask_array gains a spinlock_t for writer protection then you can
acquire the lock prior to grabbing ->masks_usage_cntr. But as of now
there is only one `ma->syncp'.

Sebastian
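
A sketch of the writer-side serialization suggested above — the spinlock_t
member is a hypothetical addition to struct mask_array, not code from this
thread:

	/* Assumed: struct mask_array gains a "spinlock_t stats_lock;" so the
	 * shared ma->syncp only ever has one writer at a time.
	 */
	spin_lock(&ma->stats_lock);
	u64_stats_update_begin(&ma->syncp);
	usage_counters[*index]++;
	u64_stats_update_end(&ma->syncp);
	spin_unlock(&ma->stats_lock);
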
Eelco Chaudron Oct. 12, 2020, 8:14 a.m. UTC | #3
On 9 Oct 2020, at 17:41, Sebastian Andrzej Siewior wrote:

> On 2020-10-09 14:47:59 [+0200], Juri Lelli wrote:
>> This happens because openvswitch/flow_table::flow_lookup() accesses
>> per-cpu data while being preemptible (and migratable).
>>
>> Fix it by adding get/put_cpu_light(), so that, even if preempted, the
>> task executing this code is not migrated (operation is also guarded by
>> ovs_mutex mutex).
>
> This warning is not limited to PREEMPT_RT; it is also present upstream
> since commit
>    eac87c413bf97 ("net: openvswitch: reorder masks array based on usage")
>
> You should be able to reproduce it there, too.
> The path ovs_flow_tbl_lookup() -> flow_lookup() is guarded by ovs_lock().
> I can't say that this is true for
>    ovs_vport_receive() -> ovs_dp_process_packet() ->
>    ovs_flow_tbl_lookup_stats() -> flow_lookup()
>
> (meaning I don't know, but it looks like it is coming from NAPI).
>
> Which means u64_stats_update_begin() could have two writers. This must
> not happen.
> There are two readers which do u64_stats_fetch_begin_irq(). Disabling
> interrupts makes no sense since they perform cross-CPU access.
>
> -> You need to ensure that there is only one writer at a time.
>
> If mask_array gains a spinlock_t for writer protection then you can
> acquire the lock prior to grabbing ->masks_usage_cntr. But as of now
> there is only one `ma->syncp'.

I’m not too familiar with the RT kernel, but in the non-RT kernel this
function is only called from run-to-completion contexts, hence it does
not need a lock. Actually, this was designed in such a way that it does
not need a lock at all.

So maybe this needs a get_cpu() instead of the light variant in the RT
case?

//Eelco
Sebastian Andrzej Siewior Oct. 12, 2020, 8:21 a.m. UTC | #4
On 2020-10-12 10:14:42 [+0200], Eelco Chaudron wrote:
> 
> 
> On 9 Oct 2020, at 17:41, Sebastian Andrzej Siewior wrote:
> 
> > On 2020-10-09 14:47:59 [+0200], Juri Lelli wrote:
> > > This happens because openvswitch/flow_table::flow_lookup() accesses
> > > per-cpu data while being preemptible (and migratable).
> > > 
> > > Fix it by adding get/put_cpu_light(), so that, even if preempted, the
> > > task executing this code is not migrated (operation is also guarded by
> > > ovs_mutex mutex).
> > 
> > This warning is not limited to PREEMPT_RT; it is also present upstream
> > since commit
> >    eac87c413bf97 ("net: openvswitch: reorder masks array based on usage")
> > 
> > You should be able to reproduce it there, too.
> > The path ovs_flow_tbl_lookup() -> flow_lookup() is guarded by ovs_lock().
> > I can't say that this is true for
> >    ovs_vport_receive() -> ovs_dp_process_packet() ->
> >    ovs_flow_tbl_lookup_stats() -> flow_lookup()
> > 
> > (meaning I don't know, but it looks like it is coming from NAPI).
> > 
> > Which means u64_stats_update_begin() could have two writers. This must
> > not happen.
> > There are two readers which do u64_stats_fetch_begin_irq(). Disabling
> > interrupts makes no sense since they perform cross-CPU access.
> > 
> > -> You need to ensure that there is only one writer at a time.
> > 
> > If mask_array gains a spinlock_t for writer protection then you can
> > acquire the lock prior to grabbing ->masks_usage_cntr. But as of now
> > there is only one `ma->syncp'.
> 
> I’m not too familiar with the RT kernel, but in the non-RT kernel this
> function is only called from run-to-completion contexts, hence it does
> not need a lock. Actually, this was designed in such a way that it does
> not need a lock at all.

_no_. As explained above, this is not RT specific.
What guarantees that you don't have two flow_lookup() invocations on the
same CPU? You are using u64_stats_update_begin() which must not be
preempted. This means even if preemption is disabled you must not have
another invocation in BH context. This is due to the
write_seqcount_begin() in u64_stats_update_begin().
If preemption / CPU migration is not a problem in the above part, you
can use an annotation to silence the check that led to the warning. But
the u64_stats invocation still looks problematic.

> So maybe this needs a get_cpu() instead of the light variant in the RT case?

> //Eelco

Sebastian
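
For reference, a sketch of the u64_stats pattern at issue (illustrative only,
not code from the thread). On configurations where u64_stats carries a
seqcount, u64_stats_update_begin() bumps the sequence, so two unserialized
writers on the same ma->syncp — e.g. a second invocation from BH context —
can leave readers retrying on a torn sequence:

	/* Writer side -- must be serialized per syncp: */
	u64_stats_update_begin(&ma->syncp);	/* write_seqcount_begin(): seq becomes odd */
	usage_counters[i]++;
	u64_stats_update_end(&ma->syncp);	/* write_seqcount_end(): seq becomes even */

	/* Reader side -- retries until it observes a stable, even sequence: */
	unsigned int start;
	u64 counter;
	do {
		start = u64_stats_fetch_begin_irq(&ma->syncp);
		counter = usage_counters[i];
	} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
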
Eelco Chaudron Oct. 12, 2020, 1:36 p.m. UTC | #5
On 12 Oct 2020, at 10:21, Sebastian Andrzej Siewior wrote:

> On 2020-10-12 10:14:42 [+0200], Eelco Chaudron wrote:
>>
>>
>> On 9 Oct 2020, at 17:41, Sebastian Andrzej Siewior wrote:
>>
>>> On 2020-10-09 14:47:59 [+0200], Juri Lelli wrote:
>>>> This happens because openvswitch/flow_table::flow_lookup() accesses
>>>> per-cpu data while being preemptible (and migratable).
>>>>
>>>> Fix it by adding get/put_cpu_light(), so that, even if preempted, the
>>>> task executing this code is not migrated (operation is also guarded by
>>>> ovs_mutex mutex).
>>>
>>> This warning is not limited to PREEMPT_RT; it is also present upstream
>>> since commit
>>>    eac87c413bf97 ("net: openvswitch: reorder masks array based on usage")
>>>
>>> You should be able to reproduce it there, too.
>>> The path ovs_flow_tbl_lookup() -> flow_lookup() is guarded by ovs_lock().
>>> I can't say that this is true for
>>>    ovs_vport_receive() -> ovs_dp_process_packet() ->
>>>    ovs_flow_tbl_lookup_stats() -> flow_lookup()
>>>
>>> (meaning I don't know, but it looks like it is coming from NAPI).
>>>
>>> Which means u64_stats_update_begin() could have two writers. This must
>>> not happen.
>>> There are two readers which do u64_stats_fetch_begin_irq(). Disabling
>>> interrupts makes no sense since they perform cross-CPU access.
>>>
>>> -> You need to ensure that there is only one writer at a time.
>>>
>>> If mask_array gains a spinlock_t for writer protection then you can
>>> acquire the lock prior to grabbing ->masks_usage_cntr. But as of now
>>> there is only one `ma->syncp'.
>>
>> I’m not too familiar with the RT kernel, but in the non-RT kernel this
>> function is only called from run-to-completion contexts, hence it does
>> not need a lock. Actually, this was designed in such a way that it does
>> not need a lock at all.
>
> _no_. As explained above, this is not RT specific.
> What guarantees that you don't have two flow_lookup() invocations on the
> same CPU? You are using u64_stats_update_begin() which must not be
> preempted. This means even if preemption is disabled you must not have
> another invocation in BH context. This is due to the
> write_seqcount_begin() in u64_stats_update_begin().
> If preemption / CPU migration is not a problem in the above part, you
> can use an annotation to silence the check that led to the warning. But
> the u64_stats invocation still looks problematic.
>
>> So maybe this needs a get_cpu() instead of the light variant in the RT
>> case?

Hi Sebastian,

I was not reading the splat correctly and thought it was from the NAPI
path, but it looks like it's from the netlink part. I think this could be
fixed with the following patch, so both paths, NAPI and netlink, become
non-preemptible:

--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -850,9 +850,14 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
 	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
 	u32 __always_unused n_mask_hit;
 	u32 __always_unused n_cache_hit;
+	struct sw_flow *flow;
 	u32 index = 0;
 
-	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
+	preempt_disable();
+	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
+	preempt_enable();
+
+	return flow;
 }
 
 struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,

Note that doing this in ovs_flow_tbl_lookup() rather than flow_lookup()
means the fast path, through NAPI, does not have to call
preempt_disable().

Any comments? If not, I can send a proper patch through netdev.

//Eelco

Patch

diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index e2235849a57e..7df27ef7da09 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -732,11 +732,14 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
 				   u32 *n_cache_hit,
 				   u32 *index)
 {
-	u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
+	u64 *usage_counters;
 	struct sw_flow *flow;
 	struct sw_flow_mask *mask;
 	int i;
 
+	get_cpu_light();
+	usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
+
 	if (likely(*index < ma->max)) {
 		mask = rcu_dereference_ovsl(ma->masks[*index]);
 		if (mask) {
@@ -746,6 +749,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
 				usage_counters[*index]++;
 				u64_stats_update_end(&ma->syncp);
 				(*n_cache_hit)++;
+				put_cpu_light();
 				return flow;
 			}
 		}
@@ -766,10 +770,12 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
 			u64_stats_update_begin(&ma->syncp);
 			usage_counters[*index]++;
 			u64_stats_update_end(&ma->syncp);
+			put_cpu_light();
 			return flow;
 		}
 	}
 
+	put_cpu_light();
 	return NULL;
 }