
[net,v2] net: openvswitch: fix to make sure flow_lookup() is not preempted

Message ID 160275519174.566500.6537031776378218151.stgit@ebuild
State Superseded
Series [net,v2] net: openvswitch: fix to make sure flow_lookup() is not preempted

Commit Message

Eelco Chaudron Oct. 15, 2020, 9:46 a.m. UTC
The flow_lookup() function uses per-CPU variables, which must not be
accessed while preemption is enabled. This is fine in the general NAPI
use case, where the local BH is disabled, but the function is also called
in the netlink context, which is preemptible. The patch below makes sure
that preemption is also disabled in the netlink path.

In addition, the u64_stats_update_begin() sync point was not protected;
making the sync point part of the per-CPU variable fixes this.

Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on usage")
Reported-by: Juri Lelli <jlelli@redhat.com>
Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
v2: - Add u64_stats_update_begin() sync point protection
    - Moved patch to net from net-next branch

 net/openvswitch/flow_table.c |   56 +++++++++++++++++++++++++-----------------
 net/openvswitch/flow_table.h |    8 +++++-
 2 files changed, 39 insertions(+), 25 deletions(-)
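
As background for the change below, here is a minimal sketch of the per-CPU
statistics pattern the patch converts to. The names (pcpu_hit_stats,
hit_stats, record_hit, sum_hits) are made up for illustration and do not
appear in the patch; only the u64_stats_sync/per-CPU API calls are the ones
the patch actually uses. The writer must run with BH disabled so that there
is only ever one writer per CPU for a given syncp, while readers may run in
any context and simply retry if they race with a writer.

#include <linux/percpu.h>
#include <linux/u64_stats_sync.h>

/* Illustrative sketch only -- not part of the patch.  hit_stats would be
 * set up once with alloc_percpu(struct pcpu_hit_stats) and each CPU's
 * syncp initialised with u64_stats_init().
 */
struct pcpu_hit_stats {
	struct u64_stats_sync syncp;
	u64 hits;
};

static struct pcpu_hit_stats __percpu *hit_stats;

/* Writer side: caller must have BH (and therefore preemption) disabled,
 * e.g. NAPI context or an explicit local_bh_disable() section.
 */
static void record_hit(void)
{
	struct pcpu_hit_stats *s = this_cpu_ptr(hit_stats);

	u64_stats_update_begin(&s->syncp);
	s->hits++;
	u64_stats_update_end(&s->syncp);
}

/* Reader side: may run in any context; the fetch/retry loop guarantees a
 * consistent 64bit value even on 32bit kernels.
 */
static u64 sum_hits(void)
{
	u64 total = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct pcpu_hit_stats *s = per_cpu_ptr(hit_stats, cpu);
		unsigned int start;
		u64 val;

		do {
			start = u64_stats_fetch_begin_irq(&s->syncp);
			val = s->hits;
		} while (u64_stats_fetch_retry_irq(&s->syncp, start));

		total += val;
	}
	return total;
}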

Comments

Ilya Maximets Oct. 15, 2020, 10:27 a.m. UTC | #1
On 10/15/20 11:46 AM, Eelco Chaudron wrote:
> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
>  	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
>  	u32 __always_unused n_mask_hit;
>  	u32 __always_unused n_cache_hit;
> +	struct sw_flow *flow;
>  	u32 index = 0;
>  
> -	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
> +	/* This function gets called trough the netlink interface and therefore
> +	 * is preemptible. However, flow_lookup() function needs to be called
> +	 * with preemption disabled due to CPU specific variables.

Is it possible to add some kind of assertion inside flow_lookup() to avoid
this kind of issues in the future?

It might be also good to update the comment for flow_lookup() function itself.

> +	 */
> +	local_bh_disable();
> +	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
> +	local_bh_enable();
> +	return flow;
>  }
>  
Eelco Chaudron Oct. 15, 2020, 10:54 a.m. UTC | #2
On 15 Oct 2020, at 12:27, Ilya Maximets wrote:

> On 10/15/20 11:46 AM, Eelco Chaudron wrote:
>> +	/* This function gets called trough the netlink interface and therefore
>> +	 * is preemptible. However, flow_lookup() function needs to be called
>> +	 * with preemption disabled due to CPU specific variables.
>
> Is it possible to add some kind of assertion inside flow_lookup() to avoid
> this kind of issues in the future?

We could do something like WARN_ON_ONCE(preemptible()) but do not think 
such check should be added to the fast path.

> It might be also good to update the comment for flow_lookup() function itself.

Good idea, will do this in a v3

>> +	 */
>> +	local_bh_disable();
>> +	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, 
>> &index);
>> +	local_bh_enable();
>> +	return flow;
>>  }
>>
>>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
>> @@ -1109,7 +1118,6 @@ void ovs_flow_masks_rebalance(struct flow_table 
>> *table)
>>
>>  	for (i = 0; i < ma->max; i++)  {
>>  		struct sw_flow_mask *mask;
>> -		unsigned int start;
>>  		int cpu;
>>
>>  		mask = rcu_dereference_ovsl(ma->masks[i]);
>> @@ -1120,14 +1128,16 @@ void ovs_flow_masks_rebalance(struct 
>> flow_table *table)
>>  		masks_and_count[i].counter = 0;
>>
>>  		for_each_possible_cpu(cpu) {
>> -			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
>> -							  cpu);
>> +			struct mask_array_stats *stats;
>> +			unsigned int start;
>>  			u64 counter;
>>
>> +			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>>  			do {
>> -				start = u64_stats_fetch_begin_irq(&ma->syncp);
>> -				counter = usage_counters[i];
>> -			} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
>> +				start = u64_stats_fetch_begin_irq(&stats->syncp);
>> +				counter = stats->usage_cntrs[i];
>> +			} while (u64_stats_fetch_retry_irq(&stats->syncp,
>> +							   start));
>>
>>  			masks_and_count[i].counter += counter;
>>  		}
>> diff --git a/net/openvswitch/flow_table.h 
>> b/net/openvswitch/flow_table.h
>> index 6e7d4ac59353..43144396e192 100644
>> --- a/net/openvswitch/flow_table.h
>> +++ b/net/openvswitch/flow_table.h
>> @@ -38,12 +38,16 @@ struct mask_count {
>>  	u64 counter;
>>  };
>>
>> +struct mask_array_stats {
>> +	struct u64_stats_sync syncp;
>> +	u64 usage_cntrs[];
>> +};
>> +
>>  struct mask_array {
>>  	struct rcu_head rcu;
>>  	int count, max;
>> -	u64 __percpu *masks_usage_cntr;
>> +	struct mask_array_stats __percpu *masks_usage_stats;
>>  	u64 *masks_usage_zero_cntr;
>> -	struct u64_stats_sync syncp;
>>  	struct sw_flow_mask __rcu *masks[];
>>  };
>>
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
Ilya Maximets Oct. 15, 2020, 11:04 a.m. UTC | #3
On 10/15/20 12:54 PM, Eelco Chaudron wrote:
> 
> 
> On 15 Oct 2020, at 12:27, Ilya Maximets wrote:
> 
>> On 10/15/20 11:46 AM, Eelco Chaudron wrote:
>>> +    /* This function gets called trough the netlink interface and therefore
>>> +     * is preemptible. However, flow_lookup() function needs to be called
>>> +     * with preemption disabled due to CPU specific variables.
>>
>> Is it possible to add some kind of assertion inside flow_lookup() to avoid
>> this kind of issues in the future?
> 
> We could do something like WARN_ON_ONCE(preemptible()) but do not think such check should be added to the fast path.

I meant something like lockdep_assert_preemption_disabled().  This will not
impact fast path if CONFIG_PROVE_LOCKING disabled, but will allow to catch
issues during development.
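
Purely as an illustration of that suggestion (a sketch, not part of the
posted patch), the annotation could sit at the top of flow_lookup(), right
after the local variable declarations:

	/* Sketch only: lockdep_assert_preemption_disabled() compiles to
	 * nothing unless lockdep (CONFIG_PROVE_LOCKING) is enabled, so the
	 * fast path is unaffected in production builds.  With lockdep it
	 * complains if a caller reaches this point without BH/preemption
	 * disabled.
	 */
	lockdep_assert_preemption_disabled();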

> 
>> It might be also good to update the comment for flow_lookup() function itself.
> 
> Good idea, will do this in a v3
> 
Ilya Maximets Oct. 15, 2020, 11:22 a.m. UTC | #4
On 10/15/20 1:04 PM, Ilya Maximets wrote:
> On 10/15/20 12:54 PM, Eelco Chaudron wrote:
>>
>>
>> On 15 Oct 2020, at 12:27, Ilya Maximets wrote:
>>
>>>
>>> Is it possible to add some kind of assertion inside flow_lookup() to avoid
>>> this kind of issues in the future?
>>
>> We could do something like WARN_ON_ONCE(preemptible()) but do not think such check should be added to the fast path.
> 
> I meant something like lockdep_assert_preemption_disabled().  This will not
> impact fast path if CONFIG_PROVE_LOCKING disabled, but will allow to catch
> issues during development.

To be clear, I just mean that this could be compiled conditionally under some
debugging config.  Do not know which of existing configs might be used, though. 
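
For illustration only, "compiled conditionally under some debugging config"
could look like the sketch below, using the WARN_ON_ONCE(preemptible())
check mentioned earlier in the thread.  CONFIG_DEBUG_MISC is just an
arbitrary example of such an option, not a recommendation (and, as noted
later in the thread, no existing Kconfig option is an obvious fit):

#ifdef CONFIG_DEBUG_MISC
	/* Debug builds only: flow_lookup() relies on per-CPU data and the
	 * per-CPU syncp, so it must not run with BH/preemption enabled.
	 */
	WARN_ON_ONCE(preemptible());
#endif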

> 
>>
>>> It might be also good to update the comment for flow_lookup() function itself.
>>
>> Good idea, will do this in a v3
>>
Sebastian Andrzej Siewior Oct. 15, 2020, 12:34 p.m. UTC | #5
On 2020-10-15 11:46:53 [+0200], Eelco Chaudron wrote:
> The flow_lookup() function uses per CPU variables, which must not be
> preempted. However, this is fine in the general napi use case where
> the local BH is disabled. But, it's also called in the netlink
> context, which is preemptible. The below patch makes sure that even
> in the netlink path, preemption is disabled.

I would suggest rephrasing it: the term preemption usually means
preempt_disable(). A preempt-disabled section can still be preempted /
interrupted by hardirq and softirq. The latter is mentioned and I think
is confusing.

> In addition, the u64_stats_update_begin() sync point was not protected,
> making the sync point part of the per CPU variable fixed this.

I would rephrase it and mention the key details:
u64_stats_update_begin() requires a lock to ensure one writer which is
not ensured here. Making it per-CPU and disabling NAPI (softirq) ensures
that there is always only one writer.

Regarding the annotations which were mentioned here in the thread:
basically, the this_cpu_ptr() warning worked as expected and got us here.
I don't think it is wise to add an annotation that is detached from the
actual problem, like assert_the_softirq_is_switched_off() in flow_lookup().
Such an assert may become obsolete once the reason for it is removed, get
overlooked, and remain in the code. The commits

	c60c32a577561 ("posix-cpu-timers: Remove lockdep_assert_irqs_disabled()")
	f9dae5554aed4 ("dpaa2-eth: Remove preempt_disable() from seed_pool()")

are just two examples which came to mind while writing this.

Instead I would prefer a lockdep annotation in u64_stats_update_begin(),
which is also around in 64bit kernels, that complains if it is seen with
BH enabled while also being observed in serving-softirq context.
PeterZ, wasn't this mentioned before?

> --- a/net/openvswitch/flow_table.c
> +++ b/net/openvswitch/flow_table.c
> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
>  	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
>  	u32 __always_unused n_mask_hit;
>  	u32 __always_unused n_cache_hit;
> +	struct sw_flow *flow;
>  	u32 index = 0;
>  
> -	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
> +	/* This function gets called trough the netlink interface and therefore
> +	 * is preemptible. However, flow_lookup() function needs to be called
> +	 * with preemption disabled due to CPU specific variables.

preemption vs BH.

> +	 */
> +	local_bh_disable();
> +	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
> +	local_bh_enable();
> +	return flow;
>  }
>  
>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,

Otherwise it looks good.

Sebastian
Eelco Chaudron Oct. 15, 2020, 12:42 p.m. UTC | #6
On 15 Oct 2020, at 13:22, Ilya Maximets wrote:

> On 10/15/20 1:04 PM, Ilya Maximets wrote:
>> On 10/15/20 12:54 PM, Eelco Chaudron wrote:
>>>
>>>
>>> On 15 Oct 2020, at 12:27, Ilya Maximets wrote:
>>>
>>>> Is it possible to add some kind of assertion inside flow_lookup() to avoid
>>>> this kind of issues in the future?
>>>
>>> We could do something like WARN_ON_ONCE(preemptible()) but do not think
>>> such check should be added to the fast path.
>>
>> I meant something like lockdep_assert_preemption_disabled().  This will not
>> impact fast path if CONFIG_PROVE_LOCKING disabled, but will allow to catch
>> issues during development.
>
> To be clear, I just mean that this could be compiled conditionally under some
> debugging config.  Do not know which of existing configs might be used, though.

Looked at the debug flags, but there is not really one that makes sense
to use. Also, using a general one like DEBUG_MISC will probably not help,
as no one would probably run the OVS tests with this flag enabled. So for
now I would leave it as is unless someone has a better idea…

>>>
>>>> It might be also good to update the comment for flow_lookup() function itself.
>>>
>>> Good idea, will do this in a v3
>>>
Eelco Chaudron Oct. 15, 2020, 5:06 p.m. UTC | #7
On 15 Oct 2020, at 14:34, Sebastian Andrzej Siewior wrote:


Thanks for your review! Made the modifications you suggested and will 
send out a v3 soon.

//Eelco

Patch

diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index e2235849a57e..d90b4af6f539 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -172,7 +172,7 @@  static struct table_instance *table_instance_alloc(int new_size)
 
 static void __mask_array_destroy(struct mask_array *ma)
 {
-	free_percpu(ma->masks_usage_cntr);
+	free_percpu(ma->masks_usage_stats);
 	kfree(ma);
 }
 
@@ -196,15 +196,15 @@  static void tbl_mask_array_reset_counters(struct mask_array *ma)
 		ma->masks_usage_zero_cntr[i] = 0;
 
 		for_each_possible_cpu(cpu) {
-			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
-							  cpu);
+			struct mask_array_stats *stats;
 			unsigned int start;
 			u64 counter;
 
+			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
 			do {
-				start = u64_stats_fetch_begin_irq(&ma->syncp);
-				counter = usage_counters[i];
-			} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
+				start = u64_stats_fetch_begin_irq(&stats->syncp);
+				counter = stats->usage_cntrs[i];
+			} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
 
 			ma->masks_usage_zero_cntr[i] += counter;
 		}
@@ -227,9 +227,10 @@  static struct mask_array *tbl_mask_array_alloc(int size)
 					     sizeof(struct sw_flow_mask *) *
 					     size);
 
-	new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
-					       __alignof__(u64));
-	if (!new->masks_usage_cntr) {
+	new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
+						sizeof(u64) * size,
+						__alignof__(u64));
+	if (!new->masks_usage_stats) {
 		kfree(new);
 		return NULL;
 	}
@@ -732,7 +733,7 @@  static struct sw_flow *flow_lookup(struct flow_table *tbl,
 				   u32 *n_cache_hit,
 				   u32 *index)
 {
-	u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
+	struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
 	struct sw_flow *flow;
 	struct sw_flow_mask *mask;
 	int i;
@@ -742,9 +743,9 @@  static struct sw_flow *flow_lookup(struct flow_table *tbl,
 		if (mask) {
 			flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
 			if (flow) {
-				u64_stats_update_begin(&ma->syncp);
-				usage_counters[*index]++;
-				u64_stats_update_end(&ma->syncp);
+				u64_stats_update_begin(&stats->syncp);
+				stats->usage_cntrs[*index]++;
+				u64_stats_update_end(&stats->syncp);
 				(*n_cache_hit)++;
 				return flow;
 			}
@@ -763,9 +764,9 @@  static struct sw_flow *flow_lookup(struct flow_table *tbl,
 		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
 		if (flow) { /* Found */
 			*index = i;
-			u64_stats_update_begin(&ma->syncp);
-			usage_counters[*index]++;
-			u64_stats_update_end(&ma->syncp);
+			u64_stats_update_begin(&stats->syncp);
+			stats->usage_cntrs[*index]++;
+			u64_stats_update_end(&stats->syncp);
 			return flow;
 		}
 	}
@@ -851,9 +852,17 @@  struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
 	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
 	u32 __always_unused n_mask_hit;
 	u32 __always_unused n_cache_hit;
+	struct sw_flow *flow;
 	u32 index = 0;
 
-	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
+	/* This function gets called trough the netlink interface and therefore
+	 * is preemptible. However, flow_lookup() function needs to be called
+	 * with preemption disabled due to CPU specific variables.
+	 */
+	local_bh_disable();
+	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
+	local_bh_enable();
+	return flow;
 }
 
 struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
@@ -1109,7 +1118,6 @@  void ovs_flow_masks_rebalance(struct flow_table *table)
 
 	for (i = 0; i < ma->max; i++)  {
 		struct sw_flow_mask *mask;
-		unsigned int start;
 		int cpu;
 
 		mask = rcu_dereference_ovsl(ma->masks[i]);
@@ -1120,14 +1128,16 @@  void ovs_flow_masks_rebalance(struct flow_table *table)
 		masks_and_count[i].counter = 0;
 
 		for_each_possible_cpu(cpu) {
-			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
-							  cpu);
+			struct mask_array_stats *stats;
+			unsigned int start;
 			u64 counter;
 
+			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
 			do {
-				start = u64_stats_fetch_begin_irq(&ma->syncp);
-				counter = usage_counters[i];
-			} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
+				start = u64_stats_fetch_begin_irq(&stats->syncp);
+				counter = stats->usage_cntrs[i];
+			} while (u64_stats_fetch_retry_irq(&stats->syncp,
+							   start));
 
 			masks_and_count[i].counter += counter;
 		}
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index 6e7d4ac59353..43144396e192 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -38,12 +38,16 @@  struct mask_count {
 	u64 counter;
 };
 
+struct mask_array_stats {
+	struct u64_stats_sync syncp;
+	u64 usage_cntrs[];
+};
+
 struct mask_array {
 	struct rcu_head rcu;
 	int count, max;
-	u64 __percpu *masks_usage_cntr;
+	struct mask_array_stats __percpu *masks_usage_stats;
 	u64 *masks_usage_zero_cntr;
-	struct u64_stats_sync syncp;
 	struct sw_flow_mask __rcu *masks[];
 };