diff mbox series

[v2,net-next,2/3] skbuff: (re)use NAPI skb cache on allocation path

Message ID 20210113133635.39402-2-alobakin@pm.me
State New
Headers show
Series skbuff: introduce skbuff_heads reusing and bulking | expand

Commit Message

Alexander Lobakin Jan. 13, 2021, 1:37 p.m. UTC
Instead of calling kmem_cache_alloc() every time when building a NAPI
skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
this cache was only used for bulk-freeing skbuff_heads consumed via
napi_consume_skb() or __kfree_skb_defer().

Typical path is:
 - skb is queued for freeing from driver or stack, its skbuff_head
   goes into the cache instead of immediate freeing;
 - driver or stack requests NAPI skb allocation, an skbuff_head is
   taken from the cache instead of allocation.

Corner cases:
 - if it's empty on skb allocation, bulk-allocate the first half;
 - if it's full on skb consuming, bulk-wipe the second half.

Also try to balance its size after completing network softirqs
(__kfree_skb_flush()).

prefetchw() on CONFIG_SLUB is dropped since it makes no sense anymore.

Suggested-by: Edward Cree <ecree.xilinx@gmail.com>
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
---
 net/core/skbuff.c | 54 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 19 deletions(-)

Comments

Eric Dumazet Jan. 13, 2021, 2:36 p.m. UTC | #1
On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <alobakin@pm.me> wrote:
>
> Instead of calling kmem_cache_alloc() every time when building a NAPI
> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
> this cache was only used for bulk-freeing skbuff_heads consumed via
> napi_consume_skb() or __kfree_skb_defer().
>
> Typical path is:
>  - skb is queued for freeing from driver or stack, its skbuff_head
>    goes into the cache instead of immediate freeing;
>  - driver or stack requests NAPI skb allocation, an skbuff_head is
>    taken from the cache instead of allocation.
>
> Corner cases:
>  - if it's empty on skb allocation, bulk-allocate the first half;
>  - if it's full on skb consuming, bulk-wipe the second half.
>
> Also try to balance its size after completing network softirqs
> (__kfree_skb_flush()).

I do not see the point of doing this rebalance (especially if we do not change
its name describing its purpose more accurately).

For moderate load, we will have a reduced bulk size (typically one or two).
Number of skbs in the cache is in [0, 64[ , there is really no risk of
letting skbs there for a long period of time.
(32 * sizeof(sk_buff) = 8192)
I would personally get rid of this function completely.


Also it seems you missed my KASAN support request ?
I guess this is a matter of using kasan_unpoison_range(), we can ask for help.




>
> prefetchw() on CONFIG_SLUB is dropped since it makes no sense anymore.
>
> Suggested-by: Edward Cree <ecree.xilinx@gmail.com>
> Signed-off-by: Alexander Lobakin <alobakin@pm.me>
> ---
>  net/core/skbuff.c | 54 ++++++++++++++++++++++++++++++-----------------
>  1 file changed, 35 insertions(+), 19 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index dc3300dc2ac4..f42a3a04b918 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -364,6 +364,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
>  EXPORT_SYMBOL(build_skb_around);
>
>  #define NAPI_SKB_CACHE_SIZE    64
> +#define NAPI_SKB_CACHE_HALF    (NAPI_SKB_CACHE_SIZE / 2)
>
>  struct napi_alloc_cache {
>         struct page_frag_cache page;
> @@ -487,7 +488,15 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
>
>  static struct sk_buff *napi_skb_cache_get(struct napi_alloc_cache *nc)
>  {
> -       return kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
> +       if (unlikely(!nc->skb_count))
> +               nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
> +                                                     GFP_ATOMIC,
> +                                                     NAPI_SKB_CACHE_HALF,
> +                                                     nc->skb_cache);
> +       if (unlikely(!nc->skb_count))
> +               return NULL;
> +
> +       return nc->skb_cache[--nc->skb_count];
>  }
>
>  /**
> @@ -867,40 +876,47 @@ void __consume_stateless_skb(struct sk_buff *skb)
>  void __kfree_skb_flush(void)
>  {
>         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> +       size_t count;
> +       void **ptr;
> +
> +       if (unlikely(nc->skb_count == NAPI_SKB_CACHE_HALF))
> +               return;
> +
> +       if (nc->skb_count > NAPI_SKB_CACHE_HALF) {
> +               count = nc->skb_count - NAPI_SKB_CACHE_HALF;
> +               ptr = nc->skb_cache + NAPI_SKB_CACHE_HALF;
>
> -       /* flush skb_cache if containing objects */
> -       if (nc->skb_count) {
> -               kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
> -                                    nc->skb_cache);
> -               nc->skb_count = 0;
> +               kmem_cache_free_bulk(skbuff_head_cache, count, ptr);
> +               nc->skb_count = NAPI_SKB_CACHE_HALF;
> +       } else {
> +               count = NAPI_SKB_CACHE_HALF - nc->skb_count;
> +               ptr = nc->skb_cache + nc->skb_count;
> +
> +               nc->skb_count += kmem_cache_alloc_bulk(skbuff_head_cache,
> +                                                      GFP_ATOMIC, count,
> +                                                      ptr);
>         }
>  }
>
Alexander Lobakin Jan. 14, 2021, 11:41 a.m. UTC | #2
From: Eric Dumazet <edumazet@google.com>

Date: Wed, 13 Jan 2021 15:36:05 +0100

> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <alobakin@pm.me> wrote:

>>

>> Instead of calling kmem_cache_alloc() every time when building a NAPI

>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously

>> this cache was only used for bulk-freeing skbuff_heads consumed via

>> napi_consume_skb() or __kfree_skb_defer().

>>

>> Typical path is:

>>  - skb is queued for freeing from driver or stack, its skbuff_head

>>    goes into the cache instead of immediate freeing;

>>  - driver or stack requests NAPI skb allocation, an skbuff_head is

>>    taken from the cache instead of allocation.

>>

>> Corner cases:

>>  - if it's empty on skb allocation, bulk-allocate the first half;

>>  - if it's full on skb consuming, bulk-wipe the second half.

>>

>> Also try to balance its size after completing network softirqs

>> (__kfree_skb_flush()).

>

> I do not see the point of doing this rebalance (especially if we do not change

> its name describing its purpose more accurately).

>

> For moderate load, we will have a reduced bulk size (typically one or two).

> Number of skbs in the cache is in [0, 64[ , there is really no risk of

> letting skbs there for a long period of time.

> (32 * sizeof(sk_buff) = 8192)

> I would personally get rid of this function completely.


When I had a cache of 128 entries, I had worse results without this
function. But seems like I forgot to retest when I switched to the
original size of 64.
I also thought about removing this function entirely, will test.

> Also it seems you missed my KASAN support request ?

> I guess this is a matter of using kasan_unpoison_range(), we can ask for help.


I saw your request, but don't see a reason for doing this.
We are not caching already freed skbuff_heads. They don't get
kmem_cache_freed before getting into local cache. KASAN poisons
them no earlier than at kmem_cache_free() (or did I miss someting?).
heads being cached just get rid of all references and at the moment
of dropping to the cache they are pretty the same as if they were
allocated.

I also remind that only skbs that are caught by napi_consume_skb() or
__kfree_skb_defer() are getting into skb_cache, not every single one.

Regarding other emails:

1. NUMA awareness.

napi_alloc_cache is percpu, we're partly protected. The only thing
that might happen is that napi_consume_skb() can be called for skb
that was allocated at a distant node, and then it's requested by
napi_alloc_skb() (and there were no bulk-wipes between).
This can occur only if a NAPI polling cycle for cleaning up the
completion/send queue(s) is scheduled on a CPU that is far away
from the one(s) that clean(s) up the receive queue(s).
That is really very unlikely to be caught, but...

One of the ways to handle this is like (inside napi_skb_cache_get()):

	skb = nc->skb_cache[--nc->skb_count];
	if (unlikely(pfn_to_nid(virt_to_pfn(skb)) != numa_mem_id())) {
		kmem_cache_free(skbuff_head_cache, skb);
		skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	}

	return skb;

This whole condition will be optimized out on !CONFIG_NUMA, as
pfn_to_nid() and numa_mem_id() are compile-time 0 in this case.
This won't break currently present bulk-freeing.

2. Where do optimizations come from.

Not only from bulk allocations, but also from the shortcut:

napi_consume_skb()/__kfree_skb_defer() -> skb_cache -> napi_alloc_skb();

napi_alloc_skb() will get a new head directly without calling for MM
functions.
I'm aware that kmem_cache has its own cache, but this also applies to
page allocators etc. which doesn't prevent from having things like
page_frag_cache or page_pool to recycle pages and fragments directly,
not through MM layer.

>> prefetchw() on CONFIG_SLUB is dropped since it makes no sense anymore.

>>

>> Suggested-by: Edward Cree <ecree.xilinx@gmail.com>

>> Signed-off-by: Alexander Lobakin <alobakin@pm.me>

>> ---

>>  net/core/skbuff.c | 54 ++++++++++++++++++++++++++++++-----------------

>>  1 file changed, 35 insertions(+), 19 deletions(-)

>>

>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

>> index dc3300dc2ac4..f42a3a04b918 100644

>> --- a/net/core/skbuff.c

>> +++ b/net/core/skbuff.c

>> @@ -364,6 +364,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,

>>  EXPORT_SYMBOL(build_skb_around);

>>

>>  #define NAPI_SKB_CACHE_SIZE    64

>> +#define NAPI_SKB_CACHE_HALF    (NAPI_SKB_CACHE_SIZE / 2)

>>

>>  struct napi_alloc_cache {

>>         struct page_frag_cache page;

>> @@ -487,7 +488,15 @@ EXPORT_SYMBOL(__netdev_alloc_skb);

>>

>>  static struct sk_buff *napi_skb_cache_get(struct napi_alloc_cache *nc)

>>  {

>> -       return kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);

>> +       if (unlikely(!nc->skb_count))

>> +               nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,

>> +                                                     GFP_ATOMIC,

>> +                                                     NAPI_SKB_CACHE_HALF,

>> +                                                     nc->skb_cache);

>> +       if (unlikely(!nc->skb_count))

>> +               return NULL;

>> +

>> +       return nc->skb_cache[--nc->skb_count];

>>  }

>>

>>  /**

>> @@ -867,40 +876,47 @@ void __consume_stateless_skb(struct sk_buff *skb)

>>  void __kfree_skb_flush(void)

>>  {

>>         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

>> +       size_t count;

>> +       void **ptr;

>> +

>> +       if (unlikely(nc->skb_count == NAPI_SKB_CACHE_HALF))

>> +               return;

>> +

>> +       if (nc->skb_count > NAPI_SKB_CACHE_HALF) {

>> +               count = nc->skb_count - NAPI_SKB_CACHE_HALF;

>> +               ptr = nc->skb_cache + NAPI_SKB_CACHE_HALF;

>>

>> -       /* flush skb_cache if containing objects */

>> -       if (nc->skb_count) {

>> -               kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,

>> -                                    nc->skb_cache);

>> -               nc->skb_count = 0;

>> +               kmem_cache_free_bulk(skbuff_head_cache, count, ptr);

>> +               nc->skb_count = NAPI_SKB_CACHE_HALF;

>> +       } else {

>> +               count = NAPI_SKB_CACHE_HALF - nc->skb_count;

>> +               ptr = nc->skb_cache + nc->skb_count;

>> +

>> +               nc->skb_count += kmem_cache_alloc_bulk(skbuff_head_cache,

>> +                                                      GFP_ATOMIC, count,

>> +                                                      ptr);

>>         }

>>  }

>>


Al
Dmitry Vyukov Jan. 14, 2021, 11:47 a.m. UTC | #3
On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <alobakin@pm.me> wrote:
>

> From: Eric Dumazet <edumazet@google.com>

> Date: Wed, 13 Jan 2021 15:36:05 +0100

>

> > On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <alobakin@pm.me> wrote:

> >>

> >> Instead of calling kmem_cache_alloc() every time when building a NAPI

> >> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously

> >> this cache was only used for bulk-freeing skbuff_heads consumed via

> >> napi_consume_skb() or __kfree_skb_defer().

> >>

> >> Typical path is:

> >>  - skb is queued for freeing from driver or stack, its skbuff_head

> >>    goes into the cache instead of immediate freeing;

> >>  - driver or stack requests NAPI skb allocation, an skbuff_head is

> >>    taken from the cache instead of allocation.

> >>

> >> Corner cases:

> >>  - if it's empty on skb allocation, bulk-allocate the first half;

> >>  - if it's full on skb consuming, bulk-wipe the second half.

> >>

> >> Also try to balance its size after completing network softirqs

> >> (__kfree_skb_flush()).

> >

> > I do not see the point of doing this rebalance (especially if we do not change

> > its name describing its purpose more accurately).

> >

> > For moderate load, we will have a reduced bulk size (typically one or two).

> > Number of skbs in the cache is in [0, 64[ , there is really no risk of

> > letting skbs there for a long period of time.

> > (32 * sizeof(sk_buff) = 8192)

> > I would personally get rid of this function completely.

>

> When I had a cache of 128 entries, I had worse results without this

> function. But seems like I forgot to retest when I switched to the

> original size of 64.

> I also thought about removing this function entirely, will test.

>

> > Also it seems you missed my KASAN support request ?

> > I guess this is a matter of using kasan_unpoison_range(), we can ask for help.

>

> I saw your request, but don't see a reason for doing this.

> We are not caching already freed skbuff_heads. They don't get

> kmem_cache_freed before getting into local cache. KASAN poisons

> them no earlier than at kmem_cache_free() (or did I miss someting?).

> heads being cached just get rid of all references and at the moment

> of dropping to the cache they are pretty the same as if they were

> allocated.


KASAN should not report false positives in this case.
But I think Eric meant preventing false negatives. If we kmalloc 17
bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.
But we put that data into 128-byte blocks, KASAN will miss
out-of-bounds accesses beyond 17 bytes up to 128 bytes.
The same holds for "logical" use-after-frees when object is free, but
not freed into slab.

An important custom cache should use annotations like
kasan_poison_object_data/kasan_unpoison_range.
Alexander Lobakin Jan. 14, 2021, 12:44 p.m. UTC | #4
From: Dmitry Vyukov <dvyukov@google.com>

Date: Thu, 14 Jan 2021 12:47:31 +0100

> On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <alobakin@pm.me> wrote:

>>

>> From: Eric Dumazet <edumazet@google.com>

>> Date: Wed, 13 Jan 2021 15:36:05 +0100

>>

>>> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <alobakin@pm.me> wrote:

>>>>

>>>> Instead of calling kmem_cache_alloc() every time when building a NAPI

>>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously

>>>> this cache was only used for bulk-freeing skbuff_heads consumed via

>>>> napi_consume_skb() or __kfree_skb_defer().

>>>>

>>>> Typical path is:

>>>>  - skb is queued for freeing from driver or stack, its skbuff_head

>>>>    goes into the cache instead of immediate freeing;

>>>>  - driver or stack requests NAPI skb allocation, an skbuff_head is

>>>>    taken from the cache instead of allocation.

>>>>

>>>> Corner cases:

>>>>  - if it's empty on skb allocation, bulk-allocate the first half;

>>>>  - if it's full on skb consuming, bulk-wipe the second half.

>>>>

>>>> Also try to balance its size after completing network softirqs

>>>> (__kfree_skb_flush()).

>>>

>>> I do not see the point of doing this rebalance (especially if we do not change

>>> its name describing its purpose more accurately).

>>>

>>> For moderate load, we will have a reduced bulk size (typically one or two).

>>> Number of skbs in the cache is in [0, 64[ , there is really no risk of

>>> letting skbs there for a long period of time.

>>> (32 * sizeof(sk_buff) = 8192)

>>> I would personally get rid of this function completely.

>>

>> When I had a cache of 128 entries, I had worse results without this

>> function. But seems like I forgot to retest when I switched to the

>> original size of 64.

>> I also thought about removing this function entirely, will test.

>>

>>> Also it seems you missed my KASAN support request ?

>>  I guess this is a matter of using kasan_unpoison_range(), we can ask for help.

>>

>> I saw your request, but don't see a reason for doing this.

>> We are not caching already freed skbuff_heads. They don't get

>> kmem_cache_freed before getting into local cache. KASAN poisons

>> them no earlier than at kmem_cache_free() (or did I miss someting?).

>> heads being cached just get rid of all references and at the moment

>> of dropping to the cache they are pretty the same as if they were

>> allocated.

>

> KASAN should not report false positives in this case.

> But I think Eric meant preventing false negatives. If we kmalloc 17

> bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.

> But we put that data into 128-byte blocks, KASAN will miss

> out-of-bounds accesses beyond 17 bytes up to 128 bytes.

> The same holds for "logical" use-after-frees when object is free, but

> not freed into slab.

>

> An important custom cache should use annotations like

> kasan_poison_object_data/kasan_unpoison_range.


As I understand, I should
kasan_poison_object_data(skbuff_head_cache, skb) and then
kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the
cache?

Thanks,
Al
Dmitry Vyukov Jan. 14, 2021, 12:50 p.m. UTC | #5
On Thu, Jan 14, 2021 at 1:44 PM Alexander Lobakin <alobakin@pm.me> wrote:
>

> From: Dmitry Vyukov <dvyukov@google.com>

> Date: Thu, 14 Jan 2021 12:47:31 +0100

>

> > On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <alobakin@pm.me> wrote:

> >>

> >> From: Eric Dumazet <edumazet@google.com>

> >> Date: Wed, 13 Jan 2021 15:36:05 +0100

> >>

> >>> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <alobakin@pm.me> wrote:

> >>>>

> >>>> Instead of calling kmem_cache_alloc() every time when building a NAPI

> >>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously

> >>>> this cache was only used for bulk-freeing skbuff_heads consumed via

> >>>> napi_consume_skb() or __kfree_skb_defer().

> >>>>

> >>>> Typical path is:

> >>>>  - skb is queued for freeing from driver or stack, its skbuff_head

> >>>>    goes into the cache instead of immediate freeing;

> >>>>  - driver or stack requests NAPI skb allocation, an skbuff_head is

> >>>>    taken from the cache instead of allocation.

> >>>>

> >>>> Corner cases:

> >>>>  - if it's empty on skb allocation, bulk-allocate the first half;

> >>>>  - if it's full on skb consuming, bulk-wipe the second half.

> >>>>

> >>>> Also try to balance its size after completing network softirqs

> >>>> (__kfree_skb_flush()).

> >>>

> >>> I do not see the point of doing this rebalance (especially if we do not change

> >>> its name describing its purpose more accurately).

> >>>

> >>> For moderate load, we will have a reduced bulk size (typically one or two).

> >>> Number of skbs in the cache is in [0, 64[ , there is really no risk of

> >>> letting skbs there for a long period of time.

> >>> (32 * sizeof(sk_buff) = 8192)

> >>> I would personally get rid of this function completely.

> >>

> >> When I had a cache of 128 entries, I had worse results without this

> >> function. But seems like I forgot to retest when I switched to the

> >> original size of 64.

> >> I also thought about removing this function entirely, will test.

> >>

> >>> Also it seems you missed my KASAN support request ?

> >>  I guess this is a matter of using kasan_unpoison_range(), we can ask for help.

> >>

> >> I saw your request, but don't see a reason for doing this.

> >> We are not caching already freed skbuff_heads. They don't get

> >> kmem_cache_freed before getting into local cache. KASAN poisons

> >> them no earlier than at kmem_cache_free() (or did I miss someting?).

> >> heads being cached just get rid of all references and at the moment

> >> of dropping to the cache they are pretty the same as if they were

> >> allocated.

> >

> > KASAN should not report false positives in this case.

> > But I think Eric meant preventing false negatives. If we kmalloc 17

> > bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.

> > But we put that data into 128-byte blocks, KASAN will miss

> > out-of-bounds accesses beyond 17 bytes up to 128 bytes.

> > The same holds for "logical" use-after-frees when object is free, but

> > not freed into slab.

> >

> > An important custom cache should use annotations like

> > kasan_poison_object_data/kasan_unpoison_range.

>

> As I understand, I should

> kasan_poison_object_data(skbuff_head_cache, skb) and then

> kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the

> cache?


I think it's the other way around. It should be _un_poisoned when used.
If it's fixed size, then unpoison_object_data should be a better fit:
https://elixir.bootlin.com/linux/v5.11-rc3/source/mm/kasan/common.c#L253
Dmitry Vyukov Jan. 14, 2021, 12:51 p.m. UTC | #6
On Thu, Jan 14, 2021 at 1:50 PM Dmitry Vyukov <dvyukov@google.com> wrote:
>

> On Thu, Jan 14, 2021 at 1:44 PM Alexander Lobakin <alobakin@pm.me> wrote:

> >

> > From: Dmitry Vyukov <dvyukov@google.com>

> > Date: Thu, 14 Jan 2021 12:47:31 +0100

> >

> > > On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <alobakin@pm.me> wrote:

> > >>

> > >> From: Eric Dumazet <edumazet@google.com>

> > >> Date: Wed, 13 Jan 2021 15:36:05 +0100

> > >>

> > >>> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <alobakin@pm.me> wrote:

> > >>>>

> > >>>> Instead of calling kmem_cache_alloc() every time when building a NAPI

> > >>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously

> > >>>> this cache was only used for bulk-freeing skbuff_heads consumed via

> > >>>> napi_consume_skb() or __kfree_skb_defer().

> > >>>>

> > >>>> Typical path is:

> > >>>>  - skb is queued for freeing from driver or stack, its skbuff_head

> > >>>>    goes into the cache instead of immediate freeing;

> > >>>>  - driver or stack requests NAPI skb allocation, an skbuff_head is

> > >>>>    taken from the cache instead of allocation.

> > >>>>

> > >>>> Corner cases:

> > >>>>  - if it's empty on skb allocation, bulk-allocate the first half;

> > >>>>  - if it's full on skb consuming, bulk-wipe the second half.

> > >>>>

> > >>>> Also try to balance its size after completing network softirqs

> > >>>> (__kfree_skb_flush()).

> > >>>

> > >>> I do not see the point of doing this rebalance (especially if we do not change

> > >>> its name describing its purpose more accurately).

> > >>>

> > >>> For moderate load, we will have a reduced bulk size (typically one or two).

> > >>> Number of skbs in the cache is in [0, 64[ , there is really no risk of

> > >>> letting skbs there for a long period of time.

> > >>> (32 * sizeof(sk_buff) = 8192)

> > >>> I would personally get rid of this function completely.

> > >>

> > >> When I had a cache of 128 entries, I had worse results without this

> > >> function. But seems like I forgot to retest when I switched to the

> > >> original size of 64.

> > >> I also thought about removing this function entirely, will test.

> > >>

> > >>> Also it seems you missed my KASAN support request ?

> > >>  I guess this is a matter of using kasan_unpoison_range(), we can ask for help.

> > >>

> > >> I saw your request, but don't see a reason for doing this.

> > >> We are not caching already freed skbuff_heads. They don't get

> > >> kmem_cache_freed before getting into local cache. KASAN poisons

> > >> them no earlier than at kmem_cache_free() (or did I miss someting?).

> > >> heads being cached just get rid of all references and at the moment

> > >> of dropping to the cache they are pretty the same as if they were

> > >> allocated.

> > >

> > > KASAN should not report false positives in this case.

> > > But I think Eric meant preventing false negatives. If we kmalloc 17

> > > bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.

> > > But we put that data into 128-byte blocks, KASAN will miss

> > > out-of-bounds accesses beyond 17 bytes up to 128 bytes.

> > > The same holds for "logical" use-after-frees when object is free, but

> > > not freed into slab.

> > >

> > > An important custom cache should use annotations like

> > > kasan_poison_object_data/kasan_unpoison_range.

> >

> > As I understand, I should

> > kasan_poison_object_data(skbuff_head_cache, skb) and then

> > kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the

> > cache?

>

> I think it's the other way around. It should be _un_poisoned when used.

> If it's fixed size, then unpoison_object_data should be a better fit:

> https://elixir.bootlin.com/linux/v5.11-rc3/source/mm/kasan/common.c#L253


Variable-size poisoning/unpoisoning would be needed for the skb data itself:
https://bugzilla.kernel.org/show_bug.cgi?id=199055
Alexander Lobakin Jan. 14, 2021, 1 p.m. UTC | #7
From: Dmitry Vyukov <dvyukov@google.com>

Date: Thu, 14 Jan 2021 13:50:25 +0100

> On Thu, Jan 14, 2021 at 1:44 PM Alexander Lobakin <alobakin@pm.me> wrote:

>>

>> From: Dmitry Vyukov <dvyukov@google.com>

>> Date: Thu, 14 Jan 2021 12:47:31 +0100

>>

>>> On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <alobakin@pm.me> wrote:

>>>>

>>>> From: Eric Dumazet <edumazet@google.com>

>>>> Date: Wed, 13 Jan 2021 15:36:05 +0100

>>>>

>>>>> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <alobakin@pm.me> wrote:

>>>>>>

>>>>>> Instead of calling kmem_cache_alloc() every time when building a NAPI

>>>>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously

>>>>>> this cache was only used for bulk-freeing skbuff_heads consumed via

>>>>>> napi_consume_skb() or __kfree_skb_defer().

>>>>>>

>>>>>> Typical path is:

>>>>>>  - skb is queued for freeing from driver or stack, its skbuff_head

>>>>>>    goes into the cache instead of immediate freeing;

>>>>>>  - driver or stack requests NAPI skb allocation, an skbuff_head is

>>>>>>    taken from the cache instead of allocation.

>>>>>>

>>>>>> Corner cases:

>>>>>>  - if it's empty on skb allocation, bulk-allocate the first half;

>>>>>>  - if it's full on skb consuming, bulk-wipe the second half.

>>>>>>

>>>>>> Also try to balance its size after completing network softirqs

>>>>>> (__kfree_skb_flush()).

>>>>>

>>>>> I do not see the point of doing this rebalance (especially if we do not change

>>>>> its name describing its purpose more accurately).

>>>>>

>>>>> For moderate load, we will have a reduced bulk size (typically one or two).

>>>>> Number of skbs in the cache is in [0, 64[ , there is really no risk of

>>>>> letting skbs there for a long period of time.

>>>>> (32 * sizeof(sk_buff) = 8192)

>>>>> I would personally get rid of this function completely.

>>>>

>>>> When I had a cache of 128 entries, I had worse results without this

>>>> function. But seems like I forgot to retest when I switched to the

>>>> original size of 64.

>>>> I also thought about removing this function entirely, will test.

>>>>

>>>>> Also it seems you missed my KASAN support request ?

>>>>  I guess this is a matter of using kasan_unpoison_range(), we can ask for help.

>>>>

>>>> I saw your request, but don't see a reason for doing this.

>>>> We are not caching already freed skbuff_heads. They don't get

>>>> kmem_cache_freed before getting into local cache. KASAN poisons

>>>> them no earlier than at kmem_cache_free() (or did I miss someting?).

>>>> heads being cached just get rid of all references and at the moment

>>>> of dropping to the cache they are pretty the same as if they were

>>>> allocated.

>>>

>>> KASAN should not report false positives in this case.

>>> But I think Eric meant preventing false negatives. If we kmalloc 17

>>> bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.

>>> But we put that data into 128-byte blocks, KASAN will miss

>>> out-of-bounds accesses beyond 17 bytes up to 128 bytes.

>>> The same holds for "logical" use-after-frees when object is free, but

>>> not freed into slab.

>>>

>>> An important custom cache should use annotations like

>>> kasan_poison_object_data/kasan_unpoison_range.

>>

>> As I understand, I should

>> kasan_poison_object_data(skbuff_head_cache, skb) and then

>> kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the

>> cache?

>

> I think it's the other way around. It should be _un_poisoned when used.

> If it's fixed size, then unpoison_object_data should be a better fit:

> https://elixir.bootlin.com/linux/v5.11-rc3/source/mm/kasan/common.c#L253


Ah, I though of this too. But wouldn't there be a false-positive if
a poisoned skb hits kmem_cache_free_bulk(), not the allocation path?
We plan to use skb_cache for both reusing and bulk-freeing, and SLUB,
for example, might do writes into objects before freeing.
If it also should get unpoisoned before kmem_cache_free_bulk(), we'll
lose bulking as unpoisoning is performed per-object.

Al
Dmitry Vyukov Jan. 14, 2021, 1:01 p.m. UTC | #8
On Thu, Jan 14, 2021 at 2:00 PM Alexander Lobakin <alobakin@pm.me> wrote:
> >>>>>> Instead of calling kmem_cache_alloc() every time when building a NAPI

> >>>>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously

> >>>>>> this cache was only used for bulk-freeing skbuff_heads consumed via

> >>>>>> napi_consume_skb() or __kfree_skb_defer().

> >>>>>>

> >>>>>> Typical path is:

> >>>>>>  - skb is queued for freeing from driver or stack, its skbuff_head

> >>>>>>    goes into the cache instead of immediate freeing;

> >>>>>>  - driver or stack requests NAPI skb allocation, an skbuff_head is

> >>>>>>    taken from the cache instead of allocation.

> >>>>>>

> >>>>>> Corner cases:

> >>>>>>  - if it's empty on skb allocation, bulk-allocate the first half;

> >>>>>>  - if it's full on skb consuming, bulk-wipe the second half.

> >>>>>>

> >>>>>> Also try to balance its size after completing network softirqs

> >>>>>> (__kfree_skb_flush()).

> >>>>>

> >>>>> I do not see the point of doing this rebalance (especially if we do not change

> >>>>> its name describing its purpose more accurately).

> >>>>>

> >>>>> For moderate load, we will have a reduced bulk size (typically one or two).

> >>>>> Number of skbs in the cache is in [0, 64[ , there is really no risk of

> >>>>> letting skbs there for a long period of time.

> >>>>> (32 * sizeof(sk_buff) = 8192)

> >>>>> I would personally get rid of this function completely.

> >>>>

> >>>> When I had a cache of 128 entries, I had worse results without this

> >>>> function. But seems like I forgot to retest when I switched to the

> >>>> original size of 64.

> >>>> I also thought about removing this function entirely, will test.

> >>>>

> >>>>> Also it seems you missed my KASAN support request ?

> >>>>  I guess this is a matter of using kasan_unpoison_range(), we can ask for help.

> >>>>

> >>>> I saw your request, but don't see a reason for doing this.

> >>>> We are not caching already freed skbuff_heads. They don't get

> >>>> kmem_cache_freed before getting into local cache. KASAN poisons

> >>>> them no earlier than at kmem_cache_free() (or did I miss someting?).

> >>>> heads being cached just get rid of all references and at the moment

> >>>> of dropping to the cache they are pretty the same as if they were

> >>>> allocated.

> >>>

> >>> KASAN should not report false positives in this case.

> >>> But I think Eric meant preventing false negatives. If we kmalloc 17

> >>> bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.

> >>> But we put that data into 128-byte blocks, KASAN will miss

> >>> out-of-bounds accesses beyond 17 bytes up to 128 bytes.

> >>> The same holds for "logical" use-after-frees when object is free, but

> >>> not freed into slab.

> >>>

> >>> An important custom cache should use annotations like

> >>> kasan_poison_object_data/kasan_unpoison_range.

> >>

> >> As I understand, I should

> >> kasan_poison_object_data(skbuff_head_cache, skb) and then

> >> kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the

> >> cache?

> >

> > I think it's the other way around. It should be _un_poisoned when used.

> > If it's fixed size, then unpoison_object_data should be a better fit:

> > https://elixir.bootlin.com/linux/v5.11-rc3/source/mm/kasan/common.c#L253

>

> Ah, I though of this too. But wouldn't there be a false-positive if

> a poisoned skb hits kmem_cache_free_bulk(), not the allocation path?

> We plan to use skb_cache for both reusing and bulk-freeing, and SLUB,

> for example, might do writes into objects before freeing.

> If it also should get unpoisoned before kmem_cache_free_bulk(), we'll

> lose bulking as unpoisoning is performed per-object.


Yes, it needs to be unpoisoned before free.
Unpoison one-by-one, free in bulk. Unpoisoningin is debug-only code anyway.
Alexander Lobakin Jan. 14, 2021, 1:06 p.m. UTC | #9
From: Dmitry Vyukov <dvyukov@google.com>

Date: Thu, 14 Jan 2021 13:51:44 +0100

> On Thu, Jan 14, 2021 at 1:50 PM Dmitry Vyukov <dvyukov@google.com> wrote:

>>

>> On Thu, Jan 14, 2021 at 1:44 PM Alexander Lobakin <alobakin@pm.me> wrote:

>>>

>>> From: Dmitry Vyukov <dvyukov@google.com>

>>> Date: Thu, 14 Jan 2021 12:47:31 +0100

>>>

>>>> On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <alobakin@pm.me> wrote:

>>>>>

>>>>> From: Eric Dumazet <edumazet@google.com>

>>>>> Date: Wed, 13 Jan 2021 15:36:05 +0100

>>>>>

>>>>>> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <alobakin@pm.me> wrote:

>>>>>>>

>>>>>>> Instead of calling kmem_cache_alloc() every time when building a NAPI

>>>>>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously

>>>>>>> this cache was only used for bulk-freeing skbuff_heads consumed via

>>>>>>> napi_consume_skb() or __kfree_skb_defer().

>>>>>>>

>>>>>>> Typical path is:

>>>>>>>  - skb is queued for freeing from driver or stack, its skbuff_head

>>>>>>>    goes into the cache instead of immediate freeing;

>>>>>>>  - driver or stack requests NAPI skb allocation, an skbuff_head is

>>>>>>>    taken from the cache instead of allocation.

>>>>>>>

>>>>>>> Corner cases:

>>>>>>>  - if it's empty on skb allocation, bulk-allocate the first half;

>>>>>>>  - if it's full on skb consuming, bulk-wipe the second half.

>>>>>>>

>>>>>>> Also try to balance its size after completing network softirqs

>>>>>>> (__kfree_skb_flush()).

>>>>>>

>>>>>> I do not see the point of doing this rebalance (especially if we do not change

>>>>>> its name describing its purpose more accurately).

>>>>>>

>>>>>> For moderate load, we will have a reduced bulk size (typically one or two).

>>>>>> Number of skbs in the cache is in [0, 64[ , there is really no risk of

>>>>>> letting skbs there for a long period of time.

>>>>>> (32 * sizeof(sk_buff) = 8192)

>>>>>> I would personally get rid of this function completely.

>>>>>

>>>>> When I had a cache of 128 entries, I had worse results without this

>>>>> function. But seems like I forgot to retest when I switched to the

>>>>> original size of 64.

>>>>> I also thought about removing this function entirely, will test.

>>>>>

>>>>>> Also it seems you missed my KASAN support request ?

>>>>>  I guess this is a matter of using kasan_unpoison_range(), we can ask for help.

>>>>>

>>>>> I saw your request, but don't see a reason for doing this.

>>>>> We are not caching already freed skbuff_heads. They don't get

>>>>> kmem_cache_freed before getting into local cache. KASAN poisons

>>>>> them no earlier than at kmem_cache_free() (or did I miss someting?).

>>>>> heads being cached just get rid of all references and at the moment

>>>>> of dropping to the cache they are pretty the same as if they were

>>>>> allocated.

>>>>

>>>> KASAN should not report false positives in this case.

>>>> But I think Eric meant preventing false negatives. If we kmalloc 17

>>>> bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.

>>>> But we put that data into 128-byte blocks, KASAN will miss

>>>> out-of-bounds accesses beyond 17 bytes up to 128 bytes.

>>>> The same holds for "logical" use-after-frees when object is free, but

>>>> not freed into slab.

>>>>

>>>> An important custom cache should use annotations like

>>>> kasan_poison_object_data/kasan_unpoison_range.

>>>

>>> As I understand, I should

>>> kasan_poison_object_data(skbuff_head_cache, skb) and then

>>> kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the

>>> cache?

>>

>> I think it's the other way around. It should be _un_poisoned when used.

>> If it's fixed size, then unpoison_object_data should be a better fit:

>> https://elixir.bootlin.com/linux/v5.11-rc3/source/mm/kasan/common.c#L253

>

> Variable-size poisoning/unpoisoning would be needed for the skb data itself:

> https://bugzilla.kernel.org/show_bug.cgi?id=199055


This cache is for skbuff_heads only, not for the entire skbs. All
linear data and frags gets freed before head hits the cache.
The cache will store skbuff_heads as if they were freshly allocated
by kmem_cache_alloc().

Al
diff mbox series

Patch

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dc3300dc2ac4..f42a3a04b918 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -364,6 +364,7 @@  struct sk_buff *build_skb_around(struct sk_buff *skb,
 EXPORT_SYMBOL(build_skb_around);
 
 #define NAPI_SKB_CACHE_SIZE	64
+#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
 
 struct napi_alloc_cache {
 	struct page_frag_cache page;
@@ -487,7 +488,15 @@  EXPORT_SYMBOL(__netdev_alloc_skb);
 
 static struct sk_buff *napi_skb_cache_get(struct napi_alloc_cache *nc)
 {
-	return kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+	if (unlikely(!nc->skb_count))
+		nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
+						      GFP_ATOMIC,
+						      NAPI_SKB_CACHE_HALF,
+						      nc->skb_cache);
+	if (unlikely(!nc->skb_count))
+		return NULL;
+
+	return nc->skb_cache[--nc->skb_count];
 }
 
 /**
@@ -867,40 +876,47 @@  void __consume_stateless_skb(struct sk_buff *skb)
 void __kfree_skb_flush(void)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	size_t count;
+	void **ptr;
+
+	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_HALF))
+		return;
+
+	if (nc->skb_count > NAPI_SKB_CACHE_HALF) {
+		count = nc->skb_count - NAPI_SKB_CACHE_HALF;
+		ptr = nc->skb_cache + NAPI_SKB_CACHE_HALF;
 
-	/* flush skb_cache if containing objects */
-	if (nc->skb_count) {
-		kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
-				     nc->skb_cache);
-		nc->skb_count = 0;
+		kmem_cache_free_bulk(skbuff_head_cache, count, ptr);
+		nc->skb_count = NAPI_SKB_CACHE_HALF;
+	} else {
+		count = NAPI_SKB_CACHE_HALF - nc->skb_count;
+		ptr = nc->skb_cache + nc->skb_count;
+
+		nc->skb_count += kmem_cache_alloc_bulk(skbuff_head_cache,
+						       GFP_ATOMIC, count,
+						       ptr);
 	}
 }
 
-static inline void _kfree_skb_defer(struct sk_buff *skb)
+static void napi_skb_cache_put(struct sk_buff *skb)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 
 	/* drop skb->head and call any destructors for packet */
 	skb_release_all(skb);
 
-	/* record skb to CPU local list */
 	nc->skb_cache[nc->skb_count++] = skb;
 
-#ifdef CONFIG_SLUB
-	/* SLUB writes into objects when freeing */
-	prefetchw(skb);
-#endif
-
-	/* flush skb_cache if it is filled */
 	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
-		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
-				     nc->skb_cache);
-		nc->skb_count = 0;
+		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
+				     nc->skb_cache + NAPI_SKB_CACHE_HALF);
+		nc->skb_count = NAPI_SKB_CACHE_HALF;
 	}
 }
+
 void __kfree_skb_defer(struct sk_buff *skb)
 {
-	_kfree_skb_defer(skb);
+	napi_skb_cache_put(skb);
 }
 
 void napi_consume_skb(struct sk_buff *skb, int budget)
@@ -925,7 +941,7 @@  void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
-	_kfree_skb_defer(skb);
+	napi_skb_cache_put(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);