mbox series

[net-next,v2,0/4] net: Avoid the memory waste in some Ethernet drivers

Message ID 20210131074426.44154-1-haokexin@gmail.com
Headers show
Series net: Avoid the memory waste in some Ethernet drivers | expand

Message

Kevin Hao Jan. 31, 2021, 7:44 a.m. UTC
Hi,

v2:
  - Inline page_frag_alloc() and {netdev,napi}_alloc_frag()
  - Adopt Vlastimil's suggestion and add his Acked-by

In the current implementation of napi_alloc_frag(), it doesn't have any
alignment guarantee for the returned buffer address. We would have to use
some ugly workarounds to make sure that we can get an aligned buffer
address for some Ethernet drivers. This patch series tries to introduce
some helper functions to make sure that an aligned buffer is returned.
Then we can drop the ugly workarounds and avoid the unnecessary memory
waste.

Kevin Hao (4):
  mm: page_frag: Introduce page_frag_alloc_align()
  net: Introduce {netdev,napi}_alloc_frag_align()
  net: octeontx2: Use napi_alloc_frag_align() to avoid the memory waste
  net: dpaa2: Use napi_alloc_frag_align() to avoid the memory waste

 .../net/ethernet/freescale/dpaa2/dpaa2-eth.c  |  3 +--
 .../marvell/octeontx2/nic/otx2_common.c       |  3 +--
 include/linux/gfp.h                           | 12 +++++++--
 include/linux/skbuff.h                        | 22 ++++++++++++++--
 mm/page_alloc.c                               |  8 +++---
 net/core/skbuff.c                             | 25 +++++++------------
 6 files changed, 46 insertions(+), 27 deletions(-)

Comments

Ioana Ciornei Feb. 2, 2021, 11:36 a.m. UTC | #1
On Sun, Jan 31, 2021 at 03:44:23PM +0800, Kevin Hao wrote:
> In the current implementation of page_frag_alloc(), it doesn't have

> any align guarantee for the returned buffer address. But for some

> hardwares they do require the DMA buffer to be aligned correctly,

> so we would have to use some workarounds like below if the buffers

> allocated by the page_frag_alloc() are used by these hardwares for

> DMA.

>     buf = page_frag_alloc(really_needed_size + align);

>     buf = PTR_ALIGN(buf, align);

> 

> These codes seems ugly and would waste a lot of memories if the buffers

> are used in a network driver for the TX/RX.


Isn't the memory wasted even with this change?

I am not familiar with the frag allocator so I might be missing
something, but from what I understood each page_frag_cache keeps only
the offset inside the current page being allocated, offset which you
ALIGN_DOWN() to match the alignment requirement. I don't see how that
memory between the non-aligned and aligned offset is going to be used
again before the entire page is freed.

> So introduce

> page_frag_alloc_align() to make sure that an aligned buffer address is

> returned.

> 

> Signed-off-by: Kevin Hao <haokexin@gmail.com>

> Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---

> v2: 

>   - Inline page_frag_alloc()

>   - Adopt Vlastimil's suggestion and add his Acked-by

>  

>  include/linux/gfp.h | 12 ++++++++++--

>  mm/page_alloc.c     |  8 +++++---

>  2 files changed, 15 insertions(+), 5 deletions(-)

> 

> diff --git a/include/linux/gfp.h b/include/linux/gfp.h

> index 6e479e9c48ce..39f4b3070d09 100644

> --- a/include/linux/gfp.h

> +++ b/include/linux/gfp.h

> @@ -583,8 +583,16 @@ extern void free_pages(unsigned long addr, unsigned int order);

>  

>  struct page_frag_cache;

>  extern void __page_frag_cache_drain(struct page *page, unsigned int count);

> -extern void *page_frag_alloc(struct page_frag_cache *nc,

> -			     unsigned int fragsz, gfp_t gfp_mask);

> +extern void *page_frag_alloc_align(struct page_frag_cache *nc,

> +				   unsigned int fragsz, gfp_t gfp_mask,

> +				   int align);

> +

> +static inline void *page_frag_alloc(struct page_frag_cache *nc,

> +			     unsigned int fragsz, gfp_t gfp_mask)

> +{

> +	return page_frag_alloc_align(nc, fragsz, gfp_mask, 0);

> +}

> +

>  extern void page_frag_free(void *addr);

>  

>  #define __free_page(page) __free_pages((page), 0)

> diff --git a/mm/page_alloc.c b/mm/page_alloc.c

> index 519a60d5b6f7..4667e7b6993b 100644

> --- a/mm/page_alloc.c

> +++ b/mm/page_alloc.c

> @@ -5137,8 +5137,8 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)

>  }

>  EXPORT_SYMBOL(__page_frag_cache_drain);

>  

> -void *page_frag_alloc(struct page_frag_cache *nc,

> -		      unsigned int fragsz, gfp_t gfp_mask)

> +void *page_frag_alloc_align(struct page_frag_cache *nc,

> +		      unsigned int fragsz, gfp_t gfp_mask, int align)

>  {

>  	unsigned int size = PAGE_SIZE;

>  	struct page *page;

> @@ -5190,11 +5190,13 @@ void *page_frag_alloc(struct page_frag_cache *nc,

>  	}

>  

>  	nc->pagecnt_bias--;

> +	if (align)

> +		offset = ALIGN_DOWN(offset, align);

>  	nc->offset = offset;

>  

>  	return nc->va + offset;

>  }

> -EXPORT_SYMBOL(page_frag_alloc);

> +EXPORT_SYMBOL(page_frag_alloc_align);

>  

>  /*

>   * Frees a page fragment allocated out of either a compound or order 0 page.

> -- 

> 2.29.2

>
Vlastimil Babka Feb. 2, 2021, 11:48 a.m. UTC | #2
On 2/2/21 12:36 PM, Ioana Ciornei wrote:
> On Sun, Jan 31, 2021 at 03:44:23PM +0800, Kevin Hao wrote:

>> In the current implementation of page_frag_alloc(), it doesn't have

>> any align guarantee for the returned buffer address. But for some

>> hardwares they do require the DMA buffer to be aligned correctly,

>> so we would have to use some workarounds like below if the buffers

>> allocated by the page_frag_alloc() are used by these hardwares for

>> DMA.

>>     buf = page_frag_alloc(really_needed_size + align);

>>     buf = PTR_ALIGN(buf, align);

>> 

>> These codes seems ugly and would waste a lot of memories if the buffers

>> are used in a network driver for the TX/RX.

> 

> Isn't the memory wasted even with this change?


Yes, but less of it. Not always the full amount of the alignment, but up to
it. Perhaps even zero.

> I am not familiar with the frag allocator so I might be missing

> something, but from what I understood each page_frag_cache keeps only

> the offset inside the current page being allocated, offset which you

> ALIGN_DOWN() to match the alignment requirement. I don't see how that

> memory between the non-aligned and aligned offset is going to be used

> again before the entire page is freed.


True, that's how page_frag is designed. The align amounts would be most likely
too small to be usable anyway.
Ioana Ciornei Feb. 2, 2021, 12:31 p.m. UTC | #3
On Tue, Feb 02, 2021 at 12:48:28PM +0100, Vlastimil Babka wrote:
> On 2/2/21 12:36 PM, Ioana Ciornei wrote:

> > On Sun, Jan 31, 2021 at 03:44:23PM +0800, Kevin Hao wrote:

> >> In the current implementation of page_frag_alloc(), it doesn't have

> >> any align guarantee for the returned buffer address. But for some

> >> hardwares they do require the DMA buffer to be aligned correctly,

> >> so we would have to use some workarounds like below if the buffers

> >> allocated by the page_frag_alloc() are used by these hardwares for

> >> DMA.

> >>     buf = page_frag_alloc(really_needed_size + align);

> >>     buf = PTR_ALIGN(buf, align);

> >> 

> >> These codes seems ugly and would waste a lot of memories if the buffers

> >> are used in a network driver for the TX/RX.

> > 

> > Isn't the memory wasted even with this change?

> 

> Yes, but less of it. Not always full amount of align, but up to it. Perhaps even

> zero.


Indeed, the worst case is still there but we gain by not allocating the
full 'size + align' all the time.

Thanks.

> 

> > I am not familiar with the frag allocator so I might be missing

> > something, but from what I understood each page_frag_cache keeps only

> > the offset inside the current page being allocated, offset which you

> > ALIGN_DOWN() to match the alignment requirement. I don't see how that

> > memory between the non-aligned and aligned offset is going to be used

> > again before the entire page is freed.

> 

> True, thath's how page_frag is designed. The align amounts would be most likely

> too small to be usable anyway.
Alexander Duyck Feb. 2, 2021, 4:19 p.m. UTC | #4
On Sat, Jan 30, 2021 at 11:54 PM Kevin Hao <haokexin@gmail.com> wrote:
>

> In the current implementation of page_frag_alloc(), it doesn't have

> any align guarantee for the returned buffer address. But for some

> hardwares they do require the DMA buffer to be aligned correctly,

> so we would have to use some workarounds like below if the buffers

> allocated by the page_frag_alloc() are used by these hardwares for

> DMA.

>     buf = page_frag_alloc(really_needed_size + align);

>     buf = PTR_ALIGN(buf, align);

>

> These codes seems ugly and would waste a lot of memories if the buffers

> are used in a network driver for the TX/RX. So introduce

> page_frag_alloc_align() to make sure that an aligned buffer address is

> returned.

>

> Signed-off-by: Kevin Hao <haokexin@gmail.com>

> Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---

> v2:

>   - Inline page_frag_alloc()

>   - Adopt Vlastimil's suggestion and add his Acked-by

>

>  include/linux/gfp.h | 12 ++++++++++--

>  mm/page_alloc.c     |  8 +++++---

>  2 files changed, 15 insertions(+), 5 deletions(-)

>

> diff --git a/include/linux/gfp.h b/include/linux/gfp.h

> index 6e479e9c48ce..39f4b3070d09 100644

> --- a/include/linux/gfp.h

> +++ b/include/linux/gfp.h

> @@ -583,8 +583,16 @@ extern void free_pages(unsigned long addr, unsigned int order);

>

>  struct page_frag_cache;

>  extern void __page_frag_cache_drain(struct page *page, unsigned int count);

> -extern void *page_frag_alloc(struct page_frag_cache *nc,

> -                            unsigned int fragsz, gfp_t gfp_mask);

> +extern void *page_frag_alloc_align(struct page_frag_cache *nc,

> +                                  unsigned int fragsz, gfp_t gfp_mask,

> +                                  int align);

> +

> +static inline void *page_frag_alloc(struct page_frag_cache *nc,

> +                            unsigned int fragsz, gfp_t gfp_mask)

> +{

> +       return page_frag_alloc_align(nc, fragsz, gfp_mask, 0);

> +}

> +

>  extern void page_frag_free(void *addr);

>

>  #define __free_page(page) __free_pages((page), 0)

> diff --git a/mm/page_alloc.c b/mm/page_alloc.c

> index 519a60d5b6f7..4667e7b6993b 100644

> --- a/mm/page_alloc.c

> +++ b/mm/page_alloc.c

> @@ -5137,8 +5137,8 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)

>  }

>  EXPORT_SYMBOL(__page_frag_cache_drain);

>

> -void *page_frag_alloc(struct page_frag_cache *nc,

> -                     unsigned int fragsz, gfp_t gfp_mask)

> +void *page_frag_alloc_align(struct page_frag_cache *nc,

> +                     unsigned int fragsz, gfp_t gfp_mask, int align)


I would make "align" unsigned since really we are using it as a mask.
Actually passing it as a mask might be even better. More on that
below.

>  {

>         unsigned int size = PAGE_SIZE;

>         struct page *page;

> @@ -5190,11 +5190,13 @@ void *page_frag_alloc(struct page_frag_cache *nc,

>         }

>

>         nc->pagecnt_bias--;

> +       if (align)

> +               offset = ALIGN_DOWN(offset, align);

>         nc->offset = offset;

>

>         return nc->va + offset;

>  }

> -EXPORT_SYMBOL(page_frag_alloc);

> +EXPORT_SYMBOL(page_frag_alloc_align);

>

>  /*

>   * Frees a page fragment allocated out of either a compound or order 0 page.


Rather than using the conditional branch it might be better to just do
"offset &= align_mask". Then you would be adding at most 1 instruction
which can likely occur in parallel with the other work that is going
on versus the conditional branch which requires a test, jump, and then
the 3 alignment instructions to do the subtraction, inversion, and
AND.

However it would ripple through the other patches as you would also
need to update you other patches to assume ~0 in the unaligned case,
however with your masked cases you could just use the negative
alignment value to generate your mask which would likely be taken care
of by the compiler.
Alexander Duyck Feb. 2, 2021, 4:26 p.m. UTC | #5
On Sun, Jan 31, 2021 at 12:17 AM Kevin Hao <haokexin@gmail.com> wrote:
>

> In the current implementation of {netdev,napi}_alloc_frag(), it doesn't

> have any align guarantee for the returned buffer address, But for some

> hardwares they do require the DMA buffer to be aligned correctly,

> so we would have to use some workarounds like below if the buffers

> allocated by the {netdev,napi}_alloc_frag() are used by these hardwares

> for DMA.

>     buf = napi_alloc_frag(really_needed_size + align);

>     buf = PTR_ALIGN(buf, align);

>

> These codes seems ugly and would waste a lot of memories if the buffers

> are used in a network driver for the TX/RX. We have added the align

> support for the page_frag functions, so add the corresponding

> {netdev,napi}_frag functions.

>

> Signed-off-by: Kevin Hao <haokexin@gmail.com>

> ---

> v2: Inline {netdev,napi}_alloc_frag().

>

>  include/linux/skbuff.h | 22 ++++++++++++++++++++--

>  net/core/skbuff.c      | 25 +++++++++----------------

>  2 files changed, 29 insertions(+), 18 deletions(-)

>

> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

> index 9313b5aaf45b..7e8beff4ff22 100644

> --- a/include/linux/skbuff.h

> +++ b/include/linux/skbuff.h

> @@ -2818,7 +2818,19 @@ void skb_queue_purge(struct sk_buff_head *list);

>

>  unsigned int skb_rbtree_purge(struct rb_root *root);

>

> -void *netdev_alloc_frag(unsigned int fragsz);

> +void *netdev_alloc_frag_align(unsigned int fragsz, int align);

> +

> +/**

> + * netdev_alloc_frag - allocate a page fragment

> + * @fragsz: fragment size

> + *

> + * Allocates a frag from a page for receive buffer.

> + * Uses GFP_ATOMIC allocations.

> + */

> +static inline void *netdev_alloc_frag(unsigned int fragsz)

> +{

> +       return netdev_alloc_frag_align(fragsz, 0);

> +}

>


So one thing we may want to do is actually split this up so that we
have a __netdev_alloc_frag_align function that is called by one of two
inline functions. The standard netdev_alloc_frag would be like what
you have here, however we would be passing ~0 for the mask.

The "align" version would be taking in an unsigned int align value and
converting it to a mask. The idea is that your mask value is likely a
constant so converting the constant to a mask would be much easier to
do in an inline function as the compiler can take care of converting
the value during compile time.

An added value to that is you could also add tests to the align value
to guarantee that the value being passed is a power of 2 so that it
works with the alignment mask generation as expected.

>  struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,

>                                    gfp_t gfp_mask);

> @@ -2877,7 +2889,13 @@ static inline void skb_free_frag(void *addr)

>         page_frag_free(addr);

>  }

>

> -void *napi_alloc_frag(unsigned int fragsz);

> +void *napi_alloc_frag_align(unsigned int fragsz, int align);

> +

> +static inline void *napi_alloc_frag(unsigned int fragsz)

> +{

> +       return napi_alloc_frag_align(fragsz, 0);

> +}

> +

>  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,

>                                  unsigned int length, gfp_t gfp_mask);

>  static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,


Same for the __napi_alloc_frag code. You could probably convert the
__napi_alloc_frag below into an __napi_alloc_frag_align that you pass
a mask to. Then you could convert the other two functions to either
pass ~0 or the align value and add align value validation.

> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

> index 2af12f7e170c..a35e75f12428 100644

> --- a/net/core/skbuff.c

> +++ b/net/core/skbuff.c

> @@ -374,29 +374,22 @@ struct napi_alloc_cache {

>  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);

>  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

>

> -static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)

> +static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask, int align)

>  {

>         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

>

> -       return page_frag_alloc(&nc->page, fragsz, gfp_mask);

> +       return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align);

>  }

>

> -void *napi_alloc_frag(unsigned int fragsz)

> +void *napi_alloc_frag_align(unsigned int fragsz, int align)

>  {

>         fragsz = SKB_DATA_ALIGN(fragsz);

>

> -       return __napi_alloc_frag(fragsz, GFP_ATOMIC);

> +       return __napi_alloc_frag(fragsz, GFP_ATOMIC, align);

>  }

> -EXPORT_SYMBOL(napi_alloc_frag);

> +EXPORT_SYMBOL(napi_alloc_frag_align);

>

> -/**

> - * netdev_alloc_frag - allocate a page fragment

> - * @fragsz: fragment size

> - *

> - * Allocates a frag from a page for receive buffer.

> - * Uses GFP_ATOMIC allocations.

> - */

> -void *netdev_alloc_frag(unsigned int fragsz)

> +void *netdev_alloc_frag_align(unsigned int fragsz, int align)

>  {

>         struct page_frag_cache *nc;

>         void *data;

> @@ -404,15 +397,15 @@ void *netdev_alloc_frag(unsigned int fragsz)

>         fragsz = SKB_DATA_ALIGN(fragsz);

>         if (in_irq() || irqs_disabled()) {

>                 nc = this_cpu_ptr(&netdev_alloc_cache);

> -               data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);

> +               data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align);

>         } else {

>                 local_bh_disable();

> -               data = __napi_alloc_frag(fragsz, GFP_ATOMIC);

> +               data = __napi_alloc_frag(fragsz, GFP_ATOMIC, align);

>                 local_bh_enable();

>         }

>         return data;

>  }

> -EXPORT_SYMBOL(netdev_alloc_frag);

> +EXPORT_SYMBOL(netdev_alloc_frag_align);

>

>  /**

>   *     __netdev_alloc_skb - allocate an skbuff for rx on a specific device

> --

> 2.29.2

>
Kevin Hao Feb. 4, 2021, 6:40 a.m. UTC | #6
On Tue, Feb 02, 2021 at 08:19:54AM -0800, Alexander Duyck wrote:
> On Sat, Jan 30, 2021 at 11:54 PM Kevin Hao <haokexin@gmail.com> wrote:

> >

> > In the current implementation of page_frag_alloc(), it doesn't have

> > any align guarantee for the returned buffer address. But for some

> > hardwares they do require the DMA buffer to be aligned correctly,

> > so we would have to use some workarounds like below if the buffers

> > allocated by the page_frag_alloc() are used by these hardwares for

> > DMA.

> >     buf = page_frag_alloc(really_needed_size + align);

> >     buf = PTR_ALIGN(buf, align);

> >

> > These codes seems ugly and would waste a lot of memories if the buffers

> > are used in a network driver for the TX/RX. So introduce

> > page_frag_alloc_align() to make sure that an aligned buffer address is

> > returned.

> >

> > Signed-off-by: Kevin Hao <haokexin@gmail.com>

> > Acked-by: Vlastimil Babka <vbabka@suse.cz>

> > ---

> > v2:

> >   - Inline page_frag_alloc()

> >   - Adopt Vlastimil's suggestion and add his Acked-by

> >

> >  include/linux/gfp.h | 12 ++++++++++--

> >  mm/page_alloc.c     |  8 +++++---

> >  2 files changed, 15 insertions(+), 5 deletions(-)

> >

> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h

> > index 6e479e9c48ce..39f4b3070d09 100644

> > --- a/include/linux/gfp.h

> > +++ b/include/linux/gfp.h

> > @@ -583,8 +583,16 @@ extern void free_pages(unsigned long addr, unsigned int order);

> >

> >  struct page_frag_cache;

> >  extern void __page_frag_cache_drain(struct page *page, unsigned int count);

> > -extern void *page_frag_alloc(struct page_frag_cache *nc,

> > -                            unsigned int fragsz, gfp_t gfp_mask);

> > +extern void *page_frag_alloc_align(struct page_frag_cache *nc,

> > +                                  unsigned int fragsz, gfp_t gfp_mask,

> > +                                  int align);

> > +

> > +static inline void *page_frag_alloc(struct page_frag_cache *nc,

> > +                            unsigned int fragsz, gfp_t gfp_mask)

> > +{

> > +       return page_frag_alloc_align(nc, fragsz, gfp_mask, 0);

> > +}

> > +

> >  extern void page_frag_free(void *addr);

> >

> >  #define __free_page(page) __free_pages((page), 0)

> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c

> > index 519a60d5b6f7..4667e7b6993b 100644

> > --- a/mm/page_alloc.c

> > +++ b/mm/page_alloc.c

> > @@ -5137,8 +5137,8 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)

> >  }

> >  EXPORT_SYMBOL(__page_frag_cache_drain);

> >

> > -void *page_frag_alloc(struct page_frag_cache *nc,

> > -                     unsigned int fragsz, gfp_t gfp_mask)

> > +void *page_frag_alloc_align(struct page_frag_cache *nc,

> > +                     unsigned int fragsz, gfp_t gfp_mask, int align)

> 

> I would make "align" unsigned since really we are using it as a mask.

> Actually passing it as a mask might be even better. More on that

> below.

> 

> >  {

> >         unsigned int size = PAGE_SIZE;

> >         struct page *page;

> > @@ -5190,11 +5190,13 @@ void *page_frag_alloc(struct page_frag_cache *nc,

> >         }

> >

> >         nc->pagecnt_bias--;

> > +       if (align)

> > +               offset = ALIGN_DOWN(offset, align);

> >         nc->offset = offset;

> >

> >         return nc->va + offset;

> >  }

> > -EXPORT_SYMBOL(page_frag_alloc);

> > +EXPORT_SYMBOL(page_frag_alloc_align);

> >

> >  /*

> >   * Frees a page fragment allocated out of either a compound or order 0 page.

> 

> Rather than using the conditional branch it might be better to just do

> "offset &= align_mask". Then you would be adding at most 1 instruction

> which can likely occur in parallel with the other work that is going

> on versus the conditional branch which requires a test, jump, and then

> the 3 alignment instructions to do the subtraction, inversion, and

> AND.


On arm64:

       if (align)
               offset = ALIGN_DOWN(offset, align);

	4b1503e2        neg     w2, w21
	710002bf        cmp     w21, #0x0
	0a020082        and     w2, w4, w2
	1a841044        csel    w4, w2, w4, ne  // ne = any


	offset &= align_mask

	0a0402a4        and     w4, w21, w4


Yes, we do cut 3 instructions by using align mask.

> 

> However it would ripple through the other patches as you would also

> need to update you other patches to assume ~0 in the unaligned case,

> however with your masked cases you could just use the negative

> alignment value to generate your mask which would likely be taken care

> of by the compiler.


Will do.

Thanks,
Kevin
Kevin Hao Feb. 4, 2021, 6:47 a.m. UTC | #7
On Tue, Feb 02, 2021 at 08:26:19AM -0800, Alexander Duyck wrote:
> On Sun, Jan 31, 2021 at 12:17 AM Kevin Hao <haokexin@gmail.com> wrote:

> >

> > In the current implementation of {netdev,napi}_alloc_frag(), it doesn't

> > have any align guarantee for the returned buffer address, But for some

> > hardwares they do require the DMA buffer to be aligned correctly,

> > so we would have to use some workarounds like below if the buffers

> > allocated by the {netdev,napi}_alloc_frag() are used by these hardwares

> > for DMA.

> >     buf = napi_alloc_frag(really_needed_size + align);

> >     buf = PTR_ALIGN(buf, align);

> >

> > These codes seems ugly and would waste a lot of memories if the buffers

> > are used in a network driver for the TX/RX. We have added the align

> > support for the page_frag functions, so add the corresponding

> > {netdev,napi}_frag functions.

> >

> > Signed-off-by: Kevin Hao <haokexin@gmail.com>

> > ---

> > v2: Inline {netdev,napi}_alloc_frag().

> >

> >  include/linux/skbuff.h | 22 ++++++++++++++++++++--

> >  net/core/skbuff.c      | 25 +++++++++----------------

> >  2 files changed, 29 insertions(+), 18 deletions(-)

> >

> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

> > index 9313b5aaf45b..7e8beff4ff22 100644

> > --- a/include/linux/skbuff.h

> > +++ b/include/linux/skbuff.h

> > @@ -2818,7 +2818,19 @@ void skb_queue_purge(struct sk_buff_head *list);

> >

> >  unsigned int skb_rbtree_purge(struct rb_root *root);

> >

> > -void *netdev_alloc_frag(unsigned int fragsz);

> > +void *netdev_alloc_frag_align(unsigned int fragsz, int align);

> > +

> > +/**

> > + * netdev_alloc_frag - allocate a page fragment

> > + * @fragsz: fragment size

> > + *

> > + * Allocates a frag from a page for receive buffer.

> > + * Uses GFP_ATOMIC allocations.

> > + */

> > +static inline void *netdev_alloc_frag(unsigned int fragsz)

> > +{

> > +       return netdev_alloc_frag_align(fragsz, 0);

> > +}

> >

> 

> So one thing we may want to do is actually split this up so that we

> have a __netdev_alloc_frag_align function that is called by one of two

> inline functions. The standard netdev_alloc_frag would be like what

> you have here, however we would be passing ~0 for the mask.

> 

> The "align" version would be taking in an unsigned int align value and

> converting it to a mask. The idea is that your mask value is likely a

> constant so converting the constant to a mask would be much easier to

> do in an inline function as the compiler can take care of converting

> the value during compile time.

> 

> An added value to that is you could also add tests to the align value

> to guarantee that the value being passed is a power of 2 so that it

> works with the alignment mask generation as expected.


Fair enough. Thanks Alexander.

Kevin

> 

> >  struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,

> >                                    gfp_t gfp_mask);

> > @@ -2877,7 +2889,13 @@ static inline void skb_free_frag(void *addr)

> >         page_frag_free(addr);

> >  }

> >

> > -void *napi_alloc_frag(unsigned int fragsz);

> > +void *napi_alloc_frag_align(unsigned int fragsz, int align);

> > +

> > +static inline void *napi_alloc_frag(unsigned int fragsz)

> > +{

> > +       return napi_alloc_frag_align(fragsz, 0);

> > +}

> > +

> >  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,

> >                                  unsigned int length, gfp_t gfp_mask);

> >  static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,

> 

> Same for the __napi_alloc_frag code. You could probably convert the

> __napi_alloc_frag below into an __napi_alloc_frag_align that you pass

> a mask to. Then you could convert the other two functions to either

> pass ~0 or the align value and add align value validation.

> 

> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c

> > index 2af12f7e170c..a35e75f12428 100644

> > --- a/net/core/skbuff.c

> > +++ b/net/core/skbuff.c

> > @@ -374,29 +374,22 @@ struct napi_alloc_cache {

> >  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);

> >  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

> >

> > -static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)

> > +static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask, int align)

> >  {

> >         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

> >

> > -       return page_frag_alloc(&nc->page, fragsz, gfp_mask);

> > +       return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align);

> >  }

> >

> > -void *napi_alloc_frag(unsigned int fragsz)

> > +void *napi_alloc_frag_align(unsigned int fragsz, int align)

> >  {

> >         fragsz = SKB_DATA_ALIGN(fragsz);

> >

> > -       return __napi_alloc_frag(fragsz, GFP_ATOMIC);

> > +       return __napi_alloc_frag(fragsz, GFP_ATOMIC, align);

> >  }

> > -EXPORT_SYMBOL(napi_alloc_frag);

> > +EXPORT_SYMBOL(napi_alloc_frag_align);

> >

> > -/**

> > - * netdev_alloc_frag - allocate a page fragment

> > - * @fragsz: fragment size

> > - *

> > - * Allocates a frag from a page for receive buffer.

> > - * Uses GFP_ATOMIC allocations.

> > - */

> > -void *netdev_alloc_frag(unsigned int fragsz)

> > +void *netdev_alloc_frag_align(unsigned int fragsz, int align)

> >  {

> >         struct page_frag_cache *nc;

> >         void *data;

> > @@ -404,15 +397,15 @@ void *netdev_alloc_frag(unsigned int fragsz)

> >         fragsz = SKB_DATA_ALIGN(fragsz);

> >         if (in_irq() || irqs_disabled()) {

> >                 nc = this_cpu_ptr(&netdev_alloc_cache);

> > -               data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);

> > +               data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align);

> >         } else {

> >                 local_bh_disable();

> > -               data = __napi_alloc_frag(fragsz, GFP_ATOMIC);

> > +               data = __napi_alloc_frag(fragsz, GFP_ATOMIC, align);

> >                 local_bh_enable();

> >         }

> >         return data;

> >  }

> > -EXPORT_SYMBOL(netdev_alloc_frag);

> > +EXPORT_SYMBOL(netdev_alloc_frag_align);

> >

> >  /**

> >   *     __netdev_alloc_skb - allocate an skbuff for rx on a specific device

> > --

> > 2.29.2

> >