[net-next,1/2,v3] net: socionext: different approach on DMA

Message ID 1538220482-16129-2-git-send-email-ilias.apalodimas@linaro.org
State New
Headers show
Series
  • net: socionext: XDP support
Related show

Commit Message

Ilias Apalodimas Sept. 29, 2018, 11:28 a.m.
Current driver dynamically allocates an skb and maps it as DMA rx buffer.
A following patch introduces XDP functionality, so we need a
different allocation scheme. Buffers are allocated dynamically and
mapped into hardware. During the Rx operation the driver uses
build_skb() to produce the necessary buffers for the network stack

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>

---
 drivers/net/ethernet/socionext/netsec.c | 238 +++++++++++++++++---------------
 1 file changed, 129 insertions(+), 109 deletions(-)

-- 
2.7.4

Comments

Jesper Dangaard Brouer Oct. 1, 2018, 9:26 a.m. | #1
On Sat, 29 Sep 2018 14:28:01 +0300 Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> +static void *netsec_alloc_rx_data(struct netsec_priv *priv,

> +				  dma_addr_t *dma_handle, u16 *desc_len)

> +{

> +	size_t len = priv->ndev->mtu + ETH_HLEN + 2 * VLAN_HLEN + NET_SKB_PAD +

> +		NET_IP_ALIGN;

> +	dma_addr_t mapping;

> +	void *buf;

> +

> +	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

> +	len = SKB_DATA_ALIGN(len);

> +

> +	buf = napi_alloc_frag(len);


Using napi_alloc_frag here ^^^^

> +	if (!buf)

> +		return NULL;

> +

> +	mapping = dma_map_single(priv->dev, buf, len, DMA_FROM_DEVICE);

> +	if (unlikely(dma_mapping_error(priv->dev, mapping)))

> +		goto err_out;

> +

> +	*dma_handle = mapping;

> +	*desc_len = len;

> +

> +	return buf;

> +

> +err_out:

> +	skb_free_frag(buf);

> +	return NULL;

> +}


Hmmm, you are using napi_alloc_frag() in above code, which behind
your-back allocates order-3 pages (32 Kbytes memory in 8 order-0 pages).

This violates at-least two XDP principals:

#1: You are NOT using order-0 page based allocations for XDP.

Notice, I'm not saying 1-page per packet, as ixgbe + i40e violated
this, and it is now "per-practical-code-example" acceptable to split up
the order-0 page, and store two RX frames per order-0 page (4096 bytes).
(To make this fit you have to reduce XDP_HEADROOM to 192 bytes, which
killed the idea of placing the SKB in this area).

#2: You have allocations on the XDP fast-path.

The REAL secret behind the XDP performance is to avoid allocations on
the fast-path.  While I just told you to use the page-allocator and
order-0 pages, this will actually kill performance.  Thus, to make this
fast, you need a driver local recycle scheme that avoids going through
the page allocator, which makes XDP_DROP and XDP_TX extremely fast.
For the XDP_REDIRECT action (which you seems to be interested in, as
this is needed for AF_XDP), there is a xdp_return_frame() API that can
make this fast.

To avoid every driver inventing their own driver local page-recycle
cache (which many does today), we/I have created the page pool API.
See include/net/page_pool.h, and look at how mlx5 driver uses it
in v4.18 links[1][2][3].  Do notice, that mlx5 ALSO have a driver
recycle scheme on top, which Tariq is working on removing or
generalizing.  AND also that mlx5 does not use the DMA mapping feature
that page_pool also provide yet. (Contact me if you want to use
page_pool for handling DMA mapping, we might need to export
__page_pool_clean_page and call it before XDP_PASS action).


[1] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L226
[2] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L255
[3] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_main.c#L598-L618
-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
Ilias Apalodimas Oct. 1, 2018, 9:44 a.m. | #2
On Mon, Oct 01, 2018 at 11:26:31AM +0200, Jesper Dangaard Brouer wrote:
> 

> On Sat, 29 Sep 2018 14:28:01 +0300 Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> 

> > +static void *netsec_alloc_rx_data(struct netsec_priv *priv,

> > +				  dma_addr_t *dma_handle, u16 *desc_len)

> > +{

> > +	size_t len = priv->ndev->mtu + ETH_HLEN + 2 * VLAN_HLEN + NET_SKB_PAD +

> > +		NET_IP_ALIGN;

> > +	dma_addr_t mapping;

> > +	void *buf;

> > +

> > +	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

> > +	len = SKB_DATA_ALIGN(len);

> > +

> > +	buf = napi_alloc_frag(len);

> 

> Using napi_alloc_frag here ^^^^

> 

> > +	if (!buf)

> > +		return NULL;

> > +

> > +	mapping = dma_map_single(priv->dev, buf, len, DMA_FROM_DEVICE);

> > +	if (unlikely(dma_mapping_error(priv->dev, mapping)))

> > +		goto err_out;

> > +

> > +	*dma_handle = mapping;

> > +	*desc_len = len;

> > +

> > +	return buf;

> > +

> > +err_out:

> > +	skb_free_frag(buf);

> > +	return NULL;

> > +}

> 

> Hmmm, you are using napi_alloc_frag() in above code, which behind

> your-back allocates order-3 pages (32 Kbytes memory in 8 order-0 pages).

> 

> This violates at-least two XDP principals:

> 

> #1: You are NOT using order-0 page based allocations for XDP.

> 

> Notice, I'm not saying 1-page per packet, as ixgbe + i40e violated

> this, and it is now "per-practical-code-example" acceptable to split up

> the order-0 page, and store two RX frames per order-0 page (4096 bytes).

> (To make this fit you have to reduce XDP_HEADROOM to 192 bytes, which

> killed the idea of placing the SKB in this area).

Yes i saw the Intel implementation. I just thought it wasn't worth the hassle
for a 1Gbit interface (but wasn't aware it violates an XDP principle).
I also noticed that Netronome (and others) are allocating 1 page per packet
when using XDP
> 

> #2: You have allocations on the XDP fast-path.

> 

> The REAL secret behind the XDP performance is to avoid allocations on

> the fast-path.  While I just told you to use the page-allocator and

> order-0 pages, this will actually kill performance.  Thus, to make this

> fast, you need a driver local recycle scheme that avoids going through

> the page allocator, which makes XDP_DROP and XDP_TX extremely fast.

> For the XDP_REDIRECT action (which you seems to be interested in, as

> this is needed for AF_XDP), there is a xdp_return_frame() API that can

> make this fast.

I had an initial implementation that did exactly that (that's why the
dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case
of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh
buffers back to the hardware only when your packets have been processed by
your userspace application
> 

> To avoid every driver inventing their own driver local page-recycle

> cache (which many does today), we/I have created the page pool API.

> See include/net/page_pool.h, and look at how mlx5 driver uses it

> in v4.18 links[1][2][3].  Do notice, that mlx5 ALSO have a driver

> recycle scheme on top, which Tariq is working on removing or

> generalizing.  AND also that mlx5 does not use the DMA mapping feature

> that page_pool also provide yet. (Contact me if you want to use

> page_pool for handing DMA mapping, we might need to export

> __page_pool_clean_page and call it before XDP_PASS action).

Ok i'll have a look at that and let you know.

P.S : A few months back we reported that Chelsio is using a different
'memory scheme' for incoming packets. Essentially they just feed the hardware
with unstructured pages and it decides where to place
the packet. Maybe that's worth considering for the page pool API?

> 

> 

> [1] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L226

> [2] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L255

> [3] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_main.c#L598-L618



Thanks
/Ilias
Ilias Apalodimas Oct. 1, 2018, 9:56 a.m. | #3
> > #2: You have allocations on the XDP fast-path.

> > 

> > The REAL secret behind the XDP performance is to avoid allocations on

> > the fast-path.  While I just told you to use the page-allocator and

> > order-0 pages, this will actually kill performance.  Thus, to make this

> > fast, you need a driver local recycle scheme that avoids going through

> > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.

> > For the XDP_REDIRECT action (which you seems to be interested in, as

> > this is needed for AF_XDP), there is a xdp_return_frame() API that can

> > make this fast.

> I had an initial implementation that did exactly that (that's why you the

> dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case 

> of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh

> buffers back to the hardware only when your packets have been processed from

> your userspace application

Just a clarification here. This is the case if ZC is implemented. In my case
the buffers will be 'ok' to be passed back to the hardware once the
userspace payload has been copied by xdp_do_redirect()

/Ilias
Jesper Dangaard Brouer Oct. 1, 2018, 11:03 a.m. | #4
On Mon, 1 Oct 2018 12:56:58 +0300
Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> > > #2: You have allocations on the XDP fast-path.

> > > 

> > > The REAL secret behind the XDP performance is to avoid allocations on

> > > the fast-path.  While I just told you to use the page-allocator and

> > > order-0 pages, this will actually kill performance.  Thus, to make this

> > > fast, you need a driver local recycle scheme that avoids going through

> > > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.

> > > For the XDP_REDIRECT action (which you seems to be interested in, as

> > > this is needed for AF_XDP), there is a xdp_return_frame() API that can

> > > make this fast.  

> >

> > I had an initial implementation that did exactly that (that's why you the

> > dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case 

> > of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh

> > buffers back to the hardware only when your packets have been processed from

> > your userspace application 

>

> Just a clarification here. This is the case if ZC is implemented. In my case

> the buffers will be 'ok' to be passed back to the hardware once the use

> userspace payload has been copied by xdp_do_redirect()


Thanks for clarifying.  But no, this is not introducing a 'bottleneck'
for AF_XDP.

For (1) the copy-mode-AF_XDP the frame (as you noticed) is "freed" or
"returned" very quickly after it is copied.  The code is a bit hard to
follow, but in __xsk_rcv() it calls xdp_return_buff() after the memcpy.
Thus, the frame can be kept DMA mapped and reused in RX-ring quickly.

For (2) the zero-copy-AF_XDP, then you need to implement a new
allocator of type MEM_TYPE_ZERO_COPY.  The performance trick here is
that all DMA-map/unmap and allocations go away, given everything is
preallocated by userspace.  Though the 4 rings (SPSC) are used for
recycling the ZC-umem frames (read Documentation/networking/af_xdp.rst).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
Ilias Apalodimas Oct. 1, 2018, 11:20 a.m. | #5
On Mon, Oct 01, 2018 at 01:03:13PM +0200, Jesper Dangaard Brouer wrote:
> On Mon, 1 Oct 2018 12:56:58 +0300

> Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> 

> > > > #2: You have allocations on the XDP fast-path.

> > > > 

> > > > The REAL secret behind the XDP performance is to avoid allocations on

> > > > the fast-path.  While I just told you to use the page-allocator and

> > > > order-0 pages, this will actually kill performance.  Thus, to make this

> > > > fast, you need a driver local recycle scheme that avoids going through

> > > > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.

> > > > For the XDP_REDIRECT action (which you seems to be interested in, as

> > > > this is needed for AF_XDP), there is a xdp_return_frame() API that can

> > > > make this fast.  

> > >

> > > I had an initial implementation that did exactly that (that's why you the

> > > dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case 

> > > of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh

> > > buffers back to the hardware only when your packets have been processed from

> > > your userspace application 

> >

> > Just a clarification here. This is the case if ZC is implemented. In my case

> > the buffers will be 'ok' to be passed back to the hardware once the use

> > userspace payload has been copied by xdp_do_redirect()

> 

> Thanks for clarifying.  But no, this is not introducing a 'bottleneck'

> for AF_XDP.

> 

> For (1) the copy-mode-AF_XDP the frame (as you noticed) is "freed" or

> "returned" very quickly after it is copied.  The code is a bit hard to

> follow, but in __xsk_rcv() it calls xdp_return_buff() after the memcpy.

> Thus, the frame can be kept DMA mapped and reused in RX-ring quickly.

Ok makes sense. I'll send a v4 with page re-usage, while using your API for page
allocation

> 

> For (2) the zero-copy-AF_XDP, then you need to implement a new

> allocator of type MEM_TYPE_ZERO_COPY.  The performance trick here is

> that all DMA-map/unmap and allocations go away, given everything is

> preallocated by userspace.  Through the 4 rings (SPSC) are used for

> recycling the ZC-umem frames (read Documentation/networking/af_xdp.rst).

Noted in case we implement ZC support

Thanks
/Ilias
Jesper Dangaard Brouer Oct. 1, 2018, 1:48 p.m. | #6
On Mon, 1 Oct 2018 14:20:21 +0300
Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> On Mon, Oct 01, 2018 at 01:03:13PM +0200, Jesper Dangaard Brouer wrote:

> > On Mon, 1 Oct 2018 12:56:58 +0300

> > Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> >   

> > > > > #2: You have allocations on the XDP fast-path.

> > > > > 

> > > > > The REAL secret behind the XDP performance is to avoid allocations on

> > > > > the fast-path.  While I just told you to use the page-allocator and

> > > > > order-0 pages, this will actually kill performance.  Thus, to make this

> > > > > fast, you need a driver local recycle scheme that avoids going through

> > > > > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.

> > > > > For the XDP_REDIRECT action (which you seems to be interested in, as

> > > > > this is needed for AF_XDP), there is a xdp_return_frame() API that can

> > > > > make this fast.    

> > > >

> > > > I had an initial implementation that did exactly that (that's why you the

> > > > dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case 

> > > > of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh

> > > > buffers back to the hardware only when your packets have been processed from

> > > > your userspace application   

> > >

> > > Just a clarification here. This is the case if ZC is implemented. In my case

> > > the buffers will be 'ok' to be passed back to the hardware once the use

> > > userspace payload has been copied by xdp_do_redirect()  

> > 

> > Thanks for clarifying.  But no, this is not introducing a 'bottleneck'

> > for AF_XDP.

> > 

> > For (1) the copy-mode-AF_XDP the frame (as you noticed) is "freed" or

> > "returned" very quickly after it is copied.  The code is a bit hard to

> > follow, but in __xsk_rcv() it calls xdp_return_buff() after the memcpy.

> > Thus, the frame can be kept DMA mapped and reused in RX-ring quickly.

>  

> Ok makes sense. I'll send a v4 with page re-usage, while using your

> API for page allocation


Sound good, BUT do notice that using the bare page_pool, will/should
give you increased XDP performance, but might slow-down normal network
stack delivery, because netstack will not call xdp_return_frame() and
instead falls back to returning the pages through the page-allocator.

I'm very interested in knowing what performance increase you see with
XDP_DROP, with just a "bare" page_pool implementation.

The mlx5 driver does not see this netstack slowdown, because it has a
hybrid approach of maintaining a recycle ring for frames going into
netstack, by bumping the refcnt.  I think Tariq is cleaning this up.
The mlx5 code is hard to follow... in mlx5e_xdp_handle()[1] the
refcnt==1 and a bit is set. And in [2] the refcnt is page_ref_inc(),
and bit is caught in [3].  (This really need to be cleaned up and
generalized).



[1] https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c#L83-L88
    https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L952-L959

[2] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1015-L1025

[3] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1094-L1098

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
Ilias Apalodimas Oct. 1, 2018, 2:37 p.m. | #7
On Mon, Oct 01, 2018 at 03:48:45PM +0200, Jesper Dangaard Brouer wrote:
> On Mon, 1 Oct 2018 14:20:21 +0300

> Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> 

> > On Mon, Oct 01, 2018 at 01:03:13PM +0200, Jesper Dangaard Brouer wrote:

> > > On Mon, 1 Oct 2018 12:56:58 +0300

> > > Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> > >   

> > > > > > #2: You have allocations on the XDP fast-path.

> > > > > > 

> > > > > > The REAL secret behind the XDP performance is to avoid allocations on

> > > > > > the fast-path.  While I just told you to use the page-allocator and

> > > > > > order-0 pages, this will actually kill performance.  Thus, to make this

> > > > > > fast, you need a driver local recycle scheme that avoids going through

> > > > > > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.

> > > > > > For the XDP_REDIRECT action (which you seems to be interested in, as

> > > > > > this is needed for AF_XDP), there is a xdp_return_frame() API that can

> > > > > > make this fast.    

> > > > >

> > > > > I had an initial implementation that did exactly that (that's why you the

> > > > > dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case 

> > > > > of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh

> > > > > buffers back to the hardware only when your packets have been processed from

> > > > > your userspace application   

> > > >

> > > > Just a clarification here. This is the case if ZC is implemented. In my case

> > > > the buffers will be 'ok' to be passed back to the hardware once the use

> > > > userspace payload has been copied by xdp_do_redirect()  

> > > 

> > > Thanks for clarifying.  But no, this is not introducing a 'bottleneck'

> > > for AF_XDP.

> > > 

> > > For (1) the copy-mode-AF_XDP the frame (as you noticed) is "freed" or

> > > "returned" very quickly after it is copied.  The code is a bit hard to

> > > follow, but in __xsk_rcv() it calls xdp_return_buff() after the memcpy.

> > > Thus, the frame can be kept DMA mapped and reused in RX-ring quickly.

> >  

> > Ok makes sense. I'll send a v4 with page re-usage, while using your

> > API for page allocation

> 

> Sound good, BUT do notice that using the bare page_pool, will/should

> give you increased XDP performance, but might slow-down normal network

> stack delivery, because netstack will not call xdp_return_frame() and

> instead falls back to returning the pages through the page-allocator.

> 

> I'm very interested in knowing what performance increase you see with

> XDP_DROP, with just a "bare" page_pool implementation.

When i was just syncing the page fragments instead of unmap -> alloc -> map i
was getting ~340kpps (with XDP_REDIRECT). I ended up with 320kpps on this patch.
I did a couple of more changes though (like the dma mapping when allocating 
the buffers) so i am not 100% sure what caused that difference
I'll let you know once i finish up the code using the API for page allocation

Regarding the change and the 'bottleneck' discussion we had. XDP_REDIRECT is
straight forward (non ZC mode). I agree with you that since the payload is
pretty much immediately copied before being flushed to the userspace, it's
unlikely you'll end up delaying the hardware (starving without buffers).
Do you think that's the same for XDP_TX? The DMA buffer will need to be synced
for the CPU, then you ring a doorbell with X packets. After that you'll have to
wait for the Tx completion and resync the buffers to the device. So you actually
make your Rx descriptors dependent on your Tx completion (and keep in mind this
NIC only has 1 queue per direction)

Now for the measurements part, i'll have to check with the vendor if the
interface can do more than 340kpps and we are missing something
performance-wise. 
Have you done any tests with IOMMU enabled/disabled? In theory the dma recycling
will shine against map/unmap when IOMMU is on (and the IOMMU stressed i.e have a
different NIC doing a traffic test)

> 

> The mlx5 driver does not see this netstack slowdown, because it have a

> hybrid approach of maintaining a recycle ring for frames going into

> netstack, by bumping the refcnt.  I think Tariq is cleaning this up.

> The mlx5 code is hard to follow... in mlx5e_xdp_handle()[1] the

> refcnt==1 and a bit is set. And in [2] the refcnt is page_ref_inc(),

> and bit is caught in [3].  (This really need to be cleaned up and

> generalized).

I've read most of the XDP related code on Intel/Mellanox
before starting my patch series. I'll have a closer look now, thanks!
> 

> 

> 

> [1] https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c#L83-L88

>     https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L952-L959

> 

> [2] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1015-L1025

> 

> [3] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1094-L1098

> 

> -- 

> Best regards,

>   Jesper Dangaard Brouer

>   MSc.CS, Principal Kernel Engineer at Red Hat

>   LinkedIn: http://www.linkedin.com/in/brouer
Jesper Dangaard Brouer Oct. 1, 2018, 3:58 p.m. | #8
On Mon, 1 Oct 2018 17:37:06 +0300
Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> On Mon, Oct 01, 2018 at 03:48:45PM +0200, Jesper Dangaard Brouer wrote:

> > On Mon, 1 Oct 2018 14:20:21 +0300

> > Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> >   

> > > On Mon, Oct 01, 2018 at 01:03:13PM +0200, Jesper Dangaard Brouer wrote:  

> > > > On Mon, 1 Oct 2018 12:56:58 +0300

> > > > Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> > > >     

> > > > > > > #2: You have allocations on the XDP fast-path.

> > > > > > > 

> > > > > > > The REAL secret behind the XDP performance is to avoid allocations on

> > > > > > > the fast-path.  While I just told you to use the page-allocator and

> > > > > > > order-0 pages, this will actually kill performance.  Thus, to make this

> > > > > > > fast, you need a driver local recycle scheme that avoids going through

> > > > > > > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.

> > > > > > > For the XDP_REDIRECT action (which you seems to be interested in, as

> > > > > > > this is needed for AF_XDP), there is a xdp_return_frame() API that can

> > > > > > > make this fast.      

> > > > > >

> > > > > > I had an initial implementation that did exactly that (that's why you the

> > > > > > dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case 

> > > > > > of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh

> > > > > > buffers back to the hardware only when your packets have been processed from

> > > > > > your userspace application     

> > > > >

> > > > > Just a clarification here. This is the case if ZC is implemented. In my case

> > > > > the buffers will be 'ok' to be passed back to the hardware once the use

> > > > > userspace payload has been copied by xdp_do_redirect()    

> > > > 

> > > > Thanks for clarifying.  But no, this is not introducing a 'bottleneck'

> > > > for AF_XDP.

> > > > 

> > > > For (1) the copy-mode-AF_XDP the frame (as you noticed) is "freed" or

> > > > "returned" very quickly after it is copied.  The code is a bit hard to

> > > > follow, but in __xsk_rcv() it calls xdp_return_buff() after the memcpy.

> > > > Thus, the frame can be kept DMA mapped and reused in RX-ring quickly.  

> > >  

> > > Ok makes sense. I'll send a v4 with page re-usage, while using your

> > > API for page allocation  

> > 

> > Sound good, BUT do notice that using the bare page_pool, will/should

> > give you increased XDP performance, but might slow-down normal network

> > stack delivery, because netstack will not call xdp_return_frame() and

> > instead falls back to returning the pages through the page-allocator.

> > 

> > I'm very interested in knowing what performance increase you see with

> > XDP_DROP, with just a "bare" page_pool implementation.  

>

> When i was just syncing the page fragments instead of unmap -> alloc -> map i

> was getting ~340kpps (with XDP_REDIRECT). I ended up with 320kpps on this patch.


I explicitly asked for the XDP_DROP performance... because it will tell
me/us if your hardware is actually the bottleneck.  For your specific
hardware, you might be limited by the cost of DMA-sync.  It might be
faster to use the DMA-map/unmap calls(?).

I'm hinting you should take one step at a time, and measure.  Knowing
and identifying the limits, is essential. Else you are doing blind
optimizations. If you don't know the HW limit, then you don't know what
the gap is to optimum (and then you don't know when to stop optimizing).


> I did a couple of more changes though (like the dma mapping when allocating 

> the buffers) so i am not 100% sure what caused that difference

> I'll let you know once i finish up the code using the API for page allocation

>

> Regarding the change and the 'bottleneck' discussion we had. XDP_REDIRECT is

> straight forward (non ZC mode). I agree with you that since the payload is

> pretty much immediately copied before being flushed to the userspace, it's

> unlikely you'll end up delaying the hardware (starving without buffers).

> Do you think that's the same for XDP_TX? The DMA buffer will need to be synced 

> for the CPU, then you ring a doorbell with X packets. After that you'll have to

> wait for the Tx completion and resync the buffers to the device. So you actually

> make your Rx descriptors depending on your Tx completion (and keep in mind this

> NIC only has 1 queue per direction)


The page_pool will cache pages (it has a pool of pages) that should be
large enough to handle some pages being outstanding in the TX completion
queue.

>

> Now for the measurements part, i'll have to check with the vendor if the

> interface can do more than 340kpps and we are missing something

> performance-wise. 


Yes, please. The XDP_DROP test I requested above is exactly an attempt
to determine what the NIC HW limits are... esle you are working blindly.


> Have you done any tests with IOMMU enabled/disabled? In theory the dma recycling

> will shine against map/unmap when IOMMU is on (and the IOMMU stressed i.e have a

> different NIC doing a traffic test)


Nope, I could not get the IOMMU working on my testlab, last time I
tried to activate it.  Hench, why I have not implemented/optimized DMA
map/unmap stuff too much (e.g. mlx5 currently does a DMA unmap for
XDP_REDIRECT, which should be fixed).
 
 
> > The mlx5 driver does not see this netstack slowdown, because it

> > have a hybrid approach of maintaining a recycle ring for frames

> > going into netstack, by bumping the refcnt.  I think Tariq is

> > cleaning this up. The mlx5 code is hard to follow... in

> > mlx5e_xdp_handle()[1] the refcnt==1 and a bit is set. And in [2]

> > the refcnt is page_ref_inc(), and bit is caught in [3].  (This

> > really need to be cleaned up and generalized).  

>

> I've read most of the XDP related code on Intel/Mellanox

> before starting my patch series. I'll have a closer look now, thanks!

> > 

> > 

> > 

> > [1]

> > https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c#L83-L88

> > https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L952-L959

> > 

> > [2]

> > https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1015-L1025

> > 

> > [3]

> > https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1094-L1098




-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

Patch

diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index 7aa5ebb..8f788a1 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -296,6 +296,11 @@  struct netsec_rx_pkt_info {
 	bool err_flag;
 };
 
+static void netsec_rx_fill(struct netsec_priv *priv, u16 from, u16 num);
+
+static void *netsec_alloc_rx_data(struct netsec_priv *priv,
+				  dma_addr_t *dma_addr, u16 *len);
+
 static void netsec_write(struct netsec_priv *priv, u32 reg_addr, u32 val)
 {
 	writel(val, priv->ioaddr + reg_addr);
@@ -556,34 +561,10 @@  static const struct ethtool_ops netsec_ethtool_ops = {
 
 /************* NETDEV_OPS FOLLOW *************/
 
-static struct sk_buff *netsec_alloc_skb(struct netsec_priv *priv,
-					struct netsec_desc *desc)
-{
-	struct sk_buff *skb;
-
-	if (device_get_dma_attr(priv->dev) == DEV_DMA_COHERENT) {
-		skb = netdev_alloc_skb_ip_align(priv->ndev, desc->len);
-	} else {
-		desc->len = L1_CACHE_ALIGN(desc->len);
-		skb = netdev_alloc_skb(priv->ndev, desc->len);
-	}
-	if (!skb)
-		return NULL;
-
-	desc->addr = skb->data;
-	desc->dma_addr = dma_map_single(priv->dev, desc->addr, desc->len,
-					DMA_FROM_DEVICE);
-	if (dma_mapping_error(priv->dev, desc->dma_addr)) {
-		dev_kfree_skb_any(skb);
-		return NULL;
-	}
-	return skb;
-}
 
 static void netsec_set_rx_de(struct netsec_priv *priv,
 			     struct netsec_desc_ring *dring, u16 idx,
-			     const struct netsec_desc *desc,
-			     struct sk_buff *skb)
+			     const struct netsec_desc *desc)
 {
 	struct netsec_de *de = dring->vaddr + DESC_SZ * idx;
 	u32 attr = (1 << NETSEC_RX_PKT_OWN_FIELD) |
@@ -602,59 +583,6 @@  static void netsec_set_rx_de(struct netsec_priv *priv,
 	dring->desc[idx].dma_addr = desc->dma_addr;
 	dring->desc[idx].addr = desc->addr;
 	dring->desc[idx].len = desc->len;
-	dring->desc[idx].skb = skb;
-}
-
-static struct sk_buff *netsec_get_rx_de(struct netsec_priv *priv,
-					struct netsec_desc_ring *dring,
-					u16 idx,
-					struct netsec_rx_pkt_info *rxpi,
-					struct netsec_desc *desc, u16 *len)
-{
-	struct netsec_de de = {};
-
-	memcpy(&de, dring->vaddr + DESC_SZ * idx, DESC_SZ);
-
-	*len = de.buf_len_info >> 16;
-
-	rxpi->err_flag = (de.attr >> NETSEC_RX_PKT_ER_FIELD) & 1;
-	rxpi->rx_cksum_result = (de.attr >> NETSEC_RX_PKT_CO_FIELD) & 3;
-	rxpi->err_code = (de.attr >> NETSEC_RX_PKT_ERR_FIELD) &
-							NETSEC_RX_PKT_ERR_MASK;
-	*desc = dring->desc[idx];
-	return desc->skb;
-}
-
-static struct sk_buff *netsec_get_rx_pkt_data(struct netsec_priv *priv,
-					      struct netsec_rx_pkt_info *rxpi,
-					      struct netsec_desc *desc,
-					      u16 *len)
-{
-	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
-	struct sk_buff *tmp_skb, *skb = NULL;
-	struct netsec_desc td;
-	int tail;
-
-	*rxpi = (struct netsec_rx_pkt_info){};
-
-	td.len = priv->ndev->mtu + 22;
-
-	tmp_skb = netsec_alloc_skb(priv, &td);
-
-	tail = dring->tail;
-
-	if (!tmp_skb) {
-		netsec_set_rx_de(priv, dring, tail, &dring->desc[tail],
-				 dring->desc[tail].skb);
-	} else {
-		skb = netsec_get_rx_de(priv, dring, tail, rxpi, desc, len);
-		netsec_set_rx_de(priv, dring, tail, &td, tmp_skb);
-	}
-
-	/* move tail ahead */
-	dring->tail = (dring->tail + 1) % DESC_NUM;
-
-	return skb;
 }
 
 static int netsec_clean_tx_dring(struct netsec_priv *priv, int budget)
@@ -721,19 +649,28 @@  static int netsec_process_tx(struct netsec_priv *priv, int budget)
 	return done;
 }
 
+static void netsec_adv_desc(u16 *idx)
+{
+	*idx = *idx + 1;
+	if (unlikely(*idx >= DESC_NUM))
+		*idx = 0;
+}
+
 static int netsec_process_rx(struct netsec_priv *priv, int budget)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
 	struct net_device *ndev = priv->ndev;
-	struct netsec_rx_pkt_info rx_info;
-	int done = 0;
-	struct netsec_desc desc;
 	struct sk_buff *skb;
-	u16 len;
+	int done = 0;
 
 	while (done < budget) {
 		u16 idx = dring->tail;
 		struct netsec_de *de = dring->vaddr + (DESC_SZ * idx);
+		struct netsec_desc *desc = &dring->desc[idx];
+		struct netsec_rx_pkt_info rpi;
+		u16 pkt_len, desc_len;
+		dma_addr_t dma_handle;
+		void *buf_addr;
 
 		if (de->attr & (1U << NETSEC_RX_PKT_OWN_FIELD))
 			break;
@@ -744,28 +681,62 @@  static int netsec_process_rx(struct netsec_priv *priv, int budget)
 		 */
 		dma_rmb();
 		done++;
-		skb = netsec_get_rx_pkt_data(priv, &rx_info, &desc, &len);
-		if (unlikely(!skb) || rx_info.err_flag) {
+
+		pkt_len = de->buf_len_info >> 16;
+		rpi.err_code = (de->attr >> NETSEC_RX_PKT_ERR_FIELD) &
+			NETSEC_RX_PKT_ERR_MASK;
+		rpi.err_flag = (de->attr >> NETSEC_RX_PKT_ER_FIELD) & 1;
+		if (rpi.err_flag) {
 			netif_err(priv, drv, priv->ndev,
-				  "%s: rx fail err(%d)\n",
-				  __func__, rx_info.err_code);
+				  "%s: rx fail err(%d)\n", __func__,
+				  rpi.err_code);
 			ndev->stats.rx_dropped++;
+			netsec_adv_desc(&dring->tail);
+			/* reuse buffer page frag */
+			netsec_rx_fill(priv, idx, 1);
 			continue;
 		}
+		rpi.rx_cksum_result = (de->attr >> NETSEC_RX_PKT_CO_FIELD) & 3;
 
-		dma_unmap_single(priv->dev, desc.dma_addr, desc.len,
-				 DMA_FROM_DEVICE);
-		skb_put(skb, len);
+		buf_addr = netsec_alloc_rx_data(priv, &dma_handle, &desc_len);
+		if (unlikely(!buf_addr))
+			break;
+
+		dma_sync_single_for_cpu(priv->dev, desc->dma_addr, pkt_len,
+					DMA_FROM_DEVICE);
+		prefetch(desc->addr);
+
+		skb = build_skb(desc->addr, desc->len);
+		if (unlikely(!skb)) {
+			dma_unmap_single(priv->dev, dma_handle, desc_len,
+					 DMA_TO_DEVICE);
+			skb_free_frag(buf_addr);
+			netif_err(priv, drv, priv->ndev,
+				  "rx failed to alloc skb\n");
+			break;
+		}
+		dma_unmap_single_attrs(priv->dev, desc->dma_addr, desc->len,
+				       DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
+
+		/* Update the descriptor with fresh buffers */
+		desc->len = desc_len;
+		desc->dma_addr = dma_handle;
+		desc->addr = buf_addr;
+
+		skb_put(skb, pkt_len);
 		skb->protocol = eth_type_trans(skb, priv->ndev);
 
 		if (priv->rx_cksum_offload_flag &&
-		    rx_info.rx_cksum_result == NETSEC_RX_CKSUM_OK)
+		    rpi.rx_cksum_result == NETSEC_RX_CKSUM_OK)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 		if (napi_gro_receive(&priv->napi, skb) != GRO_DROP) {
 			ndev->stats.rx_packets++;
-			ndev->stats.rx_bytes += len;
+			ndev->stats.rx_bytes += pkt_len;
 		}
+
+		netsec_rx_fill(priv, idx, 1);
+		netsec_adv_desc(&dring->tail);
 	}
 
 	return done;
@@ -928,7 +899,10 @@  static void netsec_uninit_pkt_dring(struct netsec_priv *priv, int id)
 		dma_unmap_single(priv->dev, desc->dma_addr, desc->len,
 				 id == NETSEC_RING_RX ? DMA_FROM_DEVICE :
 							      DMA_TO_DEVICE);
-		dev_kfree_skb(desc->skb);
+		if (id == NETSEC_RING_RX)
+			skb_free_frag(desc->addr);
+		else if (id == NETSEC_RING_TX)
+			dev_kfree_skb(desc->skb);
 	}
 
 	memset(dring->desc, 0, sizeof(struct netsec_desc) * DESC_NUM);
@@ -953,50 +927,96 @@  static void netsec_free_dring(struct netsec_priv *priv, int id)
 	dring->desc = NULL;
 }
 
+static void *netsec_alloc_rx_data(struct netsec_priv *priv,
+				  dma_addr_t *dma_handle, u16 *desc_len)
+{
+	size_t len = priv->ndev->mtu + ETH_HLEN + 2 * VLAN_HLEN + NET_SKB_PAD +
+		NET_IP_ALIGN;
+	dma_addr_t mapping;
+	void *buf;
+
+	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	len = SKB_DATA_ALIGN(len);
+
+	buf = napi_alloc_frag(len);
+	if (!buf)
+		return NULL;
+
+	mapping = dma_map_single(priv->dev, buf, len, DMA_FROM_DEVICE);
+	if (unlikely(dma_mapping_error(priv->dev, mapping)))
+		goto err_out;
+
+	*dma_handle = mapping;
+	*desc_len = len;
+
+	return buf;
+
+err_out:
+	skb_free_frag(buf);
+	return NULL;
+}
+
+static void netsec_rx_fill(struct netsec_priv *priv, u16 from, u16 num)
+{
+	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
+	u16 idx = from;
+
+	while (num) {
+		netsec_set_rx_de(priv, dring, idx, &dring->desc[idx]);
+		idx++;
+		if (idx >= DESC_NUM)
+			idx = 0;
+		num--;
+	}
+}
+
 static int netsec_alloc_dring(struct netsec_priv *priv, enum ring_id id)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[id];
-	int ret = 0;
 
 	dring->vaddr = dma_zalloc_coherent(priv->dev, DESC_SZ * DESC_NUM,
 					   &dring->desc_dma, GFP_KERNEL);
-	if (!dring->vaddr) {
-		ret = -ENOMEM;
+	if (!dring->vaddr)
 		goto err;
-	}
 
 	dring->desc = kcalloc(DESC_NUM, sizeof(*dring->desc), GFP_KERNEL);
-	if (!dring->desc) {
-		ret = -ENOMEM;
+	if (!dring->desc)
 		goto err;
-	}
 
 	return 0;
 err:
 	netsec_free_dring(priv, id);
 
-	return ret;
+	return -ENOMEM;
 }
 
 static int netsec_setup_rx_dring(struct netsec_priv *priv)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
-	struct netsec_desc desc;
-	struct sk_buff *skb;
-	int n;
+	int i;
 
-	desc.len = priv->ndev->mtu + 22;
+	for (i = 0; i < DESC_NUM; i++) {
+		struct netsec_desc *desc = &dring->desc[i];
+		dma_addr_t dma_handle;
+		void *buf;
+		u16 len;
 
-	for (n = 0; n < DESC_NUM; n++) {
-		skb = netsec_alloc_skb(priv, &desc);
-		if (!skb) {
+		buf = netsec_alloc_rx_data(priv, &dma_handle, &len);
+		if (!buf) {
 			netsec_uninit_pkt_dring(priv, NETSEC_RING_RX);
-			return -ENOMEM;
+			goto err_out;
 		}
-		netsec_set_rx_de(priv, dring, n, &desc, skb);
+		desc->dma_addr = dma_handle;
+		desc->addr = buf;
+		desc->len = len;
 	}
 
+	netsec_rx_fill(priv, 0, DESC_NUM);
+
 	return 0;
+
+err_out:
+	return -ENOMEM;
 }
 
 static int netsec_netdev_load_ucode_region(struct netsec_priv *priv, u32 reg,