diff mbox series

[net-next,v1,06/16] netdev: support binding dma-buf to netdevice

Message ID 20231208005250.2910004-7-almasrymina@google.com
State New
Headers show
Series Device Memory TCP | expand

Commit Message

Mina Almasry Dec. 8, 2023, 12:52 a.m. UTC
Add a netdev_dmabuf_binding struct which represents the
dma-buf-to-netdevice binding. The netlink API will bind the dma-buf to
rx queues on the netdevice. On the binding, the dma_buf_attach
& dma_buf_map_attachment will occur. The entries in the sg_table from
mapping will be inserted into a genpool to make it ready
for allocation.

The chunks in the genpool are owned by a dmabuf_chunk_owner struct which
holds the dma-buf offset of the base of the chunk and the dma_addr of
the chunk. Both are needed to use allocations that come from this chunk.

We create a new type that represents an allocation from the genpool:
page_pool_iov. We setup the page_pool_iov allocation size in the
genpool to PAGE_SIZE for simplicity: to match the PAGE_SIZE normally
allocated by the page pool and given to the drivers.

The user can unbind the dmabuf from the netdevice by closing the netlink
socket that established the binding. We do this so that the binding is
automatically unbound even if the userspace process crashes.

The binding and unbinding leaves an indicator in struct netdev_rx_queue
that the given queue is bound, but the binding doesn't take effect until
the driver actually reconfigures its queues, and re-initializes its page
pool.

The netdev_dmabuf_binding struct is refcounted, and releases its
resources only when all the refs are released.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>

---

v1:

- Introduce devmem.h instead of bloating netdevice.h (Jakub)
- ENOTSUPP -> EOPNOTSUPP (checkpatch.pl I think)
- Remove unneeded rcu protection for binding->list (rtnl protected)
- Removed extraneous err_binding_put: label.
- Removed dma_addr += len (Paolo).
- Don't override err on netdev_bind_dmabuf_to_queue failure.
- Rename devmem -> dmabuf (David).
- Add id to dmabuf binding (David/Stan).
- Fix missing xa_destroy bound_rq_list.
- Use queue api to reset bound RX queues (Jakub).
- Update netlink API for rx-queue type (tx/re) (Jakub).

RFC v3:
- Support multi rx-queue binding

---
 include/net/devmem.h          |  96 ++++++++++++
 include/net/netdev_rx_queue.h |   1 +
 include/net/page_pool/types.h |  27 ++++
 net/core/dev.c                | 276 ++++++++++++++++++++++++++++++++++
 net/core/netdev-genl.c        | 122 ++++++++++++++-
 5 files changed, 520 insertions(+), 2 deletions(-)
 create mode 100644 include/net/devmem.h

Comments

kernel test robot Dec. 8, 2023, 3:40 p.m. UTC | #1
Hi Mina,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Mina-Almasry/net-page_pool-factor-out-releasing-DMA-from-releasing-the-page/20231208-085531
base:   net-next/main
patch link:    https://lore.kernel.org/r/20231208005250.2910004-7-almasrymina%40google.com
patch subject: [net-next v1 06/16] netdev: support binding dma-buf to netdevice
config: m68k-randconfig-r071-20231208 (https://download.01.org/0day-ci/archive/20231208/202312082303.bCpeCR0q-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231208/202312082303.bCpeCR0q-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202312082303.bCpeCR0q-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from include/asm-generic/bug.h:22,
                    from arch/m68k/include/asm/bug.h:32,
                    from include/linux/bug.h:5,
                    from include/linux/thread_info.h:13,
                    from include/asm-generic/preempt.h:5,
                    from ./arch/m68k/include/generated/asm/preempt.h:1,
                    from include/linux/preempt.h:79,
                    from arch/m68k/include/asm/irqflags.h:6,
                    from include/linux/irqflags.h:17,
                    from arch/m68k/include/asm/atomic.h:6,
                    from include/linux/atomic.h:7,
                    from include/linux/rcupdate.h:25,
                    from include/linux/rculist.h:11,
                    from include/linux/pid.h:5,
                    from include/linux/sched.h:14,
                    from include/linux/uaccess.h:8,
                    from net/core/dev.c:71:
   net/core/dev.c: In function '__netdev_dmabuf_binding_free':
>> net/core/dev.c:2071:34: warning: format '%lu' expects argument of type 'long unsigned int', but argument 2 has type 'size_t' {aka 'unsigned int'} [-Wformat=]
    2071 |         if (!WARN(size != avail, "can't destroy genpool. size=%lu, avail=%lu",
         |                                  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    2072 |                   size, avail))
         |                   ~~~~            
         |                   |
         |                   size_t {aka unsigned int}
   include/linux/printk.h:427:25: note: in definition of macro 'printk_index_wrap'
     427 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                         ^~~~
   include/linux/printk.h:129:17: note: in expansion of macro 'printk'
     129 |                 printk(fmt, ##__VA_ARGS__);             \
         |                 ^~~~~~
   include/asm-generic/bug.h:176:9: note: in expansion of macro 'no_printk'
     176 |         no_printk(format);                                              \
         |         ^~~~~~~~~
   net/core/dev.c:2071:14: note: in expansion of macro 'WARN'
    2071 |         if (!WARN(size != avail, "can't destroy genpool. size=%lu, avail=%lu",
         |              ^~~~
   net/core/dev.c:2071:65: note: format string is defined here
    2071 |         if (!WARN(size != avail, "can't destroy genpool. size=%lu, avail=%lu",
         |                                                               ~~^
         |                                                                 |
         |                                                                 long unsigned int
         |                                                               %u
   net/core/dev.c:2071:34: warning: format '%lu' expects argument of type 'long unsigned int', but argument 3 has type 'size_t' {aka 'unsigned int'} [-Wformat=]
    2071 |         if (!WARN(size != avail, "can't destroy genpool. size=%lu, avail=%lu",
         |                                  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    2072 |                   size, avail))
         |                         ~~~~~     
         |                         |
         |                         size_t {aka unsigned int}
   include/linux/printk.h:427:25: note: in definition of macro 'printk_index_wrap'
     427 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                         ^~~~
   include/linux/printk.h:129:17: note: in expansion of macro 'printk'
     129 |                 printk(fmt, ##__VA_ARGS__);             \
         |                 ^~~~~~
   include/asm-generic/bug.h:176:9: note: in expansion of macro 'no_printk'
     176 |         no_printk(format);                                              \
         |         ^~~~~~~~~
   net/core/dev.c:2071:14: note: in expansion of macro 'WARN'
    2071 |         if (!WARN(size != avail, "can't destroy genpool. size=%lu, avail=%lu",
         |              ^~~~
   net/core/dev.c:2071:76: note: format string is defined here
    2071 |         if (!WARN(size != avail, "can't destroy genpool. size=%lu, avail=%lu",
         |                                                                          ~~^
         |                                                                            |
         |                                                                            long unsigned int
         |                                                                          %u


vim +2071 net/core/dev.c

  2060	
  2061	void __netdev_dmabuf_binding_free(struct netdev_dmabuf_binding *binding)
  2062	{
  2063		size_t size, avail;
  2064	
  2065		gen_pool_for_each_chunk(binding->chunk_pool,
  2066					netdev_dmabuf_free_chunk_owner, NULL);
  2067	
  2068		size = gen_pool_size(binding->chunk_pool);
  2069		avail = gen_pool_avail(binding->chunk_pool);
  2070	
> 2071		if (!WARN(size != avail, "can't destroy genpool. size=%lu, avail=%lu",
  2072			  size, avail))
  2073			gen_pool_destroy(binding->chunk_pool);
  2074	
  2075		dma_buf_unmap_attachment(binding->attachment, binding->sgt,
  2076					 DMA_BIDIRECTIONAL);
  2077		dma_buf_detach(binding->dmabuf, binding->attachment);
  2078		dma_buf_put(binding->dmabuf);
  2079		xa_destroy(&binding->bound_rxq_list);
  2080		kfree(binding);
  2081	}
  2082
Mina Almasry Dec. 8, 2023, 7:22 p.m. UTC | #2
On Fri, Dec 8, 2023 at 9:48 AM David Ahern <dsahern@kernel.org> wrote:
>
> On 12/7/23 5:52 PM, Mina Almasry wrote:
...
> > +
> > +     xa_for_each(&binding->bound_rxq_list, xa_idx, rxq) {
> > +             if (rxq->binding == binding) {
> > +                     /* We hold the rtnl_lock while binding/unbinding
> > +                      * dma-buf, so we can't race with another thread that
> > +                      * is also modifying this value. However, the driver
> > +                      * may read this config while it's creating its
> > +                      * rx-queues. WRITE_ONCE() here to match the
> > +                      * READ_ONCE() in the driver.
> > +                      */
> > +                     WRITE_ONCE(rxq->binding, NULL);
> > +
> > +                     rxq_idx = get_netdev_rx_queue_index(rxq);
> > +
> > +                     netdev_restart_rx_queue(binding->dev, rxq_idx);
>
> Blindly restarting a queue when a dmabuf is heavy handed. If the dmabuf
> has no outstanding references (ie., no references in the RxQ), then no
> restart is needed.
>

I think I need to stop the queue while binding to a dmabuf for the
sake of concurrency, no? I.e. the softirq thread may be delivering a
packet, and in parallel a separate thread holds rtnl_lock and tries to
bind the dma-buf. At that point the page_pool recreation will race
with the driver doing page_pool_alloc_page(). I don't think I can
insert a lock to handle this into the rx fast path, no?

Also, this sounds like it requires (lots of) more changes. The
page_pool + driver need to report how many pending references there
are (with locking so we don't race with incoming packets), and have
them reported via an ndo so that we can skip restarting the queue.
Implementing the changes in to a huge issue but handling the
concurrency may be a genuine blocker. Not sure it's worth the upside
of not restarting the single rx queue?

> > +             }
> > +     }
> > +
> > +     xa_erase(&netdev_dmabuf_bindings, binding->id);
> > +
> > +     netdev_dmabuf_binding_put(binding);
> > +}
> > +
> > +int netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
> > +                             struct netdev_dmabuf_binding *binding)
> > +{
> > +     struct netdev_rx_queue *rxq;
> > +     u32 xa_idx;
> > +     int err;
> > +
> > +     rxq = __netif_get_rx_queue(dev, rxq_idx);
> > +
> > +     if (rxq->binding)
> > +             return -EEXIST;
> > +
> > +     err = xa_alloc(&binding->bound_rxq_list, &xa_idx, rxq, xa_limit_32b,
> > +                    GFP_KERNEL);
> > +     if (err)
> > +             return err;
> > +
> > +     /* We hold the rtnl_lock while binding/unbinding dma-buf, so we can't
> > +      * race with another thread that is also modifying this value. However,
> > +      * the driver may read this config while it's creating its * rx-queues.
> > +      * WRITE_ONCE() here to match the READ_ONCE() in the driver.
> > +      */
> > +     WRITE_ONCE(rxq->binding, binding);
> > +
> > +     err = netdev_restart_rx_queue(dev, rxq_idx);
>
> Similarly, here binding a dmabuf to a queue. I was expecting the dmabuf
> binding to add entries to the page pool for the queue.

To be honest, I think maybe there's a slight disconnect between how
you think the page_pool works, and my primitive understanding of how
it works. Today, I see a 1:1 mapping between rx-queue and page_pool in
the code. I don't see 1:many or many:1 mappings.

In theory mapping 1 rx-queue to n page_pools is trivial: the driver
can call page_pool_create() multiple times to generate n queues and
decide for incoming packets which one to use.

However, mapping n rx-queues to 1 page_pool seems like a can of worms.
I see code in the page_pool that looks to me (and Willem) like it's
safe only because the page_pool is used from the same napi context.
with a n rx-queueue: 1 page_pool mapping, that is no longer true, no?
There is a tail end of issues to resolve to be able to map 1 page_pool
to n queues as I understand and even if resolved I'm not sure the
maintainers are interested in taking the code.

So, per my humble understanding there is no such thing as "add entries
to the page pool for the (specific) queue", the page_pool is always
used by 1 queue.

Note that even though this limitation exists, we still support binding
1 dma-buf to multiple queues, because multiple page pools can use the
same netdev_dmabuf_binding. I should add that to the docs.

> If the pool was
> previously empty, then maybe the queue needs to be "started" in the
> sense of creating with h/w or just pushing buffers into the queue and
> moving the pidx.
>
>

I don't think it's enough to add buffers to the page_pool, no? The
existing buffers in the page_pool (host mem) must be purged. I think
maybe the queue needs to be stopped as well so that we don't race with
incoming packets and end up with skbs with devmem and non-devmem frags
(unless you're thinking it becomes a requirement to support that, I
think things are complicated as-is and it's a good simplification).
When we already purge the existing buffers & restart the queue, it's
little effort to migrate this to become in line with Jakub's queue-api
that he also wants to use for per-queue configuration & ndo_stop/open.
David Ahern Dec. 9, 2023, 11:29 p.m. UTC | #3
On 12/8/23 12:22 PM, Mina Almasry wrote:
> On Fri, Dec 8, 2023 at 9:48 AM David Ahern <dsahern@kernel.org> wrote:
>>
>> On 12/7/23 5:52 PM, Mina Almasry wrote:
> ...
>>> +
>>> +     xa_for_each(&binding->bound_rxq_list, xa_idx, rxq) {
>>> +             if (rxq->binding == binding) {
>>> +                     /* We hold the rtnl_lock while binding/unbinding
>>> +                      * dma-buf, so we can't race with another thread that
>>> +                      * is also modifying this value. However, the driver
>>> +                      * may read this config while it's creating its
>>> +                      * rx-queues. WRITE_ONCE() here to match the
>>> +                      * READ_ONCE() in the driver.
>>> +                      */
>>> +                     WRITE_ONCE(rxq->binding, NULL);
>>> +
>>> +                     rxq_idx = get_netdev_rx_queue_index(rxq);
>>> +
>>> +                     netdev_restart_rx_queue(binding->dev, rxq_idx);
>>
>> Blindly restarting a queue when a dmabuf is heavy handed. If the dmabuf
>> has no outstanding references (ie., no references in the RxQ), then no
>> restart is needed.
>>
> 
> I think I need to stop the queue while binding to a dmabuf for the
> sake of concurrency, no? I.e. the softirq thread may be delivering a
> packet, and in parallel a separate thread holds rtnl_lock and tries to
> bind the dma-buf. At that point the page_pool recreation will race
> with the driver doing page_pool_alloc_page(). I don't think I can
> insert a lock to handle this into the rx fast path, no?

I think it depends on the details of how entries are added and removed
from the pool. I am behind on the pp details at this point, so I do need
to do some homework.

> 
> Also, this sounds like it requires (lots of) more changes. The
> page_pool + driver need to report how many pending references there
> are (with locking so we don't race with incoming packets), and have
> them reported via an ndo so that we can skip restarting the queue.
> Implementing the changes in to a huge issue but handling the
> concurrency may be a genuine blocker. Not sure it's worth the upside
> of not restarting the single rx queue?

It has to do with the usability of this overall solution. As I mentioned
most ML use cases can (and will want to) use many memory allocations for
receiving packets - e.g., allocations per message and receiving multiple
messages per socket connection.

> 
>>> +             }
>>> +     }
>>> +
>>> +     xa_erase(&netdev_dmabuf_bindings, binding->id);
>>> +
>>> +     netdev_dmabuf_binding_put(binding);
>>> +}
>>> +
>>> +int netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
>>> +                             struct netdev_dmabuf_binding *binding)
>>> +{
>>> +     struct netdev_rx_queue *rxq;
>>> +     u32 xa_idx;
>>> +     int err;
>>> +
>>> +     rxq = __netif_get_rx_queue(dev, rxq_idx);
>>> +
>>> +     if (rxq->binding)
>>> +             return -EEXIST;
>>> +
>>> +     err = xa_alloc(&binding->bound_rxq_list, &xa_idx, rxq, xa_limit_32b,
>>> +                    GFP_KERNEL);
>>> +     if (err)
>>> +             return err;
>>> +
>>> +     /* We hold the rtnl_lock while binding/unbinding dma-buf, so we can't
>>> +      * race with another thread that is also modifying this value. However,
>>> +      * the driver may read this config while it's creating its * rx-queues.
>>> +      * WRITE_ONCE() here to match the READ_ONCE() in the driver.
>>> +      */
>>> +     WRITE_ONCE(rxq->binding, binding);
>>> +
>>> +     err = netdev_restart_rx_queue(dev, rxq_idx);
>>
>> Similarly, here binding a dmabuf to a queue. I was expecting the dmabuf
>> binding to add entries to the page pool for the queue.
> 
> To be honest, I think maybe there's a slight disconnect between how
> you think the page_pool works, and my primitive understanding of how
> it works. Today, I see a 1:1 mapping between rx-queue and page_pool in
> the code. I don't see 1:many or many:1 mappings.

I am not referring to 1:N or N:1 for page pool and queues. I am
referring to entries within a single page pool for a single Rx queue.


> 
> In theory mapping 1 rx-queue to n page_pools is trivial: the driver
> can call page_pool_create() multiple times to generate n queues and
> decide for incoming packets which one to use.
> 
> However, mapping n rx-queues to 1 page_pool seems like a can of worms.
> I see code in the page_pool that looks to me (and Willem) like it's
> safe only because the page_pool is used from the same napi context.
> with a n rx-queueue: 1 page_pool mapping, that is no longer true, no?
> There is a tail end of issues to resolve to be able to map 1 page_pool
> to n queues as I understand and even if resolved I'm not sure the
> maintainers are interested in taking the code.
> 
> So, per my humble understanding there is no such thing as "add entries
> to the page pool for the (specific) queue", the page_pool is always
> used by 1 queue.
> 
> Note that even though this limitation exists, we still support binding
> 1 dma-buf to multiple queues, because multiple page pools can use the
> same netdev_dmabuf_binding. I should add that to the docs.
> 
>> If the pool was
>> previously empty, then maybe the queue needs to be "started" in the
>> sense of creating with h/w or just pushing buffers into the queue and
>> moving the pidx.
>>
>>
> 
> I don't think it's enough to add buffers to the page_pool, no? The
> existing buffers in the page_pool (host mem) must be purged. I think
> maybe the queue needs to be stopped as well so that we don't race with
> incoming packets and end up with skbs with devmem and non-devmem frags
> (unless you're thinking it becomes a requirement to support that, I
> think things are complicated as-is and it's a good simplification).
> When we already purge the existing buffers & restart the queue, it's
> little effort to migrate this to become in line with Jakub's queue-api
> that he also wants to use for per-queue configuration & ndo_stop/open.
>
Mina Almasry Dec. 11, 2023, 2:19 a.m. UTC | #4
On Sat, Dec 9, 2023 at 3:29 PM David Ahern <dsahern@kernel.org> wrote:
>
> On 12/8/23 12:22 PM, Mina Almasry wrote:
> > On Fri, Dec 8, 2023 at 9:48 AM David Ahern <dsahern@kernel.org> wrote:
> >>
> >> On 12/7/23 5:52 PM, Mina Almasry wrote:
> > ...
> >>> +
> >>> +     xa_for_each(&binding->bound_rxq_list, xa_idx, rxq) {
> >>> +             if (rxq->binding == binding) {
> >>> +                     /* We hold the rtnl_lock while binding/unbinding
> >>> +                      * dma-buf, so we can't race with another thread that
> >>> +                      * is also modifying this value. However, the driver
> >>> +                      * may read this config while it's creating its
> >>> +                      * rx-queues. WRITE_ONCE() here to match the
> >>> +                      * READ_ONCE() in the driver.
> >>> +                      */
> >>> +                     WRITE_ONCE(rxq->binding, NULL);
> >>> +
> >>> +                     rxq_idx = get_netdev_rx_queue_index(rxq);
> >>> +
> >>> +                     netdev_restart_rx_queue(binding->dev, rxq_idx);
> >>
> >> Blindly restarting a queue when a dmabuf is heavy handed. If the dmabuf
> >> has no outstanding references (ie., no references in the RxQ), then no
> >> restart is needed.
> >>
> >
> > I think I need to stop the queue while binding to a dmabuf for the
> > sake of concurrency, no? I.e. the softirq thread may be delivering a
> > packet, and in parallel a separate thread holds rtnl_lock and tries to
> > bind the dma-buf. At that point the page_pool recreation will race
> > with the driver doing page_pool_alloc_page(). I don't think I can
> > insert a lock to handle this into the rx fast path, no?
>
> I think it depends on the details of how entries are added and removed
> from the pool. I am behind on the pp details at this point, so I do need
> to do some homework.
>

I think it also depends on the details of how to invalidate buffers
posted to the rx queue of a particular driver. For GVE as far as I
understands when the queue is started I believe it allocates a bunch
of buffers and posts them to the rx queue. Then it processes the
completion descriptors from the hardware and posts new buffers to
replace the ones consumed, so any started queue would have postesd
buffers in it.

As far as I know we also don't support invalidating posted buffers
without first stopping the queue, replacing the buffers, and starting
again. But I don't think these are limitations overly specific to GVE,
I believe non-RDMA NICs work similarly?

But I'd stress that what I'm proposing here should be extensible to
capabilities of specific drivers. If one has a driver that allows them
to invalidate posted buffers on the fly, I imagine they can extend the
queue API to declare that support to the netstack in a genric way, and
the net stack can invalidate buffers from the previous page pool and
supply the new one.

> >
> > Also, this sounds like it requires (lots of) more changes. The
> > page_pool + driver need to report how many pending references there
> > are (with locking so we don't race with incoming packets), and have
> > them reported via an ndo so that we can skip restarting the queue.
> > Implementing the changes in to a huge issue but handling the
> > concurrency may be a genuine blocker. Not sure it's worth the upside
> > of not restarting the single rx queue?
>
> It has to do with the usability of this overall solution. As I mentioned
> most ML use cases can (and will want to) use many memory allocations for
> receiving packets - e.g., allocations per message and receiving multiple
> messages per socket connection.
>

We support that by flow steering different flows to different RX
queues. Our NICs don't support smart choosing of which page_pool to
place the packet in (based on ntuple rule or what not). So flows that
must land on a given dmabuf are flow steered to that dmabuf, and flows
that need to land host memory and not flow steered and are RSS'd to
the non-dmabuf bound queues. This should also be extensible by folks
that have NICs with the appropriate support.

> >
> >>> +             }
> >>> +     }
> >>> +
> >>> +     xa_erase(&netdev_dmabuf_bindings, binding->id);
> >>> +
> >>> +     netdev_dmabuf_binding_put(binding);
> >>> +}
> >>> +
> >>> +int netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
> >>> +                             struct netdev_dmabuf_binding *binding)
> >>> +{
> >>> +     struct netdev_rx_queue *rxq;
> >>> +     u32 xa_idx;
> >>> +     int err;
> >>> +
> >>> +     rxq = __netif_get_rx_queue(dev, rxq_idx);
> >>> +
> >>> +     if (rxq->binding)
> >>> +             return -EEXIST;
> >>> +
> >>> +     err = xa_alloc(&binding->bound_rxq_list, &xa_idx, rxq, xa_limit_32b,
> >>> +                    GFP_KERNEL);
> >>> +     if (err)
> >>> +             return err;
> >>> +
> >>> +     /* We hold the rtnl_lock while binding/unbinding dma-buf, so we can't
> >>> +      * race with another thread that is also modifying this value. However,
> >>> +      * the driver may read this config while it's creating its * rx-queues.
> >>> +      * WRITE_ONCE() here to match the READ_ONCE() in the driver.
> >>> +      */
> >>> +     WRITE_ONCE(rxq->binding, binding);
> >>> +
> >>> +     err = netdev_restart_rx_queue(dev, rxq_idx);
> >>
> >> Similarly, here binding a dmabuf to a queue. I was expecting the dmabuf
> >> binding to add entries to the page pool for the queue.
> >
> > To be honest, I think maybe there's a slight disconnect between how
> > you think the page_pool works, and my primitive understanding of how
> > it works. Today, I see a 1:1 mapping between rx-queue and page_pool in
> > the code. I don't see 1:many or many:1 mappings.
>
> I am not referring to 1:N or N:1 for page pool and queues. I am
> referring to entries within a single page pool for a single Rx queue.
>
>

Thanks, glad to hear that. I was afraid there is a miscommunication here.

> >
> > In theory mapping 1 rx-queue to n page_pools is trivial: the driver
> > can call page_pool_create() multiple times to generate n queues and
> > decide for incoming packets which one to use.
> >
> > However, mapping n rx-queues to 1 page_pool seems like a can of worms.
> > I see code in the page_pool that looks to me (and Willem) like it's
> > safe only because the page_pool is used from the same napi context.
> > with a n rx-queueue: 1 page_pool mapping, that is no longer true, no?
> > There is a tail end of issues to resolve to be able to map 1 page_pool
> > to n queues as I understand and even if resolved I'm not sure the
> > maintainers are interested in taking the code.
> >
> > So, per my humble understanding there is no such thing as "add entries
> > to the page pool for the (specific) queue", the page_pool is always
> > used by 1 queue.
> >
> > Note that even though this limitation exists, we still support binding
> > 1 dma-buf to multiple queues, because multiple page pools can use the
> > same netdev_dmabuf_binding. I should add that to the docs.
> >
> >> If the pool was
> >> previously empty, then maybe the queue needs to be "started" in the
> >> sense of creating with h/w or just pushing buffers into the queue and
> >> moving the pidx.
> >>
> >>
> >
> > I don't think it's enough to add buffers to the page_pool, no? The
> > existing buffers in the page_pool (host mem) must be purged. I think
> > maybe the queue needs to be stopped as well so that we don't race with
> > incoming packets and end up with skbs with devmem and non-devmem frags
> > (unless you're thinking it becomes a requirement to support that, I
> > think things are complicated as-is and it's a good simplification).
> > When we already purge the existing buffers & restart the queue, it's
> > little effort to migrate this to become in line with Jakub's queue-api
> > that he also wants to use for per-queue configuration & ndo_stop/open.
> >
>
diff mbox series

Patch

diff --git a/include/net/devmem.h b/include/net/devmem.h
new file mode 100644
index 000000000000..29ff125f9815
--- /dev/null
+++ b/include/net/devmem.h
@@ -0,0 +1,96 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Device memory TCP support
+ *
+ * Authors:	Mina Almasry <almasrymina@google.com>
+ *		Willem de Bruijn <willemb@google.com>
+ *		Kaiyuan Zhang <kaiyuanz@google.com>
+ *
+ */
+#ifndef _NET_DEVMEM_H
+#define _NET_DEVMEM_H
+
+struct netdev_dmabuf_binding {
+	struct dma_buf *dmabuf;
+	struct dma_buf_attachment *attachment;
+	struct sg_table *sgt;
+	struct net_device *dev;
+	struct gen_pool *chunk_pool;
+
+	/* The user holds a ref (via the netlink API) for as long as they want
+	 * the binding to remain alive. Each page pool using this binding holds
+	 * a ref to keep the binding alive. Each allocated page_pool_iov holds a
+	 * ref.
+	 *
+	 * The binding undos itself and unmaps the underlying dmabuf once all
+	 * those refs are dropped and the binding is no longer desired or in
+	 * use.
+	 */
+	refcount_t ref;
+
+	/* The portid of the user that owns this binding. Used for netlink to
+	 * notify us of the user dropping the bind.
+	 */
+	u32 owner_nlportid;
+
+	/* The list of bindings currently active. Used for netlink to notify us
+	 * of the user dropping the bind.
+	 */
+	struct list_head list;
+
+	/* rxq's this binding is active on. */
+	struct xarray bound_rxq_list;
+
+	/* ID of this binding. Globally unique to all bindings currently
+	 * active.
+	 */
+	u32 id;
+};
+
+#ifdef CONFIG_DMA_SHARED_BUFFER
+void __netdev_dmabuf_binding_free(struct netdev_dmabuf_binding *binding);
+int netdev_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+		       struct netdev_dmabuf_binding **out);
+void netdev_unbind_dmabuf(struct netdev_dmabuf_binding *binding);
+int netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+				struct netdev_dmabuf_binding *binding);
+#else
+static inline void
+__netdev_dmabuf_binding_free(struct netdev_dmabuf_binding *binding)
+{
+}
+
+static inline int netdev_bind_dmabuf(struct net_device *dev,
+				     unsigned int dmabuf_fd,
+				     struct netdev_dmabuf_binding **out)
+{
+	return -EOPNOTSUPP;
+}
+static inline void netdev_unbind_dmabuf(struct netdev_dmabuf_binding *binding)
+{
+}
+
+static inline int
+netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+			    struct netdev_dmabuf_binding *binding)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+static inline void
+netdev_dmabuf_binding_get(struct netdev_dmabuf_binding *binding)
+{
+	refcount_inc(&binding->ref);
+}
+
+static inline void
+netdev_dmabuf_binding_put(struct netdev_dmabuf_binding *binding)
+{
+	if (!refcount_dec_and_test(&binding->ref))
+		return;
+
+	__netdev_dmabuf_binding_free(binding);
+}
+
+#endif /* _NET_DEVMEM_H */
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index aa1716fb0e53..5dc35628633a 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -25,6 +25,7 @@  struct netdev_rx_queue {
 	 * Readers and writers must hold RTNL
 	 */
 	struct napi_struct		*napi;
+	struct netdev_dmabuf_binding *binding;
 } ____cacheline_aligned_in_smp;
 
 /*
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 0e9fa79a5ef1..44faee7a7b02 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -134,6 +134,33 @@  struct memory_provider_ops {
 	bool (*release_page)(struct page_pool *pool, struct page *page);
 };
 
+/* page_pool_iov support */
+
+/* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist
+ * entry from the dmabuf is inserted into the genpool as a chunk, and needs
+ * this owner struct to keep track of some metadata necessary to create
+ * allocations from this chunk.
+ */
+struct dmabuf_genpool_chunk_owner {
+	/* Offset into the dma-buf where this chunk starts.  */
+	unsigned long base_virtual;
+
+	/* dma_addr of the start of the chunk.  */
+	dma_addr_t base_dma_addr;
+
+	/* Array of page_pool_iovs for this chunk. */
+	struct page_pool_iov *ppiovs;
+	size_t num_ppiovs;
+
+	struct netdev_dmabuf_binding *binding;
+};
+
+struct page_pool_iov {
+	struct dmabuf_genpool_chunk_owner *owner;
+
+	refcount_t refcount;
+};
+
 struct page_pool {
 	struct page_pool_params_fast p;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 0432b04cf9b0..b8c8be5a912e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -153,6 +153,10 @@ 
 #include <linux/prandom.h>
 #include <linux/once_lite.h>
 #include <net/netdev_rx_queue.h>
+#include <linux/genalloc.h>
+#include <linux/dma-buf.h>
+#include <net/page_pool/types.h>
+#include <net/devmem.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
@@ -2041,6 +2045,278 @@  static int call_netdevice_notifiers_mtu(unsigned long val,
 	return call_netdevice_notifiers_info(val, &info.info);
 }
 
+/* Device memory support */
+
+#ifdef CONFIG_DMA_SHARED_BUFFER
+static void netdev_dmabuf_free_chunk_owner(struct gen_pool *genpool,
+					   struct gen_pool_chunk *chunk,
+					   void *not_used)
+{
+	struct dmabuf_genpool_chunk_owner *owner = chunk->owner;
+
+	kvfree(owner->ppiovs);
+	kfree(owner);
+}
+
+void __netdev_dmabuf_binding_free(struct netdev_dmabuf_binding *binding)
+{
+	size_t size, avail;
+
+	gen_pool_for_each_chunk(binding->chunk_pool,
+				netdev_dmabuf_free_chunk_owner, NULL);
+
+	size = gen_pool_size(binding->chunk_pool);
+	avail = gen_pool_avail(binding->chunk_pool);
+
+	if (!WARN(size != avail, "can't destroy genpool. size=%lu, avail=%lu",
+		  size, avail))
+		gen_pool_destroy(binding->chunk_pool);
+
+	dma_buf_unmap_attachment(binding->attachment, binding->sgt,
+				 DMA_BIDIRECTIONAL);
+	dma_buf_detach(binding->dmabuf, binding->attachment);
+	dma_buf_put(binding->dmabuf);
+	xa_destroy(&binding->bound_rxq_list);
+	kfree(binding);
+}
+
+static int netdev_restart_rx_queue(struct net_device *dev, int rxq_idx)
+{
+	void *new_mem;
+	void *old_mem;
+	int err;
+
+	if (!dev || !dev->netdev_ops)
+		return -EINVAL;
+
+	if (!dev->netdev_ops->ndo_queue_stop ||
+	    !dev->netdev_ops->ndo_queue_mem_free ||
+	    !dev->netdev_ops->ndo_queue_mem_alloc ||
+	    !dev->netdev_ops->ndo_queue_start)
+		return -EOPNOTSUPP;
+
+	new_mem = dev->netdev_ops->ndo_queue_mem_alloc(dev, rxq_idx);
+	if (!new_mem)
+		return -ENOMEM;
+
+	err = dev->netdev_ops->ndo_queue_stop(dev, rxq_idx, &old_mem);
+	if (err)
+		goto err_free_new_mem;
+
+	err = dev->netdev_ops->ndo_queue_start(dev, rxq_idx, new_mem);
+	if (err)
+		goto err_start_queue;
+
+	dev->netdev_ops->ndo_queue_mem_free(dev, old_mem);
+
+	return 0;
+
+err_start_queue:
+	dev->netdev_ops->ndo_queue_start(dev, rxq_idx, old_mem);
+
+err_free_new_mem:
+	dev->netdev_ops->ndo_queue_mem_free(dev, new_mem);
+
+	return err;
+}
+
+/* Protected by rtnl_lock() */
+static DEFINE_XARRAY_FLAGS(netdev_dmabuf_bindings, XA_FLAGS_ALLOC1);
+
+void netdev_unbind_dmabuf(struct netdev_dmabuf_binding *binding)
+{
+	struct netdev_rx_queue *rxq;
+	unsigned long xa_idx;
+	unsigned int rxq_idx;
+
+	if (!binding)
+		return;
+
+	if (binding->list.next)
+		list_del(&binding->list);
+
+	xa_for_each(&binding->bound_rxq_list, xa_idx, rxq) {
+		if (rxq->binding == binding) {
+			/* We hold the rtnl_lock while binding/unbinding
+			 * dma-buf, so we can't race with another thread that
+			 * is also modifying this value. However, the driver
+			 * may read this config while it's creating its
+			 * rx-queues. WRITE_ONCE() here to match the
+			 * READ_ONCE() in the driver.
+			 */
+			WRITE_ONCE(rxq->binding, NULL);
+
+			rxq_idx = get_netdev_rx_queue_index(rxq);
+
+			netdev_restart_rx_queue(binding->dev, rxq_idx);
+		}
+	}
+
+	xa_erase(&netdev_dmabuf_bindings, binding->id);
+
+	netdev_dmabuf_binding_put(binding);
+}
+
+int netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+				struct netdev_dmabuf_binding *binding)
+{
+	struct netdev_rx_queue *rxq;
+	u32 xa_idx;
+	int err;
+
+	rxq = __netif_get_rx_queue(dev, rxq_idx);
+
+	if (rxq->binding)
+		return -EEXIST;
+
+	err = xa_alloc(&binding->bound_rxq_list, &xa_idx, rxq, xa_limit_32b,
+		       GFP_KERNEL);
+	if (err)
+		return err;
+
+	/* We hold the rtnl_lock while binding/unbinding dma-buf, so we can't
+	 * race with another thread that is also modifying this value. However,
+	 * the driver may read this config while it's creating its * rx-queues.
+	 * WRITE_ONCE() here to match the READ_ONCE() in the driver.
+	 */
+	WRITE_ONCE(rxq->binding, binding);
+
+	err = netdev_restart_rx_queue(dev, rxq_idx);
+	if (err)
+		goto err_xa_erase;
+
+	return 0;
+
+err_xa_erase:
+	xa_erase(&binding->bound_rxq_list, xa_idx);
+	WRITE_ONCE(rxq->binding, NULL);
+
+	return err;
+}
+
+int netdev_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+		       struct netdev_dmabuf_binding **out)
+{
+	struct netdev_dmabuf_binding *binding;
+	static u32 id_alloc_next;
+	struct scatterlist *sg;
+	struct dma_buf *dmabuf;
+	unsigned int sg_idx, i;
+	unsigned long virtual;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	dmabuf = dma_buf_get(dmabuf_fd);
+	if (IS_ERR_OR_NULL(dmabuf))
+		return -EBADFD;
+
+	binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
+			       dev_to_node(&dev->dev));
+	if (!binding) {
+		err = -ENOMEM;
+		goto err_put_dmabuf;
+	}
+	binding->dev = dev;
+
+	err = xa_alloc_cyclic(&netdev_dmabuf_bindings, &binding->id, binding,
+			      xa_limit_32b, &id_alloc_next, GFP_KERNEL);
+	if (err < 0)
+		goto err_free_binding;
+
+	xa_init_flags(&binding->bound_rxq_list, XA_FLAGS_ALLOC);
+
+	refcount_set(&binding->ref, 1);
+
+	binding->dmabuf = dmabuf;
+
+	binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent);
+	if (IS_ERR(binding->attachment)) {
+		err = PTR_ERR(binding->attachment);
+		goto err_free_id;
+	}
+
+	binding->sgt = dma_buf_map_attachment(binding->attachment,
+					      DMA_BIDIRECTIONAL);
+	if (IS_ERR(binding->sgt)) {
+		err = PTR_ERR(binding->sgt);
+		goto err_detach;
+	}
+
+	/* For simplicity we expect to make PAGE_SIZE allocations, but the
+	 * binding can be much more flexible than that. We may be able to
+	 * allocate MTU sized chunks here. Leave that for future work...
+	 */
+	binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+					      dev_to_node(&dev->dev));
+	if (!binding->chunk_pool) {
+		err = -ENOMEM;
+		goto err_unmap;
+	}
+
+	virtual = 0;
+	for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) {
+		dma_addr_t dma_addr = sg_dma_address(sg);
+		struct dmabuf_genpool_chunk_owner *owner;
+		size_t len = sg_dma_len(sg);
+		struct page_pool_iov *ppiov;
+
+		owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
+				     dev_to_node(&dev->dev));
+		owner->base_virtual = virtual;
+		owner->base_dma_addr = dma_addr;
+		owner->num_ppiovs = len / PAGE_SIZE;
+		owner->binding = binding;
+
+		err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
+					 dma_addr, len, dev_to_node(&dev->dev),
+					 owner);
+		if (err) {
+			err = -EINVAL;
+			goto err_free_chunks;
+		}
+
+		owner->ppiovs = kvmalloc_array(owner->num_ppiovs,
+					       sizeof(*owner->ppiovs),
+					       GFP_KERNEL);
+		if (!owner->ppiovs) {
+			err = -ENOMEM;
+			goto err_free_chunks;
+		}
+
+		for (i = 0; i < owner->num_ppiovs; i++) {
+			ppiov = &owner->ppiovs[i];
+			ppiov->owner = owner;
+			refcount_set(&ppiov->refcount, 1);
+		}
+
+		virtual += len;
+	}
+
+	*out = binding;
+
+	return 0;
+
+err_free_chunks:
+	gen_pool_for_each_chunk(binding->chunk_pool,
+				netdev_dmabuf_free_chunk_owner, NULL);
+	gen_pool_destroy(binding->chunk_pool);
+err_unmap:
+	dma_buf_unmap_attachment(binding->attachment, binding->sgt,
+				 DMA_BIDIRECTIONAL);
+err_detach:
+	dma_buf_detach(dmabuf, binding->attachment);
+err_free_id:
+	xa_erase(&netdev_dmabuf_bindings, binding->id);
+err_free_binding:
+	kfree(binding);
+err_put_dmabuf:
+	dma_buf_put(dmabuf);
+	return err;
+}
+#endif
+
 #ifdef CONFIG_NET_INGRESS
 static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
 
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 0ed292d87ae0..b3323812d0b0 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -9,6 +9,7 @@ 
 #include <net/xdp_sock.h>
 #include <net/netdev_rx_queue.h>
 #include <net/busy_poll.h>
+#include <net/devmem.h>
 
 #include "netdev-genl-gen.h"
 #include "dev.h"
@@ -469,10 +470,94 @@  int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-/* Stub */
+static LIST_HEAD(netdev_rbinding_list);
+
 int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 {
-	return 0;
+	struct nlattr *tb[ARRAY_SIZE(netdev_queue_dmabuf_nl_policy)];
+	struct netdev_dmabuf_binding *out_binding;
+	u32 ifindex, dmabuf_fd, rxq_idx;
+	struct net_device *netdev;
+	struct sk_buff *rsp;
+	struct nlattr *attr;
+	int rem, err = 0;
+	void *hdr;
+
+	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_BIND_DMABUF_DMABUF_FD) ||
+	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_BIND_DMABUF_QUEUES))
+		return -EINVAL;
+
+	ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+	dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_BIND_DMABUF_DMABUF_FD]);
+
+	rtnl_lock();
+
+	netdev = __dev_get_by_index(genl_info_net(info), ifindex);
+	if (!netdev) {
+		err = -ENODEV;
+		goto err_unlock;
+	}
+
+	err = netdev_bind_dmabuf(netdev, dmabuf_fd, &out_binding);
+	if (err)
+		goto err_unlock;
+
+	nla_for_each_attr(attr, genlmsg_data(info->genlhdr),
+			  genlmsg_len(info->genlhdr), rem) {
+		if (nla_type(attr) != NETDEV_A_BIND_DMABUF_QUEUES)
+			continue;
+
+		err = nla_parse_nested(tb,
+				       ARRAY_SIZE(netdev_queue_dmabuf_nl_policy) - 1,
+				       attr, netdev_queue_dmabuf_nl_policy,
+				       info->extack);
+
+		if (err < 0)
+			goto err_unbind;
+
+		rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_DMABUF_IDX]);
+
+		if (rxq_idx >= netdev->num_rx_queues) {
+			err = -ERANGE;
+			goto err_unbind;
+		}
+
+		err = netdev_bind_dmabuf_to_queue(netdev, rxq_idx, out_binding);
+		if (err)
+			goto err_unbind;
+	}
+
+	out_binding->owner_nlportid = info->snd_portid;
+	list_add(&out_binding->list, &netdev_rbinding_list);
+
+	rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!rsp) {
+		err = -ENOMEM;
+		goto err_unbind;
+	}
+
+	hdr = genlmsg_put(rsp, info->snd_portid, info->snd_seq,
+			  &netdev_nl_family, 0, info->genlhdr->cmd);
+	if (!hdr) {
+		err = -EMSGSIZE;
+		goto err_genlmsg_free;
+	}
+
+	nla_put_u32(rsp, NETDEV_A_BIND_DMABUF_DMABUF_ID, out_binding->id);
+	genlmsg_end(rsp, hdr);
+
+	rtnl_unlock();
+
+	return genlmsg_reply(rsp, info);
+
+err_genlmsg_free:
+	nlmsg_free(rsp);
+err_unbind:
+	netdev_unbind_dmabuf(out_binding);
+err_unlock:
+	rtnl_unlock();
+	return err;
 }
 
 static int netdev_genl_netdevice_event(struct notifier_block *nb,
@@ -495,10 +580,37 @@  static int netdev_genl_netdevice_event(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+static int netdev_netlink_notify(struct notifier_block *nb, unsigned long state,
+				 void *_notify)
+{
+	struct netlink_notify *notify = _notify;
+	struct netdev_dmabuf_binding *rbinding;
+
+	if (state != NETLINK_URELEASE || notify->protocol != NETLINK_GENERIC)
+		return NOTIFY_DONE;
+
+	rtnl_lock();
+
+	list_for_each_entry(rbinding, &netdev_rbinding_list, list) {
+		if (rbinding->owner_nlportid == notify->portid) {
+			netdev_unbind_dmabuf(rbinding);
+			break;
+		}
+	}
+
+	rtnl_unlock();
+
+	return NOTIFY_OK;
+}
+
 static struct notifier_block netdev_genl_nb = {
 	.notifier_call	= netdev_genl_netdevice_event,
 };
 
+static struct notifier_block netdev_netlink_notifier = {
+	.notifier_call = netdev_netlink_notify,
+};
+
 static int __init netdev_genl_init(void)
 {
 	int err;
@@ -511,8 +623,14 @@  static int __init netdev_genl_init(void)
 	if (err)
 		goto err_unreg_ntf;
 
+	err = netlink_register_notifier(&netdev_netlink_notifier);
+	if (err)
+		goto err_unreg_family;
+
 	return 0;
 
+err_unreg_family:
+	genl_unregister_family(&netdev_nl_family);
 err_unreg_ntf:
 	unregister_netdevice_notifier(&netdev_genl_nb);
 	return err;