Message ID | 20231208005250.2910004-7-almasrymina@google.com |
---|---|
State | Superseded |
Headers | show |
Series | [net-next,v1,01/16] net: page_pool: factor out releasing DMA from releasing the page | expand |
On 12/7/23 5:52 PM, Mina Almasry wrote: > + > +static int netdev_restart_rx_queue(struct net_device *dev, int rxq_idx) > +{ > + void *new_mem; > + void *old_mem; > + int err; > + > + if (!dev || !dev->netdev_ops) > + return -EINVAL; > + > + if (!dev->netdev_ops->ndo_queue_stop || > + !dev->netdev_ops->ndo_queue_mem_free || > + !dev->netdev_ops->ndo_queue_mem_alloc || > + !dev->netdev_ops->ndo_queue_start) > + return -EOPNOTSUPP; > + > + new_mem = dev->netdev_ops->ndo_queue_mem_alloc(dev, rxq_idx); > + if (!new_mem) > + return -ENOMEM; > + > + err = dev->netdev_ops->ndo_queue_stop(dev, rxq_idx, &old_mem); > + if (err) > + goto err_free_new_mem; > + > + err = dev->netdev_ops->ndo_queue_start(dev, rxq_idx, new_mem); > + if (err) > + goto err_start_queue; > + > + dev->netdev_ops->ndo_queue_mem_free(dev, old_mem); > + > + return 0; > + > +err_start_queue: > + dev->netdev_ops->ndo_queue_start(dev, rxq_idx, old_mem); > + > +err_free_new_mem: > + dev->netdev_ops->ndo_queue_mem_free(dev, new_mem); > + > + return err; > +} > + > +/* Protected by rtnl_lock() */ > +static DEFINE_XARRAY_FLAGS(netdev_dmabuf_bindings, XA_FLAGS_ALLOC1); > + > +void netdev_unbind_dmabuf(struct netdev_dmabuf_binding *binding) > +{ > + struct netdev_rx_queue *rxq; > + unsigned long xa_idx; > + unsigned int rxq_idx; > + > + if (!binding) > + return; > + > + if (binding->list.next) > + list_del(&binding->list); > + > + xa_for_each(&binding->bound_rxq_list, xa_idx, rxq) { > + if (rxq->binding == binding) { > + /* We hold the rtnl_lock while binding/unbinding > + * dma-buf, so we can't race with another thread that > + * is also modifying this value. However, the driver > + * may read this config while it's creating its > + * rx-queues. WRITE_ONCE() here to match the > + * READ_ONCE() in the driver. > + */ > + WRITE_ONCE(rxq->binding, NULL); > + > + rxq_idx = get_netdev_rx_queue_index(rxq); > + > + netdev_restart_rx_queue(binding->dev, rxq_idx); Blindly restarting a queue when a dmabuf is unbound is heavy-handed. 
If the dmabuf has no outstanding references (ie., no references in the RxQ), then no restart is needed. > + } > + } > + > + xa_erase(&netdev_dmabuf_bindings, binding->id); > + > + netdev_dmabuf_binding_put(binding); > +} > + > +int netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, > + struct netdev_dmabuf_binding *binding) > +{ > + struct netdev_rx_queue *rxq; > + u32 xa_idx; > + int err; > + > + rxq = __netif_get_rx_queue(dev, rxq_idx); > + > + if (rxq->binding) > + return -EEXIST; > + > + err = xa_alloc(&binding->bound_rxq_list, &xa_idx, rxq, xa_limit_32b, > + GFP_KERNEL); > + if (err) > + return err; > + > + /* We hold the rtnl_lock while binding/unbinding dma-buf, so we can't > + * race with another thread that is also modifying this value. However, > + * the driver may read this config while it's creating its * rx-queues. > + * WRITE_ONCE() here to match the READ_ONCE() in the driver. > + */ > + WRITE_ONCE(rxq->binding, binding); > + > + err = netdev_restart_rx_queue(dev, rxq_idx); Similarly, here binding a dmabuf to a queue. I was expecting the dmabuf binding to add entries to the page pool for the queue. If the pool was previously empty, then maybe the queue needs to be "started" in the sense of creating with h/w or just pushing buffers into the queue and moving the pidx.
On 12/8/23 12:22 PM, Mina Almasry wrote: > On Fri, Dec 8, 2023 at 9:48 AM David Ahern <dsahern@kernel.org> wrote: >> >> On 12/7/23 5:52 PM, Mina Almasry wrote: > ... >>> + >>> + xa_for_each(&binding->bound_rxq_list, xa_idx, rxq) { >>> + if (rxq->binding == binding) { >>> + /* We hold the rtnl_lock while binding/unbinding >>> + * dma-buf, so we can't race with another thread that >>> + * is also modifying this value. However, the driver >>> + * may read this config while it's creating its >>> + * rx-queues. WRITE_ONCE() here to match the >>> + * READ_ONCE() in the driver. >>> + */ >>> + WRITE_ONCE(rxq->binding, NULL); >>> + >>> + rxq_idx = get_netdev_rx_queue_index(rxq); >>> + >>> + netdev_restart_rx_queue(binding->dev, rxq_idx); >> >> Blindly restarting a queue when a dmabuf is heavy handed. If the dmabuf >> has no outstanding references (ie., no references in the RxQ), then no >> restart is needed. >> > > I think I need to stop the queue while binding to a dmabuf for the > sake of concurrency, no? I.e. the softirq thread may be delivering a > packet, and in parallel a separate thread holds rtnl_lock and tries to > bind the dma-buf. At that point the page_pool recreation will race > with the driver doing page_pool_alloc_page(). I don't think I can > insert a lock to handle this into the rx fast path, no? I think it depends on the details of how entries are added and removed from the pool. I am behind on the pp details at this point, so I do need to do some homework. > > Also, this sounds like it requires (lots of) more changes. The > page_pool + driver need to report how many pending references there > are (with locking so we don't race with incoming packets), and have > them reported via an ndo so that we can skip restarting the queue. > Implementing the changes in to a huge issue but handling the > concurrency may be a genuine blocker. Not sure it's worth the upside > of not restarting the single rx queue? It has to do with the usability of this overall solution. 
As I mentioned most ML use cases can (and will want to) use many memory allocations for receiving packets - e.g., allocations per message and receiving multiple messages per socket connection. > >>> + } >>> + } >>> + >>> + xa_erase(&netdev_dmabuf_bindings, binding->id); >>> + >>> + netdev_dmabuf_binding_put(binding); >>> +} >>> + >>> +int netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, >>> + struct netdev_dmabuf_binding *binding) >>> +{ >>> + struct netdev_rx_queue *rxq; >>> + u32 xa_idx; >>> + int err; >>> + >>> + rxq = __netif_get_rx_queue(dev, rxq_idx); >>> + >>> + if (rxq->binding) >>> + return -EEXIST; >>> + >>> + err = xa_alloc(&binding->bound_rxq_list, &xa_idx, rxq, xa_limit_32b, >>> + GFP_KERNEL); >>> + if (err) >>> + return err; >>> + >>> + /* We hold the rtnl_lock while binding/unbinding dma-buf, so we can't >>> + * race with another thread that is also modifying this value. However, >>> + * the driver may read this config while it's creating its * rx-queues. >>> + * WRITE_ONCE() here to match the READ_ONCE() in the driver. >>> + */ >>> + WRITE_ONCE(rxq->binding, binding); >>> + >>> + err = netdev_restart_rx_queue(dev, rxq_idx); >> >> Similarly, here binding a dmabuf to a queue. I was expecting the dmabuf >> binding to add entries to the page pool for the queue. > > To be honest, I think maybe there's a slight disconnect between how > you think the page_pool works, and my primitive understanding of how > it works. Today, I see a 1:1 mapping between rx-queue and page_pool in > the code. I don't see 1:many or many:1 mappings. I am not referring to 1:N or N:1 for page pool and queues. I am referring to entries within a single page pool for a single Rx queue. > > In theory mapping 1 rx-queue to n page_pools is trivial: the driver > can call page_pool_create() multiple times to generate n queues and > decide for incoming packets which one to use. > > However, mapping n rx-queues to 1 page_pool seems like a can of worms. 
> I see code in the page_pool that looks to me (and Willem) like it's > safe only because the page_pool is used from the same napi context. > with a n rx-queueue: 1 page_pool mapping, that is no longer true, no? > There is a tail end of issues to resolve to be able to map 1 page_pool > to n queues as I understand and even if resolved I'm not sure the > maintainers are interested in taking the code. > > So, per my humble understanding there is no such thing as "add entries > to the page pool for the (specific) queue", the page_pool is always > used by 1 queue. > > Note that even though this limitation exists, we still support binding > 1 dma-buf to multiple queues, because multiple page pools can use the > same netdev_dmabuf_binding. I should add that to the docs. > >> If the pool was >> previously empty, then maybe the queue needs to be "started" in the >> sense of creating with h/w or just pushing buffers into the queue and >> moving the pidx. >> >> > > I don't think it's enough to add buffers to the page_pool, no? The > existing buffers in the page_pool (host mem) must be purged. I think > maybe the queue needs to be stopped as well so that we don't race with > incoming packets and end up with skbs with devmem and non-devmem frags > (unless you're thinking it becomes a requirement to support that, I > think things are complicated as-is and it's a good simplification). > When we already purge the existing buffers & restart the queue, it's > little effort to migrate this to become in line with Jakub's queue-api > that he also wants to use for per-queue configuration & ndo_stop/open. >
On Sat, Dec 9, 2023 at 3:29 PM David Ahern <dsahern@kernel.org> wrote: > > On 12/8/23 12:22 PM, Mina Almasry wrote: > > On Fri, Dec 8, 2023 at 9:48 AM David Ahern <dsahern@kernel.org> wrote: > >> > >> On 12/7/23 5:52 PM, Mina Almasry wrote: > > ... > >>> + > >>> + xa_for_each(&binding->bound_rxq_list, xa_idx, rxq) { > >>> + if (rxq->binding == binding) { > >>> + /* We hold the rtnl_lock while binding/unbinding > >>> + * dma-buf, so we can't race with another thread that > >>> + * is also modifying this value. However, the driver > >>> + * may read this config while it's creating its > >>> + * rx-queues. WRITE_ONCE() here to match the > >>> + * READ_ONCE() in the driver. > >>> + */ > >>> + WRITE_ONCE(rxq->binding, NULL); > >>> + > >>> + rxq_idx = get_netdev_rx_queue_index(rxq); > >>> + > >>> + netdev_restart_rx_queue(binding->dev, rxq_idx); > >> > >> Blindly restarting a queue when a dmabuf is heavy handed. If the dmabuf > >> has no outstanding references (ie., no references in the RxQ), then no > >> restart is needed. > >> > > > > I think I need to stop the queue while binding to a dmabuf for the > > sake of concurrency, no? I.e. the softirq thread may be delivering a > > packet, and in parallel a separate thread holds rtnl_lock and tries to > > bind the dma-buf. At that point the page_pool recreation will race > > with the driver doing page_pool_alloc_page(). I don't think I can > > insert a lock to handle this into the rx fast path, no? > > I think it depends on the details of how entries are added and removed > from the pool. I am behind on the pp details at this point, so I do need > to do some homework. > I think it also depends on the details of how to invalidate buffers posted to the rx queue of a particular driver. For GVE as far as I understand when the queue is started I believe it allocates a bunch of buffers and posts them to the rx queue. 
Then it processes the completion descriptors from the hardware and posts new buffers to replace the ones consumed, so any started queue would have posted buffers in it. As far as I know we also don't support invalidating posted buffers without first stopping the queue, replacing the buffers, and starting again. But I don't think these are limitations overly specific to GVE, I believe non-RDMA NICs work similarly? But I'd stress that what I'm proposing here should be extensible to capabilities of specific drivers. If one has a driver that allows them to invalidate posted buffers on the fly, I imagine they can extend the queue API to declare that support to the netstack in a generic way, and the net stack can invalidate buffers from the previous page pool and supply the new one. > > > > Also, this sounds like it requires (lots of) more changes. The > > page_pool + driver need to report how many pending references there > > are (with locking so we don't race with incoming packets), and have > > them reported via an ndo so that we can skip restarting the queue. > > Implementing the changes in to a huge issue but handling the > > concurrency may be a genuine blocker. Not sure it's worth the upside > > of not restarting the single rx queue? > > It has to do with the usability of this overall solution. As I mentioned > most ML use cases can (and will want to) use many memory allocations for > receiving packets - e.g., allocations per message and receiving multiple > messages per socket connection. > We support that by flow steering different flows to different RX queues. Our NICs don't support smart choosing of which page_pool to place the packet in (based on ntuple rule or what not). So flows that must land on a given dmabuf are flow steered to that dmabuf, and flows that need to land in host memory are not flow steered and are RSS'd to the non-dmabuf bound queues. This should also be extensible by folks that have NICs with the appropriate support. 
> > > >>> + } > >>> + } > >>> + > >>> + xa_erase(&netdev_dmabuf_bindings, binding->id); > >>> + > >>> + netdev_dmabuf_binding_put(binding); > >>> +} > >>> + > >>> +int netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, > >>> + struct netdev_dmabuf_binding *binding) > >>> +{ > >>> + struct netdev_rx_queue *rxq; > >>> + u32 xa_idx; > >>> + int err; > >>> + > >>> + rxq = __netif_get_rx_queue(dev, rxq_idx); > >>> + > >>> + if (rxq->binding) > >>> + return -EEXIST; > >>> + > >>> + err = xa_alloc(&binding->bound_rxq_list, &xa_idx, rxq, xa_limit_32b, > >>> + GFP_KERNEL); > >>> + if (err) > >>> + return err; > >>> + > >>> + /* We hold the rtnl_lock while binding/unbinding dma-buf, so we can't > >>> + * race with another thread that is also modifying this value. However, > >>> + * the driver may read this config while it's creating its * rx-queues. > >>> + * WRITE_ONCE() here to match the READ_ONCE() in the driver. > >>> + */ > >>> + WRITE_ONCE(rxq->binding, binding); > >>> + > >>> + err = netdev_restart_rx_queue(dev, rxq_idx); > >> > >> Similarly, here binding a dmabuf to a queue. I was expecting the dmabuf > >> binding to add entries to the page pool for the queue. > > > > To be honest, I think maybe there's a slight disconnect between how > > you think the page_pool works, and my primitive understanding of how > > it works. Today, I see a 1:1 mapping between rx-queue and page_pool in > > the code. I don't see 1:many or many:1 mappings. > > I am not referring to 1:N or N:1 for page pool and queues. I am > referring to entries within a single page pool for a single Rx queue. > > Thanks, glad to hear that. I was afraid there is a miscommunication here. > > > > In theory mapping 1 rx-queue to n page_pools is trivial: the driver > > can call page_pool_create() multiple times to generate n queues and > > decide for incoming packets which one to use. > > > > However, mapping n rx-queues to 1 page_pool seems like a can of worms. 
> > I see code in the page_pool that looks to me (and Willem) like it's > > safe only because the page_pool is used from the same napi context. > > with a n rx-queueue: 1 page_pool mapping, that is no longer true, no? > > There is a tail end of issues to resolve to be able to map 1 page_pool > > to n queues as I understand and even if resolved I'm not sure the > > maintainers are interested in taking the code. > > > > So, per my humble understanding there is no such thing as "add entries > > to the page pool for the (specific) queue", the page_pool is always > > used by 1 queue. > > > > Note that even though this limitation exists, we still support binding > > 1 dma-buf to multiple queues, because multiple page pools can use the > > same netdev_dmabuf_binding. I should add that to the docs. > > > >> If the pool was > >> previously empty, then maybe the queue needs to be "started" in the > >> sense of creating with h/w or just pushing buffers into the queue and > >> moving the pidx. > >> > >> > > > > I don't think it's enough to add buffers to the page_pool, no? The > > existing buffers in the page_pool (host mem) must be purged. I think > > maybe the queue needs to be stopped as well so that we don't race with > > incoming packets and end up with skbs with devmem and non-devmem frags > > (unless you're thinking it becomes a requirement to support that, I > > think things are complicated as-is and it's a good simplification). > > When we already purge the existing buffers & restart the queue, it's > > little effort to migrate this to become in line with Jakub's queue-api > > that he also wants to use for per-queue configuration & ndo_stop/open. > > >
diff --git a/include/net/devmem.h b/include/net/devmem.h new file mode 100644 index 000000000000..29ff125f9815 --- /dev/null +++ b/include/net/devmem.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Device memory TCP support + * + * Authors: Mina Almasry <almasrymina@google.com> + * Willem de Bruijn <willemb@google.com> + * Kaiyuan Zhang <kaiyuanz@google.com> + * + */ +#ifndef _NET_DEVMEM_H +#define _NET_DEVMEM_H + +struct netdev_dmabuf_binding { + struct dma_buf *dmabuf; + struct dma_buf_attachment *attachment; + struct sg_table *sgt; + struct net_device *dev; + struct gen_pool *chunk_pool; + + /* The user holds a ref (via the netlink API) for as long as they want + * the binding to remain alive. Each page pool using this binding holds + * a ref to keep the binding alive. Each allocated page_pool_iov holds a + * ref. + * + * The binding undos itself and unmaps the underlying dmabuf once all + * those refs are dropped and the binding is no longer desired or in + * use. + */ + refcount_t ref; + + /* The portid of the user that owns this binding. Used for netlink to + * notify us of the user dropping the bind. + */ + u32 owner_nlportid; + + /* The list of bindings currently active. Used for netlink to notify us + * of the user dropping the bind. + */ + struct list_head list; + + /* rxq's this binding is active on. */ + struct xarray bound_rxq_list; + + /* ID of this binding. Globally unique to all bindings currently + * active. 
+ */ + u32 id; +}; + +#ifdef CONFIG_DMA_SHARED_BUFFER +void __netdev_dmabuf_binding_free(struct netdev_dmabuf_binding *binding); +int netdev_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, + struct netdev_dmabuf_binding **out); +void netdev_unbind_dmabuf(struct netdev_dmabuf_binding *binding); +int netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct netdev_dmabuf_binding *binding); +#else +static inline void +__netdev_dmabuf_binding_free(struct netdev_dmabuf_binding *binding) +{ +} + +static inline int netdev_bind_dmabuf(struct net_device *dev, + unsigned int dmabuf_fd, + struct netdev_dmabuf_binding **out) +{ + return -EOPNOTSUPP; +} +static inline void netdev_unbind_dmabuf(struct netdev_dmabuf_binding *binding) +{ +} + +static inline int +netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct netdev_dmabuf_binding *binding) +{ + return -EOPNOTSUPP; +} +#endif + +static inline void +netdev_dmabuf_binding_get(struct netdev_dmabuf_binding *binding) +{ + refcount_inc(&binding->ref); +} + +static inline void +netdev_dmabuf_binding_put(struct netdev_dmabuf_binding *binding) +{ + if (!refcount_dec_and_test(&binding->ref)) + return; + + __netdev_dmabuf_binding_free(binding); +} + +#endif /* _NET_DEVMEM_H */ diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index aa1716fb0e53..5dc35628633a 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -25,6 +25,7 @@ struct netdev_rx_queue { * Readers and writers must hold RTNL */ struct napi_struct *napi; + struct netdev_dmabuf_binding *binding; } ____cacheline_aligned_in_smp; /* diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 0e9fa79a5ef1..44faee7a7b02 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -134,6 +134,33 @@ struct memory_provider_ops { bool (*release_page)(struct page_pool *pool, struct page *page); }; +/* page_pool_iov support */ + +/* Owner of the 
dma-buf chunks inserted into the gen pool. Each scatterlist + * entry from the dmabuf is inserted into the genpool as a chunk, and needs + * this owner struct to keep track of some metadata necessary to create + * allocations from this chunk. + */ +struct dmabuf_genpool_chunk_owner { + /* Offset into the dma-buf where this chunk starts. */ + unsigned long base_virtual; + + /* dma_addr of the start of the chunk. */ + dma_addr_t base_dma_addr; + + /* Array of page_pool_iovs for this chunk. */ + struct page_pool_iov *ppiovs; + size_t num_ppiovs; + + struct netdev_dmabuf_binding *binding; +}; + +struct page_pool_iov { + struct dmabuf_genpool_chunk_owner *owner; + + refcount_t refcount; +}; + struct page_pool { struct page_pool_params_fast p; diff --git a/net/core/dev.c b/net/core/dev.c index 0432b04cf9b0..b8c8be5a912e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -153,6 +153,10 @@ #include <linux/prandom.h> #include <linux/once_lite.h> #include <net/netdev_rx_queue.h> +#include <linux/genalloc.h> +#include <linux/dma-buf.h> +#include <net/page_pool/types.h> +#include <net/devmem.h> #include "dev.h" #include "net-sysfs.h" @@ -2041,6 +2045,278 @@ static int call_netdevice_notifiers_mtu(unsigned long val, return call_netdevice_notifiers_info(val, &info.info); } +/* Device memory support */ + +#ifdef CONFIG_DMA_SHARED_BUFFER +static void netdev_dmabuf_free_chunk_owner(struct gen_pool *genpool, + struct gen_pool_chunk *chunk, + void *not_used) +{ + struct dmabuf_genpool_chunk_owner *owner = chunk->owner; + + kvfree(owner->ppiovs); + kfree(owner); +} + +void __netdev_dmabuf_binding_free(struct netdev_dmabuf_binding *binding) +{ + size_t size, avail; + + gen_pool_for_each_chunk(binding->chunk_pool, + netdev_dmabuf_free_chunk_owner, NULL); + + size = gen_pool_size(binding->chunk_pool); + avail = gen_pool_avail(binding->chunk_pool); + + if (!WARN(size != avail, "can't destroy genpool. 
size=%lu, avail=%lu", + size, avail)) + gen_pool_destroy(binding->chunk_pool); + + dma_buf_unmap_attachment(binding->attachment, binding->sgt, + DMA_BIDIRECTIONAL); + dma_buf_detach(binding->dmabuf, binding->attachment); + dma_buf_put(binding->dmabuf); + xa_destroy(&binding->bound_rxq_list); + kfree(binding); +} + +static int netdev_restart_rx_queue(struct net_device *dev, int rxq_idx) +{ + void *new_mem; + void *old_mem; + int err; + + if (!dev || !dev->netdev_ops) + return -EINVAL; + + if (!dev->netdev_ops->ndo_queue_stop || + !dev->netdev_ops->ndo_queue_mem_free || + !dev->netdev_ops->ndo_queue_mem_alloc || + !dev->netdev_ops->ndo_queue_start) + return -EOPNOTSUPP; + + new_mem = dev->netdev_ops->ndo_queue_mem_alloc(dev, rxq_idx); + if (!new_mem) + return -ENOMEM; + + err = dev->netdev_ops->ndo_queue_stop(dev, rxq_idx, &old_mem); + if (err) + goto err_free_new_mem; + + err = dev->netdev_ops->ndo_queue_start(dev, rxq_idx, new_mem); + if (err) + goto err_start_queue; + + dev->netdev_ops->ndo_queue_mem_free(dev, old_mem); + + return 0; + +err_start_queue: + dev->netdev_ops->ndo_queue_start(dev, rxq_idx, old_mem); + +err_free_new_mem: + dev->netdev_ops->ndo_queue_mem_free(dev, new_mem); + + return err; +} + +/* Protected by rtnl_lock() */ +static DEFINE_XARRAY_FLAGS(netdev_dmabuf_bindings, XA_FLAGS_ALLOC1); + +void netdev_unbind_dmabuf(struct netdev_dmabuf_binding *binding) +{ + struct netdev_rx_queue *rxq; + unsigned long xa_idx; + unsigned int rxq_idx; + + if (!binding) + return; + + if (binding->list.next) + list_del(&binding->list); + + xa_for_each(&binding->bound_rxq_list, xa_idx, rxq) { + if (rxq->binding == binding) { + /* We hold the rtnl_lock while binding/unbinding + * dma-buf, so we can't race with another thread that + * is also modifying this value. However, the driver + * may read this config while it's creating its + * rx-queues. WRITE_ONCE() here to match the + * READ_ONCE() in the driver. 
+ */ + WRITE_ONCE(rxq->binding, NULL); + + rxq_idx = get_netdev_rx_queue_index(rxq); + + netdev_restart_rx_queue(binding->dev, rxq_idx); + } + } + + xa_erase(&netdev_dmabuf_bindings, binding->id); + + netdev_dmabuf_binding_put(binding); +} + +int netdev_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct netdev_dmabuf_binding *binding) +{ + struct netdev_rx_queue *rxq; + u32 xa_idx; + int err; + + rxq = __netif_get_rx_queue(dev, rxq_idx); + + if (rxq->binding) + return -EEXIST; + + err = xa_alloc(&binding->bound_rxq_list, &xa_idx, rxq, xa_limit_32b, + GFP_KERNEL); + if (err) + return err; + + /* We hold the rtnl_lock while binding/unbinding dma-buf, so we can't + * race with another thread that is also modifying this value. However, + * the driver may read this config while it's creating its * rx-queues. + * WRITE_ONCE() here to match the READ_ONCE() in the driver. + */ + WRITE_ONCE(rxq->binding, binding); + + err = netdev_restart_rx_queue(dev, rxq_idx); + if (err) + goto err_xa_erase; + + return 0; + +err_xa_erase: + xa_erase(&binding->bound_rxq_list, xa_idx); + WRITE_ONCE(rxq->binding, NULL); + + return err; +} + +int netdev_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, + struct netdev_dmabuf_binding **out) +{ + struct netdev_dmabuf_binding *binding; + static u32 id_alloc_next; + struct scatterlist *sg; + struct dma_buf *dmabuf; + unsigned int sg_idx, i; + unsigned long virtual; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR_OR_NULL(dmabuf)) + return -EBADFD; + + binding = kzalloc_node(sizeof(*binding), GFP_KERNEL, + dev_to_node(&dev->dev)); + if (!binding) { + err = -ENOMEM; + goto err_put_dmabuf; + } + binding->dev = dev; + + err = xa_alloc_cyclic(&netdev_dmabuf_bindings, &binding->id, binding, + xa_limit_32b, &id_alloc_next, GFP_KERNEL); + if (err < 0) + goto err_free_binding; + + xa_init_flags(&binding->bound_rxq_list, XA_FLAGS_ALLOC); + + refcount_set(&binding->ref, 
1); + + binding->dmabuf = dmabuf; + + binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); + if (IS_ERR(binding->attachment)) { + err = PTR_ERR(binding->attachment); + goto err_free_id; + } + + binding->sgt = dma_buf_map_attachment(binding->attachment, + DMA_BIDIRECTIONAL); + if (IS_ERR(binding->sgt)) { + err = PTR_ERR(binding->sgt); + goto err_detach; + } + + /* For simplicity we expect to make PAGE_SIZE allocations, but the + * binding can be much more flexible than that. We may be able to + * allocate MTU sized chunks here. Leave that for future work... + */ + binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + dev_to_node(&dev->dev)); + if (!binding->chunk_pool) { + err = -ENOMEM; + goto err_unmap; + } + + virtual = 0; + for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) { + dma_addr_t dma_addr = sg_dma_address(sg); + struct dmabuf_genpool_chunk_owner *owner; + size_t len = sg_dma_len(sg); + struct page_pool_iov *ppiov; + + owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, + dev_to_node(&dev->dev)); + owner->base_virtual = virtual; + owner->base_dma_addr = dma_addr; + owner->num_ppiovs = len / PAGE_SIZE; + owner->binding = binding; + + err = gen_pool_add_owner(binding->chunk_pool, dma_addr, + dma_addr, len, dev_to_node(&dev->dev), + owner); + if (err) { + err = -EINVAL; + goto err_free_chunks; + } + + owner->ppiovs = kvmalloc_array(owner->num_ppiovs, + sizeof(*owner->ppiovs), + GFP_KERNEL); + if (!owner->ppiovs) { + err = -ENOMEM; + goto err_free_chunks; + } + + for (i = 0; i < owner->num_ppiovs; i++) { + ppiov = &owner->ppiovs[i]; + ppiov->owner = owner; + refcount_set(&ppiov->refcount, 1); + } + + virtual += len; + } + + *out = binding; + + return 0; + +err_free_chunks: + gen_pool_for_each_chunk(binding->chunk_pool, + netdev_dmabuf_free_chunk_owner, NULL); + gen_pool_destroy(binding->chunk_pool); +err_unmap: + dma_buf_unmap_attachment(binding->attachment, binding->sgt, + DMA_BIDIRECTIONAL); +err_detach: + dma_buf_detach(dmabuf, 
binding->attachment); +err_free_id: + xa_erase(&netdev_dmabuf_bindings, binding->id); +err_free_binding: + kfree(binding); +err_put_dmabuf: + dma_buf_put(dmabuf); + return err; +} +#endif + #ifdef CONFIG_NET_INGRESS static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 0ed292d87ae0..b3323812d0b0 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -9,6 +9,7 @@ #include <net/xdp_sock.h> #include <net/netdev_rx_queue.h> #include <net/busy_poll.h> +#include <net/devmem.h> #include "netdev-genl-gen.h" #include "dev.h" @@ -469,10 +470,94 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -/* Stub */ +static LIST_HEAD(netdev_rbinding_list); + int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) { - return 0; + struct nlattr *tb[ARRAY_SIZE(netdev_queue_dmabuf_nl_policy)]; + struct netdev_dmabuf_binding *out_binding; + u32 ifindex, dmabuf_fd, rxq_idx; + struct net_device *netdev; + struct sk_buff *rsp; + struct nlattr *attr; + int rem, err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_BIND_DMABUF_DMABUF_FD) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_BIND_DMABUF_QUEUES)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_BIND_DMABUF_DMABUF_FD]); + + rtnl_lock(); + + netdev = __dev_get_by_index(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock; + } + + err = netdev_bind_dmabuf(netdev, dmabuf_fd, &out_binding); + if (err) + goto err_unlock; + + nla_for_each_attr(attr, genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + if (nla_type(attr) != NETDEV_A_BIND_DMABUF_QUEUES) + continue; + + err = nla_parse_nested(tb, + ARRAY_SIZE(netdev_queue_dmabuf_nl_policy) - 1, + attr, netdev_queue_dmabuf_nl_policy, + info->extack); + + if (err < 0) + goto 
err_unbind; + + rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_DMABUF_IDX]); + + if (rxq_idx >= netdev->num_rx_queues) { + err = -ERANGE; + goto err_unbind; + } + + err = netdev_bind_dmabuf_to_queue(netdev, rxq_idx, out_binding); + if (err) + goto err_unbind; + } + + out_binding->owner_nlportid = info->snd_portid; + list_add(&out_binding->list, &netdev_rbinding_list); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) { + err = -ENOMEM; + goto err_unbind; + } + + hdr = genlmsg_put(rsp, info->snd_portid, info->snd_seq, + &netdev_nl_family, 0, info->genlhdr->cmd); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + nla_put_u32(rsp, NETDEV_A_BIND_DMABUF_DMABUF_ID, out_binding->id); + genlmsg_end(rsp, hdr); + + rtnl_unlock(); + + return genlmsg_reply(rsp, info); + +err_genlmsg_free: + nlmsg_free(rsp); +err_unbind: + netdev_unbind_dmabuf(out_binding); +err_unlock: + rtnl_unlock(); + return err; } static int netdev_genl_netdevice_event(struct notifier_block *nb, @@ -495,10 +580,37 @@ static int netdev_genl_netdevice_event(struct notifier_block *nb, return NOTIFY_OK; } +static int netdev_netlink_notify(struct notifier_block *nb, unsigned long state, + void *_notify) +{ + struct netlink_notify *notify = _notify; + struct netdev_dmabuf_binding *rbinding; + + if (state != NETLINK_URELEASE || notify->protocol != NETLINK_GENERIC) + return NOTIFY_DONE; + + rtnl_lock(); + + list_for_each_entry(rbinding, &netdev_rbinding_list, list) { + if (rbinding->owner_nlportid == notify->portid) { + netdev_unbind_dmabuf(rbinding); + break; + } + } + + rtnl_unlock(); + + return NOTIFY_OK; +} + static struct notifier_block netdev_genl_nb = { .notifier_call = netdev_genl_netdevice_event, }; +static struct notifier_block netdev_netlink_notifier = { + .notifier_call = netdev_netlink_notify, +}; + static int __init netdev_genl_init(void) { int err; @@ -511,8 +623,14 @@ static int __init netdev_genl_init(void) if (err) goto err_unreg_ntf; + err = 
netlink_register_notifier(&netdev_netlink_notifier); + if (err) + goto err_unreg_family; + return 0; +err_unreg_family: + genl_unregister_family(&netdev_nl_family); err_unreg_ntf: unregister_netdevice_notifier(&netdev_genl_nb); return err;