[RFCv1,2/4] v4l:vb2: add support for shared buffer (dma_buf)

Message ID 1325760118-27997-3-git-send-email-sumit.semwal@ti.com
State Superseded
Headers show

Commit Message

Sumit Semwal Jan. 5, 2012, 10:41 a.m.
This patch adds support for DMABUF memory type in videobuf2. It calls relevant
APIs of dma_buf for v4l reqbuf / qbuf / dqbuf operations.

For this version, the support is for videobuf2 as a user of the shared buffer;
so the allocation of the buffer is done outside of V4L2. [A sample allocator of
dma-buf shared buffer is given at [1]]

[1]: Rob Clark's DRM:
   https://github.com/robclark/kernel-omap4/commits/drmplane-dmabuf

Signed-off-by: Tomasz Stanislawski <t.stanislaws@samsung.com>
   [original work in the PoC for buffer sharing]
Signed-off-by: Sumit Semwal <sumit.semwal@ti.com>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
---
 drivers/media/video/videobuf2-core.c |  186 +++++++++++++++++++++++++++++++++-
 include/media/videobuf2-core.h       |   30 ++++++
 2 files changed, 215 insertions(+), 1 deletions(-)

Comments

Sakari Ailus Jan. 14, 2012, 8:38 p.m. | #1
Hi Sumit,

Thanks for the patch!

Sumit Semwal wrote:
...
> @@ -962,6 +1030,109 @@ static int __qbuf_mmap(struct vb2_buffer *vb, const struct v4l2_buffer *b)
>  }
>  
>  /**
> + * __qbuf_dmabuf() - handle qbuf of a DMABUF buffer
> + */
> +static int __qbuf_dmabuf(struct vb2_buffer *vb, const struct v4l2_buffer *b)
> +{
> +	struct v4l2_plane planes[VIDEO_MAX_PLANES];
> +	struct vb2_queue *q = vb->vb2_queue;
> +	void *mem_priv;
> +	unsigned int plane;
> +	int ret;
> +	int write = !V4L2_TYPE_IS_OUTPUT(q->type);
> +
> +	/* Verify and copy relevant information provided by the userspace */
> +	ret = __fill_vb2_buffer(vb, b, planes);
> +	if (ret)
> +		return ret;
> +
> +	for (plane = 0; plane < vb->num_planes; ++plane) {
> +		struct dma_buf *dbuf = dma_buf_get(planes[plane].m.fd);
> +
> +		if (IS_ERR_OR_NULL(dbuf)) {
> +			dprintk(1, "qbuf: invalid dmabuf fd for "
> +				"plane %d\n", plane);
> +			ret = PTR_ERR(dbuf);
> +			goto err;
> +		}
> +
> +		/* this doesn't get filled in until __fill_vb2_buffer(),
> +		 * since it isn't known until after dma_buf_get()..
> +		 */
> +		planes[plane].length = dbuf->size;
> +
> +		/* Skip the plane if already verified */
> +		if (dbuf == vb->planes[plane].dbuf) {
> +			dma_buf_put(dbuf);
> +			continue;
> +		}
> +
> +		dprintk(3, "qbuf: buffer description for plane %d changed, "
> +			"reattaching dma buf\n", plane);
> +
> +		/* Release previously acquired memory if present */
> +		if (vb->planes[plane].mem_priv) {
> +			call_memop(q, plane, detach_dmabuf,
> +				vb->planes[plane].mem_priv);
> +			dma_buf_put(vb->planes[plane].dbuf);
> +		}
> +
> +		vb->planes[plane].mem_priv = NULL;
> +
> +		/* Acquire each plane's memory */
> +		mem_priv = q->mem_ops->attach_dmabuf(
> +				q->alloc_ctx[plane], dbuf);
> +		if (IS_ERR(mem_priv)) {
> +			dprintk(1, "qbuf: failed acquiring dmabuf "
> +				"memory for plane %d\n", plane);
> +			ret = PTR_ERR(mem_priv);
> +			goto err;
> +		}
> +
> +		vb->planes[plane].dbuf = dbuf;
> +		vb->planes[plane].mem_priv = mem_priv;
> +	}
> +
> +	/* TODO: This pins the buffer(s) with  dma_buf_map_attachment()).. but
> +	 * really we want to do this just before the DMA, not while queueing
> +	 * the buffer(s)..
> +	 */
> +	for (plane = 0; plane < vb->num_planes; ++plane) {
> +		ret = q->mem_ops->map_dmabuf(
> +				vb->planes[plane].mem_priv, write);
> +		if (ret) {
> +			dprintk(1, "qbuf: failed mapping dmabuf "
> +				"memory for plane %d\n", plane);
> +			goto err;
> +		}
> +	}

Shouldn't the buffer mapping only be done at the first call to
__qbuf_dmabuf()? On latter calls, the cache would need to be handled
according to presence of V4L2_BUF_FLAG_NO_CACHE_CLEAN /
V4L2_BUF_FLAG_NO_CACHE_INVALIDATE in v4l2_buffer.

> +	/*
> +	 * Call driver-specific initialization on the newly acquired buffer,
> +	 * if provided.
> +	 */
> +	ret = call_qop(q, buf_init, vb);
> +	if (ret) {
> +		dprintk(1, "qbuf: buffer initialization failed\n");
> +		goto err;
> +	}
> +
> +	/*
> +	 * Now that everything is in order, copy relevant information
> +	 * provided by userspace.
> +	 */
> +	for (plane = 0; plane < vb->num_planes; ++plane)
> +		vb->v4l2_planes[plane] = planes[plane];
> +
> +	return 0;
> +err:
> +	/* In case of errors, release planes that were already acquired */
> +	__vb2_buf_dmabuf_put(vb);
> +
> +	return ret;
> +}
> +
> +/**
>   * __enqueue_in_driver() - enqueue a vb2_buffer in driver for processing
>   */
>  static void __enqueue_in_driver(struct vb2_buffer *vb)
> @@ -985,6 +1156,9 @@ static int __buf_prepare(struct vb2_buffer *vb, const struct v4l2_buffer *b)
>  	case V4L2_MEMORY_USERPTR:
>  		ret = __qbuf_userptr(vb, b);
>  		break;
> +	case V4L2_MEMORY_DMABUF:
> +		ret = __qbuf_dmabuf(vb, b);
> +		break;
>  	default:
>  		WARN(1, "Invalid queue type\n");
>  		ret = -EINVAL;
> @@ -1284,6 +1458,7 @@ int vb2_dqbuf(struct vb2_queue *q, struct v4l2_buffer *b, bool nonblocking)
>  {
>  	struct vb2_buffer *vb = NULL;
>  	int ret;
> +	unsigned int plane;
>  
>  	if (q->fileio) {
>  		dprintk(1, "dqbuf: file io in progress\n");
> @@ -1307,6 +1482,15 @@ int vb2_dqbuf(struct vb2_queue *q, struct v4l2_buffer *b, bool nonblocking)
>  		return ret;
>  	}
>  
> +	/* TODO: this unpins the buffer(dma_buf_unmap_attachment()).. but
> +	 * really we want tot do this just after DMA, not when the
> +	 * buffer is dequeued..
> +	 */
> +	if (q->memory == V4L2_MEMORY_DMABUF)
> +		for (plane = 0; plane < vb->num_planes; ++plane)
> +			call_memop(q, plane, unmap_dmabuf,
> +				vb->planes[plane].mem_priv);
> +

Same here, except reverse: this only should be done when the buffer is
destroyed --- either when the user explicitly calls reqbufs(0) or when
the file handle owning this buffer is being closed.

Mapping buffers at every prepare_buf and unmapping them in dqbuf is
prohibitively expensive. Same goes for many other APIs than V4L2, I think.

I wonder if the means to do this exists already.

I have to admit I haven't followed the dma_buf discussion closely so I
might be missing something relevant here.

Kind regards,
Sumit Semwal Jan. 16, 2012, 5:33 a.m. | #2
On Sun, Jan 15, 2012 at 2:08 AM, Sakari Ailus <sakari.ailus@iki.fi> wrote:

> Hi Sumit,
>
> Thanks for the patch!
>
Hi Sakari,

Thanks for reviewing this :)

>
> <snip>
> Shouldn't the buffer mapping only be done at the first call to
> __qbuf_dmabuf()? On latter calls, the cache would need to be handled
> according to presence of V4L2_BUF_FLAG_NO_CACHE_CLEAN /
> V4L2_BUF_FLAG_NO_CACHE_INVALIDATE in v4l2_buffer.
>
Well, the 'map / unmap' implementation is by design exporter-specific; so
the exporter of the buffer may choose to, depending on the use case,
'map-and-keep' on first call to map_dmabuf, and do actual unmap only at
'release' time. This will mean that the {map,unmap}_dmabuf calls will be
used mostly for 'access-bracketing' between multiple users, and not for
actual map/unmap each time.
Again, the framework is flexible enough to allow exporters to actually
map/unmap as required (think cases where backing-storage migration might be
needed while buffers are in use - in that case, when all current users have
called unmap_XXX() on a buffer, it can be migrated, and the next map_XXX()
calls could give different mappings back to the users).
The kernel 'users' of dma-buf [in case of this RFC, v4l2] should not
ideally need to worry about the actual mapping/unmapping that is done; the
buffer exporter in a particular use-case should be able to handle it.
<snip>

> Same here, except reverse: this only should be done when the buffer is
> destroyed --- either when the user explicitly calls reqbufs(0) or when
> the file handle owning this buffer is being closed.
>
> Mapping buffers at every prepare_buf and unmapping them in dqbuf is
> prohibitively expensive. Same goes for many other APIs than V4L2, I think.
>
> I wonder if the means to do this exists already.
>
> I have to admit I haven't followed the dma_buf discussion closely so I
> might be missing something relevant here.
>
Hope the above explanation helps. Please do not hesitate to contact if you
need more details.

>
> Kind regards,
>
> --
> Sakari Ailus
> sakari.ailus@iki.fi
>
Best regards,
~Sumit.
Sumit Semwal Jan. 16, 2012, 5:35 a.m. | #3
On Mon, Jan 16, 2012 at 11:03 AM, Semwal, Sumit <sumit.semwal@ti.com> wrote:
>
> On Sun, Jan 15, 2012 at 2:08 AM, Sakari Ailus <sakari.ailus@iki.fi> wrote:
>>
>> Hi Sumit,
>>
>> Thanks for the patch!
>
Hi Sakari,

Thanks for reviewing this :)
>>
>>
>> <snip>
>> Shouldn't the buffer mapping only be done at the first call to
>> __qbuf_dmabuf()? On latter calls, the cache would need to be handled
>> according to presence of V4L2_BUF_FLAG_NO_CACHE_CLEAN /
>> V4L2_BUF_FLAG_NO_CACHE_INVALIDATE in v4l2_buffer.
>
Well, the 'map / unmap' implementation is by design exporter-specific;
so the exporter of the buffer may choose to, depending on the use
case, 'map-and-keep' on first call to map_dmabuf, and do actual unmap
only at 'release' time. This will mean that the {map,unmap}_dmabuf
calls will be used mostly for 'access-bracketing' between multiple
users, and not for actual map/unmap each time.
Again, the framework is flexible enough to allow exporters to actually
map/unmap as required (think cases where backing-storage migration
might be needed while buffers are in use - in that case, when all
current users have called unmap_XXX() on a buffer, it can be migrated,
and the next map_XXX() calls could give different mappings back to the
users).
> The kernel 'users' of dma-buf [in case of this RFC, v4l2] should not ideally need to worry about the actual mapping/unmapping that is done; the buffer exporter in a particular use-case should be able to handle it.
> <snip>
>>
>> Same here, except reverse: this only should be done when the buffer is
>> destroyed --- either when the user explicitly calls reqbufs(0) or when
>> the file handle owning this buffer is being closed.
>>
>> Mapping buffers at every prepare_buf and unmapping them in dqbuf is
>> prohibitively expensive. Same goes for many other APIs than V4L2, I think.
>>
>> I wonder if the means to do this exists already.
>>
>> I have to admit I haven't followed the dma_buf discussion closely so I
>> might be missing something relevant here.
>
Hope the above explanation helps. Please do not hesitate to contact if
you need more details.
>>
>>
>> Kind regards,
>>
>> --
>> Sakari Ailus
>> sakari.ailus@iki.fi

Best regards,
~Sumit.
Pawel Osciak Jan. 19, 2012, 7:07 p.m. | #4
Hi Sumit,
Thank you for your work. Please find my comments below.

On Thu, Jan 5, 2012 at 2:41 AM, Sumit Semwal <sumit.semwal@ti.com> wrote:
> This patch adds support for DMABUF memory type in videobuf2. It calls relevant
> APIs of dma_buf for v4l reqbuf / qbuf / dqbuf operations.
>
> For this version, the support is for videobuf2 as a user of the shared buffer;
> so the allocation of the buffer is done outside of V4L2. [A sample allocator of
> dma-buf shared buffer is given at [1]]
>
> [1]: Rob Clark's DRM:
>   https://github.com/robclark/kernel-omap4/commits/drmplane-dmabuf
>
> Signed-off-by: Tomasz Stanislawski <t.stanislaws@samsung.com>
>   [original work in the PoC for buffer sharing]
> Signed-off-by: Sumit Semwal <sumit.semwal@ti.com>
> Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
> ---
>  drivers/media/video/videobuf2-core.c |  186 +++++++++++++++++++++++++++++++++-
>  include/media/videobuf2-core.h       |   30 ++++++
>  2 files changed, 215 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/media/video/videobuf2-core.c b/drivers/media/video/videobuf2-core.c
> index 95a3f5e..6cd2f97 100644
> --- a/drivers/media/video/videobuf2-core.c
> +++ b/drivers/media/video/videobuf2-core.c
> @@ -107,6 +107,27 @@ static void __vb2_buf_userptr_put(struct vb2_buffer *vb)
>  }
>
>  /**
> + * __vb2_buf_dmabuf_put() - release memory associated with
> + * a DMABUF shared buffer
> + */
> +static void __vb2_buf_dmabuf_put(struct vb2_buffer *vb)
> +{
> +       struct vb2_queue *q = vb->vb2_queue;
> +       unsigned int plane;
> +
> +       for (plane = 0; plane < vb->num_planes; ++plane) {
> +               void *mem_priv = vb->planes[plane].mem_priv;
> +
> +               if (mem_priv) {
> +                       call_memop(q, plane, detach_dmabuf, mem_priv);
> +                       dma_buf_put(vb->planes[plane].dbuf);
> +                       vb->planes[plane].dbuf = NULL;
> +                       vb->planes[plane].mem_priv = NULL;
> +               }
> +       }
> +}
> +
> +/**
>  * __setup_offsets() - setup unique offsets ("cookies") for every plane in
>  * every buffer on the queue
>  */
> @@ -228,6 +249,8 @@ static void __vb2_free_mem(struct vb2_queue *q, unsigned int buffers)
>                /* Free MMAP buffers or release USERPTR buffers */
>                if (q->memory == V4L2_MEMORY_MMAP)
>                        __vb2_buf_mem_free(vb);
> +               if (q->memory == V4L2_MEMORY_DMABUF)
> +                       __vb2_buf_dmabuf_put(vb);
>                else
>                        __vb2_buf_userptr_put(vb);

This looks like a bug. If memory is MMAP, you'd __vb2_buf_mem_free(vb)
AND __vb2_buf_userptr_put(vb), which is wrong. Have you tested MMAP
and USERPTR with those patches applied?

>        }
> @@ -350,6 +373,13 @@ static int __fill_v4l2_buffer(struct vb2_buffer *vb, struct v4l2_buffer *b)
>                 */
>                memcpy(b->m.planes, vb->v4l2_planes,
>                        b->length * sizeof(struct v4l2_plane));
> +
> +               if (q->memory == V4L2_MEMORY_DMABUF) {
> +                       unsigned int plane;
> +                       for (plane = 0; plane < vb->num_planes; ++plane) {
> +                               b->m.planes[plane].m.fd = 0;

I'm confused here. Isn't this the way to return fd for userspace to
pass to other drivers? I was imagining that the userspace would be
getting an fd back in plane structure to pass to other drivers, i.e.
userspace dequeuing a DMABUF v4l2_buffer should be able to pass it
forward to another driver using fd found in dequeued buffer.
Shouldn't this also fill in length?

> +                       }
> +               }
>        } else {
>                /*
>                 * We use length and offset in v4l2_planes array even for
> @@ -361,6 +391,8 @@ static int __fill_v4l2_buffer(struct vb2_buffer *vb, struct v4l2_buffer *b)
>                        b->m.offset = vb->v4l2_planes[0].m.mem_offset;
>                else if (q->memory == V4L2_MEMORY_USERPTR)
>                        b->m.userptr = vb->v4l2_planes[0].m.userptr;
> +               else if (q->memory == V4L2_MEMORY_DMABUF)
> +                       b->m.fd = 0;
>        }
>

Same here...

>        /*
> @@ -452,6 +484,21 @@ static int __verify_mmap_ops(struct vb2_queue *q)
>  }
>
>  /**
> + * __verify_dmabuf_ops() - verify that all memory operations required for
> + * DMABUF queue type have been provided
> + */
> +static int __verify_dmabuf_ops(struct vb2_queue *q)
> +{
> +       if (!(q->io_modes & VB2_DMABUF) || !q->mem_ops->attach_dmabuf
> +                       || !q->mem_ops->detach_dmabuf
> +                       || !q->mem_ops->map_dmabuf
> +                       || !q->mem_ops->unmap_dmabuf)
> +               return -EINVAL;
> +
> +       return 0;
> +}
> +
> +/**
>  * vb2_reqbufs() - Initiate streaming
>  * @q:         videobuf2 queue
>  * @req:       struct passed from userspace to vidioc_reqbufs handler in driver
> @@ -485,6 +532,7 @@ int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req)
>        }
>
>        if (req->memory != V4L2_MEMORY_MMAP
> +                       && req->memory != V4L2_MEMORY_DMABUF
>                        && req->memory != V4L2_MEMORY_USERPTR) {
>                dprintk(1, "reqbufs: unsupported memory type\n");
>                return -EINVAL;
> @@ -514,6 +562,11 @@ int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req)
>                return -EINVAL;
>        }
>
> +       if (req->memory == V4L2_MEMORY_DMABUF && __verify_dmabuf_ops(q)) {
> +               dprintk(1, "reqbufs: DMABUF for current setup unsupported\n");
> +               return -EINVAL;
> +       }
> +
>        if (req->count == 0 || q->num_buffers != 0 || q->memory != req->memory) {
>                /*
>                 * We already have buffers allocated, so first check if they
> @@ -621,7 +674,8 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create)
>        }
>
>        if (create->memory != V4L2_MEMORY_MMAP
> -                       && create->memory != V4L2_MEMORY_USERPTR) {
> +                       && create->memory != V4L2_MEMORY_USERPTR
> +                       && create->memory != V4L2_MEMORY_DMABUF) {
>                dprintk(1, "%s(): unsupported memory type\n", __func__);
>                return -EINVAL;
>        }
> @@ -645,6 +699,11 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create)
>                return -EINVAL;
>        }
>
> +       if (create->memory == V4L2_MEMORY_DMABUF && __verify_dmabuf_ops(q)) {
> +               dprintk(1, "%s(): DMABUF for current setup unsupported\n", __func__);
> +               return -EINVAL;
> +       }
> +
>        if (q->num_buffers == VIDEO_MAX_FRAME) {
>                dprintk(1, "%s(): maximum number of buffers already allocated\n",
>                        __func__);
> @@ -840,6 +899,11 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb, const struct v4l2_buffer *b,
>                                        b->m.planes[plane].length;
>                        }
>                }
> +               if (b->memory == V4L2_MEMORY_DMABUF) {
> +                       for (plane = 0; plane < vb->num_planes; ++plane) {
> +                               v4l2_planes[plane].m.fd = b->m.planes[plane].m.fd;

Shouldn't this fill length too?

> +                       }
> +               }
>        } else {
>                /*
>                 * Single-planar buffers do not use planes array,
> @@ -854,6 +918,10 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb, const struct v4l2_buffer *b,
>                        v4l2_planes[0].m.userptr = b->m.userptr;
>                        v4l2_planes[0].length = b->length;
>                }
> +               if (b->memory == V4L2_MEMORY_DMABUF) {
> +                       v4l2_planes[0].m.fd = b->m.fd;

Ditto.

> +               }
> +
>        }
>
>        vb->v4l2_buf.field = b->field;
> @@ -962,6 +1030,109 @@ static int __qbuf_mmap(struct vb2_buffer *vb, const struct v4l2_buffer *b)
>  }
>
>  /**
> + * __qbuf_dmabuf() - handle qbuf of a DMABUF buffer
> + */
> +static int __qbuf_dmabuf(struct vb2_buffer *vb, const struct v4l2_buffer *b)
> +{
> +       struct v4l2_plane planes[VIDEO_MAX_PLANES];
> +       struct vb2_queue *q = vb->vb2_queue;
> +       void *mem_priv;
> +       unsigned int plane;
> +       int ret;
> +       int write = !V4L2_TYPE_IS_OUTPUT(q->type);
> +
> +       /* Verify and copy relevant information provided by the userspace */
> +       ret = __fill_vb2_buffer(vb, b, planes);
> +       if (ret)
> +               return ret;
> +
> +       for (plane = 0; plane < vb->num_planes; ++plane) {
> +               struct dma_buf *dbuf = dma_buf_get(planes[plane].m.fd);
> +
> +               if (IS_ERR_OR_NULL(dbuf)) {
> +                       dprintk(1, "qbuf: invalid dmabuf fd for "
> +                               "plane %d\n", plane);
> +                       ret = PTR_ERR(dbuf);
> +                       goto err;
> +               }
> +
> +               /* this doesn't get filled in until __fill_vb2_buffer(),
> +                * since it isn't known until after dma_buf_get()..
> +                */
> +               planes[plane].length = dbuf->size;

But this is after dma_buf_get, unless I'm missing something... And
__fill_vb2_buffer() is not filing length...

> +
> +               /* Skip the plane if already verified */
> +               if (dbuf == vb->planes[plane].dbuf) {
> +                       dma_buf_put(dbuf);
> +                       continue;
> +               }

Won't this prevent us from using a buffer if the exporter only allows
exclusive access to it?

> +
> +               dprintk(3, "qbuf: buffer description for plane %d changed, "

s/description/descriptor ?

> +                       "reattaching dma buf\n", plane);
> +
> +               /* Release previously acquired memory if present */
> +               if (vb->planes[plane].mem_priv) {
> +                       call_memop(q, plane, detach_dmabuf,
> +                               vb->planes[plane].mem_priv);
> +                       dma_buf_put(vb->planes[plane].dbuf);
> +               }
> +
> +               vb->planes[plane].mem_priv = NULL;
> +
> +               /* Acquire each plane's memory */
> +               mem_priv = q->mem_ops->attach_dmabuf(
> +                               q->alloc_ctx[plane], dbuf);
> +               if (IS_ERR(mem_priv)) {
> +                       dprintk(1, "qbuf: failed acquiring dmabuf "
> +                               "memory for plane %d\n", plane);
> +                       ret = PTR_ERR(mem_priv);
> +                       goto err;

Since mem_priv is not assigned back to plane's mem_priv if an error
happens here, we won't be calling dma_buf_put on this dbuf, even
though we called _get() above.

> +               }
> +
> +               vb->planes[plane].dbuf = dbuf;
> +               vb->planes[plane].mem_priv = mem_priv;
> +       }
> +
> +       /* TODO: This pins the buffer(s) with  dma_buf_map_attachment()).. but
> +        * really we want to do this just before the DMA, not while queueing
> +        * the buffer(s)..
> +        */
> +       for (plane = 0; plane < vb->num_planes; ++plane) {
> +               ret = q->mem_ops->map_dmabuf(
> +                               vb->planes[plane].mem_priv, write);
> +               if (ret) {
> +                       dprintk(1, "qbuf: failed mapping dmabuf "
> +                               "memory for plane %d\n", plane);
> +                       goto err;
> +               }
> +       }
> +
> +       /*
> +        * Call driver-specific initialization on the newly acquired buffer,
> +        * if provided.
> +        */
> +       ret = call_qop(q, buf_init, vb);
> +       if (ret) {
> +               dprintk(1, "qbuf: buffer initialization failed\n");
> +               goto err;
> +       }
> +
> +       /*
> +        * Now that everything is in order, copy relevant information
> +        * provided by userspace.
> +        */
> +       for (plane = 0; plane < vb->num_planes; ++plane)
> +               vb->v4l2_planes[plane] = planes[plane];
> +
> +       return 0;
> +err:
> +       /* In case of errors, release planes that were already acquired */
> +       __vb2_buf_dmabuf_put(vb);
> +
> +       return ret;
> +}
> +
> +/**
>  * __enqueue_in_driver() - enqueue a vb2_buffer in driver for processing
>  */
>  static void __enqueue_in_driver(struct vb2_buffer *vb)
> @@ -985,6 +1156,9 @@ static int __buf_prepare(struct vb2_buffer *vb, const struct v4l2_buffer *b)
>        case V4L2_MEMORY_USERPTR:
>                ret = __qbuf_userptr(vb, b);
>                break;
> +       case V4L2_MEMORY_DMABUF:
> +               ret = __qbuf_dmabuf(vb, b);
> +               break;
>        default:
>                WARN(1, "Invalid queue type\n");
>                ret = -EINVAL;
> @@ -1284,6 +1458,7 @@ int vb2_dqbuf(struct vb2_queue *q, struct v4l2_buffer *b, bool nonblocking)
>  {
>        struct vb2_buffer *vb = NULL;
>        int ret;
> +       unsigned int plane;
>
>        if (q->fileio) {
>                dprintk(1, "dqbuf: file io in progress\n");
> @@ -1307,6 +1482,15 @@ int vb2_dqbuf(struct vb2_queue *q, struct v4l2_buffer *b, bool nonblocking)
>                return ret;
>        }
>
> +       /* TODO: this unpins the buffer(dma_buf_unmap_attachment()).. but
> +        * really we want tot do this just after DMA, not when the
> +        * buffer is dequeued..
> +        */
> +       if (q->memory == V4L2_MEMORY_DMABUF)
> +               for (plane = 0; plane < vb->num_planes; ++plane)
> +                       call_memop(q, plane, unmap_dmabuf,
> +                               vb->planes[plane].mem_priv);
> +
>        switch (vb->state) {
>        case VB2_BUF_STATE_DONE:
>                dprintk(3, "dqbuf: Returning done buffer\n");
> diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
> index a15d1f1..5c1836d 100644
> --- a/include/media/videobuf2-core.h
> +++ b/include/media/videobuf2-core.h
> @@ -16,6 +16,7 @@
>  #include <linux/mutex.h>
>  #include <linux/poll.h>
>  #include <linux/videodev2.h>
> +#include <linux/dma-buf.h>
>
>  struct vb2_alloc_ctx;
>  struct vb2_fileio_data;
> @@ -41,6 +42,20 @@ struct vb2_fileio_data;
>  *              argument to other ops in this structure
>  * @put_userptr: inform the allocator that a USERPTR buffer will no longer
>  *              be used
> + * @attach_dmabuf: attach a shared struct dma_buf for a hardware operation;
> + *                used for DMABUF memory types; alloc_ctx is the alloc context
> + *                dbuf is the shared dma_buf; returns NULL on failure;
> + *                allocator private per-buffer structure on success;
> + *                this needs to be used for further accesses to the buffer
> + * @detach_dmabuf: inform the exporter of the buffer that the current DMABUF
> + *                buffer is no longer used; the buf_priv argument is the
> + *                allocator private per-buffer structure previously returned
> + *                from the attach_dmabuf callback
> + * @map_dmabuf: request for access to the dmabuf from allocator; the allocator
> + *             of dmabuf is informed that this driver is going to use the
> + *             dmabuf
> + * @unmap_dmabuf: releases access control to the dmabuf - allocator is notified
> + *               that this driver is done using the dmabuf for now

I feel this requires more clarification. For example, for both detach
and unmap this says "the current DMABUF buffer is no longer used" and
"driver is done using the dmabuf for now", respectively. Without prior
knowledge of dmabuf, you don't know which one to use in which
situation. Similarly, attach and map could be clarified as well.

> @@ -56,6 +71,8 @@ struct vb2_fileio_data;
>  * Required ops for USERPTR types: get_userptr, put_userptr.
>  * Required ops for MMAP types: alloc, put, num_users, mmap.
>  * Required ops for read/write access types: alloc, put, num_users, vaddr
> + * Required ops for DMABUF types: attach_dmabuf, detach_dmabuf, map_dmabuf,
> + *                               unmap_dmabuf.
>  */
>  struct vb2_mem_ops {
>        void            *(*alloc)(void *alloc_ctx, unsigned long size);
> @@ -65,6 +82,16 @@ struct vb2_mem_ops {
>                                        unsigned long size, int write);
>        void            (*put_userptr)(void *buf_priv);
>
> +       /* Comment from Rob Clark: XXX: I think the attach / detach could be handled
> +        * in the vb2 core, and vb2_mem_ops really just need to get/put the
> +        * sglist (and make sure that the sglist fits it's needs..)
> +        */

I *strongly* agree with Rob here. Could you explain the reason behind
not doing this?
Allocator should ideally not have to be aware of attaching/detaching,
this is not specific to an allocator.

> +       void            *(*attach_dmabuf)(void *alloc_ctx,
> +                                         struct dma_buf *dbuf);
> +       void            (*detach_dmabuf)(void *buf_priv);
> +       int             (*map_dmabuf)(void *buf_priv, int write);
> +       void            (*unmap_dmabuf)(void *buf_priv);
> +
>        void            *(*vaddr)(void *buf_priv);
>        void            *(*cookie)(void *buf_priv);
>
> @@ -75,6 +102,7 @@ struct vb2_mem_ops {
>
>  struct vb2_plane {
>        void                    *mem_priv;
> +       struct dma_buf          *dbuf;
>  };
>
>  /**
> @@ -83,12 +111,14 @@ struct vb2_plane {
>  * @VB2_USERPTR:       driver supports USERPTR with streaming API
>  * @VB2_READ:          driver supports read() style access
>  * @VB2_WRITE:         driver supports write() style access
> + * @VB2_DMABUF:                driver supports DMABUF with streaming API
>  */
>  enum vb2_io_modes {
>        VB2_MMAP        = (1 << 0),
>        VB2_USERPTR     = (1 << 1),
>        VB2_READ        = (1 << 2),
>        VB2_WRITE       = (1 << 3),
> +       VB2_DMABUF      = (1 << 4),
>  };
>
>  /**
> --
> 1.7.5.4
Sumit Semwal Jan. 20, 2012, 10:41 a.m. | #5
On 20 January 2012 00:37, Pawel Osciak <pawel@osciak.com> wrote:
> Hi Sumit,
> Thank you for your work. Please find my comments below.
Hi Pawel,

Thank you for finding time for this review, and your comments :) - my
comments inline.
[Also, as an aside, Tomasz has also been working on the vb2 adaptation
to dma-buf, and his patches should be more comprehensive, in that he
is also planning to include 'vb2 as exporter' of dma-buf. He might
take and improve on this RFC, so it might be worthwhile to wait for
it?]
>
<snip>
>>  * __setup_offsets() - setup unique offsets ("cookies") for every plane in
>>  * every buffer on the queue
>>  */
>> @@ -228,6 +249,8 @@ static void __vb2_free_mem(struct vb2_queue *q, unsigned int buffers)
>>                /* Free MMAP buffers or release USERPTR buffers */
>>                if (q->memory == V4L2_MEMORY_MMAP)
>>                        __vb2_buf_mem_free(vb);
>> +               if (q->memory == V4L2_MEMORY_DMABUF)
>> +                       __vb2_buf_dmabuf_put(vb);
>>                else
>>                        __vb2_buf_userptr_put(vb);
>
> This looks like a bug. If memory is MMAP, you'd __vb2_buf_mem_free(vb)
> AND __vb2_buf_userptr_put(vb), which is wrong. Have you tested MMAP
> and USERPTR with those patches applied?
>
I agree - fairly stupid mistake on my end. will correct in the next version.
>>        }
>> @@ -350,6 +373,13 @@ static int __fill_v4l2_buffer(struct vb2_buffer *vb, struct v4l2_buffer *b)
>>                 */
>>                memcpy(b->m.planes, vb->v4l2_planes,
>>                        b->length * sizeof(struct v4l2_plane));
>> +
>> +               if (q->memory == V4L2_MEMORY_DMABUF) {
>> +                       unsigned int plane;
>> +                       for (plane = 0; plane < vb->num_planes; ++plane) {
>> +                               b->m.planes[plane].m.fd = 0;
>
> I'm confused here. Isn't this the way to return fd for userspace to
> pass to other drivers? I was imagining that the userspace would be
> getting an fd back in plane structure to pass to other drivers, i.e.
> userspace dequeuing a DMABUF v4l2_buffer should be able to pass it
> forward to another driver using fd found in dequeued buffer.
> Shouldn't this also fill in length?
>
Well, as a 'dma-buf' 'user', V4L2 will only get an FD from userspace
to map it to a dma-buf. The 'give-an-fd-to-pass-to-other-drivers' is
part of the exporter's functionality.
That's why I guess we did it like this - the __fill_vb2_buffer() does
copy data from userspace to vb2.
But perhaps you're right; it might be needed if the userspace refers
back to the fd from a dequeued buffer. Let me think through, and I
will reply again.
>> +                       }
<snip>
>> @@ -840,6 +899,11 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb, const struct v4l2_buffer *b,
>>                                        b->m.planes[plane].length;
>>                        }
>>                }
>> +               if (b->memory == V4L2_MEMORY_DMABUF) {
>> +                       for (plane = 0; plane < vb->num_planes; ++plane) {
>> +                               v4l2_planes[plane].m.fd = b->m.planes[plane].m.fd;
>
> Shouldn't this fill length too?
The reason this doesn't fill length is because length gets updated
based on the actual size of the buffer from the dma-buf gotten from
dma_buf_get() called in __qbuf_dmabuf().
>
>> +                       }
>> +               }
>>        } else {
>>                /*
>>                 * Single-planar buffers do not use planes array,
>> @@ -854,6 +918,10 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb, const struct v4l2_buffer *b,
>>                        v4l2_planes[0].m.userptr = b->m.userptr;
>>                        v4l2_planes[0].length = b->length;
>>                }
>> +               if (b->memory == V4L2_MEMORY_DMABUF) {
>> +                       v4l2_planes[0].m.fd = b->m.fd;
>
> Ditto.
see above.
>
>> +               }
>> +
>>        }
>>
>>        vb->v4l2_buf.field = b->field;
>> @@ -962,6 +1030,109 @@ static int __qbuf_mmap(struct vb2_buffer *vb, const struct v4l2_buffer *b)
>>  }
>>
>>  /**
>> + * __qbuf_dmabuf() - handle qbuf of a DMABUF buffer
>> + */
>> +static int __qbuf_dmabuf(struct vb2_buffer *vb, const struct v4l2_buffer *b)
>> +{
>> +       struct v4l2_plane planes[VIDEO_MAX_PLANES];
>> +       struct vb2_queue *q = vb->vb2_queue;
>> +       void *mem_priv;
>> +       unsigned int plane;
>> +       int ret;
>> +       int write = !V4L2_TYPE_IS_OUTPUT(q->type);
>> +
>> +       /* Verify and copy relevant information provided by the userspace */
>> +       ret = __fill_vb2_buffer(vb, b, planes);
>> +       if (ret)
>> +               return ret;
>> +
>> +       for (plane = 0; plane < vb->num_planes; ++plane) {
>> +               struct dma_buf *dbuf = dma_buf_get(planes[plane].m.fd);
>> +
>> +               if (IS_ERR_OR_NULL(dbuf)) {
>> +                       dprintk(1, "qbuf: invalid dmabuf fd for "
>> +                               "plane %d\n", plane);
>> +                       ret = PTR_ERR(dbuf);
>> +                       goto err;
>> +               }
>> +
>> +               /* this doesn't get filled in until __fill_vb2_buffer(),
>> +                * since it isn't known until after dma_buf_get()..
>> +                */
>> +               planes[plane].length = dbuf->size;
>
> But this is after dma_buf_get, unless I'm missing something... And
> __fill_vb2_buffer() is not filing length...
I guess replacing "this doesn't get filled in until
__fill_vb2_buffer()" with "We fill length here instead of in
__fill_vb2_buffer()" would make it clearer? This only informs about
why length is being filled here.
>
>> +
>> +               /* Skip the plane if already verified */
>> +               if (dbuf == vb->planes[plane].dbuf) {
>> +                       dma_buf_put(dbuf);
>> +                       continue;
>> +               }
>
> Won't this prevent us from using a buffer if the exporter only allows
> exclusive access to it?
I wouldn't think so; dma_buf_get() can be nested calls, and the
dma_buf_put() should match correspoding dma_buf_get(). It is of course
dependent on the exporter though.
>
>> +
>> +               dprintk(3, "qbuf: buffer description for plane %d changed, "
>
> s/description/descriptor ?
Right.
>
>> +                       "reattaching dma buf\n", plane);
>> +
>> +               /* Release previously acquired memory if present */
>> +               if (vb->planes[plane].mem_priv) {
>> +                       call_memop(q, plane, detach_dmabuf,
>> +                               vb->planes[plane].mem_priv);
>> +                       dma_buf_put(vb->planes[plane].dbuf);
>> +               }
>> +
>> +               vb->planes[plane].mem_priv = NULL;
>> +
>> +               /* Acquire each plane's memory */
>> +               mem_priv = q->mem_ops->attach_dmabuf(
>> +                               q->alloc_ctx[plane], dbuf);
>> +               if (IS_ERR(mem_priv)) {
>> +                       dprintk(1, "qbuf: failed acquiring dmabuf "
>> +                               "memory for plane %d\n", plane);
>> +                       ret = PTR_ERR(mem_priv);
>> +                       goto err;
>
> Since mem_priv is not assigned back to plane's mem_priv if an error
> happens here, we won't be calling dma_buf_put on this dbuf, even
> though we called _get() above.
>
Yes you're right of course - we should just do a dma-buf-put() for the
new buf in the IS_ERR() case.

<snip>
>> + * @attach_dmabuf: attach a shared struct dma_buf for a hardware operation;
>> + *                used for DMABUF memory types; alloc_ctx is the alloc context
>> + *                dbuf is the shared dma_buf; returns NULL on failure;
>> + *                allocator private per-buffer structure on success;
>> + *                this needs to be used for further accesses to the buffer
>> + * @detach_dmabuf: inform the exporter of the buffer that the current DMABUF
>> + *                buffer is no longer used; the buf_priv argument is the
>> + *                allocator private per-buffer structure previously returned
>> + *                from the attach_dmabuf callback
>> + * @map_dmabuf: request for access to the dmabuf from allocator; the allocator
>> + *             of dmabuf is informed that this driver is going to use the
>> + *             dmabuf
>> + * @unmap_dmabuf: releases access control to the dmabuf - allocator is notified
>> + *               that this driver is done using the dmabuf for now
>
> I feel this requires more clarification. For example, for both detach
> and unmap this says "the current DMABUF buffer is no longer used" and
> "driver is done using the dmabuf for now", respectively. Without prior
> knowledge of dmabuf, you don't know which one to use in which
> situation. Similarly, attach and map could be clarified as well.
Ok - maybe I will put a small pseudo code here?
like this:
attach()
while we will use it {
 map()
 dma .. etc etc
 unmap()
}
// finished using the buffer completely
detach()
>
>> @@ -56,6 +71,8 @@ struct vb2_fileio_data;
>>  * Required ops for USERPTR types: get_userptr, put_userptr.
>>  * Required ops for MMAP types: alloc, put, num_users, mmap.
>>  * Required ops for read/write access types: alloc, put, num_users, vaddr
>> + * Required ops for DMABUF types: attach_dmabuf, detach_dmabuf, map_dmabuf,
>> + *                               unmap_dmabuf.
>>  */
>>  struct vb2_mem_ops {
>>        void            *(*alloc)(void *alloc_ctx, unsigned long size);
>> @@ -65,6 +82,16 @@ struct vb2_mem_ops {
>>                                        unsigned long size, int write);
>>        void            (*put_userptr)(void *buf_priv);
>>
>> +       /* Comment from Rob Clark: XXX: I think the attach / detach could be handled
>> +        * in the vb2 core, and vb2_mem_ops really just need to get/put the
>> +        * sglist (and make sure that the sglist fits it's needs..)
>> +        */
>
> I *strongly* agree with Rob here. Could you explain the reason behind
> not doing this?
> Allocator should ideally not have to be aware of attaching/detaching,
> this is not specific to an allocator.
>
Ok, I thought we'll start with this version first, and then refine.
But you guys are right.

> --
> Best regards,
> Pawel Osciak
Tomasz Stanislawski Jan. 20, 2012, 10:58 a.m. | #6
Hi Sumit and Pawel,
Please find comments below.
On 01/20/2012 11:41 AM, Sumit Semwal wrote:
> On 20 January 2012 00:37, Pawel Osciak<pawel@osciak.com>  wrote:
>> Hi Sumit,
>> Thank you for your work. Please find my comments below.
> Hi Pawel,
>
> Thank you for finding time for this review, and your comments :) - my
> comments inline.
> [Also, as an aside, Tomasz has also been working on the vb2 adaptation
> to dma-buf, and his patches should be more comprehensive, in that he
> is also planning to include 'vb2 as exporter' of dma-buf. He might
> take and improve on this RFC, so it might be worthwhile to wait for
> it?]
>>

<snip>

>>>   struct vb2_mem_ops {
>>>         void            *(*alloc)(void *alloc_ctx, unsigned long size);
>>> @@ -65,6 +82,16 @@ struct vb2_mem_ops {
>>>                                         unsigned long size, int write);
>>>         void            (*put_userptr)(void *buf_priv);
>>>
>>> +       /* Comment from Rob Clark: XXX: I think the attach / detach could be handled
>>> +        * in the vb2 core, and vb2_mem_ops really just need to get/put the
>>> +        * sglist (and make sure that the sglist fits it's needs..)
>>> +        */
>>
>> I *strongly* agree with Rob here. Could you explain the reason behind
>> not doing this?
>> Allocator should ideally not have to be aware of attaching/detaching,
>> this is not specific to an allocator.
>>
> Ok, I thought we'll start with this version first, and then refine.
> But you guys are right.

I think that it is not possible to move attach/detach to vb2-core. The 
problem is that dma_buf_attach needs 'struct device' argument. This 
pointer is not available in vb2-core. This pointer is delivered by 
device's driver in "void *alloc_context".

Moving information about device would introduce new problems like:
- breaking layering in vb2
- some allocators like vb2-vmalloc do not posses any device related 
attributes

Best regards,
Tomasz Stanislawski

>
>> --
>> Best regards,
>> Pawel Osciak
>
Laurent Pinchart Jan. 20, 2012, 2:55 p.m. | #7
Hi Sumit,

On Thursday 05 January 2012 11:41:56 Sumit Semwal wrote:
> This patch adds support for DMABUF memory type in videobuf2. It calls
> relevant APIs of dma_buf for v4l reqbuf / qbuf / dqbuf operations.
> 
> For this version, the support is for videobuf2 as a user of the shared
> buffer; so the allocation of the buffer is done outside of V4L2. [A sample
> allocator of dma-buf shared buffer is given at [1]]
> 
> [1]: Rob Clark's DRM:
>    https://github.com/robclark/kernel-omap4/commits/drmplane-dmabuf
> 
> Signed-off-by: Tomasz Stanislawski <t.stanislaws@samsung.com>
>    [original work in the PoC for buffer sharing]
> Signed-off-by: Sumit Semwal <sumit.semwal@ti.com>
> Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
> ---
>  drivers/media/video/videobuf2-core.c |  186 ++++++++++++++++++++++++++++++-
>  include/media/videobuf2-core.h       |   30 ++++++
>  2 files changed, 215 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/media/video/videobuf2-core.c
> b/drivers/media/video/videobuf2-core.c index 95a3f5e..6cd2f97 100644
> --- a/drivers/media/video/videobuf2-core.c
> +++ b/drivers/media/video/videobuf2-core.c
> @@ -107,6 +107,27 @@ static void __vb2_buf_userptr_put(struct vb2_buffer
> *vb) }
> 
>  /**
> + * __vb2_buf_dmabuf_put() - release memory associated with
> + * a DMABUF shared buffer
> + */
> +static void __vb2_buf_dmabuf_put(struct vb2_buffer *vb)
> +{
> +	struct vb2_queue *q = vb->vb2_queue;
> +	unsigned int plane;
> +
> +	for (plane = 0; plane < vb->num_planes; ++plane) {
> +		void *mem_priv = vb->planes[plane].mem_priv;
> +
> +		if (mem_priv) {
> +			call_memop(q, plane, detach_dmabuf, mem_priv);
> +			dma_buf_put(vb->planes[plane].dbuf);
> +			vb->planes[plane].dbuf = NULL;
> +			vb->planes[plane].mem_priv = NULL;
> +		}
> +	}
> +}
> +
> +/**
>   * __setup_offsets() - setup unique offsets ("cookies") for every plane in
>   * every buffer on the queue
>   */
> @@ -228,6 +249,8 @@ static void __vb2_free_mem(struct vb2_queue *q,
> unsigned int buffers) /* Free MMAP buffers or release USERPTR buffers */
>  		if (q->memory == V4L2_MEMORY_MMAP)
>  			__vb2_buf_mem_free(vb);
> +		if (q->memory == V4L2_MEMORY_DMABUF)
> +			__vb2_buf_dmabuf_put(vb);
>  		else
>  			__vb2_buf_userptr_put(vb);
>  	}
> @@ -350,6 +373,13 @@ static int __fill_v4l2_buffer(struct vb2_buffer *vb,
> struct v4l2_buffer *b) */
>  		memcpy(b->m.planes, vb->v4l2_planes,
>  			b->length * sizeof(struct v4l2_plane));
> +
> +		if (q->memory == V4L2_MEMORY_DMABUF) {
> +			unsigned int plane;
> +			for (plane = 0; plane < vb->num_planes; ++plane) {
> +				b->m.planes[plane].m.fd = 0;
> +			}
> +		}
>  	} else {
>  		/*
>  		 * We use length and offset in v4l2_planes array even for
> @@ -361,6 +391,8 @@ static int __fill_v4l2_buffer(struct vb2_buffer *vb,
> struct v4l2_buffer *b) b->m.offset = vb->v4l2_planes[0].m.mem_offset;
>  		else if (q->memory == V4L2_MEMORY_USERPTR)
>  			b->m.userptr = vb->v4l2_planes[0].m.userptr;
> +		else if (q->memory == V4L2_MEMORY_DMABUF)
> +			b->m.fd = 0;
>  	}
> 
>  	/*
> @@ -452,6 +484,21 @@ static int __verify_mmap_ops(struct vb2_queue *q)
>  }
> 
>  /**
> + * __verify_dmabuf_ops() - verify that all memory operations required for
> + * DMABUF queue type have been provided
> + */
> +static int __verify_dmabuf_ops(struct vb2_queue *q)
> +{
> +	if (!(q->io_modes & VB2_DMABUF) || !q->mem_ops->attach_dmabuf
> +			|| !q->mem_ops->detach_dmabuf
> +			|| !q->mem_ops->map_dmabuf
> +			|| !q->mem_ops->unmap_dmabuf)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +/**
>   * vb2_reqbufs() - Initiate streaming
>   * @q:		videobuf2 queue
>   * @req:	struct passed from userspace to vidioc_reqbufs handler in driver
> @@ -485,6 +532,7 @@ int vb2_reqbufs(struct vb2_queue *q, struct
> v4l2_requestbuffers *req) }
> 
>  	if (req->memory != V4L2_MEMORY_MMAP
> +			&& req->memory != V4L2_MEMORY_DMABUF
>  			&& req->memory != V4L2_MEMORY_USERPTR) {
>  		dprintk(1, "reqbufs: unsupported memory type\n");
>  		return -EINVAL;
> @@ -514,6 +562,11 @@ int vb2_reqbufs(struct vb2_queue *q, struct
> v4l2_requestbuffers *req) return -EINVAL;
>  	}
> 
> +	if (req->memory == V4L2_MEMORY_DMABUF && __verify_dmabuf_ops(q)) {
> +		dprintk(1, "reqbufs: DMABUF for current setup unsupported\n");
> +		return -EINVAL;
> +	}
> +
>  	if (req->count == 0 || q->num_buffers != 0 || q->memory != req->memory) {
>  		/*
>  		 * We already have buffers allocated, so first check if they
> @@ -621,7 +674,8 @@ int vb2_create_bufs(struct vb2_queue *q, struct
> v4l2_create_buffers *create) }
> 
>  	if (create->memory != V4L2_MEMORY_MMAP
> -			&& create->memory != V4L2_MEMORY_USERPTR) {
> +			&& create->memory != V4L2_MEMORY_USERPTR
> +			&& create->memory != V4L2_MEMORY_DMABUF) {
>  		dprintk(1, "%s(): unsupported memory type\n", __func__);
>  		return -EINVAL;
>  	}
> @@ -645,6 +699,11 @@ int vb2_create_bufs(struct vb2_queue *q, struct
> v4l2_create_buffers *create) return -EINVAL;
>  	}
> 
> +	if (create->memory == V4L2_MEMORY_DMABUF && __verify_dmabuf_ops(q)) {
> +		dprintk(1, "%s(): DMABUF for current setup unsupported\n", __func__);
> +		return -EINVAL;
> +	}
> +
>  	if (q->num_buffers == VIDEO_MAX_FRAME) {
>  		dprintk(1, "%s(): maximum number of buffers already allocated\n",
>  			__func__);
> @@ -840,6 +899,11 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb,
> const struct v4l2_buffer *b, b->m.planes[plane].length;
>  			}
>  		}
> +		if (b->memory == V4L2_MEMORY_DMABUF) {
> +			for (plane = 0; plane < vb->num_planes; ++plane) {
> +				v4l2_planes[plane].m.fd = b->m.planes[plane].m.fd;
> +			}

No need for braces.

> +		}
>  	} else {
>  		/*
>  		 * Single-planar buffers do not use planes array,
> @@ -854,6 +918,10 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb,
> const struct v4l2_buffer *b, v4l2_planes[0].m.userptr = b->m.userptr;
>  			v4l2_planes[0].length = b->length;
>  		}
> +		if (b->memory == V4L2_MEMORY_DMABUF) {
> +			v4l2_planes[0].m.fd = b->m.fd;
> +		}

No need for braces.

> +
>  	}
> 
>  	vb->v4l2_buf.field = b->field;
> @@ -962,6 +1030,109 @@ static int __qbuf_mmap(struct vb2_buffer *vb, const
> struct v4l2_buffer *b) }
> 
>  /**
> + * __qbuf_dmabuf() - handle qbuf of a DMABUF buffer
> + */
> +static int __qbuf_dmabuf(struct vb2_buffer *vb, const struct v4l2_buffer
> *b) +{
> +	struct v4l2_plane planes[VIDEO_MAX_PLANES];
> +	struct vb2_queue *q = vb->vb2_queue;
> +	void *mem_priv;
> +	unsigned int plane;
> +	int ret;
> +	int write = !V4L2_TYPE_IS_OUTPUT(q->type);
> +
> +	/* Verify and copy relevant information provided by the userspace */
> +	ret = __fill_vb2_buffer(vb, b, planes);
> +	if (ret)
> +		return ret;
> +
> +	for (plane = 0; plane < vb->num_planes; ++plane) {
> +		struct dma_buf *dbuf = dma_buf_get(planes[plane].m.fd);
> +
> +		if (IS_ERR_OR_NULL(dbuf)) {
> +			dprintk(1, "qbuf: invalid dmabuf fd for "
> +				"plane %d\n", plane);
> +			ret = PTR_ERR(dbuf);

Can dma_buf_get() return NULL ? If so ret will be equal to 0, which indicates 
success. If not, you can replace IS_ERR_OR_NULL by IS_ERR.

> +			goto err;
> +		}
> +
> +		/* this doesn't get filled in until __fill_vb2_buffer(),
> +		 * since it isn't known until after dma_buf_get()..
> +		 */
> +		planes[plane].length = dbuf->size;
> +
> +		/* Skip the plane if already verified */
> +		if (dbuf == vb->planes[plane].dbuf) {
> +			dma_buf_put(dbuf);
> +			continue;

How expensive are dma_buf_get()/dma_buf_put() ? Is they're expensive, would 
there be a way to skip them if the video buffer references the same dma_buf ?

> +		}
> +
> +		dprintk(3, "qbuf: buffer description for plane %d changed, "
> +			"reattaching dma buf\n", plane);
> +
> +		/* Release previously acquired memory if present */
> +		if (vb->planes[plane].mem_priv) {
> +			call_memop(q, plane, detach_dmabuf,
> +				vb->planes[plane].mem_priv);
> +			dma_buf_put(vb->planes[plane].dbuf);
> +		}
> +
> +		vb->planes[plane].mem_priv = NULL;
> +
> +		/* Acquire each plane's memory */
> +		mem_priv = q->mem_ops->attach_dmabuf(
> +				q->alloc_ctx[plane], dbuf);
> +		if (IS_ERR(mem_priv)) {
> +			dprintk(1, "qbuf: failed acquiring dmabuf "
> +				"memory for plane %d\n", plane);
> +			ret = PTR_ERR(mem_priv);
> +			goto err;
> +		}
> +
> +		vb->planes[plane].dbuf = dbuf;
> +		vb->planes[plane].mem_priv = mem_priv;
> +	}
> +
> +	/* TODO: This pins the buffer(s) with  dma_buf_map_attachment()).. but
> +	 * really we want to do this just before the DMA, not while queueing
> +	 * the buffer(s)..
> +	 */

Mapping the buffer is an expensive operation which should be avoided when 
possible. Regardless of whether you move it to just before starting DMA or 
not, creating the sg list and mapping it through an IOMMU will have an 
important performance impact. That's why V4L2 uses streaming mappings.

I don't see a way to support streaming mappings with the current dma_buf API. 
That's in my opinion a major limitation that makes dma_buf unusable in V4L2 
for real world applications. Are you planning to fix this ?

> +	for (plane = 0; plane < vb->num_planes; ++plane) {
> +		ret = q->mem_ops->map_dmabuf(
> +				vb->planes[plane].mem_priv, write);
> +		if (ret) {
> +			dprintk(1, "qbuf: failed mapping dmabuf "
> +				"memory for plane %d\n", plane);
> +			goto err;
> +		}
> +	}
> +
> +	/*
> +	 * Call driver-specific initialization on the newly acquired buffer,
> +	 * if provided.
> +	 */
> +	ret = call_qop(q, buf_init, vb);
> +	if (ret) {
> +		dprintk(1, "qbuf: buffer initialization failed\n");
> +		goto err;
> +	}
> +
> +	/*
> +	 * Now that everything is in order, copy relevant information
> +	 * provided by userspace.
> +	 */
> +	for (plane = 0; plane < vb->num_planes; ++plane)
> +		vb->v4l2_planes[plane] = planes[plane];
> +
> +	return 0;
> +err:
> +	/* In case of errors, release planes that were already acquired */
> +	__vb2_buf_dmabuf_put(vb);
> +
> +	return ret;
> +}
> +
> +/**
>   * __enqueue_in_driver() - enqueue a vb2_buffer in driver for processing
>   */
>  static void __enqueue_in_driver(struct vb2_buffer *vb)
> @@ -985,6 +1156,9 @@ static int __buf_prepare(struct vb2_buffer *vb, const
> struct v4l2_buffer *b) case V4L2_MEMORY_USERPTR:
>  		ret = __qbuf_userptr(vb, b);
>  		break;
> +	case V4L2_MEMORY_DMABUF:
> +		ret = __qbuf_dmabuf(vb, b);
> +		break;
>  	default:
>  		WARN(1, "Invalid queue type\n");
>  		ret = -EINVAL;
> @@ -1284,6 +1458,7 @@ int vb2_dqbuf(struct vb2_queue *q, struct v4l2_buffer
> *b, bool nonblocking) {
>  	struct vb2_buffer *vb = NULL;
>  	int ret;
> +	unsigned int plane;
> 
>  	if (q->fileio) {
>  		dprintk(1, "dqbuf: file io in progress\n");
> @@ -1307,6 +1482,15 @@ int vb2_dqbuf(struct vb2_queue *q, struct
> v4l2_buffer *b, bool nonblocking) return ret;
>  	}
> 
> +	/* TODO: this unpins the buffer(dma_buf_unmap_attachment()).. but
> +	 * really we want tot do this just after DMA, not when the
> +	 * buffer is dequeued..
> +	 */
> +	if (q->memory == V4L2_MEMORY_DMABUF)
> +		for (plane = 0; plane < vb->num_planes; ++plane)
> +			call_memop(q, plane, unmap_dmabuf,
> +				vb->planes[plane].mem_priv);
> +
>  	switch (vb->state) {
>  	case VB2_BUF_STATE_DONE:
>  		dprintk(3, "dqbuf: Returning done buffer\n");
> diff --git a/include/media/videobuf2-core.h
> b/include/media/videobuf2-core.h index a15d1f1..5c1836d 100644
> --- a/include/media/videobuf2-core.h
> +++ b/include/media/videobuf2-core.h
> @@ -16,6 +16,7 @@
>  #include <linux/mutex.h>
>  #include <linux/poll.h>
>  #include <linux/videodev2.h>
> +#include <linux/dma-buf.h>

Please keep headers sorted alphabetically.

> 
>  struct vb2_alloc_ctx;
>  struct vb2_fileio_data;

[snip]
Laurent Pinchart Jan. 20, 2012, 3:04 p.m. | #8
Hi Sumit,

On Monday 16 January 2012 06:33:31 Semwal, Sumit wrote:
> On Sun, Jan 15, 2012 at 2:08 AM, Sakari Ailus <sakari.ailus@iki.fi> wrote:
> > Hi Sumit,
> > 
> > Thanks for the patch!
> 
> Hi Sakari,
> 
> Thanks for reviewing this :)
> 
> > <snip>
> > Shouldn't the buffer mapping only be done at the first call to
> > __qbuf_dmabuf()? On latter calls, the cache would need to be handled
> > according to presence of V4L2_BUF_FLAG_NO_CACHE_CLEAN /
> > V4L2_BUF_FLAG_NO_CACHE_INVALIDATE in v4l2_buffer.
> 
> Well, the 'map / unmap' implementation is by design exporter-specific; so
> the exporter of the buffer may choose to, depending on the use case,
> 'map-and-keep' on first call to map_dmabuf, and do actual unmap only at
> 'release' time. This will mean that the {map,unmap}_dmabuf calls will be
> used mostly for 'access-bracketing' between multiple users, and not for
> actual map/unmap each time.
> Again, the framework is flexible enough to allow exporters to actually
> map/unmap as required (think cases where backing-storage migration might be
> needed while buffers are in use - in that case, when all current users have
> called unmap_XXX() on a buffer, it can be migrated, and the next map_XXX()
> calls could give different mappings back to the users).
> The kernel 'users' of dma-buf [in case of this RFC, v4l2] should not
> ideally need to worry about the actual mapping/unmapping that is done; the
> buffer exporter in a particular use-case should be able to handle it.

I'm afraid it's more complex than that. Your patch calls q->mem_ops-
>map_dmabuf() at every VIDIOC_QBUF call. The function will call 
dma_buf_map_attachment(), which could cache the mapping somehow (even though 
that triggers an alarm somewhere in my brain, deciding in the exporter how to 
do so will likely cause issues - I'll try to sort my thoughts out on this), 
but it will also be responsible for mapping the sg list to the V4L2 device 
IOMMU (not for dma-contig obviously, but this code is in videobuf2-core.c). 
This is an expensive operation that we don't want to perform at every 
QBUF/DQBUF.

V4L2 uses streaming DMA mappings, partly for performance reasons. That's 
something dma-buf will likely need to support. Or you could argue that 
streaming DMA mappings are broken by design on some platform anyway, but then 
I'll expect a proposal for an efficient replacement :-)

> <snip>
> 
> > Same here, except reverse: this only should be done when the buffer is
> > destroyed --- either when the user explicitly calls reqbufs(0) or when
> > the file handle owning this buffer is being closed.
> > 
> > Mapping buffers at every prepare_buf and unmapping them in dqbuf is
> > prohibitively expensive. Same goes for many other APIs than V4L2, I
> > think.
> > 
> > I wonder if the means to do this exists already.
> > 
> > I have to admit I haven't followed the dma_buf discussion closely so I
> > might be missing something relevant here.
> 
> Hope the above explanation helps. Please do not hesitate to contact if you
> need more details.
Laurent Pinchart Jan. 20, 2012, 3:12 p.m. | #9
Hi Tomasz,

On Friday 20 January 2012 11:58:39 Tomasz Stanislawski wrote:
> On 01/20/2012 11:41 AM, Sumit Semwal wrote:
> > On 20 January 2012 00:37, Pawel Osciak<pawel@osciak.com>  wrote:
> >> Hi Sumit,
> >> Thank you for your work. Please find my comments below.
> > 
> > Hi Pawel,
> > 
> > Thank you for finding time for this review, and your comments :) - my
> > comments inline.
> > [Also, as an aside, Tomasz has also been working on the vb2 adaptation
> > to dma-buf, and his patches should be more comprehensive, in that he
> > is also planning to include 'vb2 as exporter' of dma-buf. He might
> > take and improve on this RFC, so it might be worthwhile to wait for
> > it?]
> 
> <snip>
> 
> >>>   struct vb2_mem_ops {
> >>>   
> >>>         void            *(*alloc)(void *alloc_ctx, unsigned long size);
> >>> 
> >>> @@ -65,6 +82,16 @@ struct vb2_mem_ops {
> >>> 
> >>>                                         unsigned long size, int write);
> >>>         
> >>>         void            (*put_userptr)(void *buf_priv);
> >>> 
> >>> +       /* Comment from Rob Clark: XXX: I think the attach / detach
> >>> could be handled +        * in the vb2 core, and vb2_mem_ops really
> >>> just need to get/put the +        * sglist (and make sure that the
> >>> sglist fits it's needs..) +        */
> >> 
> >> I *strongly* agree with Rob here. Could you explain the reason behind
> >> not doing this?
> >> Allocator should ideally not have to be aware of attaching/detaching,
> >> this is not specific to an allocator.
> > 
> > Ok, I thought we'll start with this version first, and then refine.
> > But you guys are right.
> 
> I think that it is not possible to move attach/detach to vb2-core. The
> problem is that dma_buf_attach needs 'struct device' argument. This
> pointer is not available in vb2-core. This pointer is delivered by
> device's driver in "void *alloc_context".
> 
> Moving information about device would introduce new problems like:
> - breaking layering in vb2
> - some allocators like vb2-vmalloc do not posses any device related
> attributes

What about passing the device to vb2-core then ?
Tomasz Stanislawski Jan. 20, 2012, 3:53 p.m. | #10
Hi Laurent,

On 01/20/2012 04:12 PM, Laurent Pinchart wrote:
> Hi Tomasz,
>
> On Friday 20 January 2012 11:58:39 Tomasz Stanislawski wrote:
>> On 01/20/2012 11:41 AM, Sumit Semwal wrote:
>>> On 20 January 2012 00:37, Pawel Osciak<pawel@osciak.com>   wrote:
>>>> Hi Sumit,
>>>> Thank you for your work. Please find my comments below.
>>>
>>> Hi Pawel,
>>>
<snip>
>>>>>    struct vb2_mem_ops {
>>>>>
>>>>>          void            *(*alloc)(void *alloc_ctx, unsigned long size);
>>>>>
>>>>> @@ -65,6 +82,16 @@ struct vb2_mem_ops {
>>>>>
>>>>>                                          unsigned long size, int write);
>>>>>
>>>>>          void            (*put_userptr)(void *buf_priv);
>>>>>
>>>>> +       /* Comment from Rob Clark: XXX: I think the attach / detach
>>>>> could be handled +        * in the vb2 core, and vb2_mem_ops really
>>>>> just need to get/put the +        * sglist (and make sure that the
>>>>> sglist fits it's needs..) +        */
>>>>
>>>> I *strongly* agree with Rob here. Could you explain the reason behind
>>>> not doing this?
>>>> Allocator should ideally not have to be aware of attaching/detaching,
>>>> this is not specific to an allocator.
>>>
>>> Ok, I thought we'll start with this version first, and then refine.
>>> But you guys are right.
>>
>> I think that it is not possible to move attach/detach to vb2-core. The
>> problem is that dma_buf_attach needs 'struct device' argument. This
>> pointer is not available in vb2-core. This pointer is delivered by
>> device's driver in "void *alloc_context".
>>
>> Moving information about device would introduce new problems like:
>> - breaking layering in vb2
>> - some allocators like vb2-vmalloc do not posses any device related
>> attributes
>
> What about passing the device to vb2-core then ?
>

IMO, One way to do this is adding field 'struct device *dev' to struct 
vb2_queue. This field should be filled by a driver prior to calling 
vb2_queue_init.

Regards,
Tomasz Stanislawski
Laurent Pinchart Jan. 20, 2012, 4:11 p.m. | #11
Hi Tomasz,

On Friday 20 January 2012 16:53:20 Tomasz Stanislawski wrote:
> On 01/20/2012 04:12 PM, Laurent Pinchart wrote:
> > On Friday 20 January 2012 11:58:39 Tomasz Stanislawski wrote:
> >> On 01/20/2012 11:41 AM, Sumit Semwal wrote:
> >>> On 20 January 2012 00:37, Pawel Osciak<pawel@osciak.com>   wrote:
> >>>> Hi Sumit,
> >>>> Thank you for your work. Please find my comments below.
> >>> 
> >>> Hi Pawel,
> 
> <snip>
> 
> >>>>>    struct vb2_mem_ops {
> >>>>>    
> >>>>>          void            *(*alloc)(void *alloc_ctx, unsigned long
> >>>>>          size);
> >>>>> 
> >>>>> @@ -65,6 +82,16 @@ struct vb2_mem_ops {
> >>>>> 
> >>>>>                                          unsigned long size, int
> >>>>>                                          write);
> >>>>>          
> >>>>>          void            (*put_userptr)(void *buf_priv);
> >>>>> 
> >>>>> +       /* Comment from Rob Clark: XXX: I think the attach / detach
> >>>>> could be handled +        * in the vb2 core, and vb2_mem_ops really
> >>>>> just need to get/put the +        * sglist (and make sure that the
> >>>>> sglist fits it's needs..) +        */
> >>>> 
> >>>> I *strongly* agree with Rob here. Could you explain the reason behind
> >>>> not doing this?
> >>>> Allocator should ideally not have to be aware of attaching/detaching,
> >>>> this is not specific to an allocator.
> >>> 
> >>> Ok, I thought we'll start with this version first, and then refine.
> >>> But you guys are right.
> >> 
> >> I think that it is not possible to move attach/detach to vb2-core. The
> >> problem is that dma_buf_attach needs 'struct device' argument. This
> >> pointer is not available in vb2-core. This pointer is delivered by
> >> device's driver in "void *alloc_context".
> >> 
> >> Moving information about device would introduce new problems like:
> >> - breaking layering in vb2
> >> - some allocators like vb2-vmalloc do not posses any device related
> >> attributes
> > 
> > What about passing the device to vb2-core then ?
> 
> IMO, One way to do this is adding field 'struct device *dev' to struct
> vb2_queue. This field should be filled by a driver prior to calling
> vb2_queue_init.

I haven't looked into the details, but that sounds good to me. Do we have use 
cases where a queue is allocated before knowing which physical device it will 
be used for ?
Tomasz Stanislawski Jan. 20, 2012, 4:20 p.m. | #12
>>
>> IMO, One way to do this is adding field 'struct device *dev' to struct
>> vb2_queue. This field should be filled by a driver prior to calling
>> vb2_queue_init.
>
> I haven't looked into the details, but that sounds good to me. Do we have use
> cases where a queue is allocated before knowing which physical device it will
> be used for ?
>

I don't think so. In case of S5P drivers, vb2_queue_init is called while 
opening /dev/videoX.

BTW. This struct device may help vb2 to produce logs with more 
descriptive client annotation.

What happens if such a device is NULL. It would happen for vmalloc 
allocator used by VIVI?

Regards,
Tomasz Stanislawski
Laurent Pinchart Jan. 20, 2012, 4:28 p.m. | #13
On Friday 20 January 2012 17:20:22 Tomasz Stanislawski wrote:
> >> IMO, One way to do this is adding field 'struct device *dev' to struct
> >> vb2_queue. This field should be filled by a driver prior to calling
> >> vb2_queue_init.
> > 
> > I haven't looked into the details, but that sounds good to me. Do we have
> > use cases where a queue is allocated before knowing which physical
> > device it will be used for ?
> 
> I don't think so. In case of S5P drivers, vb2_queue_init is called while
> opening /dev/videoX.
> 
> BTW. This struct device may help vb2 to produce logs with more
> descriptive client annotation.
> 
> What happens if such a device is NULL. It would happen for vmalloc
> allocator used by VIVI?

Good question. Should dma-buf accept NULL devices ? Or should vivi pass its 
V4L2 device to vb2 ?
Marek Szyprowski Jan. 23, 2012, 9:06 a.m. | #14
Hello,

On Friday, January 20, 2012 5:29 PM Laurent Pinchart wrote:

> On Friday 20 January 2012 17:20:22 Tomasz Stanislawski wrote:
> > >> IMO, One way to do this is adding field 'struct device *dev' to struct
> > >> vb2_queue. This field should be filled by a driver prior to calling
> > >> vb2_queue_init.
> > >
> > > I haven't looked into the details, but that sounds good to me. Do we have
> > > use cases where a queue is allocated before knowing which physical
> > > device it will be used for ?
> >
> > I don't think so. In case of S5P drivers, vb2_queue_init is called while
> > opening /dev/videoX.
> >
> > BTW. This struct device may help vb2 to produce logs with more
> > descriptive client annotation.
> >
> > What happens if such a device is NULL. It would happen for vmalloc
> > allocator used by VIVI?
> 
> Good question. Should dma-buf accept NULL devices ? Or should vivi pass its
> V4L2 device to vb2 ?

I assume you suggested using struct video_device->dev entry in such case. 
It will not work. DMA-mapping API requires some parameters to be set for the 
client device, like for example dma mask. struct video_device contains only an
artificial struct device entry, which has no relation to any physical device 
and cannot be used for calling DMA-mapping functions.

Performing dma_map_* operations with such artificial struct device doesn't make
any sense. It also slows down things significantly due to cache flushing 
(forced by dma-mapping) which should be avoided if the buffer is accessed only 
with CPU (like it is done by vb2-vmalloc style drivers).

IMHO this case perfectly shows the design mistake that have been made. The
current version simply tries to do too much. 

Each client of dma_buf should 'map' the provided sgtable/scatterlist on its own.
Only the client device driver has all knowledge to make a proper 'mapping'.
Real physical devices usually will use dma_map_sg() for such operation, while
some virtual ones will only create a kernel mapping for the provided scatterlist
(like vivi with vmalloc memory module).

Best regards
Daniel Vetter Jan. 23, 2012, 9:40 a.m. | #15
On Mon, Jan 23, 2012 at 10:06:57AM +0100, Marek Szyprowski wrote:
> Hello,
> 
> On Friday, January 20, 2012 5:29 PM Laurent Pinchart wrote:
> 
> > On Friday 20 January 2012 17:20:22 Tomasz Stanislawski wrote:
> > > >> IMO, One way to do this is adding field 'struct device *dev' to struct
> > > >> vb2_queue. This field should be filled by a driver prior to calling
> > > >> vb2_queue_init.
> > > >
> > > > I haven't looked into the details, but that sounds good to me. Do we have
> > > > use cases where a queue is allocated before knowing which physical
> > > > device it will be used for ?
> > >
> > > I don't think so. In case of S5P drivers, vb2_queue_init is called while
> > > opening /dev/videoX.
> > >
> > > BTW. This struct device may help vb2 to produce logs with more
> > > descriptive client annotation.
> > >
> > > What happens if such a device is NULL. It would happen for vmalloc
> > > allocator used by VIVI?
> > 
> > Good question. Should dma-buf accept NULL devices ? Or should vivi pass its
> > V4L2 device to vb2 ?
> 
> I assume you suggested using struct video_device->dev entry in such case. 
> It will not work. DMA-mapping API requires some parameters to be set for the 
> client device, like for example dma mask. struct video_device contains only an
> artificial struct device entry, which has no relation to any physical device 
> and cannot be used for calling DMA-mapping functions.
> 
> Performing dma_map_* operations with such artificial struct device doesn't make
> any sense. It also slows down things significantly due to cache flushing 
> (forced by dma-mapping) which should be avoided if the buffer is accessed only 
> with CPU (like it is done by vb2-vmalloc style drivers).
> 
> IMHO this case perfectly shows the design mistake that have been made. The
> current version simply tries to do too much. 

Nope, the current dma_buf does too little. Atm it's simple not useable for
drivers that need cpu access, at least not if you're willing to resort to
ugly an non-portable tricks like prime.

We've discussed this quite a bit and decided that solving cpu access and
coherency with n other devices involved is too much v1. It looks like we
need to add that extension rather sooner than later.
-Daniel
Daniel Vetter Jan. 23, 2012, 9:45 a.m. | #16
On Mon, Jan 23, 2012 at 10:40:07AM +0100, Daniel Vetter wrote:
> On Mon, Jan 23, 2012 at 10:06:57AM +0100, Marek Szyprowski wrote:
> > Hello,
> > 
> > On Friday, January 20, 2012 5:29 PM Laurent Pinchart wrote:
> > 
> > > On Friday 20 January 2012 17:20:22 Tomasz Stanislawski wrote:
> > > > >> IMO, One way to do this is adding field 'struct device *dev' to struct
> > > > >> vb2_queue. This field should be filled by a driver prior to calling
> > > > >> vb2_queue_init.
> > > > >
> > > > > I haven't looked into the details, but that sounds good to me. Do we have
> > > > > use cases where a queue is allocated before knowing which physical
> > > > > device it will be used for ?
> > > >
> > > > I don't think so. In case of S5P drivers, vb2_queue_init is called while
> > > > opening /dev/videoX.
> > > >
> > > > BTW. This struct device may help vb2 to produce logs with more
> > > > descriptive client annotation.
> > > >
> > > > What happens if such a device is NULL. It would happen for vmalloc
> > > > allocator used by VIVI?
> > > 
> > > Good question. Should dma-buf accept NULL devices ? Or should vivi pass its
> > > V4L2 device to vb2 ?
> > 
> > I assume you suggested using struct video_device->dev entry in such case. 
> > It will not work. DMA-mapping API requires some parameters to be set for the 
> > client device, like for example dma mask. struct video_device contains only an
> > artificial struct device entry, which has no relation to any physical device 
> > and cannot be used for calling DMA-mapping functions.
> > 
> > Performing dma_map_* operations with such artificial struct device doesn't make
> > any sense. It also slows down things significantly due to cache flushing 
> > (forced by dma-mapping) which should be avoided if the buffer is accessed only 
> > with CPU (like it is done by vb2-vmalloc style drivers).
> > 
> > IMHO this case perfectly shows the design mistake that have been made. The
> > current version simply tries to do too much. 
> 
> Nope, the current dma_buf does too little. Atm it's simple not useable for
> drivers that need cpu access, at least not if you're willing to resort to
> ugly an non-portable tricks like prime.

Argh, there's a 'not' missing in the above sentence: CPU access is not
possible, at elast not if you're *not* willing to resert to ugly ...

> We've discussed this quite a bit and decided that solving cpu access and
> coherency with n other devices involved is too much v1. It looks like we
> need to add that extension rather sooner than later.
-Daneil
Laurent Pinchart Jan. 23, 2012, 9:48 a.m. | #17
Hi Marek,

On Monday 23 January 2012 10:06:57 Marek Szyprowski wrote:
> On Friday, January 20, 2012 5:29 PM Laurent Pinchart wrote:
> > On Friday 20 January 2012 17:20:22 Tomasz Stanislawski wrote:
> > > >> IMO, One way to do this is adding field 'struct device *dev' to
> > > >> struct vb2_queue. This field should be filled by a driver prior to
> > > >> calling vb2_queue_init.
> > > > 
> > > > I haven't looked into the details, but that sounds good to me. Do we
> > > > have use cases where a queue is allocated before knowing which
> > > > physical device it will be used for ?
> > > 
> > > I don't think so. In case of S5P drivers, vb2_queue_init is called
> > > while opening /dev/videoX.
> > > 
> > > BTW. This struct device may help vb2 to produce logs with more
> > > descriptive client annotation.
> > > 
> > > What happens if such a device is NULL. It would happen for vmalloc
> > > allocator used by VIVI?
> > 
> > Good question. Should dma-buf accept NULL devices ? Or should vivi pass
> > its V4L2 device to vb2 ?
> 
> I assume you suggested using struct video_device->dev entry in such case.
> It will not work. DMA-mapping API requires some parameters to be set for
> the client device, like for example dma mask. struct video_device contains
> only an artificial struct device entry, which has no relation to any
> physical device and cannot be used for calling DMA-mapping functions.
> 
> Performing dma_map_* operations with such artificial struct device doesn't
> make any sense. It also slows down things significantly due to cache
> flushing (forced by dma-mapping) which should be avoided if the buffer is
> accessed only with CPU (like it is done by vb2-vmalloc style drivers).

I agree that mapping the buffer to the physical device doesn't make any sense, 
as there's simple no physical device to map the buffer to. In that case we 
could simply skip the dma_map/dma_unmap calls.

Note, however, that dma-buf v1 explicitly does not support CPU access by the 
importer.

> IMHO this case perfectly shows the design mistake that have been made. The
> current version simply tries to do too much.
> 
> Each client of dma_buf should 'map' the provided sgtable/scatterlist on its
> own. Only the client device driver has all knowledge to make a proper
> 'mapping'. Real physical devices usually will use dma_map_sg() for such
> operation, while some virtual ones will only create a kernel mapping for
> the provided scatterlist (like vivi with vmalloc memory module).

I tend to agree with that. Depending on the importer device, drivers could 
then map/unmap the buffer around each DMA access, or keep a mapping and sync 
the buffer.

What about splitting the map_dma_buf operation into an operation that backs 
the buffer with pages and returns an sg_list, and an operation that performs 
DMA synchronization with the exporter ? unmap_dma_buf would similarly be split 
in two operations.
Daniel Vetter Jan. 23, 2012, 10:35 a.m. | #18
On Mon, Jan 23, 2012 at 10:48, Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
> Hi Marek,
>
> On Monday 23 January 2012 10:06:57 Marek Szyprowski wrote:
>> On Friday, January 20, 2012 5:29 PM Laurent Pinchart wrote:
>> > On Friday 20 January 2012 17:20:22 Tomasz Stanislawski wrote:
>> > > >> IMO, One way to do this is adding field 'struct device *dev' to
>> > > >> struct vb2_queue. This field should be filled by a driver prior to
>> > > >> calling vb2_queue_init.
>> > > >
>> > > > I haven't looked into the details, but that sounds good to me. Do we
>> > > > have use cases where a queue is allocated before knowing which
>> > > > physical device it will be used for ?
>> > >
>> > > I don't think so. In case of S5P drivers, vb2_queue_init is called
>> > > while opening /dev/videoX.
>> > >
>> > > BTW. This struct device may help vb2 to produce logs with more
>> > > descriptive client annotation.
>> > >
>> > > What happens if such a device is NULL. It would happen for vmalloc
>> > > allocator used by VIVI?
>> >
>> > Good question. Should dma-buf accept NULL devices ? Or should vivi pass
>> > its V4L2 device to vb2 ?
>>
>> I assume you suggested using struct video_device->dev entry in such case.
>> It will not work. DMA-mapping API requires some parameters to be set for
>> the client device, like for example dma mask. struct video_device contains
>> only an artificial struct device entry, which has no relation to any
>> physical device and cannot be used for calling DMA-mapping functions.
>>
>> Performing dma_map_* operations with such artificial struct device doesn't
>> make any sense. It also slows down things significantly due to cache
>> flushing (forced by dma-mapping) which should be avoided if the buffer is
>> accessed only with CPU (like it is done by vb2-vmalloc style drivers).
>
> I agree that mapping the buffer to the physical device doesn't make any sense,
> as there's simple no physical device to map the buffer to. In that case we
> could simply skip the dma_map/dma_unmap calls.

See my other mail, dma_buf v1 does not support cpu access. So if you
don't have a device around, you can't use it in it's current form.

> Note, however, that dma-buf v1 explicitly does not support CPU access by the
> importer.
>
>> IMHO this case perfectly shows the design mistake that have been made. The
>> current version simply tries to do too much.
>>
>> Each client of dma_buf should 'map' the provided sgtable/scatterlist on its
>> own. Only the client device driver has all knowledge to make a proper
>> 'mapping'. Real physical devices usually will use dma_map_sg() for such
>> operation, while some virtual ones will only create a kernel mapping for
>> the provided scatterlist (like vivi with vmalloc memory module).
>
> I tend to agree with that. Depending on the importer device, drivers could
> then map/unmap the buffer around each DMA access, or keep a mapping and sync
> the buffer.

Again we've discussed adding a syncing op to the interface that would
allow keeping around mappings. The thing is that this also requires an
unmap callback or something similar, so that the exporter can inform
the importer that the memory just moved around. And the exporter
_needs_ to be able to do that, hence also the language in the doc that
importers need to braked all uses with a map/unmap and can't sit
forever on a dma_buf mapping.

> What about splitting the map_dma_buf operation into an operation that backs
> the buffer with pages and returns an sg_list, and an operation that performs
> DMA synchronization with the exporter ? unmap_dma_buf would similarly be split
> in two operations.

Again for v1 that doesn't make sense because you can't do cpu access
anyway and you should not hang onto mappings forever. Furthermore we
have cases where an unmapped sg_list for cpu access simple makes no
sense.

Yours, Daniel

[Aside: You can do a dirty trick like prime that grabs the underlying
page of an already mapped sg list. Obviously highly non-portable.]
Laurent Pinchart Jan. 23, 2012, 10:54 a.m. | #19
Hi Daniel,

On Monday 23 January 2012 11:35:01 Daniel Vetter wrote:
> On Mon, Jan 23, 2012 at 10:48, Laurent Pinchart wrote:
> > On Monday 23 January 2012 10:06:57 Marek Szyprowski wrote:
> >> On Friday, January 20, 2012 5:29 PM Laurent Pinchart wrote:
> >> > On Friday 20 January 2012 17:20:22 Tomasz Stanislawski wrote:
> >> > > >> IMO, One way to do this is adding field 'struct device *dev' to
> >> > > >> struct vb2_queue. This field should be filled by a driver prior
> >> > > >> to calling vb2_queue_init.
> >> > > > 
> >> > > > I haven't looked into the details, but that sounds good to me. Do
> >> > > > we have use cases where a queue is allocated before knowing which
> >> > > > physical device it will be used for ?
> >> > > 
> >> > > I don't think so. In case of S5P drivers, vb2_queue_init is called
> >> > > while opening /dev/videoX.
> >> > > 
> >> > > BTW. This struct device may help vb2 to produce logs with more
> >> > > descriptive client annotation.
> >> > > 
> >> > > What happens if such a device is NULL. It would happen for vmalloc
> >> > > allocator used by VIVI?
> >> > 
> >> > Good question. Should dma-buf accept NULL devices ? Or should vivi
> >> > pass its V4L2 device to vb2 ?
> >> 
> >> I assume you suggested using struct video_device->dev entry in such
> >> case. It will not work. DMA-mapping API requires some parameters to be
> >> set for the client device, like for example dma mask. struct
> >> video_device contains only an artificial struct device entry, which has
> >> no relation to any physical device and cannot be used for calling
> >> DMA-mapping functions.
> >> 
> >> Performing dma_map_* operations with such artificial struct device
> >> doesn't make any sense. It also slows down things significantly due to
> >> cache flushing (forced by dma-mapping) which should be avoided if the
> >> buffer is accessed only with CPU (like it is done by vb2-vmalloc style
> >> drivers).
> > 
> > I agree that mapping the buffer to the physical device doesn't make any
> > sense, as there's simple no physical device to map the buffer to. In
> > that case we could simply skip the dma_map/dma_unmap calls.
> 
> See my other mail, dma_buf v1 does not support cpu access.

v1 is in the kernel now, let's start discussing v2 ;-)

> So if you don't have a device around, you can't use it in it's current form.
> 
> > Note, however, that dma-buf v1 explicitly does not support CPU access by
> > the importer.
> > 
> >> IMHO this case perfectly shows the design mistake that have been made.
> >> The current version simply tries to do too much.
> >> 
> >> Each client of dma_buf should 'map' the provided sgtable/scatterlist on
> >> its own. Only the client device driver has all knowledge to make a
> >> proper 'mapping'. Real physical devices usually will use dma_map_sg()
> >> for such operation, while some virtual ones will only create a kernel
> >> mapping for the provided scatterlist (like vivi with vmalloc memory
> >> module).
> > 
> > I tend to agree with that. Depending on the importer device, drivers
> > could then map/unmap the buffer around each DMA access, or keep a
> > mapping and sync the buffer.
> 
> Again we've discussed adding a syncing op to the interface that would allow
> keeping around mappings. The thing is that this also requires an unmap
> callback or something similar, so that the exporter can inform the importer
> that the memory just moved around. And the exporter _needs_ to be able to do
> that, hence also the language in the doc that importers need to braked all
> uses with a map/unmap and can't sit forever on a dma_buf mapping.

Not all exporters need to be able to move buffers around. If I'm not mistaken, 
only DRM exporters need such a feature (which obviously makes it an important 
feature). Does the exporter need to be able to do so at any time ? Buffers 
can't obviously be moved around when they're used by an activa DMA, so I 
expect the exporter to be able to wait. How long can it wait ?

I'm not sure I would like a callback approach. If we add a sync operation, the 
exporter could signal to the importer that it must unmap the buffer by 
returning an appropriate value from the sync operation. Would that be usable 
for DRM ?

Another option would be to keep the mapping around, and check in the importer 
if the buffer has moved. If so, the importer would tear the mapping down and 
create a new one. This is a bit hackish though, as we would tear a mapping 
down for a buffer that doesn't exist anymore. Nothing should be accessing the 
mapping at that time, but it could be a security risk if we consider rogue 
hardware (that's pretty far-fetched though, as rogue hardware can probably 
already kill the system easily in many cases).

> > What about splitting the map_dma_buf operation into an operation that
> > backs the buffer with pages and returns an sg_list, and an operation that
> > performs DMA synchronization with the exporter ? unmap_dma_buf would
> > similarly be split in two operations.
> 
> Again for v1 that doesn't make sense because you can't do cpu access anyway
> and you should not hang onto mappings forever.

For performance reasons I'd like to hang onto the mapping as long as possible. 
Creating and tearing down IOMMU mappings for large buffers is a costly 
operation, and I don't want to spend time there every time I use a buffer if 
the buffer hasn't moved since the previous time it was mapped.

> Furthermore we have cases where an unmapped sg_list for cpu access simple
> makes no sense.
> 
> Yours, Daniel
> 
> [Aside: You can do a dirty trick like prime that grabs the underlying
> page of an already mapped sg list. Obviously highly non-portable.]
Rob Clark Jan. 24, 2012, 12:26 a.m. | #20
On Mon, Jan 23, 2012 at 4:54 AM, Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
> Hi Daniel,
>
> On Monday 23 January 2012 11:35:01 Daniel Vetter wrote:
>> On Mon, Jan 23, 2012 at 10:48, Laurent Pinchart wrote:
>> > On Monday 23 January 2012 10:06:57 Marek Szyprowski wrote:
>> >> On Friday, January 20, 2012 5:29 PM Laurent Pinchart wrote:
>> >> > On Friday 20 January 2012 17:20:22 Tomasz Stanislawski wrote:
>> >> > > >> IMO, One way to do this is adding field 'struct device *dev' to
>> >> > > >> struct vb2_queue. This field should be filled by a driver prior
>> >> > > >> to calling vb2_queue_init.
>> >> > > >
>> >> > > > I haven't looked into the details, but that sounds good to me. Do
>> >> > > > we have use cases where a queue is allocated before knowing which
>> >> > > > physical device it will be used for ?
>> >> > >
>> >> > > I don't think so. In case of S5P drivers, vb2_queue_init is called
>> >> > > while opening /dev/videoX.
>> >> > >
>> >> > > BTW. This struct device may help vb2 to produce logs with more
>> >> > > descriptive client annotation.
>> >> > >
>> >> > > What happens if such a device is NULL. It would happen for vmalloc
>> >> > > allocator used by VIVI?
>> >> >
>> >> > Good question. Should dma-buf accept NULL devices ? Or should vivi
>> >> > pass its V4L2 device to vb2 ?
>> >>
>> >> I assume you suggested using struct video_device->dev entry in such
>> >> case. It will not work. DMA-mapping API requires some parameters to be
>> >> set for the client device, like for example dma mask. struct
>> >> video_device contains only an artificial struct device entry, which has
>> >> no relation to any physical device and cannot be used for calling
>> >> DMA-mapping functions.
>> >>
>> >> Performing dma_map_* operations with such artificial struct device
>> >> doesn't make any sense. It also slows down things significantly due to
>> >> cache flushing (forced by dma-mapping) which should be avoided if the
>> >> buffer is accessed only with CPU (like it is done by vb2-vmalloc style
>> >> drivers).
>> >
>> > I agree that mapping the buffer to the physical device doesn't make any
>> > sense, as there's simple no physical device to map the buffer to. In
>> > that case we could simply skip the dma_map/dma_unmap calls.
>>
>> See my other mail, dma_buf v1 does not support cpu access.
>
> v1 is in the kernel now, let's start discussing v2 ;-)
>
>> So if you don't have a device around, you can't use it in it's current form.
>>
>> > Note, however, that dma-buf v1 explicitly does not support CPU access by
>> > the importer.
>> >
>> >> IMHO this case perfectly shows the design mistake that have been made.
>> >> The current version simply tries to do too much.
>> >>
>> >> Each client of dma_buf should 'map' the provided sgtable/scatterlist on
>> >> its own. Only the client device driver has all knowledge to make a
>> >> proper 'mapping'. Real physical devices usually will use dma_map_sg()
>> >> for such operation, while some virtual ones will only create a kernel
>> >> mapping for the provided scatterlist (like vivi with vmalloc memory
>> >> module).
>> >
>> > I tend to agree with that. Depending on the importer device, drivers
>> > could then map/unmap the buffer around each DMA access, or keep a
>> > mapping and sync the buffer.
>>
>> Again we've discussed adding a syncing op to the interface that would allow
>> keeping around mappings. The thing is that this also requires an unmap
>> callback or something similar, so that the exporter can inform the importer
>> that the memory just moved around. And the exporter _needs_ to be able to do
>> that, hence also the language in the doc that importers need to braked all
>> uses with a map/unmap and can't sit forever on a dma_buf mapping.
>
> Not all exporters need to be able to move buffers around. If I'm not mistaken,
> only DRM exporters need such a feature (which obviously makes it an important
> feature). Does the exporter need to be able to do so at any time ? Buffers
> can't obviously be moved around when they're used by an activa DMA, so I
> expect the exporter to be able to wait. How long can it wait ?

Offhand I think it would usually be a request from userspace (in some
cases page faults (although I think only if there is hw de-tiling?),
or command submission to gpu involving some buffer(s) that are not
currently mapped) that would trigger the exporter to want to be able
to evict something.  So could be blocked or something else
evicted/moved instead.  Although perhaps not ideal for performance.
(app/toolkit writers seem to have a love of temporary pixmaps, so
x11/ddx driver can chew thru a huge number of new buffer allocations
in very short amount of time)

> I'm not sure I would like a callback approach. If we add a sync operation, the
> exporter could signal to the importer that it must unmap the buffer by
> returning an appropriate value from the sync operation. Would that be usable
> for DRM ?

It does seem a bit over-complicated..  and deadlock prone.  Is there a
reason the importer couldn't just unmap when DMA is completed, and the
exporter give some hint on next map() that the buffer hasn't actually
moved?

BR,
-R

> Another option would be to keep the mapping around, and check in the importer
> if the buffer has moved. If so, the importer would tear the mapping down and
> create a new one. This is a bit hackish though, as we would tear a mapping
> down for a buffer that doesn't exist anymore. Nothing should be accessing the
> mapping at that time, but it could be a security risk if we consider rogue
> hardware (that's pretty far-fetched though, as rogue hardware can probably
> already kill the system easily in many cases).
>
>> > What about splitting the map_dma_buf operation into an operation that
>> > backs the buffer with pages and returns an sg_list, and an operation that
>> > performs DMA synchronization with the exporter ? unmap_dma_buf would
>> > similarly be split in two operations.
>>
>> Again for v1 that doesn't make sense because you can't do cpu access anyway
>> and you should not hang onto mappings forever.
>
> For performance reasons I'd like to hang onto the mapping as long as possible.
> Creating and tearing down IOMMU mappings for large buffers is a costly
> operation, and I don't want to spend time there every time I use a buffer if
> the buffer hasn't moved since the previous time it was mapped.
>
>> Furthermore we have cases where an unmapped sg_list for cpu access simple
>> makes no sense.
>>
>> Yours, Daniel
>>
>> [Aside: You can do a dirty trick like prime that grabs the underlying
>> page of an already mapped sg list. Obviously highly non-portable.]
>
> --
> Regards,
>
> Laurent Pinchart
Laurent Pinchart Jan. 24, 2012, 9:34 a.m. | #21
Hi Rob,

On Tuesday 24 January 2012 01:26:15 Clark, Rob wrote:
> On Mon, Jan 23, 2012 at 4:54 AM, Laurent Pinchart wrote:
> > On Monday 23 January 2012 11:35:01 Daniel Vetter wrote:
> >> On Mon, Jan 23, 2012 at 10:48, Laurent Pinchart wrote:
> >> > On Monday 23 January 2012 10:06:57 Marek Szyprowski wrote:
> >> >> On Friday, January 20, 2012 5:29 PM Laurent Pinchart wrote:
> >> >> > On Friday 20 January 2012 17:20:22 Tomasz Stanislawski wrote:
> >> >> > > >> IMO, One way to do this is adding field 'struct device *dev'
> >> >> > > >> to struct vb2_queue. This field should be filled by a driver
> >> >> > > >> prior to calling vb2_queue_init.
> >> >> > > > 
> >> >> > > > I haven't looked into the details, but that sounds good to me.
> >> >> > > > Do we have use cases where a queue is allocated before knowing
> >> >> > > > which physical device it will be used for ?
> >> >> > > 
> >> >> > > I don't think so. In case of S5P drivers, vb2_queue_init is
> >> >> > > called while opening /dev/videoX.
> >> >> > > 
> >> >> > > BTW. This struct device may help vb2 to produce logs with more
> >> >> > > descriptive client annotation.
> >> >> > > 
> >> >> > > What happens if such a device is NULL. It would happen for
> >> >> > > vmalloc allocator used by VIVI?
> >> >> > 
> >> >> > Good question. Should dma-buf accept NULL devices ? Or should vivi
> >> >> > pass its V4L2 device to vb2 ?
> >> >> 
> >> >> I assume you suggested using struct video_device->dev entry in such
> >> >> case. It will not work. DMA-mapping API requires some parameters to
> >> >> be set for the client device, like for example dma mask. struct
> >> >> video_device contains only an artificial struct device entry, which
> >> >> has no relation to any physical device and cannot be used for
> >> >> calling DMA-mapping functions.
> >> >> 
> >> >> Performing dma_map_* operations with such artificial struct device
> >> >> doesn't make any sense. It also slows down things significantly due
> >> >> to cache flushing (forced by dma-mapping) which should be avoided if
> >> >> the buffer is accessed only with CPU (like it is done by vb2-vmalloc
> >> >> style drivers).
> >> > 
> >> > I agree that mapping the buffer to the physical device doesn't make
> >> > any sense, as there's simple no physical device to map the buffer to.
> >> > In that case we could simply skip the dma_map/dma_unmap calls.
> >> 
> >> See my other mail, dma_buf v1 does not support cpu access.
> > 
> > v1 is in the kernel now, let's start discussing v2 ;-)
> > 
> >> So if you don't have a device around, you can't use it in it's current
> >> form.
> >> 
> >> > Note, however, that dma-buf v1 explicitly does not support CPU access
> >> > by the importer.
> >> > 
> >> >> IMHO this case perfectly shows the design mistake that have been
> >> >> made. The current version simply tries to do too much.
> >> >> 
> >> >> Each client of dma_buf should 'map' the provided sgtable/scatterlist
> >> >> on its own. Only the client device driver has all knowledge to make
> >> >> a proper 'mapping'. Real physical devices usually will use
> >> >> dma_map_sg() for such operation, while some virtual ones will only
> >> >> create a kernel mapping for the provided scatterlist (like vivi with
> >> >> vmalloc memory module).
> >> > 
> >> > I tend to agree with that. Depending on the importer device, drivers
> >> > could then map/unmap the buffer around each DMA access, or keep a
> >> > mapping and sync the buffer.
> >> 
> >> Again we've discussed adding a syncing op to the interface that would
> >> allow keeping around mappings. The thing is that this also requires an
> >> unmap callback or something similar, so that the exporter can inform
> >> the importer that the memory just moved around. And the exporter
> >> _needs_ to be able to do that, hence also the language in the doc that
> >> importers need to braked all uses with a map/unmap and can't sit
> >> forever on a dma_buf mapping.
> > 
> > Not all exporters need to be able to move buffers around. If I'm not
> > mistaken, only DRM exporters need such a feature (which obviously makes
> > it an important feature). Does the exporter need to be able to do so at
> > any time ? Buffers can't obviously be moved around when they're used by
> > an activa DMA, so I expect the exporter to be able to wait. How long can
> > it wait ?
> 
> Offhand I think it would usually be a request from userspace (in some
> cases page faults (although I think only if there is hw de-tiling?),
> or command submission to gpu involving some buffer(s) that are not
> currently mapped) that would trigger the exporter to want to be able
> to evict something.  So could be blocked or something else
> evicted/moved instead.  Although perhaps not ideal for performance.
> (app/toolkit writers seem to have a love of temporary pixmaps, so
> x11/ddx driver can chew thru a huge number of new buffer allocations
> in very short amount of time)
> 
> > I'm not sure I would like a callback approach. If we add a sync
> > operation, the exporter could signal to the importer that it must unmap
> > the buffer by returning an appropriate value from the sync operation.
> > Would that be usable for DRM ?
> 
> It does seem a bit over-complicated..  and deadlock prone.  Is there a
> reason the importer couldn't just unmap when DMA is completed, and the
> exporter give some hint on next map() that the buffer hasn't actually
> moved?

If the importer unmaps the buffer completely when DMA is completed, it will 
have to map it again for the next DMA transfer, even if the 
dma_buf_map_attachement() calls returns an hint that the buffer hasn't moved.

What I want to avoid here is having to map/unmap buffers to the device IOMMU 
around each DMA if the buffer doesn't move. To avoid that, the importer needs 
to keep the IOMMU mapping after DMA completes if the buffer won't move until 
the next DMA transfer, but that information isn't available when the first DMA 
completes.
Daniel Vetter Jan. 24, 2012, 10:52 a.m. | #22
On Tue, Jan 24, 2012 at 10:34:38AM +0100, Laurent Pinchart wrote:
> > > I'm not sure I would like a callback approach. If we add a sync
> > > operation, the exporter could signal to the importer that it must unmap
> > > the buffer by returning an appropriate value from the sync operation.
> > > Would that be usable for DRM ?
> > 
> > It does seem a bit over-complicated..  and deadlock prone.  Is there a
> > reason the importer couldn't just unmap when DMA is completed, and the
> > exporter give some hint on next map() that the buffer hasn't actually
> > moved?
> 
> If the importer unmaps the buffer completely when DMA is completed, it will 
> have to map it again for the next DMA transfer, even if the 
> dma_buf_map_attachement() calls returns an hint that the buffer hasn't moved.
> 
> What I want to avoid here is having to map/unmap buffers to the device IOMMU 
> around each DMA if the buffer doesn't move. To avoid that, the importer needs 
> to keep the IOMMU mapping after DMA completes if the buffer won't move until 
> the next DMA transfer, but that information isn't available when the first DMA 
> completes.

The exporter should cache the iommu mapping for all attachments. The
driver would the only need to adjust the dma address - I was kinda hoping
this would be little enough overhead that we could just ignore it, at
least for the moment (especially for hw that can only deal with contigious
buffers).

The issue with adding explicit support for evicting buffers is that it's
hilariously deadlock-prone, especially when both sides are exporters (dev1
waits for dev2 to finish processing buffer A while dev2 waits for dev1 to
finish with buffer B ...). Before we don't have a solid buffer sharing
between gpus (i.e. prime) I don't think it makes sense to add these
extension to dma_buf.
-Daniel
Daniel Vetter Jan. 24, 2012, 1:03 p.m. | #23
On Mon, Jan 23, 2012 at 11:54:20AM +0100, Laurent Pinchart wrote:
> On Monday 23 January 2012 11:35:01 Daniel Vetter wrote:
> > See my other mail, dma_buf v1 does not support cpu access.
> 
> v1 is in the kernel now, let's start discussing v2 ;-)

Ok, I'm in ;-)

I've thought a bit about this, and I think a reasonable next step would be
to enable cpu access from kernelspace. Userspace and mmap support is a
hole different beast altogether and I think we should postpone that until
we've got the kernel part hashed out.

I'm thinking about adding 3 pairs of function to dma_buf (not to
dma_buf_attachment).

dma_buf_get_backing_storage/put_backing_storage
This will be used before/after kernel cpu access to ensure that the
backing storage is in memory. E.g. gem objects can be swapped out, so
they need to be pinned before we can access them. For exporters with
static allocations this would be a no-op.

I think a start, length range would make sense, but the exporter is free
to just swap in the entire object unconditionally. The range is specified
in multiples of PAGE_SIZE - I don't think there's any usescase for a
get/put_backing_storage which deals in smaller units.

The get/put functions are allowed to block and grab all kinds of looks.
get is allowed to fail with e.g. -ENOMEM.

dma_buf_kmap/kunmap
This maps _one_ page into the kernels address space and out of it. This
function also flushes/invalidates any caches required. Importers are not
allowed to map more than 2 pages at the same time in total (to allow
copies). This is because at least for gem objects the backing storage can
be in high-mem.

Importers are allowed to sleep while holding such a kernel mapping.

These functions are not allowed to fail (like kmap/kunmap).

dma_buf_kmap_atomic/kunmap_atomic
For performance we want to also allow atomic mappigns. Only difference is
that importers are not allowed to sleep while holding an atomic mapping.

These functions are again not allowed to fail.

Comments, flames?

If this sounds sensible I'll throw together a quick rfc over the next few
days.

Cheers, Daniel

PS: Imo the next step after this would be adding userspace mmap support.
This has the funky requirement that the exporter needs to be able to shot
down any ptes before it moves around the buffer. I think solving that nice
little problem is a good exercise to prepare us for solving similar "get
of this buffer, now" issues with permanent device address space mappings
...
Daniel Vetter Jan. 25, 2012, 8:09 a.m. | #24
On Tue, Jan 24, 2012 at 02:03:22PM +0100, Daniel Vetter wrote:
> On Mon, Jan 23, 2012 at 11:54:20AM +0100, Laurent Pinchart wrote:
> > On Monday 23 January 2012 11:35:01 Daniel Vetter wrote:
> > > See my other mail, dma_buf v1 does not support cpu access.
> > 
> > v1 is in the kernel now, let's start discussing v2 ;-)
> 
> Ok, I'm in ;-)
> 
> I've thought a bit about this, and I think a reasonable next step would be
> to enable cpu access from kernelspace. Userspace and mmap support is a
> hole different beast altogether and I think we should postpone that until
> we've got the kernel part hashed out.
> 
> I'm thinking about adding 3 pairs of function to dma_buf (not to
> dma_buf_attachment).
> 
> dma_buf_get_backing_storage/put_backing_storage
> This will be used before/after kernel cpu access to ensure that the
> backing storage is in memory. E.g. gem objects can be swapped out, so
> they need to be pinned before we can access them. For exporters with
> static allocations this would be a no-op.
> 
> I think a start, length range would make sense, but the exporter is free
> to just swap in the entire object unconditionally. The range is specified
> in multiples of PAGE_SIZE - I don't think there's any usescase for a
> get/put_backing_storage which deals in smaller units.
> 
> The get/put functions are allowed to block and grab all kinds of looks.
> get is allowed to fail with e.g. -ENOMEM.
> 
> dma_buf_kmap/kunmap
> This maps _one_ page into the kernels address space and out of it. This
> function also flushes/invalidates any caches required. Importers are not
> allowed to map more than 2 pages at the same time in total (to allow
> copies). This is because at least for gem objects the backing storage can
> be in high-mem.
> 
> Importers are allowed to sleep while holding such a kernel mapping.
> 
> These functions are not allowed to fail (like kmap/kunmap).
> 
> dma_buf_kmap_atomic/kunmap_atomic
> For performance we want to also allow atomic mappigns. Only difference is
> that importers are not allowed to sleep while holding an atomic mapping.
> 
> These functions are again not allowed to fail.

I think we need to extend the kmap/kunmap functions with a few arguments
to properly flush caches. I think a simple flag with read, write or
read | write should be good enough.
-Daniel
Sakari Ailus Jan. 25, 2012, 11:28 p.m. | #25
Hi Daniel and Laurent,

On Mon, Jan 23, 2012 at 11:35:01AM +0100, Daniel Vetter wrote:
> On Mon, Jan 23, 2012 at 10:48, Laurent Pinchart
> <laurent.pinchart@ideasonboard.com> wrote:
> > Hi Marek,
> >
> > On Monday 23 January 2012 10:06:57 Marek Szyprowski wrote:
> >> On Friday, January 20, 2012 5:29 PM Laurent Pinchart wrote:
> >> > On Friday 20 January 2012 17:20:22 Tomasz Stanislawski wrote:
> >> > > >> IMO, One way to do this is adding field 'struct device *dev' to
> >> > > >> struct vb2_queue. This field should be filled by a driver prior to
> >> > > >> calling vb2_queue_init.
> >> > > >
> >> > > > I haven't looked into the details, but that sounds good to me. Do we
> >> > > > have use cases where a queue is allocated before knowing which
> >> > > > physical device it will be used for ?
> >> > >
> >> > > I don't think so. In case of S5P drivers, vb2_queue_init is called
> >> > > while opening /dev/videoX.
> >> > >
> >> > > BTW. This struct device may help vb2 to produce logs with more
> >> > > descriptive client annotation.
> >> > >
> >> > > What happens if such a device is NULL. It would happen for vmalloc
> >> > > allocator used by VIVI?
> >> >
> >> > Good question. Should dma-buf accept NULL devices ? Or should vivi pass
> >> > its V4L2 device to vb2 ?
> >>
> >> I assume you suggested using struct video_device->dev entry in such case.
> >> It will not work. DMA-mapping API requires some parameters to be set for
> >> the client device, like for example dma mask. struct video_device contains
> >> only an artificial struct device entry, which has no relation to any
> >> physical device and cannot be used for calling DMA-mapping functions.
> >>
> >> Performing dma_map_* operations with such artificial struct device doesn't
> >> make any sense. It also slows down things significantly due to cache
> >> flushing (forced by dma-mapping) which should be avoided if the buffer is
> >> accessed only with CPU (like it is done by vb2-vmalloc style drivers).
> >
> > I agree that mapping the buffer to the physical device doesn't make any sense,
> > as there's simple no physical device to map the buffer to. In that case we
> > could simply skip the dma_map/dma_unmap calls.
> 
> See my other mail, dma_buf v1 does not support cpu access. So if you
> don't have a device around, you can't use it in it's current form.
> 
> > Note, however, that dma-buf v1 explicitly does not support CPU access by the
> > importer.
> >
> >> IMHO this case perfectly shows the design mistake that have been made. The
> >> current version simply tries to do too much.
> >>
> >> Each client of dma_buf should 'map' the provided sgtable/scatterlist on its
> >> own. Only the client device driver has all knowledge to make a proper
> >> 'mapping'. Real physical devices usually will use dma_map_sg() for such
> >> operation, while some virtual ones will only create a kernel mapping for
> >> the provided scatterlist (like vivi with vmalloc memory module).
> >
> > I tend to agree with that. Depending on the importer device, drivers could
> > then map/unmap the buffer around each DMA access, or keep a mapping and sync
> > the buffer.
> 
> Again we've discussed adding a syncing op to the interface that would
> allow keeping around mappings. The thing is that this also requires an
> unmap callback or something similar, so that the exporter can inform
> the importer that the memory just moved around. And the exporter
> _needs_ to be able to do that, hence also the language in the doc that
> importers need to braked all uses with a map/unmap and can't sit
> forever on a dma_buf mapping.

I'd also like to stress that being able to prepare video buffers and keep
them around is a very important feature. The buffers should not be unmapped
while the user space doesn't want that. Mapping buffer in V4L2 for every
frame is prohibitively expensive and should only happen the first time the
buffer is introduced to the buffer queue. Similarly unmapping should take
place either explicitly (REQBUFS) IOCTL or implicitly (closing associated
file handle).

Could this be a single bit that tells whether the buffer can be moved around
or not? Where exactly, I don't know.

> > What about splitting the map_dma_buf operation into an operation that backs
> > the buffer with pages and returns an sg_list, and an operation that performs
> > DMA synchronization with the exporter ? unmap_dma_buf would similarly be split
> > in two operations.
> 
> Again for v1 that doesn't make sense because you can't do cpu access
> anyway and you should not hang onto mappings forever. Furthermore we
> have cases where an unmapped sg_list for cpu access simple makes no
> sense.

Why you "should not hang onto mappings forever"? This is currently done by
virtually all V4L2 drivers where such mappings are relevant. Not doing so
would really kill the performance i.e. it's infeasible. Same goes to (m)any
other multimedia devices dealing with buffers containing streaming video
data.

Kind regards,
Daniel Vetter Jan. 26, 2012, 11:27 a.m. | #26
On Thu, Jan 26, 2012 at 01:28:16AM +0200, Sakari Ailus wrote:
> Why you "should not hang onto mappings forever"? This is currently done by
> virtually all V4L2 drivers where such mappings are relevant. Not doing so
> would really kill the performance i.e. it's infeasible. Same goes to (m)any
> other multimedia devices dealing with buffers containing streaming video
> data.

Because I want dynamic memory managemt simple because everything else does
not make sense. I know that in v4l things don't work that way, but in drm
they _do_. And if you share tons of buffers with drm drivers and don't
follow the rules, the OOM killer will come around and shot at your apps.
Because at least in i915 we do slurp in as much memory as we can until the
oom killer starts growling, at which point we kick out stuff.

I know that current dma_buf isn't there and for many use-cases discussed
here we can get away without that complexity. So you actually can just map
your dma_buf and never ever let go of that mapping again in many cases.

The only reason I'm such a stuborn bastard about all this is that drm/*
will do dynamic bo management even with dma_buf sooner or later and you
should better know that and why and the implications if you choose to
ignore it.

And obviously, the generic dma_buf interface needs to be able to support
it.

Cheers, Daniel
Sakari Ailus Jan. 29, 2012, 11:03 a.m. | #27
Hi Daniel,

Daniel Vetter wrote:
> On Thu, Jan 26, 2012 at 01:28:16AM +0200, Sakari Ailus wrote:
>> Why you "should not hang onto mappings forever"? This is currently done by
>> virtually all V4L2 drivers where such mappings are relevant. Not doing so
>> would really kill the performance i.e. it's infeasible. Same goes to (m)any
>> other multimedia devices dealing with buffers containing streaming video
>> data.
> 
> Because I want dynamic memory managemt simple because everything else does
> not make sense. I know that in v4l things don't work that way, but in drm
> they _do_. And if you share tons of buffers with drm drivers and don't
> follow the rules, the OOM killer will come around and shot at your apps.
> Because at least in i915 we do slurp in as much memory as we can until the
> oom killer starts growling, at which point we kick out stuff.
> 
> I know that current dma_buf isn't there and for many use-cases discussed
> here we can get away without that complexity. So you actually can just map
> your dma_buf and never ever let go of that mapping again in many cases.
> 
> The only reason I'm such a stuborn bastard about all this is that drm/*
> will do dynamic bo management even with dma_buf sooner or later and you
> should better know that and why and the implications if you choose to
> ignore it.
> 
> And obviously, the generic dma_buf interface needs to be able to support
> it.

I do not think we should completely ignore this issue, but I think we
might want to at least postpone the implementation for non-DRM
subsystems to an unknown future date. The reason is simply that it's
currently unfeasible for various reasons.

Sharing large buffers with GPUs (where you might want to manage them
independently of the user space) is uncommon; typically you're sharing
buffers for viewfinder that tend to be around few megabytes in size and
there may be typically up to five of them. Also, we're still far from
getting things working in the first place. Let's not complicate them
more than we have to.

The very reason why we're pre-allocating these large buffers in
applications is that you can readily use them when you need them.
Consider camera, for example: a common use case is to have a set of 24
MB buffers (for 12 Mp images) prepared while the viewfinder is running.
These buffers must be immediately usable when the user presses the
shutter button.

We don't want to continuously map and unmap buffers in viewfinder
either: that adds a significan CPU load for no technical reason
whatsoever. Typically viewfinder also involves running software
algorithms that consume much of the available CPU time, so adding an
unnecessary CPU hog to the picture doesn't sound that enticing.

If the performance of memory management can be improved to such an
extent it really takes much less than a millisecond or so to perform all
the pinning-to-memory, IOMMU mapping and so on systems for 24 MB buffers
on regular embedded systems I think I wouldn't have much against doing
so. Currently I think we're talking about numbers that are at least
100-fold.

If you want to do this to buffers used only in DRM I'm fine with that.

Kind regards,
Daniel Vetter Jan. 29, 2012, 1:03 p.m. | #28
On Sun, Jan 29, 2012 at 01:03:39PM +0200, Sakari Ailus wrote:
> Daniel Vetter wrote:
> > On Thu, Jan 26, 2012 at 01:28:16AM +0200, Sakari Ailus wrote:
> >> Why you "should not hang onto mappings forever"? This is currently done by
> >> virtually all V4L2 drivers where such mappings are relevant. Not doing so
> >> would really kill the performance i.e. it's infeasible. Same goes to (m)any
> >> other multimedia devices dealing with buffers containing streaming video
> >> data.
> > 
> > Because I want dynamic memory managemt simple because everything else does
> > not make sense. I know that in v4l things don't work that way, but in drm
> > they _do_. And if you share tons of buffers with drm drivers and don't
> > follow the rules, the OOM killer will come around and shot at your apps.
> > Because at least in i915 we do slurp in as much memory as we can until the
> > oom killer starts growling, at which point we kick out stuff.
> > 
> > I know that current dma_buf isn't there and for many use-cases discussed
> > here we can get away without that complexity. So you actually can just map
> > your dma_buf and never ever let go of that mapping again in many cases.
> > 
> > The only reason I'm such a stuborn bastard about all this is that drm/*
> > will do dynamic bo management even with dma_buf sooner or later and you
> > should better know that and why and the implications if you choose to
> > ignore it.
> > 
> > And obviously, the generic dma_buf interface needs to be able to support
> > it.
> 
> I do not think we should completely ignore this issue, but I think we
> might want to at least postpone the implementation for non-DRM
> subsystems to an unknown future date. The reason is simply that it's
> currently unfeasible for various reasons.
> 
> Sharing large buffers with GPUs (where you might want to manage them
> independently of the user space) is uncommon; typically you're sharing
> buffers for viewfinder that tend to be around few megabytes in size and
> there may be typically up to five of them. Also, we're still far from
> getting things working in the first place. Let's not complicate them
> more than we have to.
> 
> The very reason why we're pre-allocating these large buffers in
> applications is that you can readily use them when you need them.
> Consider camera, for example: a common use case is to have a set of 24
> MB buffers (for 12 Mp images) prepared while the viewfinder is running.
> These buffers must be immediately usable when the user presses the
> shutter button.
> 
> We don't want to continuously map and unmap buffers in viewfinder
> either: that adds a significan CPU load for no technical reason
> whatsoever. Typically viewfinder also involves running software
> algorithms that consume much of the available CPU time, so adding an
> unnecessary CPU hog to the picture doesn't sound that enticing.
> 
> If the performance of memory management can be improved to such an
> extent it really takes much less than a millisecond or so to perform all
> the pinning-to-memory, IOMMU mapping and so on systems for 24 MB buffers
> on regular embedded systems I think I wouldn't have much against doing
> so. Currently I think we're talking about numbers that are at least
> 100-fold.
> 
> If you want to do this to buffers used only in DRM I'm fine with that.

A few things:
- I do understand that there are use cases where allocate, pin & forget
  works.
- I'm perfectly fine if you do this in your special embedded product. Or
  the entire v4l subsystem, I don't care much about that one, either.

But:
- I'm fully convinced that all these special purpose single use-case
  scenarios will show up sooner or later on a more general purpose
  platform.
- And as soon as your on a general purpose platform, you _want_ dynamic
  memory management.

I mean the entire reason people are pushing CMA is that preallocating gobs
of memory statically really isn't that great an idea ...

So to summarize I understand your constraints - gpu drivers have worked
like v4l a few years ago. The thing I'm trying to achieve with this
constant yelling is just to raise awereness for these issues so that
people aren't suprised when drm starts pulling tricks on dma_bufs.

Cheers, Daniel
Laurent Pinchart Jan. 30, 2012, 2:33 p.m. | #29
Hi Daniel,

On Sunday 29 January 2012 14:03:40 Daniel Vetter wrote:
> On Sun, Jan 29, 2012 at 01:03:39PM +0200, Sakari Ailus wrote:
> > Daniel Vetter wrote:
> > > On Thu, Jan 26, 2012 at 01:28:16AM +0200, Sakari Ailus wrote:
> > >> Why you "should not hang onto mappings forever"? This is currently
> > >> done by virtually all V4L2 drivers where such mappings are relevant.
> > >> Not doing so would really kill the performance i.e. it's infeasible.
> > >> Same goes to (m)any other multimedia devices dealing with buffers
> > >> containing streaming video data.
> > > 
> > > Because I want dynamic memory managemt simple because everything else
> > > does not make sense. I know that in v4l things don't work that way,
> > > but in drm they _do_. And if you share tons of buffers with drm
> > > drivers and don't follow the rules, the OOM killer will come around
> > > and shot at your apps. Because at least in i915 we do slurp in as much
> > > memory as we can until the oom killer starts growling, at which point
> > > we kick out stuff.
> > > 
> > > I know that current dma_buf isn't there and for many use-cases
> > > discussed here we can get away without that complexity. So you
> > > actually can just map your dma_buf and never ever let go of that
> > > mapping again in many cases.
> > > 
> > > The only reason I'm such a stuborn bastard about all this is that drm/*
> > > will do dynamic bo management even with dma_buf sooner or later and you
> > > should better know that and why and the implications if you choose to
> > > ignore it.
> > > 
> > > And obviously, the generic dma_buf interface needs to be able to
> > > support it.
> > 
> > I do not think we should completely ignore this issue, but I think we
> > might want to at least postpone the implementation for non-DRM
> > subsystems to an unknown future date. The reason is simply that it's
> > currently unfeasible for various reasons.
> > 
> > Sharing large buffers with GPUs (where you might want to manage them
> > independently of the user space) is uncommon; typically you're sharing
> > buffers for viewfinder that tend to be around few megabytes in size and
> > there may be typically up to five of them. Also, we're still far from
> > getting things working in the first place. Let's not complicate them
> > more than we have to.
> > 
> > The very reason why we're pre-allocating these large buffers in
> > applications is that you can readily use them when you need them.
> > Consider camera, for example: a common use case is to have a set of 24
> > MB buffers (for 12 Mp images) prepared while the viewfinder is running.
> > These buffers must be immediately usable when the user presses the
> > shutter button.
> > 
> > We don't want to continuously map and unmap buffers in viewfinder
> > either: that adds a significan CPU load for no technical reason
> > whatsoever. Typically viewfinder also involves running software
> > algorithms that consume much of the available CPU time, so adding an
> > unnecessary CPU hog to the picture doesn't sound that enticing.
> > 
> > If the performance of memory management can be improved to such an
> > extent it really takes much less than a millisecond or so to perform all
> > the pinning-to-memory, IOMMU mapping and so on systems for 24 MB buffers
> > on regular embedded systems I think I wouldn't have much against doing
> > so. Currently I think we're talking about numbers that are at least
> > 100-fold.
> > 
> > If you want to do this to buffers used only in DRM I'm fine with that.
> 
> A few things:
> - I do understand that there are use cases where allocate, pin & forget
>   works.

I don't like the "forget" part :-)

As a V4L2 developer, I'm not advocating for keeping mappings around forever, 
but to keep them for as long as possible. Without such a feature, dma-buf 
support in V4L2 will just be a proof-of-concept, with little use in real 
products.

> - I'm perfectly fine if you do this in your special embedded product. Or
>   the entire v4l subsystem, I don't care much about that one, either.

I thought the point of these discussions was to start caring about each-
other's subsystems :-)

> But:
> - I'm fully convinced that all these special purpose single use-case
>   scenarios will show up sooner or later on a more general purpose
>   platform.
> - And as soon as your on a general purpose platform, you _want_ dynamic
>   memory management.

I'm pretty sure we want dynamic memory management in special-purpose platforms 
as well.

> I mean the entire reason people are pushing CMA is that preallocating gobs
> of memory statically really isn't that great an idea ...
> 
> So to summarize I understand your constraints - gpu drivers have worked like
> v4l a few years ago. The thing I'm trying to achieve with this constant
> yelling is just to raise awereness for these issues so that people aren't
> suprised when drm starts pulling tricks on dma_bufs.

Once again, our constraints is not that we need to keep mappings around 
forever, but that we want to keep them for as long as possible. I don't think 
DRM is an issue here, I'm perfectly fine with releasing the mapping when DRM 
(or any other subsystem) needs to move the buffer. All we need here is a clear 
API in dma-buf to handle this use case. Do you think this would be overly 
complex ?
Laurent Pinchart Jan. 30, 2012, 2:44 p.m. | #30
Hi Daniel,

On Tuesday 24 January 2012 14:03:22 Daniel Vetter wrote:
> On Mon, Jan 23, 2012 at 11:54:20AM +0100, Laurent Pinchart wrote:
> > On Monday 23 January 2012 11:35:01 Daniel Vetter wrote:
> > > See my other mail, dma_buf v1 does not support cpu access.
> > 
> > v1 is in the kernel now, let's start discussing v2 ;-)
> 
> Ok, I'm in ;-)
> 
> I've thought a bit about this, and I think a reasonable next step would be
> to enable cpu access from kernelspace. Userspace and mmap support is a hole
> different beast altogether and I think we should postpone that until we've
> got the kernel part hashed out.

Userspace access through mmap can already be handled by the subsystem that 
exports the buffer, so I'm fine with postponing implementing this inside dma-
buf.

> I'm thinking about adding 3 pairs of function to dma_buf (not to
> dma_buf_attachment).
> 
> dma_buf_get_backing_storage/put_backing_storage
> This will be used before/after kernel cpu access to ensure that the
> backing storage is in memory. E.g. gem objects can be swapped out, so
> they need to be pinned before we can access them. For exporters with
> static allocations this would be a no-op.
> 
> I think a start, length range would make sense, but the exporter is free
> to just swap in the entire object unconditionally. The range is specified
> in multiples of PAGE_SIZE - I don't think there's any usescase for a
> get/put_backing_storage which deals in smaller units.
> 
> The get/put functions are allowed to block and grab all kinds of looks.
> get is allowed to fail with e.g. -ENOMEM.
> 
> dma_buf_kmap/kunmap
> This maps _one_ page into the kernels address space and out of it. This
> function also flushes/invalidates any caches required. Importers are not
> allowed to map more than 2 pages at the same time in total (to allow
> copies). This is because at least for gem objects the backing storage can
> be in high-mem.
> 
> Importers are allowed to sleep while holding such a kernel mapping.
> 
> These functions are not allowed to fail (like kmap/kunmap).
> 
> dma_buf_kmap_atomic/kunmap_atomic
> For performance we want to also allow atomic mappigns. Only difference is
> that importers are not allowed to sleep while holding an atomic mapping.
> 
> These functions are again not allowed to fail.
> 
> Comments, flames?

Before commenting (I don't plan on flaming :-)), I'd like to take a small step 
back. Could you describe the use case(s) you think of for kernel mappings ?

> If this sounds sensible I'll throw together a quick rfc over the next few
> days.
> 
> Cheers, Daniel
> 
> PS: Imo the next step after this would be adding userspace mmap support.
> This has the funky requirement that the exporter needs to be able to shot
> down any ptes before it moves around the buffer. I think solving that nice
> little problem is a good exercise to prepare us for solving similar "get
> of this buffer, now" issues with permanent device address space mappings
> ...
Daniel Vetter Jan. 30, 2012, 4:29 p.m. | #31
On Mon, Jan 30, 2012 at 03:44:20PM +0100, Laurent Pinchart wrote:
> Hi Daniel,
> 
> On Tuesday 24 January 2012 14:03:22 Daniel Vetter wrote:
> > On Mon, Jan 23, 2012 at 11:54:20AM +0100, Laurent Pinchart wrote:
> > > On Monday 23 January 2012 11:35:01 Daniel Vetter wrote:
> > > > See my other mail, dma_buf v1 does not support cpu access.
> > > 
> > > v1 is in the kernel now, let's start discussing v2 ;-)
> > 
> > Ok, I'm in ;-)
> > 
> > I've thought a bit about this, and I think a reasonable next step would be
> > to enable cpu access from kernelspace. Userspace and mmap support is a hole
> > different beast altogether and I think we should postpone that until we've
> > got the kernel part hashed out.
> 
> Userspace access through mmap can already be handled by the subsystem that 
> exports the buffer, so I'm fine with postponing implementing this inside dma-
> buf.

Actually I think the interface for mmap won't be that horribly. If we add
a bit of magic clue so that the exporter can intercept pagefaults we
should be able to fake coherent mmaps in all cases. And hence avoid
exposing importers and userspace to a lot of ugly things.

> > I'm thinking about adding 3 pairs of function to dma_buf (not to
> > dma_buf_attachment).
> > 
> > dma_buf_get_backing_storage/put_backing_storage
> > This will be used before/after kernel cpu access to ensure that the
> > backing storage is in memory. E.g. gem objects can be swapped out, so
> > they need to be pinned before we can access them. For exporters with
> > static allocations this would be a no-op.
> > 
> > I think a start, length range would make sense, but the exporter is free
> > to just swap in the entire object unconditionally. The range is specified
> > in multiples of PAGE_SIZE - I don't think there's any usescase for a
> > get/put_backing_storage which deals in smaller units.
> > 
> > The get/put functions are allowed to block and grab all kinds of looks.
> > get is allowed to fail with e.g. -ENOMEM.
> > 
> > dma_buf_kmap/kunmap
> > This maps _one_ page into the kernels address space and out of it. This
> > function also flushes/invalidates any caches required. Importers are not
> > allowed to map more than 2 pages at the same time in total (to allow
> > copies). This is because at least for gem objects the backing storage can
> > be in high-mem.
> > 
> > Importers are allowed to sleep while holding such a kernel mapping.
> > 
> > These functions are not allowed to fail (like kmap/kunmap).
> > 
> > dma_buf_kmap_atomic/kunmap_atomic
> > For performance we want to also allow atomic mappigns. Only difference is
> > that importers are not allowed to sleep while holding an atomic mapping.
> > 
> > These functions are again not allowed to fail.
> > 
> > Comments, flames?
> 
> Before commenting (I don't plan on flaming :-)), I'd like to take a small step 
> back. Could you describe the use case(s) you think of for kernel mappings ?

One is simply for devices where the kernel has to shove around the data -
I've heard some complaints in the v4l dma_buf threads that this seems to
be dearly in need. E.g. when the v4l devices is attached to usb or behind
another slow(er) bus where direct dma is not possible.

The other is that currently importers are 2nd class citizens and can't do
any cpu access. Making kernel cpu access would be the first step
(afterswards getting mmap working for userspace cpu access). Now for some
simple use-case we can just say "use the access paths provided by the
exporter". But for gpu drivers this gets really ugly because the
upload/download paths aren't standardized within drm/* and they're all
highly optimized. So to make buffer sharing possible among gpu's we need
this - the current gem prime code just grabs the underlying struct page
with sg_page and horribly fails if that doesn't work ;-)

On my proposal itself I think I'll change get/put_backing_storage to
begin/end_cpu_access - these functions must also do any required flushing
to ensure coherency (besides actually grabbing the backing storage). So I
think that's a better name.
-Daniel
Sakari Ailus Jan. 30, 2012, 10:01 p.m. | #32
Hi Daniel,

On Sun, Jan 29, 2012 at 02:03:40PM +0100, Daniel Vetter wrote:
> On Sun, Jan 29, 2012 at 01:03:39PM +0200, Sakari Ailus wrote:
> > Daniel Vetter wrote:
> > > On Thu, Jan 26, 2012 at 01:28:16AM +0200, Sakari Ailus wrote:
> > >> Why you "should not hang onto mappings forever"? This is currently done by
> > >> virtually all V4L2 drivers where such mappings are relevant. Not doing so
> > >> would really kill the performance i.e. it's infeasible. Same goes to (m)any
> > >> other multimedia devices dealing with buffers containing streaming video
> > >> data.
> > > 
> > > Because I want dynamic memory managemt simple because everything else does
> > > not make sense. I know that in v4l things don't work that way, but in drm
> > > they _do_. And if you share tons of buffers with drm drivers and don't
> > > follow the rules, the OOM killer will come around and shot at your apps.
> > > Because at least in i915 we do slurp in as much memory as we can until the
> > > oom killer starts growling, at which point we kick out stuff.
> > > 
> > > I know that current dma_buf isn't there and for many use-cases discussed
> > > here we can get away without that complexity. So you actually can just map
> > > your dma_buf and never ever let go of that mapping again in many cases.
> > > 
> > > The only reason I'm such a stuborn bastard about all this is that drm/*
> > > will do dynamic bo management even with dma_buf sooner or later and you
> > > should better know that and why and the implications if you choose to
> > > ignore it.
> > > 
> > > And obviously, the generic dma_buf interface needs to be able to support
> > > it.
> > 
> > I do not think we should completely ignore this issue, but I think we
> > might want to at least postpone the implementation for non-DRM
> > subsystems to an unknown future date. The reason is simply that it's
> > currently unfeasible for various reasons.
> > 
> > Sharing large buffers with GPUs (where you might want to manage them
> > independently of the user space) is uncommon; typically you're sharing
> > buffers for viewfinder that tend to be around few megabytes in size and
> > there may be typically up to five of them. Also, we're still far from
> > getting things working in the first place. Let's not complicate them
> > more than we have to.
> > 
> > The very reason why we're pre-allocating these large buffers in
> > applications is that you can readily use them when you need them.
> > Consider camera, for example: a common use case is to have a set of 24
> > MB buffers (for 12 Mp images) prepared while the viewfinder is running.
> > These buffers must be immediately usable when the user presses the
> > shutter button.
> > 
> > We don't want to continuously map and unmap buffers in viewfinder
> > either: that adds a significan CPU load for no technical reason
> > whatsoever. Typically viewfinder also involves running software
> > algorithms that consume much of the available CPU time, so adding an
> > unnecessary CPU hog to the picture doesn't sound that enticing.
> > 
> > If the performance of memory management can be improved to such an
> > extent it really takes much less than a millisecond or so to perform all
> > the pinning-to-memory, IOMMU mapping and so on systems for 24 MB buffers
> > on regular embedded systems I think I wouldn't have much against doing
> > so. Currently I think we're talking about numbers that are at least
> > 100-fold.
> > 
> > If you want to do this to buffers used only in DRM I'm fine with that.
> 
> A few things:
> - I do understand that there are use cases where allocate, pin & forget
>   works.

It's not so much about forgetting it; if the buffers could be considered to
be paged out at any point I think almost all users would prefer to perform
mlock(2) to them. Buffers allocated since they're needed for streaming, not
so much for single use pruposes.

There are also very few of those buffers compared to DRM.

> - I'm perfectly fine if you do this in your special embedded product. Or
>   the entire v4l subsystem, I don't care much about that one, either.

It's not so special; this is done on desktop PCs as well and I don't think
this is going to change any time soon.

> But:
> - I'm fully convinced that all these special purpose single use-case
>   scenarios will show up sooner or later on a more general purpose
>   platform.

They already do show on PCs. What PCs have been able to afford and still can
is to add extra data copies at any part of the stream processing to overcome
any minor technical obstacle for which other more memory bus friendly
solutions can be used for.

> - And as soon as your on a general purpose platform, you _want_ dynamic
>   memory management.

V4L2 has been like this for some 10 years, embedded or not. When the
performance of memory management improves enough we can reconsider this.

> I mean the entire reason people are pushing CMA is that preallocating gobs
> of memory statically really isn't that great an idea ...

There's a huge difference allocating things statically and an application
deciding what it needs to allocate and when; you don't want to keep the
buffers around if you're not using them. In that case you want to free them.

> So to summarize I understand your constraints - gpu drivers have worked
> like v4l a few years ago. The thing I'm trying to achieve with this
> constant yelling is just to raise awereness for these issues so that
> people aren't suprised when drm starts pulling tricks on dma_bufs.

I think we should be able to mark dma_bufs non-relocatable so also DRM can
work with these buffers. Or alternatively, as Laurent proposed, V4L2 be
prepared for moving the buffers around. Are there other reasons to do so
than paging them out of system memory to make room for something else?

Kind regards,
Rob Clark Jan. 31, 2012, 3:38 p.m. | #33
On Mon, Jan 30, 2012 at 4:01 PM, Sakari Ailus <sakari.ailus@iki.fi> wrote:
>
>> So to summarize I understand your constraints - gpu drivers have worked
>> like v4l a few years ago. The thing I'm trying to achieve with this
>> constant yelling is just to raise awereness for these issues so that
>> people aren't suprised when drm starts pulling tricks on dma_bufs.
>
> I think we should be able to mark dma_bufs non-relocatable so also DRM can
> work with these buffers. Or alternatively, as Laurent proposed, V4L2 be
> prepared for moving the buffers around. Are there other reasons to do so
> than paging them out of system memory to make room for something else?

fwiw, from GPU perspective, the DRM device wouldn't be actively
relocating buffers just for the fun of it.  I think it is more that we
want to give the GPU driver the flexibility to relocate when it really
needs to.  For example, maybe user has camera app running, then puts
it in the background and opens firefox which tries to allocate a big
set of pixmaps putting pressure on GPU memory..

I guess the root issue is who is doing the IOMMU programming for the
camera driver.  I guess if this is something built in to the camera
driver then when it calls dma_buf_map() it probably wants some hint
that the backing pages haven't moved so in the common case (ie. buffer
hasn't moved) it doesn't have to do anything expensive.

On omap4 v4l2+drm example I have running, it is actually the DRM
driver doing the "IOMMU" programming.. so v4l2 camera really doesn't
need to care about it.  (And the IOMMU programming here is pretty
fast.)  But I suppose this maybe doesn't represent all cases.  I
suppose if a camera didn't really sit behind an IOMMU but uses
something more like a DMA descriptor list would want to know if it
needed to regenerate it's descriptor list.  Or likewise if camera has
an IOMMU that isn't really using the IOMMU framework (although maybe
that is easier to solve).  But I think a hint returned from
dma_buf_map() would do the job?

BR,
-R
Laurent Pinchart Feb. 2, 2012, 10:19 a.m. | #34
Hi Rob,

On Tuesday 31 January 2012 16:38:35 Clark, Rob wrote:
> On Mon, Jan 30, 2012 at 4:01 PM, Sakari Ailus <sakari.ailus@iki.fi> wrote:
> >> So to summarize I understand your constraints - gpu drivers have worked
> >> like v4l a few years ago. The thing I'm trying to achieve with this
> >> constant yelling is just to raise awereness for these issues so that
> >> people aren't suprised when drm starts pulling tricks on dma_bufs.
> > 
> > I think we should be able to mark dma_bufs non-relocatable so also DRM
> > can work with these buffers. Or alternatively, as Laurent proposed, V4L2
> > be prepared for moving the buffers around. Are there other reasons to do
> > so than paging them out of system memory to make room for something
> > else?
> 
> fwiw, from GPU perspective, the DRM device wouldn't be actively
> relocating buffers just for the fun of it.  I think it is more that we
> want to give the GPU driver the flexibility to relocate when it really
> needs to.  For example, maybe user has camera app running, then puts
> it in the background and opens firefox which tries to allocate a big
> set of pixmaps putting pressure on GPU memory..

On an embedded system putting the camera application in the background will 
usually stop streaming, so buffers will be unmapped. On other systems, or even 
on some embedded systems, that will not be the case though.

I'm perfectly fine with relocating buffers when needed. What I want is to 
avoid unmapping and remapping them for every frame if they haven't moved. I'm 
sure we can come up with an API to handle that.

> I guess the root issue is who is doing the IOMMU programming for the camera
> driver. I guess if this is something built in to the camera driver then when
> it calls dma_buf_map() it probably wants some hint that the backing pages
> haven't moved so in the common case (ie. buffer hasn't moved) it doesn't
> have to do anything expensive.

It will likely depend on the camera hardware. For the OMAP3 ISP, the driver 
calls the IOMMU API explictly, but if I understand it correctly there's a plan 
to move IOMMU support to the DMA API.

> On omap4 v4l2+drm example I have running, it is actually the DRM driver
> doing the "IOMMU" programming.. so v4l2 camera really doesn't need to care
> about it.  (And the IOMMU programming here is pretty fast.)  But I suppose
> this maybe doesn't represent all cases. I suppose if a camera didn't really
> sit behind an IOMMU but uses something more like a DMA descriptor list would
> want to know if it needed to regenerate it's descriptor list. Or likewise if
> camera has an IOMMU that isn't really using the IOMMU framework (although
> maybe that is easier to solve).  But I think a hint returned from
> dma_buf_map() would do the job?

I see at least three possible solutions to this problem.

1. At dma_buf_unmap() time, the exporter will tell the importer that the 
buffer will move, and that it should be unmapped from whatever the importer 
mapped it to. That's probably the easiest solution to implement on the 
importer's side, but I expect it to be difficult for the exporter to know at 
dma_buf_unmap() time if the buffer will need to be moved or not.

2. Adding a callback to request the importer to unmap the buffer. This might 
be racy, and locking might be difficult to handle.

3. At dma_buf_unmap() time, keep importer's mappings around. The exporter is 
then free to move the buffer if needed, in which case the mappings will be 
invalid. This shouldn't be a problem in theory, as the buffer isn't being used 
by the importer at that time, but can cause stability issues when dealing with 
rogue hardware as this would punch holes in the IOMMU fence. At dma_buf_map() 
time the exporter would tell the importer whether the buffer moved or not. If 
it moved, the importer will tear down the mappings it kept, and create new 
ones.

Variations around those 3 possible solutions are possible.
Rob Clark Feb. 2, 2012, 2:01 p.m. | #35
On Thu, Feb 2, 2012 at 4:19 AM, Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
> Hi Rob,
>
> On Tuesday 31 January 2012 16:38:35 Clark, Rob wrote:
>> On Mon, Jan 30, 2012 at 4:01 PM, Sakari Ailus <sakari.ailus@iki.fi> wrote:
>> >> So to summarize I understand your constraints - gpu drivers have worked
>> >> like v4l a few years ago. The thing I'm trying to achieve with this
>> >> constant yelling is just to raise awereness for these issues so that
>> >> people aren't suprised when drm starts pulling tricks on dma_bufs.
>> >
>> > I think we should be able to mark dma_bufs non-relocatable so also DRM
>> > can work with these buffers. Or alternatively, as Laurent proposed, V4L2
>> > be prepared for moving the buffers around. Are there other reasons to do
>> > so than paging them out of system memory to make room for something
>> > else?
>>
>> fwiw, from GPU perspective, the DRM device wouldn't be actively
>> relocating buffers just for the fun of it.  I think it is more that we
>> want to give the GPU driver the flexibility to relocate when it really
>> needs to.  For example, maybe user has camera app running, then puts
>> it in the background and opens firefox which tries to allocate a big
>> set of pixmaps putting pressure on GPU memory..
>
> On an embedded system putting the camera application in the background will
> usually stop streaming, so buffers will be unmapped. On other systems, or even
> on some embedded systems, that will not be the case though.
>
> I'm perfectly fine with relocating buffers when needed. What I want is to
> avoid unmapping and remapping them for every frame if they haven't moved. I'm
> sure we can come up with an API to handle that.
>
>> I guess the root issue is who is doing the IOMMU programming for the camera
>> driver. I guess if this is something built in to the camera driver then when
>> it calls dma_buf_map() it probably wants some hint that the backing pages
>> haven't moved so in the common case (ie. buffer hasn't moved) it doesn't
>> have to do anything expensive.
>
> It will likely depend on the camera hardware. For the OMAP3 ISP, the driver
> calls the IOMMU API explictly, but if I understand it correctly there's a plan
> to move IOMMU support to the DMA API.
>
>> On omap4 v4l2+drm example I have running, it is actually the DRM driver
>> doing the "IOMMU" programming.. so v4l2 camera really doesn't need to care
>> about it.  (And the IOMMU programming here is pretty fast.)  But I suppose
>> this maybe doesn't represent all cases. I suppose if a camera didn't really
>> sit behind an IOMMU but uses something more like a DMA descriptor list would
>> want to know if it needed to regenerate it's descriptor list. Or likewise if
>> camera has an IOMMU that isn't really using the IOMMU framework (although
>> maybe that is easier to solve).  But I think a hint returned from
>> dma_buf_map() would do the job?
>
> I see at least three possible solutions to this problem.
>
> 1. At dma_buf_unmap() time, the exporter will tell the importer that the
> buffer will move, and that it should be unmapped from whatever the importer
> mapped it to. That's probably the easiest solution to implement on the
> importer's side, but I expect it to be difficult for the exporter to know at
> dma_buf_unmap() time if the buffer will need to be moved or not.
>
> 2. Adding a callback to request the importer to unmap the buffer. This might
> be racy, and locking might be difficult to handle.
>
> 3. At dma_buf_unmap() time, keep importer's mappings around. The exporter is
> then free to move the buffer if needed, in which case the mappings will be
> invalid. This shouldn't be a problem in theory, as the buffer isn't being used
> by the importer at that time, but can cause stability issues when dealing with
> rogue hardware as this would punch holes in the IOMMU fence. At dma_buf_map()
> time the exporter would tell the importer whether the buffer moved or not. If
> it moved, the importer will tear down the mappings it kept, and create new
> ones.

I was leaning towards door #3.. rogue hw is a good point, but I think
that would be an issue in general if hw kept accessing the buffer when
it wasn't supposed to.

BR,
-R

> Variations around those 3 possible solutions are possible.
>
> --
> Regards,
>
> Laurent Pinchart
Sumit Semwal Feb. 2, 2012, 2:40 p.m. | #36
On 2 February 2012 19:31, Clark, Rob <rob@ti.com> wrote:
> On Thu, Feb 2, 2012 at 4:19 AM, Laurent Pinchart
> <laurent.pinchart@ideasonboard.com> wrote:
>> Hi Rob,
>>
>> On Tuesday 31 January 2012 16:38:35 Clark, Rob wrote:
>>> On Mon, Jan 30, 2012 at 4:01 PM, Sakari Ailus <sakari.ailus@iki.fi> wrote:
>>> >> So to summarize I understand your constraints - gpu drivers have worked
>>> >> like v4l a few years ago. The thing I'm trying to achieve with this
>>> >> constant yelling is just to raise awereness for these issues so that
>>> >> people aren't suprised when drm starts pulling tricks on dma_bufs.
>>> >
>>> > I think we should be able to mark dma_bufs non-relocatable so also DRM
>>> > can work with these buffers. Or alternatively, as Laurent proposed, V4L2
>>> > be prepared for moving the buffers around. Are there other reasons to do
>>> > so than paging them out of system memory to make room for something
>>> > else?
>>>
>>> fwiw, from GPU perspective, the DRM device wouldn't be actively
>>> relocating buffers just for the fun of it.  I think it is more that we
>>> want to give the GPU driver the flexibility to relocate when it really
>>> needs to.  For example, maybe user has camera app running, then puts
>>> it in the background and opens firefox which tries to allocate a big
>>> set of pixmaps putting pressure on GPU memory..
>>
>> On an embedded system putting the camera application in the background will
>> usually stop streaming, so buffers will be unmapped. On other systems, or even
>> on some embedded systems, that will not be the case though.
>>
>> I'm perfectly fine with relocating buffers when needed. What I want is to
>> avoid unmapping and remapping them for every frame if they haven't moved. I'm
>> sure we can come up with an API to handle that.
>>
>>> I guess the root issue is who is doing the IOMMU programming for the camera
>>> driver. I guess if this is something built in to the camera driver then when
>>> it calls dma_buf_map() it probably wants some hint that the backing pages
>>> haven't moved so in the common case (ie. buffer hasn't moved) it doesn't
>>> have to do anything expensive.
>>
>> It will likely depend on the camera hardware. For the OMAP3 ISP, the driver
>> calls the IOMMU API explictly, but if I understand it correctly there's a plan
>> to move IOMMU support to the DMA API.
>>
>>> On omap4 v4l2+drm example I have running, it is actually the DRM driver
>>> doing the "IOMMU" programming.. so v4l2 camera really doesn't need to care
>>> about it.  (And the IOMMU programming here is pretty fast.)  But I suppose
>>> this maybe doesn't represent all cases. I suppose if a camera didn't really
>>> sit behind an IOMMU but uses something more like a DMA descriptor list would
>>> want to know if it needed to regenerate it's descriptor list. Or likewise if
>>> camera has an IOMMU that isn't really using the IOMMU framework (although
>>> maybe that is easier to solve).  But I think a hint returned from
>>> dma_buf_map() would do the job?
>>
>> I see at least three possible solutions to this problem.
>>
>> 1. At dma_buf_unmap() time, the exporter will tell the importer that the
>> buffer will move, and that it should be unmapped from whatever the importer
>> mapped it to. That's probably the easiest solution to implement on the
>> importer's side, but I expect it to be difficult for the exporter to know at
>> dma_buf_unmap() time if the buffer will need to be moved or not.
>>
>> 2. Adding a callback to request the importer to unmap the buffer. This might
>> be racy, and locking might be difficult to handle.
>>
>> 3. At dma_buf_unmap() time, keep importer's mappings around. The exporter is
>> then free to move the buffer if needed, in which case the mappings will be
>> invalid. This shouldn't be a problem in theory, as the buffer isn't being used
>> by the importer at that time, but can cause stability issues when dealing with
>> rogue hardware as this would punch holes in the IOMMU fence. At dma_buf_map()
>> time the exporter would tell the importer whether the buffer moved or not. If
>> it moved, the importer will tear down the mappings it kept, and create new
>> ones.
>
> I was leaning towards door #3.. rogue hw is a good point, but I think
> that would be an issue in general if hw kept accessing the buffer when
> it wasn't supposed to.
>
Yes, I feel #3 is a fair way of solving this.
> BR,
> -R
>
>> Variations around those 3 possible solutions are possible.
>>
>> --
>> Regards,
>>
>> Laurent Pinchart
Thanks and regards,
~Sumit.
Daniel Vetter Feb. 2, 2012, 8:23 p.m. | #37
On Thu, Feb 2, 2012 at 11:19, Laurent Pinchart <laurent.pinchart@ideasonboard.com> wrote:
>> On omap4 v4l2+drm example I have running, it is actually the DRM driver
>> doing the "IOMMU" programming.. so v4l2 camera really doesn't need to care
>> about it.  (And the IOMMU programming here is pretty fast.)  But I suppose
>> this maybe doesn't represent all cases. I suppose if a camera didn't really
>> sit behind an IOMMU but uses something more like a DMA descriptor list would
>> want to know if it needed to regenerate it's descriptor list. Or likewise if
>> camera has an IOMMU that isn't really using the IOMMU framework (although
>> maybe that is easier to solve).  But I think a hint returned from
>> dma_buf_map() would do the job?
>
> I see at least three possible solutions to this problem.
>
> 1. At dma_buf_unmap() time, the exporter will tell the importer that the
> buffer will move, and that it should be unmapped from whatever the importer
> mapped it to. That's probably the easiest solution to implement on the
> importer's side, but I expect it to be difficult for the exporter to know at
> dma_buf_unmap() time if the buffer will need to be moved or not.
>
> 2. Adding a callback to request the importer to unmap the buffer. This might
> be racy, and locking might be difficult to handle.
>
> 3. At dma_buf_unmap() time, keep importer's mappings around. The exporter is
> then free to move the buffer if needed, in which case the mappings will be
> invalid. This shouldn't be a problem in theory, as the buffer isn't being used
> by the importer at that time, but can cause stability issues when dealing with
> rogue hardware as this would punch holes in the IOMMU fence. At dma_buf_map()
> time the exporter would tell the importer whether the buffer moved or not. If
> it moved, the importer will tear down the mappings it kept, and create new
> ones.
>
> Variations around those 3 possible solutions are possible.

While preparing my fosdem presentation about dma_buf I've thought quite a
bit what we still need for forceful unmap support/persistent
mappings/dynamic dma_buf/whatever you want to call it. And it's a lot, and
we have quite a few lower hanging fruits to reap (like cpu access and mmap
support for importer). So I propose instead:

4. Just hang onto the device mappings for as long as it's convenient and/or
necessary and feel guilty about it.

The reason is that going fully static isn't worse than a half-baked
dynamic version of dma_buf, but the half-baked dynamic one has the
downside that we can ignore the issue and feel good about things ;-)

Cheers, Daniel
Rob Clark Feb. 2, 2012, 8:49 p.m. | #38
On Thu, Feb 2, 2012 at 2:23 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Thu, Feb 2, 2012 at 11:19, Laurent Pinchart <laurent.pinchart@ideasonboard.com> wrote:
>>> On omap4 v4l2+drm example I have running, it is actually the DRM driver
>>> doing the "IOMMU" programming.. so v4l2 camera really doesn't need to care
>>> about it.  (And the IOMMU programming here is pretty fast.)  But I suppose
>>> this maybe doesn't represent all cases. I suppose if a camera didn't really
>>> sit behind an IOMMU but uses something more like a DMA descriptor list would
>>> want to know if it needed to regenerate it's descriptor list. Or likewise if
>>> camera has an IOMMU that isn't really using the IOMMU framework (although
>>> maybe that is easier to solve).  But I think a hint returned from
>>> dma_buf_map() would do the job?
>>
>> I see at least three possible solutions to this problem.
>>
>> 1. At dma_buf_unmap() time, the exporter will tell the importer that the
>> buffer will move, and that it should be unmapped from whatever the importer
>> mapped it to. That's probably the easiest solution to implement on the
>> importer's side, but I expect it to be difficult for the exporter to know at
>> dma_buf_unmap() time if the buffer will need to be moved or not.
>>
>> 2. Adding a callback to request the importer to unmap the buffer. This might
>> be racy, and locking might be difficult to handle.
>>
>> 3. At dma_buf_unmap() time, keep importer's mappings around. The exporter is
>> then free to move the buffer if needed, in which case the mappings will be
>> invalid. This shouldn't be a problem in theory, as the buffer isn't being used
>> by the importer at that time, but can cause stability issues when dealing with
>> rogue hardware as this would punch holes in the IOMMU fence. At dma_buf_map()
>> time the exporter would tell the importer whether the buffer moved or not. If
>> it moved, the importer will tear down the mappings it kept, and create new
>> ones.
>>
>> Variations around those 3 possible solutions are possible.
>
> While preparing my fosdem presentation about dma_buf I've thought quite a
> bit what we still need for forceful unmap support/persistent
> mappings/dynamic dma_buf/whatever you want to call it. And it's a lot, and
> we have quite a few lower hanging fruits to reap (like cpu access and mmap
> support for importer). So I propose instead:
>
> 4. Just hang onto the device mappings for as long as it's convenient and/or
> necessary and feel guilty about it.

for v4l2/vb2, I'd like to at least request some sort of
BUF_PREPARE_IS_EXPENSIVE flag, so we don't penalize devices where
remapping is not expensive.  Ie. the camera driver could set this flag
so vb2 core knows not unmap()/re-map() between frames.

In my case, for v4l2 + encoder, I really need the unmapping/remapping
between frames, at least if there is anything else going on competing
for buffers.  But in my case, the exporter remaps to a contiguous
(sorta) "virtual" address that the camera can see, so there is no
expensive mapping on the importer side of things.


BR,
-R


> The reason is that going fully static isn't worse than a half-baked
> dynamic version of dma_buf, but the half-baked dynamic one has the
> downside that we can ignore the issue and feel good about things ;-)
>
> Cheers, Daniel
> --
> Daniel Vetter
> daniel.vetter@ffwll.ch - +41 (0) 79 364 57 48 - http://blog.ffwll.ch
>
Sakari Ailus Feb. 4, 2012, 11:43 a.m. | #39
Hi Rob,

Clark, Rob wrote:
> On Mon, Jan 30, 2012 at 4:01 PM, Sakari Ailus <sakari.ailus@iki.fi> wrote:
>>
>>> So to summarize I understand your constraints - gpu drivers have worked
>>> like v4l a few years ago. The thing I'm trying to achieve with this
>>> constant yelling is just to raise awereness for these issues so that
>>> people aren't suprised when drm starts pulling tricks on dma_bufs.
>>
>> I think we should be able to mark dma_bufs non-relocatable so also DRM can
>> work with these buffers. Or alternatively, as Laurent proposed, V4L2 be
>> prepared for moving the buffers around. Are there other reasons to do so
>> than paging them out of system memory to make room for something else?
> 
> fwiw, from GPU perspective, the DRM device wouldn't be actively
> relocating buffers just for the fun of it.  I think it is more that we
> want to give the GPU driver the flexibility to relocate when it really
> needs to.  For example, maybe user has camera app running, then puts
> it in the background and opens firefox which tries to allocate a big
> set of pixmaps putting pressure on GPU memory..
> 
> I guess the root issue is who is doing the IOMMU programming for the
> camera driver.  I guess if this is something built in to the camera
> driver then when it calls dma_buf_map() it probably wants some hint
> that the backing pages haven't moved so in the common case (ie. buffer
> hasn't moved) it doesn't have to do anything expensive.
> 
> On omap4 v4l2+drm example I have running, it is actually the DRM
> driver doing the "IOMMU" programming.. so v4l2 camera really doesn't
> need to care about it.  (And the IOMMU programming here is pretty

This part sounds odd to me. Well, I guess it _could_ be done that way,
but the ISP IOMMU could be as well different as the one in DRM. That's
the case on OMAP 3, for example.

> fast.)  But I suppose this maybe doesn't represent all cases.  I
> suppose if a camera didn't really sit behind an IOMMU but uses
> something more like a DMA descriptor list would want to know if it
> needed to regenerate it's descriptor list.  Or likewise if camera has
> an IOMMU that isn't really using the IOMMU framework (although maybe
> that is easier to solve).  But I think a hint returned from
> dma_buf_map() would do the job?

An alternative to IOMMU I think in practice would mean CMA-allocated
buffers.

I need to think about this a bit and understand how this would really
work to properly comment this.

For example, how does one mlock() something that isn't mapped to process
memory --- think of a dma buffer not mapped to the user space process
address space?

Cheers,
Rob Clark Feb. 5, 2012, 3:08 p.m. | #40
On Sat, Feb 4, 2012 at 5:43 AM, Sakari Ailus <sakari.ailus@iki.fi> wrote:
> Hi Rob,
>
> Clark, Rob wrote:
>> On Mon, Jan 30, 2012 at 4:01 PM, Sakari Ailus <sakari.ailus@iki.fi> wrote:
>>>
>>>> So to summarize I understand your constraints - gpu drivers have worked
>>>> like v4l a few years ago. The thing I'm trying to achieve with this
>>>> constant yelling is just to raise awereness for these issues so that
>>>> people aren't suprised when drm starts pulling tricks on dma_bufs.
>>>
>>> I think we should be able to mark dma_bufs non-relocatable so also DRM can
>>> work with these buffers. Or alternatively, as Laurent proposed, V4L2 be
>>> prepared for moving the buffers around. Are there other reasons to do so
>>> than paging them out of system memory to make room for something else?
>>
>> fwiw, from GPU perspective, the DRM device wouldn't be actively
>> relocating buffers just for the fun of it.  I think it is more that we
>> want to give the GPU driver the flexibility to relocate when it really
>> needs to.  For example, maybe user has camera app running, then puts
>> it in the background and opens firefox which tries to allocate a big
>> set of pixmaps putting pressure on GPU memory..
>>
>> I guess the root issue is who is doing the IOMMU programming for the
>> camera driver.  I guess if this is something built in to the camera
>> driver then when it calls dma_buf_map() it probably wants some hint
>> that the backing pages haven't moved so in the common case (ie. buffer
>> hasn't moved) it doesn't have to do anything expensive.
>>
>> On omap4 v4l2+drm example I have running, it is actually the DRM
>> driver doing the "IOMMU" programming.. so v4l2 camera really doesn't
>> need to care about it.  (And the IOMMU programming here is pretty
>
> This part sounds odd to me. Well, I guess it _could_ be done that way,
> but the ISP IOMMU could be as well different as the one in DRM. That's
> the case on OMAP 3, for example.

Yes, this is a difference between OMAP4 and OMAP3..  although I think
the intention is that OMAP3 type scenarios, if the IOMMU mapping was
done through the dma mapping API, then it could still be done (and
cached) by the exporter.

>> fast.)  But I suppose this maybe doesn't represent all cases.  I
>> suppose if a camera didn't really sit behind an IOMMU but uses
>> something more like a DMA descriptor list would want to know if it
>> needed to regenerate it's descriptor list.  Or likewise if camera has
>> an IOMMU that isn't really using the IOMMU framework (although maybe
>> that is easier to solve).  But I think a hint returned from
>> dma_buf_map() would do the job?
>
> An alternative to IOMMU I think in practice would mean CMA-allocated
> buffers.
>
> I need to think about this a bit and understand how this would really
> work to properly comment this.
>
> For example, how does one mlock() something that isn't mapped to process
> memory --- think of a dma buffer not mapped to the user space process
> address space?

The scatter list that the exporter gives you should be locked/pinned
already so importer should not need to call mlock()

BR,
-R

> Cheers,
>
> --
> Sakari Ailus
> sakari.ailus@iki.fi

Patch

diff --git a/drivers/media/video/videobuf2-core.c b/drivers/media/video/videobuf2-core.c
index 95a3f5e..6cd2f97 100644
--- a/drivers/media/video/videobuf2-core.c
+++ b/drivers/media/video/videobuf2-core.c
@@ -107,6 +107,27 @@  static void __vb2_buf_userptr_put(struct vb2_buffer *vb)
 }
 
 /**
+ * __vb2_buf_dmabuf_put() - release memory associated with
+ * a DMABUF shared buffer
+ */
+static void __vb2_buf_dmabuf_put(struct vb2_buffer *vb)
+{
+	struct vb2_queue *q = vb->vb2_queue;
+	unsigned int plane;
+
+	for (plane = 0; plane < vb->num_planes; ++plane) {
+		void *mem_priv = vb->planes[plane].mem_priv;
+
+		if (mem_priv) {
+			call_memop(q, plane, detach_dmabuf, mem_priv);
+			dma_buf_put(vb->planes[plane].dbuf);
+			vb->planes[plane].dbuf = NULL;
+			vb->planes[plane].mem_priv = NULL;
+		}
+	}
+}
+
+/**
  * __setup_offsets() - setup unique offsets ("cookies") for every plane in
  * every buffer on the queue
  */
@@ -228,6 +249,8 @@  static void __vb2_free_mem(struct vb2_queue *q, unsigned int buffers)
 		/* Free MMAP buffers or release USERPTR buffers */
 		if (q->memory == V4L2_MEMORY_MMAP)
 			__vb2_buf_mem_free(vb);
+		if (q->memory == V4L2_MEMORY_DMABUF)
+			__vb2_buf_dmabuf_put(vb);
 		else
 			__vb2_buf_userptr_put(vb);
 	}
@@ -350,6 +373,13 @@  static int __fill_v4l2_buffer(struct vb2_buffer *vb, struct v4l2_buffer *b)
 		 */
 		memcpy(b->m.planes, vb->v4l2_planes,
 			b->length * sizeof(struct v4l2_plane));
+
+		if (q->memory == V4L2_MEMORY_DMABUF) {
+			unsigned int plane;
+			for (plane = 0; plane < vb->num_planes; ++plane) {
+				b->m.planes[plane].m.fd = 0;
+			}
+		}
 	} else {
 		/*
 		 * We use length and offset in v4l2_planes array even for
@@ -361,6 +391,8 @@  static int __fill_v4l2_buffer(struct vb2_buffer *vb, struct v4l2_buffer *b)
 			b->m.offset = vb->v4l2_planes[0].m.mem_offset;
 		else if (q->memory == V4L2_MEMORY_USERPTR)
 			b->m.userptr = vb->v4l2_planes[0].m.userptr;
+		else if (q->memory == V4L2_MEMORY_DMABUF)
+			b->m.fd = 0;
 	}
 
 	/*
@@ -452,6 +484,21 @@  static int __verify_mmap_ops(struct vb2_queue *q)
 }
 
 /**
+ * __verify_dmabuf_ops() - verify that all memory operations required for
+ * DMABUF queue type have been provided
+ */
+static int __verify_dmabuf_ops(struct vb2_queue *q)
+{
+	if (!(q->io_modes & VB2_DMABUF) || !q->mem_ops->attach_dmabuf
+			|| !q->mem_ops->detach_dmabuf
+			|| !q->mem_ops->map_dmabuf
+			|| !q->mem_ops->unmap_dmabuf)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
  * vb2_reqbufs() - Initiate streaming
  * @q:		videobuf2 queue
  * @req:	struct passed from userspace to vidioc_reqbufs handler in driver
@@ -485,6 +532,7 @@  int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req)
 	}
 
 	if (req->memory != V4L2_MEMORY_MMAP
+			&& req->memory != V4L2_MEMORY_DMABUF
 			&& req->memory != V4L2_MEMORY_USERPTR) {
 		dprintk(1, "reqbufs: unsupported memory type\n");
 		return -EINVAL;
@@ -514,6 +562,11 @@  int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req)
 		return -EINVAL;
 	}
 
+	if (req->memory == V4L2_MEMORY_DMABUF && __verify_dmabuf_ops(q)) {
+		dprintk(1, "reqbufs: DMABUF for current setup unsupported\n");
+		return -EINVAL;
+	}
+
 	if (req->count == 0 || q->num_buffers != 0 || q->memory != req->memory) {
 		/*
 		 * We already have buffers allocated, so first check if they
@@ -621,7 +674,8 @@  int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create)
 	}
 
 	if (create->memory != V4L2_MEMORY_MMAP
-			&& create->memory != V4L2_MEMORY_USERPTR) {
+			&& create->memory != V4L2_MEMORY_USERPTR
+			&& create->memory != V4L2_MEMORY_DMABUF) {
 		dprintk(1, "%s(): unsupported memory type\n", __func__);
 		return -EINVAL;
 	}
@@ -645,6 +699,11 @@  int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create)
 		return -EINVAL;
 	}
 
+	if (create->memory == V4L2_MEMORY_DMABUF && __verify_dmabuf_ops(q)) {
+		dprintk(1, "%s(): DMABUF for current setup unsupported\n", __func__);
+		return -EINVAL;
+	}
+
 	if (q->num_buffers == VIDEO_MAX_FRAME) {
 		dprintk(1, "%s(): maximum number of buffers already allocated\n",
 			__func__);
@@ -840,6 +899,11 @@  static int __fill_vb2_buffer(struct vb2_buffer *vb, const struct v4l2_buffer *b,
 					b->m.planes[plane].length;
 			}
 		}
+		if (b->memory == V4L2_MEMORY_DMABUF) {
+			for (plane = 0; plane < vb->num_planes; ++plane) {
+				v4l2_planes[plane].m.fd = b->m.planes[plane].m.fd;
+			}
+		}
 	} else {
 		/*
 		 * Single-planar buffers do not use planes array,
@@ -854,6 +918,10 @@  static int __fill_vb2_buffer(struct vb2_buffer *vb, const struct v4l2_buffer *b,
 			v4l2_planes[0].m.userptr = b->m.userptr;
 			v4l2_planes[0].length = b->length;
 		}
+		if (b->memory == V4L2_MEMORY_DMABUF) {
+			v4l2_planes[0].m.fd = b->m.fd;
+		}
+
 	}
 
 	vb->v4l2_buf.field = b->field;
@@ -962,6 +1030,109 @@  static int __qbuf_mmap(struct vb2_buffer *vb, const struct v4l2_buffer *b)
 }
 
 /**
+ * __qbuf_dmabuf() - handle qbuf of a DMABUF buffer
+ */
+static int __qbuf_dmabuf(struct vb2_buffer *vb, const struct v4l2_buffer *b)
+{
+	struct v4l2_plane planes[VIDEO_MAX_PLANES];
+	struct vb2_queue *q = vb->vb2_queue;
+	void *mem_priv;
+	unsigned int plane;
+	int ret;
+	int write = !V4L2_TYPE_IS_OUTPUT(q->type);
+
+	/* Verify and copy relevant information provided by the userspace */
+	ret = __fill_vb2_buffer(vb, b, planes);
+	if (ret)
+		return ret;
+
+	for (plane = 0; plane < vb->num_planes; ++plane) {
+		struct dma_buf *dbuf = dma_buf_get(planes[plane].m.fd);
+
+		if (IS_ERR_OR_NULL(dbuf)) {
+			dprintk(1, "qbuf: invalid dmabuf fd for "
+				"plane %d\n", plane);
+			ret = PTR_ERR(dbuf);
+			goto err;
+		}
+
+		/* this doesn't get filled in until __fill_vb2_buffer(),
+		 * since it isn't known until after dma_buf_get()..
+		 */
+		planes[plane].length = dbuf->size;
+
+		/* Skip the plane if already verified */
+		if (dbuf == vb->planes[plane].dbuf) {
+			dma_buf_put(dbuf);
+			continue;
+		}
+
+		dprintk(3, "qbuf: buffer description for plane %d changed, "
+			"reattaching dma buf\n", plane);
+
+		/* Release previously acquired memory if present */
+		if (vb->planes[plane].mem_priv) {
+			call_memop(q, plane, detach_dmabuf,
+				vb->planes[plane].mem_priv);
+			dma_buf_put(vb->planes[plane].dbuf);
+		}
+
+		vb->planes[plane].mem_priv = NULL;
+
+		/* Acquire each plane's memory */
+		mem_priv = q->mem_ops->attach_dmabuf(
+				q->alloc_ctx[plane], dbuf);
+		if (IS_ERR(mem_priv)) {
+			dprintk(1, "qbuf: failed acquiring dmabuf "
+				"memory for plane %d\n", plane);
+			ret = PTR_ERR(mem_priv);
+			goto err;
+		}
+
+		vb->planes[plane].dbuf = dbuf;
+		vb->planes[plane].mem_priv = mem_priv;
+	}
+
+	/* TODO: This pins the buffer(s) with  dma_buf_map_attachment()).. but
+	 * really we want to do this just before the DMA, not while queueing
+	 * the buffer(s)..
+	 */
+	for (plane = 0; plane < vb->num_planes; ++plane) {
+		ret = q->mem_ops->map_dmabuf(
+				vb->planes[plane].mem_priv, write);
+		if (ret) {
+			dprintk(1, "qbuf: failed mapping dmabuf "
+				"memory for plane %d\n", plane);
+			goto err;
+		}
+	}
+
+	/*
+	 * Call driver-specific initialization on the newly acquired buffer,
+	 * if provided.
+	 */
+	ret = call_qop(q, buf_init, vb);
+	if (ret) {
+		dprintk(1, "qbuf: buffer initialization failed\n");
+		goto err;
+	}
+
+	/*
+	 * Now that everything is in order, copy relevant information
+	 * provided by userspace.
+	 */
+	for (plane = 0; plane < vb->num_planes; ++plane)
+		vb->v4l2_planes[plane] = planes[plane];
+
+	return 0;
+err:
+	/* In case of errors, release planes that were already acquired */
+	__vb2_buf_dmabuf_put(vb);
+
+	return ret;
+}
+
+/**
  * __enqueue_in_driver() - enqueue a vb2_buffer in driver for processing
  */
 static void __enqueue_in_driver(struct vb2_buffer *vb)
@@ -985,6 +1156,9 @@  static int __buf_prepare(struct vb2_buffer *vb, const struct v4l2_buffer *b)
 	case V4L2_MEMORY_USERPTR:
 		ret = __qbuf_userptr(vb, b);
 		break;
+	case V4L2_MEMORY_DMABUF:
+		ret = __qbuf_dmabuf(vb, b);
+		break;
 	default:
 		WARN(1, "Invalid queue type\n");
 		ret = -EINVAL;
@@ -1284,6 +1458,7 @@  int vb2_dqbuf(struct vb2_queue *q, struct v4l2_buffer *b, bool nonblocking)
 {
 	struct vb2_buffer *vb = NULL;
 	int ret;
+	unsigned int plane;
 
 	if (q->fileio) {
 		dprintk(1, "dqbuf: file io in progress\n");
@@ -1307,6 +1482,15 @@  int vb2_dqbuf(struct vb2_queue *q, struct v4l2_buffer *b, bool nonblocking)
 		return ret;
 	}
 
+	/* TODO: this unpins the buffer(dma_buf_unmap_attachment()).. but
+	 * really we want tot do this just after DMA, not when the
+	 * buffer is dequeued..
+	 */
+	if (q->memory == V4L2_MEMORY_DMABUF)
+		for (plane = 0; plane < vb->num_planes; ++plane)
+			call_memop(q, plane, unmap_dmabuf,
+				vb->planes[plane].mem_priv);
+
 	switch (vb->state) {
 	case VB2_BUF_STATE_DONE:
 		dprintk(3, "dqbuf: Returning done buffer\n");
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index a15d1f1..5c1836d 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -16,6 +16,7 @@ 
 #include <linux/mutex.h>
 #include <linux/poll.h>
 #include <linux/videodev2.h>
+#include <linux/dma-buf.h>
 
 struct vb2_alloc_ctx;
 struct vb2_fileio_data;
@@ -41,6 +42,20 @@  struct vb2_fileio_data;
  *		 argument to other ops in this structure
  * @put_userptr: inform the allocator that a USERPTR buffer will no longer
  *		 be used
+ * @attach_dmabuf: attach a shared struct dma_buf for a hardware operation;
+ * 		   used for DMABUF memory types; alloc_ctx is the alloc context
+ * 		   dbuf is the shared dma_buf; returns NULL on failure;
+ *		   allocator private per-buffer structure on success;
+ *		   this needs to be used for further accesses to the buffer
+ * @detach_dmabuf: inform the exporter of the buffer that the current DMABUF
+ * 		   buffer is no longer used; the buf_priv argument is the
+ * 		   allocator private per-buffer structure previously returned
+ * 		   from the attach_dmabuf callback
+ * @map_dmabuf: request for access to the dmabuf from allocator; the allocator
+ *		of dmabuf is informed that this driver is going to use the
+ *		dmabuf
+ * @unmap_dmabuf: releases access control to the dmabuf - allocator is notified
+ *		  that this driver is done using the dmabuf for now
  * @vaddr:	return a kernel virtual address to a given memory buffer
  *		associated with the passed private structure or NULL if no
  *		such mapping exists
@@ -56,6 +71,8 @@  struct vb2_fileio_data;
  * Required ops for USERPTR types: get_userptr, put_userptr.
  * Required ops for MMAP types: alloc, put, num_users, mmap.
  * Required ops for read/write access types: alloc, put, num_users, vaddr
+ * Required ops for DMABUF types: attach_dmabuf, detach_dmabuf, map_dmabuf,
+ *				  unmap_dmabuf.
  */
 struct vb2_mem_ops {
 	void		*(*alloc)(void *alloc_ctx, unsigned long size);
@@ -65,6 +82,16 @@  struct vb2_mem_ops {
 					unsigned long size, int write);
 	void		(*put_userptr)(void *buf_priv);
 
+	/* Comment from Rob Clark: XXX: I think the attach / detach could be handled
+	 * in the vb2 core, and vb2_mem_ops really just need to get/put the
+	 * sglist (and make sure that the sglist fits it's needs..)
+	 */
+	void		*(*attach_dmabuf)(void *alloc_ctx,
+					  struct dma_buf *dbuf);
+	void		(*detach_dmabuf)(void *buf_priv);
+	int		(*map_dmabuf)(void *buf_priv, int write);
+	void		(*unmap_dmabuf)(void *buf_priv);
+
 	void		*(*vaddr)(void *buf_priv);
 	void		*(*cookie)(void *buf_priv);
 
@@ -75,6 +102,7 @@  struct vb2_mem_ops {
 
 struct vb2_plane {
 	void			*mem_priv;
+	struct dma_buf		*dbuf;
 };
 
 /**
@@ -83,12 +111,14 @@  struct vb2_plane {
  * @VB2_USERPTR:	driver supports USERPTR with streaming API
  * @VB2_READ:		driver supports read() style access
  * @VB2_WRITE:		driver supports write() style access
+ * @VB2_DMABUF:		driver supports DMABUF with streaming API
  */
 enum vb2_io_modes {
 	VB2_MMAP	= (1 << 0),
 	VB2_USERPTR	= (1 << 1),
 	VB2_READ	= (1 << 2),
 	VB2_WRITE	= (1 << 3),
+	VB2_DMABUF	= (1 << 4),
 };
 
 /**