
[RFCv2,2/9,-,4/4] v4l: vb2-dma-contig: update and code refactoring

Message ID 1332410543-14433-4-git-send-email-laurent.pinchart@ideasonboard.com
State New

Commit Message

Laurent Pinchart March 22, 2012, 10:02 a.m. UTC
From: Tomasz Stanislawski <t.stanislaws@samsung.com>

This patch combines updates and fixes to the dma-contig allocator.
Moreover, the allocator code was refactored.
The most important changes are:
- reordered functions
- moved compression of the scatterlist to a separate function
- added support for multi-chunk but contiguous scatterlists
- simplified the implementation of the vb2-dma-contig context structure
- let the mmap method use dma_mmap_writecombine
- added support for scatterlists in userptr mode

Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
	[mmap method]
Signed-off-by: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
	[scatterlist in userptr mode]
Signed-off-by: Kamil Debski <k.debski@samsung.com>
	[bugfixing]
Signed-off-by: Tomasz Stanislawski <t.stanislaws@samsung.com>
	[core refactoring, helper functions]
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
---
 drivers/media/video/videobuf2-dma-contig.c |  400 +++++++++++++++++++++++++---
 1 files changed, 365 insertions(+), 35 deletions(-)

Comments

Laurent Pinchart March 22, 2012, 10:50 a.m. UTC | #1
Hi Tomasz,

Thanks for the patch.

On Thursday 22 March 2012 11:02:23 Laurent Pinchart wrote:
> From: Tomasz Stanislawski <t.stanislaws@samsung.com>
> 
> This patch combines updates and fixes to the dma-contig allocator.
> Moreover, the allocator code was refactored.
> The most important changes are:
> - reordered functions
> - moved compression of the scatterlist to a separate function
> - added support for multi-chunk but contiguous scatterlists
> - simplified the implementation of the vb2-dma-contig context structure
> - let the mmap method use dma_mmap_writecombine
> - added support for scatterlists in userptr mode
> 
> Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
> 	[mmap method]
> Signed-off-by: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
> 	[scatterlist in userptr mode]
> Signed-off-by: Kamil Debski <k.debski@samsung.com>
> 	[bugfixing]
> Signed-off-by: Tomasz Stanislawski <t.stanislaws@samsung.com>
> 	[core refactoring, helper functions]
> Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
> ---
>  drivers/media/video/videobuf2-dma-contig.c |  400 +++++++++++++++++++++++++---
>  1 files changed, 365 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/media/video/videobuf2-dma-contig.c b/drivers/media/video/videobuf2-dma-contig.c
> index c898e6f..9965465 100644
> --- a/drivers/media/video/videobuf2-dma-contig.c
> +++ b/drivers/media/video/videobuf2-dma-contig.c
> @@ -10,9 +10,12 @@
>   * the Free Software Foundation.
>   */
> 
> +#include <linux/dma-buf.h>
> +#include <linux/dma-mapping.h>
>  #include <linux/module.h>
> +#include <linux/scatterlist.h>
> +#include <linux/sched.h>
>  #include <linux/slab.h>
> -#include <linux/dma-mapping.h>
> 
>  #include <media/videobuf2-core.h>
>  #include <media/videobuf2-memops.h>
> @@ -22,16 +25,115 @@ struct vb2_dc_buf {
>  	void				*vaddr;
>  	unsigned long			size;
>  	dma_addr_t			dma_addr;
> +	struct sg_table			*dma_sgt;
> +	enum dma_data_direction		dma_dir;
> 
>  	/* MMAP related */
>  	struct vb2_vmarea_handler	handler;
>  	atomic_t			refcount;
> +	struct sg_table			*sgt_base;
> 
>  	/* USERPTR related */
>  	struct vm_area_struct		*vma;
>  };
> 
>  /*********************************************/
> +/*        scatterlist table functions        */
> +/*********************************************/
> +
> +static struct sg_table *vb2_dc_pages_to_sgt(struct page **pages,
> +	unsigned long n_pages, size_t offset, size_t offset2)
> +{
> +	struct sg_table *sgt;
> +	int i, j; /* loop counters */
> +	int cur_page, chunks;
> +	int ret;
> +	struct scatterlist *s;
> +
> +	sgt = kzalloc(sizeof *sgt, GFP_KERNEL);
> +	if (!sgt)
> +		return ERR_PTR(-ENOMEM);
> +
> +	/* compute number of chunks */
> +	chunks = 1;
> +	for (i = 1; i < n_pages; ++i)
> +		if (pages[i] != pages[i - 1] + 1)
> +			++chunks;
> +
> +	ret = sg_alloc_table(sgt, chunks, GFP_KERNEL);
> +	if (ret) {
> +		kfree(sgt);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	/* merging chunks and putting them into the scatterlist */
> +	cur_page = 0;
> +	for_each_sg(sgt->sgl, s, sgt->orig_nents, i) {
> +		size_t size = PAGE_SIZE;
> +
> +		for (j = cur_page + 1; j < n_pages; ++j) {
> +			if (pages[j] != pages[j - 1] + 1)
> +				break;
> +			size += PAGE_SIZE;
> +		}
> +
> +		/* cut offset if chunk starts at the first page */
> +		if (cur_page == 0)
> +			size -= offset;
> +		/* cut offset2 if chunk ends at the last page */
> +		if (j == n_pages)
> +			size -= offset2;
> +
> +		sg_set_page(s, pages[cur_page], size, offset);
> +		offset = 0;
> +		cur_page = j;
> +	}
> +
> +	return sgt;
> +}
> +
> +static void vb2_dc_release_sgtable(struct sg_table *sgt)
> +{
> +	sg_free_table(sgt);
> +	kfree(sgt);
> +}
> +
> +static void vb2_dc_put_sgtable(struct sg_table *sgt, int dirty)
> +{
> +	struct scatterlist *s;
> +	int i, j;
> +
> +	for_each_sg(sgt->sgl, s, sgt->nents, i) {
> +		struct page *page = sg_page(s);
> +		int n_pages = PAGE_ALIGN(s->offset + s->length) >> PAGE_SHIFT;
> +
> +		for (j = 0; j < n_pages; ++j, ++page) {
> +			if (dirty)
> +				set_page_dirty_lock(page);
> +			put_page(page);
> +		}
> +	}
> +
> +	vb2_dc_release_sgtable(sgt);
> +}
> +
> +static unsigned long vb2_dc_get_contiguous_size(struct sg_table *sgt)
> +{
> +	struct scatterlist *s;
> +	dma_addr_t expected = sg_dma_address(sgt->sgl);
> +	int i;
> +	unsigned long size = 0;
> +
> +	for_each_sg(sgt->sgl, s, sgt->nents, i) {
> +		if (sg_dma_address(s) != expected)
> +			break;
> +		expected = sg_dma_address(s) + sg_dma_len(s);
> +		size += sg_dma_len(s);
> +	}
> +	return size;
> +}
> +
> +/*********************************************/
>  /*         callbacks for all buffers         */
>  /*********************************************/
> 
> @@ -45,8 +147,6 @@ static void *vb2_dc_cookie(void *buf_priv)
>  static void *vb2_dc_vaddr(void *buf_priv)
>  {
>  	struct vb2_dc_buf *buf = buf_priv;
> -	if (!buf)
> -		return 0;
> 
>  	return buf->vaddr;
>  }
> @@ -58,6 +158,28 @@ static unsigned int vb2_dc_num_users(void *buf_priv)
>  	return atomic_read(&buf->refcount);
>  }
> 
> +static void vb2_dc_prepare(void *buf_priv)
> +{
> +	struct vb2_dc_buf *buf = buf_priv;
> +	struct sg_table *sgt = buf->dma_sgt;
> +
> +	if (!sgt)
> +		return;
> +
> +	dma_sync_sg_for_device(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
> +}
> +
> +static void vb2_dc_finish(void *buf_priv)
> +{
> +	struct vb2_dc_buf *buf = buf_priv;
> +	struct sg_table *sgt = buf->dma_sgt;
> +
> +	if (!sgt)
> +		return;
> +
> +	dma_sync_sg_for_cpu(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
> +}
> +
>  /*********************************************/
>  /*        callbacks for MMAP buffers         */
>  /*********************************************/
> @@ -66,31 +188,70 @@ static void vb2_dc_put(void *buf_priv)
>  {
>  	struct vb2_dc_buf *buf = buf_priv;
> 
> -	if (atomic_dec_and_test(&buf->refcount)) {
> -		dma_free_coherent(buf->dev, buf->size, buf->vaddr,
> -				  buf->dma_addr);
> -		kfree(buf);
> -	}
> +	if (!atomic_dec_and_test(&buf->refcount))
> +		return;
> +
> +	vb2_dc_release_sgtable(buf->sgt_base);
> +	dma_free_coherent(buf->dev, buf->size, buf->vaddr,
> +		buf->dma_addr);
> +	kfree(buf);
>  }
> 
>  static void *vb2_dc_alloc(void *alloc_ctx, unsigned long size)
>  {
>  	struct device *dev = alloc_ctx;
>  	struct vb2_dc_buf *buf;
> +	int ret;
> +	int n_pages;
> +	struct page **pages = NULL;
> 
>  	buf = kzalloc(sizeof *buf, GFP_KERNEL);
>  	if (!buf)
>  		return ERR_PTR(-ENOMEM);
> 
> -	buf->vaddr = dma_alloc_coherent(dev, size, &buf->dma_addr, GFP_KERNEL);
> +	buf->dev = dev;
> +	buf->size = size;
> +	buf->vaddr = dma_alloc_coherent(buf->dev, buf->size, &buf->dma_addr,
> +		GFP_KERNEL);
> +
> +	ret = -ENOMEM;
>  	if (!buf->vaddr) {
> -		dev_err(dev, "dma_alloc_coherent of size %ld failed\n", size);
> -		kfree(buf);
> -		return ERR_PTR(-ENOMEM);
> +		dev_err(dev, "dma_alloc_coherent of size %ld failed\n",
> +			size);
> +		goto fail_buf;
>  	}
> 
> -	buf->dev = dev;
> -	buf->size = size;
> +	WARN_ON((unsigned long)buf->vaddr & ~PAGE_MASK);
> +	WARN_ON(buf->dma_addr & ~PAGE_MASK);
> +
> +	n_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
> +
> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
> +	if (!pages) {
> +		printk(KERN_ERR "failed to alloc page table\n");
> +		goto fail_dma;
> +	}
> +
> +	ret = dma_get_pages(dev, buf->vaddr, buf->dma_addr, pages, n_pages);

As the only purpose of this is to retrieve a list of pages that will be used 
to create a single-entry sgt, wouldn't it be possible to shortcut the code and 
get the physical address of the buffer directly ?
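
Something along these lines could work (a rough, untested sketch; it assumes
the device does no IOMMU translation, so buf->dma_addr is a real physical
address, and that the platform provides phys_to_page()):

	struct sg_table *sgt;

	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
	if (!sgt)
		return ERR_PTR(-ENOMEM);

	if (sg_alloc_table(sgt, 1, GFP_KERNEL)) {
		kfree(sgt);
		return ERR_PTR(-ENOMEM);
	}

	/* one physically contiguous chunk covering the whole buffer */
	sg_set_page(sgt->sgl, phys_to_page(buf->dma_addr), PAGE_ALIGN(size), 0);
	buf->sgt_base = sgt;

That would avoid allocating and filling the intermediate pages array entirely.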

> +	if (ret < 0) {
> +		printk(KERN_ERR "failed to get buffer pages from DMA API\n");
> +		goto fail_pages;
> +	}
> +	if (ret != n_pages) {
> +		ret = -EFAULT;
> +		printk(KERN_ERR "failed to get all pages from DMA API\n");
> +		goto fail_pages;
> +	}
> +
> +	buf->sgt_base = vb2_dc_pages_to_sgt(pages, n_pages, 0, 0);
> +	if (IS_ERR(buf->sgt_base)) {
> +		ret = PTR_ERR(buf->sgt_base);
> +		printk(KERN_ERR "failed to prepare sg table\n");
> +		goto fail_pages;
> +	}

buf->sgt_base isn't used in this patch. I would move the buf->sgt_base 
creation code to the patch that uses it then, or to its own patch just before 
the patch that uses it.

> +
> +	/* pages are no longer needed */
> +	kfree(pages);
> 
>  	buf->handler.refcount = &buf->refcount;
>  	buf->handler.put = vb2_dc_put;
> @@ -99,59 +260,226 @@ static void *vb2_dc_alloc(void *alloc_ctx, unsigned long size)
>  	atomic_inc(&buf->refcount);
> 
>  	return buf;
> +
> +fail_pages:
> +	kfree(pages);
> +
> +fail_dma:
> +	dma_free_coherent(buf->dev, buf->size, buf->vaddr, buf->dma_addr);
> +
> +fail_buf:
> +	kfree(buf);
> +
> +	return ERR_PTR(ret);
>  }
> 
>  static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma)
>  {
>  	struct vb2_dc_buf *buf = buf_priv;
> +	int ret;
> +
> +	/*
> +	 * dma_mmap_* uses vm_pgoff as in-buffer offset, but we want to
> +	 * map whole buffer
> +	 */
> +	vma->vm_pgoff = 0;
> +
> +	ret = dma_mmap_writecombine(buf->dev, vma, buf->vaddr,
> +		buf->dma_addr, buf->size);
> 
> -	if (!buf) {
> -		printk(KERN_ERR "No buffer to map\n");
> -		return -EINVAL;
> +	if (ret) {
> +		printk(KERN_ERR "Remapping memory failed, error: %d\n", ret);
> +		return ret;
>  	}
> 
> -	return vb2_mmap_pfn_range(vma, buf->dma_addr, buf->size,
> -				  &vb2_common_vm_ops, &buf->handler);
> +	vma->vm_flags		|= VM_DONTEXPAND | VM_RESERVED;
> +	vma->vm_private_data	= &buf->handler;
> +	vma->vm_ops		= &vb2_common_vm_ops;
> +
> +	vma->vm_ops->open(vma);
> +
> +	printk(KERN_DEBUG "%s: mapped dma addr 0x%08lx at 0x%08lx, size %ld\n",
> +		__func__, (unsigned long)buf->dma_addr, vma->vm_start,
> +		buf->size);
> +
> +	return 0;
>  }
> 
>  /*********************************************/
>  /*       callbacks for USERPTR buffers       */
>  /*********************************************/
> 
> +static inline int vma_is_io(struct vm_area_struct *vma)
> +{
> +	return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));

Isn't VM_PFNMAP enough ? Wouldn't it be possible (at least in theory) to get a 
discontinuous physical range with VM_IO ?

> +}
> +
> +static int vb2_dc_get_pages(unsigned long start, struct page **pages,
> +	int n_pages, struct vm_area_struct **copy_vma, int write)
> +{
> +	struct vm_area_struct *vma;
> +	int n = 0; /* number of get pages */
> +	int ret = -EFAULT;
> +
> +	/* entering critical section for mm access */
> +	down_read(&current->mm->mmap_sem);

This will generate AB-BA deadlock warnings if lockdep is enabled. This 
function is called with the queue lock held, and the mmap() handler which 
takes the queue lock is called with current->mm->mmap_sem held.

This is a known issue with videobuf2, not specific to this patch. The warning 
is usually a false positive (which we still need to fix, as it worries users), 
but can become a real issue if an MMAP queue and a USERPTR queue are created 
by a driver with the same queue lock.
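
For reference, the inverted ordering looks roughly like this (simplified):

	QBUF on a USERPTR queue             mmap() on an MMAP queue
	  mutex_lock(q->lock)                 down_write(&mm->mmap_sem)
	  down_read(&mm->mmap_sem)            vb2_mmap()
	                                        mutex_lock(q->lock)

The two paths take the locks in opposite order, which is what lockdep flags.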

> +	vma = find_vma(current->mm, start);
> +	if (!vma) {
> +		printk(KERN_ERR "no vma for address %lu\n", start);
> +		goto cleanup;
> +	}
> +
> +	if (vma_is_io(vma)) {
> +		unsigned long pfn;
> +
> +		if (vma->vm_end - start < n_pages * PAGE_SIZE) {
> +			printk(KERN_ERR "vma is too small\n");
> +			goto cleanup;
> +		}
> +
> +		for (n = 0; n < n_pages; ++n, start += PAGE_SIZE) {
> +			ret = follow_pfn(vma, start, &pfn);
> +			if (ret) {
> +				printk(KERN_ERR "no page for address %lu\n",
> +					start);
> +				goto cleanup;
> +			}
> +			pages[n] = pfn_to_page(pfn);
> +			get_page(pages[n]);

This worries me. When the VM_PFNMAP flag is set, the memory pages are not 
backed by a struct page. Creating a struct page pointer out of it can be an 
acceptable hack (for instance to store a page in a scatterlist with 
sg_set_page() and then retrieve its physical address with sg_phys()), but you 
should not expect the struct page to be valid for anything else. Calling 
get_page() on it will likely crash.

> +		}
> +	} else {
> +		n = get_user_pages(current, current->mm, start & PAGE_MASK,
> +			n_pages, write, 1, pages, NULL);
> +		if (n != n_pages) {
> +			printk(KERN_ERR "got only %d of %d user pages\n",
> +				n, n_pages);
> +			goto cleanup;
> +		}
> +	}
> +
> +	*copy_vma = vb2_get_vma(vma);
> +	if (!*copy_vma) {
> +		printk(KERN_ERR "failed to copy vma\n");
> +		ret = -ENOMEM;
> +		goto cleanup;
> +	}

Do we really need to make a copy of the VMA ? The only reason why we store a 
pointer to it is to check the flags in vb2_dc_put_userptr(). We could store 
the flags instead and avoid the vb2_get_vma()/vb2_put_vma() calls altogether.
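
A rough sketch of that alternative (untested, with a hypothetical vm_flags
field added to struct vb2_dc_buf):

	/* in vb2_dc_get_pages(), while mmap_sem is still held: */
	buf->vm_flags = vma->vm_flags;

	/* in vb2_dc_put_userptr(), instead of looking at buf->vma: */
	vb2_dc_put_sgtable(sgt, !(buf->vm_flags & (VM_IO | VM_PFNMAP)));

The vb2_get_vma()/vb2_put_vma() pair would then no longer be needed.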

> +
> +	/* leaving critical section for mm access */
> +	up_read(&current->mm->mmap_sem);
> +
> +	return 0;
> +
> +cleanup:
> +	up_read(&current->mm->mmap_sem);
> +
> +	/* putting user pages if used, can be done without the lock */
> +	while (n)
> +		put_page(pages[--n]);
> +
> +	return ret;
> +}
> +
>  static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
> -					unsigned long size, int write)
> +	unsigned long size, int write)
>  {
>  	struct vb2_dc_buf *buf;
> -	struct vm_area_struct *vma;
> -	dma_addr_t dma_addr = 0;
> -	int ret;
> +	unsigned long start, end, offset, offset2;
> +	struct page **pages;
> +	int n_pages;
> +	int ret = 0;
> +	struct sg_table *sgt;
> +	unsigned long contig_size;
> 
>  	buf = kzalloc(sizeof *buf, GFP_KERNEL);
>  	if (!buf)
>  		return ERR_PTR(-ENOMEM);
> 
> -	ret = vb2_get_contig_userptr(vaddr, size, &vma, &dma_addr);
> +	buf->dev = alloc_ctx;
> +	buf->dma_dir = write ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
> +
> +	start = (unsigned long)vaddr & PAGE_MASK;
> +	offset = (unsigned long)vaddr & ~PAGE_MASK;
> +	end = PAGE_ALIGN((unsigned long)vaddr + size);
> +	offset2 = end - (unsigned long)vaddr - size;
> +	n_pages = (end - start) >> PAGE_SHIFT;
> +
> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
> +	if (!pages) {
> +		ret = -ENOMEM;
> +		printk(KERN_ERR "failed to allocate pages table\n");
> +		goto fail_buf;
> +	}
> +
> +	/* extract page list from userspace mapping */
> +	ret = vb2_dc_get_pages(start, pages, n_pages, &buf->vma, write);
>  	if (ret) {
> -		printk(KERN_ERR "Failed acquiring VMA for vaddr 0x%08lx\n",
> -				vaddr);
> -		kfree(buf);
> -		return ERR_PTR(ret);
> +		printk(KERN_ERR "failed to get user pages\n");
> +		goto fail_pages;
> +	}
> +
> +	sgt = vb2_dc_pages_to_sgt(pages, n_pages, offset, offset2);
> +	if (!sgt) {
> +		printk(KERN_ERR "failed to create scatterlist table\n");
> +		ret = -ENOMEM;
> +		goto fail_get_pages;
>  	}

This looks overly complex to me. You create a multi-chunk sgt out of the user 
pointer address and map it completely, and then check if it starts with a big 
enough contiguous chunk. Why don't you create an sgt with a single continuous 
chunk then ? In the VM_PFNMAP case you could check whether the area is 
contiguous when you follow the PFNs, stop at the first discontinuity, and 
create an sgt with a single element right there. You would then need to call 
vb2_dc_pages_to_sgt() in the normal case only, and stop at the first 
discontinuity as well.
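
A rough, untested sketch of that idea for the VM_PFNMAP path (hypothetical
code, not taken from the patch):

	unsigned long first_pfn, pfn;
	int i, ret;

	ret = follow_pfn(vma, start, &first_pfn);
	if (ret)
		return ret;

	/* stop at the first discontinuity in the PFN range */
	for (i = 1; i < n_pages; ++i) {
		ret = follow_pfn(vma, start + (i << PAGE_SHIFT), &pfn);
		if (ret || pfn != first_pfn + i)
			return -EFAULT;
	}

	ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
	if (ret)
		return ret;

	/*
	 * A single entry describes the whole contiguous range. The struct
	 * page pointer is only used as an address container here.
	 */
	sg_set_page(sgt->sgl, pfn_to_page(first_pfn), n_pages << PAGE_SHIFT, 0);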

> 
> +	/* pages are no longer needed */
> +	kfree(pages);
> +	pages = NULL;
> +
> +	sgt->nents = dma_map_sg(buf->dev, sgt->sgl, sgt->orig_nents,
> +		buf->dma_dir);
> +	if (sgt->nents <= 0) {
> +		printk(KERN_ERR "failed to map scatterlist\n");
> +		ret = -EIO;
> +		goto fail_sgt;
> +	}
> +
> +	contig_size = vb2_dc_get_contiguous_size(sgt);
> +	if (contig_size < size) {
> +		printk(KERN_ERR "contiguous mapping is too small %lu/%lu\n",
> +			contig_size, size);
> +		ret = -EFAULT;
> +		goto fail_map_sg;
> +	}
> +
> +	buf->dma_addr = sg_dma_address(sgt->sgl);
>  	buf->size = size;
> -	buf->dma_addr = dma_addr;
> -	buf->vma = vma;
> +	buf->dma_sgt = sgt;
> +
> +	atomic_inc(&buf->refcount);
> 
>  	return buf;
> +
> +fail_map_sg:
> +	dma_unmap_sg(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);

I think this will break in the VM_PFNMAP case on non-coherent architectures. 
arm_dma_unmap_page() will call __dma_page_dev_to_cpu() in that case, which can 
dereference struct page. As explained above, the struct page isn't valid with 
VM_PFNMAP. I haven't checked the dma_map_sg() and dma_sync_sg_*() calls, but 
chances are they might break as well.

> +
> +fail_sgt:
> +	vb2_dc_put_sgtable(sgt, 0);
> +
> +fail_get_pages:
> +	while (pages && n_pages)
> +		put_page(pages[--n_pages]);
> +	vb2_put_vma(buf->vma);
> +
> +fail_pages:
> +	kfree(pages); /* kfree is NULL-proof */
> +
> +fail_buf:
> +	kfree(buf);
> +
> +	return ERR_PTR(ret);
>  }
> 
> -static void vb2_dc_put_userptr(void *mem_priv)
> +static void vb2_dc_put_userptr(void *buf_priv)
>  {
> -	struct vb2_dc_buf *buf = mem_priv;
> -
> -	if (!buf)
> -		return;
> +	struct vb2_dc_buf *buf = buf_priv;
> +	struct sg_table *sgt = buf->dma_sgt;
> 
> +	dma_unmap_sg(buf->dev, sgt->sgl, sgt->orig_nents, buf->dma_dir);
> +	vb2_dc_put_sgtable(sgt, !vma_is_io(buf->vma));
>  	vb2_put_vma(buf->vma);
>  	kfree(buf);
>  }
> @@ -168,6 +496,8 @@ const struct vb2_mem_ops vb2_dma_contig_memops = {
>  	.mmap		= vb2_dc_mmap,
>  	.get_userptr	= vb2_dc_get_userptr,
>  	.put_userptr	= vb2_dc_put_userptr,
> +	.prepare	= vb2_dc_prepare,
> +	.finish		= vb2_dc_finish,
>  	.num_users	= vb2_dc_num_users,
>  };
>  EXPORT_SYMBOL_GPL(vb2_dma_contig_memops);
Tomasz Stanislawski March 22, 2012, 1:36 p.m. UTC | #2
Hi Laurent,
Thank you very much for your comments and questions.
They were very useful.
Please refer to the comments below.

On 03/22/2012 11:50 AM, Laurent Pinchart wrote:
> Hi Tomasz,
> 
> Thanks for the patch.
> 
> On Thursday 22 March 2012 11:02:23 Laurent Pinchart wrote:
>> From: Tomasz Stanislawski <t.stanislaws@samsung.com>
>>
>> This patch combines updates and fixes to the dma-contig allocator.
>> Moreover, the allocator code was refactored.
>> The most important changes are:
>> - reordered functions
>> - moved compression of the scatterlist to a separate function
>> - added support for multi-chunk but contiguous scatterlists
>> - simplified the implementation of the vb2-dma-contig context structure
>> - let the mmap method use dma_mmap_writecombine
>> - added support for scatterlists in userptr mode
>>
>> Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
>> 	[mmap method]
>> Signed-off-by: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
>> 	[scatterlist in userptr mode]
>> Signed-off-by: Kamil Debski <k.debski@samsung.com>
>> 	[bugfixing]
>> Signed-off-by: Tomasz Stanislawski <t.stanislaws@samsung.com>
>> 	[core refactoring, helper functions]
>> Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
>> ---
>>  drivers/media/video/videobuf2-dma-contig.c |  400 +++++++++++++++++++++++++---
>>  1 files changed, 365 insertions(+), 35 deletions(-)
>>
>> diff --git a/drivers/media/video/videobuf2-dma-contig.c b/drivers/media/video/videobuf2-dma-contig.c
>> index c898e6f..9965465 100644
>> --- a/drivers/media/video/videobuf2-dma-contig.c
>> +++ b/drivers/media/video/videobuf2-dma-contig.c
>> @@ -10,9 +10,12 @@
>>   * the Free Software Foundation.
>>   */
>>
>> +#include <linux/dma-buf.h>
>> +#include <linux/dma-mapping.h>
>>  #include <linux/module.h>
>> +#include <linux/scatterlist.h>
>> +#include <linux/sched.h>
>>  #include <linux/slab.h>
>> -#include <linux/dma-mapping.h>
>>
>>  #include <media/videobuf2-core.h>
>>  #include <media/videobuf2-memops.h>
>> @@ -22,16 +25,115 @@ struct vb2_dc_buf {
>>  	void				*vaddr;
>>  	unsigned long			size;
>>  	dma_addr_t			dma_addr;
>> +	struct sg_table			*dma_sgt;
>> +	enum dma_data_direction		dma_dir;
>>
>>  	/* MMAP related */
>>  	struct vb2_vmarea_handler	handler;
>>  	atomic_t			refcount;
>> +	struct sg_table			*sgt_base;
>>
>>  	/* USERPTR related */
>>  	struct vm_area_struct		*vma;
>>  };
>>
>>  /*********************************************/
>> +/*        scatterlist table functions        */
>> +/*********************************************/
>> +
>> +static struct sg_table *vb2_dc_pages_to_sgt(struct page **pages,
>> +	unsigned long n_pages, size_t offset, size_t offset2)
>> +{
>> +	struct sg_table *sgt;
>> +	int i, j; /* loop counters */
>> +	int cur_page, chunks;
>> +	int ret;
>> +	struct scatterlist *s;
>> +
>> +	sgt = kzalloc(sizeof *sgt, GFP_KERNEL);
>> +	if (!sgt)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	/* compute number of chunks */
>> +	chunks = 1;
>> +	for (i = 1; i < n_pages; ++i)
>> +		if (pages[i] != pages[i - 1] + 1)
>> +			++chunks;
>> +
>> +	ret = sg_alloc_table(sgt, chunks, GFP_KERNEL);
>> +	if (ret) {
>> +		kfree(sgt);
>> +		return ERR_PTR(-ENOMEM);
>> +	}
>> +
>> +	/* merging chunks and putting them into the scatterlist */
>> +	cur_page = 0;
>> +	for_each_sg(sgt->sgl, s, sgt->orig_nents, i) {
>> +		size_t size = PAGE_SIZE;
>> +
>> +		for (j = cur_page + 1; j < n_pages; ++j) {
>> +			if (pages[j] != pages[j - 1] + 1)
>> +				break;
>> +			size += PAGE_SIZE;
>> +		}
>> +
>> +		/* cut offset if chunk starts at the first page */
>> +		if (cur_page == 0)
>> +			size -= offset;
>> +		/* cut offset2 if chunk ends at the last page */
>> +		if (j == n_pages)
>> +			size -= offset2;
>> +
>> +		sg_set_page(s, pages[cur_page], size, offset);
>> +		offset = 0;
>> +		cur_page = j;
>> +	}
>> +
>> +	return sgt;
>> +}
>> +
>> +static void vb2_dc_release_sgtable(struct sg_table *sgt)
>> +{
>> +	sg_free_table(sgt);
>> +	kfree(sgt);
>> +}
>> +
>> +static void vb2_dc_put_sgtable(struct sg_table *sgt, int dirty)
>> +{
>> +	struct scatterlist *s;
>> +	int i, j;
>> +
>> +	for_each_sg(sgt->sgl, s, sgt->nents, i) {
>> +		struct page *page = sg_page(s);
>> +		int n_pages = PAGE_ALIGN(s->offset + s->length) >> PAGE_SHIFT;
>> +
>> +		for (j = 0; j < n_pages; ++j, ++page) {
>> +			if (dirty)
>> +				set_page_dirty_lock(page);
>> +			put_page(page);
>> +		}
>> +	}
>> +
>> +	vb2_dc_release_sgtable(sgt);
>> +}
>> +
>> +static unsigned long vb2_dc_get_contiguous_size(struct sg_table *sgt)
>> +{
>> +	struct scatterlist *s;
>> +	dma_addr_t expected = sg_dma_address(sgt->sgl);
>> +	int i;
>> +	unsigned long size = 0;
>> +
>> +	for_each_sg(sgt->sgl, s, sgt->nents, i) {
>> +		if (sg_dma_address(s) != expected)
>> +			break;
>> +		expected = sg_dma_address(s) + sg_dma_len(s);
>> +		size += sg_dma_len(s);
>> +	}
>> +	return size;
>> +}
>> +
>> +/*********************************************/
>>  /*         callbacks for all buffers         */
>>  /*********************************************/
>>
>> @@ -45,8 +147,6 @@ static void *vb2_dc_cookie(void *buf_priv)
>>  static void *vb2_dc_vaddr(void *buf_priv)
>>  {
>>  	struct vb2_dc_buf *buf = buf_priv;
>> -	if (!buf)
>> -		return 0;
>>
>>  	return buf->vaddr;
>>  }
>> @@ -58,6 +158,28 @@ static unsigned int vb2_dc_num_users(void *buf_priv)
>>  	return atomic_read(&buf->refcount);
>>  }
>>
>> +static void vb2_dc_prepare(void *buf_priv)
>> +{
>> +	struct vb2_dc_buf *buf = buf_priv;
>> +	struct sg_table *sgt = buf->dma_sgt;
>> +
>> +	if (!sgt)
>> +		return;
>> +
>> +	dma_sync_sg_for_device(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
>> +}
>> +
>> +static void vb2_dc_finish(void *buf_priv)
>> +{
>> +	struct vb2_dc_buf *buf = buf_priv;
>> +	struct sg_table *sgt = buf->dma_sgt;
>> +
>> +	if (!sgt)
>> +		return;
>> +
>> +	dma_sync_sg_for_cpu(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
>> +}
>> +
>>  /*********************************************/
>>  /*        callbacks for MMAP buffers         */
>>  /*********************************************/
>> @@ -66,31 +188,70 @@ static void vb2_dc_put(void *buf_priv)
>>  {
>>  	struct vb2_dc_buf *buf = buf_priv;
>>
>> -	if (atomic_dec_and_test(&buf->refcount)) {
>> -		dma_free_coherent(buf->dev, buf->size, buf->vaddr,
>> -				  buf->dma_addr);
>> -		kfree(buf);
>> -	}
>> +	if (!atomic_dec_and_test(&buf->refcount))
>> +		return;
>> +
>> +	vb2_dc_release_sgtable(buf->sgt_base);
>> +	dma_free_coherent(buf->dev, buf->size, buf->vaddr,
>> +		buf->dma_addr);
>> +	kfree(buf);
>>  }
>>
>>  static void *vb2_dc_alloc(void *alloc_ctx, unsigned long size)
>>  {
>>  	struct device *dev = alloc_ctx;
>>  	struct vb2_dc_buf *buf;
>> +	int ret;
>> +	int n_pages;
>> +	struct page **pages = NULL;
>>
>>  	buf = kzalloc(sizeof *buf, GFP_KERNEL);
>>  	if (!buf)
>>  		return ERR_PTR(-ENOMEM);
>>
>> -	buf->vaddr = dma_alloc_coherent(dev, size, &buf->dma_addr, GFP_KERNEL);
>> +	buf->dev = dev;
>> +	buf->size = size;
>> +	buf->vaddr = dma_alloc_coherent(buf->dev, buf->size, &buf->dma_addr,
>> +		GFP_KERNEL);
>> +
>> +	ret = -ENOMEM;
>>  	if (!buf->vaddr) {
>> -		dev_err(dev, "dma_alloc_coherent of size %ld failed\n", size);
>> -		kfree(buf);
>> -		return ERR_PTR(-ENOMEM);
>> +		dev_err(dev, "dma_alloc_coherent of size %ld failed\n",
>> +			size);
>> +		goto fail_buf;
>>  	}
>>
>> -	buf->dev = dev;
>> -	buf->size = size;
>> +	WARN_ON((unsigned long)buf->vaddr & ~PAGE_MASK);
>> +	WARN_ON(buf->dma_addr & ~PAGE_MASK);
>> +
>> +	n_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
>> +
>> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
>> +	if (!pages) {
>> +		printk(KERN_ERR "failed to alloc page table\n");
>> +		goto fail_dma;
>> +	}
>> +
>> +	ret = dma_get_pages(dev, buf->vaddr, buf->dma_addr, pages, n_pages);
> 
> As the only purpose of this is to retrieve a list of pages that will be used 
> to create a single-entry sgt, wouldn't it be possible to shortcut the code and 
> get the physical address of the buffer directly ?
> 

Physical addresses should not be used since they are meaningless in the context of
different devices. It seems that only the list of pages is more-or-less portable
between different drivers.

The physical address is already present in buf->dma_addr, but it is only valid
if the device has no MMU. Notice that vb2-dma-contig has no knowledge of whether
an MMU is present for a given device.

The sg list is not going to be single-entry if the device is provided with its own MMU.


>> +	if (ret < 0) {
>> +		printk(KERN_ERR "failed to get buffer pages from DMA API\n");
>> +		goto fail_pages;
>> +	}
>> +	if (ret != n_pages) {
>> +		ret = -EFAULT;
>> +		printk(KERN_ERR "failed to get all pages from DMA API\n");
>> +		goto fail_pages;
>> +	}
>> +
>> +	buf->sgt_base = vb2_dc_pages_to_sgt(pages, n_pages, 0, 0);
>> +	if (IS_ERR(buf->sgt_base)) {
>> +		ret = PTR_ERR(buf->sgt_base);
>> +		printk(KERN_ERR "failed to prepare sg table\n");
>> +		goto fail_pages;
>> +	}
> 
> buf->sgt_base isn't used in this patch. I would move the buf->sgt_base 
> creation code to the patch that uses it then, or to its own patch just before 
> the patch that uses it.

Good point. The sgt_base is used by the exporter only. Thanks for noticing it.

> 
>> +
>> +	/* pages are no longer needed */
>> +	kfree(pages);
>>
>>  	buf->handler.refcount = &buf->refcount;
>>  	buf->handler.put = vb2_dc_put;
>> @@ -99,59 +260,226 @@ static void *vb2_dc_alloc(void *alloc_ctx, unsigned long size)
>>  	atomic_inc(&buf->refcount);
>>
>>  	return buf;
>> +
>> +fail_pages:
>> +	kfree(pages);
>> +
>> +fail_dma:
>> +	dma_free_coherent(buf->dev, buf->size, buf->vaddr, buf->dma_addr);
>> +
>> +fail_buf:
>> +	kfree(buf);
>> +
>> +	return ERR_PTR(ret);
>>  }
>>
>>  static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma)
>>  {
>>  	struct vb2_dc_buf *buf = buf_priv;
>> +	int ret;
>> +
>> +	/*
>> +	 * dma_mmap_* uses vm_pgoff as in-buffer offset, but we want to
>> +	 * map whole buffer
>> +	 */
>> +	vma->vm_pgoff = 0;
>> +
>> +	ret = dma_mmap_writecombine(buf->dev, vma, buf->vaddr,
>> +		buf->dma_addr, buf->size);
>>
>> -	if (!buf) {
>> -		printk(KERN_ERR "No buffer to map\n");
>> -		return -EINVAL;
>> +	if (ret) {
>> +		printk(KERN_ERR "Remapping memory failed, error: %d\n", ret);
>> +		return ret;
>>  	}
>>
>> -	return vb2_mmap_pfn_range(vma, buf->dma_addr, buf->size,
>> -				  &vb2_common_vm_ops, &buf->handler);
>> +	vma->vm_flags		|= VM_DONTEXPAND | VM_RESERVED;
>> +	vma->vm_private_data	= &buf->handler;
>> +	vma->vm_ops		= &vb2_common_vm_ops;
>> +
>> +	vma->vm_ops->open(vma);
>> +
>> +	printk(KERN_DEBUG "%s: mapped dma addr 0x%08lx at 0x%08lx, size %ld\n",
>> +		__func__, (unsigned long)buf->dma_addr, vma->vm_start,
>> +		buf->size);
>> +
>> +	return 0;
>>  }
>>
>>  /*********************************************/
>>  /*       callbacks for USERPTR buffers       */
>>  /*********************************************/
>>
>> +static inline int vma_is_io(struct vm_area_struct *vma)
>> +{
>> +	return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
> 
> Isn't VM_PFNMAP enough ? Wouldn't it be possible (at least in theory) to get a 
> discontinuous physical range with VM_IO ?
> 

Frankly, I found that in get_user_pages the flags are checked against
(VM_IO | VM_PFNMAP). Probably in the noMMU (not no-IOMMU) case it is possible
to get a vma with VM_IO on and VM_PFNMAP off, isn't it?

The problem is that this framework should work in both cases, so this
check was added just in case :).

>> +}
>> +
>> +static int vb2_dc_get_pages(unsigned long start, struct page **pages,
>> +	int n_pages, struct vm_area_struct **copy_vma, int write)
>> +{
>> +	struct vm_area_struct *vma;
>> +	int n = 0; /* number of get pages */
>> +	int ret = -EFAULT;
>> +
>> +	/* entering critical section for mm access */
>> +	down_read(&current->mm->mmap_sem);
> 
> This will generate AB-BA deadlock warnings if lockdep is enabled. This 
> function is called with the queue lock held, and the mmap() handler which 
> takes the queue lock is called with current->mm->mmap_sem held.
> 
> This is a known issue with videobuf2, not specific to this patch. The warning 
> is usually a false positive (which we still need to fix, as it worries users), 
> but can become a real issue if an MMAP queue and a USERPTR queue are created 
> by a driver with the same queue lock.
> 

Good point. Do you know any good solution to this problem?

>> +	vma = find_vma(current->mm, start);
>> +	if (!vma) {
>> +		printk(KERN_ERR "no vma for address %lu\n", start);
>> +		goto cleanup;
>> +	}
>> +
>> +	if (vma_is_io(vma)) {
>> +		unsigned long pfn;
>> +
>> +		if (vma->vm_end - start < n_pages * PAGE_SIZE) {
>> +			printk(KERN_ERR "vma is too small\n");
>> +			goto cleanup;
>> +		}
>> +
>> +		for (n = 0; n < n_pages; ++n, start += PAGE_SIZE) {
>> +			ret = follow_pfn(vma, start, &pfn);
>> +			if (ret) {
>> +				printk(KERN_ERR "no page for address %lu\n",
>> +					start);
>> +				goto cleanup;
>> +			}
>> +			pages[n] = pfn_to_page(pfn);
>> +			get_page(pages[n]);
> 
> This worries me. When the VM_PFNMAP flag is set, the memory pages are not 
> backed by a struct page. Creating a struct page pointer out of it can be an 
> acceptable hack (for instance to store a page in a scatterlist with 
> sg_set_page() and then retrieve its physical address with sg_phys()), but you 
> should not expect the struct page to be valid for anything else. Calling 
> get_page() on it will likely crash.
> 

You are completely right. This is the corner case where a list of pages is
not a portable way of describing the memory.
Maybe pfn_valid should be used to check the validity of the page (pfn)
before getting it?
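
Something like this, perhaps (untested; note that pfn_valid() only guarantees
that a struct page exists for the PFN, not that taking a reference to it is
safe):

	ret = follow_pfn(vma, start, &pfn);
	if (ret) {
		printk(KERN_ERR "no page for address %lu\n", start);
		goto cleanup;
	}
	/* refuse PFNs that are not backed by a struct page */
	if (!pfn_valid(pfn)) {
		ret = -EINVAL;
		goto cleanup;
	}
	pages[n] = pfn_to_page(pfn);
	get_page(pages[n]);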

>> +		}
>> +	} else {
>> +		n = get_user_pages(current, current->mm, start & PAGE_MASK,
>> +			n_pages, write, 1, pages, NULL);
>> +		if (n != n_pages) {
>> +			printk(KERN_ERR "got only %d of %d user pages\n",
>> +				n, n_pages);
>> +			goto cleanup;
>> +		}
>> +	}
>> +
>> +	*copy_vma = vb2_get_vma(vma);
>> +	if (!*copy_vma) {
>> +		printk(KERN_ERR "failed to copy vma\n");
>> +		ret = -ENOMEM;
>> +		goto cleanup;
>> +	}
> 
> Do we really need to make a copy of the VMA ? The only reason why we store a 
> pointer to it is to check the flags in vb2_dc_put_userptr(). We could store 
> the flags instead and avoid the vb2_get_vma()/vb2_put_vma() calls altogether.
> 

I remember that there was a very good reason for copying this vma structure.
You caught me on 'cargo-cult' programming.
I will do some reverse engineering and try to answer it soon.

>> +
>> +	/* leaving critical section for mm access */
>> +	up_read(&current->mm->mmap_sem);
>> +
>> +	return 0;
>> +
>> +cleanup:
>> +	up_read(&current->mm->mmap_sem);
>> +
>> +	/* putting user pages if used, can be done without the lock */
>> +	while (n)
>> +		put_page(pages[--n]);
>> +
>> +	return ret;
>> +}
>> +
>>  static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
>> -					unsigned long size, int write)
>> +	unsigned long size, int write)
>>  {
>>  	struct vb2_dc_buf *buf;
>> -	struct vm_area_struct *vma;
>> -	dma_addr_t dma_addr = 0;
>> -	int ret;
>> +	unsigned long start, end, offset, offset2;
>> +	struct page **pages;
>> +	int n_pages;
>> +	int ret = 0;
>> +	struct sg_table *sgt;
>> +	unsigned long contig_size;
>>
>>  	buf = kzalloc(sizeof *buf, GFP_KERNEL);
>>  	if (!buf)
>>  		return ERR_PTR(-ENOMEM);
>>
>> -	ret = vb2_get_contig_userptr(vaddr, size, &vma, &dma_addr);
>> +	buf->dev = alloc_ctx;
>> +	buf->dma_dir = write ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
>> +
>> +	start = (unsigned long)vaddr & PAGE_MASK;
>> +	offset = (unsigned long)vaddr & ~PAGE_MASK;
>> +	end = PAGE_ALIGN((unsigned long)vaddr + size);
>> +	offset2 = end - (unsigned long)vaddr - size;
>> +	n_pages = (end - start) >> PAGE_SHIFT;
>> +
>> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
>> +	if (!pages) {
>> +		ret = -ENOMEM;
>> +		printk(KERN_ERR "failed to allocate pages table\n");
>> +		goto fail_buf;
>> +	}
>> +
>> +	/* extract page list from userspace mapping */
>> +	ret = vb2_dc_get_pages(start, pages, n_pages, &buf->vma, write);
>>  	if (ret) {
>> -		printk(KERN_ERR "Failed acquiring VMA for vaddr 0x%08lx\n",
>> -				vaddr);
>> -		kfree(buf);
>> -		return ERR_PTR(ret);
>> +		printk(KERN_ERR "failed to get user pages\n");
>> +		goto fail_pages;
>> +	}
>> +
>> +	sgt = vb2_dc_pages_to_sgt(pages, n_pages, offset, offset2);
>> +	if (!sgt) {
>> +		printk(KERN_ERR "failed to create scatterlist table\n");
>> +		ret = -ENOMEM;
>> +		goto fail_get_pages;
>>  	}
> 
> This looks overly complex to me. You create a multi-chunk sgt out of the user 
> pointer address and map it completely, and then check if it starts with a big 
> enough contiguous chunk.

Notice that vb2_dc_pages_to_sgt does compress contiguous ranges of pfns (pages),
so if the memory is contiguous, then a single-chunk sglist is produced.
The memory used to store the page list is just temporary. It is freed after the
sglist is created.

> Why don't you create an sgt with a single continuous
> chunk then ? In the VM_PFNMAP case you could check whether the area is
> contiguous when you follow the PFNs, stop at the first discontinuity, and 
> create an sgt with a single element right there. You would then need to call
> vb2_dc_pages_to_sgt() in the normal case only, and stop at the first
> discontinuity as well.

Discontinuity of pfns is not a problem if a device has its own IOMMU. vb2-dma-contig
cannot know whether mapping this multi-chunk sglist will succeed until it calls
dma_map_sg and checks the result.

Why bother if both VM_PFNMAP and non-VM_PFNMAP are handled in the same way
after the list of pages is obtained? Treating them the same way allows code reuse
and simplifies the program flow.

The DMA framework does not provide any way to force a single-chunk mapping in sg.
If the device is capable of mapping discontiguous pages into a single chunk,
the DMA framework will probably merge the pages. The documentation
encourages merging the list, but it is not obligatory.

The reason is that if 'struct scatterlist' contains no dma_length field,
which is controlled by the CONFIG_NEED_SG_DMA_LENGTH macro, then the length field
is used instead. Chunk merging cannot be done in such a case.
This is the reason why I look for the longest contiguous block.
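
For reference, sg_dma_len() is defined along these lines in
include/linux/scatterlist.h:

	#ifdef CONFIG_NEED_SG_DMA_LENGTH
	#define sg_dma_len(sg)		((sg)->dma_length)
	#else
	#define sg_dma_len(sg)		((sg)->length)
	#endif

so on architectures without the dma_length field, merging entries in place
would overwrite the original segment lengths.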

> 
>>
>> +	/* pages are no longer needed */
>> +	kfree(pages);
>> +	pages = NULL;
>> +
>> +	sgt->nents = dma_map_sg(buf->dev, sgt->sgl, sgt->orig_nents,
>> +		buf->dma_dir);
>> +	if (sgt->nents <= 0) {
>> +		printk(KERN_ERR "failed to map scatterlist\n");
>> +		ret = -EIO;
>> +		goto fail_sgt;
>> +	}
>> +
>> +	contig_size = vb2_dc_get_contiguous_size(sgt);
>> +	if (contig_size < size) {
>> +		printk(KERN_ERR "contiguous mapping is too small %lu/%lu\n",
>> +			contig_size, size);
>> +		ret = -EFAULT;
>> +		goto fail_map_sg;
>> +	}
>> +
>> +	buf->dma_addr = sg_dma_address(sgt->sgl);
>>  	buf->size = size;
>> -	buf->dma_addr = dma_addr;
>> -	buf->vma = vma;
>> +	buf->dma_sgt = sgt;
>> +
>> +	atomic_inc(&buf->refcount);
>>
>>  	return buf;
>> +
>> +fail_map_sg:
>> +	dma_unmap_sg(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
> 
> I think this will break in the VM_PFNMAP case on non-coherent
> architectures. arm_dma_unmap_page() will call __dma_page_dev_to_cpu() in
> that case, which can dereference struct page. As explained above, the
> struct page isn't valid with VM_PFNMAP. I haven't checked the dma_map_sg()
> and dma_sync_sg_*() calls, but chances are they might break as well.
> 

It will crash as long as it is true that there is no struct page behind the given pfn.
In practice, I found that VM_PFNMAP means that one cannot assume that there is
a 'struct page' behind the PFNs of a given mapping. Those struct pages really
exist for all our drivers. Anyway, I agree that using those pages is a hack.

It could be avoided if vb2_dc_get_pages returned a list of PFNs. Anyway, those PFNs
have to be transformed to pages to create an sglist, and those pointers might be
accessed somewhere deep inside dma_map_sg internals.

A quite good solution would be dropping support for VM_PFNMAP mappings, since
they cannot be handled reliably.

>> +
>> +fail_sgt:
>> +	vb2_dc_put_sgtable(sgt, 0);
>> +
>> +fail_get_pages:
>> +	while (pages && n_pages)
>> +		put_page(pages[--n_pages]);
>> +	vb2_put_vma(buf->vma);
>> +
>> +fail_pages:
>> +	kfree(pages); /* kfree is NULL-proof */
>> +
>> +fail_buf:
>> +	kfree(buf);
>> +
>> +	return ERR_PTR(ret);
>>  }
>>
>> -static void vb2_dc_put_userptr(void *mem_priv)
>> +static void vb2_dc_put_userptr(void *buf_priv)
>>  {
>> -	struct vb2_dc_buf *buf = mem_priv;
>> -
>> -	if (!buf)
>> -		return;
>> +	struct vb2_dc_buf *buf = buf_priv;
>> +	struct sg_table *sgt = buf->dma_sgt;
>>
>> +	dma_unmap_sg(buf->dev, sgt->sgl, sgt->orig_nents, buf->dma_dir);
>> +	vb2_dc_put_sgtable(sgt, !vma_is_io(buf->vma));
>>  	vb2_put_vma(buf->vma);
>>  	kfree(buf);
>>  }
>> @@ -168,6 +496,8 @@ const struct vb2_mem_ops vb2_dma_contig_memops = {
>>  	.mmap		= vb2_dc_mmap,
>>  	.get_userptr	= vb2_dc_get_userptr,
>>  	.put_userptr	= vb2_dc_put_userptr,
>> +	.prepare	= vb2_dc_prepare,
>> +	.finish		= vb2_dc_finish,
>>  	.num_users	= vb2_dc_num_users,
>>  };
>>  EXPORT_SYMBOL_GPL(vb2_dma_contig_memops);
> 

Regards,
Tomasz Stanislawski
Laurent Pinchart March 22, 2012, 2:42 p.m. UTC | #3
Hi Tomasz,

On Thursday 22 March 2012 14:36:33 Tomasz Stanislawski wrote:
> Hi Laurent,
> Thank you very much for your comments and questions.
> They were very useful.

You're welcome.

> Please refer to the comments below.
> 
> On 03/22/2012 11:50 AM, Laurent Pinchart wrote:
> > On Thursday 22 March 2012 11:02:23 Laurent Pinchart wrote:
> >> From: Tomasz Stanislawski <t.stanislaws@samsung.com>
> >> 
> >> This patch combines updates and fixes to the dma-contig allocator.
> >> Moreover, the allocator code was refactored.
> >> The most important changes are:
> >> - reordered functions
> >> - moved compression of the scatterlist to a separate function
> >> - added support for multi-chunk but contiguous scatterlists
> >> - simplified the implementation of the vb2-dma-contig context structure
> >> - let the mmap method use dma_mmap_writecombine
> >> - added support for scatterlists in userptr mode

[snip]

> >> diff --git a/drivers/media/video/videobuf2-dma-contig.c b/drivers/media/video/videobuf2-dma-contig.c
> >> index c898e6f..9965465 100644
> >> --- a/drivers/media/video/videobuf2-dma-contig.c
> >> +++ b/drivers/media/video/videobuf2-dma-contig.c

[snip]

> >>  static void *vb2_dc_alloc(void *alloc_ctx, unsigned long size)
> >>  {
> >>  
> >>  	struct device *dev = alloc_ctx;
> >>  	struct vb2_dc_buf *buf;
> >> 
> >> +	int ret;
> >> +	int n_pages;
> >> +	struct page **pages = NULL;
> >> 
> >>  	buf = kzalloc(sizeof *buf, GFP_KERNEL);
> >>  	if (!buf)
> >>  	
> >>  		return ERR_PTR(-ENOMEM);
> >> 
> >> -	buf->vaddr = dma_alloc_coherent(dev, size, &buf->dma_addr, GFP_KERNEL);
> >> +	buf->dev = dev;
> >> +	buf->size = size;
> >> +	buf->vaddr = dma_alloc_coherent(buf->dev, buf->size, &buf->dma_addr,
> >> +		GFP_KERNEL);
> >> +
> >> +	ret = -ENOMEM;
> >> 
> >>  	if (!buf->vaddr) {
> >> 
> >> -		dev_err(dev, "dma_alloc_coherent of size %ld failed\n", size);
> >> -		kfree(buf);
> >> -		return ERR_PTR(-ENOMEM);
> >> +		dev_err(dev, "dma_alloc_coherent of size %ld failed\n",
> >> +			size);
> >> +		goto fail_buf;
> >> 
> >>  	}
> >> 
> >> -	buf->dev = dev;
> >> -	buf->size = size;
> >> +	WARN_ON((unsigned long)buf->vaddr & ~PAGE_MASK);
> >> +	WARN_ON(buf->dma_addr & ~PAGE_MASK);
> >> +
> >> +	n_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
> >> +
> >> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
> >> +	if (!pages) {
> >> +		printk(KERN_ERR "failed to alloc page table\n");
> >> +		goto fail_dma;
> >> +	}
> >> +
> >> +	ret = dma_get_pages(dev, buf->vaddr, buf->dma_addr, pages, n_pages);
> > 
> > As the only purpose of this is to retrieve a list of pages that will be
> > used to create a single-entry sgt, wouldn't it be possible to shortcut
> > the code and get the physical address of the buffer directly ?
> 
> Physical addresses should not be used since they are meaningless in the
> context of different devices. It seems that only the list of pages is
> more-or-less portable between different drivers.

The pages are physically contiguous. The physical address of the first page is 
thus all you need.

struct page and physical addresses can be used interchangeably in this case if 
I'm not mistaken. If you want to go with pages, you could use the first page 
only instead of the physical buffer address.

> The physical address is already present in buf->dma_addr, but it is only
> valid if the device has no MMU. Notice that vb2-dma-contig has no knowledge
> of whether an MMU is present for a given device.

That's why buf->dma_addr can't be considered as a physical address. It's only 
useful in the device context.

> The sg list is not going to be single-entry if the device is provided with
> its own MMU.

There's something I don't get then. vb2-dma-contig deals with physically 
contiguous buffers. The buffer is backed by physically contiguous pages, so 
the sg list should have a single entry.

> >> +	if (ret < 0) {
> >> +		printk(KERN_ERR "failed to get buffer pages from DMA API\n");
> >> +		goto fail_pages;
> >> +	}
> >> +	if (ret != n_pages) {
> >> +		ret = -EFAULT;
> >> +		printk(KERN_ERR "failed to get all pages from DMA API\n");
> >> +		goto fail_pages;
> >> +	}
> >> +
> >> +	buf->sgt_base = vb2_dc_pages_to_sgt(pages, n_pages, 0, 0);
> >> +	if (IS_ERR(buf->sgt_base)) {
> >> +		ret = PTR_ERR(buf->sgt_base);
> >> +		printk(KERN_ERR "failed to prepare sg table\n");
> >> +		goto fail_pages;
> >> +	}
> > 
> > buf->sgt_base isn't used in this patch. I would move the buf->sgt_base
> > creation code to the patch that uses it then, or to its own patch just
> > before the patch that uses it.
> 
> Good point. The sgt_base is used by the exporter only. Thanks for noticing it.
> 
> >> +
> >> +	/* pages are no longer needed */
> >> +	kfree(pages);
> >> 
> >>  	buf->handler.refcount = &buf->refcount;
> >>  	buf->handler.put = vb2_dc_put;

[snip]

> >>  /*********************************************/
> >>  /*       callbacks for USERPTR buffers       */
> >>  /*********************************************/
> >> 
> >> +static inline int vma_is_io(struct vm_area_struct *vma)
> >> +{
> >> +	return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
> > 
> > Isn't VM_PFNMAP enough ? Wouldn't it be possible (at least in theory) to
> > get a discontinuous physical range with VM_IO ?
> 
> Frankly, I found that in get_user_pages the flags are checked against
> (VM_IO | VM_PFNMAP). Probably in the noMMU (not no-IOMMU) case it is possible
> to get a vma with VM_IO on and VM_PFNMAP off, isn't it?
> 
> The problem is that this framework should work in both cases, so this
> check was added just in case :).

OK. We can leave it here and deal with problems if they arise :-)

> >> +}
> >> +
> >> +static int vb2_dc_get_pages(unsigned long start, struct page **pages,
> >> +	int n_pages, struct vm_area_struct **copy_vma, int write)
> >> +{
> >> +	struct vm_area_struct *vma;
> >> +	int n = 0; /* number of get pages */
> >> +	int ret = -EFAULT;
> >> +
> >> +	/* entering critical section for mm access */
> >> +	down_read(&current->mm->mmap_sem);
> > 
> > This will generate AB-BA deadlock warnings if lockdep is enabled. This
> > function is called with the queue lock held, and the mmap() handler which
> > takes the queue lock is called with current->mm->mmap_sem held.
> > 
> > This is a known issue with videobuf2, not specific to this patch. The
> > warning is usually a false positive (which we still need to fix, as it
> > worries users), but can become a real issue if an MMAP queue and a
> > USERPTR queue are created by a driver with the same queue lock.
> 
> Good point. Do you know any good solution to this problem?

http://patchwork.linuxtv.org/patch/8455/

It seems QBUF is safe, but PREPAREBUF isn't (both call __buf_prepare, which 
ends up calling the memops get_userptr operation).

I'll post a patch to fix it for PREPAREBUF. If I'm not mistaken, you can drop 
the down_read/up_read here.

> >> +	vma = find_vma(current->mm, start);
> >> +	if (!vma) {
> >> +		printk(KERN_ERR "no vma for address %lu\n", start);
> >> +		goto cleanup;
> >> +	}
> >> +
> >> +	if (vma_is_io(vma)) {
> >> +		unsigned long pfn;
> >> +
> >> +		if (vma->vm_end - start < n_pages * PAGE_SIZE) {
> >> +			printk(KERN_ERR "vma is too small\n");
> >> +			goto cleanup;
> >> +		}
> >> +
> >> +		for (n = 0; n < n_pages; ++n, start += PAGE_SIZE) {
> >> +			ret = follow_pfn(vma, start, &pfn);
> >> +			if (ret) {
> >> +				printk(KERN_ERR "no page for address %lu\n",
> >> +					start);
> >> +				goto cleanup;
> >> +			}
> >> +			pages[n] = pfn_to_page(pfn);
> >> +			get_page(pages[n]);
> > 
> > This worries me. When the VM_PFNMAP flag is set, the memory pages are not
> > backed by a struct page. Creating a struct page pointer out of it can be
> > an acceptable hack (for instance to store a page in a scatterlist with
> > sg_set_page() and then retrieve its physical address with sg_phys()), but
> > you should not expect the struct page to be valid for anything else.
> > Calling get_page() on it will likely crash.
> 
> You are completely right. This is the corner case where a list of pages is
> not a portable way of describing the memory.
> Maybe pfn_valid should be used to check the validity of the page (pfn)
> before getting it?

I think you should just drop the get_page() call. There's no page, so there's 
no need to get a reference count to it.

The VM_PFNMAP flag is mostly used with memory out of the kernel allocator's 
control if I'm not mistaken. The main use case I've seen is memory reserved at 
boot time and used as a frame buffer for instance. In that case the pages can't 
go away, as there is no page in the first place.
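
In other words, the io branch would become something like this sketch (with a
matching change on the put path):

	pages[n] = pfn_to_page(pfn);
	/* no get_page(): PFNMAP memory has no refcounted struct page */

The put_page() loop in vb2_dc_put_sgtable() would then need an equivalent
vma_is_io() guard, so that references are only dropped for pages obtained
through get_user_pages().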

This won't fix the DMA SG problem though (see below).

> >> +		}
> >> +	} else {
> >> +		n = get_user_pages(current, current->mm, start & PAGE_MASK,
> >> +			n_pages, write, 1, pages, NULL);
> >> +		if (n != n_pages) {
> >> +			printk(KERN_ERR "got only %d of %d user pages\n",
> >> +				n, n_pages);
> >> +			goto cleanup;
> >> +		}
> >> +	}
> >> +
> >> +	*copy_vma = vb2_get_vma(vma);
> >> +	if (!*copy_vma) {
> >> +		printk(KERN_ERR "failed to copy vma\n");
> >> +		ret = -ENOMEM;
> >> +		goto cleanup;
> >> +	}
> > 
> > Do we really need to make a copy of the VMA ? The only reason why we store
> > a pointer to it is to check the flags in vb2_dc_put_userptr(). We could
> > store the flags instead and avoid the vb2_get_vma()/vb2_put_vma() calls
> > altogether.
>
> I remember that there was a very good reason for copying this vma structure.
> You caught me on 'cargo-cult' programming.
> I will do some reverse engineering and try to answer it soon.

OK :-) I'm not copying the VMA in the OMAP3 ISP driver, which is why this 
caught my eye. If you find the reason why copying it is needed, please add a 
comment to the code.
 
> >> +
> >> +	/* leaving critical section for mm access */
> >> +	up_read(&current->mm->mmap_sem);
> >> +
> >> +	return 0;
> >> +
> >> +cleanup:
> >> +	up_read(&current->mm->mmap_sem);
> >> +
> >> +	/* putting user pages if used, can be done without the lock */
> >> +	while (n)
> >> +		put_page(pages[--n]);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> 
> >>  static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
> >> -					unsigned long size, int write)
> >> +	unsigned long size, int write)
> >>  {
> >>  
> >>  	struct vb2_dc_buf *buf;
> >> 
> >> -	struct vm_area_struct *vma;
> >> -	dma_addr_t dma_addr = 0;
> >> -	int ret;
> >> +	unsigned long start, end, offset, offset2;
> >> +	struct page **pages;
> >> +	int n_pages;
> >> +	int ret = 0;
> >> +	struct sg_table *sgt;
> >> +	unsigned long contig_size;
> >> 
> >>  	buf = kzalloc(sizeof *buf, GFP_KERNEL);
> >>  	if (!buf)
> >>  	
> >>  		return ERR_PTR(-ENOMEM);
> >> 
> >> -	ret = vb2_get_contig_userptr(vaddr, size, &vma, &dma_addr);
> >> +	buf->dev = alloc_ctx;
> >> +	buf->dma_dir = write ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
> >> +
> >> +	start = (unsigned long)vaddr & PAGE_MASK;
> >> +	offset = (unsigned long)vaddr & ~PAGE_MASK;
> >> +	end = PAGE_ALIGN((unsigned long)vaddr + size);
> >> +	offset2 = end - (unsigned long)vaddr - size;
> >> +	n_pages = (end - start) >> PAGE_SHIFT;
> >> +
> >> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
> >> +	if (!pages) {
> >> +		ret = -ENOMEM;
> >> +		printk(KERN_ERR "failed to allocate pages table\n");
> >> +		goto fail_buf;
> >> +	}
> >> +
> >> +	/* extract page list from userspace mapping */
> >> +	ret = vb2_dc_get_pages(start, pages, n_pages, &buf->vma, write);
> >> 
> >>  	if (ret) {
> >> 
> >> -		printk(KERN_ERR "Failed acquiring VMA for vaddr 0x%08lx\n",
> >> -				vaddr);
> >> -		kfree(buf);
> >> -		return ERR_PTR(ret);
> >> +		printk(KERN_ERR "failed to get user pages\n");
> >> +		goto fail_pages;
> >> +	}
> >> +
> >> +	sgt = vb2_dc_pages_to_sgt(pages, n_pages, offset, offset2);
> >> +	if (!sgt) {
> >> +		printk(KERN_ERR "failed to create scatterlist table\n");
> >> +		ret = -ENOMEM;
> >> +		goto fail_get_pages;
> >> 
> >>  	}
> > 
> > This looks overly complex to me. You create a multi-chunk sgt out of the
> > user pointer address and map it completely, and then check if it starts
> > with a big enough contiguous chunk.
> 
> Notice that vb2_dc_pages_to_sgt does compress contiguous ranges of pfns
> (pages), so if the memory is contiguous, then a single-chunk sglist is
> produced. The memory used to store the page list is just temporary. It is
> freed after the sglist is created.

That's exactly my point. The memory needs to be contiguous to be usable. If it 
isn't, vb2-dma-contig will only use the first contiguous chunk. We could thus 
simplify the code by hardcoding the single-chunk assumption. vb2-dma-contig 
would walk the user pages list (or the PFNs, depending on the VMA flags) and 
stop at the first discontinuity. It would then create a single-entry sg list 
and operate on that, without mapping or otherwise touching the rest of the 
VMA, which is unusable to the device anyway.

> > Why don't you create an sgt with a single continuous
> > chunk then ? In the VM_PFNMAP case you could check whether the area is
> > contiguous when you follow the PFNs, stop at the first discontinuity, and
> > create an sgt with a single element right there. You would then need to
> > call vb2_dc_pages_to_sgt() in the normal case only, and stop at the first
> > discontinuity as well.
> 
> Discontinuity of pfns is not a problem if a device has its own IOMMU.
> vb2-dma-contig cannot know whether mapping this multi-chunk sglist will
> succeed until it calls dma_map_sg and checks the result.

If the device has an IOMMU it won't need contiguous memory. Shouldn't it then 
use vb2-dma-sg instead ?

> Why bother if both VM_PFNMAP and non-VM_PFNMAP are handled in the same
> way after the list of pages is obtained? Treating them the same way allows
> code reuse and simplifies the program flow.
> 
> The DMA framework does not provide any way to force a single-chunk mapping in
> sg. If the device is capable of mapping discontiguous pages into a single
> chunk, the DMA framework will probably merge the pages. The documentation
> encourages merging the list, but it is not obligatory.
> 
> The reason is that if 'struct scatterlist' contains no dma_length field,
> which is controlled by the CONFIG_NEED_SG_DMA_LENGTH macro, then the length
> field is used instead. Chunk merging cannot be done in such a case.
> This is the reason why I look for the longest contiguous block.
> 
> >> +	/* pages are no longer needed */
> >> +	kfree(pages);
> >> +	pages = NULL;
> >> +
> >> +	sgt->nents = dma_map_sg(buf->dev, sgt->sgl, sgt->orig_nents,
> >> +		buf->dma_dir);
> >> +	if (sgt->nents <= 0) {
> >> +		printk(KERN_ERR "failed to map scatterlist\n");
> >> +		ret = -EIO;
> >> +		goto fail_sgt;
> >> +	}
> >> +
> >> +	contig_size = vb2_dc_get_contiguous_size(sgt);
> >> +	if (contig_size < size) {
> >> +		printk(KERN_ERR "contiguous mapping is too small %lu/%lu\n",
> >> +			contig_size, size);
> >> +		ret = -EFAULT;
> >> +		goto fail_map_sg;
> >> +	}
> >> +
> >> +	buf->dma_addr = sg_dma_address(sgt->sgl);
> >> 
> >>  	buf->size = size;
> >> 
> >> -	buf->dma_addr = dma_addr;
> >> -	buf->vma = vma;
> >> +	buf->dma_sgt = sgt;
> >> +
> >> +	atomic_inc(&buf->refcount);
> >> 
> >>  	return buf;
> >> 
> >> +
> >> +fail_map_sg:
> >> +	dma_unmap_sg(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
> > 
> > I think this will break in the VM_PFNMAP case on non-coherent
> > architectures. arm_dma_unmap_page() will call __dma_page_dev_to_cpu() in
> > that case, which can dereference struct page. As explained above, the
> > struct page isn't valid with VM_PFNMAP. I haven't checked the dma_map_sg()
> > and dma_sync_sg_*() calls, but chances are they might break as well.
> 
> It will crash as long as it is true that there is no struct page behind the
> given pfn. In practice, I found that VM_PFNMAP means that one cannot assume
> that there is a 'struct page' behind the PFNs of a given mapping. Those
> struct pages really exist for all our drivers. Anyway, I agree that using
> those pages is a hack.

They don't exist for the memory used as a frame buffer on the OMAP3 (or at 
least didn't exist in the N900 and N9, I haven't checked since). This could 
become just a bad distant memory when drivers use CMA.

> It could be avoided if vb2_dc_get_pages returned a list of PFNs. Anyway,
> those PFNs have to be transformed to pages to create an sglist, and those
> pointers might be accessed somewhere deep inside dma_map_sg internals.
> 
> A quite good solution would be dropping support for VM_PFNMAP mappings,
> since they cannot be handled reliably.

We should either drop VM_PFNMAP support or fix the DMA SG mapping API to 
support VM_PFNMAP-style memory. I would vote for the former, as that's way 
simpler and we have no VM_PFNMAP use case right now.

> >> +
> >> +fail_sgt:
> >> +	vb2_dc_put_sgtable(sgt, 0);
> >> +
> >> +fail_get_pages:
> >> +	while (pages && n_pages)
> >> +		put_page(pages[--n_pages]);
> >> +	vb2_put_vma(buf->vma);
> >> +
> >> +fail_pages:
> >> +	kfree(pages); /* kfree is NULL-proof */
> >> +
> >> +fail_buf:
> >> +	kfree(buf);
> >> +
> >> +	return ERR_PTR(ret);
> >> 
> >>  }
Subash Patel March 22, 2012, 2:52 p.m. UTC | #4
Hi Laurent,

On 03/22/2012 08:12 PM, Laurent Pinchart wrote:
> Hi Tomasz,
>
> On Thursday 22 March 2012 14:36:33 Tomasz Stanislawski wrote:
>> Hi Laurent,
>> Thank you very much for your comments and question.
>> They were very useful.
>
> You're welcome.
>
>> Please refer to the comments below.
>>
>> On 03/22/2012 11:50 AM, Laurent Pinchart wrote:
>>> On Thursday 22 March 2012 11:02:23 Laurent Pinchart wrote:
>>>> From: Tomasz Stanislawski<t.stanislaws@samsung.com>
>>>>
>>>> This patch combines updates and fixes to dma-contig allocator.
>>>> Moreover the allocator code was refactored.
>>>> The most important changes are:
>>>> - functions were reordered
>>>> - move compression of scatterlist to separete function
>>>> - add support for multichunk but contiguous scatterlists
>>>> - simplified implementation of vb2-dma-contig context structure
>>>> - let mmap method to use dma_mmap_writecombine
>>>> - add support for scatterlist in userptr mode
>
> [snip]
>
>>>> diff --git a/drivers/media/video/videobuf2-dma-contig.c
>>>> b/drivers/media/video/videobuf2-dma-contig.c index c898e6f..9965465
>>>> 100644
>>>> --- a/drivers/media/video/videobuf2-dma-contig.c
>>>> +++ b/drivers/media/video/videobuf2-dma-contig.c
>
> [snip]
>
>>>>   static void *vb2_dc_alloc(void *alloc_ctx, unsigned long size)
>>>>   {
>>>>
>>>>   	struct device *dev = alloc_ctx;
>>>>   	struct vb2_dc_buf *buf;
>>>>
>>>> +	int ret;
>>>> +	int n_pages;
>>>> +	struct page **pages = NULL;
>>>>
>>>>   	buf = kzalloc(sizeof *buf, GFP_KERNEL);
>>>>   	if (!buf)
>>>>   	
>>>>   		return ERR_PTR(-ENOMEM);
>>>>
>>>> -	buf->vaddr = dma_alloc_coherent(dev, size, &buf->dma_addr,
> GFP_KERNEL);
>>>> +	buf->dev = dev;
>>>> +	buf->size = size;
>>>> +	buf->vaddr = dma_alloc_coherent(buf->dev, buf->size, &buf->dma_addr,
>>>> +		GFP_KERNEL);
>>>> +
>>>> +	ret = -ENOMEM;
>>>>
>>>>   	if (!buf->vaddr) {
>>>>
>>>> -		dev_err(dev, "dma_alloc_coherent of size %ld failed\n", size);
>>>> -		kfree(buf);
>>>> -		return ERR_PTR(-ENOMEM);
>>>> +		dev_err(dev, "dma_alloc_coherent of size %ld failed\n",
>>>> +			size);
>>>> +		goto fail_buf;
>>>>
>>>>   	}
>>>>
>>>> -	buf->dev = dev;
>>>> -	buf->size = size;
>>>> +	WARN_ON((unsigned long)buf->vaddr & ~PAGE_MASK);
>>>> +	WARN_ON(buf->dma_addr & ~PAGE_MASK);
>>>> +
>>>> +	n_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
>>>> +
>>>> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
>>>> +	if (!pages) {
>>>> +		printk(KERN_ERR "failed to alloc page table\n");
>>>> +		goto fail_dma;
>>>> +	}
>>>> +
>>>> +	ret = dma_get_pages(dev, buf->vaddr, buf->dma_addr, pages, n_pages);
>>>
>>> As the only purpose of this is to retrieve a list of pages that will be
>>> used to create a single-entry sgt, wouldn't it be possible to shortcut
>>> the code and get the physical address of the buffer directly ?
>>
>> The physical address should not be used since it is meaningless in the
>> context of different devices. It seems that only the list of pages is
>> more-or-less portable between different drivers.
>
> The pages are physically contiguous. The physical address of the first page is
> thus all you need.
>
> struct page and physical addresses can be used interchangeably in this case if
> I'm not mistaken. If you want to go with pages, you could use the first page
> only instead of the physical buffer address.
>
>> The physical address is already present in buf->dma_addr, but it is only
>> valid if the device has no MMU. Notice that vb2-dma-contig possesses no
>> knowledge of whether an MMU is present for a given device.
>
> That's why buf->dma_addr can't be considered as a physical address. It's only
> useful in the device context.
>
>> The sg list is not going to be single-entry if the device is provided with
>> its own MMU.
>
> There's something I don't get then. vb2-dma-contig deals with physically
> contiguous buffers. The buffer is backed by physically contiguous pages, so
> the sg list should have a single entry.
>
I think at present vb2-dma-contig is abused for any kind of memory 
allocation (contiguous or not). Wouldn't it be good to have a properly 
working setup for videobuf2-dma-sg instead? A driver that needs 
contiguous memory shall assign vb2_queue->mem_ops = vb2_dma_contig_memops. 
Devices which know they have an MMU backing can assign 
vb2_dma_sg_memops instead. But as of now, we try to use 
vb2_dma_contig_memops for all kinds of operations. I have also made this 
mistake, and wish I had repaired it and posted it before :(
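
To illustrate the split I mean, here is a rough, untested sketch of a
driver's queue setup (my_init_vb2_queue and my_vb2_ops are made-up names):

#include <media/videobuf2-core.h>
#include <media/videobuf2-dma-contig.h>

/* Rough sketch: the driver picks the allocator once, at queue init
 * time. A device that needs one contiguous chunk takes
 * vb2_dma_contig_memops; a device whose DMA engine can handle a
 * scatter-gather list would take vb2_dma_sg_memops instead. */
static int my_init_vb2_queue(struct vb2_queue *q, void *drv_priv)
{
	q->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
	q->io_modes = VB2_MMAP | VB2_USERPTR;
	q->drv_priv = drv_priv;
	q->buf_struct_size = sizeof(struct vb2_buffer);
	q->mem_ops = &vb2_dma_contig_memops;
	/* q->ops = &my_vb2_ops; -- driver callbacks, omitted here */
	return vb2_queue_init(q);
}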

>>>> +	if (ret < 0) {
>>>> +		printk(KERN_ERR "failed to get buffer pages from DMA API\n");
>>>> +		goto fail_pages;
>>>> +	}
>>>> +	if (ret != n_pages) {
>>>> +		ret = -EFAULT;
>>>> +		printk(KERN_ERR "failed to get all pages from DMA API\n");
>>>> +		goto fail_pages;
>>>> +	}
>>>> +
>>>> +	buf->sgt_base = vb2_dc_pages_to_sgt(pages, n_pages, 0, 0);
>>>> +	if (IS_ERR(buf->sgt_base)) {
>>>> +		ret = PTR_ERR(buf->sgt_base);
>>>> +		printk(KERN_ERR "failed to prepare sg table\n");
>>>> +		goto fail_pages;
>>>> +	}
>>>
>>> buf->sgt_base isn't used in this patch. I would move the buf->sgt_base
>>> creation code to the patch that uses it then, or to its own patch just
>>> before the patch that uses it.
>>
>> Good point. The sgt_base is used by exporter only. Thanks for noticing it.
>>
>>>> +
>>>> +	/* pages are no longer needed */
>>>> +	kfree(pages);
>>>>
>>>>   	buf->handler.refcount = &buf->refcount;
>>>>   	buf->handler.put = vb2_dc_put;
>
> [snip]
>
>>>>   /*********************************************/
>>>>   /*       callbacks for USERPTR buffers       */
>>>>   /*********************************************/
>>>>
>>>> +static inline int vma_is_io(struct vm_area_struct *vma)
>>>> +{
>>>> +	return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
>>>
>>> Isn't VM_PFNMAP enough ? Wouldn't it be possible (at least in theory) to
>>> get a discontinuous physical range with VM_IO ?
>>
>> Frankly, I found that in get_user_pages the flags are checked against
>> (VM_IO | VM_PFNMAP). Probably in the noMMU (not no-IOMMU) case it is
>> possible to get a vma with VM_IO on and VM_PFNMAP off, isn't it?
>>
>> The problem is that this framework should work in both cases so this
>> check was added just in case :).
>
> OK. We can leave it here and deal with problems if they arise :-)
>
>>>> +}
>>>> +
>>>> +static int vb2_dc_get_pages(unsigned long start, struct page **pages,
>>>> +	int n_pages, struct vm_area_struct **copy_vma, int write)
>>>> +{
>>>> +	struct vm_area_struct *vma;
>>>> +	int n = 0; /* number of get pages */
>>>> +	int ret = -EFAULT;
>>>> +
>>>> +	/* entering critical section for mm access */
>>>> +	down_read(&current->mm->mmap_sem);
>>>
>>> This will generate AB-BA deadlock warnings if lockdep is enabled. This
>>> function is called with the queue lock held, and the mmap() handler which
>>> takes the queue lock is called with current->mm->mmap_sem held.
>>>
>>> This is a known issue with videobuf2, not specific to this patch. The
>>> warning is usually a false positive (which we still need to fix, as it
>>> worries users), but can become a real issue if an MMAP queue and a
>>> USERPTR queue are created by a driver with the same queue lock.
>>
>> Good point. Do you know any good solution to this problem?
>
> http://patchwork.linuxtv.org/patch/8455/
>
> It seems QBUF is safe, but PREPAREBUF isn't (both call __buf_prepare, which
> ends up calling the memops get_userptr operation).
>
> I'll post a patch to fix it for PREPAREBUF. If I'm not mistaken, you can drop
> the down_read/up_read here.
>
>>>> +	vma = find_vma(current->mm, start);
>>>> +	if (!vma) {
>>>> +		printk(KERN_ERR "no vma for address %lu\n", start);
>>>> +		goto cleanup;
>>>> +	}
>>>> +
>>>> +	if (vma_is_io(vma)) {
>>>> +		unsigned long pfn;
>>>> +
>>>> +		if (vma->vm_end - start < n_pages * PAGE_SIZE) {
>>>> +			printk(KERN_ERR "vma is too small\n");
>>>> +			goto cleanup;
>>>> +		}
>>>> +
>>>> +		for (n = 0; n < n_pages; ++n, start += PAGE_SIZE) {
>>>> +			ret = follow_pfn(vma, start, &pfn);
>>>> +			if (ret) {
>>>> +				printk(KERN_ERR "no page for address %lu\n",
>>>> +					start);
>>>> +				goto cleanup;
>>>> +			}
>>>> +			pages[n] = pfn_to_page(pfn);
>>>> +			get_page(pages[n]);
>>>
>>> This worries me. When the VM_PFNMAP flag is set, the memory pages are not
>>> backed by a struct page. Creating a struct page pointer out of it can be
>>> an acceptable hack (for instance to store a page in an scatterlist with
>>> sg_set_page() and then retrieve its physical address with sg_phys()), but
>>> you should not expect the struct page to be valid for anything else.
>>> Calling get_page() on it will likely crash.
>>
>> You are completely right. This is the corner case where a list of pages is
>> not a portable way of describing the memory.
>> Maybe pfn_valid should be used to check validity of the page (pfn)
>> before getting it?
>
> I think you should just drop the get_page() call. There's no page, so there's
> no need to get a reference count to it.
>
> The VM_PFNMAP flag is mostly used with memory out of the kernel allocator's
> control if I'm not mistaken. The main use case I've seen is memory reserved
> at boot time and used as a frame buffer for instance. In that case the pages
> can't go away, as there is no page in the first place.
>
> This won't fix the DMA SG problem though (see below).
>
>>>> +		}
>>>> +	} else {
>>>> +		n = get_user_pages(current, current->mm, start & PAGE_MASK,
>>>> +			n_pages, write, 1, pages, NULL);
>>>> +		if (n != n_pages) {
>>>> +			printk(KERN_ERR "got only %d of %d user pages\n",
>>>> +				n, n_pages);
>>>> +			goto cleanup;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	*copy_vma = vb2_get_vma(vma);
>>>> +	if (!*copy_vma) {
>>>> +		printk(KERN_ERR "failed to copy vma\n");
>>>> +		ret = -ENOMEM;
>>>> +		goto cleanup;
>>>> +	}
>>>
>>> Do we really need to make a copy of the VMA ? The only reason why we store
>>> a pointer to it is to check the flags in vb2_dc_put_userptr(). We could
>>> store the flags instead and avoid vb2_get_dma()/vb2_put_dma() calls
>>> altogether.
>>
>> I remember that there was a very good reason for copying this vma structure.
>> You caught me on 'cargo-cult' programming.
>> I will do some reverse engineering and try to answer it soon.
>
> OK :-) I'm not copying the VMA in the OMAP3 ISP driver, which is why this
> caught my eyes. If you find the reason why copying it is needed, please add a
> comment to the code.
>
>>>> +
>>>> +	/* leaving critical section for mm access */
>>>> +	up_read(&current->mm->mmap_sem);
>>>> +
>>>> +	return 0;
>>>> +
>>>> +cleanup:
>>>> +	up_read(&current->mm->mmap_sem);
>>>> +
>>>> +	/* putting user pages if used, can be done without the lock */
>>>> +	while (n)
>>>> +		put_page(pages[--n]);
>>>> +
>>>> +	return ret;
>>>> +}
>>>> +
>>>>
>>>>   static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
>>>> -					unsigned long size, int write)
>>>> +	unsigned long size, int write)
>>>>   {
>>>>
>>>>   	struct vb2_dc_buf *buf;
>>>>
>>>> -	struct vm_area_struct *vma;
>>>> -	dma_addr_t dma_addr = 0;
>>>> -	int ret;
>>>> +	unsigned long start, end, offset, offset2;
>>>> +	struct page **pages;
>>>> +	int n_pages;
>>>> +	int ret = 0;
>>>> +	struct sg_table *sgt;
>>>> +	unsigned long contig_size;
>>>>
>>>>   	buf = kzalloc(sizeof *buf, GFP_KERNEL);
>>>>   	if (!buf)
>>>>   	
>>>>   		return ERR_PTR(-ENOMEM);
>>>>
>>>> -	ret = vb2_get_contig_userptr(vaddr, size, &vma, &dma_addr);
>>>> +	buf->dev = alloc_ctx;
>>>> +	buf->dma_dir = write ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
>>>> +
>>>> +	start = (unsigned long)vaddr & PAGE_MASK;
>>>> +	offset = (unsigned long)vaddr & ~PAGE_MASK;
>>>> +	end = PAGE_ALIGN((unsigned long)vaddr + size);
>>>> +	offset2 = end - (unsigned long)vaddr - size;
>>>> +	n_pages = (end - start) >> PAGE_SHIFT;
>>>> +
>>>> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
>>>> +	if (!pages) {
>>>> +		ret = -ENOMEM;
>>>> +		printk(KERN_ERR "failed to allocate pages table\n");
>>>> +		goto fail_buf;
>>>> +	}
>>>> +
>>>> +	/* extract page list from userspace mapping */
>>>> +	ret = vb2_dc_get_pages(start, pages, n_pages, &buf->vma, write);
>>>>
>>>>   	if (ret) {
>>>>
>>>> -		printk(KERN_ERR "Failed acquiring VMA for vaddr 0x%08lx\n",
>>>> -				vaddr);
>>>> -		kfree(buf);
>>>> -		return ERR_PTR(ret);
>>>> +		printk(KERN_ERR "failed to get user pages\n");
>>>> +		goto fail_pages;
>>>> +	}
>>>> +
>>>> +	sgt = vb2_dc_pages_to_sgt(pages, n_pages, offset, offset2);
>>>> +	if (!sgt) {
>>>> +		printk(KERN_ERR "failed to create scatterlist table\n");
>>>> +		ret = -ENOMEM;
>>>> +		goto fail_get_pages;
>>>>
>>>>   	}
>>>
>>> This looks overly complex to me. You create a multi-chunk sgt out of the
>>> user pointer address and map it completely, and then check if it starts
>>> with a big enough contiguous chunk.
>>
>> Notice that vb2_dc_pages_to_sgt does compress contiguous ranges of pfns
>> (pages). So if the memory is contiguous, then a single-chunk sglist is
>> produced. The memory used to store the pages list is just temporary; it is
>> freed after the sglist is created.
>
> That's exactly my point. The memory needs to be contiguous to be usable. If it
> isn't, vb2-dma-contig will only use the first contiguous chunk. We could thus
> simplify the code by hardcoding the single-chunk assumption. vb2-dma-contig
> would walk user user pages list (or the PFN, depending on the VMA flags) and
> stop at the first discontinuity. It would then create a single-entry sg list
> and operate on that, without mapping or otherwise touching the rest of the
> VMA, which is unusable to the device anyway.
>
>>> Why don't you create an sgt with a single continuous
>>> chunk then ? In the VM_PFNMAP case you could check whether the area is
>>> contiguous when you follow the PFNs, stop at the first discontinuity, and
>>> create an sgt with a single element right there. You would then need to
>>> call vb2_dc_pages_to_sgt() in the normal case only, and stop at the first
>>> discontinuity as well.
>>
>> Discontinuity of pfns is not a problem if a device has its own IOMMU.
>> vb2-dma-contig cannot know whether mapping this multi-chunk sglist will
>> succeed until it calls dma_map_sg and checks the result.
>
> If the device has an IOMMU it won't need contiguous memory. Shouldn't it then
> use vb2-dma-sg instead ?
>
>> Why bother if both VM_PFNMAP and non-VM_PFNMAP are handled in the same
>> way after the list of pages is obtained? Treating them the same way allows
>> code reuse and simplifies the program flow.
>>
>> The DMA framework does not provide any way to force a single-chunk mapping
>> in the sg list. If the device is capable of mapping discontinuous pages
>> into a single chunk, the DMA framework will probably merge the pages. The
>> documentation encourages merging the list, but it is not obligatory.
>>
>> The reason is that if 'struct scatterlist' contains no dma_length field,
>> which is controlled by the CONFIG_NEED_SG_DMA_LENGTH macro, then the
>> length field is used instead. Chunk merging cannot be done in such a case.
>> This is why I look for the longest contiguous block.
>>
>>>> +	/* pages are no longer needed */
>>>> +	kfree(pages);
>>>> +	pages = NULL;
>>>> +
>>>> +	sgt->nents = dma_map_sg(buf->dev, sgt->sgl, sgt->orig_nents,
>>>> +		buf->dma_dir);
>>>> +	if (sgt->nents <= 0) {
>>>> +		printk(KERN_ERR "failed to map scatterlist\n");
>>>> +		ret = -EIO;
>>>> +		goto fail_sgt;
>>>> +	}
>>>> +
>>>> +	contig_size = vb2_dc_get_contiguous_size(sgt);
>>>> +	if (contig_size < size) {
>>>> +		printk(KERN_ERR "contiguous mapping is too small %lu/%lu\n",
>>>> +			contig_size, size);
>>>> +		ret = -EFAULT;
>>>> +		goto fail_map_sg;
>>>> +	}
>>>> +
>>>> +	buf->dma_addr = sg_dma_address(sgt->sgl);
>>>>
>>>>   	buf->size = size;
>>>>
>>>> -	buf->dma_addr = dma_addr;
>>>> -	buf->vma = vma;
>>>> +	buf->dma_sgt = sgt;
>>>> +
>>>> +	atomic_inc(&buf->refcount);
>>>>
>>>>   	return buf;
>>>>
>>>> +
>>>> +fail_map_sg:
>>>> +	dma_unmap_sg(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
>>>
>>> I think this will break in the VM_PFNMAP case on non-coherent
>>> architectures. arm_dma_unmap_page() will call __dma_page_dev_to_cpu() in
>>> that case, which can dereference struct page. As explained above, the
>>> struct page isn't valid with VM_PFNMAP. I haven't checked the dma_map_sg()
>>> and dma_sync_sg_*() calls, but chances are they might break as well.
>>
>> It will crash as long as it is true that there is no struct page behind a
>> given pfn. In practice, I found that VM_PFNMAP means that one cannot assume
>> that there is a 'struct page' behind the PFNs of a given mapping. Those
>> struct pages really exist for all our drivers. Anyway, I agree that using
>> those pages is a hack.
>
> They don't exist for the memory used as frame buffer on the OMAP3 (or at least
> didn't exist in the N900 and N9, I haven't checked since). This could become
> just a bad distant memory when drivers will use CMA.
>
>> It could be avoided if vb2_dc_get_pages returned a list of PFNs. Anyway,
>> those PFNs have to be transformed to pages to create an sglist. Those
>> pointers might be accessed somewhere deep inside dma_map_sg internals.
>>
>> A quite good solution would be to drop support for VM_PFNMAP mappings
>> since they cannot be handled reliably.
>
> We should either drop VM_PFNMAP support or fix the DMA SG mapping API to
> support VM_PFNMAP-style memory. I would vote for the former, as that's way
> simpler and we have no VM_PFNMAP use case right now.
>
>>>> +
>>>> +fail_sgt:
>>>> +	vb2_dc_put_sgtable(sgt, 0);
>>>> +
>>>> +fail_get_pages:
>>>> +	while (pages && n_pages)
>>>> +		put_page(pages[--n_pages]);
>>>> +	vb2_put_vma(buf->vma);
>>>> +
>>>> +fail_pages:
>>>> +	kfree(pages); /* kfree is NULL-proof */
>>>> +
>>>> +fail_buf:
>>>> +	kfree(buf);
>>>> +
>>>> +	return ERR_PTR(ret);
>>>>
>>>>   }
>
Regards,
Subash
Tomasz Stanislawski March 22, 2012, 3:58 p.m. UTC | #5
Hi Laurent,

On 03/22/2012 03:42 PM, Laurent Pinchart wrote:
> Hi Tomasz,
> 
> On Thursday 22 March 2012 14:36:33 Tomasz Stanislawski wrote:
>> Hi Laurent,
>> Thank you very much for your comments and question.
>> They were very useful.
> 
> You're welcome.
> 
>> Please refer to the comments below.
>>
>> On 03/22/2012 11:50 AM, Laurent Pinchart wrote:
>>> On Thursday 22 March 2012 11:02:23 Laurent Pinchart wrote:
>>>> From: Tomasz Stanislawski <t.stanislaws@samsung.com>
>>>>
>>>> This patch combines updates and fixes to dma-contig allocator.
>>>> Moreover the allocator code was refactored.
>>>> The most important changes are:
>>>> - functions were reordered
>>>> - move compression of scatterlist to separete function
>>>> - add support for multichunk but contiguous scatterlists
>>>> - simplified implementation of vb2-dma-contig context structure
>>>> - let mmap method to use dma_mmap_writecombine
>>>> - add support for scatterlist in userptr mode
> 
> [snip]
> 
>>>> diff --git a/drivers/media/video/videobuf2-dma-contig.c
>>>> b/drivers/media/video/videobuf2-dma-contig.c index c898e6f..9965465
>>>> 100644
>>>> --- a/drivers/media/video/videobuf2-dma-contig.c
>>>> +++ b/drivers/media/video/videobuf2-dma-contig.c
> 
> [snip]
> 
>>>>  static void *vb2_dc_alloc(void *alloc_ctx, unsigned long size)
>>>>  {
>>>>  
>>>>  	struct device *dev = alloc_ctx;
>>>>  	struct vb2_dc_buf *buf;
>>>>
>>>> +	int ret;
>>>> +	int n_pages;
>>>>
>>>>  	buf = kzalloc(sizeof *buf, GFP_KERNEL);
>>>>  	if (!buf)
>>>>  	
>>>>  		return ERR_PTR(-ENOMEM);
>>>>
>>>> -	buf->vaddr = dma_alloc_coherent(dev, size, &buf->dma_addr, 
> GFP_KERNEL);
>>>> +	buf->dev = dev;
>>>> +	buf->size = size;
>>>> +	buf->vaddr = dma_alloc_coherent(buf->dev, buf->size, &buf->dma_addr,
>>>> +		GFP_KERNEL);
>>>> +
>>>> +	ret = -ENOMEM;
>>>>
>>>>  	if (!buf->vaddr) {
>>>>
>>>> -		dev_err(dev, "dma_alloc_coherent of size %ld failed\n", size);
>>>> -		kfree(buf);
>>>> -		return ERR_PTR(-ENOMEM);
>>>> +		dev_err(dev, "dma_alloc_coherent of size %ld failed\n",
>>>> +			size);
>>>> +		goto fail_buf;
>>>>
>>>>  	}
>>>>
>>>> -	buf->dev = dev;
>>>> -	buf->size = size;
>>>> +	WARN_ON((unsigned long)buf->vaddr & ~PAGE_MASK);
>>>> +	WARN_ON(buf->dma_addr & ~PAGE_MASK);
>>>> +
>>>> +	n_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
>>>> +
>>>> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
>>>> +	if (!pages) {
>>>> +		printk(KERN_ERR "failed to alloc page table\n");
>>>> +		goto fail_dma;
>>>> +	}
>>>> +
>>>> +	ret = dma_get_pages(dev, buf->vaddr, buf->dma_addr, pages, n_pages);
>>>
>>> As the only purpose of this is to retrieve a list of pages that will be
>>> used to create a single-entry sgt, wouldn't it be possible to shortcut
>>> the code and get the physical address of the buffer directly ?
>>
>> The physical address should not be used since it is meaningless in the
>> context of different devices. It seems that only the list of pages is
>> more-or-less portable between different drivers.
> 
> The pages are physically contiguous. The physical address of the first page is 
> thus all you need.

No. DMA-CONTIG buffers do not have to be physically contiguous. Please see below.

> 
> struct page and physical addresses can be used interchangeably in this case if 
> I'm not mistaken. If you want to go with pages, you could use the first page 
> only instead of the physical buffer address.
> 

OK. There are bus addresses, physical addresses, DMA addresses and PFNs.
As I understand it, PFNs and 'struct page' can be interchanged, at least in
one direction. The PFNs are used to create a bus address, I mean an address
that is recognized by a RAM chip. So a list of PFNs seems to be the most
portable way of describing the memory, isn't it?
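
Just to make the conversions explicit, a minimal sketch (valid only for
PFNs that are backed by a memmap entry, i.e. when pfn_valid() holds;
pfn_relations is a made-up name):

#include <linux/mm.h>
#include <linux/pfn.h>

/* Minimal sketch: pfn <-> struct page <-> physical address. */
static phys_addr_t pfn_relations(unsigned long pfn)
{
	struct page *page = pfn_to_page(pfn);	/* pfn -> struct page */

	WARN_ON(page_to_pfn(page) != pfn);	/* struct page -> pfn */
	return PFN_PHYS(pfn);			/* pfn -> physical address */
}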

>> The physical address is already present in buf->dma_addr, but it is only
>> valid if the device has no MMU. Notice that vb2-dma-contig possesses no
>> knowledge of whether an MMU is present for a given device.
> 
> That's why buf->dma_addr can't be considered as a physical address. It's only 
> useful in the device context.
> 

ok

>> The sg list is not going to be single-entry if the device is provided with
>> its own MMU.
> 
> There's something I don't get then. vb2-dma-contig deals with physically 
> contiguous buffers. The buffer is backed by physically contiguous pages, so 
> the sg list should have a single entry.
> 

As I understand it, dma-contig deals with DMA-contiguous buffers, meaning
buffers that are contiguous from the device's point of view. Therefore those
buffers do NOT have to be physically contiguous if the device has its own
IOMMU.

>>>> +	if (ret < 0) {
>>>> +		printk(KERN_ERR "failed to get buffer pages from DMA API\n");
>>>> +		goto fail_pages;
>>>> +	}
>>>> +	if (ret != n_pages) {
>>>> +		ret = -EFAULT;
>>>> +		printk(KERN_ERR "failed to get all pages from DMA API\n");
>>>> +		goto fail_pages;
>>>> +	}
>>>> +
>>>> +	buf->sgt_base = vb2_dc_pages_to_sgt(pages, n_pages, 0, 0);
>>>> +	if (IS_ERR(buf->sgt_base)) {
>>>> +		ret = PTR_ERR(buf->sgt_base);
>>>> +		printk(KERN_ERR "failed to prepare sg table\n");
>>>> +		goto fail_pages;
>>>> +	}
>>>
>>> buf->sgt_base isn't used in this patch. I would move the buf->sgt_base
>>> creation code to the patch that uses it then, or to its own patch just
>>> before the patch that uses it.
>>
>> Good point. The sgt_base is used by exporter only. Thanks for noticing it.
>>
>>>> +
>>>> +	/* pages are no longer needed */
>>>> +	kfree(pages);
>>>>
>>>>  	buf->handler.refcount = &buf->refcount;
>>>>  	buf->handler.put = vb2_dc_put;
> 
> [snip]
> 
>>>>  /*********************************************/
>>>>  /*       callbacks for USERPTR buffers       */
>>>>  /*********************************************/
>>>>
>>>> +static inline int vma_is_io(struct vm_area_struct *vma)
>>>> +{
>>>> +	return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
>>>
>>> Isn't VM_PFNMAP enough ? Wouldn't it be possible (at least in theory) to
>>> get a discontinuous physical range with VM_IO ?
>>
>> Frankly, I found that in get_user_pages the flags are checked against
>> (VM_IO | VM_PFNMAP). Probably in the noMMU (not no-IOMMU) case it is
>> possible to get a vma with VM_IO on and VM_PFNMAP off, isn't it?
>>
>> The problem is that this framework should work in both cases so this
>> check was added just in case :).
> 
> OK. We can leave it here and deal with problems if they arise :-)
> 
>>>> +}
>>>> +
>>>> +static int vb2_dc_get_pages(unsigned long start, struct page **pages,
>>>> +	int n_pages, struct vm_area_struct **copy_vma, int write)
>>>> +{
>>>> +	struct vm_area_struct *vma;
>>>> +	int n = 0; /* number of get pages */
>>>> +	int ret = -EFAULT;
>>>> +
>>>> +	/* entering critical section for mm access */
>>>> +	down_read(&current->mm->mmap_sem);
>>>
>>> This will generate AB-BA deadlock warnings if lockdep is enabled. This
>>> function is called with the queue lock held, and the mmap() handler which
>>> takes the queue lock is called with current->mm->mmap_sem held.
>>>
>>> This is a known issue with videobuf2, not specific to this patch. The
>>> warning is usually a false positive (which we still need to fix, as it
>>> worries users), but can become a real issue if an MMAP queue and a
>>> USERPTR queue are created by a driver with the same queue lock.
>>
>> Good point. Do you know any good solution to this problem?
> 
> http://patchwork.linuxtv.org/patch/8455/
> 
> It seems QBUF is safe, but PREPAREBUF isn't (both call __buf_prepare, which
> ends up calling the memops get_userptr operation).
> 
> I'll post a patch to fix it for PREPAREBUF. If I'm not mistaken, you can drop 
> the down_read/up_read here.
> 

ok. Thanks for the link.

>>>> +	vma = find_vma(current->mm, start);
>>>> +	if (!vma) {
>>>> +		printk(KERN_ERR "no vma for address %lu\n", start);
>>>> +		goto cleanup;
>>>> +	}
>>>> +
>>>> +	if (vma_is_io(vma)) {
>>>> +		unsigned long pfn;
>>>> +
>>>> +		if (vma->vm_end - start < n_pages * PAGE_SIZE) {
>>>> +			printk(KERN_ERR "vma is too small\n");
>>>> +			goto cleanup;
>>>> +		}
>>>> +
>>>> +		for (n = 0; n < n_pages; ++n, start += PAGE_SIZE) {
>>>> +			ret = follow_pfn(vma, start, &pfn);
>>>> +			if (ret) {
>>>> +				printk(KERN_ERR "no page for address %lu\n",
>>>> +					start);
>>>> +				goto cleanup;
>>>> +			}
>>>> +			pages[n] = pfn_to_page(pfn);
>>>> +			get_page(pages[n]);
>>>
>>> This worries me. When the VM_PFNMAP flag is set, the memory pages are not
>>> backed by a struct page. Creating a struct page pointer out of it can be
>>> an acceptable hack (for instance to store a page in an scatterlist with
>>> sg_set_page() and then retrieve its physical address with sg_phys()), but
>>> you should not expect the struct page to be valid for anything else.
>>> Calling get_page() on it will likely crash.
>>
>> You are completely right. This is the corner case where a list of pages is
>> not a portable way of describing the memory.
>> Maybe pfn_valid should be used to check validity of the page (pfn)
>> before getting it?
> 
> I think you should just drop the get_page() call. There's no page, so there's 
> no need to get a reference count to it.

The problem is that get_user_pages does call get_page. Not calling get_page
will break the symmetry between PFNMAP and non-PFNMAP buffers. Maybe checking
page validity before get_page/put_page is enough?

> 
> The VM_PFNMAP flag is mostly used with memory out of the kernel allocator's
> control if I'm not mistaken. The main use case I've seen is memory reserved
> at boot time and used as a frame buffer for instance. In that case the pages
> can't go away, as there is no page in the first place.
> 
> This won't fix the DMA SG problem though (see below).
> 
>>>> +		}
>>>> +	} else {
>>>> +		n = get_user_pages(current, current->mm, start & PAGE_MASK,
>>>> +			n_pages, write, 1, pages, NULL);
>>>> +		if (n != n_pages) {
>>>> +			printk(KERN_ERR "got only %d of %d user pages\n",
>>>> +				n, n_pages);
>>>> +			goto cleanup;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	*copy_vma = vb2_get_vma(vma);
>>>> +	if (!*copy_vma) {
>>>> +		printk(KERN_ERR "failed to copy vma\n");
>>>> +		ret = -ENOMEM;
>>>> +		goto cleanup;
>>>> +	}
>>>
>>> Do we really need to make a copy of the VMA ? The only reason why we store
>>> a pointer to it is to check the flags in vb2_dc_put_userptr(). We could
>>> store the flags instead and avoid vb2_get_dma()/vb2_put_dma() calls
>>> altogether.
>>
>> I remember that there was a very good reason for copying this vma structure.
>> You caught me on 'cargo-cult' programming.
>> I will do some reverse engineering and try to answer it soon.
> 
> OK :-) I'm not copying the VMA in the OMAP3 ISP driver, which is why this 
> caught my eyes. If you find the reason why copying it is needed, please add a 
> comment to the code.
>  

The reason for copying the vma was that 'struct vm_area_struct' has no
reference counter. It could therefore be deleted after the mm lock is
released, ending with all pages belonging to the vma being freed. To prevent
that, a copy of the vma is created. Notice that inside vb2_get_vma the open
callback is called on the original vma, preventing the memory from being
released. On vb2_put_vma the complementary close is called.
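
For reference, this is roughly what vb2_get_vma() does (simplified from
videobuf2-memops.c):

struct vm_area_struct *vb2_get_vma(struct vm_area_struct *vma)
{
	struct vm_area_struct *vma_copy;

	vma_copy = kmalloc(sizeof(*vma_copy), GFP_KERNEL);
	if (vma_copy == NULL)
		return NULL;

	/* tell the owner of the mapping that it is still in use */
	if (vma->vm_ops && vma->vm_ops->open)
		vma->vm_ops->open(vma);

	if (vma->vm_file)
		get_file(vma->vm_file);

	memcpy(vma_copy, vma, sizeof(*vma));

	vma_copy->vm_mm = NULL;
	vma_copy->vm_next = NULL;
	vma_copy->vm_prev = NULL;

	return vma_copy;
}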

>>>> +
>>>> +	/* leaving critical section for mm access */
>>>> +	up_read(&current->mm->mmap_sem);
>>>> +
>>>> +	return 0;
>>>> +
>>>> +cleanup:
>>>> +	up_read(&current->mm->mmap_sem);
>>>> +
>>>> +	/* putting user pages if used, can be done without the lock */
>>>> +	while (n)
>>>> +		put_page(pages[--n]);
>>>> +
>>>> +	return ret;
>>>> +}
>>>> +
>>>>
>>>>  static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
>>>> -					unsigned long size, int write)
>>>> +	unsigned long size, int write)
>>>>  {
>>>>  
>>>>  	struct vb2_dc_buf *buf;
>>>>
>>>> -	struct vm_area_struct *vma;
>>>> -	dma_addr_t dma_addr = 0;
>>>> -	int ret;
>>>> +	unsigned long start, end, offset, offset2;
>>>> +	struct page **pages;
>>>> +	int n_pages;
>>>> +	int ret = 0;
>>>> +	struct sg_table *sgt;
>>>> +	unsigned long contig_size;
>>>>
>>>>  	buf = kzalloc(sizeof *buf, GFP_KERNEL);
>>>>  	if (!buf)
>>>>  	
>>>>  		return ERR_PTR(-ENOMEM);
>>>>
>>>> -	ret = vb2_get_contig_userptr(vaddr, size, &vma, &dma_addr);
>>>> +	buf->dev = alloc_ctx;
>>>> +	buf->dma_dir = write ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
>>>> +
>>>> +	start = (unsigned long)vaddr & PAGE_MASK;
>>>> +	offset = (unsigned long)vaddr & ~PAGE_MASK;
>>>> +	end = PAGE_ALIGN((unsigned long)vaddr + size);
>>>> +	offset2 = end - (unsigned long)vaddr - size;
>>>> +	n_pages = (end - start) >> PAGE_SHIFT;
>>>> +
>>>> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
>>>> +	if (!pages) {
>>>> +		ret = -ENOMEM;
>>>> +		printk(KERN_ERR "failed to allocate pages table\n");
>>>> +		goto fail_buf;
>>>> +	}
>>>> +
>>>> +	/* extract page list from userspace mapping */
>>>> +	ret = vb2_dc_get_pages(start, pages, n_pages, &buf->vma, write);
>>>>
>>>>  	if (ret) {
>>>>
>>>> -		printk(KERN_ERR "Failed acquiring VMA for vaddr 0x%08lx\n",
>>>> -				vaddr);
>>>> -		kfree(buf);
>>>> -		return ERR_PTR(ret);
>>>> +		printk(KERN_ERR "failed to get user pages\n");
>>>> +		goto fail_pages;
>>>> +	}
>>>> +
>>>> +	sgt = vb2_dc_pages_to_sgt(pages, n_pages, offset, offset2);
>>>> +	if (!sgt) {
>>>> +		printk(KERN_ERR "failed to create scatterlist table\n");
>>>> +		ret = -ENOMEM;
>>>> +		goto fail_get_pages;
>>>>
>>>>  	}
>>>
>>> This looks overly complex to me. You create a multi-chunk sgt out of the
>>> user pointer address and map it completely, and then check if it starts
>>> with a big enough contiguous chunk.
>>
>> Notice that vb2_dc_pages_to_sgt does compress contiguous ranges of pfns
>> (pages). So if the memory is contiguous, then a single-chunk sglist is
>> produced. The memory used to store the pages list is just temporary; it is
>> freed after the sglist is created.
> 
> That's exactly my point. The memory needs to be contiguous to be usable. If it 
> isn't, vb2-dma-contig will only use the first contiguous chunk. We could thus 
> simplify the code by hardcoding the single-chunk assumption. vb2-dma-contig 
> would walk the user pages list (or the PFNs, depending on the VMA flags) and 
> stop at the first discontinuity. It would then create a single-entry sg list 
> and operate on that, without mapping or otherwise touching the rest of the 
> VMA, which is unusable to the device anyway.
> 

[see above]

>>> Why don't you create an sgt with a single continuous
>>> chunk then ? In the VM_PFNMAP case you could check whether the area is
>>> contiguous when you follow the PFNs, stop at the first discontinuity, and
>>> create an sgt with a single element right there. You would then need to
>>> call vb2_dc_pages_to_sgt() in the normal case only, and stop at the first
>>> discontinuity as well.
>>
>> Discontinuity of pfns is not a problem if a device has its own IOMMU.
>> vb2-dma-contig cannot know whether mapping this multi-chunk sglist will
>> succeed until it calls dma_map_sg and checks the result.
> 
> If the device has an IOMMU it won't need contiguous memory. Shouldn't it then 
> use vb2-dma-sg instead ?
> 

No. vb2-dma-sg is used for devices that can deal with memory accessed in
multiple chunks, for example a device with an attached DMA engine that can
handle a scatter-gather list.

>> Why bother if both VM_PFNMAP and non-VM_PFNMAP are handled in the same
>> way after the list of pages is obtained? Treating them the same way allows
>> code reuse and simplifies the program flow.
>>
>> The DMA framework does not provide any way to force a single-chunk mapping
>> in the sg list. If the device is capable of mapping discontinuous pages
>> into a single chunk, the DMA framework will probably merge the pages. The
>> documentation encourages merging the list, but it is not obligatory.
>>
>> The reason is that if 'struct scatterlist' contains no dma_length field,
>> which is controlled by the CONFIG_NEED_SG_DMA_LENGTH macro, then the
>> length field is used instead. Chunk merging cannot be done in such a case.
>> This is why I look for the longest contiguous block.
>>
>>>> +	/* pages are no longer needed */
>>>> +	kfree(pages);
>>>> +	pages = NULL;
>>>> +
>>>> +	sgt->nents = dma_map_sg(buf->dev, sgt->sgl, sgt->orig_nents,
>>>> +		buf->dma_dir);
>>>> +	if (sgt->nents <= 0) {
>>>> +		printk(KERN_ERR "failed to map scatterlist\n");
>>>> +		ret = -EIO;
>>>> +		goto fail_sgt;
>>>> +	}
>>>> +
>>>> +	contig_size = vb2_dc_get_contiguous_size(sgt);
>>>> +	if (contig_size < size) {
>>>> +		printk(KERN_ERR "contiguous mapping is too small %lu/%lu\n",
>>>> +			contig_size, size);
>>>> +		ret = -EFAULT;
>>>> +		goto fail_map_sg;
>>>> +	}
>>>> +
>>>> +	buf->dma_addr = sg_dma_address(sgt->sgl);
>>>>
>>>>  	buf->size = size;
>>>>
>>>> -	buf->dma_addr = dma_addr;
>>>> -	buf->vma = vma;
>>>> +	buf->dma_sgt = sgt;
>>>> +
>>>> +	atomic_inc(&buf->refcount);
>>>>
>>>>  	return buf;
>>>>
>>>> +
>>>> +fail_map_sg:
>>>> +	dma_unmap_sg(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
>>>
>>> I think this will break in the VM_PFNMAP case on non-coherent
>>> architectures. arm_dma_unmap_page() will call __dma_page_dev_to_cpu() in
>>> that case, which can dereference struct page. As explained above, the
>>> struct page isn't valid with VM_PFNMAP. I haven't checked the dma_map_sg()
>>> and dma_sync_sg_*() calls, but chances are they might break as well.
>>
>> It will crash as long as it is true that there is no struct page behind a
>> given pfn. In practice, I found that VM_PFNMAP means that one cannot assume
>> that there is a 'struct page' behind the PFNs of a given mapping. Those
>> struct pages really exist for all our drivers. Anyway, I agree that using
>> those pages is a hack.
> 
> They don't exist for the memory used as frame buffer on the OMAP3 (or at least 
> didn't exist in the N900 and N9, I haven't checked since). This could become 
> just a bad distant memory when drivers will use CMA.
> 

I see. That is why one should check page validity before taking page
references. Another way would be to split the logic between PFNMAP and
non-PFNMAP userspace mappings.

>> It could be avoided if vb2_dc_get_pages returned a list of PFNs. Anyway,
>> those PFNs have to be transformed to pages to create an sglist. Those
>> pointers might be accessed somewhere deep inside dma_map_sg internals.
>>
>> A quite good solution would be to drop support for VM_PFNMAP mappings
>> since they cannot be handled reliably.
> 
> We should either drop VM_PFNMAP support or fix the DMA SG mapping API to 
> support VM_PFNMAP-style memory. I would vote for the former, as that's way 
> simpler and we have no VM_PFNMAP use case right now.
> 

Good point. But this way we have new dependencies on the DMA mapping framework.

>>>> +
>>>> +fail_sgt:
>>>> +	vb2_dc_put_sgtable(sgt, 0);
>>>> +
>>>> +fail_get_pages:
>>>> +	while (pages && n_pages)
>>>> +		put_page(pages[--n_pages]);
>>>> +	vb2_put_vma(buf->vma);
>>>> +
>>>> +fail_pages:
>>>> +	kfree(pages); /* kfree is NULL-proof */
>>>> +
>>>> +fail_buf:
>>>> +	kfree(buf);
>>>> +
>>>> +	return ERR_PTR(ret);
>>>>
>>>>  }
> 

Regards,
Tomasz Stanislawski
Tomasz Stanislawski March 22, 2012, 4:18 p.m. UTC | #6
On 03/22/2012 03:52 PM, Subash Patel wrote:
> Hi Laurent,
> 
> On 03/22/2012 08:12 PM, Laurent Pinchart wrote:
>> Hi Tomasz,
>>
>> On Thursday 22 March 2012 14:36:33 Tomasz Stanislawski wrote:
>>> Hi Laurent,
>>> Thank you very much for your comments and question.
>>> They were very useful.
>>
>> You're welcome.
>>
>>> Please refer to the comments below.
>>>
>>> On 03/22/2012 11:50 AM, Laurent Pinchart wrote:
>>>> On Thursday 22 March 2012 11:02:23 Laurent Pinchart wrote:
>>>>> From: Tomasz Stanislawski<t.stanislaws@samsung.com>
>>>>>

[snip]

>>> The physical address is already present in buf->dma_addr, but it is only
>>> valid if the device has no MMU. Notice that vb2-dma-contig possesses no
>>> knowledge of whether an MMU is present for a given device.
>>
>> That's why buf->dma_addr can't be considered as a physical address. It's only
>> useful in the device context.
>>
>>> The sg list is not going to be single-entry if the device is provided with
>>> its own MMU.
>>
>> There's something I don't get then. vb2-dma-contig deals with physically
>> contiguous buffers. The buffer is backed by physically contiguous pages, so
>> the sg list should have a single entry.
>>
> I think at present vb2-dma-contig is abused for any kind of memory allocation (contiguous or not). Wouldn't it be good to have a
> properly working setup for videobuf2-dma-sg instead? A driver that needs contiguous memory shall assign vb2_queue->mem_ops =
> vb2_dma_contig_memops. Devices which know they have an MMU backing can assign vb2_dma_sg_memops instead. But as of now,
> we try to use vb2_dma_contig_memops for all kinds of operations. I have also made this mistake, and wish I had repaired it and
> posted it before :(
> 

Hi Subash,

First of all, I do not think that vb2-dma-contig is abused for any kind of
allocation. It is used only for DMA-coherent contiguous mappings, which is
very close to vb2-dma-contig's original purpose.

One thing has to be said loudly:

"The driver does not need to and should not know if IOMMU is present or not."

The DMA framework must know whether the device uses an IOMMU or not.
Reason: the memory is allocated, freed, flushed and mapped only by the DMA
framework.

Usage of vb2-dma-contig or vb2-dma-sg depends only on the way the memory is
presented to the device. Most embedded devices use only the buffer's start
address and size (often indirectly specified by width/height and format).
That means the device needs a buffer that is contiguous for its DMA engine.

In such a case, the driver should ALWAYS use vb2-dma-contig.

The DMA framework, hidden deep in the dma_alloc_coherent internals, will
deal with the IOMMU configuration.

I hope you find this information useful.

Regards,
Tomasz Stanislawski
Laurent Pinchart March 27, 2012, 3:01 p.m. UTC | #7
Hi Tomasz,

On Thursday 22 March 2012 16:58:27 Tomasz Stanislawski wrote:
> On 03/22/2012 03:42 PM, Laurent Pinchart wrote:
> > On Thursday 22 March 2012 14:36:33 Tomasz Stanislawski wrote:
> >> On 03/22/2012 11:50 AM, Laurent Pinchart wrote:
> >>> On Thursday 22 March 2012 11:02:23 Laurent Pinchart wrote:

[snip]

> >>>>  static void *vb2_dc_alloc(void *alloc_ctx, unsigned long size)
> >>>>  {
> >>>>  
> >>>>  	struct device *dev = alloc_ctx;
> >>>>  	struct vb2_dc_buf *buf;
> >>>> 
> >>>> +	int ret;
> >>>> +	int n_pages;
> >>>> 
> >>>>  	buf = kzalloc(sizeof *buf, GFP_KERNEL);
> >>>>  	if (!buf)
> >>>>  	
> >>>>  		return ERR_PTR(-ENOMEM);
> >>>> 
> >>>> -	buf->vaddr = dma_alloc_coherent(dev, size, &buf->dma_addr,
> > 
> > GFP_KERNEL);
> > 
> >>>> +	buf->dev = dev;
> >>>> +	buf->size = size;
> >>>> +	buf->vaddr = dma_alloc_coherent(buf->dev, buf->size, &buf->dma_addr,
> >>>> +		GFP_KERNEL);
> >>>> +
> >>>> +	ret = -ENOMEM;
> >>>> 
> >>>>  	if (!buf->vaddr) {
> >>>> 
> >>>> -		dev_err(dev, "dma_alloc_coherent of size %ld failed\n", size);
> >>>> -		kfree(buf);
> >>>> -		return ERR_PTR(-ENOMEM);
> >>>> +		dev_err(dev, "dma_alloc_coherent of size %ld failed\n",
> >>>> +			size);
> >>>> +		goto fail_buf;
> >>>> 
> >>>>  	}
> >>>> 
> >>>> -	buf->dev = dev;
> >>>> -	buf->size = size;
> >>>> +	WARN_ON((unsigned long)buf->vaddr & ~PAGE_MASK);
> >>>> +	WARN_ON(buf->dma_addr & ~PAGE_MASK);
> >>>> +
> >>>> +	n_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
> >>>> +
> >>>> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
> >>>> +	if (!pages) {
> >>>> +		printk(KERN_ERR "failed to alloc page table\n");
> >>>> +		goto fail_dma;
> >>>> +	}
> >>>> +
> >>>> +	ret = dma_get_pages(dev, buf->vaddr, buf->dma_addr, pages, n_pages);
> >>> 
> >>> As the only purpose of this is to retrieve a list of pages that will be
> >>> used to create a single-entry sgt, wouldn't it be possible to shortcut
> >>> the code and get the physical address of the buffer directly ?
> >> 
> >> The physical address should not be used since it is meaningless in the
> >> context of different devices. It seems that only the list of pages is
> >> more-or-less portable between different drivers.
> > 
> > The pages are physically contiguous. The physical address of the first
> > page is thus all you need.
> 
> No. DMA-CONTIG buffers do not have to be physically contiguous. Please see
> below.
>
> > struct page and physical addresses can be used interchangeably in this
> > case if I'm not mistaken. If you want to go with pages, you could use the
> > first page only instead of the physical buffer address.
> 
> OK. There are bus addresses, physical addresses, DMA addresses and PFNs.
> As I understand it, PFNs and 'struct page' can be interchanged, at least in
> one direction. The PFNs are used to create a bus address, I mean an address
> that is recognized by a RAM chip. So a list of PFNs seems to be the most
> portable way of describing the memory, isn't it?
> 
> >> The physical address is already present in buf->dma_addr, but it is only
> >> valid if the device has no MMU. Notice that vb2-dma-contig possesses no
> >> knowledge of whether an MMU is present for a given device.
> > 
> > That's why buf->dma_addr can't be considered as a physical address. It's
> > only useful in the device context.
> 
> ok
> 
> >> The sg list is not going to be single-entry if the device is provided
> >> with its own MMU.
> > 
> > There's something I don't get then. vb2-dma-contig deals with physically
> > contiguous buffers. The buffer is backed by physically contiguous pages,
> > so the sg list should have a single entry.
> 
> As I understand it, dma-contig deals with DMA-contiguous buffers, meaning
> buffers that are contiguous from the device's point of view. Therefore
> those buffers do NOT have to be physically contiguous if the device has its
> own IOMMU.

My bad. There was thus a misunderstanding to begin with.

In the light of this new information, and (at least partially) sharing 
Daniel's opinion regarding dma_get_pages(), I think what we need here would be 
either

- a DMA API call that maps the memory to the importer device instead of 
dma_get_pages() + vb2_dc_pages_to_sgt(). The call would take a DMA memory 
"cookie" (see the "Minutes from V4L2 update call" mail thread) and a pointer 
to the importer device.

- a DMA API call to retrieve a scatter list suitable to be passed to 
dma_map_sg(). This would be similar to dma_get_pages() + 
vb2_dc_pages_to_sgt().

We also need to figure out whether the mapping call should be in the exporter 
or importer.
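
To make the two options concrete, here are purely hypothetical signatures
(neither exists in the DMA API today; the names are made up):

/* Option 1: the exporter hands out an opaque memory "cookie" that the
 * importer maps to its own DMA address space. */
dma_addr_t dma_map_cookie(struct device *importer, struct dma_cookie *cookie);

/* Option 2: retrieve a scatter list describing the buffer, suitable
 * for a later dma_map_sg() on the importer device. */
struct sg_table *dma_get_sgtable(struct device *dev, void *cpu_addr,
				 dma_addr_t handle, size_t size);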

> >>>> +	if (ret < 0) {
> >>>> +		printk(KERN_ERR "failed to get buffer pages from DMA API\n");
> >>>> +		goto fail_pages;
> >>>> +	}
> >>>> +	if (ret != n_pages) {
> >>>> +		ret = -EFAULT;
> >>>> +		printk(KERN_ERR "failed to get all pages from DMA API\n");
> >>>> +		goto fail_pages;
> >>>> +	}
> >>>> +
> >>>> +	buf->sgt_base = vb2_dc_pages_to_sgt(pages, n_pages, 0, 0);
> >>>> +	if (IS_ERR(buf->sgt_base)) {
> >>>> +		ret = PTR_ERR(buf->sgt_base);
> >>>> +		printk(KERN_ERR "failed to prepare sg table\n");
> >>>> +		goto fail_pages;
> >>>> +	}
> >>> 
> >>> buf->sgt_base isn't used in this patch. I would move the buf->sgt_base
> >>> creation code to the patch that uses it then, or to its own patch just
> >>> before the patch that uses it.
> >> 
> >> Good point. The sgt_base is used by exporter only. Thanks for noticing
> >> it.
> >> 
> >>>> +
> >>>> +	/* pages are no longer needed */
> >>>> +	kfree(pages);
> >>>> 
> >>>>  	buf->handler.refcount = &buf->refcount;
> >>>>  	buf->handler.put = vb2_dc_put;
> > 
> > [snip]
> > 
> >>>>  /*********************************************/
> >>>>  /*       callbacks for USERPTR buffers       */
> >>>>  /*********************************************/
> >>>> 
> >>>> +static inline int vma_is_io(struct vm_area_struct *vma)
> >>>> +{
> >>>> +	return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
> >>> 
> >>> Isn't VM_PFNMAP enough ? Wouldn't it be possible (at least in theory) to
> >>> get a discontinuous physical range with VM_IO ?
> >> 
> >> Frankly, I found that in get_user_pages the flags are checked against
> >> (VM_IO | VM_PFNMAP). Probably in the noMMU (not no-IOMMU) case it is
> >> possible to get a vma with VM_IO on and VM_PFNMAP off, isn't it?
> >> 
> >> The problem is that this framework should work in both cases so this
> >> check was added just in case :).
> > 
> > OK. We can leave it here and deal with problems if they arise :-)
> > 
> >>>> +}
> >>>> +
> >>>> +static int vb2_dc_get_pages(unsigned long start, struct page **pages,
> >>>> +	int n_pages, struct vm_area_struct **copy_vma, int write)
> >>>> +{
> >>>> +	struct vm_area_struct *vma;
> >>>> +	int n = 0; /* number of get pages */
> >>>> +	int ret = -EFAULT;
> >>>> +
> >>>> +	/* entering critical section for mm access */
> >>>> +	down_read(&current->mm->mmap_sem);
> >>> 
> >>> This will generate AB-BA deadlock warnings if lockdep is enabled. This
> >>> function is called with the queue lock held, and the mmap() handler
> >>> which
> >>> takes the queue lock is called with current->mm->mmap_sem held.
> >>> 
> >>> This is a known issue with videobuf2, not specific to this patch. The
> >>> warning is usually a false positive (which we still need to fix, as it
> >>> worries users), but can become a real issue if an MMAP queue and a
> >>> USERPTR queue are created by a driver with the same queue lock.
> >> 
> >> Good point. Do you know any good solution to this problem?
> > 
> > http://patchwork.linuxtv.org/patch/8455/
> > 
> > It seems QBUF is safe, but PREPAREBUF isn't (both call __buf_prepare,
> > which ends up calling the memops get_userptr operation).
> > 
> > I'll post a patch to fix it for PREPAREBUF. If I'm not mistaken, you can
> > drop the down_read/up_read here.
> 
> ok. Thanks for the link.
> 
> >>>> +	vma = find_vma(current->mm, start);
> >>>> +	if (!vma) {
> >>>> +		printk(KERN_ERR "no vma for address %lu\n", start);
> >>>> +		goto cleanup;
> >>>> +	}
> >>>> +
> >>>> +	if (vma_is_io(vma)) {
> >>>> +		unsigned long pfn;
> >>>> +
> >>>> +		if (vma->vm_end - start < n_pages * PAGE_SIZE) {
> >>>> +			printk(KERN_ERR "vma is too small\n");
> >>>> +			goto cleanup;
> >>>> +		}
> >>>> +
> >>>> +		for (n = 0; n < n_pages; ++n, start += PAGE_SIZE) {
> >>>> +			ret = follow_pfn(vma, start, &pfn);
> >>>> +			if (ret) {
> >>>> +				printk(KERN_ERR "no page for address %lu\n",
> >>>> +					start);
> >>>> +				goto cleanup;
> >>>> +			}
> >>>> +			pages[n] = pfn_to_page(pfn);
> >>>> +			get_page(pages[n]);
> >>> 
> >>> This worries me. When the VM_PFNMAP flag is set, the memory pages are
> >>> not backed by a struct page. Creating a struct page pointer out of it
> >>> can be an acceptable hack (for instance to store a page in an
> >>> scatterlist with sg_set_page() and then retrieve its physical address
> >>> with sg_phys()), but you should not expect the struct page to be valid
> >>> for anything else. Calling get_page() on it will likely crash.
> >> 
> >> You are completely right. This is the corner case where a list of pages
> >> is not a portable way of describing the memory.
> >> Maybe pfn_valid should be used to check validity of the page (pfn)
> >> before getting it?
> > 
> > I think you should just drop the get_page() call. There's no page, so
> > there's no need to get a reference count to it.
> 
> The problem is that get_user_pages does call get_page. Not calling get_page
> will break the symmetry between PFNMAP and non-PFNMAP buffers. Maybe
> checking page validity before get_page/put_page is enough?

PFNMAP and non-PFNMAP buffers are inherently different, so I don't see a 
problem in handling them differently. We will likely run into an issue though, 
with hardware such as the OMAP TILER, where the memory isn't backed by normal 
memory (and thus no struct page is present), but for which the target must be 
pinned somehow (in the case of the tiler, that would be a tiler mapping). I 
don't think we have an API to ask the kernel to pin a memory range regardless 
of how the memory is handled (system memory, reserved memory with PFNMAP, 
device mapping such as with the tiler, ...). This is an open issue. One 
possible solution is to deprecate USERPTR support for that kind of memory and 
use dma-buf instead.

> > The VM_PFNMAP flag is mostly used with memory out of the kernel
> > allocator's control if I'm not mistaken. The main use case I've seen is
> > memory reserved at boot time and used as a frame buffer for instance. In
> > that case the pages can't go away, as there is no page in the first place.
> > 
> > This won't fix the DMA SG problem though (see below).
> > 
> >>>> +		}
> >>>> +	} else {
> >>>> +		n = get_user_pages(current, current->mm, start & PAGE_MASK,
> >>>> +			n_pages, write, 1, pages, NULL);
> >>>> +		if (n != n_pages) {
> >>>> +			printk(KERN_ERR "got only %d of %d user pages\n",
> >>>> +				n, n_pages);
> >>>> +			goto cleanup;
> >>>> +		}
> >>>> +	}
> >>>> +
> >>>> +	*copy_vma = vb2_get_vma(vma);
> >>>> +	if (!*copy_vma) {
> >>>> +		printk(KERN_ERR "failed to copy vma\n");
> >>>> +		ret = -ENOMEM;
> >>>> +		goto cleanup;
> >>>> +	}
> >>> 
> >>> Do we really need to make a copy of the VMA ? The only reason why we
> >>> store a pointer to it is to check the flags in vb2_dc_put_userptr(). We
> >>> could store the flags instead and avoid vb2_get_dma()/vb2_put_dma()
> >>> calls altogether.
> >> 
> >> I remember that there was a very good reason for copying this vma
> >> structure.
> >> You caught me on 'cargo-cult' programming.
> >> I will do some reverse engineering and try to answer it soon.
> > 
> > OK :-) I'm not copying the VMA in the OMAP3 ISP driver, which is why this
> > caught my eyes. If you find the reason why copying it is needed, please
> > add a comment to the code.
> 
> The reason for copying the vma was that 'struct vm_area_struct' has no
> reference counter. It could therefore be deleted after the mm lock is
> released, ending with all pages belonging to the vma being freed. To
> prevent that, a copy of the vma is created. Notice that inside vb2_get_vma
> the open callback is called on the original vma, preventing the memory from
> being released. On vb2_put_vma the complementary close is called.

Feel free to prove me wrong, but I think get_user_pages() is enough to prevent 
the pages from being freed, even if the VMA is deleted.

However, there's one subtle issue that we will need to deal with when we will 
implement cache management. It took me a lot of time to debug and fix it when 
I was working on the OMAP3 ISP driver, so I'll explain it in the hope that 
someone will find it later when dealing with the same problems :-)

When a buffer is queued, the OMAP3 ISP driver cleans up the cache using the 
userspace mapping addresses (for USERPTR buffers). This might be a bad thing, 
but that's the way we currently implement that.

A prior get_user_pages() call will make sure pages are not freed, but due to 
optimization in the lockless memory management algorithms the userspace 
mapping can disappear: the kernel might consider that a page can be freed, 
remove its userspace mapping, and then find out that the page is locked. It 
will then move on without restoring the userspace mapping, which will be 
restored when the next page fault occurs.

When cleaning the cache using the userspace mapping addresses, any page for 
which the userspace mapping has been removed will trigger a page fault. The 
page fault handler (do_page_fault() in arm/arch/mm/fault.c) will try to 
read_lock mmap_sem. If it fails, it will check if the page fault occured in 
userspace context, or from a known exception location. As neither condition is 
true, it will panic.

The solution I found to this issue was to lock the VMA. This ensured that the 
userspace mapping would stay in place. See isp_video_buffer_lock_vma() in 
drivers/media/video/omap3isp/ispqueue.c. You could use a similar approach here 
if you want to ensure that userspace mappings are not removed, but once again 
I don't think that's needed (until we get to cache handling) as 
get_user_pages() will ensure that the pages are not freed.
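
Very roughly, the idea behind isp_video_buffer_lock_vma() is the following
(a simplified, hypothetical sketch; the real code also handles errors and
restores the flags on unlock):

#include <linux/mm.h>
#include <linux/sched.h>

/* Simplified sketch: mark the VMAs covering the buffer VM_LOCKED so
 * the kernel keeps the userspace mapping in place. */
static void lock_user_mapping(unsigned long start, unsigned long end,
			      bool lock)
{
	struct vm_area_struct *vma;

	down_write(&current->mm->mmap_sem);
	for (vma = find_vma(current->mm, start);
	     vma && vma->vm_start < end; vma = vma->vm_next) {
		if (lock)
			vma->vm_flags |= VM_LOCKED;
		else
			vma->vm_flags &= ~VM_LOCKED;
	}
	up_write(&current->mm->mmap_sem);
}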

> >>>> +
> >>>> +	/* leaving critical section for mm access */
> >>>> +	up_read(&current->mm->mmap_sem);
> >>>> +
> >>>> +	return 0;
> >>>> +
> >>>> +cleanup:
> >>>> +	up_read(&current->mm->mmap_sem);
> >>>> +
> >>>> +	/* putting user pages if used, can be done wothout the lock */
> >>>> +	while (n)
> >>>> +		put_page(pages[--n]);
> >>>> +
> >>>> +	return ret;
> >>>> +}
> >>>> +
> >>>> 
> >>>>  static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
> >>>> 
> >>>> -					unsigned long size, int write)
> >>>> +	unsigned long size, int write)
> >>>> 
> >>>>  {
> >>>>  
> >>>>  	struct vb2_dc_buf *buf;
> >>>> 
> >>>> -	struct vm_area_struct *vma;
> >>>> -	dma_addr_t dma_addr = 0;
> >>>> -	int ret;
> >>>> +	unsigned long start, end, offset, offset2;
> >>>> +	struct page **pages;
> >>>> +	int n_pages;
> >>>> +	int ret = 0;
> >>>> +	struct sg_table *sgt;
> >>>> +	unsigned long contig_size;
> >>>> 
> >>>>  	buf = kzalloc(sizeof *buf, GFP_KERNEL);
> >>>>  	if (!buf)
> >>>>  	
> >>>>  		return ERR_PTR(-ENOMEM);
> >>>> 
> >>>> -	ret = vb2_get_contig_userptr(vaddr, size, &vma, &dma_addr);
> >>>> +	buf->dev = alloc_ctx;
> >>>> +	buf->dma_dir = write ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
> >>>> +
> >>>> +	start = (unsigned long)vaddr & PAGE_MASK;
> >>>> +	offset = (unsigned long)vaddr & ~PAGE_MASK;
> >>>> +	end = PAGE_ALIGN((unsigned long)vaddr + size);
> >>>> +	offset2 = end - (unsigned long)vaddr - size;
> >>>> +	n_pages = (end - start) >> PAGE_SHIFT;
> >>>> +
> >>>> +	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
> >>>> +	if (!pages) {
> >>>> +		ret = -ENOMEM;
> >>>> +		printk(KERN_ERR "failed to allocate pages table\n");
> >>>> +		goto fail_buf;
> >>>> +	}
> >>>> +
> >>>> +	/* extract page list from userspace mapping */
> >>>> +	ret = vb2_dc_get_pages(start, pages, n_pages, &buf->vma, write);
> >>>> 
> >>>>  	if (ret) {
> >>>> 
> >>>> -		printk(KERN_ERR "Failed acquiring VMA for vaddr 0x%08lx\n",
> >>>> -				vaddr);
> >>>> -		kfree(buf);
> >>>> -		return ERR_PTR(ret);
> >>>> +		printk(KERN_ERR "failed to get user pages\n");
> >>>> +		goto fail_pages;
> >>>> +	}
> >>>> +
> >>>> +	sgt = vb2_dc_pages_to_sgt(pages, n_pages, offset, offset2);
> >>>> +	if (!sgt) {
> >>>> +		printk(KERN_ERR "failed to create scatterlist table\n");
> >>>> +		ret = -ENOMEM;
> >>>> +		goto fail_get_pages;
> >>>> 
> >>>>  	}
> >>> 
> >>> This looks overly complex to me. You create a multi-chunk sgt out of the
> >>> user pointer address and map it completely, and then check if it starts
> >>> with a big enough contiguous chunk.
> >> 
> >> Notice that vb2_dc_pages_to_sgt does compress contiguous ranges of pfns
> >> (pages). So if the memory is contiguous, then a single-chunk sglist is
> >> produced. The memory used to store the pages list is just temporary; it
> >> is freed after the sglist is created.
> > 
> > That's exactly my point. The memory needs to be contiguous to be usable.
> > If it isn't, vb2-dma-contig will only use the first contiguous chunk. We
> > could thus simplify the code by hardcoding the single-chunk assumption.
> > the VMA flags) and stop at the first discontinuity. It would then create
> > the VMA flags) and stop at the first discontinuity. It would then create
> > a single-entry sg list and operate on that, without mapping or otherwise
> > touching the rest of the VMA, which is unusable to the device anyway.
> 
> [see above]
> 
> >>> Why don't you create an sgt with a single continuous
> >>> chunk then ? In the VM_PFNMAP case you could check whether the area is
> >>> contiguous when you follow the PFNs, stop at the first discontinuity,
> >>> and create an sgt with a single element right there. You would then need
> >>> to call vb2_dc_pages_to_sgt() in the normal case only, and stop at the
> >>> first discontinuity as well.
> >> 
> >> Discontinuity of PFNs is not a problem if a device has its own IOMMU. It
> >> is not known to vb2-dma-contig whether mapping this multi-chunk sglist
> >> will succeed until dma_map_sg is called and its result checked.
> > 
> > If the device has an IOMMU it won't need contiguous memory. Shouldn't it
> > then use vb2-dma-sg instead ?
> 
> No. The vb2-dma-sg allocator is used for devices that can deal with memory
> accessed in multiple chunks, for example a device with an attached DMA
> engine that can handle a scatter-gather list.
> 
> >> Why bother if both VM_PFNMAP and non-VM_PFNMAP are handled in the same
> >> way after the list of pages is obtained? Treating them the same way
> >> allows reusing code and simplifies the program flow.
> >> 
> >> The DMA framework does not provide any way to force a single-chunk
> >> mapping in sg. If the device is capable of mapping discontiguous pages
> >> into a single chunk, the DMA framework will probably merge the pages. The
> >> documentation encourages merging the list, but it is not obligatory.
> >> 
> >> The reason is that if 'struct scatterlist' contains no dma_length field,
> >> which is controlled by the CONFIG_NEED_SG_DMA_LENGTH macro, then the
> >> length field is used instead. Chunk merging cannot be done in such a
> >> case. This is why I look for the longest contiguous block.
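
(For reference, the fallback described above is the standard sg_dma_len()
definition from include/linux/scatterlist.h:

	#ifdef CONFIG_NEED_SG_DMA_LENGTH
	#define sg_dma_len(sg)		((sg)->dma_length)
	#else
	#define sg_dma_len(sg)		((sg)->length)
	#endif

so on architectures without a dma_length field there is no separate field in
which the mapping code could report an IOMMU-merged length.)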
> >> 
> >>>> +	/* pages are no longer needed */
> >>>> +	kfree(pages);
> >>>> +	pages = NULL;
> >>>> +
> >>>> +	sgt->nents = dma_map_sg(buf->dev, sgt->sgl, sgt->orig_nents,
> >>>> +		buf->dma_dir);
> >>>> +	if (sgt->nents <= 0) {
> >>>> +		printk(KERN_ERR "failed to map scatterlist\n");
> >>>> +		ret = -EIO;
> >>>> +		goto fail_sgt;
> >>>> +	}
> >>>> +
> >>>> +	contig_size = vb2_dc_get_contiguous_size(sgt);
> >>>> +	if (contig_size < size) {
> >>>> +		printk(KERN_ERR "contiguous mapping is too small %lu/%lu\n",
> >>>> +			contig_size, size);
> >>>> +		ret = -EFAULT;
> >>>> +		goto fail_map_sg;
> >>>> +	}
> >>>> +
> >>>> +	buf->dma_addr = sg_dma_address(sgt->sgl);
> >>>> 
> >>>>  	buf->size = size;
> >>>> 
> >>>> -	buf->dma_addr = dma_addr;
> >>>> -	buf->vma = vma;
> >>>> +	buf->dma_sgt = sgt;
> >>>> +
> >>>> +	atomic_inc(&buf->refcount);
> >>>> 
> >>>>  	return buf;
> >>>> 
> >>>> +
> >>>> +fail_map_sg:
> >>>> +	dma_unmap_sg(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
> >>> 
> >>> I think this will break in the VM_PFNMAP case on non-coherent
> >>> architectures. arm_dma_unmap_page() will call __dma_page_dev_to_cpu() in
> >>> that case, which can dereference struct page. As explained above, the
> >>> struct page isn't valid with VM_PFNMAP. I haven't checked the
> >>> dma_map_sg() and dma_sync_sg_*() calls, but chances are they might break
> >>> as well.
> >> 
> >> It will crash as long as there is no struct page behind the given PFN.
> >> In practice, I found that VM_PFNMAP means that one cannot assume that
> >> there is a 'struct page' behind the PFNs of a given mapping. Those
> >> struct pages really exist for all our drivers. Anyway, I agree that
> >> using those pages is a hack.
> > 
> > They don't exist for the memory used as frame buffer on the OMAP3 (or at
> > least didn't exist in the N900 and N9, I haven't checked since). This
> > could become just a bad distant memory when drivers will use CMA.
> 
> I see. That is why one should check page validity before calling get_pages.
> Another way would be to split the logic between PFNMAP and non-PFNMAP
> userspace mappings.

The OMAP3 ISP driver splits the logic for the PFNMAP and non-PFNMAP cases and 
the code is quite clean (at least in my opinion ;-)). I can give you a quick 
tour around its allocator if you want. I would go for that solution here as 
well.
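
A minimal sketch of such a split, under the same assumptions as this patch
(follow_pfn() for the VM_IO/VM_PFNMAP case, get_user_pages() otherwise, and
no get_page() on the PFNMAP path since there may be no struct page); the
helper name is hypothetical:

/* Sketch only: separate PFNMAP and non-PFNMAP handling, stopping at the
 * first PFN discontinuity in the PFNMAP case. The caller holds
 * current->mm->mmap_sem and passes a page-aligned start, as
 * vb2_dc_get_pages() does. Returns the number of pages covered. */
static int sketch_get_user_memory(struct vm_area_struct *vma,
				  unsigned long start, int n_pages,
				  unsigned long *pfns, struct page **pages,
				  int write)
{
	int n;

	if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
		/* No struct page here: collect PFNs, do not get_page(). */
		for (n = 0; n < n_pages; ++n, start += PAGE_SIZE) {
			if (follow_pfn(vma, start, &pfns[n]))
				break;
			if (n && pfns[n] != pfns[n - 1] + 1)
				break; /* first discontinuity */
		}
		return n;
	}

	/* Normal memory: take page references so the pages cannot be freed. */
	return get_user_pages(current, current->mm, start, n_pages,
			      write, 1, pages, NULL);
}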

> >> It could be avoided if vb2_dc_get_pages returned a list of PFNs. Anyway,
> >> those PFNs have to be transformed to pages to create an sglist. Those
> >> pointers might be accessed somewhere deep inside dma_map_sg internals.
> >> 
> >> The quite good solution would be dropping support for VM_PFNMAP mappings
> >> since they cannot be handled reliably.
> > 
> > We should either drop VM_PFNMAP support or fix the DMA SG mapping API to
> > support VM_PFNMAP-style memory. I would vote for the former, as that's way
> > simpler and we have no VM_PFNMAP use case right now.
> 
> Good point. But this way we have new dependencies on the DMA mapping framework.
> 
> >>>> +
> >>>> +fail_sgt:
> >>>> +	vb2_dc_put_sgtable(sgt, 0);
> >>>> +
> >>>> +fail_get_pages:
> >>>> +	while (pages && n_pages)
> >>>> +		put_page(pages[--n_pages]);
> >>>> +	vb2_put_vma(buf->vma);
> >>>> +
> >>>> +fail_pages:
> >>>> +	kfree(pages); /* kfree is NULL-proof */
> >>>> +
> >>>> +fail_buf:
> >>>> +	kfree(buf);
> >>>> +
> >>>> +	return ERR_PTR(ret);
> >>>> 
> >>>>  }
Jerome Glisse March 27, 2012, 4:45 p.m. UTC | #8
On Tue, Mar 27, 2012 at 11:01 AM, Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
> Hi Tomasz,
>
> On Thursday 22 March 2012 16:58:27 Tomasz Stanislawski wrote:
>> On 03/22/2012 03:42 PM, Laurent Pinchart wrote:
>> > On Thursday 22 March 2012 14:36:33 Tomasz Stanislawski wrote:
>> >> On 03/22/2012 11:50 AM, Laurent Pinchart wrote:
>> >>> On Thursday 22 March 2012 11:02:23 Laurent Pinchart wrote:
>
> [snip]
>
>> >>>>  static void *vb2_dc_alloc(void *alloc_ctx, unsigned long size)
>> >>>>  {
>> >>>>
>> >>>>          struct device *dev = alloc_ctx;
>> >>>>          struct vb2_dc_buf *buf;
>> >>>>
>> >>>> +        int ret;
>> >>>> +        int n_pages;
>> >>>>
>> >>>>          buf = kzalloc(sizeof *buf, GFP_KERNEL);
>> >>>>          if (!buf)
>> >>>>
>> >>>>                  return ERR_PTR(-ENOMEM);
>> >>>>
>> >>>> -        buf->vaddr = dma_alloc_coherent(dev, size, &buf->dma_addr, GFP_KERNEL);
>> >
>> >>>> +        buf->dev = dev;
>> >>>> +        buf->size = size;
>> >>>> +        buf->vaddr = dma_alloc_coherent(buf->dev, buf->size, &buf->dma_addr,
>> >>>> +                GFP_KERNEL);
>> >>>> +
>> >>>> +        ret = -ENOMEM;
>> >>>>
>> >>>>          if (!buf->vaddr) {
>> >>>>
>> >>>> -                dev_err(dev, "dma_alloc_coherent of size %ld failed\n", size);
>> >>>> -                kfree(buf);
>> >>>> -                return ERR_PTR(-ENOMEM);
>> >>>> +                dev_err(dev, "dma_alloc_coherent of size %ld failed\n",
>> >>>> +                        size);
>> >>>> +                goto fail_buf;
>> >>>>
>> >>>>          }
>> >>>>
>> >>>> -        buf->dev = dev;
>> >>>> -        buf->size = size;
>> >>>> +        WARN_ON((unsigned long)buf->vaddr & ~PAGE_MASK);
>> >>>> +        WARN_ON(buf->dma_addr & ~PAGE_MASK);
>> >>>> +
>> >>>> +        n_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
>> >>>> +
>> >>>> +        pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
>> >>>> +        if (!pages) {
>> >>>> +                printk(KERN_ERR "failed to alloc page table\n");
>> >>>> +                goto fail_dma;
>> >>>> +        }
>> >>>> +
>> >>>> +        ret = dma_get_pages(dev, buf->vaddr, buf->dma_addr, pages, n_pages);
>> >>>
>> >>> As the only purpose of this is to retrieve a list of pages that will be
>> >>> used to create a single-entry sgt, wouldn't it be possible to shortcut
>> >>> the code and get the physical address of the buffer directly ?
>> >>
>> >> Physical addresses should not be used since they are meaningless in the
>> >> context of different devices. It seems that only the list of pages is
>> >> more-or-less portable between different drivers.
>> >
>> > The pages are physically contiguous. The physical address of the first
>> > page is thus all you need.
>>
>> No. DMA-CONTIG buffers do not have to be physically contiguous. Please see
>> below.
>>
>> > struct page and physical addresses can be used interchangeably in this
>> > case if I'm not mistaken. If you want to go with pages, you could use the
>> > first page only instead of the physical buffer address.
>>
>> Ok. There are bus addresses, physical addresses, DMA addresses and PFNs.
>> As I understand it, PFNs and 'struct page' can be interchanged, at least
>> in one direction. The PFNs are used to create a bus address, I mean
>> addresses that are recognized by a RAM chip. So a list of PFNs seems to be
>> the most portable way of describing the memory, isn't it?
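
For reference, the usual conversions, valid only where a struct page really
exists for the PFN:

	struct page *page = pfn_to_page(pfn);	/* only if pfn_valid(pfn) */
	unsigned long pfn2 = page_to_pfn(page);	/* and back again */
	phys_addr_t phys = PFN_PHYS(pfn);	/* physical address; on most
						   platforms also the bus address */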
>>
>> >> The physical address is already present in buf->dma_addr, but it is only
>> >> valid if the device has no MMU. Notice that vb2-dma-contig possesses no
>> >> knowledge of whether an MMU is present for a given device.
>> >
>> > That's why buf->dma_addr can't be considered as a physical address. It's
>> > only useful in the device context.
>>
>> ok
>>
>> >> The sg list is not going to be single-entry if the device is provided
>> >> with its own MMU.
>> >
>> > There's something I don't get then. vb2-dma-contig deals with physically
>> > contiguous buffers. The buffer is backed by physically contiguous pages,
>> > so the sg list should have a single entry.
>>
>> As I understand it, dma-contig deals with DMA-contiguous buffers, meaning
>> buffers that are contiguous from the device's point of view. Therefore
>> those buffers do NOT have to be physically contiguous if the device has
>> its own IOMMU.
>
> My bad. There was thus a misunderstanding to begin with.
>
> In the light of this new information, and (at least partially) sharing
> Daniel's opinion regarding dma_get_pages(), I think what we need here would be
> either
>
> -  a DMA API call that maps the memory to the importer device instead of
> dma_get_pages() + vb2_dc_pages_to_sgt(). The call would take a DMA memory
> "cookie" (see the "Minutes from V4L2 update call" mail thread) and a pointer
> to the importer device.
>
> - a DMA API call to retrieve a scatter list suitable to be passed to
> dma_map_sg(). This would be similar to dma_get_pages() +
> vb2_dc_pages_to_sgt().
>
> We also need to figure out whether the mapping call should be in the exporter
> or importer.
>
>> >>>> +        if (ret < 0) {
>> >>>> +                printk(KERN_ERR "failed to get buffer pages from DMA API\n");
>> >>>> +                goto fail_pages;
>> >>>> +        }
>> >>>> +        if (ret != n_pages) {
>> >>>> +                ret = -EFAULT;
>> >>>> +                printk(KERN_ERR "failed to get all pages from DMA API\n");
>> >>>> +                goto fail_pages;
>> >>>> +        }
>> >>>> +
>> >>>> +        buf->sgt_base = vb2_dc_pages_to_sgt(pages, n_pages, 0, 0);
>> >>>> +        if (IS_ERR(buf->sgt_base)) {
>> >>>> +                ret = PTR_ERR(buf->sgt_base);
>> >>>> +                printk(KERN_ERR "failed to prepare sg table\n");
>> >>>> +                goto fail_pages;
>> >>>> +        }
>> >>>
>> >>> buf->sgt_base isn't used in this patch. I would move the buf->sgt_base
>> >>> creation code to the patch that uses it then, or to its own patch just
>> >>> before the patch that uses it.
>> >>
>> >> Good point. The sgt_base is used by exporter only. Thanks for noticing
>> >> it.
>> >>
>> >>>> +
>> >>>> +        /* pages are no longer needed */
>> >>>> +        kfree(pages);
>> >>>>
>> >>>>          buf->handler.refcount = &buf->refcount;
>> >>>>          buf->handler.put = vb2_dc_put;
>> >
>> > [snip]
>> >
>> >>>>  /*********************************************/
>> >>>>  /*       callbacks for USERPTR buffers       */
>> >>>>  /*********************************************/
>> >>>>
>> >>>> +static inline int vma_is_io(struct vm_area_struct *vma)
>> >>>> +{
>> >>>> +        return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
>> >>>
>> >>> Isn't VM_PFNMAP enough ? Wouldn't it be possible (at least in theory) to
>> >>> get a discontinuous physical range with VM_IO ?
>> >>
>> >> Frankly, I found that in get_user_pages the flags are checked against
>> >> (VM_IO | VM_PFNMAP). Probably in the noMMU (not no-IOMMU) case it is
>> >> possible to get a vma with VM_IO on and VM_PFNMAP off, isn't it?
>> >>
>> >> The problem is that this framework should work in both cases so this
>> >> check was added just in case :).
>> >
>> > OK. We can leave it here and deal with problems if they arise :-)
>> >
>> >>>> +}
>> >>>> +
>> >>>> +static int vb2_dc_get_pages(unsigned long start, struct page **pages,
>> >>>> +        int n_pages, struct vm_area_struct **copy_vma, int write)
>> >>>> +{
>> >>>> +        struct vm_area_struct *vma;
>> >>>> +        int n = 0; /* number of pages obtained */
>> >>>> +        int ret = -EFAULT;
>> >>>> +
>> >>>> +        /* entering critical section for mm access */
>> >>>> +        down_read(&current->mm->mmap_sem);
>> >>>
>> >>> This will generate AB-BA deadlock warnings if lockdep is enabled. This
>> >>> function is called with the queue lock held, and the mmap() handler
>> >>> which
>> >>> takes the queue lock is called with current->mm->mmap_sem held.
>> >>>
>> >>> This is a known issue with videobuf2, not specific to this patch. The
>> >>> warning is usually a false positive (which we still need to fix, as it
>> >>> worries users), but can become a real issue if an MMAP queue and a
>> >>> USERPTR queue are created by a driver with the same queue lock.
>> >>
>> >> Good point. Do you know any good solution to this problem?
>> >
>> > http://patchwork.linuxtv.org/patch/8455/
>> >
>> > It seems QBUF is safe, but PREPAREBUF isn't (both call __buf_prepare,
>> > which end up calling the memops get_userptr operation).
>> >
>> > I'll post a patch to fix it for PREPAREBUF. If I'm not mistaken, you can
>> > drop the down_read/up_read here.
>>
>> ok. Thanks for the link.
>>
>> >>>> +        vma = find_vma(current->mm, start);
>> >>>> +        if (!vma) {
>> >>>> +                printk(KERN_ERR "no vma for address %lu\n", start);
>> >>>> +                goto cleanup;
>> >>>> +        }
>> >>>> +
>> >>>> +        if (vma_is_io(vma)) {
>> >>>> +                unsigned long pfn;
>> >>>> +
>> >>>> +                if (vma->vm_end - start < n_pages * PAGE_SIZE) {
>> >>>> +                        printk(KERN_ERR "vma is too small\n");
>> >>>> +                        goto cleanup;
>> >>>> +                }
>> >>>> +
>> >>>> +                for (n = 0; n < n_pages; ++n, start += PAGE_SIZE) {
>> >>>> +                        ret = follow_pfn(vma, start, &pfn);
>> >>>> +                        if (ret) {
>> >>>> +                                printk(KERN_ERR "no page for address %lu\n",
>> >>>> +                                        start);
>> >>>> +                                goto cleanup;
>> >>>> +                        }
>> >>>> +                        pages[n] = pfn_to_page(pfn);
>> >>>> +                        get_page(pages[n]);
>> >>>
>> >>> This worries me. When the VM_PFNMAP flag is set, the memory pages are
>> >>> not backed by a struct page. Creating a struct page pointer out of it
>> >>> can be an acceptable hack (for instance to store a page in an
>> >>> scatterlist with sg_set_page() and then retrieve its physical address
>> >>> with sg_phys()), but you should not expect the struct page to be valid
>> >>> for anything else. Calling get_page() on it will likely crash.
>> >>
>> >> You are completely right. This is the corner case where a list of pages
>> >> is not a portable way of describing the memory.
>> >> Maybe pfn_valid should be used to check the validity of the page (pfn)
>> >> before getting it?
>> >
>> > I think you should just drop the get_page() call. There's no page, so
>> > there's no need to get a reference count to it.
>>
>> The problem is that get_user_pages does call get_page. Not calling get_page
>> will break the symmetry between PFNMAP and non-PFNMAP buffers. Maybe
>> checking page validity before get_page/put_page is enough?
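
A minimal sketch of that check, applied to the PFNMAP loop in
vb2_dc_get_pages() and assuming pfn_valid() is a sufficient test on the
platforms in question:

	ret = follow_pfn(vma, start, &pfn);
	if (ret)
		goto cleanup;
	if (!pfn_valid(pfn)) {
		/* no struct page behind this PFN, cannot take a reference */
		ret = -EFAULT;
		goto cleanup;
	}
	pages[n] = pfn_to_page(pfn);
	get_page(pages[n]);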
>
> PFNMAP and non-PFNMAP buffers are inherently different, so I don't see a
> problem in handling them differently. We will likely run into an issue though,
> with hardware such as the OMAP TILER, where the memory isn't backed by normal
> memory (and thus no struct page is present), but for which the target must be
> pinned somehow (in the case of the tiler, that would be a tiler mapping). I
> don't think we have an API to ask the kernel to pin a memory range regardless
> of how the memory is handled (system memory, reserved memory with PFNMAP,
> device mapping such as with the tiler, ...). This is an open issue. One
> possible solution is to deprecate USERPTR support for that kind of memory and
> use dma-buf instead.
>
>> > The VM_PFNMAP flag is mostly used with memory out of the kernel
>> > allocator's control if I'm not mistaken. The main use case I've seen is
>> > memory reserved at boot time and use as a frame buffer for instance. In
>> > that case the pages can't go away, as there no page in the first place.
>> >
>> > This won't fix the DMA SG problem though (see below).
>> >
>> >>>> +                }
>> >>>> +        } else {
>> >>>> +                n = get_user_pages(current, current->mm, start & PAGE_MASK,
>> >>>> +                        n_pages, write, 1, pages, NULL);
>> >>>> +                if (n != n_pages) {
>> >>>> +                        printk(KERN_ERR "got only %d of %d user pages\n",
>> >>>> +                                n, n_pages);
>> >>>> +                        goto cleanup;
>> >>>> +                }
>> >>>> +        }
>> >>>> +
>> >>>> +        *copy_vma = vb2_get_vma(vma);
>> >>>> +        if (!*copy_vma) {
>> >>>> +                printk(KERN_ERR "failed to copy vma\n");
>> >>>> +                ret = -ENOMEM;
>> >>>> +                goto cleanup;
>> >>>> +        }
>> >>>
>> >>> Do we really need to make a copy of the VMA ? The only reason why we
>> >>> store a pointer to it is to check the flags in vb2_dc_put_userptr(). We
>> >>> could store the flags instead and avoid vb2_get_dma()/vb2_put_dma()
>> >>> calls altogether.
>> >>
>> >> I remember that there was a very good reason for copying this vma
>> >> structure.
>> >> You caught me on 'cargo-cult' programming.
>> >> I will do some reverse engineering and try to answer it soon.
>> >
>> > OK :-) I'm not copying the VMA in the OMAP3 ISP driver, which is why this
>> > caught my eyes. If you find the reason why copying it is needed, please
>> > add a comment to the code.
>>
>> The reason for copying the vma was that 'struct vm_area_struct' has no
>> reference counter. Therefore it could be deleted after the mm lock is
>> released, ending with all the pages belonging to the vma being freed. To
>> prevent this, a copy of the vma is created. Notice that inside vb2_get_vma
>> the open callback is called for the original vma, preventing the memory
>> from being released. On vb2_put_vma the complementary close is called.
>
> Feel free to prove me wrong, but I think get_user_pages() is enough to prevent
> the pages from being freed, even if the VMA is deleted.
>
> However, there's one subtle issue that we will need to deal with when we will
> implement cache management. It took me a lot of time to debug and fix it when
> I was working on the OMAP3 ISP driver, so I'll explain it in the hope that
> someone will find it later when dealing with the same problems :-)
>
> When a buffer is queued, the OMAP3 ISP driver cleans up the cache using the
> userspace mapping addresses (for USERPTR buffers). This might be a bad thing,
> but that's the way we currently implement that.
>
> A prior get_user_pages() call will make sure pages are not freed, but due to
> optimization in the lockless memory management algorithms the userspace
> mapping can disappear: the kernel might consider that a page can be freed,
> remove its userspace mapping, and then find out that the page is locked. It
> will then move on without restoring the userspace mapping, which will be
> restored when the next page fault occurs.
>
> When cleaning the cache using the userspace mapping addresses, any page for
> which the userspace mapping has been removed will trigger a page fault. The
> page fault handler (do_page_fault() in arch/arm/mm/fault.c) will try to 
> read_lock mmap_sem. If it fails, it will check if the page fault occurred in 
> userspace context, or from a known exception location. As neither condition is
> true, it will panic.
>
> The solution I found to this issue was to lock the VMA. This ensured that the
> userspace mapping would stay in place. See isp_video_buffer_lock_vma() in
> drivers/media/video/omap3isp/ispqueue.c. You could use a similar approach here
> if you want to ensure that userspace mappings are not removed, but once again
> I don't think that's needed (until we get to cache handling) as
> get_user_pages() will ensure that the pages are not freed.

I think the proper solution is to not use any user-allocated memory and always
use dma-buf. Some evil process thread might unlock the vma behind your back
and you are back to the original issue.

The Linux memory management is not designed to easily allow a device to do DMA
to/from user-allocated memory, at least not for the use case where the DMA
operation might happen over a long period of time.

I guess some VMA change might help, but it might also be hurtful, and I am not
familiar enough with the whole memory management to venture a guess on what
kind of implications there are.

Cheers,
Jerome Glisse
Laurent Pinchart March 27, 2012, 4:58 p.m. UTC | #9
Hi Jerome,

On Tuesday 27 March 2012 12:45:23 Jerome Glisse wrote:
> On Tue, Mar 27, 2012 at 11:01 AM, Laurent Pinchart wrote:
> > On Thursday 22 March 2012 16:58:27 Tomasz Stanislawski wrote:
> >> On 03/22/2012 03:42 PM, Laurent Pinchart wrote:
> >> > On Thursday 22 March 2012 14:36:33 Tomasz Stanislawski wrote:
> >> >> On 03/22/2012 11:50 AM, Laurent Pinchart wrote:
> >> >>> On Thursday 22 March 2012 11:02:23 Laurent Pinchart wrote:

[snip]

> >> >>>> +        *copy_vma = vb2_get_vma(vma);
> >> >>>> +        if (!*copy_vma) {
> >> >>>> +                printk(KERN_ERR "failed to copy vma\n");
> >> >>>> +                ret = -ENOMEM;
> >> >>>> +                goto cleanup;
> >> >>>> +        }
> >> >>> 
> >> >>> Do we really need to make a copy of the VMA ? The only reason why we
> >> >>> store a pointer to it is to check the flags in vb2_dc_put_userptr().
> >> >>> We could store the flags instead and avoid
> >> >>> vb2_get_dma()/vb2_put_dma() calls altogether.
> >> >> 
> >> >> I remember that there was a very good reason for copying this vma
> >> >> structure.
> >> >> You caught me on 'cargo-cult' programming.
> >> >> I will do some reverse engineering and try to answer it soon.
> >> > 
> >> > OK :-) I'm not copying the VMA in the OMAP3 ISP driver, which is why
> >> > this caught my eyes. If you find the reason why copying it is needed,
> >> > please add a comment to the code.
> >> 
> >> The reason for copying the vma was that 'struct vm_area_struct' has no
> >> reference counter. Therefore it could be deleted after the mm lock is
> >> released, ending with all the pages belonging to the vma being freed. To
> >> prevent this, a copy of the vma is created. Notice that inside
> >> vb2_get_vma the open callback is called for the original vma, preventing
> >> the memory from being released. On vb2_put_vma the complementary close
> >> is called.
> > 
> > Feel free to prove me wrong, but I think get_user_pages() is enough to
> > prevent the pages from being freed, even if the VMA is deleted.
> > 
> > However, there's one subtle issue that we will need to deal with when we
> > will implement cache management. It took me a lot of time to debug and
> > fix it when I was working on the OMAP3 ISP driver, so I'll explain it in
> > the hope that someone will find it later when dealing with the same
> > problems :-)
> > 
> > When a buffer is queued, the OMAP3 ISP driver cleans up the cache using
> > the userspace mapping addresses (for USERPTR buffers). This might be a bad
> > thing, but that's the way we currently implement that.
> > 
> > A prior get_user_pages() call will make sure pages are not freed, but due
> > to optimization in the lockless memory management algorithms the
> > userspace mapping can disappear: the kernel might consider that a page
> > can be freed, remove its userspace mapping, and then find out that the
> > page is locked. It will then move on without restoring the userspace
> > mapping, which will be restored when the next page fault occurs.
> > 
> > When cleaning the cache using the userspace mapping addresses, any page
> > for which the userspace mapping has been removed will trigger a page
> > fault. The page fault handler (do_page_fault() in arch/arm/mm/fault.c)
> > will try to read_lock mmap_sem. If it fails, it will check if the page
> > fault occurred in userspace context, or from a known exception location. As
> > neither condition is true, it will panic.
> > 
> > The solution I found to this issue was to lock the VMA. This ensured that
> > the userspace mapping would stay in place. See
> > isp_video_buffer_lock_vma() in drivers/media/video/omap3isp/ispqueue.c.
> > You could use a similar approach here if you want to ensure that
> > userspace mappings are not removed, but once again I don't think that's
> > needed (until we get to cache handling) as
> > get_user_pages() will ensure that the pages are not freed.
> 
> I think the proper solution is to not use any user-allocated memory and
> always use dma-buf. Some evil process thread might unlock the vma behind
> your back and you are back to the original issue.
> 
> The Linux memory management is not designed to easily allow a device to do
> DMA to/from user-allocated memory, at least not for the use case where the
> DMA operation might happen over a long period of time.
> 
> I guess some VMA change might help, but it might also be hurtful, and I am
> not familiar enough with the whole memory management to venture a guess on
> what kind of implications there are.

I agree with you, we should move away from using user-allocated memory. I just 
wanted to explain what I did and why for reference purposes, as it took me a 
lot of time to debug the issue. Until every driver moves to dma-buf (and for some 
time after as well) we will still need to support user-allocated memory in 
V4L2, even if the solution isn't completely bullet-proof.
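
For reference, the importer side of the dma-buf flow advocated above looks
roughly like this (a sketch against the dma-buf API of this era, error
handling trimmed; 'fd' is the file descriptor received from the exporter and
'dev' the importing device):

	struct dma_buf *dbuf = dma_buf_get(fd);
	struct dma_buf_attachment *attach = dma_buf_attach(dbuf, dev);
	struct sg_table *sgt = dma_buf_map_attachment(attach, DMA_FROM_DEVICE);
	dma_addr_t dma_addr = sg_dma_address(sgt->sgl);

	/* ... DMA to/from the buffer using dma_addr ... */

	dma_buf_unmap_attachment(attach, sgt, DMA_FROM_DEVICE);
	dma_buf_detach(dbuf, attach);
	dma_buf_put(dbuf);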
diff mbox

Patch

diff --git a/drivers/media/video/videobuf2-dma-contig.c b/drivers/media/video/videobuf2-dma-contig.c
index c898e6f..9965465 100644
--- a/drivers/media/video/videobuf2-dma-contig.c
+++ b/drivers/media/video/videobuf2-dma-contig.c
@@ -10,9 +10,12 @@ 
  * the Free Software Foundation.
  */
 
+#include <linux/dma-buf.h>
+#include <linux/dma-mapping.h>
 #include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
-#include <linux/dma-mapping.h>
 
 #include <media/videobuf2-core.h>
 #include <media/videobuf2-memops.h>
@@ -22,16 +25,115 @@  struct vb2_dc_buf {
 	void				*vaddr;
 	unsigned long			size;
 	dma_addr_t			dma_addr;
+	struct sg_table			*dma_sgt;
+	enum dma_data_direction		dma_dir;
 
 	/* MMAP related */
 	struct vb2_vmarea_handler	handler;
 	atomic_t			refcount;
+	struct sg_table			*sgt_base;
 
 	/* USERPTR related */
 	struct vm_area_struct		*vma;
 };
 
 /*********************************************/
+/*        scatterlist table functions        */
+/*********************************************/
+
+static struct sg_table *vb2_dc_pages_to_sgt(struct page **pages,
+	unsigned long n_pages, size_t offset, size_t offset2)
+{
+	struct sg_table *sgt;
+	int i, j; /* loop counters */
+	int cur_page, chunks;
+	int ret;
+	struct scatterlist *s;
+
+	sgt = kzalloc(sizeof *sgt, GFP_KERNEL);
+	if (!sgt)
+		return ERR_PTR(-ENOMEM);
+
+	/* compute number of chunks */
+	chunks = 1;
+	for (i = 1; i < n_pages; ++i)
+		if (pages[i] != pages[i - 1] + 1)
+			++chunks;
+
+	ret = sg_alloc_table(sgt, chunks, GFP_KERNEL);
+	if (ret) {
+		kfree(sgt);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* merging chunks and putting them into the scatterlist */
+	cur_page = 0;
+	for_each_sg(sgt->sgl, s, sgt->orig_nents, i) {
+		size_t size = PAGE_SIZE;
+
+		for (j = cur_page + 1; j < n_pages; ++j) {
+			if (pages[j] != pages[j - 1] + 1)
+				break;
+			size += PAGE_SIZE;
+		}
+
+		/* cut offset if chunk starts at the first page */
+		if (cur_page == 0)
+			size -= offset;
+		/* cut offset2 if chunk ends at the last page */
+		if (j == n_pages)
+			size -= offset2;
+
+		sg_set_page(s, pages[cur_page], size, offset);
+		offset = 0;
+		cur_page = j;
+	}
+
+	return sgt;
+}
+
+static void vb2_dc_release_sgtable(struct sg_table *sgt)
+{
+	sg_free_table(sgt);
+	kfree(sgt);
+}
+
+static void vb2_dc_put_sgtable(struct sg_table *sgt, int dirty)
+{
+	struct scatterlist *s;
+	int i, j;
+
+	for_each_sg(sgt->sgl, s, sgt->nents, i) {
+		struct page *page = sg_page(s);
+		int n_pages = PAGE_ALIGN(s->offset + s->length) >> PAGE_SHIFT;
+
+		for (j = 0; j < n_pages; ++j, ++page) {
+			if (dirty)
+				set_page_dirty_lock(page);
+			put_page(page);
+		}
+	}
+
+	vb2_dc_release_sgtable(sgt);
+}
+
+static unsigned long vb2_dc_get_contiguous_size(struct sg_table *sgt)
+{
+	struct scatterlist *s;
+	dma_addr_t expected = sg_dma_address(sgt->sgl);
+	int i;
+	unsigned long size = 0;
+
+	for_each_sg(sgt->sgl, s, sgt->nents, i) {
+		if (sg_dma_address(s) != expected)
+			break;
+		expected = sg_dma_address(s) + sg_dma_len(s);
+		size += sg_dma_len(s);
+	}
+	return size;
+}
+
+/*********************************************/
 /*         callbacks for all buffers         */
 /*********************************************/
 
@@ -45,8 +147,6 @@  static void *vb2_dc_cookie(void *buf_priv)
 static void *vb2_dc_vaddr(void *buf_priv)
 {
 	struct vb2_dc_buf *buf = buf_priv;
-	if (!buf)
-		return 0;
 
 	return buf->vaddr;
 }
@@ -58,6 +158,28 @@  static unsigned int vb2_dc_num_users(void *buf_priv)
 	return atomic_read(&buf->refcount);
 }
 
+static void vb2_dc_prepare(void *buf_priv)
+{
+	struct vb2_dc_buf *buf = buf_priv;
+	struct sg_table *sgt = buf->dma_sgt;
+
+	if (!sgt)
+		return;
+
+	dma_sync_sg_for_device(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
+}
+
+static void vb2_dc_finish(void *buf_priv)
+{
+	struct vb2_dc_buf *buf = buf_priv;
+	struct sg_table *sgt = buf->dma_sgt;
+
+	if (!sgt)
+		return;
+
+	dma_sync_sg_for_cpu(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
+}
+
 /*********************************************/
 /*        callbacks for MMAP buffers         */
 /*********************************************/
@@ -66,31 +188,70 @@  static void vb2_dc_put(void *buf_priv)
 {
 	struct vb2_dc_buf *buf = buf_priv;
 
-	if (atomic_dec_and_test(&buf->refcount)) {
-		dma_free_coherent(buf->dev, buf->size, buf->vaddr,
-				  buf->dma_addr);
-		kfree(buf);
-	}
+	if (!atomic_dec_and_test(&buf->refcount))
+		return;
+
+	vb2_dc_release_sgtable(buf->sgt_base);
+	dma_free_coherent(buf->dev, buf->size, buf->vaddr,
+		buf->dma_addr);
+	kfree(buf);
 }
 
 static void *vb2_dc_alloc(void *alloc_ctx, unsigned long size)
 {
 	struct device *dev = alloc_ctx;
 	struct vb2_dc_buf *buf;
+	int ret;
+	int n_pages;
+	struct page **pages = NULL;
 
 	buf = kzalloc(sizeof *buf, GFP_KERNEL);
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
-	buf->vaddr = dma_alloc_coherent(dev, size, &buf->dma_addr, GFP_KERNEL);
+	buf->dev = dev;
+	buf->size = size;
+	buf->vaddr = dma_alloc_coherent(buf->dev, buf->size, &buf->dma_addr,
+		GFP_KERNEL);
+
+	ret = -ENOMEM;
 	if (!buf->vaddr) {
-		dev_err(dev, "dma_alloc_coherent of size %ld failed\n", size);
-		kfree(buf);
-		return ERR_PTR(-ENOMEM);
+		dev_err(dev, "dma_alloc_coherent of size %ld failed\n",
+			size);
+		goto fail_buf;
 	}
 
-	buf->dev = dev;
-	buf->size = size;
+	WARN_ON((unsigned long)buf->vaddr & ~PAGE_MASK);
+	WARN_ON(buf->dma_addr & ~PAGE_MASK);
+
+	n_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
+	if (!pages) {
+		printk(KERN_ERR "failed to alloc page table\n");
+		goto fail_dma;
+	}
+
+	ret = dma_get_pages(dev, buf->vaddr, buf->dma_addr, pages, n_pages);
+	if (ret < 0) {
+		printk(KERN_ERR "failed to get buffer pages from DMA API\n");
+		goto fail_pages;
+	}
+	if (ret != n_pages) {
+		ret = -EFAULT;
+		printk(KERN_ERR "failed to get all pages from DMA API\n");
+		goto fail_pages;
+	}
+
+	buf->sgt_base = vb2_dc_pages_to_sgt(pages, n_pages, 0, 0);
+	if (IS_ERR(buf->sgt_base)) {
+		ret = PTR_ERR(buf->sgt_base);
+		printk(KERN_ERR "failed to prepare sg table\n");
+		goto fail_pages;
+	}
+
+	/* pages are no longer needed */
+	kfree(pages);
 
 	buf->handler.refcount = &buf->refcount;
 	buf->handler.put = vb2_dc_put;
@@ -99,59 +260,226 @@  static void *vb2_dc_alloc(void *alloc_ctx, unsigned long size)
 	atomic_inc(&buf->refcount);
 
 	return buf;
+
+fail_pages:
+	kfree(pages);
+
+fail_dma:
+	dma_free_coherent(buf->dev, buf->size, buf->vaddr, buf->dma_addr);
+
+fail_buf:
+	kfree(buf);
+
+	return ERR_PTR(ret);
 }
 
 static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma)
 {
 	struct vb2_dc_buf *buf = buf_priv;
+	int ret;
+
+	/*
+	 * dma_mmap_* uses vm_pgoff as in-buffer offset, but we want to
+	 * map the whole buffer
+	 */
+	vma->vm_pgoff = 0;
+
+	ret = dma_mmap_writecombine(buf->dev, vma, buf->vaddr,
+		buf->dma_addr, buf->size);
 
-	if (!buf) {
-		printk(KERN_ERR "No buffer to map\n");
-		return -EINVAL;
+	if (ret) {
+		printk(KERN_ERR "Remapping memory failed, error: %d\n", ret);
+		return ret;
 	}
 
-	return vb2_mmap_pfn_range(vma, buf->dma_addr, buf->size,
-				  &vb2_common_vm_ops, &buf->handler);
+	vma->vm_flags		|= VM_DONTEXPAND | VM_RESERVED;
+	vma->vm_private_data	= &buf->handler;
+	vma->vm_ops		= &vb2_common_vm_ops;
+
+	vma->vm_ops->open(vma);
+
+	printk(KERN_DEBUG "%s: mapped dma addr 0x%08lx at 0x%08lx, size %ld\n",
+		__func__, (unsigned long)buf->dma_addr, vma->vm_start,
+		buf->size);
+
+	return 0;
 }
 
 /*********************************************/
 /*       callbacks for USERPTR buffers       */
 /*********************************************/
 
+static inline int vma_is_io(struct vm_area_struct *vma)
+{
+	return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
+}
+
+static int vb2_dc_get_pages(unsigned long start, struct page **pages,
+	int n_pages, struct vm_area_struct **copy_vma, int write)
+{
+	struct vm_area_struct *vma;
+	int n = 0; /* number of pages obtained */
+	int ret = -EFAULT;
+
+	/* entering critical section for mm access */
+	down_read(&current->mm->mmap_sem);
+
+	vma = find_vma(current->mm, start);
+	if (!vma) {
+		printk(KERN_ERR "no vma for address %lu\n", start);
+		goto cleanup;
+	}
+
+	if (vma_is_io(vma)) {
+		unsigned long pfn;
+
+		if (vma->vm_end - start < n_pages * PAGE_SIZE) {
+			printk(KERN_ERR "vma is too small\n");
+			goto cleanup;
+		}
+
+		for (n = 0; n < n_pages; ++n, start += PAGE_SIZE) {
+			ret = follow_pfn(vma, start, &pfn);
+			if (ret) {
+				printk(KERN_ERR "no page for address %lu\n",
+					start);
+				goto cleanup;
+			}
+			pages[n] = pfn_to_page(pfn);
+			get_page(pages[n]);
+		}
+	} else {
+		n = get_user_pages(current, current->mm, start & PAGE_MASK,
+			n_pages, write, 1, pages, NULL);
+		if (n != n_pages) {
+			printk(KERN_ERR "got only %d of %d user pages\n",
+				n, n_pages);
+			goto cleanup;
+		}
+	}
+
+	*copy_vma = vb2_get_vma(vma);
+	if (!*copy_vma) {
+		printk(KERN_ERR "failed to copy vma\n");
+		ret = -ENOMEM;
+		goto cleanup;
+	}
+
+	/* leaving critical section for mm access */
+	up_read(&current->mm->mmap_sem);
+
+	return 0;
+
+cleanup:
+	up_read(&current->mm->mmap_sem);
+
+	/* putting user pages if used, can be done without the lock */
+	while (n)
+		put_page(pages[--n]);
+
+	return ret;
+}
+
 static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
-					unsigned long size, int write)
+	unsigned long size, int write)
 {
 	struct vb2_dc_buf *buf;
-	struct vm_area_struct *vma;
-	dma_addr_t dma_addr = 0;
-	int ret;
+	unsigned long start, end, offset, offset2;
+	struct page **pages;
+	int n_pages;
+	int ret = 0;
+	struct sg_table *sgt;
+	unsigned long contig_size;
 
 	buf = kzalloc(sizeof *buf, GFP_KERNEL);
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
-	ret = vb2_get_contig_userptr(vaddr, size, &vma, &dma_addr);
+	buf->dev = alloc_ctx;
+	buf->dma_dir = write ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+	start = (unsigned long)vaddr & PAGE_MASK;
+	offset = (unsigned long)vaddr & ~PAGE_MASK;
+	end = PAGE_ALIGN((unsigned long)vaddr + size);
+	offset2 = end - (unsigned long)vaddr - size;
+	n_pages = (end - start) >> PAGE_SHIFT;
+
+	pages = kmalloc(n_pages * sizeof pages[0], GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		printk(KERN_ERR "failed to allocate pages table\n");
+		goto fail_buf;
+	}
+
+	/* extract page list from userspace mapping */
+	ret = vb2_dc_get_pages(start, pages, n_pages, &buf->vma, write);
 	if (ret) {
-		printk(KERN_ERR "Failed acquiring VMA for vaddr 0x%08lx\n",
-				vaddr);
-		kfree(buf);
-		return ERR_PTR(ret);
+		printk(KERN_ERR "failed to get user pages\n");
+		goto fail_pages;
+	}
+
+	sgt = vb2_dc_pages_to_sgt(pages, n_pages, offset, offset2);
+	if (!sgt) {
+		printk(KERN_ERR "failed to create scatterlist table\n");
+		ret = -ENOMEM;
+		goto fail_get_pages;
 	}
 
+	/* pages are no longer needed */
+	kfree(pages);
+	pages = NULL;
+
+	sgt->nents = dma_map_sg(buf->dev, sgt->sgl, sgt->orig_nents,
+		buf->dma_dir);
+	if (sgt->nents <= 0) {
+		printk(KERN_ERR "failed to map scatterlist\n");
+		ret = -EIO;
+		goto fail_sgt;
+	}
+
+	contig_size = vb2_dc_get_contiguous_size(sgt);
+	if (contig_size < size) {
+		printk(KERN_ERR "contiguous mapping is too small %lu/%lu\n",
+			contig_size, size);
+		ret = -EFAULT;
+		goto fail_map_sg;
+	}
+
+	buf->dma_addr = sg_dma_address(sgt->sgl);
 	buf->size = size;
-	buf->dma_addr = dma_addr;
-	buf->vma = vma;
+	buf->dma_sgt = sgt;
+
+	atomic_inc(&buf->refcount);
 
 	return buf;
+
+fail_map_sg:
+	dma_unmap_sg(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
+
+fail_sgt:
+	vb2_dc_put_sgtable(sgt, 0);
+
+fail_get_pages:
+	while (pages && n_pages)
+		put_page(pages[--n_pages]);
+	vb2_put_vma(buf->vma);
+
+fail_pages:
+	kfree(pages); /* kfree is NULL-proof */
+
+fail_buf:
+	kfree(buf);
+
+	return ERR_PTR(ret);
 }
 
-static void vb2_dc_put_userptr(void *mem_priv)
+static void vb2_dc_put_userptr(void *buf_priv)
 {
-	struct vb2_dc_buf *buf = mem_priv;
-
-	if (!buf)
-		return;
+	struct vb2_dc_buf *buf = buf_priv;
+	struct sg_table *sgt = buf->dma_sgt;
 
+	dma_unmap_sg(buf->dev, sgt->sgl, sgt->orig_nents, buf->dma_dir);
+	vb2_dc_put_sgtable(sgt, !vma_is_io(buf->vma));
 	vb2_put_vma(buf->vma);
 	kfree(buf);
 }
@@ -168,6 +496,8 @@  const struct vb2_mem_ops vb2_dma_contig_memops = {
 	.mmap		= vb2_dc_mmap,
 	.get_userptr	= vb2_dc_get_userptr,
 	.put_userptr	= vb2_dc_put_userptr,
+	.prepare	= vb2_dc_prepare,
+	.finish		= vb2_dc_finish,
 	.num_users	= vb2_dc_num_users,
 };
 EXPORT_SYMBOL_GPL(vb2_dma_contig_memops);