diff mbox series

[v2,3/5] dma-buf: heaps: support alloc async read file

Message ID 20240730075755.10941-4-link@vivo.com
State New
Headers show
Series Introduce DMA_HEAP_ALLOC_AND_READ_FILE heap flag | expand

Commit Message

Huan Yang July 30, 2024, 7:57 a.m. UTC
This patch completes the infrastructure for async read. It treats memory
allocation as the producer and assigns file reading to the heap_fwork_t
thread as the consumer.

The heap needs to gather each allocated page and, when a certain amount
(default 128MB) is gathered, it will package and pass it to heap_fwork_t
to initiate file reading.

This process is handled by the helper function
dma_heap_gather_file_page. Every heap declares a task, passes each
allocated page to it, and then waits for the file read to finish before
returning the dma-buf.

Because the memory allocation and file reading correspond to each other,
the number of gathers during the prepare process and submit process can
determine the offset in the file as well as the size to be read.

When the gathered pages trigger a read, they are packaged into a work
item and passed to the heap_fwork_t thread. The work item contains the
offset and size of the file range to read, the buffer obtained by mapping
the gathered pages with vmap, and the credentials used during the read.

The buffer for file reading is provided by mapping the gathered pages to
vmalloc. This means that if direct I/O is used to read a file, the file
content will be directly transferred to the corresponding memory of the
dma-buf, without the need for additional CPU copying and intermediate
buffers.

Although direct I/O requires page-aligned sizes, this patch
automatically adapts to the file size and uses buffered I/O to read the
unaligned part.

Note that heap_fwork_t is a single-threaded process, which means that
the file read work is executed serially. Considering that the default
I/O amount initiated at a time is 128MB, which is already quite large,
multiple threads will not help accelerate I/O performance.

So, this is better suited to reading large files into a dma-buf.

Signed-off-by: Huan Yang <link@vivo.com>
---
 drivers/dma-buf/dma-heap.c | 423 ++++++++++++++++++++++++++++++++++++-
 include/linux/dma-heap.h   |  45 ++++
 2 files changed, 462 insertions(+), 6 deletions(-)
diff mbox series

Patch

diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
index 91e241763ebc..df1b2518f126 100644
--- a/drivers/dma-buf/dma-heap.c
+++ b/drivers/dma-buf/dma-heap.c
@@ -18,6 +18,7 @@ 
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
 #include <linux/dma-heap.h>
+#include <linux/vmalloc.h>
 #include <uapi/linux/dma-heap.h>
 
 #define DEVNAME "dma_heap"
@@ -46,42 +47,419 @@  struct dma_heap {
 /**
  * struct dma_heap_file - wrap the file, read task for dma_heap allocate use.
  * @file:		file to read from.
+ * @cred:		credentials prepared from the allocating task; the read
+ *			kthread overrides its own credentials with these while
+ *			it reads @file.
+ * @glimit:		gather limit in bytes: the page-aligned file size, capped
+ *			at 128MB. Whenever the gathered pages reach this amount,
+ *			file I/O is triggered. It also determines how many
+ *			entries the page array of each read task has.
  * @fsize:		file size.
+ * @direct:		true when @file was opened with O_DIRECT.
  */
 struct dma_heap_file {
 	struct file *file;
+	struct cred *cred;
+	size_t glimit;
 	size_t fsize;
+	bool direct;
 };
 
+/**
+ * struct dma_heap_file_work - one queued file read for the read kthread.
+ * @vaddr:		contiguous virtual address obtained from vmap() of the
+ *			gathered pages; used as the file read buffer.
+ *
+ * @start_size:		file offset the read starts at, copied from
+ *			@dma_heap_file_task->roffset.
+ *
+ * @need_size:		number of bytes to read, copied from
+ *			@dma_heap_file_task->rsize (reduced by one page when the
+ *			unaligned tail of a direct I/O file is read separately).
+ *
+ * @heap_file:		file wrapper.
+ *
+ * @list:		node in @dma_heap_file_control->works.
+ *
+ * @refp:		points to @dma_heap_file_task->ref; decremented when this
+ *			work is torn down.
+ *
+ * @failp:		points to @dma_heap_file_task->fail; set to true if this
+ *			work's file read fails.
+ */
+struct dma_heap_file_work {
+	void *vaddr;
+	ssize_t start_size;
+	ssize_t need_size;
+	struct dma_heap_file *heap_file;
+	struct list_head list;
+	atomic_t *refp;
+	bool *failp;
+};
+
+/**
+ * struct dma_heap_file_task - represents a dma_heap file read process
+ * @ref:		number of file works created for this task that have not
+ *			been torn down yet; when zero, no read is outstanding.
+ *
+ * @roffset:		file offset at which the next submitted work starts;
+ *			advanced by @rsize after every submit.
+ *
+ * @rsize:		bytes of pages gathered since the last submit; when it
+ *			reaches @nr_gathered, a work is submitted.
+ *
+ * @nr_gathered:	gather threshold in bytes for the current work: the
+ *			minimum of @dma_heap_file->glimit and the remaining
+ *			page-aligned file size.
+ *
+ * @heap_file:		current dma_heap_file
+ *
+ * @parray:		page array handed to vmap(). It has
+ *			@dma_heap_file->glimit >> PAGE_SHIFT entries (the most a
+ *			single work needs) and is reused for every work of this
+ *			task. Each entry covers PAGE_SIZE, as vmap() requires.
+ *
+ * @pindex:		index of the next free slot in @parray.
+ *
+ * @fail:		set to true when any work's file read failed.
+ *
+ * dma_heap_file_task is the producer side of the file read: while the heap
+ * allocates the dma_buf pages, each page is gathered into the task; once the
+ * gathered size reaches @nr_gathered a work is submitted and the next one is
+ * prepared. After all works are queued the caller goes on preparing the
+ * dma_buf, but before returning the dma_buf fd it must wait for the file reads
+ * to end and check the read result.
+ */
+struct dma_heap_file_task {
+	atomic_t ref;
+	size_t roffset;
+	size_t rsize;
+	size_t nr_gathered;
+	struct dma_heap_file *heap_file;
+	struct page **parray;
+	unsigned int pindex;
+	bool fail;
+};
+
+/**
+ * struct dma_heap_file_control - global control of dma_heap file read.
+ * @works:		list head of the queued @dma_heap_file_work.
+ *
+ * @lock:		protects @works only.
+ *
+ * @threadwq:		wait queue @work_thread sleeps on; woken when a work is
+ *			submitted so the thread can read that work's file
+ *			content.
+ *
+ * @workwq:		wait queue the allocating thread sleeps on until the
+ *			@dma_heap_file_task ref drops to zero, i.e. when the
+ *			allocation ended before the file reads did.
+ *
+ * @work_thread:	file read kthread, the consumer of @dma_heap_file_work.
+ *
+ * @heap_fwork_cachep:	slab cache for @dma_heap_file_work, which is allocated
+ *			and freed frequently.
+ *
+ * @nr_work:		number of works submitted and not yet accounted as
+ *			handled by @work_thread.
+ */
+struct dma_heap_file_control {
+	struct list_head works;
+	spinlock_t lock; // only lock for @works.
+	wait_queue_head_t threadwq;
+	wait_queue_head_t workwq;
+	struct task_struct *work_thread;
+	struct kmem_cache *heap_fwork_cachep;
+	atomic_t nr_work;
+};
+
+static struct dma_heap_file_control *heap_fctl;
 static LIST_HEAD(heap_list);
 static DEFINE_MUTEX(heap_list_lock);
 static dev_t dma_heap_devt;
 static struct class *dma_heap_class;
 static DEFINE_XARRAY_ALLOC(dma_heap_minors);
 
+/*
+ * init_file_work - build a file work from the pages gathered so far.
+ *
+ * Returns NULL when the task already recorded a failure, or when allocating
+ * the work or mapping the pages fails. On success the task's ref is raised by
+ * one on behalf of the returned work.
+ */
+static struct dma_heap_file_work *
+init_file_work(struct dma_heap_file_task *heap_ftask)
+{
+	struct dma_heap_file_work *fwork;
+	void *vaddr;
+
+	// a previous work already failed, no point in reading any further.
+	if (READ_ONCE(heap_ftask->fail))
+		return NULL;
+
+	fwork = kmem_cache_alloc(heap_fctl->heap_fwork_cachep, GFP_KERNEL);
+	if (unlikely(!fwork))
+		return NULL;
+
+	/**
+	 * The gathered pages may be physically scattered; vmap() gives them
+	 * one contiguous virtual range that serves as the file read buffer.
+	 * With direct I/O the file content then lands in the dma-buf pages
+	 * without any extra copy.
+	 *
+	 * Once mapped, the page array is no longer needed by this work, so
+	 * the caller can keep using it and go on building the dma-buf even
+	 * though the file read has not ended.
+	 */
+	vaddr = vmap(heap_ftask->parray, heap_ftask->pindex, VM_MAP,
+		     PAGE_KERNEL);
+	if (unlikely(!vaddr))
+		goto free_work;
+
+	fwork->vaddr = vaddr;
+	fwork->heap_file = heap_ftask->heap_file;
+	fwork->start_size = heap_ftask->roffset;
+	fwork->need_size = heap_ftask->rsize;
+	fwork->refp = &heap_ftask->ref;
+	fwork->failp = &heap_ftask->fail;
+	atomic_inc(&heap_ftask->ref);
+	return fwork;
+
+free_work:
+	kmem_cache_free(heap_fctl->heap_fwork_cachep, fwork);
+	return NULL;
+}
+
+/*
+ * deinit_file_work - tear down a file work once it is no longer needed.
+ *
+ * Unmaps the vmap()ed read buffer, drops the reference this work holds on its
+ * task, wakes the waiters on workwq and returns the work to its slab cache.
+ */
+static void deinit_file_work(struct dma_heap_file_work *heap_fwork)
+{
+	vunmap(heap_fwork->vaddr);
+	atomic_dec(heap_fwork->refp);
+	// waiters on workwq re-check their task's ref once woken.
+	wake_up(&heap_fctl->workwq);
+
+	kmem_cache_free(heap_fctl->heap_fwork_cachep, heap_fwork);
+}
+
+/**
+ * dma_heap_submit_file_read - queue a read for the pages gathered so far
+ * @heap_ftask:			info that current IO needs
+ *
+ * Builds a file work from the gathered pages, appends it to the global work
+ * list and wakes the read kthread.
+ *
+ * For direct I/O, when this submission reaches past the end of the file (the
+ * file size is not page aligned), the last gathered page is left out of the
+ * queued work and is filled here, in the caller's context, by a buffered read
+ * of the file tail.
+ *
+ * On success the task is advanced: roffset moves forward by rsize, rsize and
+ * pindex are reset and nr_gathered is clamped to the remaining page-aligned
+ * file size.
+ *
+ * NOTE(review): the error returns of the tail read happen after the work has
+ * already been queued, and they leave roffset/rsize/pindex untouched.
+ *
+ * Returns:
+ *   0 if all right, negative if something wrong
+ */
+static int dma_heap_submit_file_read(struct dma_heap_file_task *heap_ftask)
+{
+	struct dma_heap_file_work *heap_fwork = init_file_work(heap_ftask);
+	struct page *last = NULL;
+	struct dma_heap_file *heap_file = heap_ftask->heap_file;
+	size_t start = heap_ftask->roffset;
+	struct file *file = heap_file->file;
+	size_t fsz = heap_file->fsize;
+
+	if (unlikely(!heap_fwork))
+		return -ENOMEM;
+
+	/**
+	 * If the file size is not page aligned, direct I/O can't process the
+	 * tail. So when this work reaches the tail, take the last page out of
+	 * the work and remember it for the buffered read below.
+	 */
+	if (heap_file->direct && start + heap_ftask->rsize > fsz) {
+		heap_fwork->need_size -= PAGE_SIZE;
+		last = heap_ftask->parray[heap_ftask->pindex - 1];
+	}
+
+	spin_lock(&heap_fctl->lock);
+	list_add_tail(&heap_fwork->list, &heap_fctl->works);
+	spin_unlock(&heap_fctl->lock);
+	atomic_inc(&heap_fctl->nr_work);
+
+	wake_up(&heap_fctl->threadwq);
+
+	if (last) {
+		char *buf, *pathp;
+		ssize_t err;
+		void *buffer;
+
+		buf = kmalloc(PATH_MAX, GFP_KERNEL);
+		if (unlikely(!buf))
+			return -ENOMEM;
+
+		start = PAGE_ALIGN_DOWN(fsz);
+
+		pathp = file_path(file, buf, PATH_MAX);
+		if (IS_ERR(pathp)) {
+			kfree(buf);
+			return PTR_ERR(pathp);
+		}
+
+		// read the tail by path, into the last page's kernel mapping.
+		buffer = kmap_local_page(last);
+		err = kernel_read_file_from_path(pathp, start, &buffer,
+						 fsz - start, &fsz,
+						 READING_POLICY);
+		kunmap_local(buffer);
+		kfree(buf);
+		if (err < 0)
+			return err;
+	}
+
+	heap_ftask->roffset += heap_ftask->rsize;
+	heap_ftask->rsize = 0;
+	heap_ftask->pindex = 0;
+	heap_ftask->nr_gathered = min_t(size_t,
+					PAGE_ALIGN(fsz) - heap_ftask->roffset,
+					heap_ftask->nr_gathered);
+	return 0;
+}
+
+/*
+ * dma_heap_gather_file_page - add one allocated (possibly compound) page to
+ * the task and submit a file read once enough pages are gathered.
+ *
+ * Every PAGE_SIZE sub-page of @page gets its own slot in the task's page
+ * array, because vmap() works on PAGE_SIZE entries.
+ *
+ * Returns 0 when the page was gathered (and, if the threshold was reached,
+ * the read was submitted), -EINVAL when @page does not fit into the page
+ * array, or the error of the submit.
+ */
+int dma_heap_gather_file_page(struct dma_heap_file_task *heap_ftask,
+			      struct page *page)
+{
+	struct page **array = heap_ftask->parray;
+	// the page array was sized from glimit when the task was declared.
+	size_t capacity = heap_ftask->heap_file->glimit >> PAGE_SHIFT;
+	size_t index = heap_ftask->pindex;
+	size_t num = compound_nr(page), i;
+
+	/*
+	 * Never write past the page array: a compound page crossing the
+	 * gather limit, or a gather after a failed submit (which leaves
+	 * pindex where it was), would otherwise corrupt memory.
+	 */
+	if (unlikely(index > capacity || num > capacity - index))
+		return -EINVAL;
+
+	for (i = 0; i < num; ++i)
+		array[index++] = &page[i];
+	heap_ftask->pindex = index;
+	heap_ftask->rsize += page_size(page);
+
+	if (heap_ftask->rsize < heap_ftask->nr_gathered)
+		return 0;
+
+	// already reach to limit, trigger file read.
+	return dma_heap_submit_file_read(heap_ftask);
+}
+
+/*
+ * dma_heap_wait_for_file_read - wait until no file work of the task is left.
+ *
+ * Returns 0 when every read succeeded, -EIO when any work failed.
+ */
+int dma_heap_wait_for_file_read(struct dma_heap_file_task *heap_ftask)
+{
+	/*
+	 * This wait must not be cut short by a signal: queued works keep
+	 * pointers to heap_ftask->ref and heap_ftask->fail and read into the
+	 * gathered pages, and callers free those right after we return. An
+	 * interruptible wait whose return value is ignored would hand them a
+	 * task that is still in use.
+	 */
+	wait_event(heap_fctl->workwq, atomic_read(&heap_ftask->ref) == 0);
+	return READ_ONCE(heap_ftask->fail) ? -EIO : 0;
+}
+
+/*
+ * dma_heap_end_file_read - wait for the task's file reads, then free the task.
+ *
+ * Returns what the wait reported; the page array and the task itself are
+ * released in either case.
+ */
+int dma_heap_end_file_read(struct dma_heap_file_task *heap_ftask)
+{
+	const int status = dma_heap_wait_for_file_read(heap_ftask);
+
+	kvfree(heap_ftask->parray);
+	kfree(heap_ftask);
+	return status;
+}
+
+/*
+ * dma_heap_declare_file_read - allocate a read task for @heap_file.
+ *
+ * Returns the new task, or NULL when the task or its page array cannot be
+ * allocated.
+ */
+struct dma_heap_file_task *
+dma_heap_declare_file_read(struct dma_heap_file *heap_file)
+{
+	struct dma_heap_file_task *ftask;
+	struct page **pages;
+
+	ftask = kzalloc(sizeof(*ftask), GFP_KERNEL);
+	if (unlikely(!ftask))
+		return NULL;
+
+	/**
+	 * No single work gathers more than glimit bytes, so a page array with
+	 * glimit's number of pages is enough for every work of this task.
+	 */
+	pages = kvmalloc_array(heap_file->glimit >> PAGE_SHIFT,
+			       sizeof(struct page *), GFP_KERNEL);
+	if (unlikely(!pages)) {
+		kfree(ftask);
+		return NULL;
+	}
+
+	ftask->parray = pages;
+	ftask->heap_file = heap_file;
+	ftask->nr_gathered = heap_file->glimit;
+	return ftask;
+}
+
+/*
+ * __work_this_io - perform the file read described by one work.
+ *
+ * Reads need_size bytes at offset start_size into the work's vmap()ed buffer
+ * and flags the owning task as failed when the read returns an error.
+ */
+static void __work_this_io(struct dma_heap_file_work *heap_fwork)
+{
+	struct dma_heap_file *hfile = heap_fwork->heap_file;
+	void *buf = heap_fwork->vaddr;
+	const struct cred *saved;
+	ssize_t ret;
+
+	// read with the cred prepared from the allocating task ...
+	saved = override_creds(hfile->cred);
+	ret = kernel_read_file(hfile->file, heap_fwork->start_size, &buf,
+			       heap_fwork->need_size, &hfile->fsize,
+			       READING_POLICY);
+	if (ret < 0)
+		WRITE_ONCE(*heap_fwork->failp, true);
+	// ... and switch back to this thread's own cred afterwards.
+	revert_creds(saved);
+}
+
+/*
+ * dma_heap_file_work_thread - the file read kthread, consumer of the works.
+ * @data:	the struct dma_heap_file_control this thread serves.
+ *
+ * Sleeps until a work is submitted or the thread is asked to stop. On wakeup
+ * it takes every queued work off the shared list and reads them one by one.
+ * When asked to stop, works still queued are marked failed and torn down
+ * without being read.
+ *
+ * Always returns 0.
+ */
+static int dma_heap_file_work_thread(void *data)
+{
+	struct dma_heap_file_control *fctl = data;
+	struct dma_heap_file_work *worker, *tmp;
+	int nr_work;
+
+	LIST_HEAD(workers);
+
+	while (true) {
+		/*
+		 * kthread_stop() only wakes this thread up; unless the stop
+		 * request is part of the wait condition the thread goes back
+		 * to sleep and kthread_stop() never returns.
+		 */
+		wait_event_freezable(fctl->threadwq,
+				     atomic_read(&fctl->nr_work) > 0 ||
+				     kthread_should_stop());
+recheck:
+		// grab everything queued so far, the lock covers only the list.
+		spin_lock(&fctl->lock);
+		list_splice_init(&fctl->works, &workers);
+		spin_unlock(&fctl->lock);
+
+		if (unlikely(kthread_should_stop())) {
+			list_for_each_entry_safe(worker, tmp, &workers, list) {
+				list_del(&worker->list);
+				// never read: the waiter must not see success.
+				WRITE_ONCE(*worker->failp, true);
+				deinit_file_work(worker);
+			}
+			break;
+		}
+
+		nr_work = 0;
+		list_for_each_entry_safe(worker, tmp, &workers, list) {
+			++nr_work;
+			list_del(&worker->list);
+			__work_this_io(worker);
+
+			deinit_file_work(worker);
+		}
+
+		// works submitted while we were reading: handle them right away.
+		if (atomic_sub_return(nr_work, &fctl->nr_work) > 0)
+			goto recheck;
+	}
+	return 0;
+}
+
+/* dma_heap_file_size - report the file size recorded in @heap_file. */
+size_t dma_heap_file_size(struct dma_heap_file *heap_file)
+{
+	const size_t size = heap_file->fsize;
+
+	return size;
+}
+
 static int init_dma_heap_file(struct dma_heap_file *heap_file, int file_fd)
 {
 	struct file *file;
 	size_t fsz;
+	int ret;
 
 	file = fget(file_fd);
 	if (!file)
 		return -EINVAL;
 
-	// Direct I/O only support PAGE_SIZE aligned files.
 	fsz = i_size_read(file_inode(file));
-	if (file->f_flags & O_DIRECT && !PAGE_ALIGNED(fsz))
-		return -EINVAL;
 
-	heap_file->fsize = fsz;
+	/**
+	 * The file is read later by the read kthread, and SELinux may block
+	 * a read done with that thread's own credentials.
+	 * So prepare credentials from the current task here; the kthread
+	 * overrides its credentials with them for the read and reverts them
+	 * when the read ends.
+	 */
+	heap_file->cred = prepare_kernel_cred(current);
+	if (unlikely(!heap_file->cred)) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
 	heap_file->file = file;
+#define DEFAULT_DMA_BUF_HEAPS_GATHER_LIMIT (128 << 20)
+	heap_file->glimit = min_t(size_t, PAGE_ALIGN(fsz),
+				  DEFAULT_DMA_BUF_HEAPS_GATHER_LIMIT);
+	heap_file->fsize = fsz;
+
+	heap_file->direct = file->f_flags & O_DIRECT;
 
 	return 0;
+
+err:
+	fput(file);
+	return ret;
 }
 
+/* Drop the file and credential references held by @heap_file. */
 static void deinit_dma_heap_file(struct dma_heap_file *heap_file)
 {
 	fput(heap_file->file);
+	put_cred(heap_file->cred);
 }
 
 /**
@@ -443,11 +821,44 @@  static int dma_heap_init(void)
 
 	dma_heap_class = class_create(DEVNAME);
 	if (IS_ERR(dma_heap_class)) {
-		unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
-		return PTR_ERR(dma_heap_class);
+		ret = PTR_ERR(dma_heap_class);
+		goto fail_class;
 	}
 	dma_heap_class->devnode = dma_heap_devnode;
 
+	heap_fctl = kzalloc(sizeof(*heap_fctl), GFP_KERNEL);
+	if (unlikely(!heap_fctl)) {
+		ret =  -ENOMEM;
+		goto fail_alloc;
+	}
+
+	INIT_LIST_HEAD(&heap_fctl->works);
+	init_waitqueue_head(&heap_fctl->threadwq);
+	init_waitqueue_head(&heap_fctl->workwq);
+
+	heap_fctl->work_thread = kthread_run(dma_heap_file_work_thread,
+					     heap_fctl, "heap_fwork_t");
+	if (IS_ERR(heap_fctl->work_thread)) {
+		ret = -ENOMEM;
+		goto fail_thread;
+	}
+
+	heap_fctl->heap_fwork_cachep = KMEM_CACHE(dma_heap_file_work, 0);
+	if (unlikely(!heap_fctl->heap_fwork_cachep)) {
+		ret = -ENOMEM;
+		goto fail_cache;
+	}
+
 	return 0;
+
+fail_cache:
+	kthread_stop(heap_fctl->work_thread);
+fail_thread:
+	kfree(heap_fctl);
+fail_alloc:
+	class_destroy(dma_heap_class);
+fail_class:
+	unregister_chrdev_region(dma_heap_devt, NUM_HEAP_MINORS);
+	return ret;
 }
 subsys_initcall(dma_heap_init);
diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
index 824acbf5a1bc..3becbd08963a 100644
--- a/include/linux/dma-heap.h
+++ b/include/linux/dma-heap.h
@@ -14,6 +14,8 @@ 
 
 struct dma_heap;
 struct dma_heap_file;
+struct dma_heap_file_task;
+struct dma_heap_file;
 
 /**
  * struct dma_heap_ops - ops to operate on a given heap
@@ -69,4 +71,47 @@  const char *dma_heap_get_name(struct dma_heap *heap);
  */
 struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info);
 
+/**
+ * dma_heap_wait_for_file_read - waits for a file read to complete
+ * @heap_ftask:		file read task to wait on.
+ *
+ * Some users need to call this function before destroying the pages, to ensure
+ * that all file work has been completed and avoid use-after-free issues.
+ * This function does not free @heap_ftask; to finish the read task, call
+ * dma_heap_end_file_read().
+ *
+ * Return: 0 on success, -EIO if any file work failed.
+ */
+int dma_heap_wait_for_file_read(struct dma_heap_file_task *heap_ftask);
+
+/**
+ * dma_heap_end_file_read - waits for a file read to complete then destroy it
+ * @heap_ftask:		file read task to finish; it is freed by this call.
+ *
+ * Return: 0 on success, -EIO if any file work failed.
+ */
+int dma_heap_end_file_read(struct dma_heap_file_task *heap_ftask);
+
+/**
+ * dma_heap_declare_file_read - Declare a task to read file when allocate pages.
+ * @heap_file:		target file to read
+ *
+ * Return: NULL if failed, otherwise a pointer to the new read task.
+ */
+struct dma_heap_file_task *
+dma_heap_declare_file_read(struct dma_heap_file *heap_file);
+
+/**
+ * dma_heap_gather_file_page - gather each allocated page.
+ * @heap_ftask:		read task the page is gathered into.
+ * @page:		current allocated page; any order is accepted.
+ *
+ * This function gathers the allocated pages and automatically submits a read
+ * when the gathering reaches the limit. Submitting packages the gathered
+ * pages, prepares the data required for reading the file, and hands the work
+ * to the async read thread.
+ *
+ * Return: 0 on success, negative on failure.
+ */
+int dma_heap_gather_file_page(struct dma_heap_file_task *heap_ftask,
+			      struct page *page);
+/* dma_heap_file_size - returns the file size recorded in @heap_file. */
+size_t dma_heap_file_size(struct dma_heap_file *heap_file);
+
 #endif /* _DMA_HEAPS_H */