
[1/2] First pass at vma based anonymous volatile ranges

Message ID 1393363729-8459-2-git-send-email-john.stultz@linaro.org
State New

Commit Message

John Stultz Feb. 25, 2014, 9:28 p.m. UTC
This patch adds a vrange() syscall, which allows vmas to be set
as volatile or non-volatile. If volatile, pages in the vma can be
discarded under memory pressure.

When setting memory non-volatile, we check the pages to see if
they have been purged and return a purged flag accordingly.
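
For illustration, a rough userspace sketch of how the new syscall
might be used (not part of this patch; the __NR_vrange value matches
the x86_64 table entry added below, and the vrange() wrapper plus the
minimal error handling are just for this example):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define __NR_vrange		316	/* x86_64 entry added below */
#define VRANGE_NONVOLATILE	0	/* from include/linux/vrange.h */
#define VRANGE_VOLATILE		1

/* Thin wrapper; returns bytes marked on success, -1 on error. */
static long vrange(void *start, size_t len, int mode, int *purged)
{
	return syscall(__NR_vrange, (unsigned long)start, len, mode, purged);
}

int main(void)
{
	size_t len = 16 * 4096;
	int purged = 0;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0xaa, len);		/* regenerable cache contents */

	/* Mark the range volatile; the kernel may now discard these pages. */
	vrange(buf, len, VRANGE_VOLATILE, &purged);

	/* ... later, before touching the data again ... */
	vrange(buf, len, VRANGE_NONVOLATILE, &purged);
	if (purged)
		printf("contents were purged; regenerate them\n");

	munmap(buf, len);
	return 0;
}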

This does not yet have the SIGBUS semantics in place.

This patch borrows heavily from Minchan's vrange patchset, so
credit to him for his original work.

Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 arch/x86/syscalls/syscall_64.tbl |   1 +
 include/linux/mm.h               |   1 +
 include/linux/vrange.h           |  12 ++
 mm/Makefile                      |   2 +-
 mm/internal.h                    |   2 -
 mm/rmap.c                        |   5 +
 mm/vmscan.c                      |  12 ++
 mm/vrange.c                      | 348 +++++++++++++++++++++++++++++++++++++++
 8 files changed, 380 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/vrange.h
 create mode 100644 mm/vrange.c

Patch

diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index a12bddc..7ae3940 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -322,6 +322,7 @@ 
 313	common	finit_module		sys_finit_module
 314	common	sched_setattr		sys_sched_setattr
 315	common	sched_getattr		sys_sched_getattr
+316	common	vrange			sys_vrange
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f28f46e..a7ea310 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -117,6 +117,7 @@  extern unsigned int kobjsize(const void *objp);
 #define VM_IO           0x00004000	/* Memory mapped I/O or similar */
 
 					/* Used by sys_madvise() */
+#define VM_VOLATILE	0x00001000	/* VMA is volatile */
 #define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
 #define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */
 
diff --git a/include/linux/vrange.h b/include/linux/vrange.h
new file mode 100644
index 0000000..696e001
--- /dev/null
+++ b/include/linux/vrange.h
@@ -0,0 +1,12 @@ 
+#ifndef _LINUX_VRANGE_H
+#define _LINUX_VRANGE_H
+
+#include <linux/mm.h>
+
+
+#define VRANGE_NONVOLATILE 0
+#define VRANGE_VOLATILE 1
+
+extern int discard_vpage(struct page *page);
+
+#endif /* _LINUX_VRANGE_H */
diff --git a/mm/Makefile b/mm/Makefile
index 310c90a..20229e2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,7 @@  obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o balloon_compaction.o \
+			   compaction.o balloon_compaction.o vrange.o \
 			   interval_tree.o list_lru.o $(mmu-y)
 
 obj-y += init-mm.o
diff --git a/mm/internal.h b/mm/internal.h
index 29e1e76..ea66bf9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -225,10 +225,8 @@  static inline void mlock_migrate_page(struct page *newpage, struct page *page)
 
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern unsigned long vma_address(struct page *page,
 				 struct vm_area_struct *vma);
-#endif
 #else /* !CONFIG_MMU */
 static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
 {
diff --git a/mm/rmap.c b/mm/rmap.c
index d9d4231..2b6f079 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -728,6 +728,11 @@  int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 				referenced++;
 		}
 		pte_unmap_unlock(pte, ptl);
+		if (vma->vm_flags & VM_VOLATILE) {
+			pra->mapcount = 0;
+			pra->vm_flags |= VM_VOLATILE;
+			return SWAP_FAIL;
+		}
 	}
 
 	if (referenced) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a9c74b4..c5c0ee0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -43,6 +43,7 @@ 
 #include <linux/sysctl.h>
 #include <linux/oom.h>
 #include <linux/prefetch.h>
+#include <linux/vrange.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -683,6 +684,7 @@  enum page_references {
 	PAGEREF_RECLAIM,
 	PAGEREF_RECLAIM_CLEAN,
 	PAGEREF_KEEP,
+	PAGEREF_DISCARD,
 	PAGEREF_ACTIVATE,
 };
 
@@ -703,6 +705,13 @@  static enum page_references page_check_references(struct page *page,
 	if (vm_flags & VM_LOCKED)
 		return PAGEREF_RECLAIM;
 
+	/*
+	 * If a volatile page is reached at the LRU's tail, we discard
+	 * the page without considering whether to keep it.
+	 */
+	if (vm_flags & VM_VOLATILE)
+		return PAGEREF_DISCARD;
+
 	if (referenced_ptes) {
 		if (PageSwapBacked(page))
 			return PAGEREF_ACTIVATE;
@@ -930,6 +939,9 @@  static unsigned long shrink_page_list(struct list_head *page_list,
 		switch (references) {
 		case PAGEREF_ACTIVATE:
 			goto activate_locked;
+		case PAGEREF_DISCARD:
+			if (may_enter_fs && discard_vpage(page) == 0)
+				goto free_it;
 		case PAGEREF_KEEP:
 			goto keep_locked;
 		case PAGEREF_RECLAIM:
diff --git a/mm/vrange.c b/mm/vrange.c
new file mode 100644
index 0000000..b80f611
--- /dev/null
+++ b/mm/vrange.c
@@ -0,0 +1,348 @@ 
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/vrange.h>
+#include <linux/mm_inline.h>
+#include <linux/migrate.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mm_inline.h>
+#include <linux/migrate.h>
+#include <linux/pagevec.h>
+#include <linux/shmem_fs.h>
+#include "internal.h"
+
+struct vrange_walker {
+	struct vm_area_struct *vma;
+	int pages_purged;
+};
+
+
+
+static unsigned long vrange_check_purged(struct mm_struct *mm,
+					 struct vm_area_struct *vma,
+					 unsigned long start,
+					 unsigned long end);
+
+
+
+
+static ssize_t do_vrange(struct mm_struct *mm, unsigned long start,
+				unsigned long end, int mode, int *purged)
+{
+	struct vm_area_struct *vma, *prev;
+	unsigned long orig_start = start;
+	ssize_t count = 0, ret = 0;
+	int lpurged = 0;
+
+	down_read(&mm->mmap_sem);
+
+	vma = find_vma_prev(mm, start, &prev);
+	if (vma && start > vma->vm_start)
+		prev = vma;
+
+	for (;;) {
+		unsigned long new_flags;
+		pgoff_t pgoff;
+		unsigned long tmp;
+
+		if (!vma)
+			goto out;
+
+		if (vma->vm_flags & (VM_SPECIAL|VM_LOCKED|VM_MIXEDMAP|
+					VM_HUGETLB))
+			goto out;
+
+		/* We don't support volatility on files for now */
+		if (vma->vm_file) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		new_flags = vma->vm_flags;
+
+		if (start < vma->vm_start) {
+			start = vma->vm_start;
+			if (start >= end)
+				goto out;
+		}
+		tmp = vma->vm_end;
+		if (end < tmp)
+			tmp = end;
+
+		switch(mode) {
+			case VRANGE_VOLATILE:
+				new_flags |= VM_VOLATILE;
+			break;
+			case VRANGE_NONVOLATILE:
+				new_flags &= ~VM_VOLATILE;
+				lpurged |= vrange_check_purged(mm, vma,
+								vma->vm_start,
+								vma->vm_end);
+		}
+
+		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+		prev = vma_merge(mm, prev, start, tmp, new_flags,
+					vma->anon_vma, vma->vm_file, pgoff,
+					vma_policy(vma));
+		if (prev) {
+			goto success;
+		}
+
+
+		if (start != vma->vm_start) {
+			ret = split_vma(mm, vma, start, 1);
+			if (ret)
+				goto out;
+		}
+
+		if (tmp != vma->vm_end) {
+			ret = split_vma(mm, vma, tmp, 0);
+			if (ret)
+				goto out;
+		}
+
+		prev = vma;
+success:
+		vma->vm_flags = new_flags;
+		*purged = lpurged;
+
+		/* update count to distance covered so far */
+		count = tmp - orig_start;
+
+		if (prev && start < prev->vm_end)
+			start = prev->vm_end;
+		if (start >= end)
+			goto out;
+		if (prev)
+			vma = prev->vm_next;
+		else	/* madvise_remove dropped mmap_sem */
+			vma = find_vma(mm, start);
+	}
+out:
+	up_read(&mm->mmap_sem);
+
+	/* report bytes successfully marked, even if we're exiting on error */
+	if (count)
+		return count;
+
+	return ret;
+}
+
+SYSCALL_DEFINE4(vrange, unsigned long, start,
+		size_t, len, int, mode, int __user *, purged)
+{
+	unsigned long end;
+	struct mm_struct *mm = current->mm;
+	ssize_t ret = -EINVAL;
+	int p = 0;
+
+	if (start & ~PAGE_MASK)
+		goto out;
+
+	len &= PAGE_MASK;
+	if (!len)
+		goto out;
+
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	if (start >= TASK_SIZE)
+		goto out;
+
+	if (purged) {
+		/* Test pointer is valid before making any changes */
+		if (put_user(p, purged))
+			return -EFAULT;
+	}
+
+	ret = do_vrange(mm, start, end, mode, &p);
+
+	if (purged) {
+		if (put_user(p, purged)) {
+			/*
+			 * This would be bad, since we've modified volatility
+			 * and the change in purged state would be lost.
+			 */
+			BUG();
+		}
+	}
+
+out:
+	return ret;
+}
+
+static void try_to_discard_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t *pte;
+	pte_t pteval;
+	spinlock_t *ptl;
+	unsigned long addr;
+
+	VM_BUG_ON(!PageLocked(page));
+
+	addr = vma_address(page, vma);
+	pte = page_check_address(page, mm, addr, &ptl, 0);
+	if (!pte)
+		return;
+
+	BUG_ON(vma->vm_flags & (VM_SPECIAL|VM_LOCKED|VM_MIXEDMAP|VM_HUGETLB));
+
+	flush_cache_page(vma, addr, page_to_pfn(page));
+	pteval = ptep_clear_flush(vma, addr, pte);
+
+	update_hiwater_rss(mm);
+	if (PageAnon(page))
+		dec_mm_counter(mm, MM_ANONPAGES);
+	else
+		dec_mm_counter(mm, MM_FILEPAGES);
+
+	page_remove_rmap(page);
+	page_cache_release(page);
+
+//	set_pte_at(mm, addr, pte, swp_entry_to_pte(make_vrange_entry()));
+	pte_unmap_unlock(pte, ptl);
+	mmu_notifier_invalidate_page(mm, addr);
+
+}
+
+
+static int try_to_discard_anon_vpage(struct page *page)
+{
+	struct anon_vma *anon_vma;
+	struct anon_vma_chain *avc;
+	pgoff_t pgoff;
+
+	anon_vma = page_lock_anon_vma_read(page);
+	if (!anon_vma)
+		return -1;
+
+	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	/*
+	 * While iterating this loop, some processes could see a page as
+	 * purged while others see it as not purged, because there is no
+	 * global lock protecting the vrange system call between parent
+	 * and child during this loop. That is not a problem, because the
+	 * page is not a *SHARED* page but a *COW* page, so parent and
+	 * child may see different data at any time. The worst case of
+	 * this race is that a page was purged but could not be discarded,
+	 * causing an unnecessary page fault, which is not severe.
+	 */
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+		struct vm_area_struct *vma = avc->vma;
+
+		if (!(vma->vm_flags & VM_VOLATILE))
+			continue;
+		try_to_discard_one(page, vma);
+	}
+	page_unlock_anon_vma_read(anon_vma);
+	return 0;
+}
+
+
+static int try_to_discard_vpage(struct page *page)
+{
+	if (PageAnon(page))
+		return try_to_discard_anon_vpage(page);
+	return -1;
+}
+
+
+int discard_vpage(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(PageLRU(page));
+
+	if (!try_to_discard_vpage(page)) {
+		if (PageSwapCache(page))
+			try_to_free_swap(page);
+
+		if (page_freeze_refs(page, 1)) {
+			unlock_page(page);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+static void vrange_pte_entry(pte_t pteval, unsigned long address,
+				unsigned ptent_size, struct mm_walk *walk)
+{
+	struct page *page;
+	struct vrange_walker *vw = walk->private;
+	struct vm_area_struct *vma = vw->vma;
+
+	if (pte_none(pteval)) {
+		vw->pages_purged = 1;
+		return;
+	}
+
+	if (!pte_present(pteval)) {
+		return;
+	}
+
+	page = vm_normal_page(vma, address, pteval);
+	if (unlikely(!page))
+		return;
+
+	if (!PageLRU(page) || PageLocked(page))
+		return;
+
+	BUG_ON(PageCompound(page));
+
+
+	VM_BUG_ON(page_is_file_cache(page));
+}
+
+static int vrange_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+				struct mm_walk *walk)
+{
+	struct vrange_walker *vw = walk->private;
+	struct vm_area_struct *uninitialized_var(vma);
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	vma = vw->vma;
+	split_huge_page_pmd(vma, addr, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE)
+		vrange_pte_entry(*pte, addr, PAGE_SIZE, walk);
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+
+	return 0;
+}
+
+static unsigned long vrange_check_purged(struct mm_struct *mm,
+					 struct vm_area_struct *vma,
+					 unsigned long start,
+					 unsigned long end)
+{
+	struct vrange_walker vw;
+	struct mm_walk vrange_walk = {
+		.pmd_entry = vrange_pte_range,
+		.mm = vma->vm_mm,
+		.private = &vw,
+	};
+	vw.pages_purged = 0;
+	vw.vma = vma;
+
+	walk_page_range(start, end, &vrange_walk);
+
+	return vw.pages_purged;
+
+}
+