@@ -27,6 +27,7 @@
#include <linux/cred.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
+#include <linux/volatile.h>
#include "internal.h"
/*
@@ -254,6 +255,7 @@ void __destroy_inode(struct inode *inode)
if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
posix_acl_release(inode->i_default_acl);
#endif
+ mapping_clear_volatile_ranges(&inode->i_data);
this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);
@@ -360,6 +362,8 @@ void address_space_init_once(struct address_space *mapping)
spin_lock_init(&mapping->private_lock);
INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+ range_tree_init(&mapping->volatile_root);
+
}
EXPORT_SYMBOL(address_space_init_once);
@@ -18,4 +18,9 @@
#define POSIX_FADV_NOREUSE 5 /* Data will be accessed once. */
#endif
+#define POSIX_FADV_VOLATILE 8 /* _can_ toss, but don't toss now */
+#define POSIX_FADV_NONVOLATILE 9 /* Remove VOLATILE flag */
+
+
+
#endif /* FADVISE_H_INCLUDED */
@@ -10,6 +10,7 @@
#include <linux/ioctl.h>
#include <linux/blk_types.h>
#include <linux/types.h>
+#include <linux/rangetree.h>
/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -655,6 +656,7 @@ struct address_space {
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
+ struct range_tree_root volatile_root; /* volatile range list */
} __attribute__((aligned(sizeof(long))));
/*
* On most architectures that alignment is already the case; but
new file mode 100644
@@ -0,0 +1,14 @@
+#ifndef _LINUX_VOLATILE_H
+#define _LINUX_VOLATILE_H
+
+#include <linux/fs.h>
+
+/*
+ * Volatile page-range tracking for an address_space.  start_index and
+ * end_index are page offsets into the mapping; both ends are inclusive.
+ */
+
+/* Mark [start_index, end_index] volatile.  Returns 0, or -ENOMEM. */
+long mapping_range_volatile(struct address_space *mapping,
+			    pgoff_t start_index, pgoff_t end_index);
+
+/*
+ * Mark [start_index, end_index] nonvolatile.  Returns 1 if any affected
+ * volatile range had been purged, 0 if none had, or -ENOMEM.
+ */
+long mapping_range_nonvolatile(struct address_space *mapping,
+			       pgoff_t start_index, pgoff_t end_index);
+
+/*
+ * NOTE(review): no definition of this function is visible in the
+ * mm/volatile.c added alongside this header - confirm it is provided
+ * elsewhere before calling it.
+ */
+long mapping_range_isvolatile(struct address_space *mapping,
+			      pgoff_t start_index, pgoff_t end_index);
+
+/* Delete every volatile range that is still attached to @mapping. */
+void mapping_clear_volatile_ranges(struct address_space *mapping);
+
+#endif /* _LINUX_VOLATILE_H */
@@ -13,7 +13,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o mmu_context.o percpu.o \
- $(mmu-y)
+ volatile.o $(mmu-y)
obj-y += init-mm.o
ifdef CONFIG_NO_BOOTMEM
@@ -17,6 +17,7 @@
#include <linux/fadvise.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
+#include <linux/volatile.h>
#include <asm/unistd.h>
@@ -106,7 +107,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
nrpages = end_index - start_index + 1;
if (!nrpages)
nrpages = ~0UL;
-
+
ret = force_page_cache_readahead(mapping, file,
start_index,
nrpages);
@@ -128,6 +129,19 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
invalidate_mapping_pages(mapping, start_index,
end_index);
break;
+ case POSIX_FADV_VOLATILE:
+ /* First and last PARTIAL page! */
+ start_index = offset >> PAGE_CACHE_SHIFT;
+ end_index = endbyte >> PAGE_CACHE_SHIFT;
+ ret = mapping_range_volatile(mapping, start_index, end_index);
+ break;
+ case POSIX_FADV_NONVOLATILE:
+ /* First and last PARTIAL page! */
+ start_index = offset >> PAGE_CACHE_SHIFT;
+ end_index = endbyte >> PAGE_CACHE_SHIFT;
+ ret = mapping_range_nonvolatile(mapping, start_index,
+ end_index);
+ break;
default:
ret = -EINVAL;
}
new file mode 100644
@@ -0,0 +1,342 @@
+/* mm/volatile.c
+ *
+ * Volatile page range management.
+ * Copyright 2011 Linaro
+ *
+ * Based on mm/ashmem.c
+ * by Robert Love <rlove@google.com>
+ * Copyright (C) 2008 Google, Inc.
+ *
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * The goal behind volatile ranges is to allow applications to interact
+ * with the kernel's cache management infrastructure. In particular an
+ * application can say "this memory contains data that might be useful in
+ * the future, but can be reconstructed if necessary, so if the kernel
+ * needs, it can zap and reclaim this memory without having to swap it out".
+ *
+ * The proposed mechanism - at a high level - is for user-space to be able
+ * to say "This memory is volatile" and then later "this memory is no longer
+ * volatile". If the content of the memory is still available the second
+ * request succeeds. If not, the memory is marked non-volatile and an
+ * error is returned to denote that the contents have been lost.
+ *
+ * Credits to Neil Brown for the above description.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/volatile.h>
+
+/*
+ * Held around every range-tree update and every volatile_lru_list /
+ * lru_count update made by the functions in this file.
+ */
+static DEFINE_MUTEX(volatile_mutex);
+
+/*
+ * One contiguous run of volatile pages belonging to a single
+ * address_space.
+ */
+struct volatile_range {
+ struct list_head lru; /* entry on volatile_lru_list while not purged */
+ struct range_tree_node range_node; /* node in mapping->volatile_root; start/end are inclusive page indices */
+
+ unsigned int purged; /* set to 1 by volatile_shrink() after truncating the range; inherited on merge/split */
+ struct address_space *mapping; /* mapping whose volatile_root holds range_node */
+};
+
+/*
+ * LRU list of volatile page ranges that have not been purged; ranges are
+ * added at the tail by lru_add() and walked by volatile_shrink().
+ */
+static LIST_HEAD(volatile_lru_list);
+
+/* Total number of pages covered by the ranges on volatile_lru_list */
+static u64 lru_count;
+
+
+/* Number of pages covered by @range; start and end are both inclusive. */
+static inline u64 range_size(struct volatile_range *range)
+{
+	const struct range_tree_node *node = &range->range_node;
+
+	return node->end - node->start + 1;
+}
+
+/*
+ * Charge @range's pages to lru_count and queue the range at the tail of
+ * volatile_lru_list.
+ */
+static inline void lru_add(struct volatile_range *range)
+{
+	lru_count += range_size(range);
+	list_add_tail(&range->lru, &volatile_lru_list);
+}
+
+/*
+ * Uncharge @range's pages from lru_count and unlink the range from
+ * volatile_lru_list.
+ */
+static inline void lru_del(struct volatile_range *range)
+{
+	lru_count -= range_size(range);
+	list_del(&range->lru);
+}
+
+/*
+ * A range is treated as being on the volatile LRU for as long as it has
+ * not been purged.  Written as a function rather than a macro so that
+ * the argument is type-checked.  Returns non-zero if @range is on the LRU.
+ */
+static inline int range_on_lru(struct volatile_range *range)
+{
+	return !range->purged;
+}
+
+
+/*
+ * Change @range in place so that it covers [start_index, end_index]
+ * (both inclusive).  If the range is accounted on the LRU, lru_count is
+ * adjusted by the change in size; the range's position on the LRU list
+ * is not touched.
+ */
+static inline void volatile_range_resize(struct volatile_range *range,
+				pgoff_t start_index, pgoff_t end_index)
+{
+	/* u64 to match range_size(); size_t could truncate on 32-bit */
+	u64 pre = range_size(range);
+
+	range->range_node.start = start_index;
+	range->range_node.end = end_index;
+
+	/* unsigned arithmetic, so this is also right if the range grew */
+	if (range_on_lru(range))
+		lru_count -= pre - range_size(range);
+}
+
+/*
+ * Allocate a zeroed volatile_range (GFP_KERNEL) and initialise its
+ * range_node.
+ *
+ * Returns the new range, or NULL if the allocation failed.  The caller
+ * owns the result and must either insert it or kfree() it.
+ */
+static struct volatile_range *vrange_alloc(void)
+{
+	struct volatile_range *new;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	range_tree_node_init(&new->range_node);
+	return new;
+}
+
+/*
+ * Unlink @vrange from the LRU (when it is on it) and from its mapping's
+ * range tree, then free it.
+ */
+static void vrange_del(struct volatile_range *vrange)
+{
+	struct range_tree_root *root = &vrange->mapping->volatile_root;
+
+	if (range_on_lru(vrange))
+		lru_del(vrange);
+
+	range_tree_remove(root, &vrange->range_node);
+	kfree(vrange);
+}
+
+
+
+/*
+ * If a volatile range of @mapping intersects [lo, hi] and has not been
+ * purged, widen [*start, *end] to cover it and delete it.
+ * Must be called with volatile_mutex held.
+ */
+static void vrange_absorb_unpurged(struct address_space *mapping,
+				   u64 lo, u64 hi, u64 *start, u64 *end)
+{
+	struct range_tree_node *node;
+	struct volatile_range *vrange;
+
+	node = range_tree_in_range(&mapping->volatile_root, lo, hi);
+	if (!node)
+		return;
+
+	vrange = container_of(node, struct volatile_range, range_node);
+	if (vrange->purged)
+		return;
+
+	*start = min_t(u64, *start, node->start);
+	*end = max_t(u64, *end, node->end);
+	vrange_del(vrange);
+}
+
+/*
+ * Mark the page range [start_index, end_index] of @mapping as volatile,
+ * allowing its pages to be purged under memory pressure.
+ *
+ * Existing volatile ranges that overlap the request are merged into it;
+ * the merged range counts as purged if any of them was.  Unpurged ranges
+ * that directly adjoin the result are coalesced too, but only while the
+ * result itself is unpurged, so that coalescing never marks an intact
+ * neighbour as purged and drops it from the LRU.
+ *
+ * Returns 0 on success or -ENOMEM if no range could be allocated.
+ */
+long mapping_range_volatile(struct address_space *mapping,
+				pgoff_t start_index, pgoff_t end_index)
+{
+	struct volatile_range *new;
+	struct range_tree_node *node;
+	struct volatile_range *vrange;
+	u64 start = (u64)start_index;
+	u64 end = (u64)end_index;
+	int purged = 0;
+
+	/*
+	 * The GFP_KERNEL allocation is done before volatile_mutex is taken,
+	 * presumably so that reclaim triggered by it cannot re-enter
+	 * volatile_shrink() with the mutex already held.
+	 */
+	new = vrange_alloc();
+	if (!new)
+		return -ENOMEM;
+
+	mutex_lock(&volatile_mutex);
+
+	/* Merge every existing range that overlaps the request */
+	node = range_tree_in_range(&mapping->volatile_root, start, end);
+	while (node) {
+		/*
+		 * Request lies strictly inside an existing range: nothing
+		 * to do.  A range that merely shares an endpoint is rebuilt
+		 * below, which also re-queues it at the LRU tail.
+		 */
+		if (node->start < start && node->end > end) {
+			/* don't need the allocated value */
+			kfree(new);
+			goto out;
+		}
+
+		vrange = container_of(node, struct volatile_range, range_node);
+
+		/* grow the request to cover the range being absorbed */
+		start = min_t(u64, start, node->start);
+		end = max_t(u64, end, node->end);
+		purged |= vrange->purged;
+
+		/* fetch the successor before vrange is freed */
+		node = range_tree_next_in_range(&vrange->range_node,
+						start, end);
+		vrange_del(vrange);
+	}
+
+	if (!purged) {
+		/* left neighbour; start - 1 would wrap when start is 0 */
+		if (start > 0)
+			vrange_absorb_unpurged(mapping, start - 1, start,
+					       &start, &end);
+
+		/* right neighbour; skip if end + 1 would wrap to 0 */
+		if (end + 1 != 0)
+			vrange_absorb_unpurged(mapping, end, end + 1,
+					       &start, &end);
+	}
+
+	new->mapping = mapping;
+	new->range_node.start = start;
+	new->range_node.end = end;
+	new->purged = purged;
+
+	range_tree_add(&mapping->volatile_root, &new->range_node);
+	if (range_on_lru(new))
+		lru_add(new);
+
+out:
+	mutex_unlock(&volatile_mutex);
+
+	return 0;
+}
+
+/*
+ * Mark the page range [start_index, end_index] of @mapping as nonvolatile
+ * by removing it from every volatile range it intersects: ranges lying
+ * inside it are deleted, ranges overlapping one end are trimmed, and a
+ * range that strictly contains it is split in two.
+ *
+ * Returns 1 if any intersecting volatile range had been purged, 0 if
+ * none had, or -ENOMEM if the spare range could not be allocated.
+ */
+long mapping_range_nonvolatile(struct address_space *mapping,
+ pgoff_t start_index, pgoff_t end_index)
+{
+ struct volatile_range *new;
+ struct range_tree_node *node;
+ int ret = 0;
+ u64 start, end;
+ int used_new = 0;
+
+ start = (u64)start_index;
+ end = (u64)end_index;
+
+ /*
+  * create new node up front, before volatile_mutex is taken; it is
+  * only consumed by the split case below and freed otherwise
+  */
+ new = vrange_alloc();
+ if (!new)
+ return -ENOMEM;
+
+ mutex_lock(&volatile_mutex);
+ node = range_tree_in_range(&mapping->volatile_root, start, end);
+ while (node) {
+ struct volatile_range *vrange;
+ vrange = container_of(node, struct volatile_range, range_node);
+
+ ret |= vrange->purged;
+
+ if (start <= node->start && end >= node->end) {
+ /*
+  * delete: volatile range is totally within range; the next
+  * node is fetched first because vrange_del() frees vrange
+  */
+ node = range_tree_next_in_range(&vrange->range_node,
+ start, end);
+ vrange_del(vrange);
+ } else if (node->start >= start) {
+ /*
+  * resize: volatile range starts inside range and runs past
+  * its end; keep only [end+1, node->end]
+  */
+ volatile_range_resize(vrange, end+1, node->end);
+ node = range_tree_next_in_range(&vrange->range_node,
+ start, end);
+
+ } else if (node->end <= end) {
+ /*
+  * resize: volatile range starts before range and ends inside
+  * it; keep only [node->start, start-1]
+  */
+ volatile_range_resize(vrange, node->start, start-1);
+ node = range_tree_next_in_range(&vrange->range_node,
+ start, end);
+ } else {
+ /*
+  * split: range is totally within a volatile range; new takes
+  * the right-hand piece [end+1, node->end] and vrange is
+  * trimmed to the left-hand piece [node->start, start-1]
+  */
+ used_new = 1; /* we only do this once */
+ new->mapping = mapping;
+ new->range_node.start = end + 1;
+ new->range_node.end = node->end;
+ new->purged = vrange->purged;
+ range_tree_add(&mapping->volatile_root,
+ &new->range_node);
+ if (range_on_lru(new))
+ lru_add(new);
+ volatile_range_resize(vrange, node->start, start-1);
+
+ break;
+ }
+ }
+ mutex_unlock(&volatile_mutex);
+
+ if (!used_new)
+ kfree(new);
+
+ return ret;
+}
+
+
+/*
+ * Delete every volatile range still attached to @mapping, emptying its
+ * range tree.  Takes volatile_mutex for the duration.
+ */
+void mapping_clear_volatile_ranges(struct address_space *mapping)
+{
+	struct range_tree_root *root = &mapping->volatile_root;
+	struct range_tree_node *node;
+
+	mutex_lock(&volatile_mutex);
+	while (!range_tree_empty(root)) {
+		/* keep removing whatever node is currently at the root */
+		node = range_tree_root_node(root);
+		vrange_del(container_of(node, struct volatile_range,
+					range_node));
+	}
+	mutex_unlock(&volatile_mutex);
+}
+
+/*
+ * Shrinker callback: purges volatile ranges when under memory pressure.
+ *
+ * With sc->nr_to_scan == 0 it only reports lru_count.  Otherwise it
+ * returns -1 if the allocation context lacks __GFP_FS, or else walks
+ * volatile_lru_list truncating each range until at least nr_to_scan
+ * pages have been purged, and returns the lru_count that remains.
+ */
+static int volatile_shrink(struct shrinker *ignored, struct shrink_control *sc)
+{
+	struct volatile_range *vrange, *tmp;
+	s64 remaining = sc->nr_to_scan;
+
+	/* query only: report how many pages could be purged */
+	if (!remaining)
+		return lru_count;
+
+	if (!(sc->gfp_mask & __GFP_FS))
+		return -1;
+
+	mutex_lock(&volatile_mutex);
+	list_for_each_entry_safe(vrange, tmp, &volatile_lru_list, lru) {
+		struct inode *inode = vrange->mapping->host;
+		/* byte offsets of the first and last byte of the range */
+		loff_t first = vrange->range_node.start << PAGE_CACHE_SHIFT;
+		loff_t last = ((vrange->range_node.end + 1)
+					<< PAGE_CACHE_SHIFT) - 1;
+
+		/*
+		 * XXX - calling vmtruncate_range from a shrinker causes
+		 * lockdep warnings. Revisit this!
+		 */
+		if (vmtruncate_range(inode, first, last) == 0) {
+			/* purged: off the LRU, but it stays in the tree */
+			lru_del(vrange);
+			vrange->purged = 1;
+			remaining -= range_size(vrange);
+		}
+
+		if (remaining <= 0)
+			break;
+	}
+	mutex_unlock(&volatile_mutex);
+
+	return lru_count;
+}
+
+/* Shrinker through which memory reclaim calls volatile_shrink(). */
+static struct shrinker volatile_shrinker = {
+	.seeks	= DEFAULT_SEEKS,
+	.shrink	= volatile_shrink,
+};
+
+/* Register the volatile-range shrinker; run as an arch_initcall. */
+static int __init volatile_init(void)
+{
+	register_shrinker(&volatile_shrinker);
+
+	return 0;
+}
+
+arch_initcall(volatile_init);