diff mbox

[1/2] Add mempressure cgroup

Message ID 1357288152-23625-1-git-send-email-anton.vorontsov@linaro.org
State New
Headers show

Commit Message

Anton Vorontsov Jan. 4, 2013, 8:29 a.m. UTC
This commit implements David Rientjes' idea of mempressure cgroup.

The main characteristics are the same as what I've tried to add to vmevent
API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
pressure index calculation. But we don't expose the index to the userland.
Instead, there are three levels of the pressure:

 o low (just reclaiming, e.g. caches are draining);
 o medium (allocation cost becomes high, e.g. swapping);
 o oom (about to oom very soon).

The rationale behind exposing levels and not the raw pressure index
described here: http://lkml.org/lkml/2012/11/16/675

For a task it is possible to be in both cpusets, memcg and mempressure
cgroups, so by rearranging the tasks it is possible to watch a specific
pressure (i.e. caused by cpuset and/or memcg).

Note that while this adds the cgroups support, the code is well separated
and eventually we might add a lightweight, non-cgroups API, i.e. vmevent.
But this is another story.

Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
---
 Documentation/cgroups/mempressure.txt |  50 ++++++
 include/linux/cgroup_subsys.h         |   6 +
 include/linux/vmstat.h                |  11 ++
 init/Kconfig                          |  12 ++
 mm/Makefile                           |   1 +
 mm/mempressure.c                      | 330 ++++++++++++++++++++++++++++++++++
 mm/vmscan.c                           |   4 +
 7 files changed, 414 insertions(+)
 create mode 100644 Documentation/cgroups/mempressure.txt
 create mode 100644 mm/mempressure.c

Comments

KAMEZAWA Hiroyuki Jan. 7, 2013, 8:51 a.m. UTC | #1
(2013/01/04 17:29), Anton Vorontsov wrote:
> This commit implements David Rientjes' idea of mempressure cgroup.
> 
> The main characteristics are the same to what I've tried to add to vmevent
> API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
> pressure index calculation. But we don't expose the index to the userland.
> Instead, there are three levels of the pressure:
> 
>   o low (just reclaiming, e.g. caches are draining);
>   o medium (allocation cost becomes high, e.g. swapping);
>   o oom (about to oom very soon).
> 
> The rationale behind exposing levels and not the raw pressure index
> described here: http://lkml.org/lkml/2012/11/16/675
> 
> For a task it is possible to be in both cpusets, memcg and mempressure
> cgroups, so by rearranging the tasks it is possible to watch a specific
> pressure (i.e. caused by cpuset and/or memcg).
> 
> Note that while this adds the cgroups support, the code is well separated
> and eventually we might add a lightweight, non-cgroups API, i.e. vmevent.
> But this is another story.
> 
> Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>

I'm just curious..
 
> ---
>   Documentation/cgroups/mempressure.txt |  50 ++++++
>   include/linux/cgroup_subsys.h         |   6 +
>   include/linux/vmstat.h                |  11 ++
>   init/Kconfig                          |  12 ++
>   mm/Makefile                           |   1 +
>   mm/mempressure.c                      | 330 ++++++++++++++++++++++++++++++++++
>   mm/vmscan.c                           |   4 +
>   7 files changed, 414 insertions(+)
>   create mode 100644 Documentation/cgroups/mempressure.txt
>   create mode 100644 mm/mempressure.c
> 
> diff --git a/Documentation/cgroups/mempressure.txt b/Documentation/cgroups/mempressure.txt
> new file mode 100644
> index 0000000..dbc0aca
> --- /dev/null
> +++ b/Documentation/cgroups/mempressure.txt
> @@ -0,0 +1,50 @@
> +  Memory pressure cgroup
> +~~~~~~~~~~~~~~~~~~~~~~~~~~
> +  Before using the mempressure cgroup, make sure you have it mounted:
> +
> +   # cd /sys/fs/cgroup/
> +   # mkdir mempressure
> +   # mount -t cgroup cgroup ./mempressure -o mempressure
> +
> +  It is possible to combine cgroups, for example you can mount memory
> +  (memcg) and mempressure cgroups together:
> +
> +   # mount -t cgroup cgroup ./mempressure -o memory,mempressure
> +
> +  That way the reported pressure will honour memory cgroup limits. The
> +  same goes for cpusets.
> +
> +  After the hierarchy is mounted, you can use the following API:
> +
> +  /sys/fs/cgroup/.../mempressure.level
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +  To maintain the interactivity/memory allocation cost, one can use the
> +  pressure level notifications, and the levels are defined like this:
> +
> +  The "low" level means that the system is reclaiming memory for new
> +  allocations. Monitoring reclaiming activity might be useful for
> +  maintaining overall system's cache level. Upon notification, the program
> +  (typically "Activity Manager") might analyze vmstat and act in advance
> +  (i.e. prematurely shutdown unimportant services).
> +
> +  The "medium" level means that the system is experiencing medium memory
> +  pressure, there is some mild swapping activity. Upon this event
> +  applications may decide to free any resources that can be easily
> +  reconstructed or re-read from a disk.
> +
> +  The "oom" level means that the system is actively thrashing, it is about
> +  to run out of memory (OOM) or even the in-kernel OOM killer is on its way to
> +  trigger. Applications should do whatever they can to help the system.
> +
> +  Event control:
> +    Is used to setup an eventfd with a level threshold. The argument to
> +    the event control specifies the level threshold.
> +  Read:
> +    Reads memory pressure levels: low, medium or oom.
> +  Write:
> +    Not implemented.
> +  Test:
> +    To set up a notification:
> +
> +    # cgroup_event_listener ./mempressure.level low
> +    ("low", "medium", "oom" are permitted.)
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index f204a7a..b9802e2 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -37,6 +37,12 @@ SUBSYS(mem_cgroup)
>   
>   /* */
>   
> +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_MEMPRESSURE)
> +SUBSYS(mpc_cgroup)
> +#endif
> +
> +/* */
> +
>   #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE)
>   SUBSYS(devices)
>   #endif
> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> index a13291f..c1a66c7 100644
> --- a/include/linux/vmstat.h
> +++ b/include/linux/vmstat.h
> @@ -10,6 +10,17 @@
>   
>   extern int sysctl_stat_interval;
>   
> +struct mem_cgroup;
> +#ifdef CONFIG_CGROUP_MEMPRESSURE
> +extern void vmpressure(struct mem_cgroup *memcg,
> +		       ulong scanned, ulong reclaimed);
> +extern void vmpressure_prio(struct mem_cgroup *memcg, int prio);
> +#else
> +static inline void vmpressure(struct mem_cgroup *memcg,
> +			      ulong scanned, ulong reclaimed) {}
> +static inline void vmpressure_prio(struct mem_cgroup *memcg, int prio) {}
> +#endif
> +
>   #ifdef CONFIG_VM_EVENT_COUNTERS
>   /*
>    * Light weight per cpu counter implementation.
> diff --git a/init/Kconfig b/init/Kconfig
> index 7d30240..d526249 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -891,6 +891,18 @@ config MEMCG_KMEM
>   	  the kmem extension can use it to guarantee that no group of processes
>   	  will ever exhaust kernel resources alone.
>   
> +config CGROUP_MEMPRESSURE
> +	bool "Memory pressure monitor for Control Groups"
> +	help
> +	  The memory pressure monitor cgroup provides a facility for
> +	  userland programs so that they could easily assist the kernel
> +	  with the memory management. So far the API provides simple,
> +	  levels-based memory pressure notifications.
> +
> +	  For more information see Documentation/cgroups/mempressure.txt
> +
> +	  If unsure, say N.
> +
>   config CGROUP_HUGETLB
>   	bool "HugeTLB Resource Controller for Control Groups"
>   	depends on RESOURCE_COUNTERS && HUGETLB_PAGE && EXPERIMENTAL
> diff --git a/mm/Makefile b/mm/Makefile
> index 3a46287..e69bbda 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -51,6 +51,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>   obj-$(CONFIG_QUICKLIST) += quicklist.o
>   obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
>   obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
> +obj-$(CONFIG_CGROUP_MEMPRESSURE) += mempressure.o
>   obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
>   obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
>   obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
> diff --git a/mm/mempressure.c b/mm/mempressure.c
> new file mode 100644
> index 0000000..ea312bb
> --- /dev/null
> +++ b/mm/mempressure.c
> @@ -0,0 +1,330 @@
> +/*
> + * Linux VM pressure
> + *
> + * Copyright 2012 Linaro Ltd.
> + *		  Anton Vorontsov <anton.vorontsov@linaro.org>
> + *
> + * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
> + * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published
> + * by the Free Software Foundation.
> + */
> +
> +#include <linux/cgroup.h>
> +#include <linux/fs.h>
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/vmstat.h>
> +#include <linux/eventfd.h>
> +#include <linux/swap.h>
> +#include <linux/printk.h>
> +
> +static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r);
> +
> +/*
> + * Generic VM Pressure routines (no cgroups or any other API details)
> + */
> +
> +/*
> + * The window size is the number of scanned pages before we try to analyze
> + * the scanned/reclaimed ratio (or difference).
> + *
> + * It is used as a rate-limit tunable for the "low" level notification,
> + * and for averaging medium/oom levels. Using small window sizes can cause
> + * lot of false positives, but too big window size will delay the
> + * notifications.
> + */
> +static const uint vmpressure_win = SWAP_CLUSTER_MAX * 16;
> +static const uint vmpressure_level_med = 60;
> +static const uint vmpressure_level_oom = 99;
> +static const uint vmpressure_level_oom_prio = 4;
> +

Hmm... isn't this window size too small ?
If vmscan cannot find a reclaimable page while scanning 2M of pages in a zone,
oom notify will be returned. Right ?

Thanks,
-Kame
Anton Vorontsov Jan. 8, 2013, 7:29 a.m. UTC | #2
On Mon, Jan 07, 2013 at 05:51:46PM +0900, Kamezawa Hiroyuki wrote:
[...]
> I'm just curious..

Thanks for taking a look! :)

[...]
> > +/*
> > + * The window size is the number of scanned pages before we try to analyze
> > + * the scanned/reclaimed ratio (or difference).
> > + *
> > + * It is used as a rate-limit tunable for the "low" level notification,
> > + * and for averaging medium/oom levels. Using small window sizes can cause
> > + * lot of false positives, but too big window size will delay the
> > + * notifications.
> > + */
> > +static const uint vmpressure_win = SWAP_CLUSTER_MAX * 16;
> > +static const uint vmpressure_level_med = 60;
> > +static const uint vmpressure_level_oom = 99;
> > +static const uint vmpressure_level_oom_prio = 4;
> > +
> 
> Hmm... isn't this window size too small ?
> If vmscan cannot find a reclaimable page while scanning 2M of pages in a zone,
> oom notify will be returned. Right ?

Yup, you are right, if we were not able to find anything within the window
size (which is 2M, but see below), then it is effectively the "OOM level".
The thing is, the vmpressure reports... the pressure. :) Or, the
allocation cost, and if the cost becomes high, it is no good.

The 2M is, of course, not ideal. And the "ideal" depends on many factors,
alike to vmstat. And, actually I dream about deriving the window size from
zone->stat_threshold, which would make the window automatically adjustable
for different "machine sizes" (as we do in calculate_normal_threshold(),
in vmstat.c).

But again, this is all "implementation details"; tunable stuff that we can
either adjust ourselves as needed, or try to be smart, i.e. apply some
heuristics, again, as in vmstat.

Thanks,
Anton
leonid.moiseichuk@nokia.com Jan. 8, 2013, 7:57 a.m. UTC | #3
-----Original Message-----
From: ext Anton Vorontsov [mailto:anton.vorontsov@linaro.org] 

Sent: 08 January, 2013 08:30
...
> > +static const uint vmpressure_level_med = 60;

> > +static const uint vmpressure_level_oom = 99;

> > +static const uint vmpressure_level_oom_prio = 4;

> > +

..
Seems vmpressure_level_oom = 99 is quite high if I understand it as a global. If I am not wrong, in old versions of the kernel the kernel-only memory border was stated as 1/32 of available memory, meaning no allocation for user-space once the amount of free memory reached 1/32. So, decreasing this parameter to 95 or 90 will allow the notification to be propagated to user-space and handled.

Best wishes,
Leonid
KAMEZAWA Hiroyuki Jan. 8, 2013, 8:24 a.m. UTC | #4
(2013/01/08 16:29), Anton Vorontsov wrote:
> On Mon, Jan 07, 2013 at 05:51:46PM +0900, Kamezawa Hiroyuki wrote:
> [...]
>> I'm just curious..
>
> Thanks for taking a look! :)
>
> [...]
>>> +/*
>>> + * The window size is the number of scanned pages before we try to analyze
>>> + * the scanned/reclaimed ratio (or difference).
>>> + *
>>> + * It is used as a rate-limit tunable for the "low" level notification,
>>> + * and for averaging medium/oom levels. Using small window sizes can cause
>>> + * lot of false positives, but too big window size will delay the
>>> + * notifications.
>>> + */
>>> +static const uint vmpressure_win = SWAP_CLUSTER_MAX * 16;
>>> +static const uint vmpressure_level_med = 60;
>>> +static const uint vmpressure_level_oom = 99;
>>> +static const uint vmpressure_level_oom_prio = 4;
>>> +
>>
>> Hmm... isn't this window size too small ?
>> If vmscan cannot find a reclaimable page while scanning 2M of pages in a zone,
>> oom notify will be returned. Right ?
>
> Yup, you are right, if we were not able to find anything within the window
> size (which is 2M, but see below), then it is effectively the "OOM level".
> The thing is, the vmpressure reports... the pressure. :) Or, the
> allocation cost, and if the cost becomes high, it is no good.
>
> The 2M is, of course, not ideal. And the "ideal" depends on many factors,
> alike to vmstat. And, actually I dream about deriving the window size from
> zone->stat_threshold, which would make the window automatically adjustable
> for different "machine sizes" (as we do in calculate_normal_threshold(),
> in vmstat.c).
>
> But again, this is all "implementation details"; tunable stuff that we can
> either adjust ourselves as needed, or try to be smart, i.e. apply some
> heuristics, again, as in vmstat.
>

Hmm, I like automatic adjustment for things like this (but may be need to be tunable by
user). My concern is, for example, that if a qemu-kvm with pci-passthrough running on
a node using the most of memory on it, the interface will say "Hey it's near to OOM"
to users. We may need a complicated heuristics ;)

Anyway, your approach seems interesting to me but it seems peaky to usual users.
Users should know what they should check (vmstat, zoneinfo, malloc latency ??) when they
get notify before rising real alarm. (not explained in the doc.)
For example, if the user takes care of usage of swap, he should check it.

I'm glad if you explain in Doc that this interface just makes a hint and notify status
of _recent_ vmscans of some amount of window. That means latency of recent memory allocations.
Users should confirm the real status and make the final judge by themselves.
The point is that this notify is important because it's quick and related to ongoing memory
allocation latency. But kernel is not sure there are long-standing heavy vm pressure.

I'm sorry if I misunderstand the concept.

Thank you,
-Kame
Minchan Kim Jan. 8, 2013, 8:49 a.m. UTC | #5
Hi Anton,

On Fri, Jan 04, 2013 at 12:29:11AM -0800, Anton Vorontsov wrote:
> This commit implements David Rientjes' idea of mempressure cgroup.
> 
> The main characteristics are the same to what I've tried to add to vmevent
> API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
> pressure index calculation. But we don't expose the index to the userland.
> Instead, there are three levels of the pressure:
> 
>  o low (just reclaiming, e.g. caches are draining);
>  o medium (allocation cost becomes high, e.g. swapping);
>  o oom (about to oom very soon).
> 
> The rationale behind exposing levels and not the raw pressure index
> described here: http://lkml.org/lkml/2012/11/16/675
> 
> For a task it is possible to be in both cpusets, memcg and mempressure
> cgroups, so by rearranging the tasks it is possible to watch a specific
> pressure (i.e. caused by cpuset and/or memcg).
> 
> Note that while this adds the cgroups support, the code is well separated
> and eventually we might add a lightweight, non-cgroups API, i.e. vmevent.
> But this is another story.
> 
> Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>

Sorry still I didn't look at your implementation about cgroup part.
but I had a question since long time ago.

How can we avoid false positives regarding zones and NUMA?
I mean DMA zone is short in system so VM notify to user and user
free all memory of NORMAL zone because he can't know what pages live
in any zones. NUMA is ditto.
Andrew Morton Jan. 8, 2013, 9:44 p.m. UTC | #6
On Fri,  4 Jan 2013 00:29:11 -0800
Anton Vorontsov <anton.vorontsov@linaro.org> wrote:

> This commit implements David Rientjes' idea of mempressure cgroup.
> 
> The main characteristics are the same to what I've tried to add to vmevent
> API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
> pressure index calculation. But we don't expose the index to the userland.
> Instead, there are three levels of the pressure:
> 
>  o low (just reclaiming, e.g. caches are draining);
>  o medium (allocation cost becomes high, e.g. swapping);
>  o oom (about to oom very soon).
> 
> The rationale behind exposing levels and not the raw pressure index
> described here: http://lkml.org/lkml/2012/11/16/675
> 
> For a task it is possible to be in both cpusets, memcg and mempressure
> cgroups, so by rearranging the tasks it is possible to watch a specific
> pressure (i.e. caused by cpuset and/or memcg).
> 
> Note that while this adds the cgroups support, the code is well separated
> and eventually we might add a lightweight, non-cgroups API, i.e. vmevent.
> But this is another story.
> 

I'd have thought that it's pretty important to offer this feature to
non-cgroups setups.  Restricting it to cgroups-only seems a large
limitation.

> diff --git a/mm/mempressure.c b/mm/mempressure.c
> new file mode 100644
> index 0000000..ea312bb
> --- /dev/null
> +++ b/mm/mempressure.c
> @@ -0,0 +1,330 @@
> +/*
> + * Linux VM pressure
> + *
> + * Copyright 2012 Linaro Ltd.
> + *		  Anton Vorontsov <anton.vorontsov@linaro.org>
> + *
> + * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
> + * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published
> + * by the Free Software Foundation.
> + */
> +
> +#include <linux/cgroup.h>
> +#include <linux/fs.h>
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/vmstat.h>
> +#include <linux/eventfd.h>
> +#include <linux/swap.h>
> +#include <linux/printk.h>
> +
> +static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r);

mm/ doesn't use uint or ulong.  In fact I can find zero uses of either
in all of mm/.

I don't have a problem with them personally - they're short and clear. 
But we just ...  don't do that.  Perhaps we shold start using them.

> +
> +/*
> + * Generic VM Pressure routines (no cgroups or any other API details)
> + */
> +
> +/*
> + * The window size is the number of scanned pages before we try to analyze
> + * the scanned/reclaimed ratio (or difference).
> + *
> + * It is used as a rate-limit tunable for the "low" level notification,
> + * and for averaging medium/oom levels. Using small window sizes can cause
> + * lot of false positives, but too big window size will delay the
> + * notifications.
> + */
> +static const uint vmpressure_win = SWAP_CLUSTER_MAX * 16;
> +static const uint vmpressure_level_med = 60;
> +static const uint vmpressure_level_oom = 99;
> +static const uint vmpressure_level_oom_prio = 4;
> +
> +enum vmpressure_levels {
> +	VMPRESSURE_LOW = 0,
> +	VMPRESSURE_MEDIUM,
> +	VMPRESSURE_OOM,

VMPRESSURE_OOM seems an odd-man-out.  VMPRESSURE_HIGH would be pleasing.

> +	VMPRESSURE_NUM_LEVELS,
> +};
> +
>
> ...
>
> +static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r)
> +{
> +	/*
> +	 * There are two options for implementing cgroup pressure
> +	 * notifications:
> +	 *
> +	 * - Store pressure counter atomically in the task struct. Upon
> +	 *   hitting 'window' wake up a workqueue that will walk every
> +	 *   task and sum per-thread pressure into cgroup pressure (to
> +	 *   which the task belongs). The cons are obvious: bloats task
> +	 *   struct, have to walk all processes and makes pressure less
> +	 *   accurate (the window becomes per-thread);
> +	 *
> +	 * - Store pressure counters in per-cgroup state. This is easy and
> +	 *   straightforward, and that's how we do things here. But this
> +	 *   requires us to not put the vmpressure hooks into hotpath,
> +	 *   since we have to grab some locks.
> +	 */
> +
> +#ifdef CONFIG_MEMCG
> +	if (memcg) {
> +		struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
> +		struct cgroup *cg = css->cgroup;
> +		struct mpc_state *mpc = cg2mpc(cg);
> +
> +		if (mpc)
> +			__mpc_vmpressure(mpc, s, r);
> +		return;
> +	}
> +#endif
> +	task_lock(current);
> +	__mpc_vmpressure(tsk2mpc(current), s, r);
> +	task_unlock(current);
> +}

The task_lock() is mysterious.  What's it protecting?  That's unobvious
and afacit undocumented.

Also it is buggy: __mpc_vmpressure() does mutex_lock(). 
Documentation/SubmitChecklist section 12 has handy hints!

>
> ...
>
Glauber Costa Jan. 9, 2013, 8:56 a.m. UTC | #7
Hi.

I have a couple of small questions.

On 01/04/2013 12:29 PM, Anton Vorontsov wrote:
> This commit implements David Rientjes' idea of mempressure cgroup.
> 
> The main characteristics are the same to what I've tried to add to vmevent
> API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
> pressure index calculation. But we don't expose the index to the userland.
> Instead, there are three levels of the pressure:
> 
>  o low (just reclaiming, e.g. caches are draining);
>  o medium (allocation cost becomes high, e.g. swapping);
>  o oom (about to oom very soon).
> 
> The rationale behind exposing levels and not the raw pressure index
> described here: http://lkml.org/lkml/2012/11/16/675
> 
> For a task it is possible to be in both cpusets, memcg and mempressure
> cgroups, so by rearranging the tasks it is possible to watch a specific
> pressure (i.e. caused by cpuset and/or memcg).
> 
> Note that while this adds the cgroups support, the code is well separated
> and eventually we might add a lightweight, non-cgroups API, i.e. vmevent.
> But this is another story.
Andrew already said he would like to see this exposed to non cgroup
users, I'll just add to that: I'd like the interfaces to be consistent.

We need to make sure that cgroups and non-cgroup users will act on this
in the same way. So it is important that this is included in the
proposition, so we can judge and avoid a future kludge.

> diff --git a/Documentation/cgroups/mempressure.txt b/Documentation/cgroups/mempressure.txt
> new file mode 100644
> index 0000000..dbc0aca
> --- /dev/null
> +++ b/Documentation/cgroups/mempressure.txt
> @@ -0,0 +1,50 @@
> +  Memory pressure cgroup
> +~~~~~~~~~~~~~~~~~~~~~~~~~~
> +  Before using the mempressure cgroup, make sure you have it mounted:
> +
> +   # cd /sys/fs/cgroup/
> +   # mkdir mempressure
> +   # mount -t cgroup cgroup ./mempressure -o mempressure
> +
> +  It is possible to combine cgroups, for example you can mount memory
> +  (memcg) and mempressure cgroups together:
> +
> +   # mount -t cgroup cgroup ./mempressure -o memory,mempressure
> +

Most of the time these days, the groups are mounted separately. The
tasks, however, still belong to one or more controllers regardless of
where they are mounted.

Can you describe a bit better (not only in reply, but also update the
docs) what happens when:

1) both cpusets and memcg are present. Which one takes precedence? Will
there be a way to differentiate which kind of pressure is being seen so
I as a task can adjust my actions accordingly?

2) the task belongs to memcg (or cpuset), but the controllers itself are
mounted separately. Is it equivalent to mounted them jointly? Will this
fact just be ignored by the pressure levels?

I can guess the answer to some of them by the code, but I think it is
quite important to have all this crystal clear.

> +    ("low", "medium", "oom" are permitted.)
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index f204a7a..b9802e2 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -37,6 +37,12 @@ SUBSYS(mem_cgroup)
>  
>  /* */
>  
> +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_MEMPRESSURE)
> +SUBSYS(mpc_cgroup)
> +#endif

It might be just me, but if one does not know what this is about, "mpc"
immediately fetches something communication-related to mind. I would
suggest changing this to just plain "mempressure_cgroup", or something
more descriptive.

> diff --git a/mm/mempressure.c b/mm/mempressure.c
> new file mode 100644
> index 0000000..ea312bb
> --- /dev/null
> +++ b/mm/mempressure.c
> @@ -0,0 +1,330 @@
> +/*
> + * Linux VM pressure
> + *
> + * Copyright 2012 Linaro Ltd.
> + *		  Anton Vorontsov <anton.vorontsov@linaro.org>
> + *
> + * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
> + * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published
> + * by the Free Software Foundation.
> + */
> +
> +#include <linux/cgroup.h>
> +#include <linux/fs.h>
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/vmstat.h>
> +#include <linux/eventfd.h>
> +#include <linux/swap.h>
> +#include <linux/printk.h>
> +
> +static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r);
> +
> +/*
> + * Generic VM Pressure routines (no cgroups or any other API details)
> + */
> +
> +/*
> + * The window size is the number of scanned pages before we try to analyze
> + * the scanned/reclaimed ratio (or difference).
> + *
> + * It is used as a rate-limit tunable for the "low" level notification,
> + * and for averaging medium/oom levels. Using small window sizes can cause
> + * lot of false positives, but too big window size will delay the
> + * notifications.
> + */
> +static const uint vmpressure_win = SWAP_CLUSTER_MAX * 16;
> +static const uint vmpressure_level_med = 60;
> +static const uint vmpressure_level_oom = 99;
> +static const uint vmpressure_level_oom_prio = 4;
> +
> +enum vmpressure_levels {
> +	VMPRESSURE_LOW = 0,
> +	VMPRESSURE_MEDIUM,
> +	VMPRESSURE_OOM,
> +	VMPRESSURE_NUM_LEVELS,
> +};
> +
> +static const char *vmpressure_str_levels[] = {
> +	[VMPRESSURE_LOW] = "low",
> +	[VMPRESSURE_MEDIUM] = "medium",
> +	[VMPRESSURE_OOM] = "oom",
> +};
> +
> +static enum vmpressure_levels vmpressure_level(uint pressure)
> +{
> +	if (pressure >= vmpressure_level_oom)
> +		return VMPRESSURE_OOM;
> +	else if (pressure >= vmpressure_level_med)
> +		return VMPRESSURE_MEDIUM;
> +	return VMPRESSURE_LOW;
> +}
> +
> +static ulong vmpressure_calc_level(uint win, uint s, uint r)
> +{
> +	ulong p;
> +
> +	if (!s)
> +		return 0;
> +
> +	/*
> +	 * We calculate the ratio (in percents) of how many pages were
> +	 * scanned vs. reclaimed in a given time frame (window). Note that
> +	 * time is in VM reclaimer's "ticks", i.e. number of pages
> +	 * scanned. This makes it possible to set desired reaction time
> +	 * and serves as a ratelimit.
> +	 */
> +	p = win - (r * win / s);
> +	p = p * 100 / win;
> +
> +	pr_debug("%s: %3lu  (s: %6u  r: %6u)\n", __func__, p, s, r);
> +
> +	return vmpressure_level(p);
> +}
> +
> +void vmpressure(struct mem_cgroup *memcg, ulong scanned, ulong reclaimed)
> +{
> +	if (!scanned)
> +		return;
> +	mpc_vmpressure(memcg, scanned, reclaimed);
> +}
> +
> +void vmpressure_prio(struct mem_cgroup *memcg, int prio)
> +{
> +	if (prio > vmpressure_level_oom_prio)
> +		return;
> +
> +	/* OK, the prio is below the threshold, send the pre-OOM event. */
> +	vmpressure(memcg, vmpressure_win, 0);
> +}
> +
> +/*
> + * Memory pressure cgroup code
> + */
> +
> +struct mpc_event {
> +	struct eventfd_ctx *efd;
> +	enum vmpressure_levels level;
> +	struct list_head node;
> +};
> +
> +struct mpc_state {
> +	struct cgroup_subsys_state css;
> +
> +	uint scanned;
> +	uint reclaimed;
> +	struct mutex sr_lock;
> +
> +	struct list_head events;
> +	struct mutex events_lock;
> +
> +	struct work_struct work;
> +};
> +
> +static struct mpc_state *wk2mpc(struct work_struct *wk)
> +{
> +	return container_of(wk, struct mpc_state, work);
> +}
> +
> +static struct mpc_state *css2mpc(struct cgroup_subsys_state *css)
> +{
> +	return container_of(css, struct mpc_state, css);
> +}
> +
> +static struct mpc_state *tsk2mpc(struct task_struct *tsk)
> +{
> +	return css2mpc(task_subsys_state(tsk, mpc_cgroup_subsys_id));
> +}
> +
> +static struct mpc_state *cg2mpc(struct cgroup *cg)
> +{
> +	return css2mpc(cgroup_subsys_state(cg, mpc_cgroup_subsys_id));
> +}

I think we would be better of with more descriptive names here as well.
Other cgroups would use the convention of using _to_ and _from_ in names
instead of 2.

For instance, task_to_mempressure is a lot more descriptive than
"tsk2mpc". There are no bonus points for manually compressing code.

> +
> +static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r)
> +{
> +	/*
> +	 * There are two options for implementing cgroup pressure
> +	 * notifications:
> +	 *
> +	 * - Store pressure counter atomically in the task struct. Upon
> +	 *   hitting 'window' wake up a workqueue that will walk every
> +	 *   task and sum per-thread pressure into cgroup pressure (to
> +	 *   which the task belongs). The cons are obvious: bloats task
> +	 *   struct, have to walk all processes and makes pressure less
> +	 *   accurate (the window becomes per-thread);
> +	 *
> +	 * - Store pressure counters in per-cgroup state. This is easy and
> +	 *   straightforward, and that's how we do things here. But this
> +	 *   requires us to not put the vmpressure hooks into hotpath,
> +	 *   since we have to grab some locks.
> +	 */
> +
> +#ifdef CONFIG_MEMCG
> +	if (memcg) {
> +		struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
> +		struct cgroup *cg = css->cgroup;
> +		struct mpc_state *mpc = cg2mpc(cg);
> +
> +		if (mpc)
> +			__mpc_vmpressure(mpc, s, r);
> +		return;
> +	}
> +#endif
> +	task_lock(current);
> +	__mpc_vmpressure(tsk2mpc(current), s, r);
> +	task_unlock(current);
> +}

How about cpusets?

I still see no significant mention of it, and I would like to understand
how does it get into play in practice.
Andrew Morton Jan. 9, 2013, 9:15 a.m. UTC | #8
On Wed, 9 Jan 2013 12:56:46 +0400 Glauber Costa <glommer@parallels.com> wrote:

> > +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_MEMPRESSURE)
> > +SUBSYS(mpc_cgroup)
> > +#endif
> 
> It might be just me, but if one does not know what this is about, "mpc"
> immediately fetches something communication-related to mind. I would
> suggest changing this to just plain "mempressure_cgroup", or something
> more descriptive.

mempressure_cgroup is rather lengthy.  "mpcg" would be good - it's short
and rememberable.
Glauber Costa Jan. 9, 2013, 1:43 p.m. UTC | #9
On 01/09/2013 01:15 PM, Andrew Morton wrote:
> On Wed, 9 Jan 2013 12:56:46 +0400 Glauber Costa <glommer@parallels.com> wrote:
> 
>>> +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_MEMPRESSURE)
>>> +SUBSYS(mpc_cgroup)
>>> +#endif
>>
>> It might be just me, but if one does not know what this is about, "mpc"
>> immediately fetches something communication-related to mind. I would
>> suggest changing this to just plain "mempressure_cgroup", or something
>> more descriptive.
> 
> mempressure_cgroup is rather lengthy.  "mpcg" would be good - it's short
> and rememberable.
> 
Or, since most of the cgroups don't actually use the suffix "cgroup"
(with the exception of cpu and memcg), maybe just mempressure?
Glauber Costa Jan. 9, 2013, 2:10 p.m. UTC | #10
On 01/09/2013 01:44 AM, Andrew Morton wrote:
> On Fri,  4 Jan 2013 00:29:11 -0800
> Anton Vorontsov <anton.vorontsov@linaro.org> wrote:
> 
>> This commit implements David Rientjes' idea of mempressure cgroup.
>>
>> The main characteristics are the same to what I've tried to add to vmevent
>> API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
>> pressure index calculation. But we don't expose the index to the userland.
>> Instead, there are three levels of the pressure:
>>
>>  o low (just reclaiming, e.g. caches are draining);
>>  o medium (allocation cost becomes high, e.g. swapping);
>>  o oom (about to oom very soon).
>>
>> The rationale behind exposing levels and not the raw pressure index
>> described here: http://lkml.org/lkml/2012/11/16/675
>>
>> For a task it is possible to be in both cpusets, memcg and mempressure
>> cgroups, so by rearranging the tasks it is possible to watch a specific
>> pressure (i.e. caused by cpuset and/or memcg).
>>
>> Note that while this adds the cgroups support, the code is well separated
>> and eventually we might add a lightweight, non-cgroups API, i.e. vmevent.
>> But this is another story.
>>
> 
> I'd have thought that it's pretty important offer this feature to
> non-cgroups setups.  Restricting it to cgroups-only seems a large
> limitation.
> 

Why is it so, Andrew?

When we talk about "cgroups", we are not necessarily talking about the
whole beast, with all controllers enabled. Much less we are talking
about hierarchies being created, and tasks put on it.

It's an interface only. And since all controllers will always have a
special "root" cgroup, this applies to the tasks in the system all the
same. In the end of the day, if we have something like
CONFIG_MEMPRESSURE that selects CONFIG_CGROUP, the user needs to do the
same thing to actually turn on the functionality: switch a config
option. It is not more expensive, and it doesn't bring in anything extra
as well.

To actually use it, one needs to mount the filesystem, and write to a
file. Nothing else.

What is that drives this opposition towards a cgroup-only interface?
Is it about the interface, or the underlying machinery ?
Andrew Morton Jan. 9, 2013, 8:28 p.m. UTC | #11
On Wed, 9 Jan 2013 18:10:02 +0400
Glauber Costa <glommer@parallels.com> wrote:

> On 01/09/2013 01:44 AM, Andrew Morton wrote:
> > On Fri,  4 Jan 2013 00:29:11 -0800
> > Anton Vorontsov <anton.vorontsov@linaro.org> wrote:
> > 
> >> This commit implements David Rientjes' idea of mempressure cgroup.
> >>
> >> The main characteristics are the same to what I've tried to add to vmevent
> >> API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
> >> pressure index calculation. But we don't expose the index to the userland.
> >> Instead, there are three levels of the pressure:
> >>
> >>  o low (just reclaiming, e.g. caches are draining);
> >>  o medium (allocation cost becomes high, e.g. swapping);
> >>  o oom (about to oom very soon).
> >>
> >> The rationale behind exposing levels and not the raw pressure index
> >> described here: http://lkml.org/lkml/2012/11/16/675
> >>
> >> For a task it is possible to be in both cpusets, memcg and mempressure
> >> cgroups, so by rearranging the tasks it is possible to watch a specific
> >> pressure (i.e. caused by cpuset and/or memcg).
> >>
> >> Note that while this adds the cgroups support, the code is well separated
> >> and eventually we might add a lightweight, non-cgroups API, i.e. vmevent.
> >> But this is another story.
> >>
> > 
> > I'd have thought that it's pretty important offer this feature to
> > non-cgroups setups.  Restricting it to cgroups-only seems a large
> > limitation.
> > 
> 
> Why is it so, Andrew?
> 
> When we talk about "cgroups", we are not necessarily talking about the
> whole beast, with all controllers enabled. Much less we are talking
> about hierarchies being created, and tasks put on it.
> 
> It's an interface only. And since all controllers will always have a
> special "root" cgroup, this applies to the tasks in the system all the
> same. In the end of the day, if we have something like
> CONFIG_MEMPRESSURE that selects CONFIG_CGROUP, the user needs to do the
> same thing to actually turn on the functionality: switch a config
> option. It is not more expensive, and it doesn't bring in anything extra
> as well.
> 
> To actually use it, one needs to mount the filesystem, and write to a
> file. Nothing else.
> 

Oh, OK, well if the feature can be used in a system-wide fashion in
this manner then I guess that is sufficient.  For some reason I was
thinking it was tied to memcg, doh.
Tejun Heo Jan. 9, 2013, 8:37 p.m. UTC | #12
Hello,

Can you please cc me too when posting further patches?  I kinda missed
the whole discussion upto this point.

On Fri, Jan 04, 2013 at 12:29:11AM -0800, Anton Vorontsov wrote:
> This commit implements David Rientjes' idea of mempressure cgroup.
> 
> The main characteristics are the same to what I've tried to add to vmevent
> API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
> pressure index calculation. But we don't expose the index to the userland.
> Instead, there are three levels of the pressure:
> 
>  o low (just reclaiming, e.g. caches are draining);
>  o medium (allocation cost becomes high, e.g. swapping);
>  o oom (about to oom very soon).
> 
> The rationale behind exposing levels and not the raw pressure index
> described here: http://lkml.org/lkml/2012/11/16/675
> 
> For a task it is possible to be in both cpusets, memcg and mempressure
> cgroups, so by rearranging the tasks it is possible to watch a specific
> pressure (i.e. caused by cpuset and/or memcg).

So, cgroup is headed towards single hierarchy.  Dunno how much it
would affect mempressure but it probably isn't wise to design with
focus on multiple hierarchies.

Isn't memory reclaim and oom condition tied to memcgs when memcg is in
use?  It seems natural to tie mempressure to memcg.  Is there some
reason this should be a separate cgroup.  I'm kinda worried this is
headed cpuacct / cpu silliness we have.  Glauber, what's your opinion
here?

Thanks.
Tejun Heo Jan. 9, 2013, 8:39 p.m. UTC | #13
On Wed, Jan 09, 2013 at 12:37:31PM -0800, Tejun Heo wrote:
> Hello,
> 
> Can you please cc me too when posting further patches?  I kinda missed
> the whole discussion upto this point.
> 
> On Fri, Jan 04, 2013 at 12:29:11AM -0800, Anton Vorontsov wrote:
> > This commit implements David Rientjes' idea of mempressure cgroup.
> > 
> > The main characteristics are the same to what I've tried to add to vmevent
> > API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
> > pressure index calculation. But we don't expose the index to the userland.
> > Instead, there are three levels of the pressure:
> > 
> >  o low (just reclaiming, e.g. caches are draining);
> >  o medium (allocation cost becomes high, e.g. swapping);
> >  o oom (about to oom very soon).
> > 
> > The rationale behind exposing levels and not the raw pressure index
> > described here: http://lkml.org/lkml/2012/11/16/675
> > 
> > For a task it is possible to be in both cpusets, memcg and mempressure
> > cgroups, so by rearranging the tasks it is possible to watch a specific
> > pressure (i.e. caused by cpuset and/or memcg).
> 
> So, cgroup is headed towards single hierarchy.  Dunno how much it
> would affect mempressure but it probably isn't wise to design with
> focus on multiple hierarchies.

Also, how are you implementing hierarchical behavior?  All controllers
should support hierarchy.  Can you please explain how the interface
would work in detail?

Thanks.
Glauber Costa Jan. 9, 2013, 9:20 p.m. UTC | #14
On 01/10/2013 12:37 AM, Tejun Heo wrote:
> Hello,
> 
> Can you please cc me too when posting further patches?  I kinda missed
> the whole discussion upto this point.
> 
> On Fri, Jan 04, 2013 at 12:29:11AM -0800, Anton Vorontsov wrote:
>> This commit implements David Rientjes' idea of mempressure cgroup.
>>
>> The main characteristics are the same to what I've tried to add to vmevent
>> API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
>> pressure index calculation. But we don't expose the index to the userland.
>> Instead, there are three levels of the pressure:
>>
>>  o low (just reclaiming, e.g. caches are draining);
>>  o medium (allocation cost becomes high, e.g. swapping);
>>  o oom (about to oom very soon).
>>
>> The rationale behind exposing levels and not the raw pressure index
>> described here: http://lkml.org/lkml/2012/11/16/675
>>
>> For a task it is possible to be in both cpusets, memcg and mempressure
>> cgroups, so by rearranging the tasks it is possible to watch a specific
>> pressure (i.e. caused by cpuset and/or memcg).
> 
> So, cgroup is headed towards single hierarchy.  Dunno how much it
> would affect mempressure but it probably isn't wise to design with
> focus on multiple hierarchies.
> 
> Isn't memory reclaim and oom condition tied to memcgs when memcg is in
> use?  It seems natural to tie mempressure to memcg.  Is there some
> reason this should be a separate cgroup.  I'm kinda worried this is
> headed cpuacct / cpu silliness we have.  Glauber, what's your opinion
> here?
> 

I've already said this in a previous incarnation of this thread. But
I'll summarize my main points:

* I believe this mechanism is superior to memcg notification mechanism.
> * I believe memcg notification mechanism is quite coarse - we actually
define the thresholds prior to flushing the stock, which means we can be
wrong by as much as 32 * ncpus.
* Agreeing with you that most of the data will come from memcg, I just
think this should all be part of memcg.
* memcg is indeed expensive even when it is not being used, so global
users would like to avoid it. This is true, but I've already
demonstrated that it is an implementation problem rather than a
conceptual problem, and can be fixed - although I had not yet the time
to go back to it (but now I have a lot less on my shoulders than before)

Given the above, I believe that ideally we should use this pressure
mechanism in memcg replacing the current memcg notification mechanism.
More or less like timer expiration happens: you could still write
numbers for compatibility, but those numbers would be internally mapped
into the levels Anton is proposing, that makes *way* more sense.

If that is not possible, they should coexist as "notification" and a
"pressure" mechanism inside memcg.

The main argument against it centered around cpusets also being able to
participate in the play. I haven't yet understood how would it take
place. In particular, I saw no mention to cpusets in the patches.

I will say again that I fully know memcg is expensive. We all do.
However, it only matters to the global case. For the child cgroup case,
you are *already* paying this anyway. And for the global case, we should
not use the costs of it as an excuse: we should fix it, or otherwise
prove that it is unfixable.
Anton Vorontsov Jan. 9, 2013, 9:36 p.m. UTC | #15
On Thu, Jan 10, 2013 at 01:20:30AM +0400, Glauber Costa wrote:
[...]
> Given the above, I believe that ideally we should use this pressure
> mechanism in memcg replacing the current memcg notification mechanism.

Just a quick wonder: why would we need to place it into memcg, when we
don't need any of the memcg stuff for it? I see no benefits, not
design-wise, not implementation-wise or anything-wise. :)

We can use mempressure w/o memcg, and even then it can (or should :) be
useful (for cpuset, for example).

> More or less like timer expiration happens: you could still write
> numbers for compatibility, but those numbers would be internally mapped
> into the levels Anton is proposing, that makes *way* more sense.
> 
> If that is not possible, they should coexist as "notification" and a
> "pressure" mechanism inside memcg.
> 
> The main argument against it centered around cpusets also being able to
> participate in the play. I haven't yet understood how would it take
> place. In particular, I saw no mention to cpusets in the patches.

I didn't test it, but as I see it, once a process in a specific cpuset,
the task can only use a specific allowed zones for reclaim/alloc, i.e.
various checks like this in vmscan:

         if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                     continue;

So, vmscan simply won't call vmpressure() if the zone is not allowed (so
we won't account that pressure, from that zone).

Thanks,
Anton
Tejun Heo Jan. 9, 2013, 9:55 p.m. UTC | #16
Hello, Anton.

On Wed, Jan 09, 2013 at 01:36:04PM -0800, Anton Vorontsov wrote:
> On Thu, Jan 10, 2013 at 01:20:30AM +0400, Glauber Costa wrote:
> [...]
> > Given the above, I believe that ideally we should use this pressure
> > mechanism in memcg replacing the current memcg notification mechanism.
> 
> Just a quick wonder: why would we need to place it into memcg, when we
> don't need any of the memcg stuff for it? I see no benefits, not
> design-wise, not implementation-wise or anything-wise. :)

Maybe I'm misunderstanding the whole thing but how can memory pressure
exist apart from memcg when memcg is in use?  Memory limits, reclaim
and OOM are all per-memcg, how do you even define memory pressure?  If
ten tasks belong to a memcg w/ a lot of spare memory and one belongs
to another which is about to hit OOM, is that mempressure cgroup under
pressure?

> We can use mempressure w/o memcg, and even then it can (or should :) be
> useful (for cpuset, for example).

The problem is that you end with, at the very least, duplicate
hierarchical accounting mechanisms which overlap with each other
while, most likely, being slightly different.  About the same thing
happened with cpu and cpuacct controllers and we're now trying to
deprecate the latter.

Please talk with memcg people and fold it into memcg.  It can (and
should) be done in a way to not incur overhead when only root memcg is
in use and how this is done defines userland-visible interface, so
let's please not repeat past mistakes.

Thanks.
Tejun Heo Jan. 9, 2013, 10:04 p.m. UTC | #17
On Wed, Jan 09, 2013 at 01:55:14PM -0800, Tejun Heo wrote:
> Please talk with memcg people and fold it into memcg.  It can (and
> should) be done in a way to not incur overhead when only root memcg is
> in use and how this is done defines userland-visible interface, so
> let's please not repeat past mistakes.

CC'ing KAMEZAWA, Johannes, Li and cgroup mailing list.  Please keep
them cc'd for further discussion.

Thanks.
Anton Vorontsov Jan. 9, 2013, 10:06 p.m. UTC | #18
On Wed, Jan 09, 2013 at 01:55:14PM -0800, Tejun Heo wrote:
[...]
> > We can use mempressure w/o memcg, and even then it can (or should :) be
> > useful (for cpuset, for example).
> 
> The problem is that you end with, at the very least, duplicate
> hierarchical accounting mechanisms which overlap with each other
> while, most likely, being slightly different.  About the same thing
> happened with cpu and cpuacct controllers and we're now trying to
> deprecate the latter.

Yeah. I started answering your comments about hierarchical accounting,
looked into the memcg code, and realized that *this* is where I need the
memcg stuff. :)

Thus yes, I guess I'll have to integrate it with memcg, or sort of.

I will surely Cc you on the next iterations.

Thanks,
Anton
Anton Vorontsov Jan. 9, 2013, 10:14 p.m. UTC | #19
On Tue, Jan 08, 2013 at 05:49:49PM +0900, Minchan Kim wrote:
[...]
> Sorry still I didn't look at your implementation about cgroup part.
> but I had a question since long time ago.
> 
> How can we make sure false positive about zone and NUMA?
> I mean DMA zone is short in system so VM notify to user and user
> free all memory of NORMAL zone because he can't know what pages live
> in any zones. NUMA is ditto.

Um, we count scans irrespective of zones or nodes, i.e. we sum all 'number
of scanned' and 'number of reclaimed' stats. So, it should not be a
problem, as I see it.

Thanks,
Anton
Tejun Heo Jan. 9, 2013, 10:21 p.m. UTC | #20
Hello, Anton.

On Wed, Jan 09, 2013 at 02:06:41PM -0800, Anton Vorontsov wrote:
> Yeah. I started answering your comments about hierarchical accounting,
> looked into the memcg code, and realized that *this* is where I need the
> memcg stuff. :)

Yay, I wasn't completely clueless.

> Thus yes, I guess I'll have to integrate it with memcg, or sort of.

I really don't know much about memcg internals but I guess
implementation can be split into two pieces.  memcg already has its
own accounting and pressure mechanism so it should be possible to bolt
on the mempressure interface on top of already existing data.  You can
improve / bring some sanity :) to memcg if the proposed mempressure
implementation is better.

Thanks.
Glauber Costa Jan. 10, 2013, 7:18 a.m. UTC | #21
On 01/10/2013 02:06 AM, Anton Vorontsov wrote:
> On Wed, Jan 09, 2013 at 01:55:14PM -0800, Tejun Heo wrote:
> [...]
>>> We can use mempressure w/o memcg, and even then it can (or should :) be
>>> useful (for cpuset, for example).
>>
>> The problem is that you end with, at the very least, duplicate
>> hierarchical accounting mechanisms which overlap with each other
>> while, most likely, being slightly different.  About the same thing
>> happened with cpu and cpuacct controllers and we're now trying to
>> deprecate the latter.
> 
> Yeah. I started answering your comments about hierarchical accounting,
> looked into the memcg code, and realized that *this* is where I need the
> memcg stuff. :)
> 
> Thus yes, I guess I'll have to integrate it with memcg, or sort of.
> 

That being my point since the beginning. To generate per-memcg pressure,
you need memcg anyway. So you would have to have two different and
orthogonal mechanisms, and therefore, double account.
Minchan Kim Jan. 11, 2013, 5:12 a.m. UTC | #22
On Wed, Jan 09, 2013 at 02:14:49PM -0800, Anton Vorontsov wrote:
> On Tue, Jan 08, 2013 at 05:49:49PM +0900, Minchan Kim wrote:
> [...]
> > Sorry still I didn't look at your implementation about cgroup part.
> > but I had a question since long time ago.
> > 
> > How can we make sure false positive about zone and NUMA?
> > I mean DMA zone is short in system so VM notify to user and user
> > free all memory of NORMAL zone because he can't know what pages live
> > in any zones. NUMA is ditto.
> 
> Um, we count scans irrespective of zones or nodes, i.e. we sum all 'number
> of scanned' and 'number of reclaimed' stats. So, it should not be a
> problem, as I see it.

Why is it no problem? For example, let's think of normal zone reclaim.
Page allocator try to allocate pages from NORMAL zone to DMA zone fallback
and your logic could trigger mpc_shrinker. So process A, B, C start to
> release their freeable memory but unfortunately, freed pages are all
> HIGHMEM pages. Why should processes release memory unnecessarily?
> Is there any method for a process to detect such situation in user level
before releasing the freeable memory?

In android smart phone, until now, there was a zone - DMA so low memory
killer didn't have a problem but these days smart phone use 2G DRAM so
we started seeing the above problem. Your generic approach should solve
the problem, too.

> 
> Thanks,
> Anton
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
Anton Vorontsov Jan. 11, 2013, 5:38 a.m. UTC | #23
On Fri, Jan 11, 2013 at 02:12:10PM +0900, Minchan Kim wrote:
> On Wed, Jan 09, 2013 at 02:14:49PM -0800, Anton Vorontsov wrote:
> > On Tue, Jan 08, 2013 at 05:49:49PM +0900, Minchan Kim wrote:
> > [...]
> > > Sorry still I didn't look at your implementation about cgroup part.
> > > but I had a question since long time ago.
> > > 
> > > How can we make sure false positive about zone and NUMA?
> > > I mean DMA zone is short in system so VM notify to user and user
> > > free all memory of NORMAL zone because he can't know what pages live
> > > in any zones. NUMA is ditto.
> > 
> > Um, we count scans irrespective of zones or nodes, i.e. we sum all 'number
> > of scanned' and 'number of reclaimed' stats. So, it should not be a
> > problem, as I see it.
> 
> Why is it no problem? For example, let's think of normal zone reclaim.
> Page allocator try to allocate pages from NORMAL zone to DMA zone fallback
> and your logic could trigger mpc_shrinker. So process A, B, C start to
> > release their freeable memory but unfortunately, freed pages are all
> > HIGHMEM pages. Why should processes release memory unnecessarily?
> > Is there any method for a process to detect such situation in user level
> before releasing the freeable memory?

Ahh. You're talking about the shrinker interface. Yes, there is no way to
tell if the freed memory will be actually "released" (and if not, then
yes, we released it unnecessary).

But that's not only problem with NUMA or zones. Shared pages are in the
same boat, right? An app might free some memory, but as another process
might be still using it, we don't know whether our action helps or not.

The situation is a little bit easier for the in-kernel shrinkers, since we
have more control over pages, but still, even for the kernel shrinkers, we
don't provide all the information (only gfpmask, which, I just looked into
the random user, drivers/gpu/drm/ttm, sometimes is not used).

So, answering your question: no, I don't know how to solve it for the
userland. But I also don't think it's a big concern (especially if we make
it cgroup-aware -- this would be cgroup's worry then, i.e. we might
isolate task to only some nodes/zones, if we really care about precise
accounting?). But I'm surely open for ideas. :)

Thanks!

Anton
Minchan Kim Jan. 11, 2013, 5:56 a.m. UTC | #24
On Thu, Jan 10, 2013 at 09:38:31PM -0800, Anton Vorontsov wrote:
> On Fri, Jan 11, 2013 at 02:12:10PM +0900, Minchan Kim wrote:
> > On Wed, Jan 09, 2013 at 02:14:49PM -0800, Anton Vorontsov wrote:
> > > On Tue, Jan 08, 2013 at 05:49:49PM +0900, Minchan Kim wrote:
> > > [...]
> > > > Sorry still I didn't look at your implementation about cgroup part.
> > > > but I had a question since long time ago.
> > > > 
> > > > How can we make sure false positive about zone and NUMA?
> > > > I mean DMA zone is short in system so VM notify to user and user
> > > > free all memory of NORMAL zone because he can't know what pages live
> > > > in any zones. NUMA is ditto.
> > > 
> > > Um, we count scans irrespective of zones or nodes, i.e. we sum all 'number
> > > of scanned' and 'number of reclaimed' stats. So, it should not be a
> > > problem, as I see it.
> > 
> > Why is it no problem? For example, let's think of normal zone reclaim.
> > Page allocator try to allocate pages from NORMAL zone to DMA zone fallback
> > and your logic could trigger mpc_shrinker. So process A, B, C start to
> > release their freeable memory but unfortunately, freed pages are all
> > HIGHMEM pages. Why should processes release memory unnecessarily?
> > Is there any method for a process to detect such situation in user level
> > before releasing the freeable memory?
> 
> Ahh. You're talking about the shrinker interface. Yes, there is no way to
> tell if the freed memory will be actually "released" (and if not, then
> yes, we released it unnecessary).

I don't tell about actually "released" or not.
> I assume the application actually releases pages but the pages would be in
> other zones, NOT the zone targeted by the kernel. In that case, kernel could ask
continuously until target zone has enough free memory.

> 
> But that's not only problem with NUMA or zones. Shared pages are in the
> same boat, right? An app might free some memory, but as another process
> might be still using it, we don't know whether our action helps or not.

It's not what I meant.

> 
> The situation is a little bit easier for the in-kernel shrinkers, since we
> have more control over pages, but still, even for the kernel shrinkers, we
> don't provide all the information (only gfpmask, which, I just looked into
> the random user, drivers/gpu/drm/ttm, sometimes is not used).
> 
> So, answering your question: no, I don't know how to solve it for the
> userland. But I also don't think it's a big concern (especially if we make
> it cgroup-aware -- this would be cgroup's worry then, i.e. we might
> isolate task to only some nodes/zones, if we really care about precise
> accounting?). But I'm surely open for ideas. :)

My dumb idea is only notify to user when reclaim is triggered by
__GFP_HIGHMEM|__GFP_MOVABLE which is most gfp_t for application memory. :)


> 
> Thanks!
> 
> Anton
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
Anton Vorontsov Jan. 11, 2013, 6:09 a.m. UTC | #25
On Fri, Jan 11, 2013 at 02:56:15PM +0900, Minchan Kim wrote:
[...]
> > Ahh. You're talking about the shrinker interface. Yes, there is no way to
> > tell if the freed memory will be actually "released" (and if not, then
> > yes, we released it unnecessary).
> 
> I don't tell about actually "released" or not.
> > I assume the application actually releases pages but the pages would be in
> > other zones, NOT the zone targeted by the kernel. In that case, kernel could ask
> continuously until target zone has enough free memory.
[...]
> > isolate task to only some nodes/zones, if we really care about precise
> > accounting?). But I'm surely open for ideas. :)
> 
> My dumb idea is only notify to user when reclaim is triggered by
> __GFP_HIGHMEM|__GFP_MOVABLE which is most gfp_t for application memory. :)

Ah, I see. Sure, that will help a lot. I'll try to incorporate this into
the next iteration. But there are still unresolved accounting issues that
I outlined, and I don't think that they are this easy to solve. :)

Thanks!

Anton
Simon Jeons Jan. 13, 2013, 8:50 a.m. UTC | #26
On Fri, 2013-01-04 at 00:29 -0800, Anton Vorontsov wrote:
> This commit implements David Rientjes' idea of mempressure cgroup.
> 
> The main characteristics are the same to what I've tried to add to vmevent
> API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
> pressure index calculation. But we don't expose the index to the userland.
> Instead, there are three levels of the pressure:
> 
>  o low (just reclaiming, e.g. caches are draining);
>  o medium (allocation cost becomes high, e.g. swapping);
>  o oom (about to oom very soon).
> 
> The rationale behind exposing levels and not the raw pressure index
> described here: http://lkml.org/lkml/2012/11/16/675
> 
> For a task it is possible to be in both cpusets, memcg and mempressure
> cgroups, so by rearranging the tasks it is possible to watch a specific
> pressure (i.e. caused by cpuset and/or memcg).
> 
> Note that while this adds the cgroups support, the code is well separated
> and eventually we might add a lightweight, non-cgroups API, i.e. vmevent.
> But this is another story.
> 
> Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
> ---
>  Documentation/cgroups/mempressure.txt |  50 ++++++
>  include/linux/cgroup_subsys.h         |   6 +
>  include/linux/vmstat.h                |  11 ++
>  init/Kconfig                          |  12 ++
>  mm/Makefile                           |   1 +
>  mm/mempressure.c                      | 330 ++++++++++++++++++++++++++++++++++
>  mm/vmscan.c                           |   4 +
>  7 files changed, 414 insertions(+)
>  create mode 100644 Documentation/cgroups/mempressure.txt
>  create mode 100644 mm/mempressure.c
> 
> diff --git a/Documentation/cgroups/mempressure.txt b/Documentation/cgroups/mempressure.txt
> new file mode 100644
> index 0000000..dbc0aca
> --- /dev/null
> +++ b/Documentation/cgroups/mempressure.txt
> @@ -0,0 +1,50 @@
> +  Memory pressure cgroup
> +~~~~~~~~~~~~~~~~~~~~~~~~~~
> +  Before using the mempressure cgroup, make sure you have it mounted:
> +
> +   # cd /sys/fs/cgroup/
> +   # mkdir mempressure
> +   # mount -t cgroup cgroup ./mempressure -o mempressure
> +
> +  It is possible to combine cgroups, for example you can mount memory
> +  (memcg) and mempressure cgroups together:
> +
> +   # mount -t cgroup cgroup ./mempressure -o memory,mempressure
> +
> +  That way the reported pressure will honour memory cgroup limits. The
> +  same goes for cpusets.
> +
> +  After the hierarchy is mounted, you can use the following API:
> +
> +  /sys/fs/cgroup/.../mempressure.level
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +  To maintain the interactivity/memory allocation cost, one can use the
> +  pressure level notifications, and the levels are defined like this:
> +
> +  The "low" level means that the system is reclaiming memory for new
> +  allocations. Monitoring reclaiming activity might be useful for
> +  maintaining overall system's cache level. Upon notification, the program
> +  (typically "Activity Manager") might analyze vmstat and act in advance
> +  (i.e. prematurely shutdown unimportant services).
> +
> +  The "medium" level means that the system is experiencing medium memory
> +  pressure, there is some mild swapping activity. Upon this event
> +  applications may decide to free any resources that can be easily
> +  reconstructed or re-read from a disk.
> +
> +  The "oom" level means that the system is actively thrashing, it is about
> +  to go out of memory (OOM) or even the in-kernel OOM killer is on its way to
> +  trigger. Applications should do whatever they can to help the system.
> +
> +  Event control:
> +    Is used to setup an eventfd with a level threshold. The argument to
> +    the event control specifies the level threshold.
> +  Read:
> +    Reads memory pressure levels: low, medium or oom.
> +  Write:
> +    Not implemented.
> +  Test:
> +    To set up a notification:
> +
> +    # cgroup_event_listener ./mempressure.level low
> +    ("low", "medium", "oom" are permitted.)
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index f204a7a..b9802e2 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -37,6 +37,12 @@ SUBSYS(mem_cgroup)
>  
>  /* */
>  
> +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_MEMPRESSURE)
> +SUBSYS(mpc_cgroup)
> +#endif
> +
> +/* */
> +
>  #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE)
>  SUBSYS(devices)
>  #endif
> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> index a13291f..c1a66c7 100644
> --- a/include/linux/vmstat.h
> +++ b/include/linux/vmstat.h
> @@ -10,6 +10,17 @@
>  
>  extern int sysctl_stat_interval;
>  
> +struct mem_cgroup;
> +#ifdef CONFIG_CGROUP_MEMPRESSURE
> +extern void vmpressure(struct mem_cgroup *memcg,
> +		       ulong scanned, ulong reclaimed);
> +extern void vmpressure_prio(struct mem_cgroup *memcg, int prio);
> +#else
> +static inline void vmpressure(struct mem_cgroup *memcg,
> +			      ulong scanned, ulong reclaimed) {}
> +static inline void vmpressure_prio(struct mem_cgroup *memcg, int prio) {}
> +#endif
> +
>  #ifdef CONFIG_VM_EVENT_COUNTERS
>  /*
>   * Light weight per cpu counter implementation.
> diff --git a/init/Kconfig b/init/Kconfig
> index 7d30240..d526249 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -891,6 +891,18 @@ config MEMCG_KMEM
>  	  the kmem extension can use it to guarantee that no group of processes
>  	  will ever exhaust kernel resources alone.
>  
> +config CGROUP_MEMPRESSURE
> +	bool "Memory pressure monitor for Control Groups"
> +	help
> +	  The memory pressure monitor cgroup provides a facility for
> +	  userland programs so that they could easily assist the kernel
> +	  with the memory management. So far the API provides simple,
> +	  levels-based memory pressure notifications.
> +
> +	  For more information see Documentation/cgroups/mempressure.txt
> +
> +	  If unsure, say N.
> +
>  config CGROUP_HUGETLB
>  	bool "HugeTLB Resource Controller for Control Groups"
>  	depends on RESOURCE_COUNTERS && HUGETLB_PAGE && EXPERIMENTAL
> diff --git a/mm/Makefile b/mm/Makefile
> index 3a46287..e69bbda 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -51,6 +51,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
>  obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
>  obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
> +obj-$(CONFIG_CGROUP_MEMPRESSURE) += mempressure.o
>  obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
>  obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
>  obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
> diff --git a/mm/mempressure.c b/mm/mempressure.c
> new file mode 100644
> index 0000000..ea312bb
> --- /dev/null
> +++ b/mm/mempressure.c
> @@ -0,0 +1,330 @@
> +/*
> + * Linux VM pressure
> + *
> + * Copyright 2012 Linaro Ltd.
> + *		  Anton Vorontsov <anton.vorontsov@linaro.org>
> + *
> + * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
> + * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published
> + * by the Free Software Foundation.
> + */
> +
> +#include <linux/cgroup.h>
> +#include <linux/fs.h>
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/vmstat.h>
> +#include <linux/eventfd.h>
> +#include <linux/swap.h>
> +#include <linux/printk.h>
> +
> +static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r);
> +
> +/*
> + * Generic VM Pressure routines (no cgroups or any other API details)
> + */
> +
> +/*
> + * The window size is the number of scanned pages before we try to analyze
> + * the scanned/reclaimed ratio (or difference).
> + *
> + * It is used as a rate-limit tunable for the "low" level notification,
> + * and for averaging medium/oom levels. Using small window sizes can cause
> + * lot of false positives, but too big window size will delay the
> + * notifications.
> + */
> +static const uint vmpressure_win = SWAP_CLUSTER_MAX * 16;
> +static const uint vmpressure_level_med = 60;
> +static const uint vmpressure_level_oom = 99;
> +static const uint vmpressure_level_oom_prio = 4;
> +
> +enum vmpressure_levels {
> +	VMPRESSURE_LOW = 0,
> +	VMPRESSURE_MEDIUM,
> +	VMPRESSURE_OOM,
> +	VMPRESSURE_NUM_LEVELS,
> +};
> +
> +static const char *vmpressure_str_levels[] = {
> +	[VMPRESSURE_LOW] = "low",
> +	[VMPRESSURE_MEDIUM] = "medium",
> +	[VMPRESSURE_OOM] = "oom",
> +};
> +
> +static enum vmpressure_levels vmpressure_level(uint pressure)
> +{
> +	if (pressure >= vmpressure_level_oom)
> +		return VMPRESSURE_OOM;
> +	else if (pressure >= vmpressure_level_med)
> +		return VMPRESSURE_MEDIUM;
> +	return VMPRESSURE_LOW;
> +}
> +
> +static ulong vmpressure_calc_level(uint win, uint s, uint r)
> +{
> +	ulong p;
> +
> +	if (!s)
> +		return 0;
> +
> +	/*
> +	 * We calculate the ratio (in percents) of how many pages were
> +	 * scanned vs. reclaimed in a given time frame (window). Note that
> +	 * time is in VM reclaimer's "ticks", i.e. number of pages
> +	 * scanned. This makes it possible to set desired reaction time
> +	 * and serves as a ratelimit.
> +	 */
> +	p = win - (r * win / s);
> +	p = p * 100 / win;
> +
> +	pr_debug("%s: %3lu  (s: %6u  r: %6u)\n", __func__, p, s, r);
> +
> +	return vmpressure_level(p);
> +}
> +
> +void vmpressure(struct mem_cgroup *memcg, ulong scanned, ulong reclaimed)
> +{
> +	if (!scanned)
> +		return;
> +	mpc_vmpressure(memcg, scanned, reclaimed);
> +}
> +
> +void vmpressure_prio(struct mem_cgroup *memcg, int prio)
> +{
> +	if (prio > vmpressure_level_oom_prio)
> +		return;
> +
> +	/* OK, the prio is below the threshold, send the pre-OOM event. */
> +	vmpressure(memcg, vmpressure_win, 0);
> +}
> +
> +/*
> + * Memory pressure cgroup code
> + */
> +
> +struct mpc_event {
> +	struct eventfd_ctx *efd;
> +	enum vmpressure_levels level;
> +	struct list_head node;
> +};
> +
> +struct mpc_state {
> +	struct cgroup_subsys_state css;
> +
> +	uint scanned;
> +	uint reclaimed;
> +	struct mutex sr_lock;
> +
> +	struct list_head events;
> +	struct mutex events_lock;
> +
> +	struct work_struct work;
> +};
> +
> +static struct mpc_state *wk2mpc(struct work_struct *wk)
> +{
> +	return container_of(wk, struct mpc_state, work);
> +}
> +
> +static struct mpc_state *css2mpc(struct cgroup_subsys_state *css)
> +{
> +	return container_of(css, struct mpc_state, css);
> +}
> +
> +static struct mpc_state *tsk2mpc(struct task_struct *tsk)
> +{
> +	return css2mpc(task_subsys_state(tsk, mpc_cgroup_subsys_id));
> +}
> +
> +static struct mpc_state *cg2mpc(struct cgroup *cg)
> +{
> +	return css2mpc(cgroup_subsys_state(cg, mpc_cgroup_subsys_id));
> +}
> +
> +static void mpc_event(struct mpc_state *mpc, ulong s, ulong r)
> +{
> +	struct mpc_event *ev;
> +	int level = vmpressure_calc_level(vmpressure_win, s, r);
> +
> +	mutex_lock(&mpc->events_lock);
> +
> +	list_for_each_entry(ev, &mpc->events, node) {
> +		if (level >= ev->level)
> +			eventfd_signal(ev->efd, 1);
> +	}
> +
> +	mutex_unlock(&mpc->events_lock);
> +}
> +
> +static void mpc_vmpressure_wk_fn(struct work_struct *wk)
> +{
> +	struct mpc_state *mpc = wk2mpc(wk);
> +	ulong s;
> +	ulong r;
> +
> +	mutex_lock(&mpc->sr_lock);
> +	s = mpc->scanned;
> +	r = mpc->reclaimed;
> +	mpc->scanned = 0;
> +	mpc->reclaimed = 0;
> +	mutex_unlock(&mpc->sr_lock);
> +
> +	mpc_event(mpc, s, r);
> +}
> +
> +static void __mpc_vmpressure(struct mpc_state *mpc, ulong s, ulong r)
> +{
> +	mutex_lock(&mpc->sr_lock);
> +	mpc->scanned += s;
> +	mpc->reclaimed += r;
> +	mutex_unlock(&mpc->sr_lock);
> +
> +	if (s < vmpressure_win || work_pending(&mpc->work))
> +		return;
> +
> +	schedule_work(&mpc->work);
> +}
> +
> +static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r)
> +{
> +	/*
> +	 * There are two options for implementing cgroup pressure
> +	 * notifications:
> +	 *
> +	 * - Store pressure counter atomically in the task struct. Upon
> +	 *   hitting 'window' wake up a workqueue that will walk every
> +	 *   task and sum per-thread pressure into cgroup pressure (to
> +	 *   which the task belongs). The cons are obvious: bloats task
> +	 *   struct, have to walk all processes and makes pressue less
> +	 *   accurate (the window becomes per-thread);
> +	 *
> +	 * - Store pressure counters in per-cgroup state. This is easy and
> +	 *   straightforward, and that's how we do things here. But this
> +	 *   requires us to not put the vmpressure hooks into hotpath,
> +	 *   since we have to grab some locks.
> +	 */
> +
> +#ifdef CONFIG_MEMCG
> +	if (memcg) {
> +		struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
> +		struct cgroup *cg = css->cgroup;
> +		struct mpc_state *mpc = cg2mpc(cg);
> +
> +		if (mpc)
> +			__mpc_vmpressure(mpc, s, r);
> +		return;
> +	}
> +#endif
> +	task_lock(current);
> +	__mpc_vmpressure(tsk2mpc(current), s, r);
> +	task_unlock(current);
> +}
> +
> +static struct cgroup_subsys_state *mpc_css_alloc(struct cgroup *cg)
> +{
> +	struct mpc_state *mpc;
> +
> +	mpc = kzalloc(sizeof(*mpc), GFP_KERNEL);
> +	if (!mpc)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mutex_init(&mpc->sr_lock);
> +	mutex_init(&mpc->events_lock);
> +	INIT_LIST_HEAD(&mpc->events);
> +	INIT_WORK(&mpc->work, mpc_vmpressure_wk_fn);
> +
> +	return &mpc->css;
> +}
> +
> +static void mpc_css_free(struct cgroup *cg)
> +{
> +	struct mpc_state *mpc = cg2mpc(cg);
> +
> +	kfree(mpc);
> +}
> +
> +static ssize_t mpc_read_level(struct cgroup *cg, struct cftype *cft,
> +			      struct file *file, char __user *buf,
> +			      size_t sz, loff_t *ppos)
> +{
> +	struct mpc_state *mpc = cg2mpc(cg);
> +	uint level;
> +	const char *str;
> +
> +	mutex_lock(&mpc->sr_lock);
> +
> +	level = vmpressure_calc_level(vmpressure_win,
> +			mpc->scanned, mpc->reclaimed);
> +
> +	mutex_unlock(&mpc->sr_lock);
> +
> +	str = vmpressure_str_levels[level];
> +	return simple_read_from_buffer(buf, sz, ppos, str, strlen(str));
> +}
> +
> +static int mpc_register_level(struct cgroup *cg, struct cftype *cft,
> +			      struct eventfd_ctx *eventfd, const char *args)
> +{
> +	struct mpc_state *mpc = cg2mpc(cg);
> +	struct mpc_event *ev;
> +	int lvl;
> +
> +	for (lvl = 0; lvl < VMPRESSURE_NUM_LEVELS; lvl++) {
> +		if (!strcmp(vmpressure_str_levels[lvl], args))
> +			break;
> +	}
> +
> +	if (lvl >= VMPRESSURE_NUM_LEVELS)
> +		return -EINVAL;
> +
> +	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
> +	if (!ev)
> +		return -ENOMEM;
> +
> +	ev->efd = eventfd;
> +	ev->level = lvl;
> +
> +	mutex_lock(&mpc->events_lock);
> +	list_add(&ev->node, &mpc->events);
> +	mutex_unlock(&mpc->events_lock);
> +
> +	return 0;
> +}
> +
> +static void mpc_unregister_level(struct cgroup *cg, struct cftype *cft,
> +				 struct eventfd_ctx *eventfd)
> +{
> +	struct mpc_state *mpc = cg2mpc(cg);
> +	struct mpc_event *ev;
> +
> +	mutex_lock(&mpc->events_lock);
> +	list_for_each_entry(ev, &mpc->events, node) {
> +		if (ev->efd != eventfd)
> +			continue;
> +		list_del(&ev->node);
> +		kfree(ev);
> +		break;
> +	}
> +	mutex_unlock(&mpc->events_lock);
> +}
> +
> +static struct cftype mpc_files[] = {
> +	{
> +		.name = "level",
> +		.read = mpc_read_level,
> +		.register_event = mpc_register_level,
> +		.unregister_event = mpc_unregister_level,
> +	},
> +	{},
> +};
> +
> +struct cgroup_subsys mpc_cgroup_subsys = {
> +	.name = "mempressure",
> +	.subsys_id = mpc_cgroup_subsys_id,
> +	.css_alloc = mpc_css_alloc,
> +	.css_free = mpc_css_free,
> +	.base_cftypes = mpc_files,
> +};
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 16b42af..fed0e04 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1900,6 +1900,9 @@ restart:
>  		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
>  				   sc, LRU_ACTIVE_ANON);
>  
> +	vmpressure(sc->target_mem_cgroup,
> +		   sc->nr_scanned - nr_scanned, nr_reclaimed);
> +
>  	/* reclaim/compaction might need reclaim to continue */
>  	if (should_continue_reclaim(lruvec, nr_reclaimed,
>  				    sc->nr_scanned - nr_scanned, sc))
> @@ -2122,6 +2125,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
>  		count_vm_event(ALLOCSTALL);
>  
>  	do {
> +		vmpressure_prio(sc->target_mem_cgroup, sc->priority);

Why is the function vmpressure_prio needed? It seems redundant.

>  		sc->nr_scanned = 0;
>  		aborted_reclaim = shrink_zones(zonelist, sc);
>
Wanpeng Li Jan. 13, 2013, 8:52 a.m. UTC | #27
Hi Anton,

On Fri, Jan 04, 2013 at 12:29:11AM -0800, Anton Vorontsov wrote:
>This commit implements David Rientjes' idea of mempressure cgroup.
>
>The main characteristics are the same to what I've tried to add to vmevent
>API; internally, it uses Mel Gorman's idea of scanned/reclaimed ratio for
>pressure index calculation. But we don't expose the index to the userland.
>Instead, there are three levels of the pressure:
>
> o low (just reclaiming, e.g. caches are draining);
> o medium (allocation cost becomes high, e.g. swapping);
> o oom (about to oom very soon).
>
>The rationale behind exposing levels and not the raw pressure index
>described here: http://lkml.org/lkml/2012/11/16/675
>
>For a task it is possible to be in both cpusets, memcg and mempressure
>cgroups, so by rearranging the tasks it is possible to watch a specific
>pressure (i.e. caused by cpuset and/or memcg).
>
>Note that while this adds the cgroups support, the code is well separated
>and eventually we might add a lightweight, non-cgroups API, i.e. vmevent.
>But this is another story.
>
>Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
>---
> Documentation/cgroups/mempressure.txt |  50 ++++++
> include/linux/cgroup_subsys.h         |   6 +
> include/linux/vmstat.h                |  11 ++
> init/Kconfig                          |  12 ++
> mm/Makefile                           |   1 +
> mm/mempressure.c                      | 330 ++++++++++++++++++++++++++++++++++
> mm/vmscan.c                           |   4 +
> 7 files changed, 414 insertions(+)
> create mode 100644 Documentation/cgroups/mempressure.txt
> create mode 100644 mm/mempressure.c
>
>diff --git a/Documentation/cgroups/mempressure.txt b/Documentation/cgroups/mempressure.txt
>new file mode 100644
>index 0000000..dbc0aca
>--- /dev/null
>+++ b/Documentation/cgroups/mempressure.txt
>@@ -0,0 +1,50 @@
>+  Memory pressure cgroup
>+~~~~~~~~~~~~~~~~~~~~~~~~~~
>+  Before using the mempressure cgroup, make sure you have it mounted:
>+
>+   # cd /sys/fs/cgroup/
>+   # mkdir mempressure
>+   # mount -t cgroup cgroup ./mempressure -o mempressure
>+
>+  It is possible to combine cgroups, for example you can mount memory
>+  (memcg) and mempressure cgroups together:
>+
>+   # mount -t cgroup cgroup ./mempressure -o memory,mempressure
>+
>+  That way the reported pressure will honour memory cgroup limits. The
>+  same goes for cpusets.
>+
>+  After the hierarchy is mounted, you can use the following API:
>+
>+  /sys/fs/cgroup/.../mempressure.level
>+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>+  To maintain the interactivity/memory allocation cost, one can use the
>+  pressure level notifications, and the levels are defined like this:
>+
>+  The "low" level means that the system is reclaiming memory for new
>+  allocations. Monitoring reclaiming activity might be useful for
>+  maintaining overall system's cache level. Upon notification, the program
>+  (typically "Activity Manager") might analyze vmstat and act in advance
>+  (i.e. prematurely shutdown unimportant services).
>+
>+  The "medium" level means that the system is experiencing medium memory
>+  pressure, there is some mild swapping activity. Upon this event
>+  applications may decide to free any resources that can be easily
>+  reconstructed or re-read from a disk.
>+
>+  The "oom" level means that the system is actively thrashing, it is about
>+  to out of memory (OOM) or even the in-kernel OOM killer is on its way to
>+  trigger. Applications should do whatever they can to help the system.
>+
>+  Event control:
>+    Is used to setup an eventfd with a level threshold. The argument to
>+    the event control specifies the level threshold.
>+  Read:
>+    Reads mempory presure levels: low, medium or oom.
>+  Write:
>+    Not implemented.
>+  Test:
>+    To set up a notification:
>+
>+    # cgroup_event_listener ./mempressure.level low
>+    ("low", "medium", "oom" are permitted.)
>diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
>index f204a7a..b9802e2 100644
>--- a/include/linux/cgroup_subsys.h
>+++ b/include/linux/cgroup_subsys.h
>@@ -37,6 +37,12 @@ SUBSYS(mem_cgroup)
>
> /* */
>
>+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_MEMPRESSURE)
>+SUBSYS(mpc_cgroup)
>+#endif
>+
>+/* */
>+
> #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE)
> SUBSYS(devices)
> #endif
>diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
>index a13291f..c1a66c7 100644
>--- a/include/linux/vmstat.h
>+++ b/include/linux/vmstat.h
>@@ -10,6 +10,17 @@
>
> extern int sysctl_stat_interval;
>
>+struct mem_cgroup;
>+#ifdef CONFIG_CGROUP_MEMPRESSURE
>+extern void vmpressure(struct mem_cgroup *memcg,
>+		       ulong scanned, ulong reclaimed);
>+extern void vmpressure_prio(struct mem_cgroup *memcg, int prio);
>+#else
>+static inline void vmpressure(struct mem_cgroup *memcg,
>+			      ulong scanned, ulong reclaimed) {}
>+static inline void vmpressure_prio(struct mem_cgroup *memcg, int prio) {}
>+#endif
>+
> #ifdef CONFIG_VM_EVENT_COUNTERS
> /*
>  * Light weight per cpu counter implementation.
>diff --git a/init/Kconfig b/init/Kconfig
>index 7d30240..d526249 100644
>--- a/init/Kconfig
>+++ b/init/Kconfig
>@@ -891,6 +891,18 @@ config MEMCG_KMEM
> 	  the kmem extension can use it to guarantee that no group of processes
> 	  will ever exhaust kernel resources alone.
>
>+config CGROUP_MEMPRESSURE
>+	bool "Memory pressure monitor for Control Groups"
>+	help
>+	  The memory pressure monitor cgroup provides a facility for
>+	  userland programs so that they could easily assist the kernel
>+	  with the memory management. So far the API provides simple,
>+	  levels-based memory pressure notifications.
>+
>+	  For more information see Documentation/cgroups/mempressure.txt
>+
>+	  If unsure, say N.
>+
> config CGROUP_HUGETLB
> 	bool "HugeTLB Resource Controller for Control Groups"
> 	depends on RESOURCE_COUNTERS && HUGETLB_PAGE && EXPERIMENTAL
>diff --git a/mm/Makefile b/mm/Makefile
>index 3a46287..e69bbda 100644
>--- a/mm/Makefile
>+++ b/mm/Makefile
>@@ -51,6 +51,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
> obj-$(CONFIG_QUICKLIST) += quicklist.o
> obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
> obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
>+obj-$(CONFIG_CGROUP_MEMPRESSURE) += mempressure.o
> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
> obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
> obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
>diff --git a/mm/mempressure.c b/mm/mempressure.c
>new file mode 100644
>index 0000000..ea312bb
>--- /dev/null
>+++ b/mm/mempressure.c
>@@ -0,0 +1,330 @@
>+/*
>+ * Linux VM pressure
>+ *
>+ * Copyright 2012 Linaro Ltd.
>+ *		  Anton Vorontsov <anton.vorontsov@linaro.org>
>+ *
>+ * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
>+ * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
>+ *
>+ * This program is free software; you can redistribute it and/or modify it
>+ * under the terms of the GNU General Public License version 2 as published
>+ * by the Free Software Foundation.
>+ */
>+
>+#include <linux/cgroup.h>
>+#include <linux/fs.h>
>+#include <linux/sched.h>
>+#include <linux/mm.h>
>+#include <linux/vmstat.h>
>+#include <linux/eventfd.h>
>+#include <linux/swap.h>
>+#include <linux/printk.h>
>+
>+static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r);
>+
>+/*
>+ * Generic VM Pressure routines (no cgroups or any other API details)
>+ */
>+
>+/*
>+ * The window size is the number of scanned pages before we try to analyze
>+ * the scanned/reclaimed ratio (or difference).
>+ *
>+ * It is used as a rate-limit tunable for the "low" level notification,
>+ * and for averaging medium/oom levels. Using small window sizes can cause
>+ * lot of false positives, but too big window size will delay the
>+ * notifications.
>+ */
>+static const uint vmpressure_win = SWAP_CLUSTER_MAX * 16;

Since the variable is declared const, how can it be tunable?

>+static const uint vmpressure_level_med = 60;
>+static const uint vmpressure_level_oom = 99;
>+static const uint vmpressure_level_oom_prio = 4;
>+
>+enum vmpressure_levels {
>+	VMPRESSURE_LOW = 0,
>+	VMPRESSURE_MEDIUM,
>+	VMPRESSURE_OOM,
>+	VMPRESSURE_NUM_LEVELS,
>+};
>+
>+static const char *vmpressure_str_levels[] = {
>+	[VMPRESSURE_LOW] = "low",
>+	[VMPRESSURE_MEDIUM] = "medium",
>+	[VMPRESSURE_OOM] = "oom",
>+};
>+
>+static enum vmpressure_levels vmpressure_level(uint pressure)
>+{
>+	if (pressure >= vmpressure_level_oom)
>+		return VMPRESSURE_OOM;
>+	else if (pressure >= vmpressure_level_med)
>+		return VMPRESSURE_MEDIUM;
>+	return VMPRESSURE_LOW;
>+}
>+
>+static ulong vmpressure_calc_level(uint win, uint s, uint r)
>+{
>+	ulong p;
>+
>+	if (!s)
>+		return 0;
>+
>+	/*
>+	 * We calculate the ratio (in percents) of how many pages were
>+	 * scanned vs. reclaimed in a given time frame (window). Note that
>+	 * time is in VM reclaimer's "ticks", i.e. number of pages
>+	 * scanned. This makes it possible to set desired reaction time
>+	 * and serves as a ratelimit.
>+	 */
>+	p = win - (r * win / s);
>+	p = p * 100 / win;
>+
>+	pr_debug("%s: %3lu  (s: %6u  r: %6u)\n", __func__, p, s, r);
>+
>+	return vmpressure_level(p);
>+}
>+
>+void vmpressure(struct mem_cgroup *memcg, ulong scanned, ulong reclaimed)
>+{
>+	if (!scanned)
>+		return;
>+	mpc_vmpressure(memcg, scanned, reclaimed);
>+}
>+
>+void vmpressure_prio(struct mem_cgroup *memcg, int prio)
>+{
>+	if (prio > vmpressure_level_oom_prio)
>+		return;

Since the maximum value of prio (sc->priority) is DEF_PRIORITY (12), why
is this check needed?

>+
>+	/* OK, the prio is below the threshold, send the pre-OOM event. */
>+	vmpressure(memcg, vmpressure_win, 0);
>+}
>+
>+/*
>+ * Memory pressure cgroup code
>+ */
>+
>+struct mpc_event {
>+	struct eventfd_ctx *efd;
>+	enum vmpressure_levels level;
>+	struct list_head node;
>+};
>+
>+struct mpc_state {
>+	struct cgroup_subsys_state css;
>+
>+	uint scanned;
>+	uint reclaimed;
>+	struct mutex sr_lock;
>+
>+	struct list_head events;
>+	struct mutex events_lock;
>+
>+	struct work_struct work;
>+};
>+
>+static struct mpc_state *wk2mpc(struct work_struct *wk)
>+{
>+	return container_of(wk, struct mpc_state, work);
>+}
>+
>+static struct mpc_state *css2mpc(struct cgroup_subsys_state *css)
>+{
>+	return container_of(css, struct mpc_state, css);
>+}
>+
>+static struct mpc_state *tsk2mpc(struct task_struct *tsk)
>+{
>+	return css2mpc(task_subsys_state(tsk, mpc_cgroup_subsys_id));
>+}
>+
>+static struct mpc_state *cg2mpc(struct cgroup *cg)
>+{
>+	return css2mpc(cgroup_subsys_state(cg, mpc_cgroup_subsys_id));
>+}
>+
>+static void mpc_event(struct mpc_state *mpc, ulong s, ulong r)
>+{
>+	struct mpc_event *ev;
>+	int level = vmpressure_calc_level(vmpressure_win, s, r);
>+
>+	mutex_lock(&mpc->events_lock);
>+
>+	list_for_each_entry(ev, &mpc->events, node) {
>+		if (level >= ev->level)
>+			eventfd_signal(ev->efd, 1);
>+	}
>+
>+	mutex_unlock(&mpc->events_lock);
>+}
>+
>+static void mpc_vmpressure_wk_fn(struct work_struct *wk)
>+{
>+	struct mpc_state *mpc = wk2mpc(wk);
>+	ulong s;
>+	ulong r;
>+
>+	mutex_lock(&mpc->sr_lock);
>+	s = mpc->scanned;
>+	r = mpc->reclaimed;
>+	mpc->scanned = 0;
>+	mpc->reclaimed = 0;
>+	mutex_unlock(&mpc->sr_lock);
>+
>+	mpc_event(mpc, s, r);
>+}
>+
>+static void __mpc_vmpressure(struct mpc_state *mpc, ulong s, ulong r)
>+{
>+	mutex_lock(&mpc->sr_lock);
>+	mpc->scanned += s;
>+	mpc->reclaimed += r;
>+	mutex_unlock(&mpc->sr_lock);
>+
>+	if (s < vmpressure_win || work_pending(&mpc->work))
>+		return;
>+
>+	schedule_work(&mpc->work);
>+}
>+
>+static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r)
>+{
>+	/*
>+	 * There are two options for implementing cgroup pressure
>+	 * notifications:
>+	 *
>+	 * - Store pressure counter atomically in the task struct. Upon
>+	 *   hitting 'window' wake up a workqueue that will walk every
>+	 *   task and sum per-thread pressure into cgroup pressure (to
>+	 *   which the task belongs). The cons are obvious: bloats task
>+	 *   struct, have to walk all processes and makes pressue less
>+	 *   accurate (the window becomes per-thread);
>+	 *
>+	 * - Store pressure counters in per-cgroup state. This is easy and
>+	 *   straightforward, and that's how we do things here. But this
>+	 *   requires us to not put the vmpressure hooks into hotpath,
>+	 *   since we have to grab some locks.
>+	 */
>+
>+#ifdef CONFIG_MEMCG
>+	if (memcg) {
>+		struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
>+		struct cgroup *cg = css->cgroup;
>+		struct mpc_state *mpc = cg2mpc(cg);
>+
>+		if (mpc)
>+			__mpc_vmpressure(mpc, s, r);
>+		return;
>+	}
>+#endif
>+	task_lock(current);
>+	__mpc_vmpressure(tsk2mpc(current), s, r);
>+	task_unlock(current);
>+}
>+
>+static struct cgroup_subsys_state *mpc_css_alloc(struct cgroup *cg)
>+{
>+	struct mpc_state *mpc;
>+
>+	mpc = kzalloc(sizeof(*mpc), GFP_KERNEL);
>+	if (!mpc)
>+		return ERR_PTR(-ENOMEM);
>+
>+	mutex_init(&mpc->sr_lock);
>+	mutex_init(&mpc->events_lock);
>+	INIT_LIST_HEAD(&mpc->events);
>+	INIT_WORK(&mpc->work, mpc_vmpressure_wk_fn);
>+
>+	return &mpc->css;
>+}
>+
>+static void mpc_css_free(struct cgroup *cg)
>+{
>+	struct mpc_state *mpc = cg2mpc(cg);
>+
>+	kfree(mpc);
>+}
>+
>+static ssize_t mpc_read_level(struct cgroup *cg, struct cftype *cft,
>+			      struct file *file, char __user *buf,
>+			      size_t sz, loff_t *ppos)
>+{
>+	struct mpc_state *mpc = cg2mpc(cg);
>+	uint level;
>+	const char *str;
>+
>+	mutex_lock(&mpc->sr_lock);
>+
>+	level = vmpressure_calc_level(vmpressure_win,
>+			mpc->scanned, mpc->reclaimed);
>+
>+	mutex_unlock(&mpc->sr_lock);
>+
>+	str = vmpressure_str_levels[level];
>+	return simple_read_from_buffer(buf, sz, ppos, str, strlen(str));

You are missing a "\n". The printed result is:
[root@kernel ~]# cat /sys/fs/cgroup/mempressure/mempressure.level
low[root@kernel ~]#

Regards,
Wanpeng Li

>+}
>+
>+static int mpc_register_level(struct cgroup *cg, struct cftype *cft,
>+			      struct eventfd_ctx *eventfd, const char *args)
>+{
>+	struct mpc_state *mpc = cg2mpc(cg);
>+	struct mpc_event *ev;
>+	int lvl;
>+
>+	for (lvl = 0; lvl < VMPRESSURE_NUM_LEVELS; lvl++) {
>+		if (!strcmp(vmpressure_str_levels[lvl], args))
>+			break;
>+	}
>+
>+	if (lvl >= VMPRESSURE_NUM_LEVELS)
>+		return -EINVAL;
>+
>+	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
>+	if (!ev)
>+		return -ENOMEM;
>+
>+	ev->efd = eventfd;
>+	ev->level = lvl;
>+
>+	mutex_lock(&mpc->events_lock);
>+	list_add(&ev->node, &mpc->events);
>+	mutex_unlock(&mpc->events_lock);
>+
>+	return 0;
>+}
>+
>+static void mpc_unregister_level(struct cgroup *cg, struct cftype *cft,
>+				 struct eventfd_ctx *eventfd)
>+{
>+	struct mpc_state *mpc = cg2mpc(cg);
>+	struct mpc_event *ev;
>+
>+	mutex_lock(&mpc->events_lock);
>+	list_for_each_entry(ev, &mpc->events, node) {
>+		if (ev->efd != eventfd)
>+			continue;
>+		list_del(&ev->node);
>+		kfree(ev);
>+		break;
>+	}
>+	mutex_unlock(&mpc->events_lock);
>+}
>+
>+static struct cftype mpc_files[] = {
>+	{
>+		.name = "level",
>+		.read = mpc_read_level,
>+		.register_event = mpc_register_level,
>+		.unregister_event = mpc_unregister_level,
>+	},
>+	{},
>+};
>+
>+struct cgroup_subsys mpc_cgroup_subsys = {
>+	.name = "mempressure",
>+	.subsys_id = mpc_cgroup_subsys_id,
>+	.css_alloc = mpc_css_alloc,
>+	.css_free = mpc_css_free,
>+	.base_cftypes = mpc_files,
>+};
>diff --git a/mm/vmscan.c b/mm/vmscan.c
>index 16b42af..fed0e04 100644
>--- a/mm/vmscan.c
>+++ b/mm/vmscan.c
>@@ -1900,6 +1900,9 @@ restart:
> 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> 				   sc, LRU_ACTIVE_ANON);
>
>+	vmpressure(sc->target_mem_cgroup,
>+		   sc->nr_scanned - nr_scanned, nr_reclaimed);
>+
> 	/* reclaim/compaction might need reclaim to continue */
> 	if (should_continue_reclaim(lruvec, nr_reclaimed,
> 				    sc->nr_scanned - nr_scanned, sc))
>@@ -2122,6 +2125,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> 		count_vm_event(ALLOCSTALL);
>
> 	do {
>+		vmpressure_prio(sc->target_mem_cgroup, sc->priority);
> 		sc->nr_scanned = 0;
> 		aborted_reclaim = shrink_zones(zonelist, sc);
>
>-- 
>1.8.0.2
>
>--
>To unsubscribe, send a message with 'unsubscribe linux-mm' in
>the body to majordomo@kvack.org.  For more info on Linux MM,
>see: http://www.linux-mm.org/ .
>Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
diff mbox

Patch

diff --git a/Documentation/cgroups/mempressure.txt b/Documentation/cgroups/mempressure.txt
new file mode 100644
index 0000000..dbc0aca
--- /dev/null
+++ b/Documentation/cgroups/mempressure.txt
@@ -0,0 +1,50 @@ 
+  Memory pressure cgroup
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+  Before using the mempressure cgroup, make sure you have it mounted:
+
+   # cd /sys/fs/cgroup/
+   # mkdir mempressure
+   # mount -t cgroup cgroup ./mempressure -o mempressure
+
+  It is possible to combine cgroups, for example you can mount memory
+  (memcg) and mempressure cgroups together:
+
+   # mount -t cgroup cgroup ./mempressure -o memory,mempressure
+
+  That way the reported pressure will honour memory cgroup limits. The
+  same goes for cpusets.
+
+  After the hierarchy is mounted, you can use the following API:
+
+  /sys/fs/cgroup/.../mempressure.level
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  To maintain the interactivity/memory allocation cost, one can use the
+  pressure level notifications, and the levels are defined like this:
+
+  The "low" level means that the system is reclaiming memory for new
+  allocations. Monitoring reclaiming activity might be useful for
+  maintaining overall system's cache level. Upon notification, the program
+  (typically "Activity Manager") might analyze vmstat and act in advance
+  (i.e. prematurely shutdown unimportant services).
+
+  The "medium" level means that the system is experiencing medium memory
+  pressure, there is some mild swapping activity. Upon this event
+  applications may decide to free any resources that can be easily
+  reconstructed or re-read from a disk.
+
+  The "oom" level means that the system is actively thrashing, it is about
+  to out of memory (OOM) or even the in-kernel OOM killer is on its way to
+  trigger. Applications should do whatever they can to help the system.
+
+  Event control:
+    Is used to setup an eventfd with a level threshold. The argument to
+    the event control specifies the level threshold.
+  Read:
+    Reads mempory presure levels: low, medium or oom.
+  Write:
+    Not implemented.
+  Test:
+    To set up a notification:
+
+    # cgroup_event_listener ./mempressure.level low
+    ("low", "medium", "oom" are permitted.)
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index f204a7a..b9802e2 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -37,6 +37,12 @@  SUBSYS(mem_cgroup)
 
 /* */
 
+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_MEMPRESSURE)
+SUBSYS(mpc_cgroup)
+#endif
+
+/* */
+
 #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE)
 SUBSYS(devices)
 #endif
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index a13291f..c1a66c7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -10,6 +10,17 @@ 
 
 extern int sysctl_stat_interval;
 
+struct mem_cgroup;
+#ifdef CONFIG_CGROUP_MEMPRESSURE
+extern void vmpressure(struct mem_cgroup *memcg,
+		       ulong scanned, ulong reclaimed);
+extern void vmpressure_prio(struct mem_cgroup *memcg, int prio);
+#else
+static inline void vmpressure(struct mem_cgroup *memcg,
+			      ulong scanned, ulong reclaimed) {}
+static inline void vmpressure_prio(struct mem_cgroup *memcg, int prio) {}
+#endif
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 /*
  * Light weight per cpu counter implementation.
diff --git a/init/Kconfig b/init/Kconfig
index 7d30240..d526249 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -891,6 +891,18 @@  config MEMCG_KMEM
 	  the kmem extension can use it to guarantee that no group of processes
 	  will ever exhaust kernel resources alone.
 
+config CGROUP_MEMPRESSURE
+	bool "Memory pressure monitor for Control Groups"
+	help
+	  The memory pressure monitor cgroup provides a facility for
+	  userland programs so that they could easily assist the kernel
+	  with the memory management. So far the API provides simple,
+	  levels-based memory pressure notifications.
+
+	  For more information see Documentation/cgroups/mempressure.txt
+
+	  If unsure, say N.
+
 config CGROUP_HUGETLB
 	bool "HugeTLB Resource Controller for Control Groups"
 	depends on RESOURCE_COUNTERS && HUGETLB_PAGE && EXPERIMENTAL
diff --git a/mm/Makefile b/mm/Makefile
index 3a46287..e69bbda 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,6 +51,7 @@  obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
 obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_MEMPRESSURE) += mempressure.o
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/mempressure.c b/mm/mempressure.c
new file mode 100644
index 0000000..ea312bb
--- /dev/null
+++ b/mm/mempressure.c
@@ -0,0 +1,330 @@ 
+/*
+ * Linux VM pressure
+ *
+ * Copyright 2012 Linaro Ltd.
+ *		  Anton Vorontsov <anton.vorontsov@linaro.org>
+ *
+ * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
+ * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/eventfd.h>
+#include <linux/swap.h>
+#include <linux/printk.h>
+
+static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r);
+
+/*
+ * Generic VM Pressure routines (no cgroups or any other API details)
+ */
+
+/*
+ * The window size is the number of scanned pages before we try to analyze
+ * the scanned/reclaimed ratio (or difference).
+ *
+ * It is used as a rate-limit tunable for the "low" level notification,
+ * and for averaging medium/oom levels. Using small window sizes can cause
+ * a lot of false positives, but too big a window size will delay the
+ * notifications.
+ */
+static const uint vmpressure_win = SWAP_CLUSTER_MAX * 16;
+static const uint vmpressure_level_med = 60;
+static const uint vmpressure_level_oom = 99;
+static const uint vmpressure_level_oom_prio = 4;
+
+enum vmpressure_levels {
+	VMPRESSURE_LOW = 0,
+	VMPRESSURE_MEDIUM,
+	VMPRESSURE_OOM,
+	VMPRESSURE_NUM_LEVELS,
+};
+
+static const char *vmpressure_str_levels[] = {
+	[VMPRESSURE_LOW] = "low",
+	[VMPRESSURE_MEDIUM] = "medium",
+	[VMPRESSURE_OOM] = "oom",
+};
+
+static enum vmpressure_levels vmpressure_level(uint pressure)
+{
+	if (pressure >= vmpressure_level_oom)
+		return VMPRESSURE_OOM;
+	else if (pressure >= vmpressure_level_med)
+		return VMPRESSURE_MEDIUM;
+	return VMPRESSURE_LOW;
+}
+
+static ulong vmpressure_calc_level(uint win, uint s, uint r)
+{
+	ulong p;
+
+	if (!s)
+		return 0;
+
+	/*
+	 * We calculate the ratio (in percents) of how many pages were
+	 * scanned vs. reclaimed in a given time frame (window). Note that
+	 * time is in VM reclaimer's "ticks", i.e. number of pages
+	 * scanned. This makes it possible to set desired reaction time
+	 * and serves as a ratelimit.
+	 */
+	p = win - (r * win / s);
+	p = p * 100 / win;
+
+	pr_debug("%s: %3lu  (s: %6u  r: %6u)\n", __func__, p, s, r);
+
+	return vmpressure_level(p);
+}
+
+void vmpressure(struct mem_cgroup *memcg, ulong scanned, ulong reclaimed)
+{
+	if (!scanned)
+		return;
+	mpc_vmpressure(memcg, scanned, reclaimed);
+}
+
+void vmpressure_prio(struct mem_cgroup *memcg, int prio)
+{
+	if (prio > vmpressure_level_oom_prio)
+		return;
+
+	/* OK, the prio is below the threshold, send the pre-OOM event. */
+	vmpressure(memcg, vmpressure_win, 0);
+}
+
+/*
+ * Memory pressure cgroup code
+ */
+
+struct mpc_event {
+	struct eventfd_ctx *efd;
+	enum vmpressure_levels level;
+	struct list_head node;
+};
+
+struct mpc_state {
+	struct cgroup_subsys_state css;
+
+	uint scanned;
+	uint reclaimed;
+	struct mutex sr_lock;
+
+	struct list_head events;
+	struct mutex events_lock;
+
+	struct work_struct work;
+};
+
+static struct mpc_state *wk2mpc(struct work_struct *wk)
+{
+	return container_of(wk, struct mpc_state, work);
+}
+
+static struct mpc_state *css2mpc(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct mpc_state, css);
+}
+
+static struct mpc_state *tsk2mpc(struct task_struct *tsk)
+{
+	return css2mpc(task_subsys_state(tsk, mpc_cgroup_subsys_id));
+}
+
+static struct mpc_state *cg2mpc(struct cgroup *cg)
+{
+	return css2mpc(cgroup_subsys_state(cg, mpc_cgroup_subsys_id));
+}
+
+static void mpc_event(struct mpc_state *mpc, ulong s, ulong r)
+{
+	struct mpc_event *ev;
+	int level = vmpressure_calc_level(vmpressure_win, s, r);
+
+	mutex_lock(&mpc->events_lock);
+
+	list_for_each_entry(ev, &mpc->events, node) {
+		if (level >= ev->level)
+			eventfd_signal(ev->efd, 1);
+	}
+
+	mutex_unlock(&mpc->events_lock);
+}
+
+static void mpc_vmpressure_wk_fn(struct work_struct *wk)
+{
+	struct mpc_state *mpc = wk2mpc(wk);
+	ulong s;
+	ulong r;
+
+	mutex_lock(&mpc->sr_lock);
+	s = mpc->scanned;
+	r = mpc->reclaimed;
+	mpc->scanned = 0;
+	mpc->reclaimed = 0;
+	mutex_unlock(&mpc->sr_lock);
+
+	mpc_event(mpc, s, r);
+}
+
+static void __mpc_vmpressure(struct mpc_state *mpc, ulong s, ulong r)
+{
+	mutex_lock(&mpc->sr_lock);
+	mpc->scanned += s;
+	mpc->reclaimed += r;
+	mutex_unlock(&mpc->sr_lock);
+
+	if (s < vmpressure_win || work_pending(&mpc->work))
+		return;
+
+	schedule_work(&mpc->work);
+}
+
+static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r)
+{
+	/*
+	 * There are two options for implementing cgroup pressure
+	 * notifications:
+	 *
+	 * - Store pressure counter atomically in the task struct. Upon
+	 *   hitting 'window' wake up a workqueue that will walk every
+	 *   task and sum per-thread pressure into cgroup pressure (to
+	 *   which the task belongs). The cons are obvious: bloats task
+	 *   struct, have to walk all processes and makes pressure less
+	 *   accurate (the window becomes per-thread);
+	 *
+	 * - Store pressure counters in per-cgroup state. This is easy and
+	 *   straightforward, and that's how we do things here. But this
+	 *   requires us to not put the vmpressure hooks into hotpath,
+	 *   since we have to grab some locks.
+	 */
+
+#ifdef CONFIG_MEMCG
+	if (memcg) {
+		struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+		struct cgroup *cg = css->cgroup;
+		struct mpc_state *mpc = cg2mpc(cg);
+
+		if (mpc)
+			__mpc_vmpressure(mpc, s, r);
+		return;
+	}
+#endif
+	task_lock(current);
+	__mpc_vmpressure(tsk2mpc(current), s, r);
+	task_unlock(current);
+}
+
+static struct cgroup_subsys_state *mpc_css_alloc(struct cgroup *cg)
+{
+	struct mpc_state *mpc;
+
+	mpc = kzalloc(sizeof(*mpc), GFP_KERNEL);
+	if (!mpc)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&mpc->sr_lock);
+	mutex_init(&mpc->events_lock);
+	INIT_LIST_HEAD(&mpc->events);
+	INIT_WORK(&mpc->work, mpc_vmpressure_wk_fn);
+
+	return &mpc->css;
+}
+
+static void mpc_css_free(struct cgroup *cg)
+{
+	struct mpc_state *mpc = cg2mpc(cg);
+
+	kfree(mpc);
+}
+
+static ssize_t mpc_read_level(struct cgroup *cg, struct cftype *cft,
+			      struct file *file, char __user *buf,
+			      size_t sz, loff_t *ppos)
+{
+	struct mpc_state *mpc = cg2mpc(cg);
+	uint level;
+	const char *str;
+
+	mutex_lock(&mpc->sr_lock);
+
+	level = vmpressure_calc_level(vmpressure_win,
+			mpc->scanned, mpc->reclaimed);
+
+	mutex_unlock(&mpc->sr_lock);
+
+	str = vmpressure_str_levels[level];
+	return simple_read_from_buffer(buf, sz, ppos, str, strlen(str));
+}
+
+static int mpc_register_level(struct cgroup *cg, struct cftype *cft,
+			      struct eventfd_ctx *eventfd, const char *args)
+{
+	struct mpc_state *mpc = cg2mpc(cg);
+	struct mpc_event *ev;
+	int lvl;
+
+	for (lvl = 0; lvl < VMPRESSURE_NUM_LEVELS; lvl++) {
+		if (!strcmp(vmpressure_str_levels[lvl], args))
+			break;
+	}
+
+	if (lvl >= VMPRESSURE_NUM_LEVELS)
+		return -EINVAL;
+
+	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+	if (!ev)
+		return -ENOMEM;
+
+	ev->efd = eventfd;
+	ev->level = lvl;
+
+	mutex_lock(&mpc->events_lock);
+	list_add(&ev->node, &mpc->events);
+	mutex_unlock(&mpc->events_lock);
+
+	return 0;
+}
+
+static void mpc_unregister_level(struct cgroup *cg, struct cftype *cft,
+				 struct eventfd_ctx *eventfd)
+{
+	struct mpc_state *mpc = cg2mpc(cg);
+	struct mpc_event *ev;
+
+	mutex_lock(&mpc->events_lock);
+	list_for_each_entry(ev, &mpc->events, node) {
+		if (ev->efd != eventfd)
+			continue;
+		list_del(&ev->node);
+		kfree(ev);
+		break;
+	}
+	mutex_unlock(&mpc->events_lock);
+}
+
+static struct cftype mpc_files[] = {
+	{
+		.name = "level",
+		.read = mpc_read_level,
+		.register_event = mpc_register_level,
+		.unregister_event = mpc_unregister_level,
+	},
+	{},
+};
+
+struct cgroup_subsys mpc_cgroup_subsys = {
+	.name = "mempressure",
+	.subsys_id = mpc_cgroup_subsys_id,
+	.css_alloc = mpc_css_alloc,
+	.css_free = mpc_css_free,
+	.base_cftypes = mpc_files,
+};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 16b42af..fed0e04 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1900,6 +1900,9 @@  restart:
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
 
+	vmpressure(sc->target_mem_cgroup,
+		   sc->nr_scanned - nr_scanned, nr_reclaimed);
+
 	/* reclaim/compaction might need reclaim to continue */
 	if (should_continue_reclaim(lruvec, nr_reclaimed,
 				    sc->nr_scanned - nr_scanned, sc))
@@ -2122,6 +2125,7 @@  static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		count_vm_event(ALLOCSTALL);
 
 	do {
+		vmpressure_prio(sc->target_mem_cgroup, sc->priority);
 		sc->nr_scanned = 0;
 		aborted_reclaim = shrink_zones(zonelist, sc);