diff mbox

[3/3] vmevent: Implement special low-memory attribute

Message ID 20120501132620.GC24226@lizard
State New
Headers show

Commit Message

Anton Vorontsov May 1, 2012, 1:26 p.m. UTC
This is specially "blended" attribute, the event triggers when kernel
decides that we're close to the low memory threshold. Userspace should
not expect very precise meaning of low memory situation, mostly, it's
just a guess on the kernel's side.

Well, this is the same as userland should not know or care how exactly
kernel manages the memory, or assume that memory management behaviour
is a part of the "ABI". So, all the 'low memory' is just guessing, but
we're trying to do our best. It might be that we will end up with two
or three variations of 'low memory' thresholds, and all of them would
be useful for different use cases.

For this implementation, we assume that there's a low memory situation
for the N pages threshold when we have neither N pages of completely
free pages, nor we have N reclaimable pages in the cache. This
effectively means, that if userland expects to allocate N pages, it
would consume all the free pages, and any further allocations (above
N) would start draining caches.

In the worst case, prior to hitting the threshold, we might have only
N pages in cache, and nearly no memory as free pages.

The same 'low memory' meaning is used in the current Android Low
Memory Killer driver.

Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
---
 include/linux/vmevent.h              |    7 ++++++
 mm/vmevent.c                         |   40 ++++++++++++++++++++++++++++++++++
 tools/testing/vmevent/vmevent-test.c |   12 +++++++++-
 3 files changed, 58 insertions(+), 1 deletion(-)

Comments

Pekka Enberg May 3, 2012, 10:33 a.m. UTC | #1
On Tue, May 1, 2012 at 4:26 PM, Anton Vorontsov
<anton.vorontsov@linaro.org> wrote:
> This is specially "blended" attribute, the event triggers when kernel
> decides that we're close to the low memory threshold. Userspace should
> not expect very precise meaning of low memory situation, mostly, it's
> just a guess on the kernel's side.
>
> Well, this is the same as userland should not know or care how exactly
> kernel manages the memory, or assume that memory management behaviour
> is a part of the "ABI". So, all the 'low memory' is just guessing, but
> we're trying to do our best. It might be that we will end up with two
> or three variations of 'low memory' thresholds, and all of them would
> be useful for different use cases.
>
> For this implementation, we assume that there's a low memory situation
> for the N pages threshold when we have neither N pages of completely
> free pages, nor we have N reclaimable pages in the cache. This
> effectively means, that if userland expects to allocate N pages, it
> would consume all the free pages, and any further allocations (above
> N) would start draining caches.
>
> In the worst case, prior to hitting the threshold, we might have only
> N pages in cache, and nearly no memory as free pages.
>
> The same 'low memory' meaning is used in the current Android Low
> Memory Killer driver.
>
> Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>

I don't see why we couldn't add this. Minchan, thoughts?

> ---
>  include/linux/vmevent.h              |    7 ++++++
>  mm/vmevent.c                         |   40 ++++++++++++++++++++++++++++++++++
>  tools/testing/vmevent/vmevent-test.c |   12 +++++++++-
>  3 files changed, 58 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/vmevent.h b/include/linux/vmevent.h
> index aae0d24..9bfa244 100644
> --- a/include/linux/vmevent.h
> +++ b/include/linux/vmevent.h
> @@ -10,6 +10,13 @@ enum {
>        VMEVENT_ATTR_NR_AVAIL_PAGES     = 1UL,
>        VMEVENT_ATTR_NR_FREE_PAGES      = 2UL,
>        VMEVENT_ATTR_NR_SWAP_PAGES      = 3UL,
> +       /*
> +        * This is specially blended attribute, the event triggers
> +        * when kernel decides that we're close to the low memory threshold.
> +        * Don't expect very precise meaning of low memory situation, mostly,
> +        * it's just a guess on the kernel's side.
> +        */
> +       VMEVENT_ATTR_LOWMEM_PAGES       = 4UL,
>
>        VMEVENT_ATTR_MAX                /* non-ABI */
>  };
> diff --git a/mm/vmevent.c b/mm/vmevent.c
> index b312236..d278a25 100644
> --- a/mm/vmevent.c
> +++ b/mm/vmevent.c
> @@ -68,10 +68,50 @@ static u64 vmevent_attr_avail_pages(struct vmevent_watch *watch,
>        return totalram_pages;
>  }
>
> +/*
> + * Here's some implementation details for the "low memory" meaning.
> + *
> + * (The explanation is not in the header file as userland should not
> + * know these details, nor it should assume that the meaning will
> + * always be the same. As well as it should not know how exactly kernel
> + * manages the memory, or assume that memory management behaviour is a
> + * part of the "ABI". So, all the 'low memory' is just guessing, but
> + * we're trying to do our best.)
> + *
> + * For this implementation, we assume that there's a low memory situation
> + * for the N pages threshold when we have neither N pages of completely
> + * free pages, nor we have N reclaimable pages in the cache. This
> + * effectively means, that if userland expects to allocate N pages, it
> + * would consume all the free pages, and any further allocations (above
> + * N) would start draining caches.
> + *
> + * In the worst case, prior hitting the threshold, we might have only
> + * N pages in cache, and nearly no memory as free pages.
> + */
> +static u64 vmevent_attr_lowmem_pages(struct vmevent_watch *watch,
> +                                    struct vmevent_attr *attr)
> +{
> +       int free = global_page_state(NR_FREE_PAGES);
> +       int file = global_page_state(NR_FILE_PAGES) -
> +                  global_page_state(NR_SHMEM); /* TODO: account locked pages */
> +       int val = attr->value;
> +
> +       /*
> +        * For convenience we return 0 or attr value (instead of 0/1), it
> +        * makes it easier for vmevent_match() to cope with blended
> +        * attributes, plus userland might use the value to find out which
> +        * threshold triggered.
> +        */
> +       if (free < val && file < val)
> +               return val;
> +       return 0;
> +}
> +
>  static vmevent_attr_sample_fn attr_samplers[] = {
>        [VMEVENT_ATTR_NR_AVAIL_PAGES]   = vmevent_attr_avail_pages,
>        [VMEVENT_ATTR_NR_FREE_PAGES]    = vmevent_attr_free_pages,
>        [VMEVENT_ATTR_NR_SWAP_PAGES]    = vmevent_attr_swap_pages,
> +       [VMEVENT_ATTR_LOWMEM_PAGES]     = vmevent_attr_lowmem_pages,
>  };
>
>  static u64 vmevent_sample_attr(struct vmevent_watch *watch, struct vmevent_attr *attr)
> diff --git a/tools/testing/vmevent/vmevent-test.c b/tools/testing/vmevent/vmevent-test.c
> index fd9a174..c61aed7 100644
> --- a/tools/testing/vmevent/vmevent-test.c
> +++ b/tools/testing/vmevent/vmevent-test.c
> @@ -33,7 +33,7 @@ int main(int argc, char *argv[])
>
>        config = (struct vmevent_config) {
>                .sample_period_ns       = 1000000000L,
> -               .counter                = 6,
> +               .counter                = 7,
>                .attrs                  = {
>                        {
>                                .type   = VMEVENT_ATTR_NR_FREE_PAGES,
> @@ -59,6 +59,13 @@ int main(int argc, char *argv[])
>                                .type   = VMEVENT_ATTR_NR_SWAP_PAGES,
>                        },
>                        {
> +                               .type   = VMEVENT_ATTR_LOWMEM_PAGES,
> +                               .state  = VMEVENT_ATTR_STATE_VALUE_LT |
> +                                         VMEVENT_ATTR_STATE_VALUE_EQ |
> +                                         VMEVENT_ATTR_STATE_ONE_SHOT,
> +                               .value  = phys_pages / 2,
> +                       },
> +                       {
>                                .type   = 0xffff, /* invalid */
>                        },
>                },
> @@ -108,6 +115,9 @@ int main(int argc, char *argv[])
>                        case VMEVENT_ATTR_NR_SWAP_PAGES:
>                                printf("  VMEVENT_ATTR_NR_SWAP_PAGES: %Lu\n", attr->value);
>                                break;
> +                       case VMEVENT_ATTR_LOWMEM_PAGES:
> +                               printf("  VMEVENT_ATTR_LOWMEM_PAGES: %Lu\n", attr->value);
> +                               break;
>                        default:
>                                printf("  Unknown attribute: %Lu\n", attr->value);
>                        }
> --
> 1.7.9.2
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
Minchan Kim May 4, 2012, 4:26 a.m. UTC | #2
On 05/01/2012 10:26 PM, Anton Vorontsov wrote:

> This is specially "blended" attribute, the event triggers when kernel
> decides that we're close to the low memory threshold. Userspace should
> not expect very precise meaning of low memory situation, mostly, it's
> just a guess on the kernel's side.
> 
> Well, this is the same as userland should not know or care how exactly
> kernel manages the memory, or assume that memory management behaviour
> is a part of the "ABI". So, all the 'low memory' is just guessing, but
> we're trying to do our best. It might be that we will end up with two
> or three variations of 'low memory' thresholds, and all of them would


First of all, your calculation for available pages is very simple and 
it's very specific to recent mobile phones.
But recent systems have various devices.
For example,

SSD : very fast server SSD which has lots of internal ram so that write is very fast.
thumb usb : very slow, which has small ram 

1) We can consider anon pages and dirty pages as available pages.

       	SSD 	thumb usb	
rootfs 	 0
swap	 0

2) We can consider anon pages as available pages but dirty page doesn't

       	SSD 	thumb usb	
rootfs		   O 	 
swap	 0

3) We can consider dirty pages as available pages but anon doesn't

       	SSD 	thumb usb	
rootfs 	 O
swap	 	   O

4) We can't consider dirty pages and anon pages as available pages

       	SSD 	thumb usb	
rootfs 	 	   0
swap	 	   0

5) If we use zram as swap?
6) Another idea. If we use both zram and swap device(eMMC), then when zram is full,
   we writes zram pages into swap device with align cluster size?

I mean we can select various option to define low memory state.

> be useful for different use cases.


Why should we do it in kernel side?
If vmevent will have VMEVENT_ATTR_[FILE|MOCK|DIRTY|WRITEBACK|SHMEM|ANON|SWAP]_PAGES
and so on, which are needed for the calculation, we can calculate it in userspace without
polling /proc/vmstat to see it. So I think there is no problem doing it in userspace.

And even though we can solve the above problem, it is possible that another new "blended" attribute
will show up in the future, and it will suffer the same problem again. So IMHO, let's leave vmevent as it is — a
very raw attribute interface — and let's do blended attributes in user space.

> 
> For this implementation, we assume that there's a low memory situation
> for the N pages threshold when we have neither N pages of completely
> free pages, nor we have N reclaimable pages in the cache. This
> effectively means, that if userland expects to allocate N pages, it
> would consume all the free pages, and any further allocations (above
> N) would start draining caches.
> 
> In the worst case, prior to hitting the threshold, we might have only
> N pages in cache, and nearly no memory as free pages.
> 
> The same 'low memory' meaning is used in the current Android Low
> Memory Killer driver.
> 
> Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
> ---
>  include/linux/vmevent.h              |    7 ++++++
>  mm/vmevent.c                         |   40 ++++++++++++++++++++++++++++++++++
>  tools/testing/vmevent/vmevent-test.c |   12 +++++++++-
>  3 files changed, 58 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/vmevent.h b/include/linux/vmevent.h
> index aae0d24..9bfa244 100644
> --- a/include/linux/vmevent.h
> +++ b/include/linux/vmevent.h
> @@ -10,6 +10,13 @@ enum {
>  	VMEVENT_ATTR_NR_AVAIL_PAGES	= 1UL,
>  	VMEVENT_ATTR_NR_FREE_PAGES	= 2UL,
>  	VMEVENT_ATTR_NR_SWAP_PAGES	= 3UL,
> +	/*
> +	 * This is specially blended attribute, the event triggers
> +	 * when kernel decides that we're close to the low memory threshold.
> +	 * Don't expect very precise meaning of low memory situation, mostly,
> +	 * it's just a guess on the kernel's side.
> +	 */
> +	VMEVENT_ATTR_LOWMEM_PAGES	= 4UL,
>  
>  	VMEVENT_ATTR_MAX		/* non-ABI */
>  };
> diff --git a/mm/vmevent.c b/mm/vmevent.c
> index b312236..d278a25 100644
> --- a/mm/vmevent.c
> +++ b/mm/vmevent.c
> @@ -68,10 +68,50 @@ static u64 vmevent_attr_avail_pages(struct vmevent_watch *watch,
>  	return totalram_pages;
>  }
>  
> +/*
> + * Here's some implementation details for the "low memory" meaning.
> + *
> + * (The explanation is not in the header file as userland should not
> + * know these details, nor it should assume that the meaning will
> + * always be the same. As well as it should not know how exactly kernel
> + * manages the memory, or assume that memory management behaviour is a
> + * part of the "ABI". So, all the 'low memory' is just guessing, but
> + * we're trying to do our best.)
> + *
> + * For this implementation, we assume that there's a low memory situation
> + * for the N pages threshold when we have neither N pages of completely
> + * free pages, nor we have N reclaimable pages in the cache. This
> + * effectively means, that if userland expects to allocate N pages, it
> + * would consume all the free pages, and any further allocations (above
> + * N) would start draining caches.
> + *
> + * In the worst case, prior hitting the threshold, we might have only
> + * N pages in cache, and nearly no memory as free pages.
> + */
> +static u64 vmevent_attr_lowmem_pages(struct vmevent_watch *watch,
> +				     struct vmevent_attr *attr)
> +{
> +	int free = global_page_state(NR_FREE_PAGES);
> +	int file = global_page_state(NR_FILE_PAGES) -
> +		   global_page_state(NR_SHMEM); /* TODO: account locked pages */
> +	int val = attr->value;
> +
> +	/*
> +	 * For convenience we return 0 or attr value (instead of 0/1), it
> +	 * makes it easier for vmevent_match() to cope with blended
> +	 * attributes, plus userland might use the value to find out which
> +	 * threshold triggered.
> +	 */
> +	if (free < val && file < val)
> +		return val;
> +	return 0;
> +}
> +
>  static vmevent_attr_sample_fn attr_samplers[] = {
>  	[VMEVENT_ATTR_NR_AVAIL_PAGES]   = vmevent_attr_avail_pages,
>  	[VMEVENT_ATTR_NR_FREE_PAGES]    = vmevent_attr_free_pages,
>  	[VMEVENT_ATTR_NR_SWAP_PAGES]    = vmevent_attr_swap_pages,
> +	[VMEVENT_ATTR_LOWMEM_PAGES]     = vmevent_attr_lowmem_pages,
>  };
>  
>  static u64 vmevent_sample_attr(struct vmevent_watch *watch, struct vmevent_attr *attr)
> diff --git a/tools/testing/vmevent/vmevent-test.c b/tools/testing/vmevent/vmevent-test.c
> index fd9a174..c61aed7 100644
> --- a/tools/testing/vmevent/vmevent-test.c
> +++ b/tools/testing/vmevent/vmevent-test.c
> @@ -33,7 +33,7 @@ int main(int argc, char *argv[])
>  
>  	config = (struct vmevent_config) {
>  		.sample_period_ns	= 1000000000L,
> -		.counter		= 6,
> +		.counter		= 7,
>  		.attrs			= {
>  			{
>  				.type	= VMEVENT_ATTR_NR_FREE_PAGES,
> @@ -59,6 +59,13 @@ int main(int argc, char *argv[])
>  				.type	= VMEVENT_ATTR_NR_SWAP_PAGES,
>  			},
>  			{
> +				.type	= VMEVENT_ATTR_LOWMEM_PAGES,
> +				.state	= VMEVENT_ATTR_STATE_VALUE_LT |
> +					  VMEVENT_ATTR_STATE_VALUE_EQ |
> +					  VMEVENT_ATTR_STATE_ONE_SHOT,
> +				.value	= phys_pages / 2,
> +			},
> +			{
>  				.type	= 0xffff, /* invalid */
>  			},
>  		},
> @@ -108,6 +115,9 @@ int main(int argc, char *argv[])
>  			case VMEVENT_ATTR_NR_SWAP_PAGES:
>  				printf("  VMEVENT_ATTR_NR_SWAP_PAGES: %Lu\n", attr->value);
>  				break;
> +			case VMEVENT_ATTR_LOWMEM_PAGES:
> +				printf("  VMEVENT_ATTR_LOWMEM_PAGES: %Lu\n", attr->value);
> +				break;
>  			default:
>  				printf("  Unknown attribute: %Lu\n", attr->value);
>  			}
Anton Vorontsov May 4, 2012, 7:38 a.m. UTC | #3
On Fri, May 04, 2012 at 01:26:45PM +0900, Minchan Kim wrote:
[...]
> > be useful for different use cases.
> 
> Why should we do it in kernel side?

Because currently you can't do this in userland, see below. Today
this would be effectively the same as constantly reading /proc/vmstat,
which is surely not friendly performance/context switches/battery
wise.

> If vmevent will have VMEVENT_ATTR_[FILE|MOCK|DIRTY|WRITEBACK|SHMEM|ANON|SWAP]_PAGES
> and so on which is needed by calculation, we can calculate it in userspace without
> forking /proc/vmstat to see it. So I think there is no problem to do it in userspace.

There are two problems.

1. Originally, the idea behind vmevent was that we should not expose all
   these mm details in vmevent, because it ties ABI with Linux internal
   memory representation;

2. If you have say a boolean '(A + B + C + ...) > X' attribute (which is
   exactly what blended attributes are), you can't just set up independent
   thresholds on A, B, C, ... and have the same effect.

   (What we can do, though, is... introduce arithmetic operators in
   vmevent. :-D But then, at the end, we'll probably implement in-kernel
   forth-like stack machine, with vmevent_config array serving as a
   sequence of op-codes. ;-)

If we'll give up on "1." (Pekka, ping), then we need to solve "2."
in a sane way: we'll have to add a 'NR_FILE_PAGES - NR_SHMEM -
<todo-locked-file-pages>' attribute, and give it a name.

RECLAIMABLE_CACHE_PAGES maybe?

Thanks!
Pekka Enberg May 7, 2012, 7:14 a.m. UTC | #4
On Fri, May 4, 2012 at 10:38 AM, Anton Vorontsov
<anton.vorontsov@linaro.org> wrote:
> There are two problems.
>
> 1. Originally, the idea behind vmevent was that we should not expose all
>   these mm details in vmevent, because it ties ABI with Linux internal
>   memory representation;
>
> 2. If you have say a boolean '(A + B + C + ...) > X' attribute (which is
>   exactly what blended attributes are), you can't just set up independent
>   thresholds on A, B, C, ... and have the same effect.
>
>   (What we can do, though, is... introduce arithmetic operators in
>   vmevent. :-D But then, at the end, we'll probably implement in-kernel
>   forth-like stack machine, with vmevent_config array serving as a
>   sequence of op-codes. ;-)
>
> If we'll give up on "1." (Pekka, ping), then we need to solve "2."
> in a sane way: we'll have to add a 'NR_FILE_PAGES - NR_SHMEM -
> <todo-locked-file-pages>' attribute, and give it a name.

Well, no, we can't give up on (1) completely. That'd mean that
eventually we'd need to change the ABI and break userspace. The
difference between exposing internal details and reasonable
abstractions is by no means black and white.

AFAICT, RECLAIMABLE_CACHE_PAGES is a reasonable thing to support. Can
anyone come up with a reason why we couldn't do that in the future?

                        Pekka
KOSAKI Motohiro May 7, 2012, 8:26 a.m. UTC | #5
>> If we'll give up on "1." (Pekka, ping), then we need to solve "2."
>> in a sane way: we'll have to add a 'NR_FILE_PAGES - NR_SHMEM -
>> <todo-locked-file-pages>' attribute, and give it a name.
>
> Well, no, we can't give up on (1) completely. That'd mean that
> eventually we'd need to change the ABI and break userspace. The
> difference between exposing internal details and reasonable
> abstractions is by no means black and white.
>
> AFAICT, RECLAIMABLE_CACHE_PAGES is a reasonable thing to support. Can
> anyone come up with a reason why we couldn't do that in the future?

It can, but the problem is that it is completely useless. Because 1)
writing out dirty pages
is sometimes very slow, and 2) libc's and some important libraries' pages
are critically important
for running a system even though they are clean and reclaimable. In other
words, the kernel doesn't have
that info and therefore can't expose it.
Anton Vorontsov May 7, 2012, 12:15 p.m. UTC | #6
On Mon, May 07, 2012 at 04:26:00AM -0400, KOSAKI Motohiro wrote:
> >> If we'll give up on "1." (Pekka, ping), then we need to solve "2."
> >> in a sane way: we'll have to add a 'NR_FILE_PAGES - NR_SHMEM -
> >> <todo-locked-file-pages>' attribute, and give it a name.
> >
> > Well, no, we can't give up on (1) completely. That'd mean that
> > eventually we'd need to change the ABI and break userspace. The
> > difference between exposing internal details and reasonable
> > abstractions is by no means black and white.
> >
> > AFAICT, RECLAIMABLE_CACHE_PAGES is a reasonable thing to support. Can
> > anyone come up with a reason why we couldn't do that in the future?
> 
> It can. but the problem is, that is completely useless.

Surely it is useful. Could be not ideal, but you can't say that
it is completely useless.

> Because of, 1) dirty pages writing-out is sometimes very slow and

I don't see it as a unresolvable problem: we can exclude dirty pages,
that's a nice idea actually.

Easily reclaimable cache pages = file_pages - shmem - locked pages
- dirty pages.

The amount of dirty pages is configurable, which is also great.

Even more, we may introduce two attributes:

RECLAIMABLE_CACHE_PAGES and
RECLAIMABLE_CACHE_PAGES_NOIO (which excludes dirty pages).

This makes ABI detached from the mm internals and still keeps a
defined meaning of the attributes.

> 2) libc and some important library's pages are critical important
> for running a system even though it is clean and reclaimable. In other
> word, kernel don't have an info then can't expose it.

First off, I guess LRU would try to keep important/most used pages in
the cache, as we try to never fully drain page cache to the zero mark.

Secondly, if we're really low on memory (which low memory notifications
help to prevent) and kernel decided to throw libc's pages out of the
cache, you'll get cache miss and kernel will have to read it back. Well,
sometimes cache misses do happen, that's life. And if somebody really
don't want this for the essential parts of the system, one have to
mlock it (which eliminates your "kernel don't have an info" argument).


Btw, if you have any better strategy on helping userspace to define
'low memory' conditions, I'll readily try to implement it.

Thanks!
KOSAKI Motohiro May 7, 2012, 7:19 p.m. UTC | #7
(5/7/12 8:15 AM), Anton Vorontsov wrote:
> On Mon, May 07, 2012 at 04:26:00AM -0400, KOSAKI Motohiro wrote:
>>>> If we'll give up on "1." (Pekka, ping), then we need to solve "2."
>>>> in a sane way: we'll have to add a 'NR_FILE_PAGES - NR_SHMEM -
>>>> <todo-locked-file-pages>' attribute, and give it a name.
>>>
>>> Well, no, we can't give up on (1) completely. That'd mean that
>>> eventually we'd need to change the ABI and break userspace. The
>>> difference between exposing internal details and reasonable
>>> abstractions is by no means black and white.
>>>
>>> AFAICT, RECLAIMABLE_CACHE_PAGES is a reasonable thing to support. Can
>>> anyone come up with a reason why we couldn't do that in the future?
>>
>> It can. but the problem is, that is completely useless.
>
> Surely it is useful. Could be not ideal, but you can't say that
> it is completely useless.

Why? It doesn't work.



>> Because of, 1) dirty pages writing-out is sometimes very slow and
>
> I don't see it as a unresolvable problem: we can exclude dirty pages,
> that's a nice idea actually.
>
> Easily reclaimable cache pages = file_pages - shmem - locked pages
> - dirty pages.
>
> The amount of dirty pages is configurable, which is also great.

You don't understand the issue. The point is NOT the formula. The problem
is, dirty and non-dirty pages aren't isolated in our kernel. So, the kernel
starts to get stuck far before the non-dirty pages become empty. A lying notification
is always useless.


> Even more, we may introduce two attributes:
>
> RECLAIMABLE_CACHE_PAGES and
> RECLAIMABLE_CACHE_PAGES_NOIO (which excludes dirty pages).
>
> This makes ABI detached from the mm internals and still keeps a
> defined meaning of the attributes.

A collection of crap is also crap. If you want to improve userland
notification, you should join the VM improvement activity. You shouldn't
think that nobody except you has thought about a userland notification feature.

The problem is that no current kernel vm statistics were created for
such a purpose, and they don't fit.

Even though some inaccurate and incorrect statistics fit _your_ usecase,
they definitely don't fit others. And those people will think it is a bug.


>> 2) libc and some important library's pages are critical important
>> for running a system even though it is clean and reclaimable. In other
>> word, kernel don't have an info then can't expose it.
>
> First off, I guess LRU would try to keep important/most used pages in
> the cache, as we try to never fully drain page cache to the zero mark.

Yes, what do you want say?


> Secondly, if we're really low on memory (which low memory notifications
> help to prevent) and kernel decided to throw libc's pages out of the
> cache, you'll get cache miss and kernel will have to read it back. Well,
> sometimes cache misses do happen, that's life. And if somebody really
> don't want this for the essential parts of the system, one have to
> mlock it (which eliminates your "kernel don't have an info" argument).

First off, "low memory" is a very poor definition and we must not use it.
It has multiple meanings. 1) System free memory is low. Some embedded systems have a userland
oom killer and they want to know the _system_ status. 2) Available memory is low.
This is different from (1) when using NUMA, memcg or cpusets. And nowadays,
almost all x86 boxes have NUMA. This is useful for swap avoidance activity if
we can implement it correctly.

Secondly, we can't assume someone mlocks libc, because Linux is a general
purpose kernel. As long as you continue to talk about only your usecase, we can't
agree with you. "Users may have a workaround" is no excuse to accept a broken patch.




> Btw, if you have any better strategy on helping userspace to define
> 'low memory' conditions, I'll readily try to implement it.
Anton Vorontsov May 8, 2012, 12:31 a.m. UTC | #8
On Mon, May 07, 2012 at 03:19:50PM -0400, KOSAKI Motohiro wrote:
[...]
> You don't understand the issue.

Apparently.

> The point is NOT a formula. The problem
> is, dirty and non-dirty pages aren't isolated in our kernel. Then, kernel
> start to get stuck  far before non-dirty pages become empty. Lie notification
> always useless.

Ugh. I don't get it (yeah, see above :-), in what sense they're not
isolated? In sense of isolate_lru_page and friends? Yes, they're not
isolated, but how that makes the notifications untrustworthy?

I'm confused. Can you elaborate a bit?

> >Even more, we may introduce two attributes:
> >
> >RECLAIMABLE_CACHE_PAGES and
> >RECLAIMABLE_CACHE_PAGES_NOIO (which excludes dirty pages).
> >
> >This makes ABI detached from the mm internals and still keeps a
> >defined meaning of the attributes.
> 
> Collection of craps are also crap. If you want to improve userland
> notification, you should join VM improvement activity.

I'm all for improving VM, but please, be specific. I'm assuming
there is currently some efforts on VM improvements, which I'm
not aware of. Or there are some plans or thoughts on improvements --
please tell what are they.

> You shouldn't
> think nobody except you haven't think userland notification feature.

That was never my assumption; surely many people have worked on userland
notifications, and still, today we have none that would fully suite
Android's or Nokia's (or "embedded people's") needs, right? ;-)

So, let's try solve things.

Memcg is currently not usable for us, and I explained why (the slab
accounting for root cgroup thing: http://lkml.org/lkml/2012/4/30/115 ),
any comments?

> The problem is, Any current kernel vm statistics were not created for
> such purpose and don't fit.

OK, presuming current statistics don't fit, which ones should we
implement? How do you see it?

> Even though, some inaccurate and incorrect statistics fit _your_ usecase,
> they definitely don't fit other. And their people think it is bug.

I'm all for making a single solution for your and ours use cases, but
you don't say anything specific.

(Btw, what are your use cases?)

> >>2) libc and some important library's pages are critical important
> >>for running a system even though it is clean and reclaimable. In other
> >>word, kernel don't have an info then can't expose it.
> >
> >First off, I guess LRU would try to keep important/most used pages in
> >the cache, as we try to never fully drain page cache to the zero mark.

*1

> Yes, what do you want say?

> >Secondly, if we're really low on memory (which low memory notifications
> >help to prevent) and kernel decided to throw libc's pages out of the
> >cache, you'll get cache miss and kernel will have to read it back. Well,
> >sometimes cache misses do happen, that's life. And if somebody really
> >don't want this for the essential parts of the system, one have to
> >mlock it (which eliminates your "kernel don't have an info" argument).
> 
> First off, "low memory" is very poor definition and we must not use it.

OK.

> It is multiple meanings.
> 1) System free memory is low. Some embedded have userland

The 'free' has multiple meanings as well. For us, it is
'not-used-at-all-pages + the-pages-that-we-can-get-in-a-
jiffy-and-not-disturb-things-much'.

The 'not-disturb-things-much' has a moot meaning as well, so all this
should probably be tunable. Cool, so let's give the userspace all the
needed statistics to decide on these meanings.

> oom killer and they want to know _system_ status. 2) available memory is low.
> This is different from (1) when using NUMA, memcg or cpusets. And in nowadays,
> almost all x86 box have numa. This is userful for swap avoidance activity if
> we can implement correctly.

I don't get it: you don't see '1)' as a use case? You're saying
that the meanings are different when using NUMA/memcg. If we don't
use memcg, what statistics should we use?

OK, if you are hinting that memcg should be mandatory for proper
statistics accounting, then please comment on the current memcg
issues, which don't let us do '1)' via '2)'.

> Secondly, we can't assume someone mlock to libc. Because of, Linux is generic
> purpose kernel.

You said that libc pages are important, implying that ideally they should
never leave the page cache (i.e. we should not count the pages as 'easily
reclaimable').

I answered that if you are OK with the "not guaranteed, but we'll do our best"
strategy, then just don't let the caches fully drain, and then LRU will
try to keep the "most important" pages (apparently libc) in the cache. *1  It
is surely userland's task to maintain the needed amount of memory, and
to do this efficiently we need..... notifications, that's right.

But if you want a guarantee, I guess mlock() is the only option -- it is
the only way to tell the kernel that the pages are really not to be
reclaimed.

So, in the light of 'easily reclaimable pages' statistics, what for was
your libc point again? How would you solve "the libc problem"?

Thanks!
Pekka Enberg May 8, 2012, 5:20 a.m. UTC | #9
On Mon, May 7, 2012 at 10:19 PM, KOSAKI Motohiro
<kosaki.motohiro@gmail.com> wrote:
>> Even more, we may introduce two attributes:
>>
>> RECLAIMABLE_CACHE_PAGES and
>> RECLAIMABLE_CACHE_PAGES_NOIO (which excludes dirty pages).
>>
>> This makes ABI detached from the mm internals and still keeps a
>> defined meaning of the attributes.
>
> Collection of craps are also crap. If you want to improve userland
> notification, you should join VM improvement activity. You shouldn't
> think nobody except you haven't think userland notification feature.
>
> The problem is, Any current kernel vm statistics were not created for
> such purpose and don't fit.
>
> Even though, some inaccurate and incorrect statistics fit _your_ usecase,
> they definitely don't fit other. And their people think it is bug.

Well, yeah, if we are to report _number of pages_, the numbers better
be meaningful.

That said, I think you are being unfair to Anton who's one of the few
that's actually taking the time to implement this properly instead of
settling for an out-of-tree hack.
KOSAKI Motohiro May 8, 2012, 5:42 a.m. UTC | #10
On Tue, May 8, 2012 at 1:20 AM, Pekka Enberg <penberg@kernel.org> wrote:
> On Mon, May 7, 2012 at 10:19 PM, KOSAKI Motohiro
> <kosaki.motohiro@gmail.com> wrote:
>>> Even more, we may introduce two attributes:
>>>
>>> RECLAIMABLE_CACHE_PAGES and
>>> RECLAIMABLE_CACHE_PAGES_NOIO (which excludes dirty pages).
>>>
>>> This makes ABI detached from the mm internals and still keeps a
>>> defined meaning of the attributes.
>>
>> Collection of craps are also crap. If you want to improve userland
>> notification, you should join VM improvement activity. You shouldn't
>> think nobody except you haven't think userland notification feature.
>>
>> The problem is, Any current kernel vm statistics were not created for
>> such purpose and don't fit.
>>
>> Even though, some inaccurate and incorrect statistics fit _your_ usecase,
>> they definitely don't fit other. And their people think it is bug.
>
> Well, yeah, if we are to report _number of pages_, the numbers better
> be meaningful.
>
> That said, I think you are being unfair to Anton who's one of the few
> that's actually taking the time to implement this properly instead of
> settling for an out-of-tree hack.

Unfair? But only I can talk about technical comment. To be honest, I
really dislike
I need say the same explanation again and again. A lot of people don't read
past discussion. And as far as the patches take the same mistake, I must say
the same thing. It is just PITA.

I don't disagree vmevent notification itself, but I must disagree lie
notification.
And also, To make just idea statistics doesn't make sense at all. How do an
application choose the right events? If that depend on hardware configuration,
userland developers can't write proper applications.

That's the reason why I often disagree at random new features.
Pekka Enberg May 8, 2012, 5:53 a.m. UTC | #11
On Tue, May 8, 2012 at 8:42 AM, KOSAKI Motohiro
<kosaki.motohiro@gmail.com> wrote:
>> That said, I think you are being unfair to Anton who's one of the few
>> that's actually taking the time to implement this properly instead of
>> settling for an out-of-tree hack.
>
> Unfair? But only I can talk about technical comment. To be honest, I
> really dislike
> I need say the same explanation again and again. A lot of people don't read
> past discussion. And as far as the patches take the same mistake, I must say
> the same thing. It is just PITA.

Unfair because you are trying to make it look as if Anton is only
concerned with his specific use case. That's simply not true.

On Tue, May 8, 2012 at 8:42 AM, KOSAKI Motohiro
<kosaki.motohiro@gmail.com> wrote:
> I don't disagree vmevent notification itself, but I must disagree lie
> notification.
> And also, To make just idea statistics doesn't make sense at all. How do an
> application choose the right events? If that depend on hardware configuration,
> userland developers can't write proper applications.

That's exactly the problem we're trying to tackle here! We _want_ the
ABI to provide sane, well-defined events that solve real world
problems.

                        Pekka
Anton Vorontsov May 8, 2012, 6:58 a.m. UTC | #12
On Tue, May 08, 2012 at 01:42:05AM -0400, KOSAKI Motohiro wrote:
[...]
> > Well, yeah, if we are to report _number of pages_, the numbers better
> > be meaningful.
> >
> > That said, I think you are being unfair to Anton who's one of the few
> > that's actually taking the time to implement this properly instead of
> > settling for an out-of-tree hack.
> 
> Unfair? But only I can talk about technical comment. To be honest, I
> really dislike
> I need say the same explanation again and again. A lot of people don't read
> past discussion. And as far as the patches take the same mistake, I must say
> the same thing. It is just PITA.

Note that just telling people that something is PITA doesn't help solve
things (so people will come back to you with stupid questions over and
over again). You can call people morons, idiots and dumbasses (that's
all fine) but still finding a way to be productive. :-)

You could just give a link to a previous discussion, in which you think
you explained all your concerns regarding cache handling issues, or
memory notifications/statistics in general.

So, feel free to call me an idiot, but please expand your points a
little bit or give a link to the discussion you're referring to?

Thanks,
KOSAKI Motohiro May 8, 2012, 7:11 a.m. UTC | #13
On Tue, May 8, 2012 at 1:53 AM, Pekka Enberg <penberg@kernel.org> wrote:
> On Tue, May 8, 2012 at 8:42 AM, KOSAKI Motohiro
> <kosaki.motohiro@gmail.com> wrote:
>>> That said, I think you are being unfair to Anton who's one of the few
>>> that's actually taking the time to implement this properly instead of
>>> settling for an out-of-tree hack.
>>
>> Unfair? But only I can talk about technical comment. To be honest, I
>> really dislike
>> I need say the same explanation again and again. A lot of people don't read
>> past discussion. And as far as the patches take the same mistake, I must say
>> the same thing. It is just PITA.
>
> Unfair because you are trying to make it look as if Anton is only
> concerned with his specific use case. That's simply not true.

However, the current proposal certainly doesn't refer to the past discussion
and doesn't work in many environments.


> On Tue, May 8, 2012 at 8:42 AM, KOSAKI Motohiro
> <kosaki.motohiro@gmail.com> wrote:
>> I don't disagree vmevent notification itself, but I must disagree lie
>> notification.
>> And also, To make just idea statistics doesn't make sense at all. How do an
>> application choose the right events? If that depend on hardware configuration,
>> userland developers can't write proper applications.
>
> That's exactly the problem we're trying to tackle here! We _want_ the
> ABI to provide sane, well-defined events that solve real world
> problems.

Ok, sane. Then I'll take a little time and review the current vmevent code briefly.
(I read the vmevent/core branch in pekka's tree. please let me know if
there is a newer
repository)

I think following thing should be fixed.

1) sample_period is a brain-damaged idea. If people ONLY need to
sample statistics, they
  only need to read /proc/vmstat periodically. just remove it and
implement push notification.
  _IF_ someone needs an infrequent level trigger, just use
"usleep(timeout); read(vmevent_fd)"
 in userland code.
2) VMEVENT_ATTR_STATE_ONE_SHOT is misleading name. That is effect as
edge trigger
  shot. not only once.
3) vmevent_fd() seems sane interface. but it has name space unaware.
maybe we discuss how
  to harmonize name space feature.  No hurry. but we have to think
that issue since at beginning.
4) Currently, vmstat have per-cpu batch and vmstat updating makes 3
second delay at maximum.
  This is fine for usual case because almost userland watcher only
read /proc/vmstat per second.
  But, for vmevent_fd() case, 3 seconds may be unacceptable delay. At
worst, 128 batch x 4096
  x 4k pagesize = 2G bytes inaccurate is there.
5) __VMEVENT_ATTR_STATE_VALUE_WAS_LT should be removed from userland-
exported files.
  When kernel internals are exported, silly guys always end up using them and becoming unhappy.
6) Also vmevent_event must hide from userland.
7) vmevent_config::size must be removed. In the 20th century, M$ APIs
preferred to use this technique. But
  they dropped that approach because a lot of applications didn't initialize
the size member, so it couldn't be used
   for keeping upward compatibility.
8) memcg unaware
9) numa unaware
10) zone unaware

And, we may need vm internal changes if we really need lowmem
notification; the current kernel doesn't
have such info. _And_ there is one more big problem. Currently the
kernel maintains memory per
zone. But almost all userland applications aren't aware of zones or nodes.
Thus raw notifications aren't
useful for userland. On the other hand, are total memory and total free
memory useful? Definitely No!
Even though total free memory may be plentiful, the system may start swapping out and
invoking the oom killer. If we can't prevent oom invocation, this feature has a
serious raison d'etre issue. (i.e.
(4), (8), (9) and (10) are not ignorable
issues, I think)
KOSAKI Motohiro May 8, 2012, 7:16 a.m. UTC | #14
(5/8/12 2:58 AM), Anton Vorontsov wrote:
> On Tue, May 08, 2012 at 01:42:05AM -0400, KOSAKI Motohiro wrote:
> [...]
>>> Well, yeah, if we are to report _number of pages_, the numbers better
>>> be meaningful.
>>>
>>> That said, I think you are being unfair to Anton who's one of the few
>>> that's actually taking the time to implement this properly instead of
>>> settling for an out-of-tree hack.
>>
>> Unfair? But only I can talk about technical comment. To be honest, I
>> really dislike
>> I need say the same explanation again and again. A lot of people don't read
>> past discussion. And as far as the patches take the same mistake, I must say
>> the same thing. It is just PITA.
>
> Note that just telling people that something is PITA doesn't help solve
> things (so people will come back to you with stupid questions over and
> over again). You can call people morons, idiots and dumbasses (that's
> all fine) but still finding a way to be productive. :-)
>
> You could just give a link to a previous discussion, in which you think
> you explained all your concerns regarding cache handling issues, or
> memory notifications/statistics in general.
>
> So, feel free to call me an idiot, but please expand your points a
> little bit or give a link to the discussion you're referring to?

I don't think you are an idiot. But I hope you test your patch before submitting.
It just doesn't work, especially on x86. That's because all x86 boxes have multiple zones,
and summarized statistics (i.e. the global_page_state() thing) don't work and can't
prevent oom or swapping.

and please see my previous mail.
Pekka Enberg May 8, 2012, 7:36 a.m. UTC | #15
On Tue, May 8, 2012 at 10:11 AM, KOSAKI Motohiro
<kosaki.motohiro@gmail.com> wrote:
> Ok, sane. Then I take my time a little and review current vmevent code briefly.
> (I read vmevent/core branch in pekka's tree. please let me know if
> there is newer repositry)

It's the latest one.

On Tue, May 8, 2012 at 10:11 AM, KOSAKI Motohiro
<kosaki.motohiro@gmail.com> wrote:
> 1) sample_period is brain damaged idea. If people ONLY need to
> sampling stastics, they
>  only need to read /proc/vmstat periodically. just remove it and
> implement push notification.
>  _IF_ someone need unfrequent level trigger, just use
> "usleep(timeout); read(vmevent_fd)"
>  on userland code.

That comes from a real-world requirement. See Leonid's email on the topic:

https://lkml.org/lkml/2012/5/2/42

> 2) VMEVENT_ATTR_STATE_ONE_SHOT is misleading name. That is effect as
> edge trigger shot. not only once.

Would VMEVENT_ATTR_STATE_EDGE_TRIGGER be a better name?

> 3) vmevent_fd() seems sane interface. but it has name space unaware.
> maybe we discuss how to harmonize name space feature.  No hurry. but we have
> to think that issue since at beginning.

You mean VFS namespaces? Yeah, we need to take care of that.

> 4) Currently, vmstat have per-cpu batch and vmstat updating makes 3
> second delay at maximum.
>  This is fine for usual case because almost userland watcher only
> read /proc/vmstat per second.
>  But, for vmevent_fd() case, 3 seconds may be unacceptable delay. At
> worst, 128 batch x 4096
>  x 4k pagesize = 2G bytes inaccurate is there.

That's pretty awful. Anton, Leonid, comments?

> 5) __VMEVENT_ATTR_STATE_VALUE_WAS_LT should be removed from userland
> exporting files.
>  When exporing kenrel internal, always silly gus used them and made unhappy.

Agreed. Anton, care to cook up a patch to do that?

> 6) Also vmevent_event must hide from userland.

Why? That's part of the ABI.

> 7) vmevent_config::size must be removed. In 20th century, M$ API
> prefer to use this technique. But
>  They dropped the way because a lot of application don't initialize
> size member and they can't use it for keeping upper compitibility.

It's there to support forward/backward ABI compatibility like perf
does. I'm going to keep it for now but I'm open to dropping it when
the ABI is more mature.

> 8) memcg unaware
> 9) numa unaware
> 10) zone unaware

Yup.

> And, we may need vm internal change if we really need lowmem
> notification. current kernel don't have such info. _And_ there is one more
> big problem. Currently the kernel maintain memory per
> zone. But almost all userland application aren't aware zone nor node.
> Thus raw notification aren't useful for userland. In the other hands, total
> memory and total free memory is useful? Definitely No!
> Even though total free memory are lots, system may start swap out and
> oom invokation. If we can't oom invocation, this feature has serious raison
> d'etre issue. (i.e. (4), (8), (9) and (19) are not ignorable issue. I think)

I'm guessing most of the existing solutions get away with
approximations and soft limits because they're mostly used on UMA
embedded machines.

But yes, we need to do better here.
KOSAKI Motohiro May 8, 2012, 7:50 a.m. UTC | #16
(5/8/12 3:36 AM), Pekka Enberg wrote:
> On Tue, May 8, 2012 at 10:11 AM, KOSAKI Motohiro
> <kosaki.motohiro@gmail.com>  wrote:
>> Ok, sane. Then I take my time a little and review current vmevent code briefly.
>> (I read vmevent/core branch in pekka's tree. please let me know if
>> there is newer repositry)
>
> It's the latest one.
>
> On Tue, May 8, 2012 at 10:11 AM, KOSAKI Motohiro
> <kosaki.motohiro@gmail.com>  wrote:
>> 1) sample_period is brain damaged idea. If people ONLY need to
>> sampling stastics, they
>>   only need to read /proc/vmstat periodically. just remove it and
>> implement push notification.
>>   _IF_ someone need unfrequent level trigger, just use
>> "usleep(timeout); read(vmevent_fd)"
>>   on userland code.
>
> That comes from a real-world requirement. See Leonid's email on the topic:
>
> https://lkml.org/lkml/2012/5/2/42

I know, many embedded guys prefer such timer intervals. I also had experience with
similar logic when I was a TV box developer, but I must disagree. Some hope to push
timer housekeeping complexity into the kernel, but I haven't seen any justification.


>> 2) VMEVENT_ATTR_STATE_ONE_SHOT is misleading name. That is effect as
>> edge trigger shot. not only once.
>
> Would VMEVENT_ATTR_STATE_EDGE_TRIGGER be a better name?

maybe.


>> 3) vmevent_fd() seems sane interface. but it has name space unaware.
>> maybe we discuss how to harmonize name space feature.  No hurry. but we have
>> to think that issue since at beginning.
>
> You mean VFS namespaces? Yeah, we need to take care of that.

If we keep the current vmevent_fd() design, we may need to create a new namespace concept
like the ipc namespace. The current vmevent_fd() is not VFS based.


>> 4) Currently, vmstat have per-cpu batch and vmstat updating makes 3
>> second delay at maximum.
>>   This is fine for usual case because almost userland watcher only
>> read /proc/vmstat per second.
>>   But, for vmevent_fd() case, 3 seconds may be unacceptable delay. At
>> worst, 128 batch x 4096
>>   x 4k pagesize = 2G bytes inaccurate is there.
>
> That's pretty awful. Anton, Leonid, comments?
>
>> 5) __VMEVENT_ATTR_STATE_VALUE_WAS_LT should be removed from userland
>> exporting files.
>>   When exporing kenrel internal, always silly gus used them and made unhappy.
>
> Agreed. Anton, care to cook up a patch to do that?
>
>> 6) Also vmevent_event must hide from userland.
>
> Why? That's part of the ABI.

Ahhh, if so, I missed something. As far as I can see, vmevent_fd() only depends
on vmevent_config. Which syscall depends on vmevent_event?



>> 7) vmevent_config::size must be removed. In 20th century, M$ API
>> prefer to use this technique. But
>>   They dropped the way because a lot of application don't initialize
>> size member and they can't use it for keeping upper compitibility.
>
> It's there to support forward/backward ABI compatibility like perf
> does. I'm going to keep it for now but I'm open to dropping it when
> the ABI is more mature.

The perf api is not intended to be used from generic applications, so I don't
think it will create an abi issue. tools/perf is sane, isn't it? But vmevent_fd()
is a generic api and we can't trust all userland guys to be sane, unfortunately.

>> 8) memcg unaware
>> 9) numa unaware
>> 10) zone unaware
>
> Yup.
>
>> And, we may need vm internal change if we really need lowmem
>> notification. current kernel don't have such info. _And_ there is one more
>> big problem. Currently the kernel maintain memory per
>> zone. But almost all userland application aren't aware zone nor node.
>> Thus raw notification aren't useful for userland. In the other hands, total
>> memory and total free memory is useful? Definitely No!
>> Even though total free memory are lots, system may start swap out and
>> oom invokation. If we can't oom invocation, this feature has serious raison
>> d'etre issue. (i.e. (4), (8), (9) and (19) are not ignorable issue. I think)
>
> I'm guessing most of the existing solutions get away with
> approximations and soft limits because they're mostly used on UMA
> embedded machines.
>
> But yes, we need to do better here.

Hm. If you want to make vmevent depend on CONFIG_EMBEDDED, I have no reason to
complain about this feature. In that world, almost all applications _know_ their
system configuration, so I don't think the api misuse issue is a big matter.
Pekka Enberg May 8, 2012, 8:03 a.m. UTC | #17
On Tue, May 8, 2012 at 10:50 AM, KOSAKI Motohiro
<kosaki.motohiro@gmail.com> wrote:
>>> 1) sample_period is brain damaged idea. If people ONLY need to
>>> sampling stastics, they
>>>  only need to read /proc/vmstat periodically. just remove it and
>>> implement push notification.
>>>  _IF_ someone need unfrequent level trigger, just use
>>> "usleep(timeout); read(vmevent_fd)"
>>>  on userland code.
>>
>> That comes from a real-world requirement. See Leonid's email on the topic:
>>
>> https://lkml.org/lkml/2012/5/2/42
>
> I know, many embedded guys prefer such timer interval. I also have an
> experience
> similar logic when I was TV box developer. but I must disagree. Someone hope
> timer housekeeping complexity into kernel. but I haven't seen any
> justification.

Leonid?

>>> 6) Also vmevent_event must hide from userland.
>>
>> Why? That's part of the ABI.
>
> Ahhh, if so, I missed something. as far as I look, vmevent_fd() only depend
> on vmevent_config. which syscall depend on vmevent_evennt?

read(2). See tools/testing/vmevent/vmevent-test.c for an example how
the ABI is used.

>>> 7) vmevent_config::size must be removed. In 20th century, M$ API
>>> prefer to use this technique. But
>>>  They dropped the way because a lot of application don't initialize
>>> size member and they can't use it for keeping upper compitibility.
>>
>> It's there to support forward/backward ABI compatibility like perf
>> does. I'm going to keep it for now but I'm open to dropping it when
>> the ABI is more mature.
>
> perf api is not intended to use from generic applications. then, I don't
> think it will make abi issue. tool/perf is sane, isn't it? but vmevent_fd()
> is generic api and we can't trust all userland guy have sane, unfortunately.

The perf ABI is being used by other projects as well. We don't
_depend_ on ->size being sane as much as use it to detect new features
in the future.

But anyway, we can consider dropping once the ABI is more stable.

> Hm. If you want vmevent makes depend on CONFIG_EMBEDDED, I have no reason to
> complain this feature. At that world, almost all applications _know_ their
> system configuration. then I don't think api misuse issue is big matter.

No, I don't want to do that. I was just commeting on why existing
solutions are the way they are.
Anton Vorontsov May 8, 2012, 8:13 a.m. UTC | #18
On Tue, May 08, 2012 at 03:16:59AM -0400, KOSAKI Motohiro wrote:
[...]
> >So, feel free to call me an idiot, but please expand your points a
> >little bit or give a link to the discussion you're referring to?
> 
> I don't think you are idiot. But I hope you test your patch before submitting.
> That just don't work especially on x86. Because of, all x86 box have multiple zone
> and summarized statistics (i.e. global_page_state() thing) don't work and can't
> prevent oom nor swapping.

Now I think I understand you: we don't take into account that e.g. DMA
zone is not usable by the normal allocations, and so if we're basing our
calculations on summarized stats, it is indeed possible to get an OOM
in such a case. On Android kernels nobody noticed the issue as it is
used mainly on ARM targets, which might have CONFIG_ZONE_DMA=n.

So, this particular issue makes sense now.

Thanks!
Anton Vorontsov May 8, 2012, 8:21 a.m. UTC | #19
On Tue, May 08, 2012 at 01:13:05AM -0700, Anton Vorontsov wrote:
> On Tue, May 08, 2012 at 03:16:59AM -0400, KOSAKI Motohiro wrote:
> [...]
> > >So, feel free to call me an idiot, but please expand your points a
> > >little bit or give a link to the discussion you're referring to?
> > 
> > I don't think you are idiot. But I hope you test your patch before submitting.
> > That just don't work especially on x86. Because of, all x86 box have multiple zone
> > and summarized statistics (i.e. global_page_state() thing) don't work and can't
> > prevent oom nor swapping.
> 
> Now I think I understand you: we don't take into account that e.g. DMA
> zone is not usable by the normal allocations, and so if we're basing our
> calculations on summarized stats, it is indeed possible to get an OOM
> in such a case.

Oops. Looking into it more, I think I was wrong here: kernel will surely
use pages from the DMA zone when we have no pages in normal zones.

So, I don't see how we can get OOM in that case.

Hm.
Minchan Kim May 8, 2012, 8:32 a.m. UTC | #20
On 05/08/2012 04:11 PM, KOSAKI Motohiro wrote:

> On Tue, May 8, 2012 at 1:53 AM, Pekka Enberg <penberg@kernel.org> wrote:
>> On Tue, May 8, 2012 at 8:42 AM, KOSAKI Motohiro
>> <kosaki.motohiro@gmail.com> wrote:
>>>> That said, I think you are being unfair to Anton who's one of the few
>>>> that's actually taking the time to implement this properly instead of
>>>> settling for an out-of-tree hack.
>>>
>>> Unfair? But only I can talk about technical comment. To be honest, I
>>> really dislike
>>> I need say the same explanation again and again. A lot of people don't read
>>> past discussion. And as far as the patches take the same mistake, I must say
>>> the same thing. It is just PITA.
>>
>> Unfair because you are trying to make it look as if Anton is only
>> concerned with his specific use case. That's simply not true.
> 
> However current proposal certainly don't refer past discuss and don't work
> many environment.
> 
> 
>> On Tue, May 8, 2012 at 8:42 AM, KOSAKI Motohiro
>> <kosaki.motohiro@gmail.com> wrote:
>>> I don't disagree vmevent notification itself, but I must disagree lie
>>> notification.
>>> And also, To make just idea statistics doesn't make sense at all. How do an
>>> application choose the right events? If that depend on hardware configuration,
>>> userland developers can't write proper applications.
>>
>> That's exactly the problem we're trying to tackle here! We _want_ the
>> ABI to provide sane, well-defined events that solve real world
>> problems.
> 
> Ok, sane. Then I take my time a little and review current vmevent code briefly.
> (I read vmevent/core branch in pekka's tree. please let me know if
> there is newer
> repositry)
> 
> I think following thing should be fixed.
> 
> 1) sample_period is brain damaged idea. If people ONLY need to
> sampling stastics, they
>   only need to read /proc/vmstat periodically. just remove it and
> implement push notification.
>   _IF_ someone need unfrequent level trigger, just use
> "usleep(timeout); read(vmevent_fd)"
>  on userland code.
> 2) VMEVENT_ATTR_STATE_ONE_SHOT is misleading name. That is effect as
> edge trigger
>   shot. not only once.
> 3) vmevent_fd() seems sane interface. but it has name space unaware.
> maybe we discuss how
>   to harmonize name space feature.  No hurry. but we have to think
> that issue since at beginning.
> 4) Currently, vmstat have per-cpu batch and vmstat updating makes 3
> second delay at maximum.
>   This is fine for usual case because almost userland watcher only
> read /proc/vmstat per second.
>   But, for vmevent_fd() case, 3 seconds may be unacceptable delay. At
> worst, 128 batch x 4096
>   x 4k pagesize = 2G bytes inaccurate is there.
> 5) __VMEVENT_ATTR_STATE_VALUE_WAS_LT should be removed from userland
> exporting files.
>   When exporing kenrel internal, always silly gus used them and made unhappy.
> 6) Also vmevent_event must hide from userland.
> 7) vmevent_config::size must be removed. In 20th century, M$ API
> prefer to use this technique. But
>   They dropped the way because a lot of application don't initialize
> size member and they can't use
>    it for keeping upper compitibility.
> 8) memcg unaware
> 9) numa unaware
> 10) zone unaware


I would like to add a concern.

11) understand storage speed.

As I mentioned, a system can have various storage types (SSD, disk, eMMC, ramfs).
In some systems, users can tolerate ramfs and SSD writes or swapout.
We should consider that to make it really useful.

The problem is that the user can't know it in advance, so it should be detected by the kernel.
Unfortunately, that's not easy now.

The idea is that we can define some levels in advance and explain them to users

Level 1: An immediate response to the user when the kernel decides there are no fast-reclaimable pages any more.
Level 2: A rather slower response than level 1, but the kernel will consider more pages as reclaimable targets.
Level 3: The slowest response, because the kernel will consider pages that need a long time to reclaim as reclaimable targets.

It doesn't expose any internal of kernel and can implment it in internal.
For simple example,

Level 1: non-mapped clean page
Level 2: Level 1 + mapped clean-page
Level 3: Level 2 + dirty pages
 
So users of vmevent_fd can select his level.
Of course, latency sensitive application with slow stoarge would select Level 1.
Some application might use Level 4(Level 3 + half of swap) because it has very fast storage.

And an application receiving events can build a strategy as follows.

When it receives level 1 notification, it could request to others if it can release their own buffers.
When it receives level 2 notification, it could request to suicide if it's not critical application.
When it receives level 3 notification, it could kill others. 

It's just an example and my point is that we should consider storage speed to make it general.
leonid.moiseichuk@nokia.com May 8, 2012, 9:15 a.m. UTC | #21
> -----Original Message-----
> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
> Pekka Enberg
> Sent: 08 May, 2012 11:03
> To: KOSAKI Motohiro
> Cc: Anton Vorontsov; Minchan Kim; Moiseichuk Leonid (Nokia-MP/Espoo); John
...
> >> That comes from a real-world requirement. See Leonid's email on the topic:
> >>
> >> https://lkml.org/lkml/2012/5/2/42
> >
> > I know, many embedded guys prefer such timer interval. I also have an
> > experience similar logic when I was TV box developer. but I must
> > disagree. Someone hope timer housekeeping complexity into kernel. but
> > I haven't seen any justification.
> 
> Leonid?

The "usleep(timeout); read(vmevent_fd)" will eliminate opportunity to use vmevent API for mobile devices.  
Developers already have to use heartbeat primitives to align/sync timers and update code which is not always simple to do.
But the idea is to have user-space wakeup only if we have something change in memory numbers, thus aligned timers will not help much in vmevent case due to memory situation may change a lot in short time.
Short depends from software stack but usually it below 1s.  To have use-time and wakeups on good level (below 50Hz by e.g. powertop) and allow cpu switch off timers of such short period like 1s are not allowed.

Leonid
PS: Sorry, meetings prevent to do interesting things :(  I am tracking conversation with quite low understanding how it will be useful for practical needs because user-space developers in 80% cases needs to track simply dirty memory changes i.e. modified pages which cannot be dropped.
Pekka Enberg May 8, 2012, 9:19 a.m. UTC | #22
On Tue, May 8, 2012 at 12:15 PM,  <leonid.moiseichuk@nokia.com> wrote:
> I am tracking conversation with quite low understanding how it will be useful for
> practical needs because user-space developers in 80% cases needs to track
> simply dirty memory changes i.e. modified pages which cannot be dropped.

The point is to support those cases in such a way that works sanely
across different architectures and configurations.
Pekka Enberg May 8, 2012, 9:27 a.m. UTC | #23
On Tue, May 8, 2012 at 11:32 AM, Minchan Kim <minchan@kernel.org> wrote:
> The idea is that we can make some levels in advane and explain it to user
>
> Level 1: It a immediate response to user when kernel decide there are not fast-reclaimable pages any more.
> Level 2: It's rather slower response than level 1 but kernel will consider it as reclaimable target
> Level 3: It's slowest response because kernel will consider page needed long time to reclaim as reclaimable target.
>
> It doesn't expose any internal of kernel and can implment it in internal.
> For simple example,
>
> Level 1: non-mapped clean page
> Level 2: Level 1 + mapped clean-page
> Level 3: Level 2 + dirty pages
>
> So users of vmevent_fd can select his level.

I'm totally OK with pursuing something like this if people like Leonid
and Anton think it's useful for their use-cases.
leonid.moiseichuk@nokia.com May 8, 2012, 10:38 a.m. UTC | #24
> -----Original Message-----
> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
> Pekka Enberg
> Sent: 08 May, 2012 12:20
...
> On Tue, May 8, 2012 at 12:15 PM,  <leonid.moiseichuk@nokia.com> wrote:
> > I am tracking conversation with quite low understanding how it will be
> > useful for practical needs because user-space developers in 80% cases
> > needs to track simply dirty memory changes i.e. modified pages which cannot
> be dropped.
> 
> The point is to support those cases in such a way that works sanely across
> different architectures and configurations.

Which usually means you need to raise the level of abstraction from hugepages, swapped, kernel reclaimable, slab allocated, and active memory up to used, cache, and must-have memory, which are common for all platforms. Do you have visibility into what you need to show and how you will calculate it? Is it possible to do for the whole system, a group of processes, or a particular process?

I implemented a system-wide variant because it was the simplest one and the one I needed most urgently. But e.g. to tell how much memory a particular process consumes in Linux, you need to use smaps. Probably vmevent needs some sketches (aka a roadmap) in this direction as well.
Anton Vorontsov June 1, 2012, 12:21 p.m. UTC | #25
On Tue, May 08, 2012 at 10:36:31AM +0300, Pekka Enberg wrote:
[...]
> > 2) VMEVENT_ATTR_STATE_ONE_SHOT is misleading name. That is effect as
> > edge trigger shot. not only once.
> 
> Would VMEVENT_ATTR_STATE_EDGE_TRIGGER be a better name?
[...] 
> > 4) Currently, vmstat have per-cpu batch and vmstat updating makes 3
> > second delay at maximum.
> >  This is fine for usual case because almost userland watcher only
> > read /proc/vmstat per second.
> >  But, for vmevent_fd() case, 3 seconds may be unacceptable delay. At
> > worst, 128 batch x 4096
> >  x 4k pagesize = 2G bytes inaccurate is there.
> 
> That's pretty awful. Anton, Leonid, comments?
[...]
> > 5) __VMEVENT_ATTR_STATE_VALUE_WAS_LT should be removed from userland
> > exporting files.
> >  When exporting kernel internals, silly guys always end up using them and becoming unhappy.
> 
> Agreed. Anton, care to cook up a patch to do that?

KOSAKI-San, Pekka,

Much thanks for your reviews!

These three issues should be fixed by the following patches. One mm/
change is needed outside of vmevent...

And I'm looking into other issues you pointed out...

Thanks!

---
 include/linux/vmevent.h |   10 +++----
 include/linux/vmstat.h  |    2 ++
 mm/vmevent.c            |   66 +++++++++++++++++++++++++++++------------------
 mm/vmstat.c             |   22 +++++++++++++++-
 4 files changed, 68 insertions(+), 32 deletions(-)
Pekka Enberg June 3, 2012, 6:26 p.m. UTC | #26
On Fri, 1 Jun 2012, Anton Vorontsov wrote:
> > That's pretty awful. Anton, Leonid, comments?
> [...]
> > > 5) __VMEVENT_ATTR_STATE_VALUE_WAS_LT should be removed from userland
> > > exporting files.
> > >  When exporting kernel internals, silly guys always end up using them and becoming unhappy.
> > 
> > Agreed. Anton, care to cook up a patch to do that?
> 
> KOSAKI-San, Pekka,
> 
> Much thanks for your reviews!
> 
> These three issues should be fixed by the following patches. One mm/
> change is needed outside of vmevent...
> 
> And I'm looking into other issues you pointed out...

I applied patches 2, 4, and 5. The vmstat patch need ACKs from VM folks 
to enter the tree.

			Pekka
Minchan Kim June 4, 2012, 8:45 a.m. UTC | #27
On 06/04/2012 03:26 AM, Pekka Enberg wrote:

> On Fri, 1 Jun 2012, Anton Vorontsov wrote:
>>> That's pretty awful. Anton, Leonid, comments?
>> [...]
>>>> 5) __VMEVENT_ATTR_STATE_VALUE_WAS_LT should be removed from userland
>>>> exporting files.
>>>>  When exporting kernel internals, silly guys always end up using them and becoming unhappy.
>>>
>>> Agreed. Anton, care to cook up a patch to do that?
>>
>> KOSAKI-San, Pekka,
>>
>> Much thanks for your reviews!
>>
>> These three issues should be fixed by the following patches. One mm/
>> change is needed outside of vmevent...
>>
>> And I'm looking into other issues you pointed out...
> 
> I applied patches 2, 4, and 5. The vmstat patch need ACKs from VM folks 
> to enter the tree.
> 
> 			Pekka


It's time to wrap it up.
It seems some people include me have tried to improve vmevent
But I hope let us convince why we need vmevent before further review/patch.

Recently I tried a reclaim-latency based notifier that takes backing device speed into account, which I mentioned in another thread.
The working model is that we measure reclaim time and, if it doesn't finish within the required time (which is configurable
by the admin), we notify the user or kill some thread; but I haven't published it yet because it's not easy for the admin to control
and there are several issues.

AFAIK, low memory notifier is started for replacing android lowmemory killer.
At the same time, other folks want use it generally.
As I look through android low memory killer, it's not too bad except some point.

1. It should not depend on shrink_slab. If we need, we can add some hook in vmscan.c directly instead of shrink_slab.
2. We can use out_of_memory instead of custom victim selection/killing function. If we need,
   we can change out_of_memory interface little bit for passing needed information to select victim.
3. calculation for available pages

1) and 2) would make android low memory killer very general and 3) can meet each folk's requirement, I believe.

Anton, I expect you already investigated android low memory killer so maybe you know pros and cons of each solution.
Could you convince us "why we need vmevent" and "why can't android LMK do it?"

KOSAKI, AFAIRC, you are a person who hates android low memory killer.
Why do you hate it? If it solve problems I mentioned, do you have a concern, still?
If so, please, list up.

Android's low memory killer has been a proven solution for a long time, at least in the embedded area (so many Android phones already use it), so I think improving it makes more sense to me than inventing a new wheel.
Frankly speaking, I don't know vmevent's other use cases except low memory notification and didn't see
any agreement about that with other guys.

I hope we get an agreement about vmevent before further enhance.

Thanks, all.
Pekka Enberg June 4, 2012, 9:20 a.m. UTC | #28
On Mon, Jun 4, 2012 at 11:45 AM, Minchan Kim <minchan@kernel.org> wrote:
> KOSAKI, AFAIRC, you are a person who hates android low memory killer.
> Why do you hate it? If it solve problems I mentioned, do you have a concern, still?
> If so, please, list up.
>
> Android low memory killer is proved solution for a long time, at least embedded
> area(So many android phone already have used it) so I think improving it makes
> sense to me rather than inventing new wheel.

VM events started out as *ABI cleanup* of Nokia's N9 Linux lowmem
notifier. That's not reinventing the wheel.

On Mon, Jun 4, 2012 at 11:45 AM, Minchan Kim <minchan@kernel.org> wrote:
> Frankly speaking, I don't know vmevent's other use cases except low memory
> notification and didn't see any agreement about that with other guys.

I think you are missing the point. "vmevent" is an ABI for delivering
VM events to userspace. I started it because different userspaces do
not agree what "low memory" means - for obvious reasons.

As for use cases, it'd be useful for VMs to be notified of "about to
swap your pages soon" so that they can aggressively GC before entering
GC-swapstorm hell. I also hear that something similar would be useful
for KVM/QEMU but I don't know the details.

I really don't see how Android's "low memory killer" will be useful as
a generic solution.

                        Pekka
Anton Vorontsov June 4, 2012, 11:38 a.m. UTC | #29
On Mon, Jun 04, 2012 at 05:45:06PM +0900, Minchan Kim wrote:
[...]
> AFAIK, low memory notifier is started for replacing android lowmemory killer.
> At the same time, other folks want use it generally.
> As I look through android low memory killer, it's not too bad except some point.
> 
> 1. It should not depend on shrink_slab. If we need, we can add some hook in vmscan.c directly instead of shrink_slab.
> 2. We can use out_of_memory instead of custom victim selection/killing function. If we need,
>    we can change out_of_memory interface little bit for passing needed information to select victim.
> 3. calculation for available pages
> 
> 1) and 2) would make android low memory killer very general and 3) can meet each folk's requirement, I believe.
> 
> Anton, I expect you already investigated android low memory killer so maybe you know pros and cons of each solution.
> Could you convince us "why we need vmevent" and "why can't android LMK do it?"

Note that 1) and 2) are not problems per se, it's just implementation
details, easy stuff. Vmevent is basically an ABI/API, and I didn't
hear anybody who would object to vmevent ABI idea itself. More than
this, nobody stop us from implementing in-kernel vmevent API, and
make Android Lowmemory killer use it, if we want to.

The real problem is not with vmevent. Today there are two real problems:

a) Gathering proper statistics from the kernel. Both cgroups and vmstat
   have issues. Android lowmemory killer has the same problems w/ the
   statistics as vmevent, it uses vmstat, so by no means Android
   low memory killer is better or easier in this regard.
   (And cgroups has issues w/ slab accounting, plus some folks don't
   want memcg at all, since it has runtime and memory-wise costs.)

b) Interpreting this statistics. We can't provide one, universal
   "low memory" definition that would work for everybody.
   (Btw, your "levels" based low memory grading actually sounds
   the same as mine RECLAIMABLE_CACHE_PAGES and
   RECLAIMABLE_CACHE_PAGES_NOIO idea, i.e.
   http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/02751.html
   so personally I like the idea of level-based approach, based
   on available memory *cost*.)

So, you see, all these issues are valid for vmevent, cgroups and
android low memory killer.

> KOSAKI, AFAIRC, you are a person who hates android low memory killer.
> Why do you hate it? If it solve problems I mentioned, do you have a concern, still?
> If so, please, list up.
> 
> Android low memory killer is proved solution for a long time, at least embedded area(So many android phone already have used it) so I think improving it makes sense to me rather than inventing new wheel.

Yes, nobody throws Android lowmemory killer away. And recently I fixed
a bunch of issues in its tasks traversing and killing code. Now it's
just time to "fix" statistics gathering and interpretation issues,
and I see vmevent as a good way to do just that, and then we
can either turn Android lowmemory killer driver to use the vmevent
in-kernel API (so it will become just a "glue" between notifications
and killing functions), or use userland daemon.

Note that memcg has notifications as well, so it's another proof that
there is a demand for this stuff outside of embedded world, and going
with ad-hoc, custom "low memory killer" is simple and tempting approach,
but it doesn't solve any real problems.

> Frankly speaking, I don't know vmevent's other use cases except low memory notification 

I won't speak for realistic use-cases, but that is what comes to
mind:

- DB can grow its caches/in-memory indexes infinitely, and start dropping
  them on demand (based on internal LRU list, for example). No more
  guessed/static configuration for DB daemons?
- Assuming VPS hosting w/ dynamic resources management, notifications
  would be useful to readjust resources?
- On desktops, apps can drop their caches on demand if they want to
  and can avoid swap activity?

Thanks,
Minchan Kim June 4, 2012, 12:17 p.m. UTC | #30
On Mon, Jun 04, 2012 at 04:38:12AM -0700, Anton Vorontsov wrote:
> On Mon, Jun 04, 2012 at 05:45:06PM +0900, Minchan Kim wrote:
> [...]
> > AFAIK, low memory notifier is started for replacing android lowmemory killer.
> > At the same time, other folks want use it generally.
> > As I look through android low memory killer, it's not too bad except some point.
> > 
> > 1. It should not depend on shrink_slab. If we need, we can add some hook in vmscan.c directly instead of shrink_slab.
> > 2. We can use out_of_memory instead of custom victim selection/killing function. If we need,
> >    we can change out_of_memory interface little bit for passing needed information to select victim.
> > 3. calculation for available pages
> > 
> > 1) and 2) would make android low memory killer very general and 3) can meet each folk's requirement, I believe.
> > 
> > Anton, I expect you already investigated android low memory killer so maybe you know pros and cons of each solution.
> > Could you convince us "why we need vmevent" and "why can't android LMK do it?"
> 
> Note that 1) and 2) are not problems per se, it's just implementation
> details, easy stuff. Vmevent is basically an ABI/API, and I didn't

My opinion is different.
Things that depend only on vmstat have limitations, IMHO.
From that point of view, 1) is very important. If we don't do 1), the low memory notifier must depend on vmstat only.
But if we do 1), we can add some hooks in vmscan.c so we can control the notifier more exactly and in a fine-grained way.

As stupid example, if get_scan_count get the anon pages and it is about to reclaim inactive anon list,
it's a kind of signal anon pages swap out so that we can notify it to qemu/kvm which will ballon.

Other example, if we get the high priority of scanning (ex, DEF_PRIORITY - 2) and get scanned value 
greater than lru size, it is sort of high memory pressure.

My point is that if we want level notifier approach, we need more tightly VM-coupled thing

> hear anybody who would object to vmevent ABI idea itself. More than
> this, nobody stop us from implementing in-kernel vmevent API, and
> make Android Lowmemory killer use it, if we want to.

I guess other guys didn't have a interest or very busy.
In this chance, I hope all guys have a consensus.
At least, we need Andrew, Rik, David and KOSAKI's opinion.

> 
> The real problem is not with vmevent. Today there are two real problems:
> 
> a) Gathering proper statistics from the kernel. Both cgroups and vmstat
>    have issues. Android lowmemory killer has the same problems w/ the
>    statistics as vmevent, it uses vmstat, so by no means Android
>    low memory killer is better or easier in this regard.

Right.

>    (And cgroups has issues w/ slab accounting, plus some folks don't
>    want memcg at all, since it has runtime and memory-wise costs.)

As I mentioned earlier, we need more VM-tightly coupled policy, NOT vmstat.

> 
> b) Interpreting this statistics. We can't provide one, universal
>    "low memory" definition that would work for everybody.
>    (Btw, your "levels" based low memory grading actually sounds
>    the same as mine RECLAIMABLE_CACHE_PAGES and
>    RECLAIMABLE_CACHE_PAGES_NOIO idea, i.e.
>    http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/02751.html
>    so personally I like the idea of level-based approach, based
>    on available memory *cost*.)

Yes. I like such abstraction like my suggestion.
For new comer's information, Quote from my previoius mail
"
The idea is that we can make some levels in advane and explain it to user

Level 1: It a immediate response to user when kernel decide there are not fast-reclaimable pages any more.
Level 2: It's rather slower response than level 1 but kernel will consider it as reclaimable target
Level 3: It's slowest response because kernel will consider page needed long time to reclaim as reclaimable target.

It doesn't expose any internal of kernel and can implment it in internal.
For simple example,

Level 1: non-mapped clean page
Level 2: Level 1 + mapped clean-page
Level 3: Level 2 + dirty pages

So users of vmevent_fd can select his level.
Of course, latency sensitive application with slow stoarge would select Level 1.
Some application might use Level 4(Level 3 + half of swap) because it has very fast storage.

And application receives event can make strategy folloing as.

When it receives level 1 notification, it could request to others if it can release their own buffers.
When it receives level 2 notification, it could request to suicide if it's not critical application.
When it receives level 3 notification, it could kill others.

It's just an example, and my point is that we should take storage speed into account to make it general.
"
> 
> So, you see, all these issues are valid for vmevent, cgroups and
> android low memory killer.

Agree.

> 
> > KOSAKI, AFAIRC, you are a person who hates android low memory killer.
> > Why do you hate it? If it solve problems I mentioned, do you have a concern, still?
> > If so, please, list up.
> > 
> > Android low memory killer is proved solution for a long time, at least embedded area(So many android phone already have used it) so I think improving it makes sense to me rather than inventing new wheel.
> 
> Yes, nobody throws Android lowmemory killer away. And recently I fixed
> a bunch of issues in its tasks traversing and killing code. Now it's
> just time to "fix" statistics gathering and interpretation issues,
> and I see vmevent as a good way to do just that, and then we
> can either turn Android lowmemory killer driver to use the vmevent
> in-kernel API (so it will become just a "glue" between notifications
> and killing functions), or use userland daemon.

If you can define such level by only vmstat, I am OKAY and want to see
how to glue vmevent and android LMK.
But keep in mind about killing. killer should be a kernel, not user.
https://lkml.org/lkml/2011/12/19/330

> 
> Note that memcg has notifications as well, so it's another proof that
> there is a demand for this stuff outside of embedded world, and going
> with ad-hoc, custom "low memory killer" is simple and tempting approach,
> but it doesn't solve any real problems.
> 
> > Frankly speaking, I don't know vmevent's other use cases except low memory notification 
> 
> I won't speak for realistic use-cases, but that is what comes to
> mind:
> 
> - DB can grow its caches/in-memory indexes infinitely, and start dropping
>   them on demand (based on internal LRU list, for example). No more
>   guessed/static configuration for DB daemons?
> - Assuming VPS hosting w/ dynamic resources management, notifications
>   would be useful to readjust resources?
> - On desktops, apps can drop their caches on demand if they want to
>   and can avoid swap activity?

I don't mean VMEVENT_ATTR_LOWMEM_PAGES but following as,

VMEVENT_ATTR_NR_FREE_PAGES
VMEVENT_ATTR_NR_SWAP_PAGES
VMEVENT_ATTR_NR_AVAIL_PAGES

I'm not sure how it is useful.
Even VMEVENT_ATTR_LOWMEM_PAGES, I'm not sure it would be useful with only vmstat.

> 
> Thanks,
> 
> -- 
> Anton Vorontsov
> Email: cbouatmailru@gmail.com
Minchan Kim June 4, 2012, 12:23 p.m. UTC | #31
On Mon, Jun 04, 2012 at 12:20:18PM +0300, Pekka Enberg wrote:
> On Mon, Jun 4, 2012 at 11:45 AM, Minchan Kim <minchan@kernel.org> wrote:
> > KOSAKI, AFAIRC, you are a person who hates android low memory killer.
> > Why do you hate it? If it solve problems I mentioned, do you have a concern, still?
> > If so, please, list up.
> >
> > Android low memory killer is proved solution for a long time, at least embedded
> > area(So many android phone already have used it) so I think improving it makes
> > sense to me rather than inventing new wheel.
> 
> VM events started out as *ABI cleanup* of Nokia's N9 Linux lowmem
> notifier. That's not reinventing the wheel.
> 
> On Mon, Jun 4, 2012 at 11:45 AM, Minchan Kim <minchan@kernel.org> wrote:
> > Frankly speaking, I don't know vmevent's other use cases except low memory
> > notification and didn't see any agreement about that with other guys.
> 
> I think you are missing the point. "vmevent" is an ABI for delivering
> VM events to userspace. I started it because different userspaces do
> not agree what "low memory" means - for obvious reasons.

The part I dislike about vmevent is that it exposes raw vmstat itself.

VMEVENT_ATTR_NR_SWAP_PAGES
VMEVENT_ATTR_NR_FREE_PAGES
VMEVENT_ATTR_NR_AVAIL_PAGES

And some calculation based on raw vmstat.
We need more abstraction based on vmscan heuristic.

> 
> As for use cases, it'd be useful for VMs to be notified of "about to
> swap your pages soon" so that they can aggressively GC before entering

How do you detect it? It's a policy which is most important part on vmevent.

> GC-swapstorm hell. I also hear that something similar would be useful
> for KVM/QEMU but I don't know the details.
> 
> I really don't see how Android's "low memory killer" will be useful as
> a generic solution.

Yes. we can't do it by current android LMK.
I'm not big fan of androild LMK but we can improve it much by my suggestions
and yours smart ideas, I believe.

> 
>                         Pekka
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
Anton Vorontsov June 4, 2012, 1:35 p.m. UTC | #32
On Mon, Jun 04, 2012 at 09:17:22PM +0900, Minchan Kim wrote:
[...]
> But keep in mind about killing. killer should be a kernel, not user.
> https://lkml.org/lkml/2011/12/19/330

There you're discussing out-of-memory condition vs. low memory
situation, and I don't see LMK as a substitution for the OOMK,
or vice versa.

LMK triggers when we have plenty of free memory (and thus time);
and OOMK is a last resort thing: it is super-fast, as the kernel
may have literally nothing to do but start killing tasks.

Sure, if LMK won't react fast and low-memory turns into "out
of memory", then OOMK will "help", which is totally fine. And
it is no different from current in-kernel Android low memory
killer (which gives no guarantees that OOMK will not trigger,
it is still possible to make OOM condition, even with LMK
enabled).

Anyway, personally I don't mind if LMK would live in the kernel,
but I don't see it as a mandatory thing (unlike OOMK, which is
kernel-only thing, of course).

> > > Frankly speaking, I don't know vmevent's other use cases except low memory notification 
> > 
> > I won't speak for realistic use-cases, but that is what comes to
> > mind:
> > 
> > - DB can grow its caches/in-memory indexes infinitely, and start dropping
> >   them on demand (based on internal LRU list, for example). No more
> >   guessed/static configuration for DB daemons?
> > - Assuming VPS hosting w/ dynamic resources management, notifications
> >   would be useful to readjust resources?
> > - On desktops, apps can drop their caches on demand if they want to
> >   and can avoid swap activity?
> 
> I don't mean VMEVENT_ATTR_LOWMEM_PAGES but following as,
> 
> VMEVENT_ATTR_NR_FREE_PAGES
> VMEVENT_ATTR_NR_SWAP_PAGES
> VMEVENT_ATTR_NR_AVAIL_PAGES
> 
> I'm not sure how it is useful.

Yep, these raw values are mostly useless, and personally I don't use
these plain attributes. I use heuristics, i.e. "blended" attributes.
If we can come up with levels-based approach w/o using vmstat, well..
OK then.

But note that we actually want to know how much "free", "cheap to
reclaim" and "can reclaim, but at cost" pages there are. It should
not be just "ouch, we're out of cheap pages" signal, since the whole
point of Android low memory killer is to act in advance, its idea
is to try to free memory before the condition happens, and thus make
the phone UI more smooth. And the threshold is actually load/HW
specific, so I guess folks want to tune it (e.g. "trigger when
there are XX MB of 'cheap to reclaim' pages left").

Thanks,
KOSAKI Motohiro June 4, 2012, 7:50 p.m. UTC | #33
> KOSAKI, AFAIRC, you are a person who hates android low memory killer.
> Why do you hate it? If it solve problems I mentioned, do you have a concern, still?
> If so, please, list up.

1) it took tasklist_lock. (it was solved already)
2) hacky lowmem_deathpending_timeout
3) No ZONE awareness. global_page_state(), lowmem_minfree[] and shrink_slab interface
    don't realize real memory footprint.
4) No memcg, cpuset awareness.
5) strange score calculation. the score is calculated from anon_rss+file_rss,
    but oom killer only free anon_rss.
6) strange oom_score_adj overload
7) much duplicate code w/ oom_killer. we can make common helper function, I think.
8) John's fallocate(VOLATILE) is more cleaner.
KOSAKI Motohiro June 4, 2012, 8:05 p.m. UTC | #34
>> Anton, I expect you already investigated android low memory killer so maybe you know pros and cons of each solution.
>> Could you convince us "why we need vmevent" and "why can't android LMK do it?"
>
> Note that 1) and 2) are not problems per se, it's just implementation
> details, easy stuff. Vmevent is basically an ABI/API, and I didn't
> hear anybody who would object to vmevent ABI idea itself. More than
> this, nobody stop us from implementing in-kernel vmevent API, and
> make Android Lowmemory killer use it, if we want to.

I never agreed with the "it's merely an ABI" discussion. As long as the implementation is ugly,
I will never agree to the ABI, even if the syscall interface is very clean.


> The real problem is not with vmevent. Today there are two real problems:
>
> a) Gathering proper statistics from the kernel. Both cgroups and vmstat
>     have issues. Android lowmemory killer has the same problems w/ the
>     statistics as vmevent, it uses vmstat, so by no means Android
>     low memory killer is better or easier in this regard.
>     (And cgroups has issues w/ slab accounting, plus some folks don't
>     want memcg at all, since it has runtime and memory-wise costs.)

Right. android lowmemory killer is also buggy.


> b) Interpreting this statistics. We can't provide one, universal
>     "low memory" definition that would work for everybody.
>     (Btw, your "levels" based low memory grading actually sounds
>     the same as mine RECLAIMABLE_CACHE_PAGES and
>     RECLAIMABLE_CACHE_PAGES_NOIO idea, i.e.
>     http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/02751.html
>     so personally I like the idea of level-based approach, based
>     on available memory *cost*.)
>
> So, you see, all these issues are valid for vmevent, cgroups and
> android low memory killer.
>
>> KOSAKI, AFAIRC, you are a person who hates android low memory killer.
>> Why do you hate it? If it solve problems I mentioned, do you have a concern, still?
>> If so, please, list up.
>>
>> Android low memory killer is proved solution for a long time, at least embedded area(So many android phone already have used it) so I think improving it makes sense to me rather than inventing new wheel.
>
> Yes, nobody throws Android lowmemory killer away. And recently I fixed
> a bunch of issues in its tasks traversing and killing code. Now it's
> just time to "fix" statistics gathering and interpretation issues,
> and I see vmevent as a good way to do just that, and then we
> can either turn Android lowmemory killer driver to use the vmevent
> in-kernel API (so it will become just a "glue" between notifications
> and killing functions), or use userland daemon.

Huh? No? android lowmem killer is a "killer". it doesn't make any notification,
it only kill memory hogging process. I don't think we can merge them.



> Note that memcg has notifications as well, so it's another proof that
> there is a demand for this stuff outside of embedded world, and going
> with ad-hoc, custom "low memory killer" is simple and tempting approach,
> but it doesn't solve any real problems.

Wrong.
memcg notification notify the event to _another_ mem cgroup's process. Then, it can
avoid a notified process can't work well by swap thrashing. Its feature only share a
weakness of vmevent api.



>> Frankly speaking, I don't know vmevent's other use cases except low memory notification
>
> I won't speak for realistic use-cases, but that is what comes to
> mind:
>
> - DB can grow its caches/in-memory indexes infinitely, and start dropping
>    them on demand (based on internal LRU list, for example). No more
>    guessed/static configuration for DB daemons?

They uses direct-io for fine grained cache control.


> - Assuming VPS hosting w/ dynamic resources management, notifications
>    would be useful to readjust resources?

How do they readjust? Now kvm/xen use balloon driver for dynamic resource
adjustment. AND it work more fine than vmevent because it doesn't route
userspace.


> - On desktops, apps can drop their caches on demand if they want to
>    and can avoid swap activity?

In this case, fallocate(VOLATILE) is work more better.
Anton Vorontsov June 4, 2012, 10:39 p.m. UTC | #35
On Mon, Jun 04, 2012 at 04:05:05PM -0400, KOSAKI Motohiro wrote:
[...]
> >Yes, nobody throws Android lowmemory killer away. And recently I fixed
> >a bunch of issues in its tasks traversing and killing code. Now it's
> >just time to "fix" statistics gathering and interpretation issues,
> >and I see vmevent as a good way to do just that, and then we
> >can either turn Android lowmemory killer driver to use the vmevent
> >in-kernel API (so it will become just a "glue" between notifications
> >and killing functions), or use userland daemon.
> 
> Huh? No? android lowmem killer is a "killer". it doesn't make any notification,
> it only kill memory hogging process. I don't think we can merge them.

KOSAKI, you don't read what I write. I didn't ever say that low memory
killer makes any notifications, that's not what I was saying. I said
that once we'll have a good "low memory" notification mechanism (e.g.
vmevent), Android low memory killer would just use this mechanism. Be
it userland notifications or in-kernel, doesn't matter much.
Pekka Enberg June 5, 2012, 7:47 a.m. UTC | #36
On Mon, Jun 4, 2012 at 11:05 PM, KOSAKI Motohiro
<kosaki.motohiro@gmail.com> wrote:
>> Note that 1) and 2) are not problems per se, it's just implementation
>> details, easy stuff. Vmevent is basically an ABI/API, and I didn't
>> hear anybody who would object to vmevent ABI idea itself. More than
>> this, nobody stop us from implementing in-kernel vmevent API, and
>> make Android Lowmemory killer use it, if we want to.
>
> I never agree "it's mere ABI" discussion. Until the implementation is ugly,
> I never agree the ABI even if syscall interface is very clean.

I don't know what discussion you are talking about.

I also don't agree that something should be merged just because the
ABI is clean. The implementation must also make sense. I don't see how
we disagree here at all.
Pekka Enberg June 5, 2012, 7:52 a.m. UTC | #37
On Mon, Jun 4, 2012 at 11:05 PM, KOSAKI Motohiro
<kosaki.motohiro@gmail.com> wrote:
>> - On desktops, apps can drop their caches on demand if they want to
>>   and can avoid swap activity?
>
> In this case, fallocate(VOLATILE) is work more better.

For some cases, yes, but probably not for all.

For example, if userspace doesn't know about "about to swap real soon"
condition, it can continue to grow its caches making
fallocate(VOLATILE) pretty much useless.
Pekka Enberg June 5, 2012, 7:53 a.m. UTC | #38
On Mon, Jun 4, 2012 at 4:35 PM, Anton Vorontsov <cbouatmailru@gmail.com> wrote:
>> I don't mean VMEVENT_ATTR_LOWMEM_PAGES but following as,
>>
>> VMEVENT_ATTR_NR_FREE_PAGES
>> VMEVENT_ATTR_NR_SWAP_PAGES
>> VMEVENT_ATTR_NR_AVAIL_PAGES
>>
>> I'm not sure how it is useful.
>
> Yep, these raw values are mostly useless, and personally I don't use
> these plain attributes. I use heuristics, i.e. "blended" attributes.
> If we can come up with levels-based approach w/o using vmstat, well..
> OK then.

That's what Nokia's lowmem notifier uses. We can probably drop them
once we have something else they could use.
Minchan Kim June 5, 2012, 8 a.m. UTC | #39
Hi Peakk,

On 06/05/2012 04:53 PM, Pekka Enberg wrote:

> On Mon, Jun 4, 2012 at 4:35 PM, Anton Vorontsov <cbouatmailru@gmail.com> wrote:
>>> I don't mean VMEVENT_ATTR_LOWMEM_PAGES but following as,
>>>
>>> VMEVENT_ATTR_NR_FREE_PAGES
>>> VMEVENT_ATTR_NR_SWAP_PAGES
>>> VMEVENT_ATTR_NR_AVAIL_PAGES
>>>
>>> I'm not sure how it is useful.
>>
>> Yep, these raw values are mostly useless, and personally I don't use
>> these plain attributes. I use heuristics, i.e. "blended" attributes.
>> If we can come up with levels-based approach w/o using vmstat, well..
>> OK then.
> 
> That's what Nokia's lowmem notifier uses. We can probably drop them
> once we have something else they could use.


Next concern is that periodic timer of implementation.
I think it would add direct hook in vmscan.c rather than peeking raw vmstat periodically by timer
so we can control more fine-grained way without unnecessary overhead.
Pekka Enberg June 5, 2012, 8:01 a.m. UTC | #40
On Tue, Jun 5, 2012 at 11:00 AM, Minchan Kim <minchan@kernel.org> wrote:
>> That's what Nokia's lowmem notifier uses. We can probably drop them
>> once we have something else they could use.
>
> Next concern is that periodic timer of implementation.
> I think it would add direct hook in vmscan.c rather than peeking raw vmstat periodically by timer
> so we can control more fine-grained way without unnecessary overhead.

If the hooks are clean and it doesn't hurt the  !CONFIG_VMEVENT case,
I'm completely OK with that.
leonid.moiseichuk@nokia.com June 5, 2012, 8:16 a.m. UTC | #41
> -----Original Message-----
> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
> Pekka Enberg
> Sent: 05 June, 2012 11:02
> To: Minchan Kim
...
> > Next concern is that periodic timer of implementation.
> > I think it would add direct hook in vmscan.c rather than peeking raw
> > vmstat periodically by timer so we can control more fine-grained way
> without unnecessary overhead.
> 
> If the hooks are clean and it doesn't hurt the  !CONFIG_VMEVENT case, I'm
> completely OK with that.

On the previous iteration hooking vm was pointed as very bad idea, so in my version I installed shrinker to handle cases when we have memory pressure.
Using deferred timer with adequate timeout (0.250 ms or larger) fully suitable for userspace and produce adequate overhead 
-> by nature such API should not be 100% accurate, anyhow applications cannot handle situation as good as kernel can provide, 0.5MB space accuracy, 100ms is maximum user-space require for 64-1024MB devices.
Minchan Kim June 5, 2012, 8:27 a.m. UTC | #42
On 06/05/2012 05:16 PM, leonid.moiseichuk@nokia.com wrote:

>> -----Original Message-----
>> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
>> Pekka Enberg
>> Sent: 05 June, 2012 11:02
>> To: Minchan Kim
> ...
>>> Next concern is that periodic timer of implementation.
>>> I think it would add direct hook in vmscan.c rather than peeking raw
>>> vmstat periodically by timer so we can control more fine-grained way
>> without unnecessary overhead.
>>
>> If the hooks are clean and it doesn't hurt the  !CONFIG_VMEVENT case, I'm
>> completely OK with that.
> 
> On the previous iteration hooking vm was pointed as very bad idea, so in my version I installed shrinker to handle cases when we have memory pressure.
> Using deferred timer with adequate timeout (0.250 ms or larger) fully suitable for userspace and produce adequate overhead 
> -> by nature such API should not be 100% accurate, anyhow applications cannot handle situation as good as kernel can provide, 0.5MB space accuracy, 100ms is maximum user-space require for 64-1024MB devices.
> 


I didn't follow previous iteration you mentioned so I don't know the history.
I think it's not a good idea if LMN (low memory notifier) is needed only by the embedded world.
Maybe in that case, we might control it enough by only vmstat events but now we know many folks want it
so we are trying to make it general.
IMHO, for meeting various requirement, vmstat raw event isn't enough so we need direct hook in vmscan
and should abstract it to some levels.
Of course, VM guys should maintain it to work best as the VM algorithms change.
Anton Vorontsov June 5, 2012, 8:39 a.m. UTC | #43
On Tue, Jun 05, 2012 at 10:47:18AM +0300, Pekka Enberg wrote:
> On Mon, Jun 4, 2012 at 11:05 PM, KOSAKI Motohiro
> <kosaki.motohiro@gmail.com> wrote:
> >> Note that 1) and 2) are not problems per se, it's just implementation
> >> details, easy stuff. Vmevent is basically an ABI/API, and I didn't
> >> hear anybody who would object to vmevent ABI idea itself. More than
> >> this, nobody stop us from implementing in-kernel vmevent API, and
> >> make Android Lowmemory killer use it, if we want to.
> >
> > I never agree "it's mere ABI" discussion. Until the implementation is ugly,
> > I never agree the ABI even if syscall interface is very clean.
> 
> I don't know what discussion you are talking about.
> 
> I also don't agree that something should be merged just because the
> ABI is clean. The implementation must also make sense. I don't see how
> we disagree here at all.

BTW, I wasn't implying that vmevent should be merged just because
it is a clean ABI, and I wasn't implying that it is clean, and I
didn't propose to merge it at all. :-)

I just don't see any point in trying to scrap vmevent in favour of
Android low memory killer. This makes no sense at all, since today
vmevent is more useful than Android's solution. For vmevent we have
contributors from Nokia, Samsung, and of course Linaro, plus we
have an userland killer daemon* for Android (which can work with
both cgroups and vmevent backends). So vmevent is more generic
already.

To me it would make more sense if mm guys would tell us "scrap
this all, just use cgroups and its notifications; fix cgroups'
slab accounting and be happy". Well, I'd understand that.

Anyway, we all know that vmevent is 'work in progress', so nobody
tries to push it, nobody asks to merge it. So far we're just
discussing any possible solutions, and vmevent is a good
playground.


So, question to Minchan. Do you have anything particular in mind
regarding how the vmstat hooks should look like? And how all this
would connect with cgroups, since KOSAKI wants to see it cgroups-
aware...

p.s. http://git.infradead.org/users/cbou/ulmkd.git
     I haven't updated it for new vmevent changes, but still,
     its idea should be clear enough.
Christoph Lameter (Ampere) June 5, 2012, 2:40 p.m. UTC | #44
On Tue, 8 May 2012, KOSAKI Motohiro wrote:

> 4) Currently, vmstat have per-cpu batch and vmstat updating makes 3
> second delay at maximum.

Nope. The delay is one second only and it is limited to a certain amount
per cpu. There is a bound on the inaccuracy of the counter. If you want to
have them more accurate then the right approach is to limit the
threshhold. The more accurate the higher the impact of cache line
bouncing.
KOSAKI Motohiro June 8, 2012, 3:35 a.m. UTC | #45
(6/5/12 4:16 AM), leonid.moiseichuk@nokia.com wrote:
>> -----Original Message-----
>> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
>> Pekka Enberg
>> Sent: 05 June, 2012 11:02
>> To: Minchan Kim
> ...
>>> Next concern is that periodic timer of implementation.
>>> I think it would add direct hook in vmscan.c rather than peeking raw
>>> vmstat periodically by timer so we can control more fine-grained way
>> without unnecessary overhead.
>>
>> If the hooks are clean and it doesn't hurt the  !CONFIG_VMEVENT case, I'm
>> completely OK with that.
>
> On the previous iteration hooking vm was pointed as very bad idea, so in my version I installed shrinker to handle cases when we have memory pressure.
> Using deferred timer with adequate timeout (0.250 ms or larger) fully suitable for userspace and produce adequate overhead
> ->  by nature such API should not be 100% accurate, anyhow applications cannot handle situation as good as kernel can provide, 0.5MB space accuracy, 100ms is maximum user-space require for 64-1024MB devices.

I believe that's a bad idea. In fact, an "adequate" timeout depends on the hardware, not on application performance tendencies. Thus, applications can't know the "adequate" value.
KOSAKI Motohiro June 8, 2012, 3:45 a.m. UTC | #46
(6/4/12 6:39 PM), Anton Vorontsov wrote:
> On Mon, Jun 04, 2012 at 04:05:05PM -0400, KOSAKI Motohiro wrote:
> [...]
>>> Yes, nobody throws Android lowmemory killer away. And recently I fixed
>>> a bunch of issues in its tasks traversing and killing code. Now it's
>>> just time to "fix" statistics gathering and interpretation issues,
>>> and I see vmevent as a good way to do just that, and then we
>>> can either turn Android lowmemory killer driver to use the vmevent
>>> in-kernel API (so it will become just a "glue" between notifications
>>> and killing functions), or use userland daemon.
>>
>> Huh? No? android lowmem killer is a "killer". it doesn't make any notification,
>> it only kill memory hogging process. I don't think we can merge them.
>
> KOSAKI, you don't read what I write. I didn't ever say that low memory
> killer makes any notifications, that's not what I was saying. I said
> that once we'll have a good "low memory" notification mechanism (e.g.
> vmevent), Android low memory killer would just use this mechanism. Be
> it userland notifications or in-kernel, doesn't matter much.

I don't disagree this. But this was not my point. I have to note why
current android killer is NOT notification.

In fact, notification is a mere notification. There is no guarantee to
success to kill. There are various reason to fail to kill. e.g. 1) Any
shrinking activity need more memory. (that's the reason why now we only
have memcg oom notification) 2) userland memory returning activity is not
atomic nor fast. kernel might find another memory shortage before finishing
memory returning. 3) system thrashing may bring userland process stucking
4) ... and userland bugs.

So, big design choice here. 1) vmevent is a just notification. it can't guarantee
to prevent oom. or 2) to implement some trick (e.g. reserved memory for vmevent
processes, kernel activity blocking until finish memory returing, etc)
KOSAKI Motohiro June 8, 2012, 3:55 a.m. UTC | #47
(6/5/12 3:52 AM), Pekka Enberg wrote:
> On Mon, Jun 4, 2012 at 11:05 PM, KOSAKI Motohiro
> <kosaki.motohiro@gmail.com>  wrote:
>>> - On desktops, apps can drop their caches on demand if they want to
>>>    and can avoid swap activity?
>>
>> In this case, fallocate(VOLATILE) is work more better.
>
> For some cases, yes, but probably not for all.
>
> For example, if userspace doesn't know about "about to swap real soon"
> condition, it can continue to grow its caches making
> fallocate(VOLATILE) pretty much useless.

Fair enough. But, Please consider your scenario don't need vmevent(2),
cat /proc/meminfo works enough. Only dropping activity need a quick work
and just in time notification. cache growing limitation don't need.
Pekka Enberg June 8, 2012, 6:54 a.m. UTC | #48
On Fri, Jun 8, 2012 at 6:55 AM, KOSAKI Motohiro
<kosaki.motohiro@gmail.com> wrote:
>> For some cases, yes, but probably not for all.
>>
>> For example, if userspace doesn't know about "about to swap real soon"
>> condition, it can continue to grow its caches making
>> fallocate(VOLATILE) pretty much useless.
>
> Fair enough. But, Please consider your scenario don't need vmevent(2),
> cat /proc/meminfo works enough. Only dropping activity need a quick work
> and just in time notification. cache growing limitation don't need.

Why do you keep bringing this up? Yes, you might get away with polling
/proc/meminfo or /proc/vmstat for some specific cases. It does not
make it a proper generic solution to the problem "vmevent" ABI tries
to address.
Pekka Enberg June 8, 2012, 6:57 a.m. UTC | #49
On Fri, Jun 8, 2012 at 6:45 AM, KOSAKI Motohiro
<kosaki.motohiro@gmail.com> wrote:
> So, big design choice here. 1) vmevent is a just notification. it can't
> guarantee to prevent oom. or 2) to implement some trick (e.g. reserved
> memory for vmevent processes, kernel activity blocking until finish
> memory returing, etc)

Yes, vmevent is only for notification and will not *guarantee* OOM
prevention. It simply tries to provide hints early enough to the
userspace to so that OOM can be avoided if possible.
KOSAKI Motohiro June 8, 2012, 6:57 a.m. UTC | #50
(6/8/12 2:54 AM), Pekka Enberg wrote:
> On Fri, Jun 8, 2012 at 6:55 AM, KOSAKI Motohiro
> <kosaki.motohiro@gmail.com>  wrote:
>>> For some cases, yes, but probably not for all.
>>>
>>> For example, if userspace doesn't know about "about to swap real soon"
>>> condition, it can continue to grow its caches making
>>> fallocate(VOLATILE) pretty much useless.
>>
>> Fair enough. But, Please consider your scenario don't need vmevent(2),
>> cat /proc/meminfo works enough. Only dropping activity need a quick work
>> and just in time notification. cache growing limitation don't need.
>
> Why do you keep bringing this up? Yes, you might get away with polling
> /proc/meminfo or /proc/vmstat for some specific cases. It does not
> make it a proper generic solution to the problem "vmevent" ABI tries
> to address.

Guys, current vmevent _is_ a polling interface. It only is wrapped kernel
timer.
Pekka Enberg June 8, 2012, 6:59 a.m. UTC | #51
On Fri, Jun 8, 2012 at 9:57 AM, KOSAKI Motohiro
<kosaki.motohiro@gmail.com> wrote:
> Guys, current vmevent _is_ a polling interface. It only is wrapped kernel
> timer.

Current implementation is like that but we don't intend to keep it
that way. That's why having a separate ABI is so important.
diff mbox

Patch

diff --git a/include/linux/vmevent.h b/include/linux/vmevent.h
index aae0d24..9bfa244 100644
--- a/include/linux/vmevent.h
+++ b/include/linux/vmevent.h
@@ -10,6 +10,13 @@  enum {
 	VMEVENT_ATTR_NR_AVAIL_PAGES	= 1UL,
 	VMEVENT_ATTR_NR_FREE_PAGES	= 2UL,
 	VMEVENT_ATTR_NR_SWAP_PAGES	= 3UL,
+	/*
+	 * This is specially blended attribute, the event triggers
+	 * when kernel decides that we're close to the low memory threshold.
+	 * Don't expect very precise meaning of low memory situation, mostly,
+	 * it's just a guess on the kernel's side.
+	 */
+	VMEVENT_ATTR_LOWMEM_PAGES	= 4UL,
 
 	VMEVENT_ATTR_MAX		/* non-ABI */
 };
diff --git a/mm/vmevent.c b/mm/vmevent.c
index b312236..d278a25 100644
--- a/mm/vmevent.c
+++ b/mm/vmevent.c
@@ -68,10 +68,50 @@  static u64 vmevent_attr_avail_pages(struct vmevent_watch *watch,
 	return totalram_pages;
 }
 
+/*
+ * Here's some implementation details for the "low memory" meaning.
+ *
+ * (The explanation is not in the header file as userland should not
+ * know these details, nor it should assume that the meaning will
+ * always be the same. As well as it should not know how exactly kernel
+ * manages the memory, or assume that memory management behaviour is a
+ * part of the "ABI". So, all the 'low memory' is just guessing, but
+ * we're trying to do our best.)
+ *
+ * For this implementation, we assume that there's a low memory situation
+ * for the N pages threshold when we have neither N pages of completely
+ * free pages, nor we have N reclaimable pages in the cache. This
+ * effectively means, that if userland expects to allocate N pages, it
+ * would consume all the free pages, and any further allocations (above
+ * N) would start draining caches.
+ *
+ * In the worst case, prior to hitting the threshold, we might have only
+ * N pages in cache, and nearly no memory as free pages.
+ */
+static u64 vmevent_attr_lowmem_pages(struct vmevent_watch *watch,
+				     struct vmevent_attr *attr)
+{
+	int free = global_page_state(NR_FREE_PAGES);
+	int file = global_page_state(NR_FILE_PAGES) -
+		   global_page_state(NR_SHMEM); /* TODO: account locked pages */
+	int val = attr->value;
+
+	/*
+	 * For convenience we return 0 or attr value (instead of 0/1), it
+	 * makes it easier for vmevent_match() to cope with blended
+	 * attributes, plus userland might use the value to find out which
+	 * threshold triggered.
+	 */
+	if (free < val && file < val)
+		return val;
+	return 0;
+}
+
 static vmevent_attr_sample_fn attr_samplers[] = {
 	[VMEVENT_ATTR_NR_AVAIL_PAGES]   = vmevent_attr_avail_pages,
 	[VMEVENT_ATTR_NR_FREE_PAGES]    = vmevent_attr_free_pages,
 	[VMEVENT_ATTR_NR_SWAP_PAGES]    = vmevent_attr_swap_pages,
+	[VMEVENT_ATTR_LOWMEM_PAGES]     = vmevent_attr_lowmem_pages,
 };
 
 static u64 vmevent_sample_attr(struct vmevent_watch *watch, struct vmevent_attr *attr)
diff --git a/tools/testing/vmevent/vmevent-test.c b/tools/testing/vmevent/vmevent-test.c
index fd9a174..c61aed7 100644
--- a/tools/testing/vmevent/vmevent-test.c
+++ b/tools/testing/vmevent/vmevent-test.c
@@ -33,7 +33,7 @@  int main(int argc, char *argv[])
 
 	config = (struct vmevent_config) {
 		.sample_period_ns	= 1000000000L,
-		.counter		= 6,
+		.counter		= 7,
 		.attrs			= {
 			{
 				.type	= VMEVENT_ATTR_NR_FREE_PAGES,
@@ -59,6 +59,13 @@  int main(int argc, char *argv[])
 				.type	= VMEVENT_ATTR_NR_SWAP_PAGES,
 			},
 			{
+				.type	= VMEVENT_ATTR_LOWMEM_PAGES,
+				.state	= VMEVENT_ATTR_STATE_VALUE_LT |
+					  VMEVENT_ATTR_STATE_VALUE_EQ |
+					  VMEVENT_ATTR_STATE_ONE_SHOT,
+				.value	= phys_pages / 2,
+			},
+			{
 				.type	= 0xffff, /* invalid */
 			},
 		},
@@ -108,6 +115,9 @@  int main(int argc, char *argv[])
 			case VMEVENT_ATTR_NR_SWAP_PAGES:
 				printf("  VMEVENT_ATTR_NR_SWAP_PAGES: %Lu\n", attr->value);
 				break;
+			case VMEVENT_ATTR_LOWMEM_PAGES:
+				printf("  VMEVENT_ATTR_LOWMEM_PAGES: %Lu\n", attr->value);
+				break;
 			default:
 				printf("  Unknown attribute: %Lu\n", attr->value);
 			}