[tip/core/rcu,07/23] rcu: Provide OOM handler to motivate lazy RCU callbacks

Message ID 1346350718-30937-7-git-send-email-paulmck@linux.vnet.ibm.com
State New
Headers show

Commit Message

Paul E. McKenney Aug. 30, 2012, 6:18 p.m.
From: "Paul E. McKenney" <paul.mckenney@linaro.org>

In kernels built with CONFIG_RCU_FAST_NO_HZ=y, CPUs can accumulate a
large number of lazy callbacks, which as the name implies will be slow
to be invoked.  This can be a problem on small-memory systems, where the
default 6-second sleep for CPUs having only lazy RCU callbacks could well
be fatal.  This commit therefore installs an OOM handler that ensures that
every CPU with lazy callbacks has at least one non-lazy callback,
in turn ensuring timely advancement for these callbacks.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Sasha Levin <levinsasha928@gmail.com>
---
 kernel/rcutree.h        |    5 ++-
 kernel/rcutree_plugin.h |   80 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 1 deletions(-)

Comments

Josh Triplett Sept. 2, 2012, 2:13 a.m. | #1
On Thu, Aug 30, 2012 at 11:18:22AM -0700, Paul E. McKenney wrote:
> From: "Paul E. McKenney" <paul.mckenney@linaro.org>
> 
> In kernels built with CONFIG_RCU_FAST_NO_HZ=y, CPUs can accumulate a
> large number of lazy callbacks, which as the name implies will be slow
> to be invoked.  This can be a problem on small-memory systems, where the
> default 6-second sleep for CPUs having only lazy RCU callbacks could well
> be fatal.  This commit therefore installs an OOM hander that ensures that
> every CPU with non-lazy callbacks has at least one non-lazy callback,
> in turn ensuring timely advancement for these callbacks.

Did you mean "every CPU with lazy callbacks" here?

> Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Tested-by: Sasha Levin <levinsasha928@gmail.com>

Reviewed-by: Josh Triplett <josh@joshtriplett.org>

>  kernel/rcutree.h        |    5 ++-
>  kernel/rcutree_plugin.h |   80 +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 84 insertions(+), 1 deletions(-)
> 
> diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> index 117a150..effb273 100644
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -315,8 +315,11 @@ struct rcu_data {
>  	unsigned long n_rp_need_fqs;
>  	unsigned long n_rp_need_nothing;
>  
> -	/* 6) _rcu_barrier() callback. */
> +	/* 6) _rcu_barrier() and OOM callbacks. */
>  	struct rcu_head barrier_head;
> +#ifdef CONFIG_RCU_FAST_NO_HZ
> +	struct rcu_head oom_head;
> +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
>  
>  	int cpu;
>  	struct rcu_state *rsp;
> diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> index 7f3244c..bac8cc1 100644
> --- a/kernel/rcutree_plugin.h
> +++ b/kernel/rcutree_plugin.h
> @@ -25,6 +25,7 @@
>   */
>  
>  #include <linux/delay.h>
> +#include <linux/oom.h>
>  
>  #define RCU_KTHREAD_PRIO 1
>  
> @@ -2112,6 +2113,85 @@ static void rcu_idle_count_callbacks_posted(void)
>  	__this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
>  }
>  
> +/*
> + * Data for flushing lazy RCU callbacks at OOM time.
> + */
> +static atomic_t oom_callback_count;
> +static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
> +
> +/*
> + * RCU OOM callback -- decrement the outstanding count and deliver the
> + * wake-up if we are the last one.
> + */
> +static void rcu_oom_callback(struct rcu_head *rhp)
> +{
> +	if (atomic_dec_and_test(&oom_callback_count))
> +		wake_up(&oom_callback_wq);
> +}
> +
> +/*
> + * Post an rcu_oom_notify callback on the current CPU if it has at
> + * least one lazy callback.  This will unnecessarily post callbacks
> + * to CPUs that already have a non-lazy callback at the end of their
> + * callback list, but this is an infrequent operation, so accept some
> + * extra overhead to keep things simple.
> + */
> +static void rcu_oom_notify_cpu(void *flavor)
> +{
> +	struct rcu_state *rsp = flavor;
> +	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
> +
> +	if (rdp->qlen_lazy != 0) {
> +		atomic_inc(&oom_callback_count);
> +		rsp->call(&rdp->oom_head, rcu_oom_callback);
> +	}
> +}
> +
> +/*
> + * If low on memory, ensure that each CPU has a non-lazy callback.
> + * This will wake up CPUs that have only lazy callbacks, in turn
> + * ensuring that they free up the corresponding memory in a timely manner.
> + */
> +static int rcu_oom_notify(struct notifier_block *self,
> +			  unsigned long notused, void *nfreed)
> +{
> +	int cpu;
> +	struct rcu_state *rsp;
> +
> +	/* Wait for callbacks from earlier instance to complete. */
> +	wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
> +
> +	/*
> +	 * Prevent premature wakeup: ensure that all increments happen
> +	 * before there is a chance of the counter reaching zero.
> +	 */
> +	atomic_set(&oom_callback_count, 1);
> +
> +	get_online_cpus();
> +	for_each_online_cpu(cpu)
> +		for_each_rcu_flavor(rsp)
> +			smp_call_function_single(cpu, rcu_oom_notify_cpu,
> +						 rsp, 1);
> +	put_online_cpus();
> +
> +	/* Unconditionally decrement: no need to wake ourselves up. */
> +	atomic_dec(&oom_callback_count);
> +
> +	*(unsigned long *)nfreed = 1;
> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block rcu_oom_nb = {
> +	.notifier_call = rcu_oom_notify
> +};
> +
> +static int __init rcu_register_oom_notifier(void)
> +{
> +	register_oom_notifier(&rcu_oom_nb);
> +	return 0;
> +}
> +early_initcall(rcu_register_oom_notifier);
> +
>  #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
>  
>  #ifdef CONFIG_RCU_CPU_STALL_INFO
> -- 
> 1.7.8
>
Lai Jiangshan Sept. 3, 2012, 9:08 a.m. | #2
On 08/31/2012 02:18 AM, Paul E. McKenney wrote:
> From: "Paul E. McKenney" <paul.mckenney@linaro.org>
> 
> In kernels built with CONFIG_RCU_FAST_NO_HZ=y, CPUs can accumulate a
> large number of lazy callbacks, which as the name implies will be slow
> to be invoked.  This can be a problem on small-memory systems, where the
> default 6-second sleep for CPUs having only lazy RCU callbacks could well
> be fatal.  This commit therefore installs an OOM hander that ensures that
> every CPU with non-lazy callbacks has at least one non-lazy callback,
> in turn ensuring timely advancement for these callbacks.
> 
> Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Tested-by: Sasha Levin <levinsasha928@gmail.com>
> ---
>  kernel/rcutree.h        |    5 ++-
>  kernel/rcutree_plugin.h |   80 +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 84 insertions(+), 1 deletions(-)
> 
> diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> index 117a150..effb273 100644
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -315,8 +315,11 @@ struct rcu_data {
>  	unsigned long n_rp_need_fqs;
>  	unsigned long n_rp_need_nothing;
>  
> -	/* 6) _rcu_barrier() callback. */
> +	/* 6) _rcu_barrier() and OOM callbacks. */
>  	struct rcu_head barrier_head;
> +#ifdef CONFIG_RCU_FAST_NO_HZ
> +	struct rcu_head oom_head;
> +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
>  
>  	int cpu;
>  	struct rcu_state *rsp;
> diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> index 7f3244c..bac8cc1 100644
> --- a/kernel/rcutree_plugin.h
> +++ b/kernel/rcutree_plugin.h
> @@ -25,6 +25,7 @@
>   */
>  
>  #include <linux/delay.h>
> +#include <linux/oom.h>
>  
>  #define RCU_KTHREAD_PRIO 1
>  
> @@ -2112,6 +2113,85 @@ static void rcu_idle_count_callbacks_posted(void)
>  	__this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
>  }
>  
> +/*
> + * Data for flushing lazy RCU callbacks at OOM time.
> + */
> +static atomic_t oom_callback_count;
> +static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
> +
> +/*
> + * RCU OOM callback -- decrement the outstanding count and deliver the
> + * wake-up if we are the last one.
> + */
> +static void rcu_oom_callback(struct rcu_head *rhp)
> +{
> +	if (atomic_dec_and_test(&oom_callback_count))
> +		wake_up(&oom_callback_wq);
> +}
> +
> +/*
> + * Post an rcu_oom_notify callback on the current CPU if it has at
> + * least one lazy callback.  This will unnecessarily post callbacks
> + * to CPUs that already have a non-lazy callback at the end of their
> + * callback list, but this is an infrequent operation, so accept some
> + * extra overhead to keep things simple.
> + */
> +static void rcu_oom_notify_cpu(void *flavor)
> +{
> +	struct rcu_state *rsp = flavor;
> +	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
> +
> +	if (rdp->qlen_lazy != 0) {
> +		atomic_inc(&oom_callback_count);
> +		rsp->call(&rdp->oom_head, rcu_oom_callback);
> +	}
> +}
> +
> +/*
> + * If low on memory, ensure that each CPU has a non-lazy callback.
> + * This will wake up CPUs that have only lazy callbacks, in turn
> + * ensuring that they free up the corresponding memory in a timely manner.
> + */
> +static int rcu_oom_notify(struct notifier_block *self,
> +			  unsigned long notused, void *nfreed)
> +{
> +	int cpu;
> +	struct rcu_state *rsp;
> +
> +	/* Wait for callbacks from earlier instance to complete. */
> +	wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
> +
> +	/*
> +	 * Prevent premature wakeup: ensure that all increments happen
> +	 * before there is a chance of the counter reaching zero.
> +	 */
> +	atomic_set(&oom_callback_count, 1);
> +
> +	get_online_cpus();
> +	for_each_online_cpu(cpu)
> +		for_each_rcu_flavor(rsp)
> +			smp_call_function_single(cpu, rcu_oom_notify_cpu,
> +						 rsp, 1);
> +	put_online_cpus();
> +
> +	/* Unconditionally decrement: no need to wake ourselves up. */
> +	atomic_dec(&oom_callback_count);
> +
> +	*(unsigned long *)nfreed = 1;

Hi, Paul

If you consider the above code to have freed some memory,
you should use *(unsigned long *)nfreed += 1.
                                        ^^

And your code actually disables OOM, because it sets *nfreed to NON-ZERO
unconditionally.

I did not review the patch nor the whole series carefully.

And if it is possible, could you share the code with rcu_barrier()?

Thanks,
Lai

> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block rcu_oom_nb = {
> +	.notifier_call = rcu_oom_notify
> +};
> +
> +static int __init rcu_register_oom_notifier(void)
> +{
> +	register_oom_notifier(&rcu_oom_nb);
> +	return 0;
> +}
> +early_initcall(rcu_register_oom_notifier);
> +
>  #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
>  
>  #ifdef CONFIG_RCU_CPU_STALL_INFO
Paul E. McKenney Sept. 5, 2012, 5:45 p.m. | #3
On Mon, Sep 03, 2012 at 05:08:24PM +0800, Lai Jiangshan wrote:
> On 08/31/2012 02:18 AM, Paul E. McKenney wrote:
> > From: "Paul E. McKenney" <paul.mckenney@linaro.org>
> > 
> > In kernels built with CONFIG_RCU_FAST_NO_HZ=y, CPUs can accumulate a
> > large number of lazy callbacks, which as the name implies will be slow
> > to be invoked.  This can be a problem on small-memory systems, where the
> > default 6-second sleep for CPUs having only lazy RCU callbacks could well
> > be fatal.  This commit therefore installs an OOM hander that ensures that
> > every CPU with non-lazy callbacks has at least one non-lazy callback,
> > in turn ensuring timely advancement for these callbacks.
> > 
> > Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > Tested-by: Sasha Levin <levinsasha928@gmail.com>
> > ---
> >  kernel/rcutree.h        |    5 ++-
> >  kernel/rcutree_plugin.h |   80 +++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 84 insertions(+), 1 deletions(-)
> > 
> > diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> > index 117a150..effb273 100644
> > --- a/kernel/rcutree.h
> > +++ b/kernel/rcutree.h
> > @@ -315,8 +315,11 @@ struct rcu_data {
> >  	unsigned long n_rp_need_fqs;
> >  	unsigned long n_rp_need_nothing;
> >  
> > -	/* 6) _rcu_barrier() callback. */
> > +	/* 6) _rcu_barrier() and OOM callbacks. */
> >  	struct rcu_head barrier_head;
> > +#ifdef CONFIG_RCU_FAST_NO_HZ
> > +	struct rcu_head oom_head;
> > +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
> >  
> >  	int cpu;
> >  	struct rcu_state *rsp;
> > diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> > index 7f3244c..bac8cc1 100644
> > --- a/kernel/rcutree_plugin.h
> > +++ b/kernel/rcutree_plugin.h
> > @@ -25,6 +25,7 @@
> >   */
> >  
> >  #include <linux/delay.h>
> > +#include <linux/oom.h>
> >  
> >  #define RCU_KTHREAD_PRIO 1
> >  
> > @@ -2112,6 +2113,85 @@ static void rcu_idle_count_callbacks_posted(void)
> >  	__this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
> >  }
> >  
> > +/*
> > + * Data for flushing lazy RCU callbacks at OOM time.
> > + */
> > +static atomic_t oom_callback_count;
> > +static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
> > +
> > +/*
> > + * RCU OOM callback -- decrement the outstanding count and deliver the
> > + * wake-up if we are the last one.
> > + */
> > +static void rcu_oom_callback(struct rcu_head *rhp)
> > +{
> > +	if (atomic_dec_and_test(&oom_callback_count))
> > +		wake_up(&oom_callback_wq);
> > +}
> > +
> > +/*
> > + * Post an rcu_oom_notify callback on the current CPU if it has at
> > + * least one lazy callback.  This will unnecessarily post callbacks
> > + * to CPUs that already have a non-lazy callback at the end of their
> > + * callback list, but this is an infrequent operation, so accept some
> > + * extra overhead to keep things simple.
> > + */
> > +static void rcu_oom_notify_cpu(void *flavor)
> > +{
> > +	struct rcu_state *rsp = flavor;
> > +	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
> > +
> > +	if (rdp->qlen_lazy != 0) {
> > +		atomic_inc(&oom_callback_count);
> > +		rsp->call(&rdp->oom_head, rcu_oom_callback);
> > +	}
> > +}
> > +
> > +/*
> > + * If low on memory, ensure that each CPU has a non-lazy callback.
> > + * This will wake up CPUs that have only lazy callbacks, in turn
> > + * ensuring that they free up the corresponding memory in a timely manner.
> > + */
> > +static int rcu_oom_notify(struct notifier_block *self,
> > +			  unsigned long notused, void *nfreed)
> > +{
> > +	int cpu;
> > +	struct rcu_state *rsp;
> > +
> > +	/* Wait for callbacks from earlier instance to complete. */
> > +	wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
> > +
> > +	/*
> > +	 * Prevent premature wakeup: ensure that all increments happen
> > +	 * before there is a chance of the counter reaching zero.
> > +	 */
> > +	atomic_set(&oom_callback_count, 1);
> > +
> > +	get_online_cpus();
> > +	for_each_online_cpu(cpu)
> > +		for_each_rcu_flavor(rsp)
> > +			smp_call_function_single(cpu, rcu_oom_notify_cpu,
> > +						 rsp, 1);
> > +	put_online_cpus();
> > +
> > +	/* Unconditionally decrement: no need to wake ourselves up. */
> > +	atomic_dec(&oom_callback_count);
> > +
> > +	*(unsigned long *)nfreed = 1;
> 
> Hi, Paul
> 
> If you consider the above code has free some memory,
> you should use *(unsigned long *)nfreed = +1.
>                                           ^^
> 
> And your code disable OOM actually, because it transfer *nfreed to NON-ZERO
> unconditionally.

Hmmm...  That does indeed cause out_of_memory() to unconditionally
return, doesn't it?

So I should really just leave *nfreed alone, since I cannot be sure
whether or not anything will actually get freed.  I -could- count
callbacks, but they might well be allocated as fast as they are freed.

Good catch!!!

> I did not review the patch nor the whole series carefully.
> 
> And if it is possible, could you share the code with rcu_barrier()?

At the moment, it adds more code than it saves.

							Thanx, Paul

> Thanks,
> Lai
> 
> > +	return NOTIFY_OK;
> > +}
> > +
> > +static struct notifier_block rcu_oom_nb = {
> > +	.notifier_call = rcu_oom_notify
> > +};
> > +
> > +static int __init rcu_register_oom_notifier(void)
> > +{
> > +	register_oom_notifier(&rcu_oom_nb);
> > +	return 0;
> > +}
> > +early_initcall(rcu_register_oom_notifier);
> > +
> >  #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
> >  
> >  #ifdef CONFIG_RCU_CPU_STALL_INFO
>
Peter Zijlstra Sept. 6, 2012, 1:46 p.m. | #4
On Thu, 2012-08-30 at 11:18 -0700, Paul E. McKenney wrote:
> +       get_online_cpus();
> +       for_each_online_cpu(cpu)
> +               for_each_rcu_flavor(rsp)
> +                       smp_call_function_single(cpu, rcu_oom_notify_cpu,
> +                                                rsp, 1);
> +       put_online_cpus(); 

I guess blasting IPIs around is better than OOM but still.. do you
really need to wait for each cpu individually, or would a construct
using on_each_cpu() be possible, or better yet, on_each_cpu_cond()?
Steven Rostedt Sept. 6, 2012, 1:52 p.m. | #5
On Thu, 2012-09-06 at 15:46 +0200, Peter Zijlstra wrote:
> On Thu, 2012-08-30 at 11:18 -0700, Paul E. McKenney wrote:
> > +       get_online_cpus();
> > +       for_each_online_cpu(cpu)
> > +               for_each_rcu_flavor(rsp)
> > +                       smp_call_function_single(cpu, rcu_oom_notify_cpu,
> > +                                                rsp, 1);
> > +       put_online_cpus(); 
> 
> I guess blasting IPIs around is better than OOM but still.. do you
> really need to wait for each cpu individually, or would a construct
> using on_each_cpu() be possible, or better yet, on_each_cpu_cond()?

Also, what about having the rcu_oom_notify_cpu handler do the
for_each_rcu_flavor() and not send an IPI multiple times to a single
CPU?

-- Steve
Paul E. McKenney Sept. 6, 2012, 5:41 p.m. | #6
On Thu, Sep 06, 2012 at 09:52:53AM -0400, Steven Rostedt wrote:
> On Thu, 2012-09-06 at 15:46 +0200, Peter Zijlstra wrote:
> > On Thu, 2012-08-30 at 11:18 -0700, Paul E. McKenney wrote:
> > > +       get_online_cpus();
> > > +       for_each_online_cpu(cpu)
> > > +               for_each_rcu_flavor(rsp)
> > > +                       smp_call_function_single(cpu, rcu_oom_notify_cpu,
> > > +                                                rsp, 1);
> > > +       put_online_cpus(); 
> > 
> > I guess blasting IPIs around is better than OOM but still.. do you
> > really need to wait for each cpu individually, or would a construct
> > using on_each_cpu() be possible, or better yet, on_each_cpu_cond()?

I rejected on_each_cpu_cond() because it disables preemption across
a scan of all CPUs.  Probably need to fix that at some point...

> Also, what about having the rcu_oom_notify_cpu handler do the
> for_each_rcu_flavor() and not send an IPI multiple times to a single
> CPU?

Fair enough!

							Thanx, Paul
Peter Zijlstra Sept. 6, 2012, 5:46 p.m. | #7
On Thu, 2012-09-06 at 10:41 -0700, Paul E. McKenney wrote:
> On Thu, Sep 06, 2012 at 09:52:53AM -0400, Steven Rostedt wrote:
> > On Thu, 2012-09-06 at 15:46 +0200, Peter Zijlstra wrote:
> > > On Thu, 2012-08-30 at 11:18 -0700, Paul E. McKenney wrote:
> > > > +       get_online_cpus();
> > > > +       for_each_online_cpu(cpu)
> > > > +               for_each_rcu_flavor(rsp)
> > > > +                       smp_call_function_single(cpu, rcu_oom_notify_cpu,
> > > > +                                                rsp, 1);
> > > > +       put_online_cpus(); 
> > > 
> > > I guess blasting IPIs around is better than OOM but still.. do you
> > > really need to wait for each cpu individually, or would a construct
> > > using on_each_cpu() be possible, or better yet, on_each_cpu_cond()?
> 
> I rejected on_each_cpu_cond() because it disables preemption across
> a scan of all CPUs.  Probably need to fix that at some point...

It would be rather straight fwd to make a variant that does
get_online_cpus() though.. but even then there's smp_call_function()
that does a broadcast, avoiding the need to spray individual IPIs and
wait for each CPU individually.
Paul E. McKenney Sept. 6, 2012, 8:32 p.m. | #8
On Thu, Sep 06, 2012 at 07:46:16PM +0200, Peter Zijlstra wrote:
> On Thu, 2012-09-06 at 10:41 -0700, Paul E. McKenney wrote:
> > On Thu, Sep 06, 2012 at 09:52:53AM -0400, Steven Rostedt wrote:
> > > On Thu, 2012-09-06 at 15:46 +0200, Peter Zijlstra wrote:
> > > > On Thu, 2012-08-30 at 11:18 -0700, Paul E. McKenney wrote:
> > > > > +       get_online_cpus();
> > > > > +       for_each_online_cpu(cpu)
> > > > > +               for_each_rcu_flavor(rsp)
> > > > > +                       smp_call_function_single(cpu, rcu_oom_notify_cpu,
> > > > > +                                                rsp, 1);
> > > > > +       put_online_cpus(); 
> > > > 
> > > > I guess blasting IPIs around is better than OOM but still.. do you
> > > > really need to wait for each cpu individually, or would a construct
> > > > using on_each_cpu() be possible, or better yet, on_each_cpu_cond()?
> > 
> > I rejected on_each_cpu_cond() because it disables preemption across
> > a scan of all CPUs.  Probably need to fix that at some point...
> 
> It would be rather straight fwd to make a variant that does
> get_online_cpus() though.. but even then there's smp_call_function()
> that does a broadcast, avoiding the need to spray individual IPIs and
> wait for each CPU individually.

And in this case I can live with inexactness with respect to CPUs actually
being hotplugged, so smp_call_function() does sound good.

							Thanx, Paul

Patch

diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 117a150..effb273 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -315,8 +315,11 @@  struct rcu_data {
 	unsigned long n_rp_need_fqs;
 	unsigned long n_rp_need_nothing;
 
-	/* 6) _rcu_barrier() callback. */
+	/* 6) _rcu_barrier() and OOM callbacks. */
 	struct rcu_head barrier_head;
+#ifdef CONFIG_RCU_FAST_NO_HZ
+	struct rcu_head oom_head;
+#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 
 	int cpu;
 	struct rcu_state *rsp;
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 7f3244c..bac8cc1 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@ 
  */
 
 #include <linux/delay.h>
+#include <linux/oom.h>
 
 #define RCU_KTHREAD_PRIO 1
 
@@ -2112,6 +2113,85 @@  static void rcu_idle_count_callbacks_posted(void)
 	__this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
 }
 
+/*
+ * Data for flushing lazy RCU callbacks at OOM time.
+ *
+ * oom_callback_count is incremented by rcu_oom_notify_cpu() for each OOM
+ * callback it posts and decremented by rcu_oom_callback() when that
+ * callback runs; rcu_oom_notify() additionally holds it at one while it
+ * is still posting.  oom_callback_wq is where rcu_oom_notify() sleeps
+ * until the count left over from an earlier pass has drained to zero.
+ */
+static atomic_t oom_callback_count;
+static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
+
+/*
+ * Callback posted by rcu_oom_notify_cpu().  Drops the count of
+ * outstanding OOM callbacks and, if this was the final one, wakes
+ * whoever is sleeping on oom_callback_wq.
+ */
+static void rcu_oom_callback(struct rcu_head *rhp)
+{
+	/* Others are still pending: nothing more to do. */
+	if (!atomic_dec_and_test(&oom_callback_count))
+		return;
+
+	wake_up(&oom_callback_wq);
+}
+
+/*
+ * Runs on each online CPU at the request of rcu_oom_notify().  For every
+ * RCU flavor that has at least one lazy callback queued on this CPU, post
+ * that flavor's per-CPU oom_head through rsp->call().  This will
+ * unnecessarily post callbacks to CPUs that already have a non-lazy
+ * callback at the end of their callback list, but this is an infrequent
+ * operation, so accept some extra overhead to keep things simple.
+ *
+ * All flavors are walked here so that one IPI per CPU suffices; the
+ * smp_call_function_single() argument is therefore unused.
+ */
+static void rcu_oom_notify_cpu(void *unused)
+{
+	struct rcu_state *rsp;
+	struct rcu_data *rdp;
+
+	for_each_rcu_flavor(rsp) {
+		rdp = __this_cpu_ptr(rsp->rda);
+		if (rdp->qlen_lazy != 0) {
+			/* Count first, so the callback cannot underflow the count. */
+			atomic_inc(&oom_callback_count);
+			rsp->call(&rdp->oom_head, rcu_oom_callback);
+		}
+	}
+}
+
+/*
+ * OOM notifier: if low on memory, ensure that each CPU has a non-lazy
+ * callback.  This will wake up CPUs that have only lazy callbacks, in turn
+ * ensuring that they free up the corresponding memory in a timely manner.
+ *
+ * Because an uncertain amount of memory will be freed in some uncertain
+ * timeframe, *nfreed is deliberately left untouched: storing a nonzero
+ * value there unconditionally would tell the OOM code that memory had
+ * been reclaimed on every invocation, effectively disabling OOM handling.
+ *
+ * Always returns NOTIFY_OK.
+ */
+static int rcu_oom_notify(struct notifier_block *self,
+			  unsigned long notused, void *nfreed)
+{
+	int cpu;
+
+	/* Wait for callbacks from earlier instance to complete. */
+	wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
+
+	/*
+	 * Prevent premature wakeup: ensure that all increments happen
+	 * before there is a chance of the counter reaching zero.
+	 */
+	atomic_set(&oom_callback_count, 1);
+
+	/* One synchronous IPI per CPU; the handler walks the RCU flavors. */
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
+	put_online_cpus();
+
+	/* Unconditionally decrement: no need to wake ourselves up. */
+	atomic_dec(&oom_callback_count);
+
+	return NOTIFY_OK;
+}
+
+/* Notifier block that routes OOM notifications to rcu_oom_notify(). */
+static struct notifier_block rcu_oom_nb = {
+	.notifier_call = rcu_oom_notify,
+};
+
+/*
+ * Early initcall that adds rcu_oom_nb to the OOM notifier chain.
+ * Returns whatever register_oom_notifier() returns, so that a failed
+ * registration is reported to the initcall machinery instead of being
+ * silently discarded.
+ */
+static int __init rcu_register_oom_notifier(void)
+{
+	return register_oom_notifier(&rcu_oom_nb);
+}
+early_initcall(rcu_register_oom_notifier);
+
 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
 
 #ifdef CONFIG_RCU_CPU_STALL_INFO