diff mbox series

[RFC] cpu_pm: Remove RCU abuse

Message ID 20200903135347.GC1362448@hirez.programming.kicks-ass.net
State New
Headers show
Series [RFC] cpu_pm: Remove RCU abuse | expand

Commit Message

Peter Zijlstra Sept. 3, 2020, 1:53 p.m. UTC
On Wed, Sep 02, 2020 at 05:58:55PM +0200, Ulf Hansson wrote:
> On Wed, 2 Sep 2020 at 14:14, <peterz@infradead.org> wrote:
> >
> > On Wed, Sep 02, 2020 at 09:03:37AM +0200, Ulf Hansson wrote:
> > > Lots of cpuidle drivers are using CPU_PM notifiers (grep for
> > > cpu_pm_enter and you will see) from their idlestates ->enter()
> > > callbacks. And for those we are already calling
> > > rcu_irq_enter_irqson|off() in cpu_pm_notify() when firing them.
> >
> > Yeah, that particular trainwreck is on my todo list already ... then
> > again, that list is forever overflowing.
> >
> > I'm thinking cpu_pm_unregister_notifier() is not a common thing? The few
> > I looked at seem to suggest 'never' is a good approximation.
> 
> The trend is that drivers are turning into regular modules that may
> also need to manage "->remove()", which may mean unregistering the
> notifier. Of course, I don't know for sure whether that becomes a
> problem, but it seems quite limiting.

You can pin modules, once they're loaded they can never be removed
again.

Anyway, the below should 'work', I think.

---

Comments

Ulf Hansson Sept. 3, 2020, 2:36 p.m. UTC | #1
On Thu, 3 Sep 2020 at 15:53, <peterz@infradead.org> wrote:
>
> On Wed, Sep 02, 2020 at 05:58:55PM +0200, Ulf Hansson wrote:
> > On Wed, 2 Sep 2020 at 14:14, <peterz@infradead.org> wrote:
> > >
> > > On Wed, Sep 02, 2020 at 09:03:37AM +0200, Ulf Hansson wrote:
> > > > Lots of cpuidle drivers are using CPU_PM notifiers (grep for
> > > > cpu_pm_enter and you will see) from their idlestates ->enter()
> > > > callbacks. And for those we are already calling
> > > > rcu_irq_enter_irqson|off() in cpu_pm_notify() when firing them.
> > >
> > > Yeah, that particular trainwreck is on my todo list already ... then
> > > again, that list is forever overflowing.
> > >
> > > I'm thinking cpu_pm_unregister_notifier() is not a common thing? The few
> > > I looked at seem to suggest 'never' is a good approximation.
> >
> > The trend is that drivers are turning into regular modules that may
> > also need to manage "->remove()", which may mean unregistering the
> > notifier. Of course, I don't know for sure whether that becomes a
> > problem, but it seems quite limiting.
>
> You can pin modules, once they're loaded they can never be removed
> again.
>
> Anyway, the below should 'work', I think.
>
> ---
> diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
> index f7e1d0eccdbc..72804e0883d5 100644
> --- a/kernel/cpu_pm.c
> +++ b/kernel/cpu_pm.c
> @@ -12,21 +12,18 @@
>  #include <linux/notifier.h>
>  #include <linux/spinlock.h>
>  #include <linux/syscore_ops.h>
> +#include <linux/cpu.h>
> +#include <linux/smp.h>
>
> -static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
> +static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
> +static DEFINE_SPINLOCK(cpu_pm_lock);
>
>  static int cpu_pm_notify(enum cpu_pm_event event)
>  {
>         int ret;
>
> -       /*
> -        * atomic_notifier_call_chain has a RCU read critical section, which
> -        * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
> -        * RCU know this.
> -        */
> -       rcu_irq_enter_irqson();
> -       ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);
> -       rcu_irq_exit_irqson();
> +       lockdep_assert_irqs_disabled();

Nitpick, maybe the lockdep should be moved to a separate patch.

> +       ret = raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);

Converting to raw_notifiers seems reasonable - if we need to avoid the
RCU usage.

My point is, I wonder about if the notifier callbacks themselves are
safe from RCU usage. For example, I would not be surprised if tracing
is happening behind them.

Moreover, I am not sure that we really need to prevent and limit
tracing from happening. Instead we could push rcu_idle_enter|exit()
further down to the arch specific code in the cpuidle drivers, as you
kind of all proposed earlier.

In this way, we can step by step, move to a new "version" of
cpu_pm_enter() that doesn't have to deal with rcu_irq_enter_irqson(),
because RCU hasn't been pushed to idle yet.

[...]

Kind regards
Uffe
diff mbox series

Patch

diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index f7e1d0eccdbc..72804e0883d5 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -12,21 +12,18 @@ 
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/syscore_ops.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
 
-static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+static DEFINE_SPINLOCK(cpu_pm_lock);
 
 static int cpu_pm_notify(enum cpu_pm_event event)
 {
 	int ret;
 
-	/*
-	 * atomic_notifier_call_chain has a RCU read critical section, which
-	 * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
-	 * RCU know this.
-	 */
-	rcu_irq_enter_irqson();
-	ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);
-	rcu_irq_exit_irqson();
+	lockdep_assert_irqs_disabled();
+	ret = raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);
 
 	return notifier_to_errno(ret);
 }
@@ -35,9 +32,8 @@  static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
 {
 	int ret;
 
-	rcu_irq_enter_irqson();
-	ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL);
-	rcu_irq_exit_irqson();
+	lockdep_assert_irqs_disabled();
+	ret = raw_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL);
 
 	return notifier_to_errno(ret);
 }
@@ -54,10 +50,28 @@  static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
  */
 int cpu_pm_register_notifier(struct notifier_block *nb)
 {
-	return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cpu_pm_lock, flags);
+	ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+	spin_unlock_irqrestore(&cpu_pm_lock, flags);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
 
+static bool __is_idle_cpu(int cpu, void *info)
+{
+	/*
+	 * Racy as heck, however if we fail to see an idle task, it must be
+	 * after we removed our element, so all is fine.
+	 */
+	return is_idle_task(curr_task(cpu));
+}
+
+static void __nop_func(void *arg) { }
+
 /**
  * cpu_pm_unregister_notifier - unregister a driver with cpu_pm
  * @nb: notifier block to be unregistered
@@ -69,7 +83,30 @@  EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
  */
 int cpu_pm_unregister_notifier(struct notifier_block *nb)
 {
-	return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+	unsigned long flags;
+	int ret, cpu;
+
+	spin_lock_irqsave(&cpu_pm_lock, flags);
+	ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+	spin_unlock_irqrestore(&cpu_pm_lock, flags);
+
+	/*
+	 * Orders the removal above vs the __is_idle_cpu() test below. Matches
+	 * schedule() switching to the idle task.
+	 *
+	 * Ensures that if we miss an idle task, it must be after the removal.
+	 */
+	smp_mb();
+
+	/*
+	 * IPI all idle CPUs, this guarantees that no CPU is currently
+	 * iterating the notifier list.
+	 */
+	cpus_read_lock();
+	on_each_cpu_cond(__is_idle_cpu, __nop_func, NULL, 1);
+	cpus_read_unlock();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);