[2/2] PM: domain: use per-genpd lockdep class

Message ID 20210611101540.3379937-3-dmitry.baryshkov@linaro.org
State New
Series PM: domains: use separate lockdep class for each genpd

Commit Message

Dmitry Baryshkov June 11, 2021, 10:15 a.m. UTC
In the case of nested genpds it is easy to get the following warning
from lockdep, because all genpd mutexes share the same locking class.
Use a per-genpd locking class to stop lockdep from warning about
possible deadlocks. It is not possible to use genpd's nested locking
directly, as it is not genpd code calling into genpd; there are interim
calls through the regulator core.

[    3.030219] ============================================
[    3.030220] WARNING: possible recursive locking detected
[    3.030221] 5.13.0-rc3-00054-gf8f0a2f2b643-dirty #2480 Not tainted
[    3.030222] --------------------------------------------
[    3.030223] kworker/u16:0/7 is trying to acquire lock:
[    3.030224] ffffde0eabd29aa0 (&genpd->mlock){+.+.}-{3:3}, at: genpd_lock_mtx+0x18/0x2c
[    3.030236]
[    3.030236] but task is already holding lock:
[    3.030236] ffffde0eabcfd6d0 (&genpd->mlock){+.+.}-{3:3}, at: genpd_lock_mtx+0x18/0x2c
[    3.030240]
[    3.030240] other info that might help us debug this:
[    3.030240]  Possible unsafe locking scenario:
[    3.030240]
[    3.030241]        CPU0
[    3.030241]        ----
[    3.030242]   lock(&genpd->mlock);
[    3.030243]   lock(&genpd->mlock);
[    3.030244]
[    3.030244]  *** DEADLOCK ***
[    3.030244]
[    3.030244]  May be due to missing lock nesting notation
[    3.030244]
[    3.030245] 6 locks held by kworker/u16:0/7:
[    3.030246]  #0: ffff6cca00010938 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x1f0/0x730
[    3.030252]  #1: ffff8000100c3db0 (deferred_probe_work){+.+.}-{0:0}, at: process_one_work+0x1f0/0x730
[    3.030255]  #2: ffff6cca00ce3188 (&dev->mutex){....}-{3:3}, at: __device_attach+0x3c/0x184
[    3.030260]  #3: ffffde0eabcfd6d0 (&genpd->mlock){+.+.}-{3:3}, at: genpd_lock_mtx+0x18/0x2c
[    3.030264]  #4: ffff8000100c3968 (regulator_ww_class_acquire){+.+.}-{0:0}, at: regulator_lock_dependent+0x6c/0x1b0
[    3.030270]  #5: ffff6cca00a59158 (regulator_ww_class_mutex){+.+.}-{3:3}, at: regulator_lock_recursive+0x94/0x1d0
[    3.030273]
[    3.030273] stack backtrace:
[    3.030275] CPU: 6 PID: 7 Comm: kworker/u16:0 Not tainted 5.13.0-rc3-00054-gf8f0a2f2b643-dirty #2480
[    3.030276] Hardware name: Qualcomm Technologies, Inc. Robotics RB5 (DT)
[    3.030278] Workqueue: events_unbound deferred_probe_work_func
[    3.030280] Call trace:
[    3.030281]  dump_backtrace+0x0/0x1a0
[    3.030284]  show_stack+0x18/0x24
[    3.030286]  dump_stack+0x108/0x188
[    3.030289]  __lock_acquire+0xa20/0x1e0c
[    3.030292]  lock_acquire.part.0+0xc8/0x320
[    3.030294]  lock_acquire+0x68/0x84
[    3.030296]  __mutex_lock+0xa0/0x4f0
[    3.030299]  mutex_lock_nested+0x40/0x50
[    3.030301]  genpd_lock_mtx+0x18/0x2c
[    3.030303]  dev_pm_genpd_set_performance_state+0x94/0x1a0
[    3.030305]  reg_domain_enable+0x28/0x4c
[    3.030308]  _regulator_do_enable+0x420/0x6b0
[    3.030310]  _regulator_enable+0x178/0x1f0
[    3.030312]  regulator_enable+0x3c/0x80
[    3.030314]  gdsc_toggle_logic+0x30/0x124
[    3.030317]  gdsc_enable+0x60/0x290
[    3.030318]  _genpd_power_on+0xc0/0x134
[    3.030320]  genpd_power_on.part.0+0xa4/0x1f0
[    3.030322]  __genpd_dev_pm_attach+0xf4/0x1b0
[    3.030324]  genpd_dev_pm_attach+0x60/0x70
[    3.030326]  dev_pm_domain_attach+0x54/0x5c
[    3.030329]  platform_probe+0x50/0xe0
[    3.030330]  really_probe+0xe4/0x510
[    3.030332]  driver_probe_device+0x64/0xcc
[    3.030333]  __device_attach_driver+0xb8/0x114
[    3.030334]  bus_for_each_drv+0x78/0xd0
[    3.030337]  __device_attach+0xdc/0x184
[    3.030338]  device_initial_probe+0x14/0x20
[    3.030339]  bus_probe_device+0x9c/0xa4
[    3.030340]  deferred_probe_work_func+0x88/0xc4
[    3.030342]  process_one_work+0x298/0x730
[    3.030343]  worker_thread+0x74/0x470
[    3.030344]  kthread+0x168/0x170
[    3.030346]  ret_from_fork+0x10/0x34

Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>

---
 drivers/base/power/domain.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

-- 
2.30.2
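
The patch follows the standard dynamic lockdep key pattern (the full
diff is quoted in the replies below). A minimal sketch of that pattern,
outside of genpd and with hypothetical names, might look like this:

#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct foo {
	struct mutex lock;
	struct lock_class_key *key;
};

/*
 * Give each instance its own lockdep class, so that taking the locks
 * of two different instances in a nested fashion is not reported as
 * recursive locking on a single class.
 */
static int foo_init(struct foo *f, const char *name)
{
	f->key = kzalloc(sizeof(*f->key), GFP_KERNEL);
	if (!f->key)
		return -ENOMEM;

	lockdep_register_key(f->key);
	__mutex_init(&f->lock, name, f->key);
	return 0;
}

static void foo_destroy(struct foo *f)
{
	mutex_destroy(&f->lock);
	lockdep_unregister_key(f->key);	/* key must be unused by now */
	kfree(f->key);
}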

Comments

Ulf Hansson June 11, 2021, 1:49 p.m. UTC | #1
On Fri, 11 Jun 2021 at 12:15, Dmitry Baryshkov
<dmitry.baryshkov@linaro.org> wrote:
>
> In the case of nested genpds it is easy to get the following warning
> from lockdep, because all genpd mutexes share the same locking class.
> Use a per-genpd locking class to stop lockdep from warning about
> possible deadlocks. It is not possible to use genpd's nested locking
> directly, as it is not genpd code calling into genpd; there are interim
> calls through the regulator core.
>
> [...]
>
> [    3.030299]  mutex_lock_nested+0x40/0x50
> [    3.030301]  genpd_lock_mtx+0x18/0x2c
> [    3.030303]  dev_pm_genpd_set_performance_state+0x94/0x1a0
> [    3.030305]  reg_domain_enable+0x28/0x4c
> [    3.030308]  _regulator_do_enable+0x420/0x6b0
> [    3.030310]  _regulator_enable+0x178/0x1f0
> [    3.030312]  regulator_enable+0x3c/0x80


At a closer look, I am pretty sure that it's the wrong code design
that triggers this problem, rather than a real problem in genpd. To
put it simply, the code in genpd isn't designed to work like this. We
will end up in circular-looking paths, leading to deadlocks, sooner or
later, if we allow the above code path.

To fix it, the regulator here needs to be converted to a proper PM
domain. This PM domain should be assigned as the parent to the one
that is requested to be powered on.

> [...]
>
> Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>


Kind regards
Uffe

> ---
>  drivers/base/power/domain.c | 25 ++++++++++++++++++++-----
>  1 file changed, 20 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
> index 74219d032910..bdf439b48763 100644
> --- a/drivers/base/power/domain.c
> +++ b/drivers/base/power/domain.c
> @@ -1899,20 +1899,33 @@ static int genpd_set_default_power_state(struct generic_pm_domain *genpd)
>         return 0;
>  }
>
> -static void genpd_lock_init(struct generic_pm_domain *genpd)
> +static int genpd_lock_init(struct generic_pm_domain *genpd)
>  {
>         if (genpd->flags & GENPD_FLAG_IRQ_SAFE) {
>                 spin_lock_init(&genpd->slock);
>                 genpd->lock_ops = &genpd_spin_ops;
>         } else {
> -               mutex_init(&genpd->mlock);
> +               /* Some genpds are static, some are dynamically allocated. To
> +                * make lockdep happy always allocate the key dynamically and
> +                * register it. */
> +               genpd->mlock_key = kzalloc(sizeof(genpd->mlock_key), GFP_KERNEL);
> +               if (!genpd->mlock_key)
> +                       return -ENOMEM;
> +
> +               lockdep_register_key(genpd->mlock_key);
> +
> +               __mutex_init(&genpd->mlock, genpd->name, genpd->mlock_key);
>                 genpd->lock_ops = &genpd_mtx_ops;
>         }
> +
> +       return 0;
>  }
>
>  static void genpd_lock_destroy(struct generic_pm_domain *genpd) {
> -       if (!(genpd->flags & GENPD_FLAG_IRQ_SAFE))
> +       if (!(genpd->flags & GENPD_FLAG_IRQ_SAFE)) {
>                 mutex_destroy(&genpd->mlock);
> +               kfree(genpd->mlock_key);
> +       }
>  }
>
>  /**
> @@ -1935,7 +1948,10 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
>         INIT_LIST_HEAD(&genpd->child_links);
>         INIT_LIST_HEAD(&genpd->dev_list);
>         RAW_INIT_NOTIFIER_HEAD(&genpd->power_notifiers);
> -       genpd_lock_init(genpd);
> +       ret = genpd_lock_init(genpd);
> +       if (ret)
> +               return ret;
> +
>         genpd->gov = gov;
>         INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn);
>         atomic_set(&genpd->sd_count, 0);
> @@ -2040,7 +2056,6 @@ static int genpd_remove(struct generic_pm_domain *genpd)
>                 free_cpumask_var(genpd->cpus);
>         if (genpd->free_states)
>                 genpd->free_states(genpd->states, genpd->state_count);
> -       genpd_lock_destroy(genpd);
>
>         pr_debug("%s: removed %s\n", __func__, genpd->name);
>
> --
> 2.30.2
>
Dmitry Baryshkov June 11, 2021, 2:34 p.m. UTC | #2
Added Stephen to Cc list

On Fri, 11 Jun 2021 at 16:50, Ulf Hansson <ulf.hansson@linaro.org> wrote:
>
> On Fri, 11 Jun 2021 at 12:15, Dmitry Baryshkov
> <dmitry.baryshkov@linaro.org> wrote:
> >
> > [...]
>
> At a closer look, I am pretty sure that it's the wrong code design
> that triggers this problem, rather than a real problem in genpd. To
> put it simply, the code in genpd isn't designed to work like this. We
> will end up in circular-looking paths, leading to deadlocks, sooner or
> later, if we allow the above code path.
>
> To fix it, the regulator here needs to be converted to a proper PM
> domain. This PM domain should be assigned as the parent to the one
> that is requested to be powered on.


This more or less resembles the original design, which was replaced,
per review request, with a separate regulator
(https://lore.kernel.org/linux-arm-msm/160269659638.884498.4031967462806977493@swboyd.mtv.corp.google.com/,
https://lore.kernel.org/linux-arm-msm/20201023131925.334864-1-dmitry.baryshkov@linaro.org/).

Stephen, would it be fine with you to convert the mmcx regulator into
the PM domain?

> > [...]
>
> Kind regards
> Uffe




-- 
With best wishes
Dmitry
Ulf Hansson June 15, 2021, 10:17 a.m. UTC | #3
+ Mark

On Fri, 11 Jun 2021 at 16:34, Dmitry Baryshkov
<dmitry.baryshkov@linaro.org> wrote:
>
> Added Stephen to Cc list
>
> On Fri, 11 Jun 2021 at 16:50, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> >
> > [...]
> >
> > At a closer look, I am pretty sure that it's the wrong code design
> > that triggers this problem, rather than a real problem in genpd. [...]
> >
> > To fix it, the regulator here needs to be converted to a proper PM
> > domain. This PM domain should be assigned as the parent to the one
> > that is requested to be powered on.
>
> This more or less resembles the original design, which was replaced,
> per review request, with a separate regulator
> (https://lore.kernel.org/linux-arm-msm/160269659638.884498.4031967462806977493@swboyd.mtv.corp.google.com/,
> https://lore.kernel.org/linux-arm-msm/20201023131925.334864-1-dmitry.baryshkov@linaro.org/).


Thanks for the pointers. In hindsight, it looks like the
"regulator-fixed-domain" DT binding wasn't the right thing.

Fortunately, it looks like the problem can be quite easily fixed by
moving to a correct model of the domain hierarchy.

Beyond this, perhaps we should consider removing the
"regulator-fixed-domain" DT property, so as to avoid similar problems
from cropping up?

Mark, what do you think?

> Stephen, would it be fine with you to convert the mmcx regulator into
> the PM domain?

[...]

Kind regards
Uffe
Mark Brown June 15, 2021, 11:10 a.m. UTC | #4
On Tue, Jun 15, 2021 at 12:17:20PM +0200, Ulf Hansson wrote:

> Beyond this, perhaps we should consider removing the
> "regulator-fixed-domain" DT property, so as to avoid similar problems
> from cropping up?

> Mark, what do you think?


We need to maintain compatibility for existing users...
Ulf Hansson June 15, 2021, 2:55 p.m. UTC | #5
On Tue, 15 Jun 2021 at 13:10, Mark Brown <broonie@kernel.org> wrote:
>
> On Tue, Jun 15, 2021 at 12:17:20PM +0200, Ulf Hansson wrote:
>
> > Beyond this, perhaps we should consider removing the
> > "regulator-fixed-domain" DT property, so as to avoid similar problems
> > from cropping up?
>
> > Mark, what do you think?
>
> We need to maintain compatibility for existing users...


Normally, yes, I would agree.

In this case, it looks like there is only one user, which is somewhat
broken in this regard, so what's the point of keeping this around?

Kind regards
Uffe
Mark Brown June 15, 2021, 3:26 p.m. UTC | #6
On Tue, Jun 15, 2021 at 04:55:24PM +0200, Ulf Hansson wrote:
> On Tue, 15 Jun 2021 at 13:10, Mark Brown <broonie@kernel.org> wrote:
> > On Tue, Jun 15, 2021 at 12:17:20PM +0200, Ulf Hansson wrote:

> > > Beyond this, perhaps we should consider removing the
> > > "regulator-fixed-domain" DT property, so as to avoid similar problems
> > > from cropping up?

> > > Mark, what do you think?

> > We need to maintain compatibility for existing users...

> Normally, yes, I would agree.

> In this case, it looks like there is only one user, which is somewhat
> broken in this regard, so what's the point of keeping this around?


Only one user in mainline and you were just suggesting removing the
property (you mean binding I think?) - at the very least we'd need to
transition that upstream user away to something else before doing
anything.
Ulf Hansson June 15, 2021, 3:35 p.m. UTC | #7
On Tue, 15 Jun 2021 at 17:26, Mark Brown <broonie@kernel.org> wrote:
>
> On Tue, Jun 15, 2021 at 04:55:24PM +0200, Ulf Hansson wrote:
> > On Tue, 15 Jun 2021 at 13:10, Mark Brown <broonie@kernel.org> wrote:
> > > On Tue, Jun 15, 2021 at 12:17:20PM +0200, Ulf Hansson wrote:
>
> > > > Beyond this, perhaps we should consider removing the
> > > > "regulator-fixed-domain" DT property, so as to avoid similar problems
> > > > from cropping up?
>
> > > > Mark, what do you think?
>
> > > We need to maintain compatibility for existing users...
>
> > Normally, yes, I would agree.
>
> > In this case, it looks like there is only one user, which is somewhat
> > broken in this regard, so what's the point of keeping this around?
>
> Only one user in mainline and you were just suggesting removing the
> property (you mean binding I think?) - at the very least we'd need to
> transition that upstream user away to something else before doing
> anything.


Yes, I am referring to the binding.

Let's see where we end up with this. My concern at this point is that
it could spread to more users, which would make it even more difficult
to remove.

Kind regards
Uffe
Mark Brown June 15, 2021, 3:38 p.m. UTC | #8
On Tue, Jun 15, 2021 at 05:35:01PM +0200, Ulf Hansson wrote:

> Let's see where we end up with this. My concern at this point is that
> it could spread to more users, which would make it even more difficult
> to remove.


Perhaps mark it as deprecated while people figure out how to fix the
existing user?
Bjorn Andersson June 15, 2021, 3:55 p.m. UTC | #9
On Tue 15 Jun 05:17 CDT 2021, Ulf Hansson wrote:

> + Mark
>
> On Fri, 11 Jun 2021 at 16:34, Dmitry Baryshkov
> <dmitry.baryshkov@linaro.org> wrote:
> >
> > [...]
>
> Thanks for the pointers. In hindsight, it looks like the
> "regulator-fixed-domain" DT binding wasn't the right thing.
>
> Fortunately, it looks like the problem can be quite easily fixed by
> moving to a correct model of the domain hierarchy.


Can you give some pointers to how we actually fix this?

The problem that led us down this path is that drivers/clk/qcom/gdsc.c
describes power domains, which are parented by domains provided by
drivers/soc/qcom/rpmhpd.c.

But I am unable to find a way for the gdsc driver to get hold of the
struct generic_pm_domain of the resources exposed by the rpmhpd driver.

The second thing that Dmitry's regulator driver does is to cast the
appropriate performance-state vote on the rpmhpd resource, but I _think_
we can do that using OPP tables in the gdsc client's node...

> Beyond this, perhaps we should consider removing the
> "regulator-fixed-domain" DT property, so as to avoid similar problems
> from cropping up?


Currently there's a single user upstream, but we have the exact same
problem in at least 3-4 platforms with patches in the pipeline.

In order to avoid breakage with existing DT, I would prefer to see
regulator-fixed-domain live for at least one kernel release beyond
the introduction of the other model.

Regards,
Bjorn

> Mark, what do you think?
>
> [...]
>
> Kind regards
> Uffe
Ulf Hansson June 17, 2021, 9:07 a.m. UTC | #10
+ Rajendra

On Tue, 15 Jun 2021 at 17:55, Bjorn Andersson
<bjorn.andersson@linaro.org> wrote:
>
> On Tue 15 Jun 05:17 CDT 2021, Ulf Hansson wrote:
>
> > [...]
>
> Can you give some pointers to how we actually fix this?
>
> The problem that led us down this path is that drivers/clk/qcom/gdsc.c
> describes power domains, which are parented by domains provided by
> drivers/soc/qcom/rpmhpd.c.
>
> But I am unable to find a way for the gdsc driver to get hold of the
> struct generic_pm_domain of the resources exposed by the rpmhpd driver.


You don't need a handle to the struct generic_pm_domain to assign a
parent/child domain. Instead you can use of_genpd_add_subdomain(),
which takes two "struct of_phandle_args*" corresponding to the
parent/child device nodes of the genpd providers, and let genpd do the
lookup internally.

As an example, you may have a look at how the PM domain topology in
drivers/cpuidle/cpuidle-psci-domain.c is created.
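
For illustration, a rough sketch of how the gdsc provider could wire
this up; the helper name and the "power-domains" reference in the
provider node are assumptions, not the final design:

#include <linux/of.h>
#include <linux/pm_domain.h>

/*
 * Hypothetical sketch: parent a GDSC's genpd to the domain referenced
 * by a "power-domains" phandle in the provider node, e.g.
 * <&rpmhpd SM8250_MMCX>.
 */
static int gdsc_add_parent_domain(struct device_node *np, int gdsc_index)
{
	struct of_phandle_args parent, child;
	int ret;

	/* Resolve the parent domain specifier from the provider node. */
	ret = of_parse_phandle_with_args(np, "power-domains",
					 "#power-domain-cells", 0, &parent);
	if (ret)
		return ret;

	/* The child is our own provider node plus the GDSC's index. */
	child.np = np;
	child.args_count = 1;
	child.args[0] = gdsc_index;

	ret = of_genpd_add_subdomain(&parent, &child);
	of_node_put(parent.np);
	return ret;
}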

>
> The second thing that Dmitry's regulator driver does is to cast the
> appropriate performance-state vote on the rpmhpd resource, but I _think_
> we can do that using OPP tables in the gdsc client's node...

Yes, it looks like using an OPP table and specifying "required-opps"
at some device node is the right thing to do.

In this case, I wonder if the "required-opps" belongs to the genpd
provider node of the new power-domain (as it seems like it only
supports one fixed performance state when it's powered on). On the
other hand, specifying this at the consumer node should work as well,
I think.

Actually, this relates to the changes [1] that Rajendra is working on
with "assigned-performance-state" (which we agreed to move to
OPP/required-opps) for genpd.
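
To illustrate the consumer-node alternative mentioned above, it could
look something like this (the consumer node and the OPP level are just
examples):

some-consumer@0 {
	...
	power-domains = <&rpmhpd SM8250_MMCX>;
	required-opps = <&rpmhpd_opp_low_svs>;
};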

> > Beyond this, perhaps we should consider removing the
> > "regulator-fixed-domain" DT property, so as to avoid similar problems
> > from cropping up?
>
> Currently there's a single user upstream, but we have the exact same
> problem in at least 3-4 platforms with patches in the pipeline.
>
> In order to avoid breakage with existing DT, I would prefer to see
> regulator-fixed-domain live for at least one kernel release beyond
> the introduction of the other model.

Yes, this seems reasonable to me.

As Mark suggested, let's mark the regulator-fixed-domain DT property
as deprecated and remove it once we have the new solution in place.

[...]

Kind regards
Uffe
Dmitry Baryshkov June 17, 2021, 4:19 p.m. UTC | #11
On 17/06/2021 12:07, Ulf Hansson wrote:
> + Rajendra
>
> On Tue, 15 Jun 2021 at 17:55, Bjorn Andersson
> <bjorn.andersson@linaro.org> wrote:
>>
>> [...]
>>
>> But I am unable to find a way for the gdsc driver to get hold of the
>> struct generic_pm_domain of the resources exposed by the rpmhpd driver.
>
> You don't need a handle to the struct generic_pm_domain to assign a
> parent/child domain. Instead you can use of_genpd_add_subdomain(),
> which takes two "struct of_phandle_args*" corresponding to the
> parent/child device nodes of the genpd providers, and let genpd do the
> lookup internally.
>
> As an example, you may have a look at how the PM domain topology in
> drivers/cpuidle/cpuidle-psci-domain.c is created.
>
>> The second thing that Dmitry's regulator driver does is to cast the
>> appropriate performance-state vote on the rpmhpd resource, but I _think_
>> we can do that using OPP tables in the gdsc client's node...
>
> Yes, it looks like using an OPP table and specifying "required-opps"
> at some device node is the right thing to do.
>
> In this case, I wonder if the "required-opps" belongs to the genpd
> provider node of the new power-domain (as it seems like it only
> supports one fixed performance state when it's powered on). On the
> other hand, specifying this at the consumer node should work as well,
> I think.
>
> Actually, this relates to the changes [1] that Rajendra is working on
> with "assigned-performance-state" (which we agreed to move to
> OPP/required-opps) for genpd.


What about the following dts snippet? I do not want to add
power-domains directly to the dispcc node (as it's not the device's own
power domain, just the gdsc's parent power domain).


dispcc: clock-controller@af00000 {
	compatible = "qcom,sm8250-dispcc";
	[....]
	#power-domain-cells = <1>;

	mmss_gdsc {
		power-domains = <&rpmhpd SM8250_MMCX>;
		required-opps = <&rpmhpd_opp_low_svs>;
	};
};

> [...]
>
> Kind regards
> Uffe



-- 
With best wishes
Dmitry
Bjorn Andersson June 17, 2021, 5:17 p.m. UTC | #12
On Thu 17 Jun 04:07 CDT 2021, Ulf Hansson wrote:

> + Rajendra

> 

> On Tue, 15 Jun 2021 at 17:55, Bjorn Andersson

> <bjorn.andersson@linaro.org> wrote:

> >

> > On Tue 15 Jun 05:17 CDT 2021, Ulf Hansson wrote:

> >

> > > + Mark

> > >

> > > On Fri, 11 Jun 2021 at 16:34, Dmitry Baryshkov

> > > <dmitry.baryshkov@linaro.org> wrote:

> > > >

> > > > Added Stephen to Cc list

> > > >

> > > > On Fri, 11 Jun 2021 at 16:50, Ulf Hansson <ulf.hansson@linaro.org> wrote:

> > > > >

> > > > > On Fri, 11 Jun 2021 at 12:15, Dmitry Baryshkov

> > > > > <dmitry.baryshkov@linaro.org> wrote:

> > > > > >

> > > > > > In case of nested genpds it is easy to get the following warning from

> > > > > > lockdep, because all genpd's mutexes share same locking class. Use the

> > > > > > per-genpd locking class to stop lockdep from warning about possible

> > > > > > deadlocks. It is not possible to directly use genpd nested locking, as

> > > > > > it is not the genpd code calling genpd. There are interim calls to

> > > > > > regulator core.

> > > > > >

> > > > > > [    3.030219] ============================================

> > > > > > [    3.030220] WARNING: possible recursive locking detected

> > > > > > [    3.030221] 5.13.0-rc3-00054-gf8f0a2f2b643-dirty #2480 Not tainted

> > > > > > [    3.030222] --------------------------------------------

> > > > > > [    3.030223] kworker/u16:0/7 is trying to acquire lock:

> > > > > > [    3.030224] ffffde0eabd29aa0 (&genpd->mlock){+.+.}-{3:3}, at: genpd_lock_mtx+0x18/0x2c

> > > > > > [    3.030236]

> > > > > > [    3.030236] but task is already holding lock:

> > > > > > [    3.030236] ffffde0eabcfd6d0 (&genpd->mlock){+.+.}-{3:3}, at: genpd_lock_mtx+0x18/0x2c

> > > > > > [    3.030240]

> > > > > > [    3.030240] other info that might help us debug this:

> > > > > > [    3.030240]  Possible unsafe locking scenario:

> > > > > > [    3.030240]

> > > > > > [    3.030241]        CPU0

> > > > > > [    3.030241]        ----

> > > > > > [    3.030242]   lock(&genpd->mlock);

> > > > > > [    3.030243]   lock(&genpd->mlock);

> > > > > > [    3.030244]

> > > > > > [    3.030244]  *** DEADLOCK ***

> > > > > > [    3.030244]

> > > > > > [    3.030244]  May be due to missing lock nesting notation

> > > > > > [    3.030244]

> > > > > > [    3.030245] 6 locks held by kworker/u16:0/7:

> > > > > > [    3.030246]  #0: ffff6cca00010938 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x1f0/0x730

> > > > > > [    3.030252]  #1: ffff8000100c3db0 (deferred_probe_work){+.+.}-{0:0}, at: process_one_work+0x1f0/0x730

> > > > > > [    3.030255]  #2: ffff6cca00ce3188 (&dev->mutex){....}-{3:3}, at: __device_attach+0x3c/0x184

> > > > > > [    3.030260]  #3: ffffde0eabcfd6d0 (&genpd->mlock){+.+.}-{3:3}, at: genpd_lock_mtx+0x18/0x2c

> > > > > > [    3.030264]  #4: ffff8000100c3968 (regulator_ww_class_acquire){+.+.}-{0:0}, at: regulator_lock_dependent+0x6c/0x1b0

> > > > > > [    3.030270]  #5: ffff6cca00a59158 (regulator_ww_class_mutex){+.+.}-{3:3}, at: regulator_lock_recursive+0x94/0x1d0

> > > > > > [    3.030273]

> > > > > > [    3.030273] stack backtrace:

> > > > > > [    3.030275] CPU: 6 PID: 7 Comm: kworker/u16:0 Not tainted 5.13.0-rc3-00054-gf8f0a2f2b643-dirty #2480

> > > > > > [    3.030276] Hardware name: Qualcomm Technologies, Inc. Robotics RB5 (DT)

> > > > > > [    3.030278] Workqueue: events_unbound deferred_probe_work_func

> > > > > > [    3.030280] Call trace:

> > > > > > [    3.030281]  dump_backtrace+0x0/0x1a0

> > > > > > [    3.030284]  show_stack+0x18/0x24

> > > > > > [    3.030286]  dump_stack+0x108/0x188

> > > > > > [    3.030289]  __lock_acquire+0xa20/0x1e0c

> > > > > > [    3.030292]  lock_acquire.part.0+0xc8/0x320

> > > > > > [    3.030294]  lock_acquire+0x68/0x84

> > > > > > [    3.030296]  __mutex_lock+0xa0/0x4f0

> > > > > > [    3.030299]  mutex_lock_nested+0x40/0x50

> > > > > > [    3.030301]  genpd_lock_mtx+0x18/0x2c

> > > > > > [    3.030303]  dev_pm_genpd_set_performance_state+0x94/0x1a0

> > > > > > [    3.030305]  reg_domain_enable+0x28/0x4c

> > > > > > [    3.030308]  _regulator_do_enable+0x420/0x6b0

> > > > > > [    3.030310]  _regulator_enable+0x178/0x1f0

> > > > > > [    3.030312]  regulator_enable+0x3c/0x80

> > > > >
> > > > > At a closer look, I am pretty sure that it's the wrong code design
> > > > > that triggers this problem, rather than that we have a real problem in
> > > > > genpd. To put it simply, the code in genpd isn't designed to work like
> > > > > this. We will end up in circular looking paths, leading to deadlocks,
> > > > > sooner or later if we allow the above code path.
> > > > >
> > > > > To fix it, the regulator here needs to be converted to a proper PM
> > > > > domain. This PM domain should be assigned as the parent to the one
> > > > > that is requested to be powered on.

> > > >
> > > > This more or less resembles the original design, replaced per review
> > > > request to use a separate regulator
> > > > (https://lore.kernel.org/linux-arm-msm/160269659638.884498.4031967462806977493@swboyd.mtv.corp.google.com/,
> > > > https://lore.kernel.org/linux-arm-msm/20201023131925.334864-1-dmitry.baryshkov@linaro.org/).

> > >
> > > Thanks for the pointers. In hindsight, it looks like the
> > > "regulator-fixed-domain" DT binding wasn't the right thing.
> > >
> > > Fortunately, it looks like the problem can be quite easily fixed, by
> > > moving to a correct model of the domain hierarchy.
> > >

> >
> > Can you give some pointers to how we actually fix this?
> >
> > The problem that led us down this path is that drivers/clk/qcom/gdsc.c
> > describes power domains, which are parented by domains provided by
> > drivers/soc/qcom/rpmhpd.c.
> >
> > But I am unable to find a way for the gdsc driver to get hold of the
> > struct generic_pm_domain of the resources exposed by the rpmhpd driver.

>
> You don't need a handle to the struct generic_pm_domain, to assign a
> parent/child domain. Instead you can use of_genpd_add_subdomain(),
> which takes two "struct of_phandle_args*" corresponding to the
> parent/child device nodes of the genpd providers and then let genpd
> internally do the look up.
>
> As an example, you may have a look at how the PM domain topology in
> drivers/cpuidle/cpuidle-psci-domain.c is being created.
>


This seems to do exactly what I was looking for, just different from any
other part of the kernel...
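
For reference, a minimal sketch of what that could look like from the gdsc
code; the provider node pointers and the SM8250_MMCX/MDSS_GDSC cells are
illustrative assumptions here, not code from an actual driver:

        struct of_phandle_args parent, child;
        int ret;

        /* provider nodes, assumed to have been looked up already */
        parent.np = rpmhpd_np;
        parent.args_count = 1;
        parent.args[0] = SM8250_MMCX;   /* parent power-domain cell */

        child.np = dispcc_np;
        child.args_count = 1;
        child.args[0] = MDSS_GDSC;      /* the GDSC exposed by dispcc */

        /* genpd resolves both providers and links child under parent */
        ret = of_genpd_add_subdomain(&parent, &child);
        if (ret)
                return ret;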

> >
> > The second thing that Dmitry's regulator driver does is to cast the
> > appropriate performance state vote on the rpmhpd resource, but I _think_
> > we can do that using OPP tables in the gdsc client's node...
>
> Yes, it looks like using an OPP table and specifying "required-opps"
> at some device node is the right thing to do.
>
> In this case, I wonder if the "required-opps" belongs to the genpd
> provider node of the new power-domain (as it seems like it only
> supports one fixed performance state when it's powered on). On the
> other hand, specifying this at the consumer node should work as well,
> I think.
>


I actually think that the single level relates to the needs of the
DISP_CC_MDSS_MDP_CLK clock rate, which we scale further in the DPU node
using an OPP table.

So I think it would be appropriate to ensure that we vote on the
performance level from the display driver(s). But this needs some more
investigation.

I don't think enabling MDSS_GDSC requires the performance level
directly.
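
For illustration, the consumer-side scaling being discussed could look
roughly like this in DT (node names, clock rates and OPP labels are
made-up values, not a tested binding):

        &mdss_mdp {
                power-domains = <&dispcc MDSS_GDSC>;
                operating-points-v2 = <&mdp_opp_table>;
        };

        mdp_opp_table: opp-table {
                compatible = "operating-points-v2";

                opp-300000000 {
                        opp-hz = /bits/ 64 <300000000>;
                        required-opps = <&rpmhpd_opp_low_svs>;
                };

                opp-500000000 {
                        opp-hz = /bits/ 64 <500000000>;
                        required-opps = <&rpmhpd_opp_nom>;
                };
        };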

> Actually, this relates to the changes [1] that Rajendra is working on
> with "assigned-performance-state" (that we agreed to move to
> OPP/required-opps) for genpd.
>


Might be, but my recent escapades in this area indicate that we do want
to drive the performance state dynamically, and that the current vote is
essentially setting a "minimum".

Regards,
Bjorn

> >
> > > Beyond this, perhaps we should consider removing the
> > > "regulator-fixed-domain" DT property, so as to avoid similar problems
> > > from cropping up?
> >
> > Currently there's a single user upstream, but we have the exact same
> > problem in at least 3-4 platforms with patches in the pipeline.
> >
> > In order to avoid breakage with existing DT I would prefer to see
> > regulator-fixed-domain live for at least one kernel release beyond
> > the introduction of the other model.
>
> Yes, this seems reasonable to me.
>
> As Mark suggested, let's mark the regulator-fixed-domain DT property
> as deprecated and remove it once we have the new solution in place.
>
> [...]
>
> Kind regards
> Uffe
Bjorn Andersson June 17, 2021, 5:27 p.m. UTC | #13
On Thu 17 Jun 11:19 CDT 2021, Dmitry Baryshkov wrote:

> On 17/06/2021 12:07, Ulf Hansson wrote:
> > + Rajendra
> >
> > On Tue, 15 Jun 2021 at 17:55, Bjorn Andersson
> > <bjorn.andersson@linaro.org> wrote:
> [..]
> > Actually, this relates to the changes [1] that Rajendra is working on
> > with "assigned-performance-state" (that we agreed to move to
> > OPP/required-opps) for genpd.
>

> What about the following dts snippet?
> I do not want to add power-domains directly to the dispcc node (as it's not
> a device's power domain, just gdsc's parent power domain).
>
> dispcc: clock-controller@af00000 {
> 	compatible = "qcom,sm8250-dispcc";
> 	[....]
> 	#power-domain-cells = <1>;
>
> 	mmss_gdsc {
> 		power-domains = <&rpmhpd SM8250_MMCX>;
> 		required-opps = <&rpmhpd_opp_low_svs>;
> 	};
> };


According to the documentation, dispcc actually sits in MMCX (I thought
it sat in CX...). So it seems appropriate to just specify that as the
one and only power-domain for &dispcc and use that as the parent for
MDSS_GDSC.

That said, I do think we have other GDSCs in the system where the
controller sits in one power-domain and the parent power-domain is a
different one. I presume the right path here is to list all the
power-domains in DT and then use some name-based matching?
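
Presumably something along these lines (an illustrative sketch, not a
tested binding):

        power-domains = <&rpmhpd SM8250_CX>, <&rpmhpd SM8250_MMCX>;
        power-domain-names = "cx", "mmcx";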

Regards,
Bjorn
kernel test robot June 20, 2021, 12:39 a.m. UTC | #14
Hi Dmitry,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on pm/linux-next]
[also build test ERROR on v5.13-rc6 next-20210618]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Dmitry-Baryshkov/PM-domains-use-separate-lockdep-class-for-each-genpd/20210617-032213
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next
config: x86_64-allyesconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/e8b916bca5705e8166b78ec1b58feb27130d07f4
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Dmitry-Baryshkov/PM-domains-use-separate-lockdep-class-for-each-genpd/20210617-032213
        git checkout e8b916bca5705e8166b78ec1b58feb27130d07f4
        # save the attached .config to linux build tree
        make W=1 ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   drivers/base/power/domain.c: In function 'genpd_lock_init':
>> drivers/base/power/domain.c:1945:8: error: 'struct generic_pm_domain' has no member named 'mlock_key'

    1945 |   genpd->mlock_key = kzalloc(sizeof(genpd->mlock_key), GFP_KERNEL);
         |        ^~
   drivers/base/power/domain.c:1945:42: error: 'struct generic_pm_domain' has no member named 'mlock_key'
    1945 |   genpd->mlock_key = kzalloc(sizeof(genpd->mlock_key), GFP_KERNEL);
         |                                          ^~
   drivers/base/power/domain.c:1946:13: error: 'struct generic_pm_domain' has no member named 'mlock_key'
    1946 |   if (!genpd->mlock_key)
         |             ^~
   drivers/base/power/domain.c:1949:29: error: 'struct generic_pm_domain' has no member named 'mlock_key'
    1949 |   lockdep_register_key(genpd->mlock_key);
         |                             ^~
   drivers/base/power/domain.c:1951:49: error: 'struct generic_pm_domain' has no member named 'mlock_key'
    1951 |   __mutex_init(&genpd->mlock, genpd->name, genpd->mlock_key);
         |                                                 ^~
   drivers/base/power/domain.c: In function 'genpd_lock_destroy':
   drivers/base/power/domain.c:1961:14: error: 'struct generic_pm_domain' has no member named 'mlock_key'
    1961 |   kfree(genpd->mlock_key);
         |              ^~


vim +1945 drivers/base/power/domain.c

  1935	
  1936	static int genpd_lock_init(struct generic_pm_domain *genpd)
  1937	{
  1938		if (genpd->flags & GENPD_FLAG_IRQ_SAFE) {
  1939			spin_lock_init(&genpd->slock);
  1940			genpd->lock_ops = &genpd_spin_ops;
  1941		} else {
  1942			/* Some genpds are static, some are dynamically allocated. To
  1943			 * make lockdep happy always allocate the key dynamically and
  1944			 * register it. */
> 1945			genpd->mlock_key = kzalloc(sizeof(genpd->mlock_key), GFP_KERNEL);

  1946			if (!genpd->mlock_key)
  1947				return -ENOMEM;
  1948	
  1949			lockdep_register_key(genpd->mlock_key);
  1950	
  1951			__mutex_init(&genpd->mlock, genpd->name, genpd->mlock_key);
  1952			genpd->lock_ops = &genpd_mtx_ops;
  1953		}
  1954	
  1955		return 0;
  1956	}
  1957	
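
For what it's worth, the patch (shown at the bottom of this page) only
touches drivers/base/power/domain.c, so the matching member addition to
struct generic_pm_domain in include/linux/pm_domain.h appears to be
missing from this submission. A hypothetical companion hunk (not taken
from the series) would look something like:

        struct generic_pm_domain {
                ...
                struct mutex mlock;
                struct lock_class_key *mlock_key; /* per-genpd lockdep class */
                ...
        };

Note also that kzalloc(sizeof(genpd->mlock_key), ...) above allocates
only the size of a pointer; sizeof(*genpd->mlock_key) is presumably what
was intended.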

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Dmitry Baryshkov June 28, 2021, 7:55 p.m. UTC | #15
Hi,

On 17/06/2021 12:07, Ulf Hansson wrote:
> + Rajendra
>
> On Tue, 15 Jun 2021 at 17:55, Bjorn Andersson
> <bjorn.andersson@linaro.org> wrote:
[..]
>> But I am unable to find a way for the gdsc driver to get hold of the
>> struct generic_pm_domain of the resources exposed by the rpmhpd driver.
>
> You don't need a handle to the struct generic_pm_domain, to assign a
> parent/child domain. Instead you can use of_genpd_add_subdomain(),
> which takes two "struct of_phandle_args*" corresponding to the
> parent/child device nodes of the genpd providers and then let genpd
> internally do the look up.


I've taken a look at of_genpd_add_subdomain. Please correct me if I'm
wrong, but I have the feeling that this function is badly designed. It
forces the following sequence:
- register the child domain
- register the child's domain provider
- mark the child as a subdomain of the parent.

So there is a (short) window during which users can get hold of the child
domain while the system does not yet know about the parent/child
relationship.

I think this function should be changed to take a struct generic_pm_domain
as its second argument. I will attempt refactoring cpuidle-psci-domain to
follow this, let's see if it works.

Another option would be to export genpd_get_from_provider() and to use
genpd_add_subdomain() directly.

I think I'd need this function anyway for the gdsc code. During
gdsc_init() we check the gdsc status, and this requires register access
(and thus powering on the parent domain) before the gdsc itself is
registered as a power domain.
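
A rough sketch of that second option, assuming genpd_get_from_provider()
were exported and given the parent's of_phandle_args (parent_spec and
sc->pd are illustrative names):

        struct generic_pm_domain *parent;

        parent = genpd_get_from_provider(&parent_spec);
        if (IS_ERR(parent))
                return PTR_ERR(parent);

        /* link the gdsc's domain under its parent, without needing
         * the gdsc to be registered as a provider first */
        return pm_genpd_add_subdomain(parent, &sc->pd);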

-- 
With best wishes
Dmitry
Ulf Hansson June 29, 2021, 11:05 a.m. UTC | #16
On Mon, 28 Jun 2021 at 21:55, Dmitry Baryshkov
<dmitry.baryshkov@linaro.org> wrote:
>
> Hi,
>
> On 17/06/2021 12:07, Ulf Hansson wrote:
[..]
> > You don't need a handle to the struct generic_pm_domain, to assign a
> > parent/child domain. Instead you can use of_genpd_add_subdomain(),
> > which takes two "struct of_phandle_args*" corresponding to the
> > parent/child device nodes of the genpd providers and then let genpd
> > internally do the look up.
>
> I've taken a look at of_genpd_add_subdomain. Please correct me if I'm
> wrong, but I have the feeling that this function is badly designed. It
> forces the following sequence:
> - register the child domain
> - register the child's domain provider
> - mark the child as a subdomain of the parent.
>
> So there is a (short) window during which users can get hold of the child
> domain while the system does not yet know about the parent/child
> relationship.


Correct!

This is tricky, but it's the best we have managed to come up with so far.

Additionally, I think this hasn't been an issue because providers and
subdomains have been registered way earlier than consumers. Of course,
a more robust solution would be nice.

>
> I think this function should be changed to take a struct generic_pm_domain
> as its second argument. I will attempt refactoring cpuidle-psci-domain to
> follow this, let's see if it works.


I am not sure what the best approach is here. You may be right.

>
> Another option would be to export genpd_get_from_provider() and to use
> genpd_add_subdomain() directly.


That could work too.

Another option would be to introduce an intermediate state for the
genpd provider that could be used to prevent devices from getting
attached to it (returning -EPROBE_DEFER if that happens) until the
topology (child/parent domains) has been initialized as well. Just
thinking out loud...

>
> I think I'd need this function anyway for the gdsc code. During
> gdsc_init() we check the gdsc status, and this requires register access
> (and thus powering on the parent domain) before the gdsc itself is
> registered as a power domain.


As a (temporary) workaround, perhaps you can add a ->sync_state()
callback to mitigate part of the problems (it gets called when
*all* consumers are attached), along the lines of what we also do in
the cpuidle-psci-domain.
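
A rough sketch of that workaround, assuming the genpd provider is a
platform driver (driver and callback names here are illustrative):

        static void dispcc_sync_state(struct device *dev)
        {
                /* All consumers are attached at this point; any
                 * init-time votes taken during gdsc_init() could be
                 * dropped here. */
        }

        static struct platform_driver dispcc_driver = {
                .probe = dispcc_probe,
                .driver = {
                        .name = "dispcc-sm8250",
                        .sync_state = dispcc_sync_state,
                },
        };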

Kind regards
Uffe
Bjorn Andersson June 29, 2021, 3:09 p.m. UTC | #17
On Mon 28 Jun 14:55 CDT 2021, Dmitry Baryshkov wrote:

> Hi,
>
> On 17/06/2021 12:07, Ulf Hansson wrote:
> > + Rajendra
> >
> > On Tue, 15 Jun 2021 at 17:55, Bjorn Andersson
> > <bjorn.andersson@linaro.org> wrote:
[..]
> > > But I am unable to find a way for the gdsc driver to get hold of the
> > > struct generic_pm_domain of the resources exposed by the rpmhpd driver.
> >
> > You don't need a handle to the struct generic_pm_domain, to assign a
> > parent/child domain. Instead you can use of_genpd_add_subdomain(),
> > which takes two "struct of_phandle_args*" corresponding to the
> > parent/child device nodes of the genpd providers and then let genpd
> > internally do the look up.
>
[..]
>
> I think I'd need this function anyway for the gdsc code. During
> gdsc_init() we check the gdsc status, and this requires register access
> (and thus powering on the parent domain) before the gdsc itself is
> registered as a power domain.
>


But this is a register access in the dispcc block, which is the context
in which our gdsc_init() operates. So describing that MMCX is the
power-domain for dispcc should ensure that the power-domain is enabled.

We do however need to make sure that dispcc doesn't hog its
power-domain, and that any register access at runtime is done with the
parenting power-domain enabled. E.g. the clock framework wraps all
operations in pm_runtime_get/put(), but I don't see anything in the
genpd code for this.


And for gcc I'm worried that we might have some GDSCs that are parented
by CX and some by MX, but I do still think that the register accesses
are only related to one of these.

But this seems like a continuation of the special case in dispcc, so I
think we should be able to focus on getting that right before we attempt
the general case (and I don't know if we have a need for this today).

Regards,
Bjorn
Ulf Hansson July 1, 2021, 10:06 a.m. UTC | #18
On Tue, 29 Jun 2021 at 17:09, Bjorn Andersson
<bjorn.andersson@linaro.org> wrote:
>
> On Mon 28 Jun 14:55 CDT 2021, Dmitry Baryshkov wrote:
[..]
> > I think I'd need this function anyway for the gdsc code. During
> > gdsc_init() we check the gdsc status, and this requires register access
> > (and thus powering on the parent domain) before the gdsc itself is
> > registered as a power domain.
>
> But this is a register access in the dispcc block, which is the context
> in which our gdsc_init() operates. So describing that MMCX is the
> power-domain for dispcc should ensure that the power-domain is enabled.


Right.

As a note, when we assign a child domain to a parent domain (via
of_genpd_add_subdomain(), for example) and the child domain has been
powered on, the parent domain is required to be turned on as well.

>
> We do however need to make sure that dispcc doesn't hog its
> power-domain, and that any register access at runtime is done with the
> parenting power-domain enabled. E.g. the clock framework wraps all
> operations in pm_runtime_get/put(), but I don't see anything in the
> genpd code for this.
>
> And for gcc I'm worried that we might have some GDSCs that are parented
> by CX and some by MX, but I do still think that the register accesses
> are only related to one of these.
>
> But this seems like a continuation of the special case in dispcc, so I
> think we should be able to focus on getting that right before we attempt
> the general case (and I don't know if we have a need for this today).
>
> Regards,
> Bjorn


Unfortunately, I didn't understand all the above things.

In any case, please tell me if there is anything else that blocks you
from moving forward with the power domain conversion? I am happy to
help.

Kind regards
Uffe
Dmitry Baryshkov July 1, 2021, 11:01 a.m. UTC | #19
On Thu, 1 Jul 2021 at 13:07, Ulf Hansson <ulf.hansson@linaro.org> wrote:
>
> On Tue, 29 Jun 2021 at 17:09, Bjorn Andersson
> <bjorn.andersson@linaro.org> wrote:
[..]
>
> Right.
>
> As a note, when we assign a child domain to a parent domain (via
> of_genpd_add_subdomain(), for example) and the child domain has been
> powered on, the parent domain is required to be turned on as well.


Most probably we should also teach the genpd code to call the pm_runtime
functions on the respective devices when the genpd is powered on or off.
For now I had to do this manually.
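
For reference, the manual approach could look roughly like this in the
GDSC power-on path; sc->dev as a pointer to the provider device is an
assumption here, the rest follows the existing gdsc.c helpers:

        static int gdsc_enable(struct generic_pm_domain *domain)
        {
                struct gdsc *sc = domain_to_gdsc(domain);
                int ret;

                /* keep the provider, and thus its parent domain,
                 * powered while the GDSC registers are accessed */
                ret = pm_runtime_get_sync(sc->dev);
                if (ret < 0) {
                        pm_runtime_put_noidle(sc->dev);
                        return ret;
                }

                ret = gdsc_toggle_logic(sc, GDSC_ON);

                pm_runtime_put(sc->dev);
                return ret;
        }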

>
[..]
>
> Unfortunately, I didn't understand all the above things.
>
> In any case, please tell me if there is anything else that blocks you
> from moving forward with the power domain conversion? I am happy to
> help.


Patch series was submitted:
https://lore.kernel.org/linux-arm-msm/20210630133149.3204290-1-dmitry.baryshkov@linaro.org/



-- 
With best wishes
Dmitry
Ulf Hansson July 1, 2021, 3:14 p.m. UTC | #20
On Thu, 1 Jul 2021 at 13:01, Dmitry Baryshkov
<dmitry.baryshkov@linaro.org> wrote:
>
> On Thu, 1 Jul 2021 at 13:07, Ulf Hansson <ulf.hansson@linaro.org> wrote:
[..]
>
> Most probably we should also teach the genpd code to call the pm_runtime
> functions on the respective devices when the genpd is powered on or off.
> For now I had to do this manually.


No, that's not the way it works, or should work for that matter.

It's the runtime PM status of the devices that are attached to a genpd
that controls whether the genpd should be powered on/off. Additionally,
if there is a child domain powered on, then its parent needs to be
powered on too, and so forth.
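
To illustrate with an assumed consumer device dev that has been attached
to the genpd:

        pm_runtime_enable(dev);

        /* powers on the genpd and, transitively, its parent domains */
        ret = pm_runtime_get_sync(dev);

        /* ... access the hardware ... */

        /* the genpd may power off again once no attached device is
         * runtime-active and no child domain is powered on */
        pm_runtime_put(dev);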

>
[..]
> >
> > In any case, please tell me if there is anything else that blocks you
> > from moving forward with the power domain conversion? I am happy to
> > help.
>
> Patch series was submitted:
> https://lore.kernel.org/linux-arm-msm/20210630133149.3204290-1-dmitry.baryshkov@linaro.org/


Okay, I will have a look over there. Thanks!

Kind regards
Uffe
diff mbox series

Patch

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 74219d032910..bdf439b48763 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -1899,20 +1899,33 @@  static int genpd_set_default_power_state(struct generic_pm_domain *genpd)
 	return 0;
 }
 
-static void genpd_lock_init(struct generic_pm_domain *genpd)
+static int genpd_lock_init(struct generic_pm_domain *genpd)
 {
 	if (genpd->flags & GENPD_FLAG_IRQ_SAFE) {
 		spin_lock_init(&genpd->slock);
 		genpd->lock_ops = &genpd_spin_ops;
 	} else {
-		mutex_init(&genpd->mlock);
+		/* Some genpds are static, some are dynamically allocated. To
+		 * make lockdep happy always allocate the key dynamically and
+		 * register it. */
+		genpd->mlock_key = kzalloc(sizeof(genpd->mlock_key), GFP_KERNEL);
+		if (!genpd->mlock_key)
+			return -ENOMEM;
+
+		lockdep_register_key(genpd->mlock_key);
+
+		__mutex_init(&genpd->mlock, genpd->name, genpd->mlock_key);
 		genpd->lock_ops = &genpd_mtx_ops;
 	}
+
+	return 0;
 }
 
 static void genpd_lock_destroy(struct generic_pm_domain *genpd) {
-	if (!(genpd->flags & GENPD_FLAG_IRQ_SAFE))
+	if (!(genpd->flags & GENPD_FLAG_IRQ_SAFE)) {
 		mutex_destroy(&genpd->mlock);
+		kfree(genpd->mlock_key);
+	}
 }
 
 /**
@@ -1935,7 +1948,10 @@  int pm_genpd_init(struct generic_pm_domain *genpd,
 	INIT_LIST_HEAD(&genpd->child_links);
 	INIT_LIST_HEAD(&genpd->dev_list);
 	RAW_INIT_NOTIFIER_HEAD(&genpd->power_notifiers);
-	genpd_lock_init(genpd);
+	ret = genpd_lock_init(genpd);
+	if (ret)
+		return ret;
+
 	genpd->gov = gov;
 	INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn);
 	atomic_set(&genpd->sd_count, 0);
@@ -2040,7 +2056,6 @@  static int genpd_remove(struct generic_pm_domain *genpd)
 		free_cpumask_var(genpd->cpus);
 	if (genpd->free_states)
 		genpd->free_states(genpd->states, genpd->state_count);
-	genpd_lock_destroy(genpd);
 
 	pr_debug("%s: removed %s\n", __func__, genpd->name);