diff mbox series

[v2,13/14] soc/tegra: pmc: Add core power domain

Message ID 20210523231335.8238-14-digetx@gmail.com
State New
Headers show
Series NVIDIA Tegra memory and power management changes for 5.14 | expand

Commit Message

Dmitry Osipenko May 23, 2021, 11:13 p.m. UTC
NVIDIA Tegra SoCs have multiple power domains, each domain corresponds
to an external SoC power rail. Core power domain covers vast majority of
hardware blocks within a Tegra SoC. The voltage of a power domain should
be set to a level which satisfies all devices within the power domain.
Add support for the core power domain which controls voltage state of the
domain. This allows us to support system-wide DVFS on Tegra20-210 SoCs.
The PMC powergate domains now are sub-domains of the core domain, this
requires device-tree updating, older DTBs are unaffected.

Tested-by: Peter Geis <pgwipeout@gmail.com> # Ouya T30
Tested-by: Paul Fertser <fercerpav@gmail.com> # PAZ00 T20
Tested-by: Nicolas Chauvet <kwizart@gmail.com> # PAZ00 T20 and TK1 T124
Tested-by: Matt Merhar <mattmerhar@protonmail.com> # Ouya T30
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/soc/tegra/Kconfig  |  14 ++++
 drivers/soc/tegra/pmc.c    | 143 +++++++++++++++++++++++++++++++++++++
 include/soc/tegra/common.h |   7 ++
 3 files changed, 164 insertions(+)

Comments

Ulf Hansson May 24, 2021, 5:04 p.m. UTC | #1
On Mon, 24 May 2021 at 01:13, Dmitry Osipenko <digetx@gmail.com> wrote:
>
> NVIDIA Tegra SoCs have multiple power domains, each domain corresponds
> to an external SoC power rail. Core power domain covers vast majority of
> hardware blocks within a Tegra SoC. The voltage of a power domain should
> be set to a level which satisfies all devices within the power domain.
> Add support for the core power domain which controls voltage state of the
> domain. This allows us to support system-wide DVFS on Tegra20-210 SoCs.
> The PMC powergate domains now are sub-domains of the core domain, this
> requires device-tree updating, older DTBs are unaffected.
>
> Tested-by: Peter Geis <pgwipeout@gmail.com> # Ouya T30
> Tested-by: Paul Fertser <fercerpav@gmail.com> # PAZ00 T20
> Tested-by: Nicolas Chauvet <kwizart@gmail.com> # PAZ00 T20 and TK1 T124
> Tested-by: Matt Merhar <mattmerhar@protonmail.com> # Ouya T30
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>

[...]

> +
> +static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)
> +{
> +       static struct lock_class_key tegra_core_domain_lock_class;
> +       struct generic_pm_domain *genpd;
> +       const char *rname = "core";
> +       int err;
> +
> +       genpd = devm_kzalloc(pmc->dev, sizeof(*genpd), GFP_KERNEL);
> +       if (!genpd)
> +               return -ENOMEM;
> +
> +       genpd->name = np->name;
> +       genpd->set_performance_state = tegra_pmc_core_pd_set_performance_state;
> +       genpd->opp_to_performance_state = tegra_pmc_core_pd_opp_to_performance_state;
> +
> +       err = devm_pm_opp_set_regulators(pmc->dev, &rname, 1);
> +       if (err)
> +               return dev_err_probe(pmc->dev, err,
> +                                    "failed to set core OPP regulator\n");
> +
> +       err = pm_genpd_init(genpd, NULL, false);
> +       if (err) {
> +               dev_err(pmc->dev, "failed to init core genpd: %d\n", err);
> +               return err;
> +       }
> +
> +       /*
> +        * We have a "PMC pwrgate -> Core" hierarchy of the power domains
> +        * where PMC needs to resume and change performance (voltage) of the
> +        * Core domain from the PMC GENPD on/off callbacks, hence we need
> +        * to annotate the lock in order to remove confusion from the
> +        * lockdep checker when a nested access happens.
> +        */

Can you elaborate a bit more on this?

Are you saying that when the child domain (PMC pwrgate) gets powered
off, you want to drop its aggregated votes it may hold for the
performance state, as otherwise it may affect the parent domain (core
domain)?

I guess this would be a valid scenario to optimize for, especially if
you have more than one child domain of the core power domain, right?

If you have only one child domain, would it be sufficient to assign
->power_on|off() callbacks for the core domain and deal with the
performance stare votes from there instead?

> +       lockdep_set_class(&genpd->mlock, &tegra_core_domain_lock_class);
> +
> +       err = of_genpd_add_provider_simple(np, genpd);
> +       if (err) {
> +               dev_err(pmc->dev, "failed to add core genpd: %d\n", err);
> +               goto remove_genpd;
> +       }
> +
> +       return 0;
> +
> +remove_genpd:
> +       pm_genpd_remove(genpd);
> +
> +       return err;
> +}

[...]

> +static void tegra_pmc_sync_state(struct device *dev)
> +{
> +       int err;
> +
> +       pmc->core_domain_state_synced = true;
> +
> +       /* this is a no-op if core regulator isn't used */
> +       mutex_lock(&pmc->powergates_lock);
> +       err = dev_pm_opp_sync_regulators(dev);
> +       mutex_unlock(&pmc->powergates_lock);
> +
> +       if (err)
> +               dev_err(dev, "failed to sync regulators: %d\n", err);
> +}
> +

Nitpick.

Would you mind splitting the "sync_state" thingy out into a separate
patch on top of $subject patch?

I think it would be nice, especially since it shares a function via
include/soc/tegra/common.h - that would make it clear to what part
that belongs to.

>  static struct platform_driver tegra_pmc_driver = {
>         .driver = {
>                 .name = "tegra-pmc",
> @@ -3680,6 +3822,7 @@ static struct platform_driver tegra_pmc_driver = {
>  #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_ARM)
>                 .pm = &tegra_pmc_pm_ops,
>  #endif
> +               .sync_state = tegra_pmc_sync_state,
>         },
>         .probe = tegra_pmc_probe,
>  };
> diff --git a/include/soc/tegra/common.h b/include/soc/tegra/common.h
> index af41ad80ec21..135a6956a18c 100644
> --- a/include/soc/tegra/common.h
> +++ b/include/soc/tegra/common.h
> @@ -23,6 +23,8 @@ struct tegra_core_opp_params {
>  #ifdef CONFIG_ARCH_TEGRA
>  bool soc_is_tegra(void);
>
> +bool tegra_soc_core_domain_state_synced(void);
> +
>  int devm_tegra_core_dev_init_opp_table(struct device *dev,
>                                        struct tegra_core_opp_params *params);
>  #else
> @@ -31,6 +33,11 @@ static inline bool soc_is_tegra(void)
>         return false;
>  }
>
> +static inline bool tegra_soc_core_domain_state_synced(void)
> +{
> +       return false;
> +}
> +
>  static inline int
>  devm_tegra_core_dev_init_opp_table(struct device *dev,
>                                    struct tegra_core_opp_params *params)

Kind regards
Uffe
Dmitry Osipenko May 24, 2021, 8:23 p.m. UTC | #2
24.05.2021 20:04, Ulf Hansson пишет:
> On Mon, 24 May 2021 at 01:13, Dmitry Osipenko <digetx@gmail.com> wrote:
>>
>> NVIDIA Tegra SoCs have multiple power domains, each domain corresponds
>> to an external SoC power rail. Core power domain covers vast majority of
>> hardware blocks within a Tegra SoC. The voltage of a power domain should
>> be set to a level which satisfies all devices within the power domain.
>> Add support for the core power domain which controls voltage state of the
>> domain. This allows us to support system-wide DVFS on Tegra20-210 SoCs.
>> The PMC powergate domains now are sub-domains of the core domain, this
>> requires device-tree updating, older DTBs are unaffected.
>>
>> Tested-by: Peter Geis <pgwipeout@gmail.com> # Ouya T30
>> Tested-by: Paul Fertser <fercerpav@gmail.com> # PAZ00 T20
>> Tested-by: Nicolas Chauvet <kwizart@gmail.com> # PAZ00 T20 and TK1 T124
>> Tested-by: Matt Merhar <mattmerhar@protonmail.com> # Ouya T30
>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> 
> [...]
> 
>> +
>> +static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)
>> +{
>> +       static struct lock_class_key tegra_core_domain_lock_class;
>> +       struct generic_pm_domain *genpd;
>> +       const char *rname = "core";
>> +       int err;
>> +
>> +       genpd = devm_kzalloc(pmc->dev, sizeof(*genpd), GFP_KERNEL);
>> +       if (!genpd)
>> +               return -ENOMEM;
>> +
>> +       genpd->name = np->name;
>> +       genpd->set_performance_state = tegra_pmc_core_pd_set_performance_state;
>> +       genpd->opp_to_performance_state = tegra_pmc_core_pd_opp_to_performance_state;
>> +
>> +       err = devm_pm_opp_set_regulators(pmc->dev, &rname, 1);
>> +       if (err)
>> +               return dev_err_probe(pmc->dev, err,
>> +                                    "failed to set core OPP regulator\n");
>> +
>> +       err = pm_genpd_init(genpd, NULL, false);
>> +       if (err) {
>> +               dev_err(pmc->dev, "failed to init core genpd: %d\n", err);
>> +               return err;
>> +       }
>> +
>> +       /*
>> +        * We have a "PMC pwrgate -> Core" hierarchy of the power domains
>> +        * where PMC needs to resume and change performance (voltage) of the
>> +        * Core domain from the PMC GENPD on/off callbacks, hence we need
>> +        * to annotate the lock in order to remove confusion from the
>> +        * lockdep checker when a nested access happens.
>> +        */
> 
> Can you elaborate a bit more on this?
> 
> Are you saying that when the child domain (PMC pwrgate) gets powered
> off, you want to drop its aggregated votes it may hold for the
> performance state, as otherwise it may affect the parent domain (core
> domain)?

Yes, in particular we want to remove/add the performance vote when clk is disabled/enabled, see tegra_clock_runtime_resume/suspend() of the clk-runtimePM driver [1]. I'll send that clk patch separately once this series and some other tegra-clk patches will be merged, otherwise there are too many dependencies and it's too difficult to review.

[1] https://patchwork.ozlabs.org/project/linux-tegra/patch/20201217180638.22748-33-digetx@gmail.com/

Please see the example lockdep trace in the end of the email that is fixed by the mutex annotation. What we have there is the tegra-host1x device driver that resumes PMC powergate domain on Tegra30, the PMC driver enables clock from the genpd.power_on callback of the powergate domain and this propagates to the clock's RPM callback which sets the performance vote of the core domain. Hence core domain vote is set from within of the powergate domain. 

> I guess this would be a valid scenario to optimize for, especially if
> you have more than one child domain of the core power domain, right?
> 
> If you have only one child domain, would it be sufficient to assign
> ->power_on|off() callbacks for the core domain and deal with the
> performance stare votes from there instead?

The core domain is the parent domain of the PMC domains + some devices directly belong to the core domain. The GENPD core aggregates the performance votes from the children domains and from devices of the parent core, this all works great already.

It sounds to me that you're suggesting to reinvent the aggregation logic within the PMC driver and create a fake hierarchy that doesn't match hardware. It won't help the lockdep warning anyways.

I actually don't quite understand what problem you're trying to solve, could you please explain? The lockdep warning is harmless, we just need to annotate the mutex lock class.

If you don't feel comfortable with the usage of lockdep_set_class() in the driver, then maybe it should be possible to make it a part of the pm_genpd_init(). For example like we did that for tegra-host1x driver recently [2].

[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a24f98176d1efae2c37d3438c57a624d530d9c33


 LOCKDEP
 ============================================
 WARNING: possible recursive locking detected
 5.13.0-rc3-next-20210524-00202-g80a288f17147-dirty #7935 Tainted: G        W        
 --------------------------------------------
 kworker/u8:2/96 is trying to acquire lock:
 c202e494 (&genpd->mlock){+.+.}-{3:3}, at: genpd_runtime_resume+0x95/0x174
 
               but task is already holding lock:
 c35d9454 (&genpd->mlock){+.+.}-{3:3}, at: genpd_runtime_resume+0x95/0x174
 
               other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(&genpd->mlock);
   lock(&genpd->mlock);
 
                *** DEADLOCK ***

  May be due to missing lock nesting notation

 5 locks held by kworker/u8:2/96:
  #0: c2024ea8 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x15a/0x600
  #1: c2a31f20 (deferred_probe_work){+.+.}-{0:0}, at: process_one_work+0x15a/0x600
  #2: c35f04d8 (&dev->mutex){....}-{3:3}, at: __device_attach+0x29/0xdc
  #3: c35d9454 (&genpd->mlock){+.+.}-{3:3}, at: genpd_runtime_resume+0x95/0x174
  #4: c13fbbcc (prepare_lock){+.+.}-{3:3}, at: clk_prepare_lock+0x17/0xac
 
               stack backtrace:
 CPU: 0 PID: 96 Comm: kworker/u8:2 Tainted: G        W         5.13.0-rc3-next-20210524-00202-g80a288f17147-dirty #7935
 Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)
 Workqueue: events_unbound deferred_probe_work_func
 [<c010d1cd>] (unwind_backtrace) from [<c0109639>] (show_stack+0x11/0x14)
 [<c0109639>] (show_stack) from [<c0ba6dab>] (dump_stack_lvl+0x97/0xb0)
 [<c0ba6dab>] (dump_stack_lvl) from [<c017b24f>] (__lock_acquire+0x7fb/0x2534)
 [<c017b24f>] (__lock_acquire) from [<c017d75b>] (lock_acquire+0xf3/0x424)
 [<c017d75b>] (lock_acquire) from [<c0bb0daf>] (__mutex_lock+0x87/0x7f4)
 [<c0bb0daf>] (__mutex_lock) from [<c0bb1535>] (mutex_lock_nested+0x19/0x20)
 [<c0bb1535>] (mutex_lock_nested) from [<c0669ced>] (genpd_runtime_resume+0x95/0x174)
 [<c0669ced>] (genpd_runtime_resume) from [<c0660167>] (__rpm_callback+0x7b/0xc8)
 [<c0660167>] (__rpm_callback) from [<c06601cd>] (rpm_callback+0x19/0x60)
 [<c06601cd>] (rpm_callback) from [<c065fde3>] (rpm_resume+0x47f/0x65c)
 [<c065fde3>] (rpm_resume) from [<c066000f>] (__pm_runtime_resume+0x4f/0x78)
 [<c066000f>] (__pm_runtime_resume) from [<c05857f7>] (clk_pm_runtime_get.part.0+0x13/0x54)
 [<c05857f7>] (clk_pm_runtime_get.part.0) from [<c05881e9>] (clk_core_set_rate_nolock+0x81/0x1cc)
 [<c05881e9>] (clk_core_set_rate_nolock) from [<c0588353>] (clk_set_rate+0x1f/0x44)
 [<c0588353>] (clk_set_rate) from [<c0597cd3>] (tegra_powergate_prepare_clocks+0x2f/0x94)
 [<c0597cd3>] (tegra_powergate_prepare_clocks) from [<c059a4d1>] (tegra_powergate_power_up+0x45/0xec)
 [<c059a4d1>] (tegra_powergate_power_up) from [<c0ba7211>] (tegra_genpd_power_on+0x2b/0x50)
 [<c0ba7211>] (tegra_genpd_power_on) from [<c0667231>] (_genpd_power_on+0x6d/0xb8)
 [<c0667231>] (_genpd_power_on) from [<c066999d>] (genpd_power_on.part.0+0x85/0xf0)
 [<c066999d>] (genpd_power_on.part.0) from [<c0669cfb>] (genpd_runtime_resume+0xa3/0x174)
 [<c0669cfb>] (genpd_runtime_resume) from [<c0660167>] (__rpm_callback+0x7b/0xc8)
 [<c0660167>] (__rpm_callback) from [<c06601cd>] (rpm_callback+0x19/0x60)
 [<c06601cd>] (rpm_callback) from [<c065fde3>] (rpm_resume+0x47f/0x65c)
 [<c065fde3>] (rpm_resume) from [<c066000f>] (__pm_runtime_resume+0x4f/0x78)
 [<c066000f>] (__pm_runtime_resume) from [<c065675f>] (__device_attach+0x83/0xdc)
 [<c065675f>] (__device_attach) from [<c0655d55>] (bus_probe_device+0x5d/0x64)
 [<c0655d55>] (bus_probe_device) from [<c06560b7>] (deferred_probe_work_func+0x63/0x88)
 [<c06560b7>] (deferred_probe_work_func) from [<c0139993>] (process_one_work+0x1eb/0x600)
 [<c0139993>] (process_one_work) from [<c0139fcf>] (worker_thread+0x227/0x3bc)
 [<c0139fcf>] (worker_thread) from [<c0140ab3>] (kthread+0x13f/0x15c)
 [<c0140ab3>] (kthread) from [<c0100159>] (ret_from_fork+0x11/0x38)
 Exception stack(0xc2a31fb0 to 0xc2a31ff8)
Dmitry Osipenko May 24, 2021, 8:25 p.m. UTC | #3
24.05.2021 20:04, Ulf Hansson пишет:
>> +static void tegra_pmc_sync_state(struct device *dev)
>> +{
>> +       int err;
>> +
>> +       pmc->core_domain_state_synced = true;
>> +
>> +       /* this is a no-op if core regulator isn't used */
>> +       mutex_lock(&pmc->powergates_lock);
>> +       err = dev_pm_opp_sync_regulators(dev);
>> +       mutex_unlock(&pmc->powergates_lock);
>> +
>> +       if (err)
>> +               dev_err(dev, "failed to sync regulators: %d\n", err);
>> +}
>> +
> Nitpick.
> 
> Would you mind splitting the "sync_state" thingy out into a separate
> patch on top of $subject patch?
> 
> I think it would be nice, especially since it shares a function via
> include/soc/tegra/common.h - that would make it clear to what part
> that belongs to.
> 

I'll split it in v3. Thank you for the review.
Ulf Hansson May 31, 2021, 1:17 p.m. UTC | #4
On Mon, 24 May 2021 at 22:23, Dmitry Osipenko <digetx@gmail.com> wrote:
>

> 24.05.2021 20:04, Ulf Hansson пишет:

> > On Mon, 24 May 2021 at 01:13, Dmitry Osipenko <digetx@gmail.com> wrote:

> >>

> >> NVIDIA Tegra SoCs have multiple power domains, each domain corresponds

> >> to an external SoC power rail. Core power domain covers vast majority of

> >> hardware blocks within a Tegra SoC. The voltage of a power domain should

> >> be set to a level which satisfies all devices within the power domain.

> >> Add support for the core power domain which controls voltage state of the

> >> domain. This allows us to support system-wide DVFS on Tegra20-210 SoCs.

> >> The PMC powergate domains now are sub-domains of the core domain, this

> >> requires device-tree updating, older DTBs are unaffected.

> >>

> >> Tested-by: Peter Geis <pgwipeout@gmail.com> # Ouya T30

> >> Tested-by: Paul Fertser <fercerpav@gmail.com> # PAZ00 T20

> >> Tested-by: Nicolas Chauvet <kwizart@gmail.com> # PAZ00 T20 and TK1 T124

> >> Tested-by: Matt Merhar <mattmerhar@protonmail.com> # Ouya T30

> >> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>

> >

> > [...]

> >

> >> +

> >> +static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)

> >> +{

> >> +       static struct lock_class_key tegra_core_domain_lock_class;

> >> +       struct generic_pm_domain *genpd;

> >> +       const char *rname = "core";

> >> +       int err;

> >> +

> >> +       genpd = devm_kzalloc(pmc->dev, sizeof(*genpd), GFP_KERNEL);

> >> +       if (!genpd)

> >> +               return -ENOMEM;

> >> +

> >> +       genpd->name = np->name;

> >> +       genpd->set_performance_state = tegra_pmc_core_pd_set_performance_state;

> >> +       genpd->opp_to_performance_state = tegra_pmc_core_pd_opp_to_performance_state;

> >> +

> >> +       err = devm_pm_opp_set_regulators(pmc->dev, &rname, 1);

> >> +       if (err)

> >> +               return dev_err_probe(pmc->dev, err,

> >> +                                    "failed to set core OPP regulator\n");

> >> +

> >> +       err = pm_genpd_init(genpd, NULL, false);

> >> +       if (err) {

> >> +               dev_err(pmc->dev, "failed to init core genpd: %d\n", err);

> >> +               return err;

> >> +       }

> >> +

> >> +       /*

> >> +        * We have a "PMC pwrgate -> Core" hierarchy of the power domains

> >> +        * where PMC needs to resume and change performance (voltage) of the

> >> +        * Core domain from the PMC GENPD on/off callbacks, hence we need

> >> +        * to annotate the lock in order to remove confusion from the

> >> +        * lockdep checker when a nested access happens.

> >> +        */

> >

> > Can you elaborate a bit more on this?

> >

> > Are you saying that when the child domain (PMC pwrgate) gets powered

> > off, you want to drop its aggregated votes it may hold for the

> > performance state, as otherwise it may affect the parent domain (core

> > domain)?

>

> Yes, in particular we want to remove/add the performance vote when clk is disabled/enabled, see tegra_clock_runtime_resume/suspend() of the clk-runtimePM driver [1]. I'll send that clk patch separately once this series and some other tegra-clk patches will be merged, otherwise there are too many dependencies and it's too difficult to review.


You are likely correct from the merging point of view, but for
completeness I would prefer to look at the whole series. Would you
mind folding in some of these changes too?

>

> [1] https://patchwork.ozlabs.org/project/linux-tegra/patch/20201217180638.22748-33-digetx@gmail.com/


Hmm. In general, the new changes to genpd and the opp library with
"performance states" for DVFS, should help to avoid using clock rate
notifications.

Instead of updating the performance votes from the clock provider
driver, the more proper thing would be to let the clock consumer
driver (various drivers) to call dev_pm_opp_set_rate() when it needs
to move to a new OPP. This also means calling dev_pm_opp_set_rate(dev,
0) when the votes for an OPP can be dropped.

In this way, the opp library will call genpd to update the performance
state vote for the corresponding device.

>

> Please see the example lockdep trace in the end of the email that is fixed by the mutex annotation. What we have there is the tegra-host1x device driver that resumes PMC powergate domain on Tegra30, the PMC driver enables clock from the genpd.power_on callback of the powergate domain and this propagates to the clock's RPM callback which sets the performance vote of the core domain. Hence core domain vote is set from within of the powergate domain.


Right, this sounds like a fragile looking path. Are you sure it can't
lead into deadlock situations?

In any case, we designed dev_pm_opp_set_rate() (and its friends in
genpd) with these locking issues in mind.

>

> > I guess this would be a valid scenario to optimize for, especially if

> > you have more than one child domain of the core power domain, right?

> >

> > If you have only one child domain, would it be sufficient to assign

> > ->power_on|off() callbacks for the core domain and deal with the

> > performance stare votes from there instead?

>

> The core domain is the parent domain of the PMC domains + some devices directly belong to the core domain. The GENPD core aggregates the performance votes from the children domains and from devices of the parent core, this all works great already.

>

> It sounds to me that you're suggesting to reinvent the aggregation logic within the PMC driver and create a fake hierarchy that doesn't match hardware. It won't help the lockdep warning anyways.

>

> I actually don't quite understand what problem you're trying to solve, could you please explain? The lockdep warning is harmless, we just need to annotate the mutex lock class.

>

> If you don't feel comfortable with the usage of lockdep_set_class() in the driver, then maybe it should be possible to make it a part of the pm_genpd_init(). For example like we did that for tegra-host1x driver recently [2].


I was not trying to solve a problem, but was just curious and wanted
to ask about the reasons for the lockdep_set_class(), as it simply
caught my attention while reviewing.

Looks like there may be something fishy going on, but let's see, I may be wrong.

>

> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a24f98176d1efae2c37d3438c57a624d530d9c33

>

>

>  LOCKDEP

>  ============================================

>  WARNING: possible recursive locking detected

>  5.13.0-rc3-next-20210524-00202-g80a288f17147-dirty #7935 Tainted: G        W

>  --------------------------------------------

>  kworker/u8:2/96 is trying to acquire lock:

>  c202e494 (&genpd->mlock){+.+.}-{3:3}, at: genpd_runtime_resume+0x95/0x174

>

>                but task is already holding lock:

>  c35d9454 (&genpd->mlock){+.+.}-{3:3}, at: genpd_runtime_resume+0x95/0x174

>

>                other info that might help us debug this:

>   Possible unsafe locking scenario:

>

>         CPU0

>         ----

>    lock(&genpd->mlock);

>    lock(&genpd->mlock);

>

>                 *** DEADLOCK ***

>

>   May be due to missing lock nesting notation

>

>  5 locks held by kworker/u8:2/96:

>   #0: c2024ea8 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x15a/0x600

>   #1: c2a31f20 (deferred_probe_work){+.+.}-{0:0}, at: process_one_work+0x15a/0x600

>   #2: c35f04d8 (&dev->mutex){....}-{3:3}, at: __device_attach+0x29/0xdc

>   #3: c35d9454 (&genpd->mlock){+.+.}-{3:3}, at: genpd_runtime_resume+0x95/0x174

>   #4: c13fbbcc (prepare_lock){+.+.}-{3:3}, at: clk_prepare_lock+0x17/0xac

>

>                stack backtrace:

>  CPU: 0 PID: 96 Comm: kworker/u8:2 Tainted: G        W         5.13.0-rc3-next-20210524-00202-g80a288f17147-dirty #7935

>  Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)

>  Workqueue: events_unbound deferred_probe_work_func

>  [<c010d1cd>] (unwind_backtrace) from [<c0109639>] (show_stack+0x11/0x14)

>  [<c0109639>] (show_stack) from [<c0ba6dab>] (dump_stack_lvl+0x97/0xb0)

>  [<c0ba6dab>] (dump_stack_lvl) from [<c017b24f>] (__lock_acquire+0x7fb/0x2534)

>  [<c017b24f>] (__lock_acquire) from [<c017d75b>] (lock_acquire+0xf3/0x424)

>  [<c017d75b>] (lock_acquire) from [<c0bb0daf>] (__mutex_lock+0x87/0x7f4)

>  [<c0bb0daf>] (__mutex_lock) from [<c0bb1535>] (mutex_lock_nested+0x19/0x20)

>  [<c0bb1535>] (mutex_lock_nested) from [<c0669ced>] (genpd_runtime_resume+0x95/0x174)

>  [<c0669ced>] (genpd_runtime_resume) from [<c0660167>] (__rpm_callback+0x7b/0xc8)

>  [<c0660167>] (__rpm_callback) from [<c06601cd>] (rpm_callback+0x19/0x60)

>  [<c06601cd>] (rpm_callback) from [<c065fde3>] (rpm_resume+0x47f/0x65c)

>  [<c065fde3>] (rpm_resume) from [<c066000f>] (__pm_runtime_resume+0x4f/0x78)

>  [<c066000f>] (__pm_runtime_resume) from [<c05857f7>] (clk_pm_runtime_get.part.0+0x13/0x54)

>  [<c05857f7>] (clk_pm_runtime_get.part.0) from [<c05881e9>] (clk_core_set_rate_nolock+0x81/0x1cc)

>  [<c05881e9>] (clk_core_set_rate_nolock) from [<c0588353>] (clk_set_rate+0x1f/0x44)

>  [<c0588353>] (clk_set_rate) from [<c0597cd3>] (tegra_powergate_prepare_clocks+0x2f/0x94)

>  [<c0597cd3>] (tegra_powergate_prepare_clocks) from [<c059a4d1>] (tegra_powergate_power_up+0x45/0xec)

>  [<c059a4d1>] (tegra_powergate_power_up) from [<c0ba7211>] (tegra_genpd_power_on+0x2b/0x50)

>  [<c0ba7211>] (tegra_genpd_power_on) from [<c0667231>] (_genpd_power_on+0x6d/0xb8)

>  [<c0667231>] (_genpd_power_on) from [<c066999d>] (genpd_power_on.part.0+0x85/0xf0)

>  [<c066999d>] (genpd_power_on.part.0) from [<c0669cfb>] (genpd_runtime_resume+0xa3/0x174)

>  [<c0669cfb>] (genpd_runtime_resume) from [<c0660167>] (__rpm_callback+0x7b/0xc8)

>  [<c0660167>] (__rpm_callback) from [<c06601cd>] (rpm_callback+0x19/0x60)

>  [<c06601cd>] (rpm_callback) from [<c065fde3>] (rpm_resume+0x47f/0x65c)

>  [<c065fde3>] (rpm_resume) from [<c066000f>] (__pm_runtime_resume+0x4f/0x78)

>  [<c066000f>] (__pm_runtime_resume) from [<c065675f>] (__device_attach+0x83/0xdc)

>  [<c065675f>] (__device_attach) from [<c0655d55>] (bus_probe_device+0x5d/0x64)

>  [<c0655d55>] (bus_probe_device) from [<c06560b7>] (deferred_probe_work_func+0x63/0x88)

>  [<c06560b7>] (deferred_probe_work_func) from [<c0139993>] (process_one_work+0x1eb/0x600)

>  [<c0139993>] (process_one_work) from [<c0139fcf>] (worker_thread+0x227/0x3bc)

>  [<c0139fcf>] (worker_thread) from [<c0140ab3>] (kthread+0x13f/0x15c)

>  [<c0140ab3>] (kthread) from [<c0100159>] (ret_from_fork+0x11/0x38)

>  Exception stack(0xc2a31fb0 to 0xc2a31ff8)


Thanks for sharing the log.

Could you perhaps point me to the corresponding DTS files. I would
like to understand more about the PM domain hierarchy and where the
clock controller may be located.

Kind regards
Uffe
Dmitry Osipenko May 31, 2021, 8:07 p.m. UTC | #5
31.05.2021 16:17, Ulf Hansson пишет:
> On Mon, 24 May 2021 at 22:23, Dmitry Osipenko <digetx@gmail.com> wrote:
>>
>> 24.05.2021 20:04, Ulf Hansson пишет:
>>> On Mon, 24 May 2021 at 01:13, Dmitry Osipenko <digetx@gmail.com> wrote:
>>>>
>>>> NVIDIA Tegra SoCs have multiple power domains, each domain corresponds
>>>> to an external SoC power rail. Core power domain covers vast majority of
>>>> hardware blocks within a Tegra SoC. The voltage of a power domain should
>>>> be set to a level which satisfies all devices within the power domain.
>>>> Add support for the core power domain which controls voltage state of the
>>>> domain. This allows us to support system-wide DVFS on Tegra20-210 SoCs.
>>>> The PMC powergate domains now are sub-domains of the core domain, this
>>>> requires device-tree updating, older DTBs are unaffected.
>>>>
>>>> Tested-by: Peter Geis <pgwipeout@gmail.com> # Ouya T30
>>>> Tested-by: Paul Fertser <fercerpav@gmail.com> # PAZ00 T20
>>>> Tested-by: Nicolas Chauvet <kwizart@gmail.com> # PAZ00 T20 and TK1 T124
>>>> Tested-by: Matt Merhar <mattmerhar@protonmail.com> # Ouya T30
>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>
>>> [...]
>>>
>>>> +
>>>> +static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)
>>>> +{
>>>> +       static struct lock_class_key tegra_core_domain_lock_class;
>>>> +       struct generic_pm_domain *genpd;
>>>> +       const char *rname = "core";
>>>> +       int err;
>>>> +
>>>> +       genpd = devm_kzalloc(pmc->dev, sizeof(*genpd), GFP_KERNEL);
>>>> +       if (!genpd)
>>>> +               return -ENOMEM;
>>>> +
>>>> +       genpd->name = np->name;
>>>> +       genpd->set_performance_state = tegra_pmc_core_pd_set_performance_state;
>>>> +       genpd->opp_to_performance_state = tegra_pmc_core_pd_opp_to_performance_state;
>>>> +
>>>> +       err = devm_pm_opp_set_regulators(pmc->dev, &rname, 1);
>>>> +       if (err)
>>>> +               return dev_err_probe(pmc->dev, err,
>>>> +                                    "failed to set core OPP regulator\n");
>>>> +
>>>> +       err = pm_genpd_init(genpd, NULL, false);
>>>> +       if (err) {
>>>> +               dev_err(pmc->dev, "failed to init core genpd: %d\n", err);
>>>> +               return err;
>>>> +       }
>>>> +
>>>> +       /*
>>>> +        * We have a "PMC pwrgate -> Core" hierarchy of the power domains
>>>> +        * where PMC needs to resume and change performance (voltage) of the
>>>> +        * Core domain from the PMC GENPD on/off callbacks, hence we need
>>>> +        * to annotate the lock in order to remove confusion from the
>>>> +        * lockdep checker when a nested access happens.
>>>> +        */
>>>
>>> Can you elaborate a bit more on this?
>>>
>>> Are you saying that when the child domain (PMC pwrgate) gets powered
>>> off, you want to drop its aggregated votes it may hold for the
>>> performance state, as otherwise it may affect the parent domain (core
>>> domain)?
>>
>> Yes, in particular we want to remove/add the performance vote when clk is disabled/enabled, see tegra_clock_runtime_resume/suspend() of the clk-runtimePM driver [1]. I'll send that clk patch separately once this series and some other tegra-clk patches will be merged, otherwise there are too many dependencies and it's too difficult to review.
> 
> You are likely correct from the merging point of view, but for
> completeness I would prefer to look at the whole series. Would you
> mind folding in some of these changes too?

Sure, I will send those changes once this fundamental patchset will be merged. 

>> [1] https://patchwork.ozlabs.org/project/linux-tegra/patch/20201217180638.22748-33-digetx@gmail.com/
> 
> Hmm. In general, the new changes to genpd and the opp library with
> "performance states" for DVFS, should help to avoid using clock rate
> notifications.
> 
> Instead of updating the performance votes from the clock provider
> driver, the more proper thing would be to let the clock consumer
> driver (various drivers) to call dev_pm_opp_set_rate() when it needs
> to move to a new OPP. This also means calling dev_pm_opp_set_rate(dev,
> 0) when the votes for an OPP can be dropped.
> 
> In this way, the opp library will call genpd to update the performance
> state vote for the corresponding device.

This is not sufficient for Tegra because we have individual OPP tables for the root PLLs, system clocks and device clocks. The device clocks could be muxed to a different PLLs, depending on clk requirements of a particular board.

>> Please see the example lockdep trace in the end of the email that is fixed by the mutex annotation. What we have there is the tegra-host1x device driver that resumes PMC powergate domain on Tegra30, the PMC driver enables clock from the genpd.power_on callback of the powergate domain and this propagates to the clock's RPM callback which sets the performance vote of the core domain. Hence core domain vote is set from within of the powergate domain.
> 
> Right, this sounds like a fragile looking path. Are you sure it can't
> lead into deadlock situations?

The locking was a concern, but turned out that it works nicely for the Tegra clk/power tree. This approach may not be suitable for other SoCs. 

> In any case, we designed dev_pm_opp_set_rate() (and its friends in
> genpd) with these locking issues in mind.

The device drivers don't manage the parent clocks directly and OPP core doesn't support this use-case where OPP needs to be applied to a generic/parent PLL clock. Moving the OPP management to the clk driver is the easy solution which works good in practice for Tegra, it also removes a need to switch each driver to dev_pm_opp_set_rate() usage.

>>> I guess this would be a valid scenario to optimize for, especially if
>>> you have more than one child domain of the core power domain, right?
>>>
>>> If you have only one child domain, would it be sufficient to assign
>>> ->power_on|off() callbacks for the core domain and deal with the
>>> performance stare votes from there instead?
>>
>> The core domain is the parent domain of the PMC domains + some devices directly belong to the core domain. The GENPD core aggregates the performance votes from the children domains and from devices of the parent core, this all works great already.
>>
>> It sounds to me that you're suggesting to reinvent the aggregation logic within the PMC driver and create a fake hierarchy that doesn't match hardware. It won't help the lockdep warning anyways.
>>
>> I actually don't quite understand what problem you're trying to solve, could you please explain? The lockdep warning is harmless, we just need to annotate the mutex lock class.
>>
>> If you don't feel comfortable with the usage of lockdep_set_class() in the driver, then maybe it should be possible to make it a part of the pm_genpd_init(). For example like we did that for tegra-host1x driver recently [2].
> 
> I was not trying to solve a problem, but was just curious and wanted
> to ask about the reasons for the lockdep_set_class(), as it simply
> caught my attention while reviewing.
> 
> Looks like there may be something fishy going on, but let's see, I may be wrong.
> 
>>
>> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a24f98176d1efae2c37d3438c57a624d530d9c33
>>
>>
>>  LOCKDEP
>>  ============================================
...
> 
> Thanks for sharing the log.
> 
> Could you perhaps point me to the corresponding DTS files. I would
> like to understand more about the PM domain hierarchy and where the
> clock controller may be located.

https://github.com/grate-driver/linux/tree/master has all the latest versions of patches applied.

For example please see clock@60006000 and pmc@7000e400 nodes of [1].

[1] https://github.com/grate-driver/linux/blob/master/arch/arm/boot/dts/tegra30.dtsi

This is how output of pm_genpd_summary looks like on Tegra30 Nexus 7, where performance=voltage:

domain                          status          children                           performance
    /device                                             runtime status
----------------------------------------------------------------------------------------------
heg                             on                                                 1000000
    /devices/soc0/50000000.host1x                       active                     1000000
    /devices/soc0/50000000.host1x/540c0000.epp          suspended                  0
    /devices/soc0/50000000.host1x/54140000.gr2d         suspended                  0
mpe                             off-0                                              0
    /devices/soc0/50000000.host1x/54040000.mpe          suspended                  0
vdec                            off-0                                              0
    /devices/soc0/6001a000.vde                          suspended                  0
venc                            off-0                                              0
    /devices/soc0/50000000.host1x/54080000.vi           suspended                  0
    /devices/soc0/50000000.host1x/54100000.isp          suspended                  0
3d1                             off-0                                              0
    /devices/genpd:1:54180000.gr3d                      suspended                  0
3d0                             off-0                                              0
    /devices/genpd:0:54180000.gr3d                      suspended                  0
core-domain                     on                                                 1000000
                                                3d0, 3d1, venc, vdec, mpe, heg
    /devices/soc0/7000f400.memory-controller            unsupported                1000000
    /devices/platform/tegra_clk_pll_c                   active                     1000000
    /devices/platform/tegra_clk_pll_e                   suspended                  0
    /devices/platform/tegra_clk_pll_m                   active                     1000000
    /devices/platform/tegra_clk_sclk                    active                     1000000
    /devices/platform/tegra_clk_dsia                    suspended                  0
    /devices/platform/tegra_clk_dsib                    suspended                  0
    /devices/platform/tegra_clk_spdif_out               suspended                  0
    /devices/platform/tegra_clk_se                      suspended                  0
    /devices/platform/tegra_clk_hdmi                    suspended                  0
    /devices/platform/tegra_clk_pwm                     active                     1000000
    /devices/platform/tegra_clk_pcie                    suspended                  0
    /devices/platform/tegra_clk_afi                     suspended                  0
    /devices/platform/tegra_clk_vde                     suspended                  0
    /devices/platform/tegra_clk_vi                      suspended                  0
    /devices/platform/tegra_clk_epp                     suspended                  0
    /devices/platform/tegra_clk_mpe                     suspended                  0
    /devices/platform/tegra_clk_nor                     suspended                  0
    /devices/platform/tegra_clk_sdmmc1                  suspended                  0
    /devices/platform/tegra_clk_sdmmc3                  active                     950000
    /devices/platform/tegra_clk_mipi                    suspended                  0
    /devices/platform/tegra_clk_sbc1                    suspended                  0
    /devices/platform/tegra_clk_sbc2                    suspended                  0
    /devices/platform/tegra_clk_sbc3                    suspended                  0
    /devices/platform/tegra_clk_sbc4                    suspended                  0
    /devices/platform/tegra_clk_sbc5                    suspended                  0
    /devices/platform/tegra_clk_sbc6                    suspended                  0
    /devices/platform/tegra_clk_cve                     suspended                  0
    /devices/platform/tegra_clk_tvo                     suspended                  0
    /devices/platform/tegra_clk_tvdac                   suspended                  0
    /devices/platform/tegra_clk_ndflash                 suspended                  0
    /devices/platform/tegra_clk_sata_oob                suspended                  0
    /devices/platform/tegra_clk_sata                    suspended                  0
    /devices/platform/tegra_clk_fuse_burn               suspended                  0
    /devices/platform/tegra_clk_usbd                    active                     1000000
    /devices/platform/tegra_clk_usb2                    suspended                  0
    /devices/platform/tegra_clk_usb3                    suspended                  0
    /devices/soc0/50000000.host1x/54200000.dc           active                     1000000
    /devices/soc0/50000000.host1x/54240000.dc           suspended                  0
Dmitry Osipenko June 1, 2021, 3:48 p.m. UTC | #6
01.06.2021 13:19, Ulf Hansson пишет:
...
>> This is not sufficient for Tegra because we have individual OPP tables for the root PLLs, system clocks and device clocks. The device clocks could be muxed to a different PLLs, depending on clk requirements of a particular board.

> 

> Are you saying that the clock providers for the "root PLLs" and

> "system clocks" have OPP tables themselves? If so, would you mind

> posting a patch for an updated DT binding for these changes, so it can

> be discussed separately?


I will post all those patches soon, thank you.

...
>> The device drivers don't manage the parent clocks directly and OPP core doesn't support this use-case where OPP needs to be applied to a generic/parent PLL clock. Moving the OPP management to the clk driver is the easy solution which works good in practice for Tegra, it also removes a need to switch each driver to dev_pm_opp_set_rate() usage.

> 

> I admit, if clock consumer drivers could avoid calling

> dev_pm_opp_set_rate|opp(), that would be nice. But, as I stated, it's

> a fragile path from locking point of view, to call

> dev_pm_opp_set_rate|opp() from a clock provider driver. Personally, I

> think it's better to avoid it.

> 

> More importantly, you also need to convince the clock subsystem

> maintainers, that setting an OPP internally from the clock provider

> driver is a good idea. As far as I can tell, they have said *no* to

> this, since the common clock framework was invented, I believe for

> good reasons.


Pushing the OPP into a CCF driver is indeed not ideal. I'm open to new
ideas. I will post those patches where we could discuss this in a more
details.

...
>> For example please see clock@60006000 and pmc@7000e400 nodes of [1].

>>

>> [1] https://github.com/grate-driver/linux/blob/master/arch/arm/boot/dts/tegra30.dtsi

> 

> Thanks, that certainly helped me understand better!

> 

> I see that you want to add OPP tables to clock provider nodes. As I

> said above, an updated DT binding is probably a good idea to discuss

> separately.

...
> 

> Okay, to not stall things from moving forward, may I suggest that you

> simply drop the call to lockdep_set_class() (and the corresponding

> comment) for now.

> 

> Then you can continue to post the next parts - and if it turns out

> that lockdep_set_class() becomes needed, you can always add it back

> then.


Thank you very much for helping with reviewing this all. I'll drop the
lockdep_set_class() and post the v7 shortly. Afterwards, I'll send the
rest of clk, device-tree and etc related patches targeting 5.15.
diff mbox series

Patch

diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig
index 976dee036470..7057254604ee 100644
--- a/drivers/soc/tegra/Kconfig
+++ b/drivers/soc/tegra/Kconfig
@@ -13,6 +13,7 @@  config ARCH_TEGRA_2x_SOC
 	select PINCTRL_TEGRA20
 	select PL310_ERRATA_727915 if CACHE_L2X0
 	select PL310_ERRATA_769419 if CACHE_L2X0
+	select SOC_TEGRA_COMMON
 	select SOC_TEGRA_FLOWCTRL
 	select SOC_TEGRA_PMC
 	select SOC_TEGRA20_VOLTAGE_COUPLER
@@ -27,6 +28,7 @@  config ARCH_TEGRA_3x_SOC
 	select ARM_ERRATA_764369 if SMP
 	select PINCTRL_TEGRA30
 	select PL310_ERRATA_769419 if CACHE_L2X0
+	select SOC_TEGRA_COMMON
 	select SOC_TEGRA_FLOWCTRL
 	select SOC_TEGRA_PMC
 	select SOC_TEGRA30_VOLTAGE_COUPLER
@@ -40,6 +42,7 @@  config ARCH_TEGRA_114_SOC
 	select ARM_ERRATA_798181 if SMP
 	select HAVE_ARM_ARCH_TIMER
 	select PINCTRL_TEGRA114
+	select SOC_TEGRA_COMMON
 	select SOC_TEGRA_FLOWCTRL
 	select SOC_TEGRA_PMC
 	select TEGRA_TIMER
@@ -51,6 +54,7 @@  config ARCH_TEGRA_124_SOC
 	bool "Enable support for Tegra124 family"
 	select HAVE_ARM_ARCH_TIMER
 	select PINCTRL_TEGRA124
+	select SOC_TEGRA_COMMON
 	select SOC_TEGRA_FLOWCTRL
 	select SOC_TEGRA_PMC
 	select TEGRA_TIMER
@@ -66,6 +70,7 @@  if ARM64
 config ARCH_TEGRA_132_SOC
 	bool "NVIDIA Tegra132 SoC"
 	select PINCTRL_TEGRA124
+	select SOC_TEGRA_COMMON
 	select SOC_TEGRA_FLOWCTRL
 	select SOC_TEGRA_PMC
 	help
@@ -77,6 +82,7 @@  config ARCH_TEGRA_132_SOC
 config ARCH_TEGRA_210_SOC
 	bool "NVIDIA Tegra210 SoC"
 	select PINCTRL_TEGRA210
+	select SOC_TEGRA_COMMON
 	select SOC_TEGRA_FLOWCTRL
 	select SOC_TEGRA_PMC
 	select TEGRA_TIMER
@@ -99,6 +105,7 @@  config ARCH_TEGRA_186_SOC
 	select TEGRA_BPMP
 	select TEGRA_HSP_MBOX
 	select TEGRA_IVC
+	select SOC_TEGRA_COMMON
 	select SOC_TEGRA_PMC
 	help
 	  Enable support for the NVIDIA Tegar186 SoC. The Tegra186 features a
@@ -115,6 +122,7 @@  config ARCH_TEGRA_194_SOC
 	select TEGRA_BPMP
 	select TEGRA_HSP_MBOX
 	select TEGRA_IVC
+	select SOC_TEGRA_COMMON
 	select SOC_TEGRA_PMC
 	help
 	  Enable support for the NVIDIA Tegra194 SoC.
@@ -125,6 +133,7 @@  config ARCH_TEGRA_234_SOC
 	select TEGRA_BPMP
 	select TEGRA_HSP_MBOX
 	select TEGRA_IVC
+	select SOC_TEGRA_COMMON
 	select SOC_TEGRA_PMC
 	help
 	  Enable support for the NVIDIA Tegra234 SoC.
@@ -132,6 +141,11 @@  config ARCH_TEGRA_234_SOC
 endif
 endif
 
+config SOC_TEGRA_COMMON
+	bool
+	select PM_OPP
+	select PM_GENERIC_DOMAINS
+
 config SOC_TEGRA_FUSE
 	def_bool y
 	depends on ARCH_TEGRA
diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c
index 8e3b78bb2ac2..33b6e0885020 100644
--- a/drivers/soc/tegra/pmc.c
+++ b/drivers/soc/tegra/pmc.c
@@ -38,6 +38,7 @@ 
 #include <linux/pinctrl/pinctrl.h>
 #include <linux/platform_device.h>
 #include <linux/pm_domain.h>
+#include <linux/pm_opp.h>
 #include <linux/reboot.h>
 #include <linux/regmap.h>
 #include <linux/reset.h>
@@ -428,6 +429,8 @@  struct tegra_pmc {
 	struct irq_chip irq;
 
 	struct notifier_block clk_nb;
+
+	bool core_domain_state_synced;
 };
 
 static struct tegra_pmc *pmc = &(struct tegra_pmc) {
@@ -1302,12 +1305,115 @@  static int tegra_powergate_add(struct tegra_pmc *pmc, struct device_node *np)
 	return err;
 }
 
+bool tegra_soc_core_domain_state_synced(void)
+{
+	return pmc->core_domain_state_synced;
+}
+
+static int
+tegra_pmc_core_pd_set_performance_state(struct generic_pm_domain *genpd,
+					unsigned int level)
+{
+	struct dev_pm_opp *opp;
+	int err;
+
+	opp = dev_pm_opp_find_level_ceil(&genpd->dev, &level);
+	if (IS_ERR(opp)) {
+		dev_err(&genpd->dev, "failed to find OPP for level %u: %pe\n",
+			level, opp);
+		return PTR_ERR(opp);
+	}
+
+	mutex_lock(&pmc->powergates_lock);
+	err = dev_pm_opp_set_opp(pmc->dev, opp);
+	mutex_unlock(&pmc->powergates_lock);
+
+	dev_pm_opp_put(opp);
+
+	if (err) {
+		dev_err(&genpd->dev, "failed to set voltage to %duV: %d\n",
+			level, err);
+		return err;
+	}
+
+	return 0;
+}
+
+static unsigned int
+tegra_pmc_core_pd_opp_to_performance_state(struct generic_pm_domain *genpd,
+					   struct dev_pm_opp *opp)
+{
+	return dev_pm_opp_get_level(opp);
+}
+
+static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)
+{
+	static struct lock_class_key tegra_core_domain_lock_class;
+	struct generic_pm_domain *genpd;
+	const char *rname = "core";
+	int err;
+
+	genpd = devm_kzalloc(pmc->dev, sizeof(*genpd), GFP_KERNEL);
+	if (!genpd)
+		return -ENOMEM;
+
+	genpd->name = np->name;
+	genpd->set_performance_state = tegra_pmc_core_pd_set_performance_state;
+	genpd->opp_to_performance_state = tegra_pmc_core_pd_opp_to_performance_state;
+
+	err = devm_pm_opp_set_regulators(pmc->dev, &rname, 1);
+	if (err)
+		return dev_err_probe(pmc->dev, err,
+				     "failed to set core OPP regulator\n");
+
+	err = pm_genpd_init(genpd, NULL, false);
+	if (err) {
+		dev_err(pmc->dev, "failed to init core genpd: %d\n", err);
+		return err;
+	}
+
+	/*
+	 * We have a "PMC pwrgate -> Core" hierarchy of the power domains
+	 * where PMC needs to resume and change performance (voltage) of the
+	 * Core domain from the PMC GENPD on/off callbacks, hence we need
+	 * to annotate the lock in order to remove confusion from the
+	 * lockdep checker when a nested access happens.
+	 */
+	lockdep_set_class(&genpd->mlock, &tegra_core_domain_lock_class);
+
+	err = of_genpd_add_provider_simple(np, genpd);
+	if (err) {
+		dev_err(pmc->dev, "failed to add core genpd: %d\n", err);
+		goto remove_genpd;
+	}
+
+	return 0;
+
+remove_genpd:
+	pm_genpd_remove(genpd);
+
+	return err;
+}
+
 static int tegra_powergate_init(struct tegra_pmc *pmc,
 				struct device_node *parent)
 {
+	struct of_phandle_args child_args, parent_args;
 	struct device_node *np, *child;
 	int err = 0;
 
+	/*
+	 * Core power domain is the parent of powergate domains, hence it
+	 * should be registered first.
+	 */
+	np = of_get_child_by_name(parent, "core-domain");
+	if (np) {
+		err = tegra_pmc_core_pd_add(pmc, np);
+		of_node_put(np);
+		if (err)
+			return err;
+	}
+
 	np = of_get_child_by_name(parent, "powergates");
 	if (!np)
 		return 0;
@@ -1318,6 +1424,21 @@  static int tegra_powergate_init(struct tegra_pmc *pmc,
 			of_node_put(child);
 			break;
 		}
+
+		if (of_parse_phandle_with_args(child, "power-domains",
+					       "#power-domain-cells",
+					       0, &parent_args))
+			continue;
+
+		child_args.np = child;
+		child_args.args_count = 0;
+
+		err = of_genpd_add_subdomain(&parent_args, &child_args);
+		of_node_put(parent_args.np);
+		if (err) {
+			of_node_put(child);
+			break;
+		}
 	}
 
 	of_node_put(np);
@@ -1361,6 +1482,12 @@  static void tegra_powergate_remove_all(struct device_node *parent)
 	}
 
 	of_node_put(np);
+
+	np = of_get_child_by_name(parent, "core-domain");
+	if (np) {
+		of_genpd_del_provider(np);
+		of_genpd_remove_last(np);
+	}
 }
 
 static const struct tegra_io_pad_soc *
@@ -3672,6 +3799,21 @@  static const struct of_device_id tegra_pmc_match[] = {
 	{ }
 };
 
+static void tegra_pmc_sync_state(struct device *dev)
+{
+	int err;
+
+	pmc->core_domain_state_synced = true;
+
+	/* this is a no-op if core regulator isn't used */
+	mutex_lock(&pmc->powergates_lock);
+	err = dev_pm_opp_sync_regulators(dev);
+	mutex_unlock(&pmc->powergates_lock);
+
+	if (err)
+		dev_err(dev, "failed to sync regulators: %d\n", err);
+}
+
 static struct platform_driver tegra_pmc_driver = {
 	.driver = {
 		.name = "tegra-pmc",
@@ -3680,6 +3822,7 @@  static struct platform_driver tegra_pmc_driver = {
 #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_ARM)
 		.pm = &tegra_pmc_pm_ops,
 #endif
+		.sync_state = tegra_pmc_sync_state,
 	},
 	.probe = tegra_pmc_probe,
 };
diff --git a/include/soc/tegra/common.h b/include/soc/tegra/common.h
index af41ad80ec21..135a6956a18c 100644
--- a/include/soc/tegra/common.h
+++ b/include/soc/tegra/common.h
@@ -23,6 +23,8 @@  struct tegra_core_opp_params {
 #ifdef CONFIG_ARCH_TEGRA
 bool soc_is_tegra(void);
 
+bool tegra_soc_core_domain_state_synced(void);
+
 int devm_tegra_core_dev_init_opp_table(struct device *dev,
 				       struct tegra_core_opp_params *params);
 #else
@@ -31,6 +33,11 @@  static inline bool soc_is_tegra(void)
 	return false;
 }
 
+static inline bool tegra_soc_core_domain_state_synced(void)
+{
+	return false;
+}
+
 static inline int
 devm_tegra_core_dev_init_opp_table(struct device *dev,
 				   struct tegra_core_opp_params *params)