diff mbox series

[v7,02/37] soc/tegra: pmc: Implement attach_dev() of power domain drivers

Message ID 20210701232728.23591-3-digetx@gmail.com
State New
Headers show
Series NVIDIA Tegra power management patches for 5.15 | expand

Commit Message

Dmitry Osipenko July 1, 2021, 11:26 p.m. UTC
Implement attach_dev() callback of power domain drivers that initializes
the domain's performance state. GENPD core will apply the performance
state on the first runtime PM resume of the attached device.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/soc/tegra/pmc.c | 147 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

Comments

Ulf Hansson Aug. 2, 2021, 2:48 p.m. UTC | #1
On Fri, 2 Jul 2021 at 01:28, Dmitry Osipenko <digetx@gmail.com> wrote:
>

> Implement attach_dev() callback of power domain drivers that initializes

> the domain's performance state. GENPD core will apply the performance

> state on the first runtime PM resume of the attached device.

>

> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>

> ---

>  drivers/soc/tegra/pmc.c | 147 ++++++++++++++++++++++++++++++++++++++++

>  1 file changed, 147 insertions(+)

>

> diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c

> index f63dfb2ca3f9..ebafb818b08e 100644

> --- a/drivers/soc/tegra/pmc.c

> +++ b/drivers/soc/tegra/pmc.c

> @@ -506,6 +506,151 @@ static void tegra_pmc_scratch_writel(struct tegra_pmc *pmc, u32 value,

>                 writel(value, pmc->scratch + offset);

>  }

>

> +static const char * const tegra_emc_compats[] = {

> +       "nvidia,tegra20-emc",

> +       "nvidia,tegra30-emc",

> +       NULL,

> +};

> +

> +/*

> + * This GENPD callback is used by both powergate and core domains.

> + *

> + * We retrieve clock rate of the attached device and initialize domain's

> + * performance state in accordance to the clock rate.

> + */

> +static int tegra_pmc_pd_attach_dev(struct generic_pm_domain *genpd,

> +                                  struct device *dev)

> +{

> +       struct generic_pm_domain_data *gpd_data = dev_gpd_data(dev);

> +       struct opp_table *opp_table, *pd_opp_table;

> +       struct generic_pm_domain *core_genpd;

> +       struct dev_pm_opp *opp, *pd_opp;

> +       unsigned long rate, state;

> +       struct gpd_link *link;

> +       struct clk *clk;

> +       u32 hw_version;

> +       int ret;

> +

> +       /*

> +        * Tegra114+ SocS don't support OPP yet.  But if they will get OPP

> +        * support, then we want to skip OPP for older kernels to preserve

> +        * compatibility of newer DTBs with older kernels.

> +        */

> +       if (!pmc->soc->supports_core_domain)

> +               return 0;

> +

> +       /*

> +        * The EMC devices are a special case because we have a protection

> +        * from non-EMC drivers getting clock handle before EMC driver is

> +        * fully initialized.  The goal of the protection is to prevent

> +        * devfreq driver from getting failures if it will try to change

> +        * EMC clock rate until clock is fully initialized.  The EMC drivers

> +        * will initialize the performance state by themselves.

> +        */

> +       if (of_device_compatible_match(dev->of_node, tegra_emc_compats))

> +               return 0;

> +

> +       clk = clk_get(dev, NULL);

> +       if (IS_ERR(clk)) {

> +               dev_err(&genpd->dev, "failed to get clk of %s: %pe\n",

> +                       dev_name(dev), clk);

> +               return PTR_ERR(clk);

> +       }

> +

> +       rate = clk_get_rate(clk);

> +       if (!rate) {

> +               dev_err(&genpd->dev, "failed to get clk rate of %s\n",

> +                       dev_name(dev));

> +               ret = -EINVAL;

> +               goto put_clk;

> +       }

> +

> +       if (of_machine_is_compatible("nvidia,tegra20"))

> +               hw_version = BIT(tegra_sku_info.soc_process_id);

> +       else

> +               hw_version = BIT(tegra_sku_info.soc_speedo_id);

> +

> +       opp_table = dev_pm_opp_set_supported_hw(dev, &hw_version, 1);

> +       if (IS_ERR(opp_table)) {

> +               dev_err(&genpd->dev, "failed to set OPP supported HW for %s: %d\n",

> +                       dev_name(dev), ret);

> +               ret = PTR_ERR(opp_table);

> +               goto put_clk;

> +       }

> +

> +       ret = dev_pm_opp_of_add_table(dev);

> +       if (ret) {

> +               /* older DTBs that don't have OPPs will get -ENODEV here */

> +               if (ret != -ENODEV)

> +                       dev_err(&genpd->dev, "failed to get OPP table of %s: %d\n",

> +                               dev_name(dev), ret);

> +               else

> +                       ret = 0;

> +

> +               goto put_supported_hw;

> +       }

> +

> +       /* find suitable OPP for the rate */

> +       opp = dev_pm_opp_find_freq_ceil(dev, &rate);

> +

> +       if (opp == ERR_PTR(-ERANGE))

> +               opp = dev_pm_opp_find_freq_floor(dev, &rate);

> +

> +       if (IS_ERR(opp)) {

> +               dev_err(&genpd->dev, "failed to find OPP for %luHz of %s: %pe\n",

> +                       rate, dev_name(dev), opp);

> +               ret = PTR_ERR(opp);

> +               goto remove_dev_table;

> +       }

> +

> +       if (!list_empty(&genpd->child_links)) {

> +               link = list_first_entry(&genpd->child_links, struct gpd_link,

> +                                       child_node);

> +               core_genpd = link->parent;

> +       } else {

> +               core_genpd = genpd;

> +       }


This looks a bit odd to me. A genpd provider shouldn't need to walk
these links as these are considered internals to genpd. Normally this
needs lockings, etc.

Why exactly do you need this?

> +

> +       pd_opp_table = dev_pm_opp_get_opp_table(&core_genpd->dev);

> +       if (IS_ERR(pd_opp_table)) {

> +               dev_err(&genpd->dev, "failed to get OPP table of %s: %pe\n",

> +                       dev_name(&core_genpd->dev), pd_opp_table);

> +               ret = PTR_ERR(pd_opp_table);

> +               goto put_dev_opp;

> +       }

> +

> +       pd_opp = dev_pm_opp_xlate_required_opp(opp_table, pd_opp_table, opp);

> +       if (IS_ERR(pd_opp)) {

> +               dev_err(&genpd->dev,

> +                       "failed to xlate required OPP for %luHz of %s: %pe\n",

> +                       rate, dev_name(dev), pd_opp);

> +               ret = PTR_ERR(pd_opp);

> +               goto put_pd_opp_table;

> +       }

> +

> +       /*

> +        * The initialized state will be applied by GENPD core on the first

> +        * RPM-resume of the device.  This means that drivers don't need to

> +        * explicitly initialize performance state.

> +        */

> +       state = pm_genpd_opp_to_performance_state(&core_genpd->dev, pd_opp);

> +       gpd_data->rpm_pstate = state;


Could the above be replaced with Rajendra's suggestion [1], which
changes genpd to internally during attach, to set a default
performance state when there is a "required-opp" specified in the
device  node?

[...]

Kind regards
Uffe

[1]
https://lkml.org/lkml/2021/7/20/99
Dmitry Osipenko Aug. 2, 2021, 6:23 p.m. UTC | #2
02.08.2021 17:48, Ulf Hansson пишет:
...
>> +       if (!list_empty(&genpd->child_links)) {

>> +               link = list_first_entry(&genpd->child_links, struct gpd_link,

>> +                                       child_node);

>> +               core_genpd = link->parent;

>> +       } else {

>> +               core_genpd = genpd;

>> +       }

> 

> This looks a bit odd to me. A genpd provider shouldn't need to walk

> these links as these are considered internals to genpd. Normally this

> needs lockings, etc.

> 

> Why exactly do you need this?


We have a chain of PMC domain -> core domain, both domains are created
and liked together by this PMC driver. Devices are attached to either
PMC domain or to core domain. PMC domain doesn't handle the performance
changes, performance requests go down to the core domain.

This is needed in order to translate the device's OPP into performance
state of the core domain, based on the domain to which device is attached.

>> +

>> +       pd_opp_table = dev_pm_opp_get_opp_table(&core_genpd->dev);

>> +       if (IS_ERR(pd_opp_table)) {

>> +               dev_err(&genpd->dev, "failed to get OPP table of %s: %pe\n",

>> +                       dev_name(&core_genpd->dev), pd_opp_table);

>> +               ret = PTR_ERR(pd_opp_table);

>> +               goto put_dev_opp;

>> +       }

>> +

>> +       pd_opp = dev_pm_opp_xlate_required_opp(opp_table, pd_opp_table, opp);

>> +       if (IS_ERR(pd_opp)) {

>> +               dev_err(&genpd->dev,

>> +                       "failed to xlate required OPP for %luHz of %s: %pe\n",

>> +                       rate, dev_name(dev), pd_opp);

>> +               ret = PTR_ERR(pd_opp);

>> +               goto put_pd_opp_table;

>> +       }

>> +

>> +       /*

>> +        * The initialized state will be applied by GENPD core on the first

>> +        * RPM-resume of the device.  This means that drivers don't need to

>> +        * explicitly initialize performance state.

>> +        */

>> +       state = pm_genpd_opp_to_performance_state(&core_genpd->dev, pd_opp);

>> +       gpd_data->rpm_pstate = state;

> 

> Could the above be replaced with Rajendra's suggestion [1], which

> changes genpd to internally during attach, to set a default

> performance state when there is a "required-opp" specified in the

> device  node?


It's not a "static" performance level here, but any level depending on
h/w state left from bootloader and etc. The performance level
corresponds to the voltage of the core domain, hence we need to
initialize the voltage vote before device is resumed.
Ulf Hansson Aug. 4, 2021, 9:59 a.m. UTC | #3
On Mon, 2 Aug 2021 at 20:23, Dmitry Osipenko <digetx@gmail.com> wrote:
>

> 02.08.2021 17:48, Ulf Hansson пишет:

> ...

> >> +       if (!list_empty(&genpd->child_links)) {

> >> +               link = list_first_entry(&genpd->child_links, struct gpd_link,

> >> +                                       child_node);

> >> +               core_genpd = link->parent;

> >> +       } else {

> >> +               core_genpd = genpd;

> >> +       }

> >

> > This looks a bit odd to me. A genpd provider shouldn't need to walk

> > these links as these are considered internals to genpd. Normally this

> > needs lockings, etc.

> >

> > Why exactly do you need this?

>

> We have a chain of PMC domain -> core domain, both domains are created

> and liked together by this PMC driver. Devices are attached to either

> PMC domain or to core domain. PMC domain doesn't handle the performance

> changes, performance requests go down to the core domain.


Did I get this right? The core domain is the parent to the PMC domain?

>

> This is needed in order to translate the device's OPP into performance

> state of the core domain, based on the domain to which device is attached.


So, the PMC domain doesn't have an OPP table associated with it, but
some of its attached devices may still have available OPPs, which
should be managed through the parent domain (core domain). Correct?

Is there a DT patch in the series that I can look at that shows how
this is encoded?

Hmm, I have the feeling that we should try to manage in some generic
way in genpd, rather than having to deal with it here.

>

> >> +

> >> +       pd_opp_table = dev_pm_opp_get_opp_table(&core_genpd->dev);

> >> +       if (IS_ERR(pd_opp_table)) {

> >> +               dev_err(&genpd->dev, "failed to get OPP table of %s: %pe\n",

> >> +                       dev_name(&core_genpd->dev), pd_opp_table);

> >> +               ret = PTR_ERR(pd_opp_table);

> >> +               goto put_dev_opp;

> >> +       }

> >> +

> >> +       pd_opp = dev_pm_opp_xlate_required_opp(opp_table, pd_opp_table, opp);

> >> +       if (IS_ERR(pd_opp)) {

> >> +               dev_err(&genpd->dev,

> >> +                       "failed to xlate required OPP for %luHz of %s: %pe\n",

> >> +                       rate, dev_name(dev), pd_opp);

> >> +               ret = PTR_ERR(pd_opp);

> >> +               goto put_pd_opp_table;

> >> +       }

> >> +

> >> +       /*

> >> +        * The initialized state will be applied by GENPD core on the first

> >> +        * RPM-resume of the device.  This means that drivers don't need to

> >> +        * explicitly initialize performance state.

> >> +        */

> >> +       state = pm_genpd_opp_to_performance_state(&core_genpd->dev, pd_opp);

> >> +       gpd_data->rpm_pstate = state;

> >

> > Could the above be replaced with Rajendra's suggestion [1], which

> > changes genpd to internally during attach, to set a default

> > performance state when there is a "required-opp" specified in the

> > device  node?

>

> It's not a "static" performance level here, but any level depending on

> h/w state left from bootloader and etc. The performance level

> corresponds to the voltage of the core domain, hence we need to

> initialize the voltage vote before device is resumed.


Why not let the driver deal with this instead? It should be able to
probe its device, no matter what state the bootloader has put the
device into.

To me, it sounds like a call to dev_pm_genpd_set_performance_state()
(perhaps via dev_pm_opp_set_opp() or dev_pm_opp_set_rate()) from the
driver itself, should be sufficient?

I understand that it means the domain may change the OPP during boot,
without respecting a vote for a device that has not been probed yet.
But is there a problem with this?

Kind regards
Uffe
Dmitry Osipenko Aug. 4, 2021, 9:16 p.m. UTC | #4
04.08.2021 12:59, Ulf Hansson пишет:
> On Mon, 2 Aug 2021 at 20:23, Dmitry Osipenko <digetx@gmail.com> wrote:

>>

>> 02.08.2021 17:48, Ulf Hansson пишет:

>> ...

>>>> +       if (!list_empty(&genpd->child_links)) {

>>>> +               link = list_first_entry(&genpd->child_links, struct gpd_link,

>>>> +                                       child_node);

>>>> +               core_genpd = link->parent;

>>>> +       } else {

>>>> +               core_genpd = genpd;

>>>> +       }

>>>

>>> This looks a bit odd to me. A genpd provider shouldn't need to walk

>>> these links as these are considered internals to genpd. Normally this

>>> needs lockings, etc.

>>>

>>> Why exactly do you need this?

>>

>> We have a chain of PMC domain -> core domain, both domains are created

>> and liked together by this PMC driver. Devices are attached to either

>> PMC domain or to core domain. PMC domain doesn't handle the performance

>> changes, performance requests go down to the core domain.

> 

> Did I get this right? The core domain is the parent to the PMC domain?


You got this right.

>>

>> This is needed in order to translate the device's OPP into performance

>> state of the core domain, based on the domain to which device is attached.

> 

> So, the PMC domain doesn't have an OPP table associated with it, but

> some of its attached devices may still have available OPPs, which

> should be managed through the parent domain (core domain). Correct?


Yes, the OPPs are specified only for the core domain.

> Is there a DT patch in the series that I can look at that shows how

> this is encoded?


See patches which are adding domains and OPPs to DTs:

https://patchwork.ozlabs.org/project/linux-tegra/patch/20210701232728.23591-34-digetx@gmail.com/

https://patchwork.ozlabs.org/project/linux-tegra/patch/20210701232728.23591-35-digetx@gmail.com/

> Hmm, I have the feeling that we should try to manage in some generic

> way in genpd, rather than having to deal with it here.


Still it requires a platform-specific knowledge. It could be some new
genpd hook for the initialization. But I don't know what other platforms
may want to initialize, so it's not clear to me how to make it generic.

>>>> +

>>>> +       pd_opp_table = dev_pm_opp_get_opp_table(&core_genpd->dev);

>>>> +       if (IS_ERR(pd_opp_table)) {

>>>> +               dev_err(&genpd->dev, "failed to get OPP table of %s: %pe\n",

>>>> +                       dev_name(&core_genpd->dev), pd_opp_table);

>>>> +               ret = PTR_ERR(pd_opp_table);

>>>> +               goto put_dev_opp;

>>>> +       }

>>>> +

>>>> +       pd_opp = dev_pm_opp_xlate_required_opp(opp_table, pd_opp_table, opp);

>>>> +       if (IS_ERR(pd_opp)) {

>>>> +               dev_err(&genpd->dev,

>>>> +                       "failed to xlate required OPP for %luHz of %s: %pe\n",

>>>> +                       rate, dev_name(dev), pd_opp);

>>>> +               ret = PTR_ERR(pd_opp);

>>>> +               goto put_pd_opp_table;

>>>> +       }

>>>> +

>>>> +       /*

>>>> +        * The initialized state will be applied by GENPD core on the first

>>>> +        * RPM-resume of the device.  This means that drivers don't need to

>>>> +        * explicitly initialize performance state.

>>>> +        */

>>>> +       state = pm_genpd_opp_to_performance_state(&core_genpd->dev, pd_opp);

>>>> +       gpd_data->rpm_pstate = state;

>>>

>>> Could the above be replaced with Rajendra's suggestion [1], which

>>> changes genpd to internally during attach, to set a default

>>> performance state when there is a "required-opp" specified in the

>>> device  node?

>>

>> It's not a "static" performance level here, but any level depending on

>> h/w state left from bootloader and etc. The performance level

>> corresponds to the voltage of the core domain, hence we need to

>> initialize the voltage vote before device is resumed.

> 

> Why not let the driver deal with this instead? It should be able to

> probe its device, no matter what state the bootloader has put the

> device into.

> 

> To me, it sounds like a call to dev_pm_genpd_set_performance_state()

> (perhaps via dev_pm_opp_set_opp() or dev_pm_opp_set_rate()) from the

> driver itself, should be sufficient?


We did that in a previous versions of this series where drivers were
calling devm_tegra_core_dev_init_opp_table() helper during the probe to
initialize performance state of the domain. Moving OPP state
initialization into central place made drivers cleaner by removing the
boilerplate code.

I can revert back to the previous variant, although this variant works
well too.

> I understand that it means the domain may change the OPP during boot,

> without respecting a vote for a device that has not been probed yet.

> But is there a problem with this?


Domains themselves don't change OPP, there is no problem with that. The
point is to have cleaner code in the drivers.
Ulf Hansson Aug. 9, 2021, 2:15 p.m. UTC | #5
On Wed, 4 Aug 2021 at 23:17, Dmitry Osipenko <digetx@gmail.com> wrote:
>

> 04.08.2021 12:59, Ulf Hansson пишет:

> > On Mon, 2 Aug 2021 at 20:23, Dmitry Osipenko <digetx@gmail.com> wrote:

> >>

> >> 02.08.2021 17:48, Ulf Hansson пишет:

> >> ...

> >>>> +       if (!list_empty(&genpd->child_links)) {

> >>>> +               link = list_first_entry(&genpd->child_links, struct gpd_link,

> >>>> +                                       child_node);

> >>>> +               core_genpd = link->parent;

> >>>> +       } else {

> >>>> +               core_genpd = genpd;

> >>>> +       }

> >>>

> >>> This looks a bit odd to me. A genpd provider shouldn't need to walk

> >>> these links as these are considered internals to genpd. Normally this

> >>> needs lockings, etc.

> >>>

> >>> Why exactly do you need this?

> >>

> >> We have a chain of PMC domain -> core domain, both domains are created

> >> and liked together by this PMC driver. Devices are attached to either

> >> PMC domain or to core domain. PMC domain doesn't handle the performance

> >> changes, performance requests go down to the core domain.

> >

> > Did I get this right? The core domain is the parent to the PMC domain?

>

> You got this right.

>

> >>

> >> This is needed in order to translate the device's OPP into performance

> >> state of the core domain, based on the domain to which device is attached.

> >

> > So, the PMC domain doesn't have an OPP table associated with it, but

> > some of its attached devices may still have available OPPs, which

> > should be managed through the parent domain (core domain). Correct?

>

> Yes, the OPPs are specified only for the core domain.

>

> > Is there a DT patch in the series that I can look at that shows how

> > this is encoded?

>

> See patches which are adding domains and OPPs to DTs:

>

> https://patchwork.ozlabs.org/project/linux-tegra/patch/20210701232728.23591-34-digetx@gmail.com/

>

> https://patchwork.ozlabs.org/project/linux-tegra/patch/20210701232728.23591-35-digetx@gmail.com/


Thanks, this looks sane to me!

>

> > Hmm, I have the feeling that we should try to manage in some generic

> > way in genpd, rather than having to deal with it here.

>

> Still it requires a platform-specific knowledge. It could be some new

> genpd hook for the initialization. But I don't know what other platforms

> may want to initialize, so it's not clear to me how to make it generic.


Right. We may need some more thinking around this, if needed at all (see below).

>

> >>>> +

> >>>> +       pd_opp_table = dev_pm_opp_get_opp_table(&core_genpd->dev);

> >>>> +       if (IS_ERR(pd_opp_table)) {

> >>>> +               dev_err(&genpd->dev, "failed to get OPP table of %s: %pe\n",

> >>>> +                       dev_name(&core_genpd->dev), pd_opp_table);

> >>>> +               ret = PTR_ERR(pd_opp_table);

> >>>> +               goto put_dev_opp;

> >>>> +       }

> >>>> +

> >>>> +       pd_opp = dev_pm_opp_xlate_required_opp(opp_table, pd_opp_table, opp);

> >>>> +       if (IS_ERR(pd_opp)) {

> >>>> +               dev_err(&genpd->dev,

> >>>> +                       "failed to xlate required OPP for %luHz of %s: %pe\n",

> >>>> +                       rate, dev_name(dev), pd_opp);

> >>>> +               ret = PTR_ERR(pd_opp);

> >>>> +               goto put_pd_opp_table;

> >>>> +       }

> >>>> +

> >>>> +       /*

> >>>> +        * The initialized state will be applied by GENPD core on the first

> >>>> +        * RPM-resume of the device.  This means that drivers don't need to

> >>>> +        * explicitly initialize performance state.

> >>>> +        */

> >>>> +       state = pm_genpd_opp_to_performance_state(&core_genpd->dev, pd_opp);

> >>>> +       gpd_data->rpm_pstate = state;

> >>>

> >>> Could the above be replaced with Rajendra's suggestion [1], which

> >>> changes genpd to internally during attach, to set a default

> >>> performance state when there is a "required-opp" specified in the

> >>> device  node?

> >>

> >> It's not a "static" performance level here, but any level depending on

> >> h/w state left from bootloader and etc. The performance level

> >> corresponds to the voltage of the core domain, hence we need to

> >> initialize the voltage vote before device is resumed.

> >

> > Why not let the driver deal with this instead? It should be able to

> > probe its device, no matter what state the bootloader has put the

> > device into.

> >

> > To me, it sounds like a call to dev_pm_genpd_set_performance_state()

> > (perhaps via dev_pm_opp_set_opp() or dev_pm_opp_set_rate()) from the

> > driver itself, should be sufficient?

>

> We did that in a previous versions of this series where drivers were

> calling devm_tegra_core_dev_init_opp_table() helper during the probe to

> initialize performance state of the domain. Moving OPP state

> initialization into central place made drivers cleaner by removing the

> boilerplate code.


I am not against doing this in a central place, like $subject patch
suggests. As a matter of fact, it makes perfect sense to me.

However, what I am concerned about, is that you require to use genpd
internal data structures to do it. I think we should try to avoid
that.

>

> I can revert back to the previous variant, although this variant works

> well too.


I looked at that code and in that path we end up calling
dev_pm_opp_set_rate(), after it has initialized the opp table for the
device.

Rather than doing the OF parsing above to find out the current state
for the device, why can't you just call dev_pm_opp_set_rate() to
initialize a proper vote instead?

>

> > I understand that it means the domain may change the OPP during boot,

> > without respecting a vote for a device that has not been probed yet.

> > But is there a problem with this?

>

> Domains themselves don't change OPP, there is no problem with that. The

> point is to have cleaner code in the drivers.


Alright, this makes sense to me as well.

Kind regards
Uffe
Dmitry Osipenko Aug. 9, 2021, 11:56 p.m. UTC | #6
09.08.2021 17:15, Ulf Hansson пишет:
>> We did that in a previous versions of this series where drivers were

>> calling devm_tegra_core_dev_init_opp_table() helper during the probe to

>> initialize performance state of the domain. Moving OPP state

>> initialization into central place made drivers cleaner by removing the

>> boilerplate code.

> I am not against doing this in a central place, like $subject patch

> suggests. As a matter of fact, it makes perfect sense to me.

> 

> However, what I am concerned about, is that you require to use genpd

> internal data structures to do it. I think we should try to avoid

> that.


Alright, what do you think about this:

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index a934c679e6ce..5faed62075e9 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2669,12 +2669,37 @@ static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev,
 	dev->pm_domain->detach = genpd_dev_pm_detach;
 	dev->pm_domain->sync = genpd_dev_pm_sync;
 
+	if (pd->default_performance_state) {
+		unsigned int default_pstate;
+
+		ret = pd->default_performance_state(pd, dev);
+		if (ret < 0) {
+			dev_err(dev, "failed to get default performance state for PM domain %s: %d\n",
+				pd->name, ret);
+			goto out;
+		}
+
+		default_pstate = ret;
+
+		if (power_on) {
+			ret = dev_pm_genpd_set_performance_state(dev, default_pstate);
+			if (ret) {
+				dev_err(dev, "failed to set default performance state %u for PM domain %s: %d\n",
+					default_pstate, pd->name, ret);
+				goto out;
+			}
+		} else {
+			dev_gpd_data(dev)->rpm_pstate = default_pstate;
+		}
+	}
+
 	if (power_on) {
 		genpd_lock(pd);
 		ret = genpd_power_on(pd, 0);
 		genpd_unlock(pd);
 	}
 
+out:
 	if (ret)
 		genpd_remove_device(pd, dev);
 
diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c
index 81d1f019fa0c..9efb55f52462 100644
--- a/drivers/soc/tegra/pmc.c
+++ b/drivers/soc/tegra/pmc.c
@@ -518,15 +518,14 @@ static const char * const tegra_emc_compats[] = {
  * We retrieve clock rate of the attached device and initialize domain's
  * performance state in accordance to the clock rate.
  */
-static int tegra_pmc_pd_attach_dev(struct generic_pm_domain *genpd,
-				   struct device *dev)
+static int tegra_pmc_genpd_default_perf_state(struct generic_pm_domain *genpd,
+					      struct device *dev)
 {
-	struct generic_pm_domain_data *gpd_data = dev_gpd_data(dev);
 	struct opp_table *opp_table, *pd_opp_table;
 	struct generic_pm_domain *core_genpd;
 	struct dev_pm_opp *opp, *pd_opp;
-	unsigned long rate, state;
 	struct gpd_link *link;
+	unsigned long rate;
 	struct clk *clk;
 	u32 hw_version;
 	int ret;
@@ -633,8 +632,7 @@ static int tegra_pmc_pd_attach_dev(struct generic_pm_domain *genpd,
 	 * RPM-resume of the device.  This means that drivers don't need to
 	 * explicitly initialize performance state.
 	 */
-	state = pm_genpd_opp_to_performance_state(&core_genpd->dev, pd_opp);
-	gpd_data->rpm_pstate = state;
+	ret = pm_genpd_opp_to_performance_state(&core_genpd->dev, pd_opp);
 	dev_pm_opp_put(pd_opp);
 
 put_pd_opp_table:
@@ -1383,7 +1381,7 @@ static int tegra_powergate_add(struct tegra_pmc *pmc, struct device_node *np)
 
 	pg->id = id;
 	pg->genpd.name = np->name;
-	pg->genpd.attach_dev = tegra_pmc_pd_attach_dev;
+	pg->genpd.default_performance_state = tegra_pmc_genpd_default_perf_state;
 	pg->genpd.power_off = tegra_genpd_power_off;
 	pg->genpd.power_on = tegra_genpd_power_on;
 	pg->pmc = pmc;
@@ -1500,7 +1498,7 @@ static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)
 		return -ENOMEM;
 
 	genpd->name = np->name;
-	genpd->attach_dev = tegra_pmc_pd_attach_dev;
+	genpd->default_performance_state = tegra_pmc_genpd_default_perf_state;
 	genpd->set_performance_state = tegra_pmc_core_pd_set_performance_state;
 	genpd->opp_to_performance_state = tegra_pmc_core_pd_opp_to_performance_state;
 
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 21a0577305ef..cd4867817ca5 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -143,6 +143,8 @@ struct generic_pm_domain {
 			  struct device *dev);
 	void (*detach_dev)(struct generic_pm_domain *domain,
 			   struct device *dev);
+	int (*default_performance_state)(struct generic_pm_domain *domain,
+					 struct device *dev);
 	unsigned int flags;		/* Bit field of configs for genpd */
 	struct genpd_power_state *states;
 	void (*free_states)(struct genpd_power_state *states,

>> I can revert back to the previous variant, although this variant works

>> well too.

> I looked at that code and in that path we end up calling

> dev_pm_opp_set_rate(), after it has initialized the opp table for the

> device.

> 

> Rather than doing the OF parsing above to find out the current state

> for the device, why can't you just call dev_pm_opp_set_rate() to

> initialize a proper vote instead?

> 


For some devices clock rate is either preset by bootloader, or by clk driver, or by assigned-clocks in a device-tree. And then I don't see what's the difference in comparison to initialization for the current rate.

For some devices, like memory controller, we can't just change the clock rate because it's a complex procedure and some boards will use fixed rate, but the power vote still must be initialized.
Ulf Hansson Aug. 10, 2021, 10:51 a.m. UTC | #7
On Tue, 10 Aug 2021 at 01:56, Dmitry Osipenko <digetx@gmail.com> wrote:
>

> 09.08.2021 17:15, Ulf Hansson пишет:

> >> We did that in a previous versions of this series where drivers were

> >> calling devm_tegra_core_dev_init_opp_table() helper during the probe to

> >> initialize performance state of the domain. Moving OPP state

> >> initialization into central place made drivers cleaner by removing the

> >> boilerplate code.

> > I am not against doing this in a central place, like $subject patch

> > suggests. As a matter of fact, it makes perfect sense to me.

> >

> > However, what I am concerned about, is that you require to use genpd

> > internal data structures to do it. I think we should try to avoid

> > that.

>

> Alright, what do you think about this:

>

> diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c

> index a934c679e6ce..5faed62075e9 100644

> --- a/drivers/base/power/domain.c

> +++ b/drivers/base/power/domain.c

> @@ -2669,12 +2669,37 @@ static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev,

>         dev->pm_domain->detach = genpd_dev_pm_detach;

>         dev->pm_domain->sync = genpd_dev_pm_sync;

>

> +       if (pd->default_performance_state) {

> +               unsigned int default_pstate;

> +

> +               ret = pd->default_performance_state(pd, dev);

> +               if (ret < 0) {

> +                       dev_err(dev, "failed to get default performance state for PM domain %s: %d\n",

> +                               pd->name, ret);

> +                       goto out;

> +               }


Adding a new callback seems reasonable to support this.

> +

> +               default_pstate = ret;

> +

> +               if (power_on) {

> +                       ret = dev_pm_genpd_set_performance_state(dev, default_pstate);


However, this is more questionable to me.

First, I don't think we should care about whether this is "power_on"
or not. At this point, performance states are treated orthogonal to
idle states in genpd. We may decide to change that in some way, but
that deserves a different change.

Second, I don't think we should call
dev_pm_genpd_set_performance_state() from here. It's probably better
handled from the genpd callback itself, if/when needed.

That said, perhaps the new callback should just return a regular error
code and zero on success, rather than the current performance state.
See more below.

> +                       if (ret) {

> +                               dev_err(dev, "failed to set default performance state %u for PM domain %s: %d\n",

> +                                       default_pstate, pd->name, ret);

> +                               goto out;

> +                       }

> +               } else {

> +                       dev_gpd_data(dev)->rpm_pstate = default_pstate;


No, this isn't the right thing to do.

It looks like you are trying to use the ->rpm_pstate for
synchronization with runtime PM for consumer drivers. This is fragile
as it depends on the runtime PM deployment in the consumer driver. I
think you should look at ->rpm_pstate as a variable solely for
managing save/restore of the performance state for the device, during
runtime suspend/resume in genpd.

Synchronization of a vote for a performance state for a device, needs
to be managed by calling dev_pm_genpd_set_performance_state() - or by
calling an OPP function that calls it, like dev_pm_opp_set_rate(), for
example.

> +               }

> +       }

> +

>         if (power_on) {

>                 genpd_lock(pd);

>                 ret = genpd_power_on(pd, 0);

>                 genpd_unlock(pd);

>         }

>

> +out:

>         if (ret)

>                 genpd_remove_device(pd, dev);

>

> diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c

> index 81d1f019fa0c..9efb55f52462 100644

> --- a/drivers/soc/tegra/pmc.c

> +++ b/drivers/soc/tegra/pmc.c

> @@ -518,15 +518,14 @@ static const char * const tegra_emc_compats[] = {

>   * We retrieve clock rate of the attached device and initialize domain's

>   * performance state in accordance to the clock rate.

>   */

> -static int tegra_pmc_pd_attach_dev(struct generic_pm_domain *genpd,

> -                                  struct device *dev)

> +static int tegra_pmc_genpd_default_perf_state(struct generic_pm_domain *genpd,

> +                                             struct device *dev)

>  {

> -       struct generic_pm_domain_data *gpd_data = dev_gpd_data(dev);

>         struct opp_table *opp_table, *pd_opp_table;

>         struct generic_pm_domain *core_genpd;

>         struct dev_pm_opp *opp, *pd_opp;

> -       unsigned long rate, state;

>         struct gpd_link *link;

> +       unsigned long rate;

>         struct clk *clk;

>         u32 hw_version;

>         int ret;

> @@ -633,8 +632,7 @@ static int tegra_pmc_pd_attach_dev(struct generic_pm_domain *genpd,

>          * RPM-resume of the device.  This means that drivers don't need to

>          * explicitly initialize performance state.

>          */

> -       state = pm_genpd_opp_to_performance_state(&core_genpd->dev, pd_opp);

> -       gpd_data->rpm_pstate = state;

> +       ret = pm_genpd_opp_to_performance_state(&core_genpd->dev, pd_opp);


I don't see how this avoids tegra_pmc_genpd_default_perf_state() from
having to walk &genpd->child_links.

That's still an issue, right?

>         dev_pm_opp_put(pd_opp);

>

>  put_pd_opp_table:

> @@ -1383,7 +1381,7 @@ static int tegra_powergate_add(struct tegra_pmc *pmc, struct device_node *np)

>

>         pg->id = id;

>         pg->genpd.name = np->name;

> -       pg->genpd.attach_dev = tegra_pmc_pd_attach_dev;

> +       pg->genpd.default_performance_state = tegra_pmc_genpd_default_perf_state;

>         pg->genpd.power_off = tegra_genpd_power_off;

>         pg->genpd.power_on = tegra_genpd_power_on;

>         pg->pmc = pmc;

> @@ -1500,7 +1498,7 @@ static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)

>                 return -ENOMEM;

>

>         genpd->name = np->name;

> -       genpd->attach_dev = tegra_pmc_pd_attach_dev;

> +       genpd->default_performance_state = tegra_pmc_genpd_default_perf_state;

>         genpd->set_performance_state = tegra_pmc_core_pd_set_performance_state;

>         genpd->opp_to_performance_state = tegra_pmc_core_pd_opp_to_performance_state;

>

> diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h

> index 21a0577305ef..cd4867817ca5 100644

> --- a/include/linux/pm_domain.h

> +++ b/include/linux/pm_domain.h

> @@ -143,6 +143,8 @@ struct generic_pm_domain {

>                           struct device *dev);

>         void (*detach_dev)(struct generic_pm_domain *domain,

>                            struct device *dev);

> +       int (*default_performance_state)(struct generic_pm_domain *domain,

> +                                        struct device *dev);

>         unsigned int flags;             /* Bit field of configs for genpd */

>         struct genpd_power_state *states;

>         void (*free_states)(struct genpd_power_state *states,

>

> >> I can revert back to the previous variant, although this variant works

> >> well too.

> > I looked at that code and in that path we end up calling

> > dev_pm_opp_set_rate(), after it has initialized the opp table for the

> > device.

> >

> > Rather than doing the OF parsing above to find out the current state

> > for the device, why can't you just call dev_pm_opp_set_rate() to

> > initialize a proper vote instead?

> >

>

> For some devices clock rate is either preset by bootloader, or by clk driver, or by assigned-clocks in a device-tree. And then I don't see what's the difference in comparison to initialization for the current rate.

>

> For some devices, like memory controller, we can't just change the clock rate because it's a complex procedure and some boards will use fixed rate, but the power vote still must be initialized.


I am not saying you should change the clock rate. The current code
path that runs via devm_tegra_core_dev_init_opp_table() just calls
clk_get_rate and then dev_pm_opp_set_rate() with the current rate to
vote for the corresponding OPP level. Right?

Isn't this exactly what you want? No?

Kind regards
Uffe
Dmitry Osipenko Aug. 11, 2021, 7:30 p.m. UTC | #8
10.08.2021 13:51, Ulf Hansson пишет:
...
>> +               if (power_on) {

>> +                       ret = dev_pm_genpd_set_performance_state(dev, default_pstate);

> 

> However, this is more questionable to me.

> 

> First, I don't think we should care about whether this is "power_on"

> or not. At this point, performance states are treated orthogonal to

> idle states in genpd. We may decide to change that in some way, but

> that deserves a different change.


Okay

> Second, I don't think we should call

> dev_pm_genpd_set_performance_state() from here. It's probably better

> handled from the genpd callback itself, if/when needed.

Indeed

> That said, perhaps the new callback should just return a regular error

> code and zero on success, rather than the current performance state.

> See more below.

> 

>> +                       if (ret) {

>> +                               dev_err(dev, "failed to set default performance state %u for PM domain %s: %d\n",

>> +                                       default_pstate, pd->name, ret);

>> +                               goto out;

>> +                       }

>> +               } else {

>> +                       dev_gpd_data(dev)->rpm_pstate = default_pstate;

> 

> No, this isn't the right thing to do.

> 

> It looks like you are trying to use the ->rpm_pstate for

> synchronization with runtime PM for consumer drivers. This is fragile

> as it depends on the runtime PM deployment in the consumer driver. I

> think you should look at ->rpm_pstate as a variable solely for

> managing save/restore of the performance state for the device, during

> runtime suspend/resume in genpd.

> 

> Synchronization of a vote for a performance state for a device, needs

> to be managed by calling dev_pm_genpd_set_performance_state() - or by

> calling an OPP function that calls it, like dev_pm_opp_set_rate(), for

> example.


The !power_on case should be skipped at all because common core code
can't handle it properly, hence rpm_pstate shouldn't be touched. I
realized it only after sending out this email.

...
>>> Rather than doing the OF parsing above to find out the current state

>>> for the device, why can't you just call dev_pm_opp_set_rate() to

>>> initialize a proper vote instead?

>>>

>>

>> For some devices clock rate is either preset by bootloader, or by clk driver, or by assigned-clocks in a device-tree. And then I don't see what's the difference in comparison to initialization for the current rate.

>>

>> For some devices, like memory controller, we can't just change the clock rate because it's a complex procedure and some boards will use fixed rate, but the power vote still must be initialized.

> 

> I am not saying you should change the clock rate. The current code

> path that runs via devm_tegra_core_dev_init_opp_table() just calls

> clk_get_rate and then dev_pm_opp_set_rate() with the current rate to

> vote for the corresponding OPP level. Right?

> 

> Isn't this exactly what you want? No?


I see now what you meant, it's actually a simpler variant and it works
too. Thank you for the suggestion, I'll prepare v8.
Dmitry Osipenko Aug. 11, 2021, 10:41 p.m. UTC | #9
11.08.2021 22:30, Dmitry Osipenko пишет:
> 10.08.2021 13:51, Ulf Hansson пишет:

> ...

>>> +               if (power_on) {

>>> +                       ret = dev_pm_genpd_set_performance_state(dev, default_pstate);

>>

>> However, this is more questionable to me.

>>

>> First, I don't think we should care about whether this is "power_on"

>> or not. At this point, performance states are treated orthogonal to

>> idle states in genpd. We may decide to change that in some way, but

>> that deserves a different change.

> 

> Okay

> 

>> Second, I don't think we should call

>> dev_pm_genpd_set_performance_state() from here. It's probably better

>> handled from the genpd callback itself, if/when needed.

> Indeed

> 

>> That said, perhaps the new callback should just return a regular error

>> code and zero on success, rather than the current performance state.

>> See more below.

>>

>>> +                       if (ret) {

>>> +                               dev_err(dev, "failed to set default performance state %u for PM domain %s: %d\n",

>>> +                                       default_pstate, pd->name, ret);

>>> +                               goto out;

>>> +                       }

>>> +               } else {

>>> +                       dev_gpd_data(dev)->rpm_pstate = default_pstate;

>>

>> No, this isn't the right thing to do.

>>

>> It looks like you are trying to use the ->rpm_pstate for

>> synchronization with runtime PM for consumer drivers. This is fragile

>> as it depends on the runtime PM deployment in the consumer driver. I

>> think you should look at ->rpm_pstate as a variable solely for

>> managing save/restore of the performance state for the device, during

>> runtime suspend/resume in genpd.

>>

>> Synchronization of a vote for a performance state for a device, needs

>> to be managed by calling dev_pm_genpd_set_performance_state() - or by

>> calling an OPP function that calls it, like dev_pm_opp_set_rate(), for

>> example.

> 

> The !power_on case should be skipped at all because common core code

> can't handle it properly, hence rpm_pstate shouldn't be touched. I

> realized it only after sending out this email.

> 

> ...

>>>> Rather than doing the OF parsing above to find out the current state

>>>> for the device, why can't you just call dev_pm_opp_set_rate() to

>>>> initialize a proper vote instead?

>>>>

>>>

>>> For some devices clock rate is either preset by bootloader, or by clk driver, or by assigned-clocks in a device-tree. And then I don't see what's the difference in comparison to initialization for the current rate.

>>>

>>> For some devices, like memory controller, we can't just change the clock rate because it's a complex procedure and some boards will use fixed rate, but the power vote still must be initialized.

>>

>> I am not saying you should change the clock rate. The current code

>> path that runs via devm_tegra_core_dev_init_opp_table() just calls

>> clk_get_rate and then dev_pm_opp_set_rate() with the current rate to

>> vote for the corresponding OPP level. Right?

>>

>> Isn't this exactly what you want? No?

> 

> I see now what you meant, it's actually a simpler variant and it works

> too. Thank you for the suggestion, I'll prepare v8.

> 


My bad, it doesn't work at all. I actually need to use the rpm_pstate or
something else because performance state is coupled with the enable
state of the device. If device is never rpm-suspended by consumer
driver, then the initialized performance state is never dropped. Hence I
want to initialize the state which is set only when device is resumed.

I'll need to think more about it.
Dmitry Osipenko Aug. 12, 2021, 1:40 a.m. UTC | #10
12.08.2021 01:41, Dmitry Osipenko пишет:
>>> I am not saying you should change the clock rate. The current code

>>> path that runs via devm_tegra_core_dev_init_opp_table() just calls

>>> clk_get_rate and then dev_pm_opp_set_rate() with the current rate to

>>> vote for the corresponding OPP level. Right?

>>>

>>> Isn't this exactly what you want? No?

>> I see now what you meant, it's actually a simpler variant and it works

>> too. Thank you for the suggestion, I'll prepare v8.

>>

> My bad, it doesn't work at all. I actually need to use the rpm_pstate or

> something else because performance state is coupled with the enable

> state of the device. If device is never rpm-suspended by consumer

> driver, then the initialized performance state is never dropped. Hence I

> want to initialize the state which is set only when device is resumed.

> 

> I'll need to think more about it.


GENPD core has these false assumptions:

1. It assumes that by default all devices are at zero performance level
at a boot time. This is not true for Tegra because hardware is
pre-initialized independently from GENPD.

2. It assumes that nothing depends on performance level and devices can
operate at any level at any time. Not true for Tegra and other platforms
where performance level is coupled with clocks state of attached
devices. OPP framework glues clock and performance level together for
us, which works good so far.

Hence I either need to patch every driver to use dev_pm_opp_set_rate in
order to sync clk rate with the perf level at device runtime, or I need
to preset the rpm perf level to allow GENPD core to set the level before
device is resumed.
Ulf Hansson Aug. 12, 2021, 11:17 a.m. UTC | #11
On Thu, 12 Aug 2021 at 03:40, Dmitry Osipenko <digetx@gmail.com> wrote:
>

> 12.08.2021 01:41, Dmitry Osipenko пишет:

> >>> I am not saying you should change the clock rate. The current code

> >>> path that runs via devm_tegra_core_dev_init_opp_table() just calls

> >>> clk_get_rate and then dev_pm_opp_set_rate() with the current rate to

> >>> vote for the corresponding OPP level. Right?

> >>>

> >>> Isn't this exactly what you want? No?

> >> I see now what you meant, it's actually a simpler variant and it works

> >> too. Thank you for the suggestion, I'll prepare v8.

> >>

> > My bad, it doesn't work at all. I actually need to use the rpm_pstate or

> > something else because performance state is coupled with the enable

> > state of the device. If device is never rpm-suspended by consumer

> > driver, then the initialized performance state is never dropped. Hence I

> > want to initialize the state which is set only when device is resumed.

> >

> > I'll need to think more about it.

>

> GENPD core has these false assumptions:

>

> 1. It assumes that by default all devices are at zero performance level

> at a boot time. This is not true for Tegra because hardware is

> pre-initialized independently from GENPD.


Right, which is similar to other SoCs.

>

> 2. It assumes that nothing depends on performance level and devices can

> operate at any level at any time. Not true for Tegra and other platforms

> where performance level is coupled with clocks state of attached

> devices. OPP framework glues clock and performance level together for

> us, which works good so far.


Right, OPPs need to be managed differently depending on the SoC.
That's why genpd is there to help and to model this as "performance
states" and to allow operations to be set through SoC specific
callabacks, for example.

More importantly, the assumption is that in general, consumer drivers
should use the OPP library to vote/set OPP levels, they shouldn't call
dev_pm_genpd_set_performance_state() - unless they know exactly what
they are doing.

>

> Hence I either need to patch every driver to use dev_pm_opp_set_rate in

> order to sync clk rate with the perf level at device runtime, or I need

> to preset the rpm perf level to allow GENPD core to set the level before

> device is resumed.


When the device is getting attached to its genpd (during the probe
sequence and for a single PM domain case), runtime PM is disabled for
the device. If you would call dev_pm_opp_set_rate() from a genpd
callback during attach (without changing the rate), this means you
would update/synchronize the vote. In this way, the vote is set before
the device is runtime resumed by the driver, right?

On the other hand, patching the driver should also be quite simple and
you need to do that anyways, don't you?

Kind regards
Uffe
Dmitry Osipenko Aug. 12, 2021, 4:24 p.m. UTC | #12
12.08.2021 14:17, Ulf Hansson пишет:
> On Thu, 12 Aug 2021 at 03:40, Dmitry Osipenko <digetx@gmail.com> wrote:

>>

>> 12.08.2021 01:41, Dmitry Osipenko пишет:

>>>>> I am not saying you should change the clock rate. The current code

>>>>> path that runs via devm_tegra_core_dev_init_opp_table() just calls

>>>>> clk_get_rate and then dev_pm_opp_set_rate() with the current rate to

>>>>> vote for the corresponding OPP level. Right?

>>>>>

>>>>> Isn't this exactly what you want? No?

>>>> I see now what you meant, it's actually a simpler variant and it works

>>>> too. Thank you for the suggestion, I'll prepare v8.

>>>>

>>> My bad, it doesn't work at all. I actually need to use the rpm_pstate or

>>> something else because performance state is coupled with the enable

>>> state of the device. If device is never rpm-suspended by consumer

>>> driver, then the initialized performance state is never dropped. Hence I

>>> want to initialize the state which is set only when device is resumed.

>>>

>>> I'll need to think more about it.

>>

>> GENPD core has these false assumptions:

>>

>> 1. It assumes that by default all devices are at zero performance level

>> at a boot time. This is not true for Tegra because hardware is

>> pre-initialized independently from GENPD.

> 

> Right, which is similar to other SoCs.

> 

>>

>> 2. It assumes that nothing depends on performance level and devices can

>> operate at any level at any time. Not true for Tegra and other platforms

>> where performance level is coupled with clocks state of attached

>> devices. OPP framework glues clock and performance level together for

>> us, which works good so far.

> 

> Right, OPPs need to be managed differently depending on the SoC.

> That's why genpd is there to help and to model this as "performance

> states" and to allow operations to be set through SoC specific

> callabacks, for example.

> 

> More importantly, the assumption is that in general, consumer drivers

> should use the OPP library to vote/set OPP levels, they shouldn't call

> dev_pm_genpd_set_performance_state() - unless they know exactly what

> they are doing.

> 

>>

>> Hence I either need to patch every driver to use dev_pm_opp_set_rate in

>> order to sync clk rate with the perf level at device runtime, or I need

>> to preset the rpm perf level to allow GENPD core to set the level before

>> device is resumed.

> 

> When the device is getting attached to its genpd (during the probe

> sequence and for a single PM domain case), runtime PM is disabled for

> the device. If you would call dev_pm_opp_set_rate() from a genpd

> callback during attach (without changing the rate), this means you

> would update/synchronize the vote. In this way, the vote is set before

> the device is runtime resumed by the driver, right?


I was doing that previously, but then realized that for some devices the
state needs to be synced only once device is rpm-resumed. Making RPM
driver to perform the syncing worked well.

> On the other hand, patching the driver should also be quite simple and

> you need to do that anyways, don't you?


It will be extra boilerplate code in the drivers, but perhaps it's the
best option for today, I'll consider it for v8. Thank you for helping
with the review.
diff mbox series

Patch

diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c
index f63dfb2ca3f9..ebafb818b08e 100644
--- a/drivers/soc/tegra/pmc.c
+++ b/drivers/soc/tegra/pmc.c
@@ -506,6 +506,151 @@  static void tegra_pmc_scratch_writel(struct tegra_pmc *pmc, u32 value,
 		writel(value, pmc->scratch + offset);
 }
 
+static const char * const tegra_emc_compats[] = {
+	"nvidia,tegra20-emc",
+	"nvidia,tegra30-emc",
+	NULL,
+};
+
+/*
+ * This GENPD callback is used by both powergate and core domains.
+ *
+ * We retrieve clock rate of the attached device and initialize domain's
+ * performance state in accordance to the clock rate.
+ */
+static int tegra_pmc_pd_attach_dev(struct generic_pm_domain *genpd,
+				   struct device *dev)
+{
+	struct generic_pm_domain_data *gpd_data = dev_gpd_data(dev);
+	struct opp_table *opp_table, *pd_opp_table;
+	struct generic_pm_domain *core_genpd;
+	struct dev_pm_opp *opp, *pd_opp;
+	unsigned long rate, state;
+	struct gpd_link *link;
+	struct clk *clk;
+	u32 hw_version;
+	int ret;
+
+	/*
+	 * Tegra114+ SocS don't support OPP yet.  But if they will get OPP
+	 * support, then we want to skip OPP for older kernels to preserve
+	 * compatibility of newer DTBs with older kernels.
+	 */
+	if (!pmc->soc->supports_core_domain)
+		return 0;
+
+	/*
+	 * The EMC devices are a special case because we have a protection
+	 * from non-EMC drivers getting clock handle before EMC driver is
+	 * fully initialized.  The goal of the protection is to prevent
+	 * devfreq driver from getting failures if it will try to change
+	 * EMC clock rate until clock is fully initialized.  The EMC drivers
+	 * will initialize the performance state by themselves.
+	 */
+	if (of_device_compatible_match(dev->of_node, tegra_emc_compats))
+		return 0;
+
+	clk = clk_get(dev, NULL);
+	if (IS_ERR(clk)) {
+		dev_err(&genpd->dev, "failed to get clk of %s: %pe\n",
+			dev_name(dev), clk);
+		return PTR_ERR(clk);
+	}
+
+	rate = clk_get_rate(clk);
+	if (!rate) {
+		dev_err(&genpd->dev, "failed to get clk rate of %s\n",
+			dev_name(dev));
+		ret = -EINVAL;
+		goto put_clk;
+	}
+
+	if (of_machine_is_compatible("nvidia,tegra20"))
+		hw_version = BIT(tegra_sku_info.soc_process_id);
+	else
+		hw_version = BIT(tegra_sku_info.soc_speedo_id);
+
+	opp_table = dev_pm_opp_set_supported_hw(dev, &hw_version, 1);
+	if (IS_ERR(opp_table)) {
+		dev_err(&genpd->dev, "failed to set OPP supported HW for %s: %d\n",
+			dev_name(dev), ret);
+		ret = PTR_ERR(opp_table);
+		goto put_clk;
+	}
+
+	ret = dev_pm_opp_of_add_table(dev);
+	if (ret) {
+		/* older DTBs that don't have OPPs will get -ENODEV here */
+		if (ret != -ENODEV)
+			dev_err(&genpd->dev, "failed to get OPP table of %s: %d\n",
+				dev_name(dev), ret);
+		else
+			ret = 0;
+
+		goto put_supported_hw;
+	}
+
+	/* find suitable OPP for the rate */
+	opp = dev_pm_opp_find_freq_ceil(dev, &rate);
+
+	if (opp == ERR_PTR(-ERANGE))
+		opp = dev_pm_opp_find_freq_floor(dev, &rate);
+
+	if (IS_ERR(opp)) {
+		dev_err(&genpd->dev, "failed to find OPP for %luHz of %s: %pe\n",
+			rate, dev_name(dev), opp);
+		ret = PTR_ERR(opp);
+		goto remove_dev_table;
+	}
+
+	if (!list_empty(&genpd->child_links)) {
+		link = list_first_entry(&genpd->child_links, struct gpd_link,
+					child_node);
+		core_genpd = link->parent;
+	} else {
+		core_genpd = genpd;
+	}
+
+	pd_opp_table = dev_pm_opp_get_opp_table(&core_genpd->dev);
+	if (IS_ERR(pd_opp_table)) {
+		dev_err(&genpd->dev, "failed to get OPP table of %s: %pe\n",
+			dev_name(&core_genpd->dev), pd_opp_table);
+		ret = PTR_ERR(pd_opp_table);
+		goto put_dev_opp;
+	}
+
+	pd_opp = dev_pm_opp_xlate_required_opp(opp_table, pd_opp_table, opp);
+	if (IS_ERR(pd_opp)) {
+		dev_err(&genpd->dev,
+			"failed to xlate required OPP for %luHz of %s: %pe\n",
+			rate, dev_name(dev), pd_opp);
+		ret = PTR_ERR(pd_opp);
+		goto put_pd_opp_table;
+	}
+
+	/*
+	 * The initialized state will be applied by GENPD core on the first
+	 * RPM-resume of the device.  This means that drivers don't need to
+	 * explicitly initialize performance state.
+	 */
+	state = pm_genpd_opp_to_performance_state(&core_genpd->dev, pd_opp);
+	gpd_data->rpm_pstate = state;
+	dev_pm_opp_put(pd_opp);
+
+put_pd_opp_table:
+	dev_pm_opp_put_opp_table(pd_opp_table);
+put_dev_opp:
+	dev_pm_opp_put(opp);
+remove_dev_table:
+	dev_pm_opp_of_remove_table(dev);
+put_supported_hw:
+	dev_pm_opp_put_supported_hw(opp_table);
+put_clk:
+	clk_put(clk);
+
+	return ret;
+}
+
 /*
  * TODO Figure out a way to call this with the struct tegra_pmc * passed in.
  * This currently doesn't work because readx_poll_timeout() can only operate
@@ -1238,6 +1383,7 @@  static int tegra_powergate_add(struct tegra_pmc *pmc, struct device_node *np)
 
 	pg->id = id;
 	pg->genpd.name = np->name;
+	pg->genpd.attach_dev = tegra_pmc_pd_attach_dev;
 	pg->genpd.power_off = tegra_genpd_power_off;
 	pg->genpd.power_on = tegra_genpd_power_on;
 	pg->pmc = pmc;
@@ -1354,6 +1500,7 @@  static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)
 		return -ENOMEM;
 
 	genpd->name = np->name;
+	genpd->attach_dev = tegra_pmc_pd_attach_dev;
 	genpd->set_performance_state = tegra_pmc_core_pd_set_performance_state;
 	genpd->opp_to_performance_state = tegra_pmc_core_pd_opp_to_performance_state;