[v2,3/4] cpufreq: mediatek: add Mediatek cpufreq driver

Message ID 1425458956-20665-4-git-send-email-pi-cheng.chen@linaro.org
State New
Headers show

Commit Message

pi-cheng.chen March 4, 2015, 8:49 a.m.
In this patch, some SoC specific voltage scaling flow is implemented in the
cpufreq notifier of mtk-cpufreq driver.

Signed-off-by: pi-cheng.chen <pi-cheng.chen@linaro.org>
---
 drivers/cpufreq/Kconfig.arm   |   6 +
 drivers/cpufreq/Makefile      |   1 +
 drivers/cpufreq/mtk-cpufreq.c | 346 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 353 insertions(+)
 create mode 100644 drivers/cpufreq/mtk-cpufreq.c

Comments

Viresh Kumar March 4, 2015, 11:09 a.m. | #1
Haven't reviewed it completely yet, but this is all I have done.

On 4 March 2015 at 14:19, pi-cheng.chen <pi-cheng.chen@linaro.org> wrote:

> +static int mtk_cpufreq_notify(struct notifier_block *nb,
> +                             unsigned long action, void *data)
> +{
> +       struct cpufreq_freqs *freqs = data;
> +       struct cpu_opp_table *opp_tbl = dvfs_info->opp_tbl;

There is only one dvfs info ? but there are two clusters, sorry got confused
a bit..

> +       int old_vproc, new_vproc, old_index, new_index;
> +
> +       if (!cpumask_test_cpu(freqs->cpu, &dvfs_info->cpus))
> +               return NOTIFY_DONE;
> +
> +       old_vproc = regulator_get_voltage(dvfs_info->proc_reg);
> +       old_index = cpu_opp_table_get_volt_index(old_vproc);
> +       new_index = cpu_opp_table_get_freq_index(freqs->new * 1000);
> +       new_vproc = opp_tbl[new_index].vproc;
> +
> +       if (old_vproc == new_vproc)
> +               return 0;
> +
> +       if ((action == CPUFREQ_PRECHANGE && old_vproc < new_vproc) ||
> +           (action == CPUFREQ_POSTCHANGE && old_vproc > new_vproc))
> +               mtk_cpufreq_voltage_trace(old_index, new_index);
> +
> +       return NOTIFY_OK;
> +}
> +
> +static struct notifier_block mtk_cpufreq_nb = {
> +       .notifier_call = mtk_cpufreq_notify,
> +};
> +
> +static int cpu_opp_table_init(struct device *dev)
> +{
> +       struct device *cpu_dev = dvfs_info->cpu_dev;
> +       struct cpu_opp_table *opp_tbl;
> +       struct dev_pm_opp *opp;
> +       int ret, cnt, i;
> +       unsigned long rate, vproc, vsram;
> +
> +       ret = of_init_opp_table(cpu_dev);
> +       if (ret) {
> +               dev_err(dev, "Failed to init mtk_opp_table: %d\n", ret);
> +               return ret;
> +       }
> +
> +       rcu_read_lock();
> +
> +       cnt = dev_pm_opp_get_opp_count(cpu_dev);
> +       if (cnt < 0) {
> +               dev_err(cpu_dev, "No OPP table is found: %d", cnt);
> +               ret = cnt;
> +               goto out_free_opp_tbl;
> +       }
> +
> +       opp_tbl = devm_kcalloc(dev, (cnt + 1), sizeof(struct cpu_opp_table),
> +                              GFP_ATOMIC);
> +       if (!opp_tbl) {
> +               ret = -ENOMEM;
> +               goto out_free_opp_tbl;
> +       }
> +
> +       for (i = 0, rate = 0; i < cnt; i++, rate++) {
> +               opp = dev_pm_opp_find_freq_ceil(cpu_dev, &rate);
> +               if (IS_ERR(opp)) {
> +                       ret = PTR_ERR(opp);
> +                       goto out_free_opp_tbl;
> +               }
> +
> +               vproc = dev_pm_opp_get_voltage(opp);
> +               vproc = get_regulator_voltage_ceil(dvfs_info->proc_reg, vproc);
> +               vsram = vproc + VOLT_SHIFT_LOWER_LIMIT;
> +               vsram = get_regulator_voltage_ceil(dvfs_info->sram_reg, vsram);
> +
> +               if (vproc < 0 || vsram < 0) {
> +                       ret = -EINVAL;
> +                       goto out_free_opp_tbl;
> +               }
> +
> +               opp_tbl[i].freq = rate;
> +               opp_tbl[i].vproc = vproc;
> +               opp_tbl[i].vsram = vsram;
> +       }
> +
> +       opp_tbl[i].freq = 0;
> +       opp_tbl[i].vproc = -1;
> +       opp_tbl[i].vsram = -1;
> +       dvfs_info->opp_tbl = opp_tbl;
> +
> +out_free_opp_tbl:
> +       rcu_read_unlock();
> +       of_free_opp_table(cpu_dev);
> +
> +       return ret;
> +}
> +
> +static struct cpufreq_cpu_domain *get_cpu_domain(struct list_head *domain_list,
> +                                                int cpu)
> +{
> +       struct list_head *node;
> +
> +       list_for_each(node, domain_list) {
> +               struct cpufreq_cpu_domain *domain;
> +
> +               domain = container_of(node, struct cpufreq_cpu_domain, node);
> +               if (cpumask_test_cpu(cpu, &domain->cpus))
> +                       return domain;
> +       }
> +
> +       return NULL;
> +}
> +
> +static int mtk_cpufreq_probe(struct platform_device *pdev)

On a dual cluster big LITTLE (your system), how many times is probe
getting called ? Once or twice, i.e. for each cluster ??

> +{
> +       struct clk *inter_clk;
> +       struct cpufreq_dt_platform_data *pd;
> +       struct platform_device *dev;
> +       unsigned long inter_freq;
> +       int cpu, ret;
> +
> +       inter_clk = clk_get(&pdev->dev, NULL);

How is this supposed to work ? How will pdev->dev give intermediate
clock ?

> +       if (IS_ERR(inter_clk)) {
> +               if (PTR_ERR(inter_clk) == -EPROBE_DEFER) {
> +                       dev_warn(&pdev->dev, "clock not ready. defer probeing.\n");
> +                       return -EPROBE_DEFER;
> +               }
> +
> +               dev_err(&pdev->dev, "Failed to get intermediate clock\n");
> +               return -ENODEV;
> +       }
> +       inter_freq = clk_get_rate(inter_clk);
> +
> +       pd = devm_kzalloc(&pdev->dev, sizeof(*pd), GFP_KERNEL);
> +       if (!pd)
> +               return -ENOMEM;
> +
> +       dvfs_info = devm_kzalloc(&pdev->dev, sizeof(*dvfs_info), GFP_KERNEL);
> +       if (!dvfs_info)
> +               return -ENOMEM;

Instead of two allocations, you could have made pd part of dvfs_info
and allocated only
once.

> +
> +       pd->independent_clocks = 1,

s/,/; ??

> +       INIT_LIST_HEAD(&pd->domain_list);
> +
> +       for_each_possible_cpu(cpu) {
> +               struct device *cpu_dev;
> +               struct cpufreq_cpu_domain *new_domain;
> +               struct regulator *proc_reg, *sram_reg;
> +
> +               cpu_dev = get_cpu_device(cpu);

This should be done in the below if block only.

> +               if (!dvfs_info->cpu_dev) {
> +                       proc_reg = regulator_get_exclusive(cpu_dev, "proc");
> +                       sram_reg = regulator_get_exclusive(cpu_dev, "sram");
> +
> +                       if (PTR_ERR(proc_reg) == -EPROBE_DEFER ||
> +                           PTR_ERR(sram_reg) == -EPROBE_DEFER)
> +                               return -EPROBE_DEFER;
> +
> +                       if (!IS_ERR_OR_NULL(proc_reg) &&
> +                           !IS_ERR_OR_NULL(sram_reg)) {
> +                               dvfs_info->cpu_dev = cpu_dev;
> +                               dvfs_info->proc_reg = proc_reg;
> +                               dvfs_info->sram_reg = sram_reg;
> +                               cpumask_copy(&dvfs_info->cpus,
> +                                            &cpu_topology[cpu].core_sibling);
> +                       }
> +               }
> +
> +               if (get_cpu_domain(&pd->domain_list, cpu))
> +                       continue;

This isn't required if you do below..

> +
> +               new_domain = devm_kzalloc(&pdev->dev, sizeof(*new_domain),
> +                                         GFP_KERNEL);
> +               if (!new_domain)
> +                       return -ENOMEM;
> +
> +               cpumask_copy(&new_domain->cpus,
> +                            &cpu_topology[cpu].core_sibling);
> +               new_domain->intermediate_freq = inter_freq;
> +               list_add(&new_domain->node, &pd->domain_list);

Just issue a 'break' from here as you don't want to let this loop run again.

> +       }
> +
> +       if (IS_ERR_OR_NULL(dvfs_info->proc_reg) ||
> +           IS_ERR_OR_NULL(dvfs_info->sram_reg)) {
> +               dev_err(&pdev->dev, "Failed to get regulators\n");
> +               return -ENODEV;
> +       }

If you really need these, then don't allocate new_domain unless you find a CPU
with these regulators..

> +       ret = cpu_opp_table_init(&pdev->dev);
> +       if (ret) {
> +               dev_err(&pdev->dev, "Failed to setup cpu_opp_table: %d\n",
> +                       ret);
> +               return ret;
> +       }
> +
> +       ret = cpufreq_register_notifier(&mtk_cpufreq_nb,
> +                                       CPUFREQ_TRANSITION_NOTIFIER);
> +       if (ret) {
> +               dev_err(&pdev->dev, "Failed to register cpufreq notifier\n");
> +               return ret;
> +       }

Don't want to free OPP table here on error ?

> +       dev = platform_device_register_data(NULL, "cpufreq-dt", -1, pd,
> +                                           sizeof(*pd));

So this routine is going to be called only once. Then how are you
initializing stuff
for both the clusters in the upper for loop ? It looked very very confusing.

> +       if (IS_ERR(dev)) {
> +               dev_err(&pdev->dev,
> +                       "Failed to register cpufreq-dt platform device\n");
> +               return PTR_ERR(dev);
> +       }
> +
> +       return 0;
> +}
> +
> +static const struct of_device_id mtk_cpufreq_match[] = {
> +       {
> +               .compatible = "mediatek,mtk-cpufreq",

Can't you use "mediatek,mt8173" here ?

> +       },
> +       {}
> +};
> +MODULE_DEVICE_TABLE(of, mtk_cpufreq_match);
> +
> +static struct platform_driver mtk_cpufreq_platdrv = {
> +       .driver = {
> +               .name   = "mtk-cpufreq",
> +               .of_match_table = mtk_cpufreq_match,
> +       },
> +       .probe  = mtk_cpufreq_probe,
> +};
> +module_platform_driver(mtk_cpufreq_platdrv);
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
pi-cheng.chen March 5, 2015, 7:27 a.m. | #2
Hi Viresh,

Thanks for reviewing.
Please see my reply below:

On 4 March 2015 at 19:09, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> Haven't reviewed it completely yet, but this is all I have done.
>
> On 4 March 2015 at 14:19, pi-cheng.chen <pi-cheng.chen@linaro.org> wrote:
>
>> +static int mtk_cpufreq_notify(struct notifier_block *nb,
>> +                             unsigned long action, void *data)
>> +{
>> +       struct cpufreq_freqs *freqs = data;
>> +       struct cpu_opp_table *opp_tbl = dvfs_info->opp_tbl;
>
> There is only one dvfs info ? but there are two clusters, sorry got confused
> a bit..

There are 2 clusters, but only the big cluster need to do voltage scaling in the
notifier, since the voltage controlling is done by cpufreq-dt driver
in this version.
Therefore only one dvfs_info struct here.

>
>> +       int old_vproc, new_vproc, old_index, new_index;
>> +
>> +       if (!cpumask_test_cpu(freqs->cpu, &dvfs_info->cpus))
>> +               return NOTIFY_DONE;
>> +
>> +       old_vproc = regulator_get_voltage(dvfs_info->proc_reg);
>> +       old_index = cpu_opp_table_get_volt_index(old_vproc);
>> +       new_index = cpu_opp_table_get_freq_index(freqs->new * 1000);
>> +       new_vproc = opp_tbl[new_index].vproc;
>> +
>> +       if (old_vproc == new_vproc)
>> +               return 0;
>> +
>> +       if ((action == CPUFREQ_PRECHANGE && old_vproc < new_vproc) ||
>> +           (action == CPUFREQ_POSTCHANGE && old_vproc > new_vproc))
>> +               mtk_cpufreq_voltage_trace(old_index, new_index);
>> +
>> +       return NOTIFY_OK;
>> +}
>> +
>> +static struct notifier_block mtk_cpufreq_nb = {
>> +       .notifier_call = mtk_cpufreq_notify,
>> +};
>> +
>> +static int cpu_opp_table_init(struct device *dev)
>> +{
>> +       struct device *cpu_dev = dvfs_info->cpu_dev;
>> +       struct cpu_opp_table *opp_tbl;
>> +       struct dev_pm_opp *opp;
>> +       int ret, cnt, i;
>> +       unsigned long rate, vproc, vsram;
>> +
>> +       ret = of_init_opp_table(cpu_dev);
>> +       if (ret) {
>> +               dev_err(dev, "Failed to init mtk_opp_table: %d\n", ret);
>> +               return ret;
>> +       }
>> +
>> +       rcu_read_lock();
>> +
>> +       cnt = dev_pm_opp_get_opp_count(cpu_dev);
>> +       if (cnt < 0) {
>> +               dev_err(cpu_dev, "No OPP table is found: %d", cnt);
>> +               ret = cnt;
>> +               goto out_free_opp_tbl;
>> +       }
>> +
>> +       opp_tbl = devm_kcalloc(dev, (cnt + 1), sizeof(struct cpu_opp_table),
>> +                              GFP_ATOMIC);
>> +       if (!opp_tbl) {
>> +               ret = -ENOMEM;
>> +               goto out_free_opp_tbl;
>> +       }
>> +
>> +       for (i = 0, rate = 0; i < cnt; i++, rate++) {
>> +               opp = dev_pm_opp_find_freq_ceil(cpu_dev, &rate);
>> +               if (IS_ERR(opp)) {
>> +                       ret = PTR_ERR(opp);
>> +                       goto out_free_opp_tbl;
>> +               }
>> +
>> +               vproc = dev_pm_opp_get_voltage(opp);
>> +               vproc = get_regulator_voltage_ceil(dvfs_info->proc_reg, vproc);
>> +               vsram = vproc + VOLT_SHIFT_LOWER_LIMIT;
>> +               vsram = get_regulator_voltage_ceil(dvfs_info->sram_reg, vsram);
>> +
>> +               if (vproc < 0 || vsram < 0) {
>> +                       ret = -EINVAL;
>> +                       goto out_free_opp_tbl;
>> +               }
>> +
>> +               opp_tbl[i].freq = rate;
>> +               opp_tbl[i].vproc = vproc;
>> +               opp_tbl[i].vsram = vsram;
>> +       }
>> +
>> +       opp_tbl[i].freq = 0;
>> +       opp_tbl[i].vproc = -1;
>> +       opp_tbl[i].vsram = -1;
>> +       dvfs_info->opp_tbl = opp_tbl;
>> +
>> +out_free_opp_tbl:
>> +       rcu_read_unlock();
>> +       of_free_opp_table(cpu_dev);
>> +
>> +       return ret;
>> +}
>> +
>> +static struct cpufreq_cpu_domain *get_cpu_domain(struct list_head *domain_list,
>> +                                                int cpu)
>> +{
>> +       struct list_head *node;
>> +
>> +       list_for_each(node, domain_list) {
>> +               struct cpufreq_cpu_domain *domain;
>> +
>> +               domain = container_of(node, struct cpufreq_cpu_domain, node);
>> +               if (cpumask_test_cpu(cpu, &domain->cpus))
>> +                       return domain;
>> +       }
>> +
>> +       return NULL;
>> +}
>> +
>> +static int mtk_cpufreq_probe(struct platform_device *pdev)
>
> On a dual cluster big LITTLE (your system), how many times is probe
> getting called ? Once or twice, i.e. for each cluster ??

The probe function will be called only once since it's triggered by the device
tree node in the 2nd patch of this series. Though it's not acceptable obviously.

>
>> +{
>> +       struct clk *inter_clk;
>> +       struct cpufreq_dt_platform_data *pd;
>> +       struct platform_device *dev;
>> +       unsigned long inter_freq;
>> +       int cpu, ret;
>> +
>> +       inter_clk = clk_get(&pdev->dev, NULL);
>
> How is this supposed to work ? How will pdev->dev give intermediate
> clock ?

It works with the device tree binding in the 2nd patch of this series, too.
Since the cpufreq node is not allowed, would you have some suggestions on
how to get the intermediate clock source in this case?

>
>> +       if (IS_ERR(inter_clk)) {
>> +               if (PTR_ERR(inter_clk) == -EPROBE_DEFER) {
>> +                       dev_warn(&pdev->dev, "clock not ready. defer probeing.\n");
>> +                       return -EPROBE_DEFER;
>> +               }
>> +
>> +               dev_err(&pdev->dev, "Failed to get intermediate clock\n");
>> +               return -ENODEV;
>> +       }
>> +       inter_freq = clk_get_rate(inter_clk);
>> +
>> +       pd = devm_kzalloc(&pdev->dev, sizeof(*pd), GFP_KERNEL);
>> +       if (!pd)
>> +               return -ENOMEM;
>> +
>> +       dvfs_info = devm_kzalloc(&pdev->dev, sizeof(*dvfs_info), GFP_KERNEL);
>> +       if (!dvfs_info)
>> +               return -ENOMEM;
>
> Instead of two allocations, you could have made pd part of dvfs_info
> and allocated only
> once.

Will do it.

>
>> +
>> +       pd->independent_clocks = 1,
>
> s/,/; ??

It's strange that I didn't get a compiling error here.
Will fix it.

>
>> +       INIT_LIST_HEAD(&pd->domain_list);
>> +
>> +       for_each_possible_cpu(cpu) {
>> +               struct device *cpu_dev;
>> +               struct cpufreq_cpu_domain *new_domain;
>> +               struct regulator *proc_reg, *sram_reg;
>> +
>> +               cpu_dev = get_cpu_device(cpu);
>
> This should be done in the below if block only.

Will do it.

>
>> +               if (!dvfs_info->cpu_dev) {
>> +                       proc_reg = regulator_get_exclusive(cpu_dev, "proc");
>> +                       sram_reg = regulator_get_exclusive(cpu_dev, "sram");
>> +
>> +                       if (PTR_ERR(proc_reg) == -EPROBE_DEFER ||
>> +                           PTR_ERR(sram_reg) == -EPROBE_DEFER)
>> +                               return -EPROBE_DEFER;
>> +
>> +                       if (!IS_ERR_OR_NULL(proc_reg) &&
>> +                           !IS_ERR_OR_NULL(sram_reg)) {
>> +                               dvfs_info->cpu_dev = cpu_dev;
>> +                               dvfs_info->proc_reg = proc_reg;
>> +                               dvfs_info->sram_reg = sram_reg;
>> +                               cpumask_copy(&dvfs_info->cpus,
>> +                                            &cpu_topology[cpu].core_sibling);
>> +                       }
>> +               }
>> +
>> +               if (get_cpu_domain(&pd->domain_list, cpu))
>> +                       continue;
>
> This isn't required if you do below..

Please see below.

>
>> +
>> +               new_domain = devm_kzalloc(&pdev->dev, sizeof(*new_domain),
>> +                                         GFP_KERNEL);
>> +               if (!new_domain)
>> +                       return -ENOMEM;
>> +
>> +               cpumask_copy(&new_domain->cpus,
>> +                            &cpu_topology[cpu].core_sibling);
>> +               new_domain->intermediate_freq = inter_freq;
>> +               list_add(&new_domain->node, &pd->domain_list);
>
> Just issue a 'break' from here as you don't want to let this loop run again.

Please see below.

>
>> +       }
>> +
>> +       if (IS_ERR_OR_NULL(dvfs_info->proc_reg) ||
>> +           IS_ERR_OR_NULL(dvfs_info->sram_reg)) {
>> +               dev_err(&pdev->dev, "Failed to get regulators\n");
>> +               return -ENODEV;
>> +       }
>
> If you really need these, then don't allocate new_domain unless you find a CPU
> with these regulators..

Please see below.

>
>> +       ret = cpu_opp_table_init(&pdev->dev);
>> +       if (ret) {
>> +               dev_err(&pdev->dev, "Failed to setup cpu_opp_table: %d\n",
>> +                       ret);
>> +               return ret;
>> +       }
>> +
>> +       ret = cpufreq_register_notifier(&mtk_cpufreq_nb,
>> +                                       CPUFREQ_TRANSITION_NOTIFIER);
>> +       if (ret) {
>> +               dev_err(&pdev->dev, "Failed to register cpufreq notifier\n");
>> +               return ret;
>> +       }
>
> Don't want to free OPP table here on error ?

Please correct me if I was wrong. Since the OPP table in the dvfs_info is
allocated by devm_kzalloc(), it is supposed to be freed if the probe function
failed, isn't it?

And the OPP table initialized by of_init_opp_table() in cpu_opp_table_init()
was freed right before the function return since it will be initialized again in
the cpufreq-dt driver.

>
>> +       dev = platform_device_register_data(NULL, "cpufreq-dt", -1, pd,
>> +                                           sizeof(*pd));
>
> So this routine is going to be called only once. Then how are you
> initializing stuff
> for both the clusters in the upper for loop ? It looked very very confusing.

Please let me clarify this here.
We have two clusters, one for big and another for little cores. For
the little cores'
cluster, only one voltage source needs to be controlled when doing CPU DVFS.
Therefore the voltage scaling of little cores' cluster is done in the
cpufreq-dt.
But for the big cores' cluster, there are two voltage sources here to
be controlled
and these two voltage source need to be scaled up and down in a SoC specific
manner which is implemented in the mtk_cpufreq_voltage_trace() function.
Hence, we put the voltage scaling of big cores' cluster in the cpufreq
notifier and
that's also why we need a mtk-cpufreq driver in addition to cpufreq-dt.

In the confusing loop above, I am trying to solve two problems:
1. to find out which CPUs shares the same clock / power domains among all CPUs
2. to initialize the dvfs_info which is only needed by big cores' cluster

I think that's why the loop looks so confusing. Maybe doing it in two
separate loops
will make the code more readable? I'll try it in next version.

>
>> +       if (IS_ERR(dev)) {
>> +               dev_err(&pdev->dev,
>> +                       "Failed to register cpufreq-dt platform device\n");
>> +               return PTR_ERR(dev);
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static const struct of_device_id mtk_cpufreq_match[] = {
>> +       {
>> +               .compatible = "mediatek,mtk-cpufreq",
>
> Can't you use "mediatek,mt8173" here ?

Again, the device tree binding in the 2nd patch of this series. I am
trying to get
the intermediate clock source from the device tree. The reason why I am doing
this is the intermediate clock source might be different among
different Mediatek
SoCs. Either different clock ID or different intermediate frequency. I
want to keep
the flexibility of the driver so I am trying to specify the
intermediate clock source
in the device tree. I think I need to find out some other way to do it
since it's not
allowed to do it by creating a "cpufreq node" in device tree.

Thanks again for reviewing.

Best Regards,
Pi-Cheng

>
>> +       },
>> +       {}
>> +};
>> +MODULE_DEVICE_TABLE(of, mtk_cpufreq_match);
>> +
>> +static struct platform_driver mtk_cpufreq_platdrv = {
>> +       .driver = {
>> +               .name   = "mtk-cpufreq",
>> +               .of_match_table = mtk_cpufreq_match,
>> +       },
>> +       .probe  = mtk_cpufreq_probe,
>> +};
>> +module_platform_driver(mtk_cpufreq_platdrv);
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar March 5, 2015, 9:55 a.m. | #3
On 5 March 2015 at 12:57, Pi-Cheng Chen <pi-cheng.chen@linaro.org> wrote:

> On 4 March 2015 at 19:09, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> There are 2 clusters, but only the big cluster need to do voltage scaling in the
> notifier, since the voltage controlling is done by cpufreq-dt driver
> in this version.
> Therefore only one dvfs_info struct here.

Do you really think it's readable enough that way? You should have added some
comments on how this is working. Also, what about putting this stuff in your
regulator driver, so that you don't really have to do this in PRE/POST
notifiers.

>>> +       inter_clk = clk_get(&pdev->dev, NULL);
>>
>> How is this supposed to work ? How will pdev->dev give intermediate
>> clock ?
>
> It works with the the device tree binding in the 2nd patch of this series, too.
> Since the cpufreq node is not allowed, would you have some suggestions on
> how to get the intermediate clock source in this case?

How exactly? I am not doubting your work, just that I don't know how that DT
binding will reflect here with clock_get for pdev->dev..

>>> +       pd->independent_clocks = 1,
>>
>> s/,/; ??
>
> It's strange that I didn't get a compiling error here.
> Will fix it.

It's a perfectly valid statement :) and so no errors. Both will execute as they
will in case of ';', just that the output of the latter one will be
returned. But there
is no variable on LHS (left-hand-side) and so the value doesn't matter.

>> Don't want to free OPP table here on error ?
>
> Please correct me if I was wrong. Since the OPP table in the dvfs_info is
> allocated by devm_kzalloc(), it is supposed to be freed if the probe function
> failed, isn't it?
>
> And the OPP table initialized by of_init_opp_table() in cpu_opp_table_init()
> was freed right before the function return since it will be initialized again in
> the cpufreq-dt driver.

Okay, I was talking about this only and I missed it. We probably need to fix
this in OPP library so that multiple callers are allowed.

>>> +       dev = platform_device_register_data(NULL, "cpufreq-dt", -1, pd,
>>> +                                           sizeof(*pd));
>>
>> So this routine is going to be called only once. Then how are you
>> initializing stuff
>> for both the clusters in the upper for loop ? It looked very very confusing.
>
> Please let me clarify this here.
> We have two clusters, one for big and another for little cores. For
> the little cores'
> cluster, only one voltage source needs to be controlled when doing CPU DVFS.
> Therefore the voltage scaling of little cores' cluster is done in the
> cpufreq-dt.
> But for the big cores' cluster, there are two voltage sources here to
> be controlled
> and these two voltage source need to be scaled up and down in a SoC specific
> manner which is implemented in the mtk_cpufreq_voltage_trace() function.
> Hence, we put the voltage scaling of big cores' cluster in the cpufreq
> notifier and
> that's also why we need a mtk-cpufreq driver in addition to cpufreq-dt.
>
> In the confusing loop above, I am trying to solve two problems:
> 1. to find out which CPUs shares the same clock / power domains among all CPUs
> 2. to initialize the dvfs_info which is only needed by big cores' cluster
>
> I think that's why the loop looks so confusing. Maybe doing it in two
> separate loops
> will make the code more readable? I'll try it in next version.

Yes.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
pi-cheng.chen March 6, 2015, 5:49 a.m. | #4
+cc Sascha

On 5 March 2015 at 17:55, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> On 5 March 2015 at 12:57, Pi-Cheng Chen <pi-cheng.chen@linaro.org> wrote:
>
>> On 4 March 2015 at 19:09, Viresh Kumar <viresh.kumar@linaro.org> wrote:
>> There are 2 clusters, but only the big cluster need to do voltage scaling in the
>> notifier, since the voltage controlling is done by cpufreq-dt driver
>> in this version.
>> Therefore only one dvfs_info struct here.
>
> Do you really think its readable enough that way? You must have added some
> comments on how this is working. Also, what about putting this stuff in your
> regulator driver, so that you don't really have to do this in PRE/POST
> notifiers.

Okay. I will add comments to describe some details about this. About putting
those stuff into regulator driver, I think you mean creating a
"virtual regulator
device" and put all the voltage controlling complex into the driver, right?
Maybe it's a good idea in this case, but I am not sure if this kind of
virtual regulator
is acceptable. And the flexibility might be an issue, since we might
use different
PMIC for same SoC on different board.

>
>>>> +       inter_clk = clk_get(&pdev->dev, NULL);
>>>
>>> How is this supposed to work ? How will pdev->dev give intermediate
>>> clock ?
>>
>> It works with the the device tree binding in the 2nd patch of this series, too.
>> Since the cpufreq node is not allowed, would you have some suggestions on
>> how to get the intermediate clock source in this case?
>
> How exactly? I am not doubting your work, just that I don't know how that DT
> binding will reflect here with clock_get for pdev->dev..

Please correct me if I was wrong. IIUC, It does:
clk_get() -> __of_clk_get_by_name() -> __of_clk_get()
The "mtk-cpufreq" device tree node specified the intermediate clock source in
"clocks" property. And the pdev here came from the "mtk-cpufreq" device tree
node, so we can get the "clock specifier" by calling
of_parse_phandle_with_args()
to find "clocks" property in __of_clk_get().

>
>>>> +       pd->independent_clocks = 1,
>>>
>>> s/,/; ??
>>
>> It's strange that I didn't get a compiling error here.
>> Will fix it.
>
> Its a perfectly valid statement :) and so no errors. Both will execute as they
> will in case of ';', just that output of the later one will be
> returned. But there
> in no variable on LHS (left-hand-side) and so the value doesn't matter.

Thanks for your explanation. :)

>
>>> Don't want to free OPP table here on error ?
>>
>> Please correct me if I was wrong. Since the OPP table in the dvfs_info is
>> allocated by devm_kzalloc(), it is supposed to be freed if the probe function
>> failed, isn't it?
>>
>> And the OPP table initialized by of_init_opp_table() in cpu_opp_table_init()
>> was freed right before the function return since it will be initialized again in
>> the cpufreq-dt driver.
>
> Okay, I was talking about this only and I missed it. We probably need to fix
> this in OPP library so that multiple callers are allowed.
>
>>>> +       dev = platform_device_register_data(NULL, "cpufreq-dt", -1, pd,
>>>> +                                           sizeof(*pd));
>>>
>>> So this routine is going to be called only once. Then how are you
>>> initializing stuff
>>> for both the clusters in the upper for loop ? It looked very very confusing.
>>
>> Please let me clarify this here.
>> We have two clusters, one for big and another for little cores. For
>> the little cores'
>> cluster, only one voltage source needs to be controlled when doing CPU DVFS.
>> Therefore the voltage scaling of little cores' cluster is done in the
>> cpufreq-dt.
>> But for the big cores' cluster, there are two voltage sources here to
>> be controlled
>> and these two voltage source need to be scaled up and down in a SoC specific
>> manner which is implemented in the mtk_cpufreq_voltage_trace() function.
>> Hence, we put the voltage scaling of big cores' cluster in the cpufreq
>> notifier and
>> that's also why we need a mtk-cpufreq driver in addition to cpufreq-dt.
>>
>> In the confusing loop above, I am trying to solve two problems:
>> 1. to find out which CPUs shares the same clock / power domains among all CPUs
>> 2. to initialize the dvfs_info which is only needed by big cores' cluster
>>
>> I think that's why the loop looks so confusing. Maybe doing it in two
>> separate loops
>> will make the code more readable? I'll try it in next version.
>
> Yes.

Combining comments and suggestions from you and Sascha[1], I conclude some
architectural changes are going to be made in the next version:

1. Use set_rate hook instead of determine_rate in clk driver, and
switch to intermediate
    PLL parent and back to original CPU PLL parent explicitly in set_rate
2. Therefore we don't need intermediate frequency support in
cpufreq-dt to implement
    cpufreq support for Mediatek SoC
3. Use clk notifier to handle voltage controlling corresponding to
intermediate clock rate
4. Due to 3. we need to move all voltage controlling part back into
the notifier in
    mtk-cpufreq (Voltage controlling for little cores' cluster is
handled in cpufreq-dt in this
    version.)

And I have some other questions:
1. According to the discussion[1], should we keep on working on the intermediate
    frequency support in cpufreq-dt?
2. Will the code be simpler to have a Mediatek cpufreq driver to
handle all CPU DVFS
    complexity instead of cpufreq-dt in the situation that all voltage
scaling things need
    to be done in the clk / cpufreq notifier of mtk-cpufreq driver?

[1] http://marc.info/?l=linux-kernel&m=142546618015551&w=2

Best Regards,
Pi-Cheng
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
pi-cheng.chen March 10, 2015, 1:57 a.m. | #5
On 10 March 2015 at 00:28, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> On Wed, Mar 04, 2015 at 04:49:15PM +0800, pi-cheng.chen wrote:
>> +static int cpu_opp_table_get_freq_index(unsigned int freq)
>> +{
>> +     struct cpu_opp_table *opp_tbl = dvfs_info->opp_tbl;
>> +     int i;
>> +
>> +     for (i = 0; opp_tbl[i].freq != 0; i++) {
>> +             if (opp_tbl[i].freq >= freq)
>> +                     return i;
>> +     }
>> +
>> +     return -1;
>
> My "return -1" detector fired on this...
>
>> +static int cpu_opp_table_get_volt_index(unsigned int volt)
>> +{
>> +     struct cpu_opp_table *opp_tbl = dvfs_info->opp_tbl;
>> +     int i;
>> +
>> +     for (i = 0; opp_tbl[i].vproc != -1; i++)
>> +             if (opp_tbl[i].vproc >= volt)
>> +                     return i;
>> +
>> +     return -1;
>
> And this.
>
>> +static int mtk_cpufreq_notify(struct notifier_block *nb,
>> +                           unsigned long action, void *data)
>> +{
>> +     struct cpufreq_freqs *freqs = data;
>> +     struct cpu_opp_table *opp_tbl = dvfs_info->opp_tbl;
>> +     int old_vproc, new_vproc, old_index, new_index;
>> +
>> +     if (!cpumask_test_cpu(freqs->cpu, &dvfs_info->cpus))
>> +             return NOTIFY_DONE;
>> +
>> +     old_vproc = regulator_get_voltage(dvfs_info->proc_reg);
>> +     old_index = cpu_opp_table_get_volt_index(old_vproc);
>> +     new_index = cpu_opp_table_get_freq_index(freqs->new * 1000);
>> +     new_vproc = opp_tbl[new_index].vproc;
>
> Let's say that cpu_opp_table_get_freq_index() returns -1.  We then
> do no error checking on this, and access the memory immediately
> preceding opp_tbl[0].
>
> Since we'll be loading garbage from opp_tbl[-1], this probably means
> that mtk_cpufreq_voltage_trace() will go wrong.  Your method of using
> the vproc values to work out which direction we should walk between
> old_index...new_index means that we could end up walking through
> almost the whole UINT_MAX range to wrap around to the new index.
>
> Yet again, "return -1" proves to be a sure sign of a bug.

Hi Russell,

Thanks for your reviewing. I'll fix it in next version.

Best Regards,
Pi-Cheng

>
> --
> FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
> according to speedtest.net.
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar March 10, 2015, 2:50 a.m. | #6
On 6 March 2015 at 11:19, Pi-Cheng Chen <pi-cheng.chen@linaro.org> wrote:
> On 5 March 2015 at 17:55, Viresh Kumar <viresh.kumar@linaro.org> wrote:

> About putting
> those stuff into regulator driver, I think you mean creating a
> "virtual regulator
> device" and put all the voltage controlling complex into the driver, right?
> Maybe it's a good idea in this case, but I am sure if this kind of
> virtual regulator is acceptable.

@Mark: Is it allowed to create a virtual regulator for a CPU?

> And the flexibility might be an issue, since we might
> use different
> PMIC for same SoC on different board.

We can talk about that separately once Mark replies to my query.


> Combining comments and suggestions from you and Sascha[1], I conclude some
> architectural changes are going to be made in the next version:
>
> 1. Use set_rate hook instead of determine_rate in clk driver, and
> switch to intermediate
>     PLL parent and back to original CPU PLL parent explicitly in set_rate

Lets wait for Russell's answer to the query I posted before making any
progress here.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar March 11, 2015, 11:03 a.m. | #7
On 11 March 2015 at 16:23, Mark Brown <broonie@kernel.org> wrote:
> On Tue, Mar 10, 2015 at 08:20:43AM +0530, Viresh Kumar wrote:
>
> Please don't send upstream e-mail to my work account, I use this address
> pretty consistently for upstream.  Upstream mail to my work account
> frequently ends up unread.

Sorry about that, I did exactly opposite of this earlier :(

>> On 6 March 2015 at 11:19, Pi-Cheng Chen <pi-cheng.chen@linaro.org> wrote:
>> > On 5 March 2015 at 17:55, Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
>> > About putting
>> > those stuff into regulator driver, I think you mean creating a
>> > "virtual regulator
>> > device" and put all the voltage controlling complex into the driver, right?
>> > Maybe it's a good idea in this case, but I am not sure whether this kind of
>> > virtual regulator is acceptable.
>
>> @Mark: Is this allowed to create virtual regulator for a CPU ?
>
> I don't really know what the above means or what problem it's supposed
> to solve.

On mediatek platform, they need to configure two regulators in order to change
DVFS state of the big cluster. The generic cpufreq-dt driver and earlier OPP
bindings have support for a single regulator only and so what Pi-cheng tried
to do is,
- Configure one of the regulators using cpufreq-dt
- And other one using cpufreq frequency change notifiers

This looks awkward..

What I suggested was to create another virtual regulator for CPU which will
eventually configure both the regulators. Hence the question of whether such
virtual regulators are allowed or not.
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar March 11, 2015, 11:46 a.m. | #8
On 11 March 2015 at 17:12, Lucas Stach <l.stach@pengutronix.de> wrote:
> Instead of creating virtual regulators I would be strongly in favor of
> reviving the voltage-domain work. That would allow us to push all those
> voltage dependencies we have seen on various SoCs into the domain
> handling code and don't care about it in the drivers.
>
> In that case cpufreq-dt wouldn't control a regulator directly, but
> request a specific voltage from the domain the CPUs are located in and
> those in turn would control the regulators supplying them.

I agree that it would be the right approach but who is going to do that stuff ?

I think until the time we revive the voltage-domain stuff we need to support
mediatek's driver. And probably a virtual regulator is the best approach
unless someone else comes up with another idea.
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar March 12, 2015, 9:28 a.m. | #9
On 11 March 2015 at 18:15, Mark Brown <broonie@kernel.org> wrote:
> Ugh, no - that's a hideous bodge which is only going to create trouble
> later.  Remember, DT is an ABI and should describe the hardware so if
> we're doing bodges that are visible there to shoehorn things onto our
> implementation that's bad.  The concerns that Pi-Cheng had about what
> happens if the PMIC gets changed definitely seem relevant here too.

Hmm..

> Why not just write a custom cpufreq driver if it's too hard to abstract?

Hmm, probably all that can be solved with the new OPP bindings where
we can have support for multiple regulator or clock sources to the CPU.

@Pi-cheng: How are you going to pass voltages for both the regulators
as OPPs today only support a single regulator, in case you have to write
your own driver.
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
pi-cheng.chen March 12, 2015, 11:15 a.m. | #10
On Thu, Mar 12, 2015 at 5:28 PM, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> On 11 March 2015 at 18:15, Mark Brown <broonie@kernel.org> wrote:
>> Ugh, no - that's a hideous bodge which is only going to create trouble
>> later.  Remember, DT is an ABI and should describe the hardware so if
>> we're doing bodges that are visible there to shoehorn things onto our
>> implementation that's bad.  The concerns that Pi-Cheng had about what
>> happens if the PMIC gets changed definitely seem relevant here too.
>
> Hmm..
>
>> Why not just write a custom cpufreq driver if it's too hard to abstract?
>
> Hmm, probably all that can be solved with the new OPP bindings where
> we can have support for multiple regulator or clock sources to the CPU.
>
> @Pi-cheng: How are you going to pass voltages for both the regulators
> as OPPs today only support a single regulator, in case you have to write
> your own driver.

The voltages of the two regulators must always satisfy this constraint:
100mV < Vsram - Vproc < 200mV
For now, I just calculate the OPPs of Vsram from OPPs of Vproc.

Another thing I should mention, if the voltage difference of two adjacent
OPPs is greater than 100mV, we need to set the regulator to some
voltages which are not in OPP table considering the limitation above.
I think that will make it more difficult to model such flow in a generic
framework.

Best Regards,
Pi-Cheng

>
> _______________________________________________
> Linux-mediatek mailing list
> Linux-mediatek@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-mediatek
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar March 18, 2015, 6:59 a.m. | #11
On 12 March 2015 at 16:45, Pi-Cheng Chen <pi-cheng.chen@linaro.org> wrote:
> The voltages of the two regulators need to be always under a limitation:
> 100mV < Vsram - Vproc < 200mV
> For now, I just calculate the OPPs of Vsram from OPPs of Vproc.
>
> Another thing I should mention, if the voltage difference of two adjacent
> OPPs is greater than 100mV, we need to set the regulator to some
> voltages which are not in OPP table considering the limitation above.
> I think that will make it more difficult to model such flow in a generic
> framework.

Actually the problem is that OPP tables are insufficient for such cases,
and we are adding work-arounds to fix that. Though it will get fixed
with the new bindings we are adding.

I am confused on what we should be doing here. Even if you write
your own driver, you will be abusing DT with incorrect information.

Even if you go around adding a new driver, I would like you to fall back
to cpufreq-dt once the new OPP bindings are in place.
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 1b06fc4..f421653 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -263,3 +263,9 @@  config ARM_PXA2xx_CPUFREQ
 	  This add the CPUFreq driver support for Intel PXA2xx SOCs.
 
 	  If in doubt, say N.
+
+config ARM_MTK_CPUFREQ
+	bool "Mediatek CPUFreq support"
+	depends on ARCH_MEDIATEK && CPUFREQ_DT && REGULATOR
+	help
+	  This adds the CPUFreq driver support for Mediatek SoCs.
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 82a1821..05cb596 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -62,6 +62,7 @@  obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ)	+= highbank-cpufreq.o
 obj-$(CONFIG_ARM_IMX6Q_CPUFREQ)		+= imx6q-cpufreq.o
 obj-$(CONFIG_ARM_INTEGRATOR)		+= integrator-cpufreq.o
 obj-$(CONFIG_ARM_KIRKWOOD_CPUFREQ)	+= kirkwood-cpufreq.o
+obj-$(CONFIG_ARM_MTK_CPUFREQ)		+= mtk-cpufreq.o
 obj-$(CONFIG_ARM_OMAP2PLUS_CPUFREQ)	+= omap-cpufreq.o
 obj-$(CONFIG_ARM_PXA2xx_CPUFREQ)	+= pxa2xx-cpufreq.o
 obj-$(CONFIG_PXA3xx)			+= pxa3xx-cpufreq.o
diff --git a/drivers/cpufreq/mtk-cpufreq.c b/drivers/cpufreq/mtk-cpufreq.c
new file mode 100644
index 0000000..344d588
--- /dev/null
+++ b/drivers/cpufreq/mtk-cpufreq.c
@@ -0,0 +1,346 @@ 
+/*
+* Copyright (c) 2015 Linaro Ltd.
+* Author: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License version 2 as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*/
+
+#include <linux/clk.h>
+#include <linux/cpu.h>
+#include <linux/cpufreq.h>
+#include <linux/cpufreq-dt.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/pm_opp.h>
+#include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+
+#define VOLT_SHIFT_LOWER_LIMIT		100000
+#define VOLT_SHIFT_UPPER_LIMIT		200000
+
+struct cpu_opp_table {	/* one entry of the driver-private OPP table */
+	unsigned int freq;	/* OPP rate as returned by the OPP library; 0 terminates the table */
+	int vproc;		/* proc regulator voltage for this OPP; -1 in the terminator */
+	int vsram;		/* sram regulator voltage, derived from vproc + VOLT_SHIFT_LOWER_LIMIT; -1 in the terminator */
+};
+
+static struct dvfs_info {	/* single driver-wide DVFS state, allocated in probe */
+	struct cpumask cpus;		/* core siblings of the CPU whose regulators were found */
+	struct cpu_opp_table *opp_tbl;	/* table built by cpu_opp_table_init(), terminated by a freq == 0 entry */
+	struct device *cpu_dev;		/* CPU device that owns the "proc" and "sram" supplies */
+	struct regulator *proc_reg;	/* "proc" supply, obtained exclusively */
+	struct regulator *sram_reg;	/* "sram" supply, obtained exclusively */
+} *dvfs_info;
+
+/*
+ * Look up the OPP table entry to use for a target frequency.
+ *
+ * @freq: requested rate, in the same unit as cpu_opp_table.freq.
+ *
+ * Returns the index of the first table entry whose frequency is not below
+ * @freq, or -EINVAL when the terminating (freq == 0) entry is reached
+ * first.  The result must be checked for a negative value before it is
+ * used as an array index.
+ */
+static int cpu_opp_table_get_freq_index(unsigned int freq)
+{
+	struct cpu_opp_table *opp_tbl = dvfs_info->opp_tbl;
+	int i;
+
+	for (i = 0; opp_tbl[i].freq != 0; i++) {
+		if (opp_tbl[i].freq >= freq)
+			return i;
+	}
+
+	/* A proper errno instead of a bare -1 (which would read as -EPERM). */
+	return -EINVAL;
+}
+
+/*
+ * Look up the OPP table entry that corresponds to a proc voltage.
+ *
+ * @volt: proc regulator voltage, in the same unit as cpu_opp_table.vproc.
+ *
+ * Returns the index of the first table entry whose vproc is not below
+ * @volt, or -EINVAL when the terminating (vproc == -1) entry is reached
+ * first.  The result must be checked for a negative value before it is
+ * used as an array index.
+ */
+static int cpu_opp_table_get_volt_index(unsigned int volt)
+{
+	struct cpu_opp_table *opp_tbl = dvfs_info->opp_tbl;
+	int i;
+
+	for (i = 0; opp_tbl[i].vproc != -1; i++) {
+		/* vproc is signed; make the unsigned comparison explicit. */
+		if ((unsigned int)opp_tbl[i].vproc >= volt)
+			return i;
+	}
+
+	/* A proper errno instead of a bare -1 (which would read as -EPERM). */
+	return -EINVAL;
+}
+
+/*
+ * Find the smallest voltage a regulator can supply that is not below a
+ * requested value.
+ *
+ * @regulator: regulator whose selectable voltages are enumerated.
+ * @voltage:   requested minimum voltage.
+ *
+ * Every selector is examined, so the result does not depend on the order
+ * in which the regulator lists its voltages.  Selectors for which
+ * regulator_list_voltage() reports an error or zero (unusable selector)
+ * are skipped.
+ *
+ * Returns the matching voltage, or -EINVAL if @voltage is negative, the
+ * voltages cannot be counted, or no listed voltage reaches @voltage.
+ */
+static int get_regulator_voltage_ceil(struct regulator *regulator, int voltage)
+{
+	int cnt, i, volt, best = -EINVAL;
+
+	if (voltage < 0)
+		return -EINVAL;
+
+	/* A negative count (error) simply skips the loop below. */
+	cnt = regulator_count_voltages(regulator);
+
+	for (i = 0; i < cnt; i++) {
+		volt = regulator_list_voltage(regulator, i);
+		if (volt <= 0 || volt < voltage)
+			continue;
+
+		if (best < 0 || volt < best)
+			best = volt;
+	}
+
+	return best;
+}
+
+/*
+ * Walk the proc and sram regulators from one OPP to another in steps.
+ *
+ * @old_index: OPP table index matching the current proc voltage.
+ * @new_index: OPP table index of the target OPP.
+ *
+ * Each step moves to the farthest OPP (towards @new_index) for which the
+ * difference between the sram voltage on the higher side and the proc
+ * voltage on the lower side stays within VOLT_SHIFT_UPPER_LIMIT.  When
+ * lowering, proc is changed before sram; when raising, sram is changed
+ * before proc.
+ *
+ * Returns 0 on success, -EINVAL if either index is negative or if no
+ * further OPP can be reached without exceeding the limit, or the error
+ * returned by the regulator framework.
+ */
+static int mtk_cpufreq_voltage_trace(int old_index, int new_index)
+{
+	struct cpu_opp_table *opp_tbl = dvfs_info->opp_tbl;
+	int old_vproc, new_vproc, i, j, ret;
+
+	/* A failed table lookup must never be used as an array index. */
+	if (old_index < 0 || new_index < 0)
+		return -EINVAL;
+
+	old_vproc = regulator_get_voltage(dvfs_info->proc_reg);
+	if (old_vproc < 0)
+		return old_vproc;
+
+	new_vproc = opp_tbl[new_index].vproc;
+
+	if (old_vproc > new_vproc) {
+		for (i = old_index; i > new_index;) {
+			for (j = i; j >= new_index; j--)
+				if (opp_tbl[i].vsram - opp_tbl[j].vproc
+				    > VOLT_SHIFT_UPPER_LIMIT)
+					break;
+
+			/*
+			 * j + 1 is the lowest OPP reachable in this step.  If
+			 * it is not below the current one the walk would never
+			 * terminate, so give up instead.
+			 */
+			if (j + 1 >= i)
+				return -EINVAL;
+			i = j + 1;
+
+			ret = regulator_set_voltage_tol(dvfs_info->proc_reg,
+							opp_tbl[i].vproc, 0);
+			if (ret)
+				return ret;
+
+			ret = regulator_set_voltage_tol(dvfs_info->sram_reg,
+							opp_tbl[i].vsram, 0);
+			if (ret)
+				return ret;
+		}
+	} else if (old_vproc < new_vproc) {
+		for (i = old_index; i < new_index;) {
+			for (j = i; j <= new_index; j++)
+				if (opp_tbl[j].vsram - opp_tbl[i].vproc
+				    > VOLT_SHIFT_UPPER_LIMIT)
+					break;
+
+			/*
+			 * j - 1 is the highest OPP reachable in this step.  If
+			 * it is not above the current one the walk would never
+			 * terminate, so give up instead.
+			 */
+			if (j - 1 <= i)
+				return -EINVAL;
+			i = j - 1;
+
+			ret = regulator_set_voltage_tol(dvfs_info->sram_reg,
+							opp_tbl[i].vsram, 0);
+			if (ret)
+				return ret;
+
+			ret = regulator_set_voltage_tol(dvfs_info->proc_reg,
+							opp_tbl[i].vproc, 0);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * cpufreq transition notifier: adjust the proc/sram voltages around a
+ * frequency change of the CPUs listed in dvfs_info->cpus.
+ *
+ * @nb:     notifier block (unused).
+ * @action: CPUFREQ_PRECHANGE or CPUFREQ_POSTCHANGE.
+ * @data:   struct cpufreq_freqs describing the transition.
+ *
+ * Voltages are raised on PRECHANGE and lowered on POSTCHANGE.
+ *
+ * Returns NOTIFY_DONE when the event is not handled (foreign CPU, voltage
+ * already matching, or a failed lookup) and NOTIFY_OK otherwise.
+ */
+static int mtk_cpufreq_notify(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct cpufreq_freqs *freqs = data;
+	struct cpu_opp_table *opp_tbl = dvfs_info->opp_tbl;
+	int old_vproc, new_vproc, old_index, new_index;
+
+	if (!cpumask_test_cpu(freqs->cpu, &dvfs_info->cpus))
+		return NOTIFY_DONE;
+
+	old_vproc = regulator_get_voltage(dvfs_info->proc_reg);
+	if (old_vproc < 0) {
+		dev_err(dvfs_info->cpu_dev,
+			"Failed to read proc voltage: %d\n", old_vproc);
+		return NOTIFY_DONE;
+	}
+
+	old_index = cpu_opp_table_get_volt_index(old_vproc);
+	new_index = cpu_opp_table_get_freq_index(freqs->new * 1000);
+
+	/* Both lookups return a negative value when nothing matches. */
+	if (old_index < 0 || new_index < 0) {
+		dev_err(dvfs_info->cpu_dev,
+			"No OPP entry for %d uV or %u kHz\n",
+			old_vproc, freqs->new);
+		return NOTIFY_DONE;
+	}
+
+	new_vproc = opp_tbl[new_index].vproc;
+
+	if (old_vproc == new_vproc)
+		return NOTIFY_DONE;
+
+	if ((action == CPUFREQ_PRECHANGE && old_vproc < new_vproc) ||
+	    (action == CPUFREQ_POSTCHANGE && old_vproc > new_vproc))
+		mtk_cpufreq_voltage_trace(old_index, new_index);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block mtk_cpufreq_nb = {	/* registered in probe as a CPUFREQ_TRANSITION_NOTIFIER */
+	.notifier_call = mtk_cpufreq_notify,
+};
+
+/*
+ * Build the driver-private OPP table (dvfs_info->opp_tbl).
+ *
+ * @dev: device used for the device-managed table allocation and for
+ *       error reporting.
+ *
+ * The OPPs of dvfs_info->cpu_dev are loaded from the device tree and
+ * walked in ascending frequency order.  For each one the proc voltage is
+ * rounded up to a voltage the proc regulator can supply, and the sram
+ * voltage is chosen as the smallest sram regulator voltage that is at
+ * least VOLT_SHIFT_LOWER_LIMIT above it.  The table ends with a
+ * { 0, -1, -1 } terminator.  The OPP library table is released again
+ * before returning, on success as well as on failure.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int cpu_opp_table_init(struct device *dev)
+{
+	struct device *cpu_dev = dvfs_info->cpu_dev;
+	struct cpu_opp_table *opp_tbl;
+	struct dev_pm_opp *opp;
+	int ret, cnt, i;
+	/* Signed, so that error returns from the ceil helper are detectable. */
+	int vproc, vsram;
+	unsigned long rate;
+
+	ret = of_init_opp_table(cpu_dev);
+	if (ret) {
+		dev_err(dev, "Failed to init mtk_opp_table: %d\n", ret);
+		return ret;
+	}
+
+	rcu_read_lock();
+
+	cnt = dev_pm_opp_get_opp_count(cpu_dev);
+	if (cnt < 0) {
+		dev_err(cpu_dev, "No OPP table is found: %d\n", cnt);
+		ret = cnt;
+		goto out_free_opp_tbl;
+	}
+
+	/* One extra entry for the terminator; atomic because of the RCU lock. */
+	opp_tbl = devm_kcalloc(dev, (cnt + 1), sizeof(struct cpu_opp_table),
+			       GFP_ATOMIC);
+	if (!opp_tbl) {
+		ret = -ENOMEM;
+		goto out_free_opp_tbl;
+	}
+
+	/* rate++ steps just past the previous OPP so the ceil finds the next. */
+	for (i = 0, rate = 0; i < cnt; i++, rate++) {
+		opp = dev_pm_opp_find_freq_ceil(cpu_dev, &rate);
+		if (IS_ERR(opp)) {
+			ret = PTR_ERR(opp);
+			goto out_free_opp_tbl;
+		}
+
+		vproc = get_regulator_voltage_ceil(dvfs_info->proc_reg,
+						   dev_pm_opp_get_voltage(opp));
+		if (vproc < 0) {
+			ret = -EINVAL;
+			goto out_free_opp_tbl;
+		}
+
+		vsram = get_regulator_voltage_ceil(dvfs_info->sram_reg,
+						   vproc + VOLT_SHIFT_LOWER_LIMIT);
+		if (vsram < 0) {
+			ret = -EINVAL;
+			goto out_free_opp_tbl;
+		}
+
+		opp_tbl[i].freq = rate;
+		opp_tbl[i].vproc = vproc;
+		opp_tbl[i].vsram = vsram;
+	}
+
+	opp_tbl[i].freq = 0;
+	opp_tbl[i].vproc = -1;
+	opp_tbl[i].vsram = -1;
+	dvfs_info->opp_tbl = opp_tbl;
+
+out_free_opp_tbl:
+	rcu_read_unlock();
+	of_free_opp_table(cpu_dev);
+
+	return ret;
+}
+
+/*
+ * Return the domain on @domain_list whose CPU mask contains @cpu, or NULL
+ * if no listed domain covers that CPU.
+ */
+static struct cpufreq_cpu_domain *get_cpu_domain(struct list_head *domain_list,
+						 int cpu)
+{
+	struct cpufreq_cpu_domain *entry;
+
+	list_for_each_entry(entry, domain_list, node) {
+		if (cpumask_test_cpu(cpu, &entry->cpus))
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Probe the Mediatek cpufreq glue device.
+ *
+ * Reads the rate of the device's clock as the intermediate frequency,
+ * builds one cpufreq-dt domain per set of core siblings, takes the "proc"
+ * and "sram" regulators of the first CPU that provides both, builds the
+ * private OPP table, registers the transition notifier and finally
+ * registers the "cpufreq-dt" platform device with the domain list as
+ * platform data.
+ *
+ * Returns 0 on success or a negative errno (-EPROBE_DEFER when the clock
+ * or a regulator is not available yet).  On failure every reference taken
+ * here (regulators, notifier) is released and dvfs_info is reset to NULL.
+ */
+static int mtk_cpufreq_probe(struct platform_device *pdev)
+{
+	struct clk *inter_clk;
+	struct cpufreq_dt_platform_data *pd;
+	struct platform_device *dev;
+	unsigned long inter_freq;
+	int cpu, ret;
+
+	inter_clk = clk_get(&pdev->dev, NULL);
+	if (IS_ERR(inter_clk)) {
+		if (PTR_ERR(inter_clk) == -EPROBE_DEFER) {
+			dev_warn(&pdev->dev, "clock not ready. defer probeing.\n");
+			return -EPROBE_DEFER;
+		}
+
+		dev_err(&pdev->dev, "Failed to get intermediate clock\n");
+		return -ENODEV;
+	}
+	inter_freq = clk_get_rate(inter_clk);
+	/* Only the rate is needed, so drop the clock reference right away. */
+	clk_put(inter_clk);
+
+	pd = devm_kzalloc(&pdev->dev, sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return -ENOMEM;
+
+	dvfs_info = devm_kzalloc(&pdev->dev, sizeof(*dvfs_info), GFP_KERNEL);
+	if (!dvfs_info)
+		return -ENOMEM;
+
+	pd->independent_clocks = 1;
+	INIT_LIST_HEAD(&pd->domain_list);
+
+	for_each_possible_cpu(cpu) {
+		struct device *cpu_dev;
+		struct cpufreq_cpu_domain *new_domain;
+		struct regulator *proc_reg, *sram_reg;
+
+		cpu_dev = get_cpu_device(cpu);
+
+		/* Keep looking until one CPU provides both supplies. */
+		if (!dvfs_info->cpu_dev) {
+			proc_reg = regulator_get_exclusive(cpu_dev, "proc");
+			sram_reg = regulator_get_exclusive(cpu_dev, "sram");
+
+			if (!IS_ERR_OR_NULL(proc_reg) &&
+			    !IS_ERR_OR_NULL(sram_reg)) {
+				dvfs_info->cpu_dev = cpu_dev;
+				dvfs_info->proc_reg = proc_reg;
+				dvfs_info->sram_reg = sram_reg;
+				cpumask_copy(&dvfs_info->cpus,
+					     &cpu_topology[cpu].core_sibling);
+			} else {
+				bool defer;
+
+				defer = PTR_ERR(proc_reg) == -EPROBE_DEFER ||
+					PTR_ERR(sram_reg) == -EPROBE_DEFER;
+
+				/* Do not leak a supply obtained on its own. */
+				if (!IS_ERR_OR_NULL(proc_reg))
+					regulator_put(proc_reg);
+				if (!IS_ERR_OR_NULL(sram_reg))
+					regulator_put(sram_reg);
+
+				if (defer) {
+					ret = -EPROBE_DEFER;
+					goto out_put_regulators;
+				}
+			}
+		}
+
+		if (get_cpu_domain(&pd->domain_list, cpu))
+			continue;
+
+		new_domain = devm_kzalloc(&pdev->dev, sizeof(*new_domain),
+					  GFP_KERNEL);
+		if (!new_domain) {
+			ret = -ENOMEM;
+			goto out_put_regulators;
+		}
+
+		cpumask_copy(&new_domain->cpus,
+			     &cpu_topology[cpu].core_sibling);
+		new_domain->intermediate_freq = inter_freq;
+		list_add(&new_domain->node, &pd->domain_list);
+	}
+
+	if (IS_ERR_OR_NULL(dvfs_info->proc_reg) ||
+	    IS_ERR_OR_NULL(dvfs_info->sram_reg)) {
+		dev_err(&pdev->dev, "Failed to get regulators\n");
+		ret = -ENODEV;
+		goto out_put_regulators;
+	}
+
+	ret = cpu_opp_table_init(&pdev->dev);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to setup cpu_opp_table: %d\n",
+			ret);
+		goto out_put_regulators;
+	}
+
+	ret = cpufreq_register_notifier(&mtk_cpufreq_nb,
+					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to register cpufreq notifier\n");
+		goto out_put_regulators;
+	}
+
+	dev = platform_device_register_data(NULL, "cpufreq-dt", -1, pd,
+					    sizeof(*pd));
+	if (IS_ERR(dev)) {
+		dev_err(&pdev->dev,
+			"Failed to register cpufreq-dt platform device\n");
+		ret = PTR_ERR(dev);
+		goto out_unregister_notifier;
+	}
+
+	return 0;
+
+out_unregister_notifier:
+	cpufreq_unregister_notifier(&mtk_cpufreq_nb,
+				    CPUFREQ_TRANSITION_NOTIFIER);
+out_put_regulators:
+	if (!IS_ERR_OR_NULL(dvfs_info->proc_reg))
+		regulator_put(dvfs_info->proc_reg);
+	if (!IS_ERR_OR_NULL(dvfs_info->sram_reg))
+		regulator_put(dvfs_info->sram_reg);
+	/* The devm allocation goes away with the failed probe. */
+	dvfs_info = NULL;
+
+	return ret;
+}
+
+static const struct of_device_id mtk_cpufreq_match[] = {	/* device-tree match table for this driver */
+	{
+		.compatible = "mediatek,mtk-cpufreq",
+	},
+	{}	/* sentinel */
+};
+MODULE_DEVICE_TABLE(of, mtk_cpufreq_match);
+
+static struct platform_driver mtk_cpufreq_platdrv = {	/* platform driver; no .remove callback is provided */
+	.driver	= {
+		.name	= "mtk-cpufreq",
+		.of_match_table = mtk_cpufreq_match,
+	},
+	.probe	= mtk_cpufreq_probe,
+};
+module_platform_driver(mtk_cpufreq_platdrv);
+