diff mbox

[2/2] cpufreq: cpu0: Extend support beyond CPU0

Message ID e9c0eef07745e561b9d12dcb50c3e2f7eb3e8880.1403684824.git.viresh.kumar@linaro.org
State New
Headers show

Commit Message

Viresh Kumar June 25, 2014, 8:42 a.m. UTC
cpufreq-cpu0 driver supports only platforms which have single clock line shared
among all CPUs.

We already have platforms where this limitation doesn't hold true.For example on
Qualcomm's KRAIT all CPUs have separate clock line and so have separate
policies.

Instead of adding another driver for this (Stephen just tried that:
https://lkml.org/lkml/2014/6/24/918), we must reuse cpufreq-cpu0 driver.

cpufreq-cpu0 must be updated to break the assumption on which it is based (all
cores sharing clock line) and this patch tries to do exactly that.

As we don't have standard DT bindings to mention CPUs sharing clock-line, i.e.
affected_cpus, this patch also have few limitations. Though easy to fix once we
have proper bindings.

Limitation: We only supports two types of platforms:
- All CPUs sharing same clock line, existing user platforms
- All CPUs have separate clock lines, KRAIT

And so platforms which have multiple clusters with multiple CPUs per cluster
aren't supported yet. We need proper bindings for that first.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 .../devicetree/bindings/cpufreq/cpufreq-cpu0.txt   |   8 +-
 drivers/cpufreq/Kconfig                            |   5 +-
 drivers/cpufreq/cpufreq-cpu0.c                     | 280 +++++++++++++--------
 3 files changed, 190 insertions(+), 103 deletions(-)

Comments

Stephen Boyd June 25, 2014, 7:02 p.m. UTC | #1
On 06/25/14 01:42, Viresh Kumar wrote:
> cpufreq-cpu0 driver supports only platforms which have single clock line shared
> among all CPUs.
>
> We already have platforms where this limitation doesn't hold true.For example on
> Qualcomm's KRAIT all CPUs have separate clock line and so have separate
> policies.

Krait is not all capitalized.

>
> Instead of adding another driver for this (Stephen just tried that:
> https://lkml.org/lkml/2014/6/24/918), we must reuse cpufreq-cpu0 driver.
>
> cpufreq-cpu0 must be updated to break the assumption on which it is based (all
> cores sharing clock line) and this patch tries to do exactly that.
>
> As we don't have standard DT bindings to mention CPUs sharing clock-line, i.e.
> affected_cpus, this patch also have few limitations. Though easy to fix once we
> have proper bindings.

It should be easy enough to read the clocks property from DT for all the
CPU nodes and check to see if they're the same? I don't think we need an
affected-cpus property. Also I'd like to see current DTs specifying the
same data (clocks, regulators, opps) for each CPU node even if they all
share the same clock, regulator, and opp. It seems that the cpu0 binding
just wanted to save some space in the DT by consolidating them all under
one node, but that seems misguided. Case in point, now that we have
configurations that need more than one clock per cluster the code is
complicated and the binding is not uniform.

>
> Limitation: We only supports two types of platforms:
> - All CPUs sharing same clock line, existing user platforms
> - All CPUs have separate clock lines, KRAIT
>
> And so platforms which have multiple clusters with multiple CPUs per cluster
> aren't supported yet. We need proper bindings for that first.
>
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
>  .../devicetree/bindings/cpufreq/cpufreq-cpu0.txt   |   8 +-
>  drivers/cpufreq/Kconfig                            |   5 +-
>  drivers/cpufreq/cpufreq-cpu0.c                     | 280 +++++++++++++--------
>  3 files changed, 190 insertions(+), 103 deletions(-)
>
> diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-cpu0.txt b/Documentation/devicetree/bindings/cpufreq/cpufreq-cpu0.txt
> index f055515..fa8861e 100644
> --- a/Documentation/devicetree/bindings/cpufreq/cpufreq-cpu0.txt
> +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-cpu0.txt
> @@ -1,15 +1,17 @@
>  Generic CPU0 cpufreq driver
>  
> -It is a generic cpufreq driver for CPU0 frequency management.  It
> +It is a generic cpufreq driver for CPU frequency management.  It
>  supports both uniprocessor (UP) and symmetric multiprocessor (SMP)
> -systems which share clock and voltage across all CPUs.
> +systems which share clock and voltage across all CPUs OR have separate
> +clock and voltage for all CPUs.
>  
>  Both required and optional properties listed below must be defined
>  under node /cpus/cpu@0.
>  
>  Required properties:
>  - operating-points: Refer to Documentation/devicetree/bindings/power/opp.txt
> -  for details
> +  for details. This should either be present only in cpu@0, if CPUs share clock
> +  line OR should be present in all cpu nodes, if they have separate clock lines.
>  
>  Optional properties:
>  - clock-latency: Specify the possible maximum transition latency for clock,
> diff --git a/drivers/cpufreq/cpufreq-cpu0.c b/drivers/cpufreq/cpufreq-cpu0.c
> index ee1ae30..d29f6a0 100644
> --- a/drivers/cpufreq/cpufreq-cpu0.c
> +++ b/drivers/cpufreq/cpufreq-cpu0.c
> @@ -25,34 +25,43 @@
>  #include <linux/slab.h>
>  #include <linux/thermal.h>
>  
> -static unsigned int transition_latency;
> -static unsigned int voltage_tolerance; /* in percentage */
> +struct private_data {
> +	struct device *cpu_dev;
> +	struct regulator *cpu_reg;
> +	struct thermal_cooling_device *cdev;
> +	unsigned int voltage_tolerance; /* in percentage */
> +};
>  
> -static struct device *cpu_dev;
> -static struct clk *cpu_clk;
> -static struct regulator *cpu_reg;
> -static struct cpufreq_frequency_table *freq_table;
> -static struct thermal_cooling_device *cdev;
> +/*
> + * Currently this driver supports only two types of platforms:
> + * - All CPUs sharing same clock line, clock_per_cpu == false
> + * - All CPUs having separate clock lines, clock_per_cpu == true
> + *
> + * TODO: Scan affected/related CPUs from proper DT bindings
> + */
> +bool clock_per_cpu = false;
>  
>  static int cpu0_set_target(struct cpufreq_policy *policy, unsigned int index)
>  {
>  	struct dev_pm_opp *opp;
> +	struct private_data *priv = policy->driver_data;
>  	unsigned long volt = 0, volt_old = 0, tol = 0;
>  	unsigned int old_freq, new_freq;
>  	long freq_Hz, freq_exact;
>  	int ret;
>  
> -	freq_Hz = clk_round_rate(cpu_clk, freq_table[index].frequency * 1000);
> +	freq_Hz = clk_round_rate(policy->clk,
> +				 policy->freq_table[index].frequency * 1000);
>  	if (freq_Hz <= 0)
> -		freq_Hz = freq_table[index].frequency * 1000;
> +		freq_Hz = policy->freq_table[index].frequency * 1000;
>  
>  	freq_exact = freq_Hz;
>  	new_freq = freq_Hz / 1000;
> -	old_freq = clk_get_rate(cpu_clk) / 1000;
> +	old_freq = clk_get_rate(policy->clk) / 1000;
>  
> -	if (!IS_ERR(cpu_reg)) {
> +	if (!IS_ERR(priv->cpu_reg)) {
>  		rcu_read_lock();
> -		opp = dev_pm_opp_find_freq_ceil(cpu_dev, &freq_Hz);
> +		opp = dev_pm_opp_find_freq_ceil(priv->cpu_dev, &freq_Hz);
>  		if (IS_ERR(opp)) {
>  			rcu_read_unlock();
>  			pr_err("failed to find OPP for %ld\n", freq_Hz);
> @@ -60,8 +69,8 @@ static int cpu0_set_target(struct cpufreq_policy *policy, unsigned int index)
>  		}
>  		volt = dev_pm_opp_get_voltage(opp);
>  		rcu_read_unlock();
> -		tol = volt * voltage_tolerance / 100;
> -		volt_old = regulator_get_voltage(cpu_reg);
> +		tol = volt * priv->voltage_tolerance / 100;
> +		volt_old = regulator_get_voltage(priv->cpu_reg);
>  	}
>  
>  	pr_debug("%u MHz, %ld mV --> %u MHz, %ld mV\n",

This patch would be less noisy and easier to review if we used local
variables in this function for cpu_req, voltage_tolerance, and cpu_clk.
Can we do that please?

>
> @@ -99,77 +108,70 @@ static int cpu0_set_target(struct cpufreq_policy *policy, unsigned int index)
>  
>  static int cpu0_cpufreq_init(struct cpufreq_policy *policy)
>  {
> -	policy->clk = cpu_clk;
> -	return cpufreq_generic_init(policy, freq_table, transition_latency);
> -}
> -
> -static struct cpufreq_driver cpu0_cpufreq_driver = {
> -	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
> -	.verify = cpufreq_generic_frequency_table_verify,
> -	.target_index = cpu0_set_target,
> -	.get = cpufreq_generic_get,
> -	.init = cpu0_cpufreq_init,
> -	.name = "generic_cpu0",
> -	.attr = cpufreq_generic_attr,
> -};
> -
> -static int cpu0_cpufreq_probe(struct platform_device *pdev)
> -{
> +	struct cpufreq_frequency_table *freq_table;
> +	struct private_data *priv;
>  	struct device_node *np;
> -	int ret;
> +	char name[] = "cpuX";
> +	int ret, cpu = policy->cpu;
>  
> -	cpu_dev = get_cpu_device(0);
> -	if (!cpu_dev) {
> -		pr_err("failed to get cpu0 device\n");
> -		return -ENODEV;
> +	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> +	if (!priv) {
> +		pr_err("%s: Memory allocation failed\n", __func__);

Useless allocation error message.

> +		return -ENOMEM;
>  	}
>  
> -	np = of_node_get(cpu_dev->of_node);
> -	if (!np) {
> -		pr_err("failed to find cpu0 node\n");
> -		return -ENOENT;
> -	}
> +	/* Below calls can't fail from here */
> +	priv->cpu_dev = get_cpu_device(cpu);
> +	np = of_node_get(priv->cpu_dev->of_node);
>  
> -	cpu_reg = regulator_get_optional(cpu_dev, "cpu0");
> -	if (IS_ERR(cpu_reg)) {
> -		/*
> -		 * If cpu0 regulator supply node is present, but regulator is
> -		 * not yet registered, we should try defering probe.
> -		 */
> -		if (PTR_ERR(cpu_reg) == -EPROBE_DEFER) {
> -			dev_err(cpu_dev, "cpu0 regulator not ready, retry\n");
> -			ret = -EPROBE_DEFER;
> -			goto out_put_node;
> -		}
> -		pr_warn("failed to get cpu0 regulator: %ld\n",
> -			PTR_ERR(cpu_reg));
> +	policy->clk = clk_get(priv->cpu_dev, NULL);
> +	if (IS_ERR(policy->clk)) {
> +		ret = PTR_ERR(policy->clk);
> +		pr_err("%s: failed to get cpu%d clock: %d\n", __func__, cpu,
> +		       ret);
> +		goto out_put_node;
>  	}

This won't work all the time. We need probe defer to work for the clocks
because sometimes CPU clocks are registered after this driver probes. In
my case this is always true unless I play games with initcall levels.
The same is true for regulators.

>  
> -	cpu_clk = clk_get(cpu_dev, NULL);
> -	if (IS_ERR(cpu_clk)) {
> -		ret = PTR_ERR(cpu_clk);
> -		pr_err("failed to get cpu0 clock: %d\n", ret);
> -		goto out_put_reg;
> +	ret = of_init_opp_table(priv->cpu_dev);
> +	if (ret) {
> +		pr_err("%s: failed to init OPP table for cpu%d: %d\n", __func__,
> +		       cpu, ret);
> +		goto out_put_clk;
>  	}
>  
> -	ret = of_init_opp_table(cpu_dev);
> +	ret = dev_pm_opp_init_cpufreq_table(priv->cpu_dev, &freq_table);
>  	if (ret) {
> -		pr_err("failed to init OPP table: %d\n", ret);
> +		pr_err("%s: failed to init cpufreq table for cpu%d: %d\n",
> +		       __func__, cpu, ret);
>  		goto out_put_clk;
>  	}
>  
> -	ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
> +	ret = cpufreq_table_validate_and_show(policy, freq_table);
>  	if (ret) {
> -		pr_err("failed to init cpufreq table: %d\n", ret);
> -		goto out_put_clk;
> +		pr_err("%s: invalid frequency table for cpu%d: %d\n", __func__,
> +		       cpu, ret);
> +		goto out_put_reg;
>  	}
>  
> -	of_property_read_u32(np, "voltage-tolerance", &voltage_tolerance);
> +	/* Set mask of CPUs sharing clock line with policy->cpu */
> +	if (!clock_per_cpu)
> +		cpumask_setall(policy->cpus);
>  
> -	if (of_property_read_u32(np, "clock-latency", &transition_latency))
> -		transition_latency = CPUFREQ_ETERNAL;
> +	of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance);
>  
> -	if (!IS_ERR(cpu_reg)) {
> +	if (of_property_read_u32(np, "clock-latency",
> +				 &policy->cpuinfo.transition_latency))
> +		policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
> +
> +	/* NOTE: Supports only 10 CPUs: 0-9 */
> +	WARN_ONCE(cpu > 9, "cpu: %d", cpu);

Why would the supply name be called cpuN-supply? Why not use cpu-supply
in each CPU node? That way there is no limit on the number of CPUs. We
can always check for cpu0-supply first to be compatible with older DTs
but going forward we should be able to just use cpu-supply.

> +
> +	name[3] = '0' + cpu;
> +	priv->cpu_reg = regulator_get_optional(priv->cpu_dev, name);
> +	if (IS_ERR(priv->cpu_reg)) {
> +		pr_warn("%s: failed to get cpu%d's regulator: %ld\n", __func__,
> +			cpu, PTR_ERR(priv->cpu_reg));
> +	} else {
>  		struct dev_pm_opp *opp;
>  		unsigned long min_uV, max_uV;
>  		int i;
[...]
> +
> +static int cpu0_cpufreq_probe(struct platform_device *pdev)
> +{
> +	const struct property *prop;
> +	struct device *cpu_dev;
> +	struct regulator *cpu_reg;
> +	int ret, cpu, count = 0;
> +
> +	/* Optimization for regulator's -EPROBE_DEFER case */
> +	if (unlikely(clock_per_cpu))
> +		goto scan_regulator;

Why not look for the regulator first then?

> +
> +	/*
> +	 * Scan all CPU nodes to set clock_per_cpu
> +	 *
> +	 * FIXME: Scan affected-CPUs from DT and remove this loop.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		cpu_dev = get_cpu_device(cpu);
> +		if (!cpu_dev) {
> +			pr_err("%s: failed to get cpu%d device\n", __func__,
> +			       cpu);
> +			return -ENODEV;
> +		}
> +
> +		/* Count how many CPU nodes have defined operating-points */
> +		prop = of_find_property(cpu_dev->of_node, "operating-points",
> +					NULL);
> +		if (prop) {
> +			count++;
> +		} else if (cpu == 0) {
> +			pr_err("%s: operating-points missing: cpu%d\n",
> +			       __func__, cpu);
> +			return -ENOENT;
> +		}
> +	}
> +
> +	/* Either one or all CPUs must have operating-points property */
> +	if (count == cpu + 1) {

Is this supposed to be count == cpu? At this point in the code count
equals the number of CPUs present.

> +		clock_per_cpu = true;
> +	} else if (count != 1) {
> +		pr_err("%s: operating-points missing for few CPUs\n", __func__);
> +		return -ENOENT;
> +	}
> +
> +scan_regulator:
> +	/* Check only CPU0's regulator for EPROBE_DEFER */
> +	cpu_dev = get_cpu_device(0); /* Can't fail now */
> +	cpu_reg = regulator_get_optional(cpu_dev, "cpu0");
> +	if (!IS_ERR(cpu_reg)) {
> +		regulator_put(cpu_reg);
> +	} else if (PTR_ERR(cpu_reg) == -EPROBE_DEFER) {
> +		/*
> +		 * If cpu0 regulator supply node is present, but regulator is
> +		 * not yet registered, we should try defering probe.
> +		 */
> +		dev_err(cpu_dev, "cpu0 regulator not ready, retry\n");
> +		return -EPROBE_DEFER;
> +	}

I don't think this driver should be using regulator_get_optional() (Mark
B. please correct me if I'm wrong). I doubt a supply is actually
optional for CPUs, just some DTs aren't specifying them. In those cases,
the regulator core will insert a dummy supply and the code will work
without having to check for probe defer and error pointers.
Viresh Kumar June 26, 2014, 1:55 a.m. UTC | #2
On 26 June 2014 00:32, Stephen Boyd <sboyd@codeaurora.org> wrote:
> It should be easy enough to read the clocks property from DT for all the
> CPU nodes and check to see if they're the same?

Not everybody has clocks supported in DT and I am not sure if it will
even work for the current users as well..

But yeah, that's one way of finding it out.

> I don't think we need an affected-cpus property.

There was some work going around to fix bindings for CPUs that share
resources, for the bigger task of making "cpufreq work well with
scheduler". And we can use them probably.

> Also I'd like to see current DTs specifying the
> same data (clocks, regulators, opps) for each CPU node even if they all
> share the same clock, regulator, and opp. It seems that the cpu0 binding
> just wanted to save some space in the DT by consolidating them all under
> one node, but that seems misguided. Case in point, now that we have
> configurations that need more than one clock per cluster the code is
> complicated and the binding is not uniform.

Hmm, probably you are right. But again we may not have nodes for
these in DT for few users of cpufreq-cpu0 and we need to investigate
for that.

>> diff --git a/drivers/cpufreq/cpufreq-cpu0.c b/drivers/cpufreq/cpufreq-cpu0.c
> This patch would be less noisy and easier to review if we used local
> variables in this function for cpu_req, voltage_tolerance, and cpu_clk.
> Can we do that please?

Hmm, yeah will try to make it less noisy.

>> -static int cpu0_cpufreq_probe(struct platform_device *pdev)
>> -{
>> +     struct cpufreq_frequency_table *freq_table;
>> +     struct private_data *priv;
>>       struct device_node *np;
>> -     int ret;
>> +     char name[] = "cpuX";
>> +     int ret, cpu = policy->cpu;
>>
>> -     cpu_dev = get_cpu_device(0);
>> -     if (!cpu_dev) {
>> -             pr_err("failed to get cpu0 device\n");
>> -             return -ENODEV;
>> +     priv = kzalloc(sizeof(*priv), GFP_KERNEL);
>> +     if (!priv) {
>> +             pr_err("%s: Memory allocation failed\n", __func__);
>
> Useless allocation error message.

Why so useless? You meant we should mention 'priv' here?
Will do.

>> +             return -ENOMEM;
>>       }
>>
>> -     np = of_node_get(cpu_dev->of_node);
>> -     if (!np) {
>> -             pr_err("failed to find cpu0 node\n");
>> -             return -ENOENT;
>> -     }
>> +     /* Below calls can't fail from here */
>> +     priv->cpu_dev = get_cpu_device(cpu);
>> +     np = of_node_get(priv->cpu_dev->of_node);
>>
>> -     cpu_reg = regulator_get_optional(cpu_dev, "cpu0");
>> -     if (IS_ERR(cpu_reg)) {
>> -             /*
>> -              * If cpu0 regulator supply node is present, but regulator is
>> -              * not yet registered, we should try defering probe.
>> -              */
>> -             if (PTR_ERR(cpu_reg) == -EPROBE_DEFER) {
>> -                     dev_err(cpu_dev, "cpu0 regulator not ready, retry\n");
>> -                     ret = -EPROBE_DEFER;
>> -                     goto out_put_node;
>> -             }
>> -             pr_warn("failed to get cpu0 regulator: %ld\n",
>> -                     PTR_ERR(cpu_reg));
>> +     policy->clk = clk_get(priv->cpu_dev, NULL);
>> +     if (IS_ERR(policy->clk)) {
>> +             ret = PTR_ERR(policy->clk);
>> +             pr_err("%s: failed to get cpu%d clock: %d\n", __func__, cpu,
>> +                    ret);
>> +             goto out_put_node;
>>       }
>
> This won't work all the time. We need probe defer to work for the clocks
> because sometimes CPU clocks are registered after this driver probes. In
> my case this is always true unless I play games with initcall levels.
> The same is true for regulators.

Okay..

>> -     cpu_clk = clk_get(cpu_dev, NULL);
>> -     if (IS_ERR(cpu_clk)) {
>> -             ret = PTR_ERR(cpu_clk);
>> -             pr_err("failed to get cpu0 clock: %d\n", ret);
>> -             goto out_put_reg;
>> +     ret = of_init_opp_table(priv->cpu_dev);
>> +     if (ret) {
>> +             pr_err("%s: failed to init OPP table for cpu%d: %d\n", __func__,
>> +                    cpu, ret);
>> +             goto out_put_clk;
>>       }
>>
>> -     ret = of_init_opp_table(cpu_dev);
>> +     ret = dev_pm_opp_init_cpufreq_table(priv->cpu_dev, &freq_table);
>>       if (ret) {
>> -             pr_err("failed to init OPP table: %d\n", ret);
>> +             pr_err("%s: failed to init cpufreq table for cpu%d: %d\n",
>> +                    __func__, cpu, ret);
>>               goto out_put_clk;
>>       }
>>
>> -     ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
>> +     ret = cpufreq_table_validate_and_show(policy, freq_table);
>>       if (ret) {
>> -             pr_err("failed to init cpufreq table: %d\n", ret);
>> -             goto out_put_clk;
>> +             pr_err("%s: invalid frequency table for cpu%d: %d\n", __func__,
>> +                    cpu, ret);
>> +             goto out_put_reg;
>>       }
>>
>> -     of_property_read_u32(np, "voltage-tolerance", &voltage_tolerance);
>> +     /* Set mask of CPUs sharing clock line with policy->cpu */
>> +     if (!clock_per_cpu)
>> +             cpumask_setall(policy->cpus);
>>
>> -     if (of_property_read_u32(np, "clock-latency", &transition_latency))
>> -             transition_latency = CPUFREQ_ETERNAL;
>> +     of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance);
>>
>> -     if (!IS_ERR(cpu_reg)) {
>> +     if (of_property_read_u32(np, "clock-latency",
>> +                              &policy->cpuinfo.transition_latency))
>> +             policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
>> +
>> +     /* NOTE: Supports only 10 CPUs: 0-9 */
>> +     WARN_ONCE(cpu > 9, "cpu: %d", cpu);
>
> Why would the supply name be called cpuN-supply? Why not use cpu-supply
> in each CPU node? That way there is no limit on the number of CPUs. We
> can always check for cpu0-supply first to be compatible with older DTs
> but going forward we should be able to just use cpu-supply.

Will investigate on what all needs to change for it to work..

>> +static int cpu0_cpufreq_probe(struct platform_device *pdev)
>> +{
>> +     const struct property *prop;
>> +     struct device *cpu_dev;
>> +     struct regulator *cpu_reg;
>> +     int ret, cpu, count = 0;
>> +
>> +     /* Optimization for regulator's -EPROBE_DEFER case */
>> +     if (unlikely(clock_per_cpu))
>> +             goto scan_regulator;
>
> Why not look for the regulator first then?

Maybe yes.

>> +     /*
>> +      * Scan all CPU nodes to set clock_per_cpu
>> +      *
>> +      * FIXME: Scan affected-CPUs from DT and remove this loop.
>> +      */
>> +     for_each_possible_cpu(cpu) {
>> +             cpu_dev = get_cpu_device(cpu);
>> +             if (!cpu_dev) {
>> +                     pr_err("%s: failed to get cpu%d device\n", __func__,
>> +                            cpu);
>> +                     return -ENODEV;
>> +             }
>> +
>> +             /* Count how many CPU nodes have defined operating-points */
>> +             prop = of_find_property(cpu_dev->of_node, "operating-points",
>> +                                     NULL);
>> +             if (prop) {
>> +                     count++;
>> +             } else if (cpu == 0) {
>> +                     pr_err("%s: operating-points missing: cpu%d\n",
>> +                            __func__, cpu);
>> +                     return -ENOENT;
>> +             }
>> +     }
>> +
>> +     /* Either one or all CPUs must have operating-points property */
>> +     if (count == cpu + 1) {
>
> Is this supposed to be count == cpu? At this point in the code count
> equals the number of CPUs present.

Let me know if my calculations are wrong but I wrote this intentionally.

CPUs start from zero and count is incremented by one for each cpu.
So, if there are 4 CPUs, then count is going to be 4 and cpu is going
to be 3?

>> +     cpu_reg = regulator_get_optional(cpu_dev, "cpu0");

> I don't think this driver should be using regulator_get_optional() (Mark
> B. please correct me if I'm wrong). I doubt a supply is actually
> optional for CPUs, just some DTs aren't specifying them. In those cases,
> the regulator core will insert a dummy supply and the code will work
> without having to check for probe defer and error pointers.

Wasn't aware of this, will see if that can be done in a separate patch.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Viresh Kumar June 26, 2014, 7:34 a.m. UTC | #3
On 26 June 2014 00:32, Stephen Boyd <sboyd@codeaurora.org> wrote:

>> +     cpu_reg = regulator_get_optional(cpu_dev, "cpu0");

> I don't think this driver should be using regulator_get_optional() (Mark
> B. please correct me if I'm wrong). I doubt a supply is actually
> optional for CPUs, just some DTs aren't specifying them. In those cases,
> the regulator core will insert a dummy supply and the code will work
> without having to check for probe defer and error pointers.

This is what Mark did earlier, don't know if he would like to revert it:
7d74897 (cpufreq: cpufreq-cpu0: Use devm_regulator_get_optional()).
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar June 26, 2014, 10:52 a.m. UTC | #4
On 26 June 2014 00:32, Stephen Boyd <sboyd@codeaurora.org> wrote:
> I don't think this driver should be using regulator_get_optional() (Mark
> B. please correct me if I'm wrong). I doubt a supply is actually
> optional for CPUs, just some DTs aren't specifying them. In those cases,
> the regulator core will insert a dummy supply and the code will work
> without having to check for probe defer and error pointers.

Hi Stephen,

Thanks for your comments.

Leaving the above one, I have tried to fix all you mentioned. And it surely
looks much better now.

I would like to wait for a day or two before sending V2, as people might
be reviewing it and the above issue is still wide open..

But in case you wanna test it (completely changed I must say, but
for good), its here:

git://git.linaro.org/people/viresh.kumar/linux.git cpufreq/cpu0-krait-v2

Let me know if you still find it childish :)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Mark Brown June 26, 2014, 10:08 p.m. UTC | #5
On Wed, Jun 25, 2014 at 12:02:25PM -0700, Stephen Boyd wrote:

> I don't think this driver should be using regulator_get_optional() (Mark
> B. please correct me if I'm wrong). I doubt a supply is actually
> optional for CPUs, just some DTs aren't specifying them. In those cases,
> the regulator core will insert a dummy supply and the code will work
> without having to check for probe defer and error pointers.

Since the cpufreq driver is actively working with the regulator,
changing voltages and so on it's mostly OK, it's kind of on the border
of something that should do this but there doesn't seem to be a
reasonable way for it to handle not being able to read the voltage
cleanly.
Mike Turquette June 27, 2014, 1:53 a.m. UTC | #6
Quoting Stephen Boyd (2014-06-26 17:06:00)
> Finally, checking for equivalent pointers from clk_get() will work now,

Please don't do that. Even though it works for the current
implementation, comparing those pointers from a driver violates how
clkdev is supposed to work. The pointer returned by clk_get should only
be dereferenced by a driver to check if it is an error code. Anything
besides an error code is no business of the driver.

> but it isn't future-proof if/when the clock framework starts returning
> dynamically allocated clock pointers for each clk_get() invocation.
> Maybe we need a function in the common clock framework that tells us if
> the clocks are the same either via DT or by taking two clock pointers?

I looked through the patch briefly and did not see why we would need to
do this. Any hint?

Thanks,
Mike

> 
> -- 
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
> hosted by The Linux Foundation
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar June 27, 2014, 2:15 a.m. UTC | #7
On 27 June 2014 07:23, Mike Turquette <mturquette@linaro.org> wrote:
>> but it isn't future-proof if/when the clock framework starts returning
>> dynamically allocated clock pointers for each clk_get() invocation.
>> Maybe we need a function in the common clock framework that tells us if
>> the clocks are the same either via DT or by taking two clock pointers?
>
> I looked through the patch briefly and did not see why we would need to
> do this. Any hint?

We want to know which CPUs are sharing clock line, so that we can
fill affected-cpus field of cpufreq core.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar June 27, 2014, 2:26 a.m. UTC | #8
On 27 June 2014 05:36, Stephen Boyd <sboyd@codeaurora.org> wrote:
> I gave it a spin. It looks mostly good except for the infinite loop:

Great !!

> diff --git a/drivers/cpufreq/cpufreq-cpu0.c b/drivers/cpufreq/cpufreq-cpu0.c
> index b7ee67c4d1c0..6744321ae33d 100644
> --- a/drivers/cpufreq/cpufreq-cpu0.c
> +++ b/drivers/cpufreq/cpufreq-cpu0.c
> @@ -138,8 +138,10 @@ try_again:
>                 }
>
>                 /* Try with "cpu-supply" */
> -               if (reg == reg_cpu0)
> +               if (reg == reg_cpu0) {
> +                       reg = reg_cpu;
>                         goto try_again;
> +               }
>
>                 dev_warn(cpu_dev, "failed to get cpu%d regulator: %ld\n",
>                          cpu, PTR_ERR(cpu_reg));

Awesome code, I wrote it. Thanks for fixing it.

> and I think we just want reg_cpu to be "cpu", not "cpu-supply" because I
> think the regulator core adds in the "-supply" part already.

Okay.

> After fixing that I can get cpufreq going. I'm currently working on
> populating the OPPs at runtime without relying on DT. So eventually I'll
> need this patch:

I see..

> diff --git a/drivers/cpufreq/cpufreq-cpu0.c b/drivers/cpufreq/cpufreq-cpu0.c
> index b7ee67c4d1c0..6744321ae33d 100644
> --- a/drivers/cpufreq/cpufreq-cpu0.c
> +++ b/drivers/cpufreq/cpufreq-cpu0.c
> @@ -239,11 +241,6 @@ static int cpu0_cpufreq_init(struct cpufreq_policy *policy)
>         }
>
>         ret = of_init_opp_table(cpu_dev);
> -       if (ret) {
> -               dev_err(cpu_dev, "failed to init OPP table: %d\n", ret);
> -               goto out_put_node;
> -       }
> -
>         ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
>         if (ret) {
>                 dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret);
>
> which I hope is ok.

I need to see details of these routines once again, but will surely make
it work for you.
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shawn Guo June 28, 2014, 2:52 p.m. UTC | #9
On Wed, Jun 25, 2014 at 02:12:29PM +0530, Viresh Kumar wrote:
> cpufreq-cpu0 driver supports only platforms which have single clock line shared
> among all CPUs.
> 
> We already have platforms where this limitation doesn't hold true.For example on
> Qualcomm's KRAIT all CPUs have separate clock line and so have separate
> policies.
> 
> Instead of adding another driver for this (Stephen just tried that:
> https://lkml.org/lkml/2014/6/24/918), we must reuse cpufreq-cpu0 driver.
> 
> cpufreq-cpu0 must be updated to break the assumption on which it is based (all
> cores sharing clock line) and this patch tries to do exactly that.
> 
> As we don't have standard DT bindings to mention CPUs sharing clock-line, i.e.
> affected_cpus, this patch also have few limitations. Though easy to fix once we
> have proper bindings.
> 
> Limitation: We only supports two types of platforms:
> - All CPUs sharing same clock line, existing user platforms
> - All CPUs have separate clock lines, KRAIT
> 
> And so platforms which have multiple clusters with multiple CPUs per cluster
> aren't supported yet. We need proper bindings for that first.
> 
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
>  .../devicetree/bindings/cpufreq/cpufreq-cpu0.txt   |   8 +-
>  drivers/cpufreq/Kconfig                            |   5 +-
>  drivers/cpufreq/cpufreq-cpu0.c                     | 280 +++++++++++++--------
>  3 files changed, 190 insertions(+), 103 deletions(-)

Hi Viresh,

Thanks for all the effort on maintaining and improving cpufreq-cpu0
driver.  Your patch rewrote the most part of the driver, so I'd like to
hand over the driver to you.  Please add yourself as the primary person
for MODULE_AUTHOR.

Shawn
Viresh Kumar June 30, 2014, 7:57 a.m. UTC | #10
On 27 June 2014 07:45, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> On 27 June 2014 07:23, Mike Turquette <mturquette@linaro.org> wrote:
>>> but it isn't future-proof if/when the clock framework starts returning
>>> dynamically allocated clock pointers for each clk_get() invocation.
>>> Maybe we need a function in the common clock framework that tells us if
>>> the clocks are the same either via DT or by taking two clock pointers?
>>
>> I looked through the patch briefly and did not see why we would need to
>> do this. Any hint?
>
> We want to know which CPUs are sharing clock line, so that we can
> fill affected-cpus field of cpufreq core.

What about comparing "clocks" property in cpu DT nodes?

@Rob/Grant: I tried looking for an existing routine to do that, but couldn't
find it. And so wrote one.

I am not good at DT stuff and so I do hope there would be few correction
required here. Let me know if this can be added to drivers/of/base.c :

+/**
+ * of_property_match - Match property in two nodes
+ * @np1, np2: Nodes to match
+ * @list_name: property to match
+ *
+ * Returns 1 on match, 0 on no match, and error for missing property.
+ */
+static int of_property_match(const struct device_node *np1,
+                             const struct device_node *np2,
+                             const char *list_name)
+{
+       const __be32 *list1, *list2, *list1_end;
+       int size1, size2;
+       phandle phandle1, phandle2;
+
+       /* Retrieve the list property */
+       list1 = of_get_property(np1, list_name, &size1);
+       if (!list1)
+               return -ENOENT;
+
+       list2 = of_get_property(np2, list_name, &size2);
+       if (!list2)
+               return -ENOENT;
+
+       if (size1 != size2)
+               return 0;
+
+       list1_end = list1 + size1 / sizeof(*list1);
+
+       /* Loop over the phandles */
+       while (list1 < list1_end) {
+               phandle1 = be32_to_cpup(list1++);
+               phandle2 = be32_to_cpup(list2++);
+
+               if (phandle1 != phandle2)
+                       return 0;
+       }
+
+       return 1;
+}

@Stephen: I have updated my tree with this change, in case you wanna
try.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rob Herring June 30, 2014, 6:33 p.m. UTC | #11
On Mon, Jun 30, 2014 at 2:57 AM, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> On 27 June 2014 07:45, Viresh Kumar <viresh.kumar@linaro.org> wrote:
>> On 27 June 2014 07:23, Mike Turquette <mturquette@linaro.org> wrote:
>>>> but it isn't future-proof if/when the clock framework starts returning
>>>> dynamically allocated clock pointers for each clk_get() invocation.
>>>> Maybe we need a function in the common clock framework that tells us if
>>>> the clocks are the same either via DT or by taking two clock pointers?
>>>
>>> I looked through the patch briefly and did not see why we would need to
>>> do this. Any hint?
>>
>> We want to know which CPUs are sharing clock line, so that we can
>> fill affected-cpus field of cpufreq core.
>
> What about comparing "clocks" property in cpu DT nodes?

What if a different clock is selected for some reason. I think a clock
api function would be better.

That being said, I don't really have any issue with such a function.
Some comments on the implementation.

>
> @Rob/Grant: I tried looking for an existing routine to do that, but couldn't
> find it. And so wrote one.
>
> I am not good at DT stuff and so I do hope there would be few correction
> required here. Let me know if this can be added to drivers/of/base.c :
>
> +/**
> + * of_property_match - Match property in two nodes
> + * @np1, np2: Nodes to match
> + * @list_name: property to match
> + *
> + * Returns 1 on match, 0 on no match, and error for missing property.
> + */
> +static int of_property_match(const struct device_node *np1,
> +                             const struct device_node *np2,
> +                             const char *list_name)
> +{
> +       const __be32 *list1, *list2, *list1_end;

s/list/prop/

Everywhere.

> +       int size1, size2;
> +       phandle phandle1, phandle2;
> +
> +       /* Retrieve the list property */
> +       list1 = of_get_property(np1, list_name, &size1);
> +       if (!list1)
> +               return -ENOENT;
> +
> +       list2 = of_get_property(np2, list_name, &size2);
> +       if (!list2)
> +               return -ENOENT;
> +
> +       if (size1 != size2)
> +               return 0;
> +
> +       list1_end = list1 + size1 / sizeof(*list1);
> +
> +       /* Loop over the phandles */
> +       while (list1 < list1_end) {
> +               phandle1 = be32_to_cpup(list1++);
> +               phandle2 = be32_to_cpup(list2++);
> +
> +               if (phandle1 != phandle2)
> +                       return 0;
> +       }

You can just do a memcmp here.

This is wrong anyway because you don't know #clock-cells size.

Rob
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar July 1, 2014, 11:14 a.m. UTC | #12
On 1 July 2014 00:03, Rob Herring <rob.herring@linaro.org> wrote:
>> What about comparing "clocks" property in cpu DT nodes?
>
> What if a different clock is selected for some reason.

I don't know why that will happen for CPUs sharing clock line.

> I think a clock api function would be better.

@Mike: What do you think? I think we can get a clock API for
this.

> That being said, I don't really have any issue with such a function.
> Some comments on the implementation.

>> +static int of_property_match(const struct device_node *np1,
>> +                             const struct device_node *np2,
>> +                             const char *list_name)
>> +{
>> +       const __be32 *list1, *list2, *list1_end;
>
> s/list/prop/
>
> Everywhere.

Ok.

>> +       int size1, size2;
>> +       phandle phandle1, phandle2;
>> +
>> +       /* Retrieve the list property */
>> +       list1 = of_get_property(np1, list_name, &size1);
>> +       if (!list1)
>> +               return -ENOENT;
>> +
>> +       list2 = of_get_property(np2, list_name, &size2);
>> +       if (!list2)
>> +               return -ENOENT;
>> +
>> +       if (size1 != size2)
>> +               return 0;
>> +
>> +       list1_end = list1 + size1 / sizeof(*list1);
>> +
>> +       /* Loop over the phandles */
>> +       while (list1 < list1_end) {
>> +               phandle1 = be32_to_cpup(list1++);
>> +               phandle2 = be32_to_cpup(list2++);
>> +
>> +               if (phandle1 != phandle2)
>> +                       return 0;
>> +       }
>
> You can just do a memcmp here.

Yeah, that would be much better.

> This is wrong anyway because you don't know #clock-cells size.

I was actually comparing all the clock-cells, whatever there number
is to make sure "clocks" properties are exactly same. Anyway
memcmp will still guarantee that.

Thanks for your review.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Mike Turquette July 1, 2014, 10 p.m. UTC | #13
Quoting Viresh Kumar (2014-07-01 04:14:04)
> On 1 July 2014 00:03, Rob Herring <rob.herring@linaro.org> wrote:
> >> What about comparing "clocks" property in cpu DT nodes?
> >
> > What if a different clock is selected for some reason.
> 
> I don't know why that will happen for CPUs sharing clock line.
> 
> > I think a clock api function would be better.
> 
> @Mike: What do you think? I think we can get a clock API for
> this.

I can't help but think this is a pretty ugly solution. Why not specify
the nature of the cpu clock(s) in DT directly? There was a thread
already that discussed adding such a property to the CPU DT binding but
it seems to have gone cold[1]. Furthermore my mailer sucks and I see now
that my response to that thread never hit the list due to mangled
headers. Here is a copy/paste of my response to the aforementioned
thread:

"""
I'll join the bikeshedding.

The hardware property that matters for cpufreq-cpu0 users is that a
multi-core CPU uses a single clock input to scale frequency across all
of the cores in that cluster. So an accurate description is:

scaling-method = "clock-ganged"; //hardware-people-speak

Or,

scaling-method = "clock-shared"; //software-people-speak

Versus independently scalable CPUs in an SMP cluster:

scaling-method = "independent"; //x86, Krait, etc.

Or perhaps instead of "independent" at the parent "cpus" node we would
put the following in each cpu@N node:

scaling-method = "clock";

Or "psci" or "acpi" or whatever.

Thought exercise: for Hyperthreaded(tm) CPUs with 2 virtual cores for
every hard CPU (and multiple CPUs in a cluster):

scaling-method = "paired";

Or more simply, "hyperthreaded".
"""

Regards,
Mike

[1] http://www.spinics.net/lists/cpufreq/msg10034.html

> 
> > That being said, I don't really have any issue with such a function.
> > Some comments on the implementation.
> 
> >> +static int of_property_match(const struct device_node *np1,
> >> +                             const struct device_node *np2,
> >> +                             const char *list_name)
> >> +{
> >> +       const __be32 *list1, *list2, *list1_end;
> >
> > s/list/prop/
> >
> > Everywhere.
> 
> Ok.
> 
> >> +       int size1, size2;
> >> +       phandle phandle1, phandle2;
> >> +
> >> +       /* Retrieve the list property */
> >> +       list1 = of_get_property(np1, list_name, &size1);
> >> +       if (!list1)
> >> +               return -ENOENT;
> >> +
> >> +       list2 = of_get_property(np2, list_name, &size2);
> >> +       if (!list2)
> >> +               return -ENOENT;
> >> +
> >> +       if (size1 != size2)
> >> +               return 0;
> >> +
> >> +       list1_end = list1 + size1 / sizeof(*list1);
> >> +
> >> +       /* Loop over the phandles */
> >> +       while (list1 < list1_end) {
> >> +               phandle1 = be32_to_cpup(list1++);
> >> +               phandle2 = be32_to_cpup(list2++);
> >> +
> >> +               if (phandle1 != phandle2)
> >> +                       return 0;
> >> +       }
> >
> > You can just do a memcmp here.
> 
> Yeah, that would be much better.
> 
> > This is wrong anyway because you don't know #clock-cells size.
> 
> I was actually comparing all the clock-cells, whatever there number
> is to make sure "clocks" properties are exactly same. Anyway
> memcmp will still guarantee that.
> 
> Thanks for your review.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Viresh Kumar July 2, 2014, 3:32 a.m. UTC | #14
On 2 July 2014 03:30, Mike Turquette <mturquette@linaro.org> wrote:
> I can't help but think this is a pretty ugly solution. Why not specify

Thanks :)

> the nature of the cpu clock(s) in DT directly? There was a thread
> already that discussed adding such a property to the CPU DT binding but
> it seems to have gone cold[1]. Furthermore my mailer sucks and I see now
> that my response to that thread never hit the list due to mangled
> headers. Here is a copy/paste of my response to the aforementioned
> thread:

Atleast I received it.

Yes, I do agree that we need to get this from the DT in more elegant
way but it is going to take some time in doing that, and probably some
people are working on it as that might be used in scheduler-cpufreq
coordination as well..

For now we can go ahead and make it workable, even if it isn't that
elegant and update it later on.

Thanks for your inputs.
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-cpu0.txt b/Documentation/devicetree/bindings/cpufreq/cpufreq-cpu0.txt
index f055515..fa8861e 100644
--- a/Documentation/devicetree/bindings/cpufreq/cpufreq-cpu0.txt
+++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-cpu0.txt
@@ -1,15 +1,17 @@ 
 Generic CPU0 cpufreq driver
 
-It is a generic cpufreq driver for CPU0 frequency management.  It
+It is a generic cpufreq driver for CPU frequency management.  It
 supports both uniprocessor (UP) and symmetric multiprocessor (SMP)
-systems which share clock and voltage across all CPUs.
+systems which share clock and voltage across all CPUs OR have separate
+clock and voltage for all CPUs.
 
 Both required and optional properties listed below must be defined
 under node /cpus/cpu@0.
 
 Required properties:
 - operating-points: Refer to Documentation/devicetree/bindings/power/opp.txt
-  for details
+  for details. This should either be present only in cpu@0, if CPUs share clock
+  line OR should be present in all cpu nodes, if they have separate clock lines.
 
 Optional properties:
 - clock-latency: Specify the possible maximum transition latency for clock,
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index ffe350f..0e8d983 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -190,9 +190,10 @@  config GENERIC_CPUFREQ_CPU0
 	depends on !CPU_THERMAL || THERMAL
 	select PM_OPP
 	help
-	  This adds a generic cpufreq driver for CPU0 frequency management.
+	  This adds a generic cpufreq driver for CPU frequency management.
 	  It supports both uniprocessor (UP) and symmetric multiprocessor (SMP)
-	  systems which share clock and voltage across all CPUs.
+	  systems which share clock and voltage across all CPUs OR have separate
+	  clock and voltage for all CPUs.
 
 	  If in doubt, say N.
 
diff --git a/drivers/cpufreq/cpufreq-cpu0.c b/drivers/cpufreq/cpufreq-cpu0.c
index ee1ae30..d29f6a0 100644
--- a/drivers/cpufreq/cpufreq-cpu0.c
+++ b/drivers/cpufreq/cpufreq-cpu0.c
@@ -25,34 +25,43 @@ 
 #include <linux/slab.h>
 #include <linux/thermal.h>
 
-static unsigned int transition_latency;
-static unsigned int voltage_tolerance; /* in percentage */
+struct private_data {
+	struct device *cpu_dev;
+	struct regulator *cpu_reg;
+	struct thermal_cooling_device *cdev;
+	unsigned int voltage_tolerance; /* in percentage */
+};
 
-static struct device *cpu_dev;
-static struct clk *cpu_clk;
-static struct regulator *cpu_reg;
-static struct cpufreq_frequency_table *freq_table;
-static struct thermal_cooling_device *cdev;
+/*
+ * Currently this driver supports only two types of platforms:
+ * - All CPUs sharing same clock line, clock_per_cpu == false
+ * - All CPUs having separate clock lines, clock_per_cpu == true
+ *
+ * TODO: Scan affected/related CPUs from proper DT bindings
+ */
+bool clock_per_cpu = false;
 
 static int cpu0_set_target(struct cpufreq_policy *policy, unsigned int index)
 {
 	struct dev_pm_opp *opp;
+	struct private_data *priv = policy->driver_data;
 	unsigned long volt = 0, volt_old = 0, tol = 0;
 	unsigned int old_freq, new_freq;
 	long freq_Hz, freq_exact;
 	int ret;
 
-	freq_Hz = clk_round_rate(cpu_clk, freq_table[index].frequency * 1000);
+	freq_Hz = clk_round_rate(policy->clk,
+				 policy->freq_table[index].frequency * 1000);
 	if (freq_Hz <= 0)
-		freq_Hz = freq_table[index].frequency * 1000;
+		freq_Hz = policy->freq_table[index].frequency * 1000;
 
 	freq_exact = freq_Hz;
 	new_freq = freq_Hz / 1000;
-	old_freq = clk_get_rate(cpu_clk) / 1000;
+	old_freq = clk_get_rate(policy->clk) / 1000;
 
-	if (!IS_ERR(cpu_reg)) {
+	if (!IS_ERR(priv->cpu_reg)) {
 		rcu_read_lock();
-		opp = dev_pm_opp_find_freq_ceil(cpu_dev, &freq_Hz);
+		opp = dev_pm_opp_find_freq_ceil(priv->cpu_dev, &freq_Hz);
 		if (IS_ERR(opp)) {
 			rcu_read_unlock();
 			pr_err("failed to find OPP for %ld\n", freq_Hz);
@@ -60,8 +69,8 @@  static int cpu0_set_target(struct cpufreq_policy *policy, unsigned int index)
 		}
 		volt = dev_pm_opp_get_voltage(opp);
 		rcu_read_unlock();
-		tol = volt * voltage_tolerance / 100;
-		volt_old = regulator_get_voltage(cpu_reg);
+		tol = volt * priv->voltage_tolerance / 100;
+		volt_old = regulator_get_voltage(priv->cpu_reg);
 	}
 
 	pr_debug("%u MHz, %ld mV --> %u MHz, %ld mV\n",
@@ -69,28 +78,28 @@  static int cpu0_set_target(struct cpufreq_policy *policy, unsigned int index)
 		 new_freq / 1000, volt ? volt / 1000 : -1);
 
 	/* scaling up?  scale voltage before frequency */
-	if (!IS_ERR(cpu_reg) && new_freq > old_freq) {
-		ret = regulator_set_voltage_tol(cpu_reg, volt, tol);
+	if (!IS_ERR(priv->cpu_reg) && new_freq > old_freq) {
+		ret = regulator_set_voltage_tol(priv->cpu_reg, volt, tol);
 		if (ret) {
 			pr_err("failed to scale voltage up: %d\n", ret);
 			return ret;
 		}
 	}
 
-	ret = clk_set_rate(cpu_clk, freq_exact);
+	ret = clk_set_rate(policy->clk, freq_exact);
 	if (ret) {
 		pr_err("failed to set clock rate: %d\n", ret);
-		if (!IS_ERR(cpu_reg))
-			regulator_set_voltage_tol(cpu_reg, volt_old, tol);
+		if (!IS_ERR(priv->cpu_reg))
+			regulator_set_voltage_tol(priv->cpu_reg, volt_old, tol);
 		return ret;
 	}
 
 	/* scaling down?  scale voltage after frequency */
-	if (!IS_ERR(cpu_reg) && new_freq < old_freq) {
-		ret = regulator_set_voltage_tol(cpu_reg, volt, tol);
+	if (!IS_ERR(priv->cpu_reg) && new_freq < old_freq) {
+		ret = regulator_set_voltage_tol(priv->cpu_reg, volt, tol);
 		if (ret) {
 			pr_err("failed to scale voltage down: %d\n", ret);
-			clk_set_rate(cpu_clk, old_freq * 1000);
+			clk_set_rate(policy->clk, old_freq * 1000);
 		}
 	}
 
@@ -99,77 +108,70 @@  static int cpu0_set_target(struct cpufreq_policy *policy, unsigned int index)
 
 static int cpu0_cpufreq_init(struct cpufreq_policy *policy)
 {
-	policy->clk = cpu_clk;
-	return cpufreq_generic_init(policy, freq_table, transition_latency);
-}
-
-static struct cpufreq_driver cpu0_cpufreq_driver = {
-	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
-	.verify = cpufreq_generic_frequency_table_verify,
-	.target_index = cpu0_set_target,
-	.get = cpufreq_generic_get,
-	.init = cpu0_cpufreq_init,
-	.name = "generic_cpu0",
-	.attr = cpufreq_generic_attr,
-};
-
-static int cpu0_cpufreq_probe(struct platform_device *pdev)
-{
+	struct cpufreq_frequency_table *freq_table;
+	struct private_data *priv;
 	struct device_node *np;
-	int ret;
+	char name[] = "cpuX";
+	int ret, cpu = policy->cpu;
 
-	cpu_dev = get_cpu_device(0);
-	if (!cpu_dev) {
-		pr_err("failed to get cpu0 device\n");
-		return -ENODEV;
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		pr_err("%s: Memory allocation failed\n", __func__);
+		return -ENOMEM;
 	}
 
-	np = of_node_get(cpu_dev->of_node);
-	if (!np) {
-		pr_err("failed to find cpu0 node\n");
-		return -ENOENT;
-	}
+	/* Below calls can't fail from here */
+	priv->cpu_dev = get_cpu_device(cpu);
+	np = of_node_get(priv->cpu_dev->of_node);
 
-	cpu_reg = regulator_get_optional(cpu_dev, "cpu0");
-	if (IS_ERR(cpu_reg)) {
-		/*
-		 * If cpu0 regulator supply node is present, but regulator is
-		 * not yet registered, we should try defering probe.
-		 */
-		if (PTR_ERR(cpu_reg) == -EPROBE_DEFER) {
-			dev_err(cpu_dev, "cpu0 regulator not ready, retry\n");
-			ret = -EPROBE_DEFER;
-			goto out_put_node;
-		}
-		pr_warn("failed to get cpu0 regulator: %ld\n",
-			PTR_ERR(cpu_reg));
+	policy->clk = clk_get(priv->cpu_dev, NULL);
+	if (IS_ERR(policy->clk)) {
+		ret = PTR_ERR(policy->clk);
+		pr_err("%s: failed to get cpu%d clock: %d\n", __func__, cpu,
+		       ret);
+		goto out_put_node;
 	}
 
-	cpu_clk = clk_get(cpu_dev, NULL);
-	if (IS_ERR(cpu_clk)) {
-		ret = PTR_ERR(cpu_clk);
-		pr_err("failed to get cpu0 clock: %d\n", ret);
-		goto out_put_reg;
+	ret = of_init_opp_table(priv->cpu_dev);
+	if (ret) {
+		pr_err("%s: failed to init OPP table for cpu%d: %d\n", __func__,
+		       cpu, ret);
+		goto out_put_clk;
 	}
 
-	ret = of_init_opp_table(cpu_dev);
+	ret = dev_pm_opp_init_cpufreq_table(priv->cpu_dev, &freq_table);
 	if (ret) {
-		pr_err("failed to init OPP table: %d\n", ret);
+		pr_err("%s: failed to init cpufreq table for cpu%d: %d\n",
+		       __func__, cpu, ret);
 		goto out_put_clk;
 	}
 
-	ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
+	ret = cpufreq_table_validate_and_show(policy, freq_table);
 	if (ret) {
-		pr_err("failed to init cpufreq table: %d\n", ret);
-		goto out_put_clk;
+		pr_err("%s: invalid frequency table for cpu%d: %d\n", __func__,
+		       cpu, ret);
+		goto out_put_reg;
 	}
 
-	of_property_read_u32(np, "voltage-tolerance", &voltage_tolerance);
+	/* Set mask of CPUs sharing clock line with policy->cpu */
+	if (!clock_per_cpu)
+		cpumask_setall(policy->cpus);
 
-	if (of_property_read_u32(np, "clock-latency", &transition_latency))
-		transition_latency = CPUFREQ_ETERNAL;
+	of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance);
 
-	if (!IS_ERR(cpu_reg)) {
+	if (of_property_read_u32(np, "clock-latency",
+				 &policy->cpuinfo.transition_latency))
+		policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+
+	/* NOTE: Supports only 10 CPUs: 0-9 */
+	WARN_ONCE(cpu > 9, "cpu: %d", cpu);
+
+	name[3] = '0' + cpu;
+	priv->cpu_reg = regulator_get_optional(priv->cpu_dev, name);
+	if (IS_ERR(priv->cpu_reg)) {
+		pr_warn("%s: failed to get cpu%d's regulator: %ld\n", __func__,
+			cpu, PTR_ERR(priv->cpu_reg));
+	} else {
 		struct dev_pm_opp *opp;
 		unsigned long min_uV, max_uV;
 		int i;
@@ -182,22 +184,16 @@  static int cpu0_cpufreq_probe(struct platform_device *pdev)
 		for (i = 0; freq_table[i].frequency != CPUFREQ_TABLE_END; i++)
 			;
 		rcu_read_lock();
-		opp = dev_pm_opp_find_freq_exact(cpu_dev,
+		opp = dev_pm_opp_find_freq_exact(priv->cpu_dev,
 				freq_table[0].frequency * 1000, true);
 		min_uV = dev_pm_opp_get_voltage(opp);
-		opp = dev_pm_opp_find_freq_exact(cpu_dev,
+		opp = dev_pm_opp_find_freq_exact(priv->cpu_dev,
 				freq_table[i-1].frequency * 1000, true);
 		max_uV = dev_pm_opp_get_voltage(opp);
 		rcu_read_unlock();
-		ret = regulator_set_voltage_time(cpu_reg, min_uV, max_uV);
+		ret = regulator_set_voltage_time(priv->cpu_reg, min_uV, max_uV);
 		if (ret > 0)
-			transition_latency += ret * 1000;
-	}
-
-	ret = cpufreq_register_driver(&cpu0_cpufreq_driver);
-	if (ret) {
-		pr_err("failed register driver: %d\n", ret);
-		goto out_free_table;
+			policy->cpuinfo.transition_latency += ret * 1000;
 	}
 
 	/*
@@ -205,34 +201,122 @@  static int cpu0_cpufreq_probe(struct platform_device *pdev)
 	 * thermal DT code takes care of matching them.
 	 */
 	if (of_find_property(np, "#cooling-cells", NULL)) {
-		cdev = of_cpufreq_cooling_register(np, cpu_present_mask);
-		if (IS_ERR(cdev))
+		priv->cdev = of_cpufreq_cooling_register(np, policy->cpus);
+		if (IS_ERR(priv->cdev))
 			pr_err("running cpufreq without cooling device: %ld\n",
-			       PTR_ERR(cdev));
+			       PTR_ERR(priv->cdev));
 	}
 
+	policy->driver_data = priv;
 	of_node_put(np);
 	return 0;
 
-out_free_table:
-	dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table);
-out_put_clk:
-	if (!IS_ERR(cpu_clk))
-		clk_put(cpu_clk);
 out_put_reg:
-	if (!IS_ERR(cpu_reg))
-		regulator_put(cpu_reg);
+	if (!IS_ERR(priv->cpu_reg))
+		regulator_put(priv->cpu_reg);
+	dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &freq_table);
+out_put_clk:
+	clk_put(policy->clk);
 out_put_node:
 	of_node_put(np);
+	kfree(priv);
+	return ret;
+}
+
+static int cpu0_cpufreq_exit(struct cpufreq_policy *policy)
+{
+	struct private_data *priv = policy->driver_data;
+
+	cpufreq_cooling_unregister(priv->cdev);
+	if (!IS_ERR(priv->cpu_reg))
+		regulator_put(priv->cpu_reg);
+	dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
+	clk_put(policy->clk);
+	kfree(priv);
+
+	return 0;
+}
+
+static struct cpufreq_driver cpu0_cpufreq_driver = {
+	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.verify = cpufreq_generic_frequency_table_verify,
+	.target_index = cpu0_set_target,
+	.get = cpufreq_generic_get,
+	.init = cpu0_cpufreq_init,
+	.exit = cpu0_cpufreq_exit,
+	.name = "generic_cpu0",
+	.attr = cpufreq_generic_attr,
+};
+
+static int cpu0_cpufreq_probe(struct platform_device *pdev)
+{
+	const struct property *prop;
+	struct device *cpu_dev;
+	struct regulator *cpu_reg;
+	int ret, cpu, count = 0;
+
+	/* Optimization for regulator's -EPROBE_DEFER case */
+	if (unlikely(clock_per_cpu))
+		goto scan_regulator;
+
+	/*
+	 * Scan all CPU nodes to set clock_per_cpu
+	 *
+	 * FIXME: Scan affected-CPUs from DT and remove this loop.
+	 */
+	for_each_possible_cpu(cpu) {
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev) {
+			pr_err("%s: failed to get cpu%d device\n", __func__,
+			       cpu);
+			return -ENODEV;
+		}
+
+		/* Count how many CPU nodes have defined operating-points */
+		prop = of_find_property(cpu_dev->of_node, "operating-points",
+					NULL);
+		if (prop) {
+			count++;
+		} else if (cpu == 0) {
+			pr_err("%s: operating-points missing: cpu%d\n",
+			       __func__, cpu);
+			return -ENOENT;
+		}
+	}
+
+	/* Either one or all CPUs must have operating-points property */
+	if (count == cpu + 1) {
+		clock_per_cpu = true;
+	} else if (count != 1) {
+		pr_err("%s: operating-points missing for few CPUs\n", __func__);
+		return -ENOENT;
+	}
+
+scan_regulator:
+	/* Check only CPU0's regulator for EPROBE_DEFER */
+	cpu_dev = get_cpu_device(0); /* Can't fail now */
+	cpu_reg = regulator_get_optional(cpu_dev, "cpu0");
+	if (!IS_ERR(cpu_reg)) {
+		regulator_put(cpu_reg);
+	} else if (PTR_ERR(cpu_reg) == -EPROBE_DEFER) {
+		/*
+		 * If cpu0 regulator supply node is present, but regulator is
+		 * not yet registered, we should try defering probe.
+		 */
+		dev_err(cpu_dev, "cpu0 regulator not ready, retry\n");
+		return -EPROBE_DEFER;
+	}
+
+	ret = cpufreq_register_driver(&cpu0_cpufreq_driver);
+	if (ret)
+		pr_err("failed register driver: %d\n", ret);
+
 	return ret;
 }
 
 static int cpu0_cpufreq_remove(struct platform_device *pdev)
 {
-	cpufreq_cooling_unregister(cdev);
 	cpufreq_unregister_driver(&cpu0_cpufreq_driver);
-	dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table);
-
 	return 0;
 }