diff mbox series

[RFC,v2,4/6] PM: opp: allow control of multiple clocks

Message ID 20220411154347.491396-5-krzysztof.kozlowski@linaro.org
State New
Headers show
Series ufs: set power domain performance state when scaling gears | expand

Commit Message

Krzysztof Kozlowski April 11, 2022, 3:43 p.m. UTC
Devices might need to control several clocks when scaling the frequency
and voltage.  Example is the Universal Flash Storage (UFS) which scales
several independent clocks with change of performance levels.

Add parsing of multiple clocks and clock names and scale all of them,
when needed.  If only one clock is provided, the code should behave the
same as before.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 drivers/opp/core.c     | 205 ++++++++++++++++++++++++++++++++---------
 drivers/opp/of.c       |  48 ++++++++++
 drivers/opp/opp.h      |   9 +-
 include/linux/pm_opp.h |  23 +++++
 4 files changed, 242 insertions(+), 43 deletions(-)

Comments

Bjorn Andersson April 12, 2022, 5:15 p.m. UTC | #1
On Mon 11 Apr 08:43 PDT 2022, Krzysztof Kozlowski wrote:

> Devices might need to control several clocks when scaling the frequency
> and voltage.  Example is the Universal Flash Storage (UFS) which scales
> several independent clocks with change of performance levels.
> 
> Add parsing of multiple clocks and clock names and scale all of them,
> when needed.  If only one clock is provided, the code should behave the
> same as before.
> 
> Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
> ---
>  drivers/opp/core.c     | 205 ++++++++++++++++++++++++++++++++---------
>  drivers/opp/of.c       |  48 ++++++++++
>  drivers/opp/opp.h      |   9 +-
>  include/linux/pm_opp.h |  23 +++++
>  4 files changed, 242 insertions(+), 43 deletions(-)
> 
> diff --git a/drivers/opp/core.c b/drivers/opp/core.c
[..]
> @@ -1295,21 +1344,32 @@ static struct opp_table *_update_opp_table_clk(struct device *dev,
>  	 * Return early if we don't need to get clk or we have already tried it
>  	 * earlier.
>  	 */
> -	if (!getclk || IS_ERR(opp_table) || opp_table->clk)
> +	if (!getclk || IS_ERR(opp_table) || opp_table->clks)
>  		return opp_table;
>  
> +	opp_table->clks = kmalloc_array(1, sizeof(*opp_table->clks),
> +					GFP_KERNEL);

This seems to be 81 chars long, perhaps worth not line breaking?

> +	if (!opp_table->clks)
> +		return ERR_PTR(-ENOMEM);
> +
>  	/* Find clk for the device */
> -	opp_table->clk = clk_get(dev, NULL);
> +	opp_table->clks[0] = clk_get(dev, NULL);
>  
> -	ret = PTR_ERR_OR_ZERO(opp_table->clk);
> -	if (!ret)
> +	ret = PTR_ERR_OR_ZERO(opp_table->clks[0]);
> +	if (!ret) {
> +		opp_table->clk_count = 1;
>  		return opp_table;
> +	}
[..]
> +struct opp_table *dev_pm_opp_set_clknames(struct device *dev,
> +					  const char * const names[],
> +					  unsigned int count)
>  {
>  	struct opp_table *opp_table;
> -	int ret;
> +	struct clk *clk;
> +	int ret, i;
>  
>  	opp_table = _add_opp_table(dev, false);
>  	if (IS_ERR(opp_table))
> @@ -2159,70 +2259,92 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
>  	}
>  
>  	/* clk shouldn't be initialized at this point */
> -	if (WARN_ON(opp_table->clk)) {
> +	if (WARN_ON(opp_table->clks)) {
>  		ret = -EBUSY;
>  		goto err;
>  	}
>  
> -	/* Find clk for the device */
> -	opp_table->clk = clk_get(dev, name);
> -	if (IS_ERR(opp_table->clk)) {
> -		ret = dev_err_probe(dev, PTR_ERR(opp_table->clk),
> -				    "%s: Couldn't find clock\n", __func__);
> +	opp_table->clks = kmalloc_array(count, sizeof(*opp_table->clks),
> +					GFP_KERNEL);
> +	if (!opp_table->clks) {
> +		ret = -ENOMEM;
>  		goto err;
>  	}
>  
> +	for (i = 0; i < count; i++) {
> +		clk = clk_get(dev, names[i]);
> +		if (IS_ERR(clk)) {
> +			ret =  dev_err_probe(dev, PTR_ERR(clk),
> +					     "%s: Couldn't find clock %s\n",
> +					     __func__, names[i]);
> +			goto free_clks;
> +		}
> +
> +		opp_table->clks[i] = clk;
> +	}

Wouldn't it be convenient to make clks a struct clk_bulk_data array
and use clk_bulk_get()/clk_bulk_put() instead?

> +
> +	opp_table->clk_count = count;
> +
>  	return opp_table;
>  
> +free_clks:
> +	while (i != 0)
> +		clk_put(opp_table->clks[--i]);
> +
> +	kfree(opp_table->clks);
> +	opp_table->clks = NULL;
> +	opp_table->clk_count = -1;
>  err:
>  	dev_pm_opp_put_opp_table(opp_table);
>  
>  	return ERR_PTR(ret);
>  }
> -EXPORT_SYMBOL_GPL(dev_pm_opp_set_clkname);
> +EXPORT_SYMBOL_GPL(dev_pm_opp_set_clknames);
[..]
> +static int _read_clocks(struct dev_pm_opp *opp, struct opp_table *opp_table,
> +			struct device_node *np)
> +{
> +	int count, ret;
> +	u64 *freq;
> +
> +	count = of_property_count_u64_elems(np, "opp-hz");
> +	if (count < 0) {
> +		pr_err("%s: Invalid %s property (%d)\n",
> +			__func__, of_node_full_name(np), count);

Wouldn't %pOF be convenient to use here, seems like it becomes short
enough that you don't have to wrap this line then.

> +		return count;
> +	}
> +
> +	if (count != opp_table->clk_count) {
> +		pr_err("%s: number of rates %d does not match number of clocks %d in %s\n",
> +		       __func__, count, opp_table->clk_count,
> +		       of_node_full_name(np));
> +		return -EINVAL;
> +	}
> +
> +	freq = kmalloc_array(count, sizeof(*freq), GFP_KERNEL);
> +	if (!freq)
> +		return -ENOMEM;
> +
> +	ret = of_property_read_u64_array(np, "opp-hz", freq, count);
> +	if (ret) {
> +		pr_err("%s: error parsing %s: %d\n", __func__,
> +		       of_node_full_name(np), ret);
> +		ret = -EINVAL;
> +		goto free_freq;
> +	}

Regards,
Bjorn
Krzysztof Kozlowski April 13, 2022, 9:07 a.m. UTC | #2
On 12/04/2022 19:15, Bjorn Andersson wrote:
>>  
>> +	opp_table->clks = kmalloc_array(1, sizeof(*opp_table->clks),
>> +					GFP_KERNEL);
> 
> This seems to be 81 chars long, perhaps worth not line breaking?

I doubt that it will increase the readability:

	opp_table->clks = kmalloc_array(1,
					sizeof(*opp_table->clks),
					GFP_KERNEL);

80-character is not anymore that strict hard limit and in such case
using 1-2 characters longer improves the code.

> 
>> +	if (!opp_table->clks)
>> +		return ERR_PTR(-ENOMEM);
>> +
>>  	/* Find clk for the device */
>> -	opp_table->clk = clk_get(dev, NULL);
>> +	opp_table->clks[0] = clk_get(dev, NULL);
>>  
>> -	ret = PTR_ERR_OR_ZERO(opp_table->clk);
>> -	if (!ret)
>> +	ret = PTR_ERR_OR_ZERO(opp_table->clks[0]);
>> +	if (!ret) {
>> +		opp_table->clk_count = 1;
>>  		return opp_table;
>> +	}
> [..]
>> +struct opp_table *dev_pm_opp_set_clknames(struct device *dev,
>> +					  const char * const names[],
>> +					  unsigned int count)
>>  {
>>  	struct opp_table *opp_table;
>> -	int ret;
>> +	struct clk *clk;
>> +	int ret, i;
>>  
>>  	opp_table = _add_opp_table(dev, false);
>>  	if (IS_ERR(opp_table))
>> @@ -2159,70 +2259,92 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
>>  	}
>>  
>>  	/* clk shouldn't be initialized at this point */
>> -	if (WARN_ON(opp_table->clk)) {
>> +	if (WARN_ON(opp_table->clks)) {
>>  		ret = -EBUSY;
>>  		goto err;
>>  	}
>>  
>> -	/* Find clk for the device */
>> -	opp_table->clk = clk_get(dev, name);
>> -	if (IS_ERR(opp_table->clk)) {
>> -		ret = dev_err_probe(dev, PTR_ERR(opp_table->clk),
>> -				    "%s: Couldn't find clock\n", __func__);
>> +	opp_table->clks = kmalloc_array(count, sizeof(*opp_table->clks),
>> +					GFP_KERNEL);
>> +	if (!opp_table->clks) {
>> +		ret = -ENOMEM;
>>  		goto err;
>>  	}
>>  
>> +	for (i = 0; i < count; i++) {
>> +		clk = clk_get(dev, names[i]);
>> +		if (IS_ERR(clk)) {
>> +			ret =  dev_err_probe(dev, PTR_ERR(clk),
>> +					     "%s: Couldn't find clock %s\n",
>> +					     __func__, names[i]);
>> +			goto free_clks;
>> +		}
>> +
>> +		opp_table->clks[i] = clk;
>> +	}
> 
> Wouldn't it be convenient to make clks a struct clk_bulk_data array
> and use clk_bulk_get()/clk_bulk_put() instead?

I was thinking about this but clk_bulk_get() requires struct
clk_bulk_data, so the code in "get" is not actually smaller if function
receives array of clock names.

OTOH, usage of clk_bulk_get() would reduce code in: _put_clocks(). Rest
of the code would be more-or-less the same, including all corner cases
when clocks are missing.

> 
>> +
>> +	opp_table->clk_count = count;
>> +
>>  	return opp_table;
>>  
>> +free_clks:
>> +	while (i != 0)
>> +		clk_put(opp_table->clks[--i]);
>> +
>> +	kfree(opp_table->clks);
>> +	opp_table->clks = NULL;
>> +	opp_table->clk_count = -1;
>>  err:
>>  	dev_pm_opp_put_opp_table(opp_table);
>>  
>>  	return ERR_PTR(ret);
>>  }
>> -EXPORT_SYMBOL_GPL(dev_pm_opp_set_clkname);
>> +EXPORT_SYMBOL_GPL(dev_pm_opp_set_clknames);
> [..]
>> +static int _read_clocks(struct dev_pm_opp *opp, struct opp_table *opp_table,
>> +			struct device_node *np)
>> +{
>> +	int count, ret;
>> +	u64 *freq;
>> +
>> +	count = of_property_count_u64_elems(np, "opp-hz");
>> +	if (count < 0) {
>> +		pr_err("%s: Invalid %s property (%d)\n",
>> +			__func__, of_node_full_name(np), count);
> 
> Wouldn't %pOF be convenient to use here, seems like it becomes short
> enough that you don't have to wrap this line then.

Yes, I forgot about %pOF.

> 
>> +		return count;
>> +	}
>> +
>> +	if (count != opp_table->clk_count) {
>> +		pr_err("%s: number of rates %d does not match number of clocks %d in %s\n",
>> +		       __func__, count, opp_table->clk_count,
>> +		       of_node_full_name(np));
>> +		return -EINVAL;
>> +	}
>> +
>> +	freq = kmalloc_array(count, sizeof(*freq), GFP_KERNEL);
>> +	if (!freq)
>> +		return -ENOMEM;
>> +
>> +	ret = of_property_read_u64_array(np, "opp-hz", freq, count);
>> +	if (ret) {
>> +		pr_err("%s: error parsing %s: %d\n", __func__,
>> +		       of_node_full_name(np), ret);
>> +		ret = -EINVAL;
>> +		goto free_freq;
>> +	}
> 
> Regards,
> Bjorn


Best regards,
Krzysztof
Bjorn Andersson April 20, 2022, 3:06 a.m. UTC | #3
On Wed 13 Apr 02:07 PDT 2022, Krzysztof Kozlowski wrote:

> On 12/04/2022 19:15, Bjorn Andersson wrote:
> >>  
> >> +	opp_table->clks = kmalloc_array(1, sizeof(*opp_table->clks),
> >> +					GFP_KERNEL);
> > 
> > This seems to be 81 chars long, perhaps worth not line breaking?
> 
> I doubt that it will increase the readability:
> 
> 	opp_table->clks = kmalloc_array(1,
> 					sizeof(*opp_table->clks),
> 					GFP_KERNEL);
> 
> 80-character is not anymore that strict hard limit and in such case
> using 1-2 characters longer improves the code.
> 

I was suggesting that you remove the line break

	opp_table->clks = kmalloc_array(1, sizeof(*opp_table->clks), GFP_KERNEL);

Seems to be 81 chars long, which is fine in my book with or without the
80-char guideline.

> > 
> >> +	if (!opp_table->clks)
> >> +		return ERR_PTR(-ENOMEM);
> >> +
> >>  	/* Find clk for the device */
> >> -	opp_table->clk = clk_get(dev, NULL);
> >> +	opp_table->clks[0] = clk_get(dev, NULL);
> >>  
> >> -	ret = PTR_ERR_OR_ZERO(opp_table->clk);
> >> -	if (!ret)
> >> +	ret = PTR_ERR_OR_ZERO(opp_table->clks[0]);
> >> +	if (!ret) {
> >> +		opp_table->clk_count = 1;
> >>  		return opp_table;
> >> +	}
> > [..]
> >> +struct opp_table *dev_pm_opp_set_clknames(struct device *dev,
> >> +					  const char * const names[],
> >> +					  unsigned int count)
> >>  {
> >>  	struct opp_table *opp_table;
> >> -	int ret;
> >> +	struct clk *clk;
> >> +	int ret, i;
> >>  
> >>  	opp_table = _add_opp_table(dev, false);
> >>  	if (IS_ERR(opp_table))
> >> @@ -2159,70 +2259,92 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
> >>  	}
> >>  
> >>  	/* clk shouldn't be initialized at this point */
> >> -	if (WARN_ON(opp_table->clk)) {
> >> +	if (WARN_ON(opp_table->clks)) {
> >>  		ret = -EBUSY;
> >>  		goto err;
> >>  	}
> >>  
> >> -	/* Find clk for the device */
> >> -	opp_table->clk = clk_get(dev, name);
> >> -	if (IS_ERR(opp_table->clk)) {
> >> -		ret = dev_err_probe(dev, PTR_ERR(opp_table->clk),
> >> -				    "%s: Couldn't find clock\n", __func__);
> >> +	opp_table->clks = kmalloc_array(count, sizeof(*opp_table->clks),
> >> +					GFP_KERNEL);
> >> +	if (!opp_table->clks) {
> >> +		ret = -ENOMEM;
> >>  		goto err;
> >>  	}
> >>  
> >> +	for (i = 0; i < count; i++) {
> >> +		clk = clk_get(dev, names[i]);
> >> +		if (IS_ERR(clk)) {
> >> +			ret =  dev_err_probe(dev, PTR_ERR(clk),
> >> +					     "%s: Couldn't find clock %s\n",
> >> +					     __func__, names[i]);
> >> +			goto free_clks;
> >> +		}
> >> +
> >> +		opp_table->clks[i] = clk;
> >> +	}
> > 
> > Wouldn't it be convenient to make clks a struct clk_bulk_data array
> > and use clk_bulk_get()/clk_bulk_put() instead?
> 
> I was thinking about this but clk_bulk_get() requires struct
> clk_bulk_data, so the code in "get" is not actually smaller if function
> receives array of clock names.
> 
> OTOH, usage of clk_bulk_get() would reduce code in: _put_clocks(). Rest
> of the code would be more-or-less the same, including all corner cases
> when clocks are missing.
> 

Fair enough, I think you're right that it's not going to be much
difference.

Regards,
Bjorn


> > 
> >> +
> >> +	opp_table->clk_count = count;
> >> +
> >>  	return opp_table;
> >>  
> >> +free_clks:
> >> +	while (i != 0)
> >> +		clk_put(opp_table->clks[--i]);
> >> +
> >> +	kfree(opp_table->clks);
> >> +	opp_table->clks = NULL;
> >> +	opp_table->clk_count = -1;
> >>  err:
> >>  	dev_pm_opp_put_opp_table(opp_table);
> >>  
> >>  	return ERR_PTR(ret);
> >>  }
> >> -EXPORT_SYMBOL_GPL(dev_pm_opp_set_clkname);
> >> +EXPORT_SYMBOL_GPL(dev_pm_opp_set_clknames);
> > [..]
> >> +static int _read_clocks(struct dev_pm_opp *opp, struct opp_table *opp_table,
> >> +			struct device_node *np)
> >> +{
> >> +	int count, ret;
> >> +	u64 *freq;
> >> +
> >> +	count = of_property_count_u64_elems(np, "opp-hz");
> >> +	if (count < 0) {
> >> +		pr_err("%s: Invalid %s property (%d)\n",
> >> +			__func__, of_node_full_name(np), count);
> > 
> > Wouldn't %pOF be convenient to use here, seems like it becomes short
> > enough that you don't have to wrap this line then.
> 
> Yes, I forgot about %pOF.
> 
> > 
> >> +		return count;
> >> +	}
> >> +
> >> +	if (count != opp_table->clk_count) {
> >> +		pr_err("%s: number of rates %d does not match number of clocks %d in %s\n",
> >> +		       __func__, count, opp_table->clk_count,
> >> +		       of_node_full_name(np));
> >> +		return -EINVAL;
> >> +	}
> >> +
> >> +	freq = kmalloc_array(count, sizeof(*freq), GFP_KERNEL);
> >> +	if (!freq)
> >> +		return -ENOMEM;
> >> +
> >> +	ret = of_property_read_u64_array(np, "opp-hz", freq, count);
> >> +	if (ret) {
> >> +		pr_err("%s: error parsing %s: %d\n", __func__,
> >> +		       of_node_full_name(np), ret);
> >> +		ret = -EINVAL;
> >> +		goto free_freq;
> >> +	}
> > 
> > Regards,
> > Bjorn
> 
> 
> Best regards,
> Krzysztof
Viresh Kumar April 25, 2022, 7:27 a.m. UTC | #4
On 11-04-22, 17:43, Krzysztof Kozlowski wrote:
> Devices might need to control several clocks when scaling the frequency
> and voltage.  Example is the Universal Flash Storage (UFS) which scales
> several independent clocks with change of performance levels.
> 
> Add parsing of multiple clocks and clock names

This part is fine, the OPP core should be able to do this.

> and scale all of them,

This is tricky as the OPP core can't really assume the order in which the clocks
needs to be programmed. We had the same problem with multiple regulators and the
same is left for drivers to do via the custom-api.

Either we can take the same route here, and let platforms add their own OPP
drivers which can handle this, Or hide this all behind a basic device clock's
driver, which you get with clk_get(dev, NULL).

> diff --git a/drivers/opp/core.c b/drivers/opp/core.c

> +static int _generic_set_opp_clks_only(struct device *dev,
> +				      struct opp_table *opp_table,
> +				      struct dev_pm_opp *opp)
> +{
> +	int i, ret;
> +
> +	if (!opp_table->clks)
> +		return 0;
> +
> +	for (i = 0; i < opp_table->clk_count; i++) {
> +		if (opp->rates[i]) {

This should mean that we can disable that clock and it isn't required.

> +			ret = _generic_set_opp_clk_only(dev, opp_table->clks[i],
> +							opp->rates[i]);
> +			if (ret) {
> +				dev_err(dev, "%s: failed to set clock %pC rate: %d\n",
> +					__func__, opp_table->clks[i], ret);
> +				return ret;
> +			}
> +		}
> +	}
> +
> +	return 0;
> +}

As said earlier, this won't work in the core.

> +
>  static int _generic_set_opp_regulator(struct opp_table *opp_table,
>  				      struct device *dev,
>  				      struct dev_pm_opp *opp,
> @@ -796,7 +835,7 @@ static int _generic_set_opp_regulator(struct opp_table *opp_table,
>  	}
>  
>  	/* Change frequency */
> -	ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq);
> +	ret = _generic_set_opp_clks_only(dev, opp_table, opp);
>  	if (ret)
>  		goto restore_voltage;
>  
> @@ -820,7 +859,7 @@ static int _generic_set_opp_regulator(struct opp_table *opp_table,
>  	return 0;
>  
>  restore_freq:
> -	if (_generic_set_opp_clk_only(dev, opp_table->clk, old_opp->rate))
> +	if (_generic_set_opp_clks_only(dev, opp_table, old_opp))
>  		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
>  			__func__, old_opp->rate);
>  restore_voltage:
> @@ -880,7 +919,7 @@ static int _set_opp_custom(const struct opp_table *opp_table,

This is where we can handle it in your case, if you don't want to hide it behind
a clk driver.

>  	}
>  
>  	data->regulators = opp_table->regulators;
> -	data->clk = opp_table->clk;
> +	data->clk = (opp_table->clks ? opp_table->clks[0] : NULL);
>  	data->dev = dev;
>  	data->old_opp.rate = old_opp->rate;
>  	data->new_opp.rate = freq;
> @@ -969,8 +1008,8 @@ static void _find_current_opp(struct device *dev, struct opp_table *opp_table)

I think this routine breaks as soon as we add support for multiple clocks.
clks[0]'s frequency can be same for multiple OPPs and this won't get you the
right OPP then.

>  	struct dev_pm_opp *opp = ERR_PTR(-ENODEV);
>  	unsigned long freq;
>  
> -	if (!IS_ERR(opp_table->clk)) {
> -		freq = clk_get_rate(opp_table->clk);
> +	if (opp_table->clks && !IS_ERR(opp_table->clks[0])) {
> +		freq = clk_get_rate(opp_table->clks[0]);
>  		opp = _find_freq_ceil(opp_table, &freq);
>  	}
>  
> @@ -1070,7 +1109,7 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
>  						 scaling_down);
>  	} else {
>  		/* Only frequency scaling */
> -		ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq);
> +		ret = _generic_set_opp_clks_only(dev, opp_table, opp);
>  	}
>  
>  	if (ret)
> @@ -1135,11 +1174,15 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)

This should have a BUG or WARN _ON() now if clock count is more than one. This
routine can't be called unless custom handler is available.

I skipped rest of the code as we need to work/decide on the design first.

Thanks.
Krzysztof Kozlowski April 25, 2022, 10:03 a.m. UTC | #5
On 23/04/2022 01:44, Stephen Boyd wrote:
> Quoting Krzysztof Kozlowski (2022-04-11 08:43:45)
>> Devices might need to control several clocks when scaling the frequency
>> and voltage.  Example is the Universal Flash Storage (UFS) which scales
>> several independent clocks with change of performance levels.
>>
>> Add parsing of multiple clocks and clock names and scale all of them,
>> when needed.  If only one clock is provided, the code should behave the
>> same as before.
>>
>> Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
>> ---
> 
> I vaguely recall that scaling more than one clk with an OPP table is
> confusing? I think it's because things like dev_pm_opp_find_freq_ceil()
> don't make sense when there's more than one frequency table. How is that
> handled here?

The assumption (which might need better documentation) is that first
clock frequency is the main one:
1. It is still in opp->rate field, so it is used everywhere when OPPs
are compared/checked for rates.
1. Usually is used also in opp-table nodes names.

The logical explanation is that devices has some main operating
frequency, e.g. the core clock, and this determines the performance. In
the same time such device might not be able to scale this on core clock
independently from others, this this patches.

Best regards,
Krzysztof
Krzysztof Kozlowski May 9, 2022, 10:38 a.m. UTC | #6
On 25/04/2022 09:27, Viresh Kumar wrote:
> On 11-04-22, 17:43, Krzysztof Kozlowski wrote:
>> Devices might need to control several clocks when scaling the frequency
>> and voltage.  Example is the Universal Flash Storage (UFS) which scales
>> several independent clocks with change of performance levels.
>>
>> Add parsing of multiple clocks and clock names
> 
> This part is fine, the OPP core should be able to do this.

Sorry for late reply, I think I my filters archived it or I missed it.

> 
>> and scale all of them,
> 
> This is tricky as the OPP core can't really assume the order in which the clocks
> needs to be programmed. We had the same problem with multiple regulators and the
> same is left for drivers to do via the custom-api.
> 
> Either we can take the same route here, and let platforms add their own OPP
> drivers which can handle this, Or hide this all behind a basic device clock's
> driver, which you get with clk_get(dev, NULL).

For my use case, the order of scaling will be the same as in previous
implementation, because UFS drivers just got bunch of clocks with
freq-table-hz property and were scaling in DT order.

If drivers need something better, they can always provide custom-opp
thus replacing my method. My implementation here does not restrict them.

For the drivers where the order does not matter, why forcing each driver
to provide its own implementation of clock scaling? Isn't shared generic
PM OPP code a way to remove code duplication?

> 
>> diff --git a/drivers/opp/core.c b/drivers/opp/core.c
> 
>> +static int _generic_set_opp_clks_only(struct device *dev,
>> +				      struct opp_table *opp_table,
>> +				      struct dev_pm_opp *opp)
>> +{
>> +	int i, ret;
>> +
>> +	if (!opp_table->clks)
>> +		return 0;
>> +
>> +	for (i = 0; i < opp_table->clk_count; i++) {
>> +		if (opp->rates[i]) {
> 
> This should mean that we can disable that clock and it isn't required.

No, it does not mean that. The DT might provide several clocks which
only some are important for frequency scaling. All others just need to
be enabled.

Maybe you prefer to skip getting such clocks in PM OPP?

> 
>> +			ret = _generic_set_opp_clk_only(dev, opp_table->clks[i],
>> +							opp->rates[i]);
>> +			if (ret) {
>> +				dev_err(dev, "%s: failed to set clock %pC rate: %d\n",
>> +					__func__, opp_table->clks[i], ret);
>> +				return ret;
>> +			}
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
> 
> As said earlier, this won't work in the core.
> 
>> +
>>  static int _generic_set_opp_regulator(struct opp_table *opp_table,
>>  				      struct device *dev,
>>  				      struct dev_pm_opp *opp,
>> @@ -796,7 +835,7 @@ static int _generic_set_opp_regulator(struct opp_table *opp_table,
>>  	}
>>  
>>  	/* Change frequency */
>> -	ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq);
>> +	ret = _generic_set_opp_clks_only(dev, opp_table, opp);
>>  	if (ret)
>>  		goto restore_voltage;
>>  
>> @@ -820,7 +859,7 @@ static int _generic_set_opp_regulator(struct opp_table *opp_table,
>>  	return 0;
>>  
>>  restore_freq:
>> -	if (_generic_set_opp_clk_only(dev, opp_table->clk, old_opp->rate))
>> +	if (_generic_set_opp_clks_only(dev, opp_table, old_opp))
>>  		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
>>  			__func__, old_opp->rate);
>>  restore_voltage:
>> @@ -880,7 +919,7 @@ static int _set_opp_custom(const struct opp_table *opp_table,
> 
> This is where we can handle it in your case, if you don't want to hide it behind
> a clk driver.
> 
>>  	}
>>  
>>  	data->regulators = opp_table->regulators;
>> -	data->clk = opp_table->clk;
>> +	data->clk = (opp_table->clks ? opp_table->clks[0] : NULL);
>>  	data->dev = dev;
>>  	data->old_opp.rate = old_opp->rate;
>>  	data->new_opp.rate = freq;
>> @@ -969,8 +1008,8 @@ static void _find_current_opp(struct device *dev, struct opp_table *opp_table)
> 
> I think this routine breaks as soon as we add support for multiple clocks.
> clks[0]'s frequency can be same for multiple OPPs and this won't get you the
> right OPP then.

I don't think so and this was raised also by Stephen - only the first
clock is considered the one used for all PM OPP frequency operations,
like get ceil/floor.

The assumption (which might need better documentation) is that first
clock frequency is the main one:
1. It is still in opp->rate field, so it is used everywhere when OPPs
are compared/checked for rates.
1. Usually is used also in opp-table nodes names.

The logical explanation is that devices has some main operating
frequency, e.g. the core clock, and this determines the performance. In
the same time such device might not be able to scale this one core clock
independently from others, therefore this set of patches.

> 
>>  	struct dev_pm_opp *opp = ERR_PTR(-ENODEV);
>>  	unsigned long freq;
>>  
>> -	if (!IS_ERR(opp_table->clk)) {
>> -		freq = clk_get_rate(opp_table->clk);
>> +	if (opp_table->clks && !IS_ERR(opp_table->clks[0])) {
>> +		freq = clk_get_rate(opp_table->clks[0]);
>>  		opp = _find_freq_ceil(opp_table, &freq);
>>  	}
>>  
>> @@ -1070,7 +1109,7 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
>>  						 scaling_down);
>>  	} else {
>>  		/* Only frequency scaling */
>> -		ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq);
>> +		ret = _generic_set_opp_clks_only(dev, opp_table, opp);
>>  	}
>>  
>>  	if (ret)
>> @@ -1135,11 +1174,15 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
> 
> This should have a BUG or WARN _ON() now if clock count is more than one. This
> routine can't be called unless custom handler is available.
> 
> I skipped rest of the code as we need to work/decide on the design first.



Best regards,
Krzysztof
Viresh Kumar May 10, 2022, 4:40 a.m. UTC | #7
On 09-05-22, 12:38, Krzysztof Kozlowski wrote:
> On 25/04/2022 09:27, Viresh Kumar wrote:
> > This is tricky as the OPP core can't really assume the order in which the clocks
> > needs to be programmed. We had the same problem with multiple regulators and the
> > same is left for drivers to do via the custom-api.
> > 
> > Either we can take the same route here, and let platforms add their own OPP
> > drivers which can handle this, Or hide this all behind a basic device clock's
> > driver, which you get with clk_get(dev, NULL).
> 
> For my use case, the order of scaling will be the same as in previous
> implementation, because UFS drivers just got bunch of clocks with
> freq-table-hz property and were scaling in DT order.
> 
> If drivers need something better, they can always provide custom-opp
> thus replacing my method. My implementation here does not restrict them.
> 
> For the drivers where the order does not matter, why forcing each driver
> to provide its own implementation of clock scaling? Isn't shared generic
> PM OPP code a way to remove code duplication?

Code duplication is a good argument and I am in favor of avoiding it,
but nevertheless this shouldn't be something which platforms can pick
by mistake, just because they didn't go through core code. In other
words, this shouldn't be the default behavior of the core.

If we want, core can provide a helper to get rid of the duplication
though, but the user explicitly needs to use it.

> >> diff --git a/drivers/opp/core.c b/drivers/opp/core.c
> > 
> >> +static int _generic_set_opp_clks_only(struct device *dev,
> >> +				      struct opp_table *opp_table,
> >> +				      struct dev_pm_opp *opp)
> >> +{
> >> +	int i, ret;
> >> +
> >> +	if (!opp_table->clks)
> >> +		return 0;
> >> +
> >> +	for (i = 0; i < opp_table->clk_count; i++) {
> >> +		if (opp->rates[i]) {
> > 
> > This should mean that we can disable that clock and it isn't required.
> 
> No, it does not mean that. The DT might provide several clocks which
> only some are important for frequency scaling. All others just need to
> be enabled.
> 
> Maybe you prefer to skip getting such clocks in PM OPP?

They shouldn't reach the OPP core then. What will the OPP core do if a
clock has a value for one OPP and not the other ?

> >> @@ -969,8 +1008,8 @@ static void _find_current_opp(struct device *dev, struct opp_table *opp_table)
> > 
> > I think this routine breaks as soon as we add support for multiple clocks.
> > clks[0]'s frequency can be same for multiple OPPs and this won't get you the
> > right OPP then.
> 
> I don't think so and this was raised also by Stephen - only the first
> clock is considered the one used for all PM OPP frequency operations,
> like get ceil/floor.

IMHO, this is broken by design. I can easily see that someone wants to
have few variants of all other frequencies for the same frequency of
the so called "main" clock, i.e. multiple OPPs with same "main" freq
value.  I don't think we can mark the clocks "main" or otherwise as
easily for every platform.

Stephen, any inputs on this ?

> The assumption (which might need better documentation) is that first
> clock frequency is the main one:
> 1. It is still in opp->rate field, so it is used everywhere when OPPs
> are compared/checked for rates.
> 1. Usually is used also in opp-table nodes names.
> 
> The logical explanation is that devices has some main operating
> frequency, e.g. the core clock, and this determines the performance. In
> the same time such device might not be able to scale this one core clock
> independently from others, therefore this set of patches.

I understand what you are saying, but I can feel that it will break or
will force bad bug-fixes into the core at a later point of time.

I think it would be better to take it slowly and see how it goes. Lets
first add support for the OPP core to parse and store this data and
then we can add support to use it, or at least do all this in separate
patches so they are easier to review/apply.
Krzysztof Kozlowski May 10, 2022, 1:09 p.m. UTC | #8
On 10/05/2022 06:40, Viresh Kumar wrote:
> On 09-05-22, 12:38, Krzysztof Kozlowski wrote:
>> On 25/04/2022 09:27, Viresh Kumar wrote:
>>> This is tricky as the OPP core can't really assume the order in which the clocks
>>> needs to be programmed. We had the same problem with multiple regulators and the
>>> same is left for drivers to do via the custom-api.
>>>
>>> Either we can take the same route here, and let platforms add their own OPP
>>> drivers which can handle this, Or hide this all behind a basic device clock's
>>> driver, which you get with clk_get(dev, NULL).
>>
>> For my use case, the order of scaling will be the same as in previous
>> implementation, because UFS drivers just got bunch of clocks with
>> freq-table-hz property and were scaling in DT order.
>>
>> If drivers need something better, they can always provide custom-opp
>> thus replacing my method. My implementation here does not restrict them.
>>
>> For the drivers where the order does not matter, why forcing each driver
>> to provide its own implementation of clock scaling? Isn't shared generic
>> PM OPP code a way to remove code duplication?
> 
> Code duplication is a good argument and I am in favor of avoiding it,
> but nevertheless this shouldn't be something which platforms can pick
> by mistake, just because they didn't go through core code. In other
> words, this shouldn't be the default behavior of the core.
> 
> If we want, core can provide a helper to get rid of the duplication
> though, but the user explicitly needs to use it.

OK, that sounds like a solution.

> 
>>>> diff --git a/drivers/opp/core.c b/drivers/opp/core.c
>>>
>>>> +static int _generic_set_opp_clks_only(struct device *dev,
>>>> +				      struct opp_table *opp_table,
>>>> +				      struct dev_pm_opp *opp)
>>>> +{
>>>> +	int i, ret;
>>>> +
>>>> +	if (!opp_table->clks)
>>>> +		return 0;
>>>> +
>>>> +	for (i = 0; i < opp_table->clk_count; i++) {
>>>> +		if (opp->rates[i]) {
>>>
>>> This should mean that we can disable that clock and it isn't required.
>>
>> No, it does not mean that. The DT might provide several clocks which
>> only some are important for frequency scaling. All others just need to
>> be enabled.
>>
>> Maybe you prefer to skip getting such clocks in PM OPP?
> 
> They shouldn't reach the OPP core then. What will the OPP core do if a
> clock has a value for one OPP and not the other ?

That would be the same mistake as providing one voltage as 0 or with
something outside of a spec (but still within regulators min/max).
Mistakes in DTS create undesirable behavior and this part is no different.

However I understand your point - since the driver provides the list of
clocks to OPP, it should not provide ones which are irrelevant.

> 
>>>> @@ -969,8 +1008,8 @@ static void _find_current_opp(struct device *dev, struct opp_table *opp_table)
>>>
>>> I think this routine breaks as soon as we add support for multiple clocks.
>>> clks[0]'s frequency can be same for multiple OPPs and this won't get you the
>>> right OPP then.
>>
>> I don't think so and this was raised also by Stephen - only the first
>> clock is considered the one used for all PM OPP frequency operations,
>> like get ceil/floor.
> 
> IMHO, this is broken by design. I can easily see that someone wants to
> have few variants of all other frequencies for the same frequency of
> the so called "main" clock, i.e. multiple OPPs with same "main" freq
> value.  I don't think we can mark the clocks "main" or otherwise as
> easily for every platform.
> 
> Stephen, any inputs on this ?

In such case, matching opps by frequency would be a quite different API.
The drivers can use now:
https://github.com/krzk/linux/commit/ebc31798494fcc66389ae409dce6d9489c16156a#diff-b6370444c32afa2e55d9b6150f355ba6f4d20c5ed5da5399ea8295d323de8267R1200

If you assume that this frequency can be used for multiple OPPs, then
the API should be different. Something like:
int dev_pm_opp_set_rate(struct device *dev, unsigned long *target_freqs,
                        size_t num_freqs);

Finding right opp for given frequencies would be also quite much more
complicated task. Not a simple ceil/floor search by one frequency.

I don't need that use-case and my implementation does not prevent anyone
from implementing it in the future. IOW, why developing now complex
solution which no one currently needs? If anyone needs such scaling by
multiple-frequencies, the PM OPP can be reworked/extended/improved again.

Additionally let me point also that my implementation targets not a
specific one driver, but actually entire subsystem of drivers - all UFS
drivers.

> 
>> The assumption (which might need better documentation) is that first
>> clock frequency is the main one:
>> 1. It is still in opp->rate field, so it is used everywhere when OPPs
>> are compared/checked for rates.
>> 1. Usually is used also in opp-table nodes names.
>>
>> The logical explanation is that devices has some main operating
>> frequency, e.g. the core clock, and this determines the performance. In
>> the same time such device might not be able to scale this one core clock
>> independently from others, therefore this set of patches.
> 
> I understand what you are saying, but I can feel that it will break or
> will force bad bug-fixes into the core at a later point of time.
> 
> I think it would be better to take it slowly and see how it goes. Lets
> first add support for the OPP core to parse and store this data and
> then we can add support to use it, or at least do all this in separate
> patches so they are easier to review/apply.

Sure, I'll split the patch to smaller chunks.

Best regards,
Krzysztof
Viresh Kumar May 11, 2022, 5:06 a.m. UTC | #9
On 10-05-22, 15:09, Krzysztof Kozlowski wrote:
> On 10/05/2022 06:40, Viresh Kumar wrote:
> > They shouldn't reach the OPP core then. What will the OPP core do if a
> > clock has a value for one OPP and not the other ?
> 
> That would be the same mistake as providing one voltage as 0 or with
> something outside of a spec (but still within regulators min/max).
> Mistakes in DTS create undesirable behavior and this part is no different.

Right, I agree and so it shouldn't be allowed in principle.

> However I understand your point - since the driver provides the list of
> clocks to OPP, it should not provide ones which are irrelevant.

Right.

> > IMHO, this is broken by design. I can easily see that someone wants to
> > have few variants of all other frequencies for the same frequency of
> > the so called "main" clock, i.e. multiple OPPs with same "main" freq
> > value.  I don't think we can mark the clocks "main" or otherwise as
> > easily for every platform.
> > 
> > Stephen, any inputs on this ?
> 
> In such case, matching opps by frequency would be a quite different API.
> The drivers can use now:
> https://github.com/krzk/linux/commit/ebc31798494fcc66389ae409dce6d9489c16156a#diff-b6370444c32afa2e55d9b6150f355ba6f4d20c5ed5da5399ea8295d323de8267R1200
> 
> If you assume that this frequency can be used for multiple OPPs, then
> the API should be different. Something like:
> int dev_pm_opp_set_rate(struct device *dev, unsigned long *target_freqs,
>                         size_t num_freqs);

At this point I am not looking for a new API, but just continuing the discussion
to understand what different hardwares want or look like.

> Finding right opp for given frequencies would be also quite much more
> complicated task. Not a simple ceil/floor search by one frequency.

Right.

> I don't need that use-case and my implementation does not prevent anyone
> from implementing it in the future. IOW, why developing now complex
> solution which no one currently needs? If anyone needs such scaling by
> multiple-frequencies, the PM OPP can be reworked/extended/improved again.

It isn't about being complex or simple for me, but the design needs to be
robust. Either we can have a guaranteed "main" frequency or not and that would
decide how we need to proceed here.
Krzysztof Kozlowski May 19, 2022, 8:03 a.m. UTC | #10
On 19/05/2022 01:57, Stephen Boyd wrote:
> Quoting Viresh Kumar (2022-05-10 22:06:43)
>> On 10-05-22, 15:09, Krzysztof Kozlowski wrote:
>>> On 10/05/2022 06:40, Viresh Kumar wrote:
>>>> IMHO, this is broken by design. I can easily see that someone wants to
>>>> have few variants of all other frequencies for the same frequency of
>>>> the so called "main" clock, i.e. multiple OPPs with same "main" freq
>>>> value.  I don't think we can mark the clocks "main" or otherwise as
>>>> easily for every platform.
>>>>
>>>> Stephen, any inputs on this ?
>>>
>>> In such case, matching opps by frequency would be a quite different API.
>>> The drivers can use now:
>>> https://github.com/krzk/linux/commit/ebc31798494fcc66389ae409dce6d9489c16156a#diff-b6370444c32afa2e55d9b6150f355ba6f4d20c5ed5da5399ea8295d323de8267R1200
>>>
>>> If you assume that this frequency can be used for multiple OPPs, then
>>> the API should be different. Something like:
>>> int dev_pm_opp_set_rate(struct device *dev, unsigned long *target_freqs,
>>>                         size_t num_freqs);
>>
>> At this point I am not looking for a new API, but just continuing the discussion
>> to understand what different hardwares want or look like.
> 
> I think for UFS they don't want a rate API at all. They want to set a
> "clock gear" and that translates into whatever that means for OPP; be it
> a clk frequency (or two), an interconnect bandwidth (or multiple?), and some
> performance state (or many) for any power domains. I think the gear
> design is built into the UFS spec. If it isn't then I'm misremembering
> things.

Yes, true. The clock frequencies are still changed with each gear, but
in general the UFS indeed operates on gear concept.

Best regards,
Krzysztof
Viresh Kumar May 26, 2022, 10:27 a.m. UTC | #11
On 25-05-22, 09:04, Stephen Boyd wrote:
> I'm saying that each OPP table would be for a single clk, but they would
> be connected through required-opps for the device's OPP table.

Ahh, okay.

> It would
> mean that dev_pm_opp_set_clkname() would need extension to let a driver
> indicate which clk is associated with an OPP table.

Hmm, just that it complicates simple cases. Lets see.

> From your other
> reply on v3 it seems that you're leaning towards having an array of
> frequency values in the OPP table instead of doing table linking?

I am not against that to be honest, we have done that for voltages and
current already. I am just not fine with having any one of them as the
primary clock. I liked your idea of reusing "level" for that.

I have started some rewriting of the core, to simplify things and
reduce the number of ever increasing APIs (which you suggested earlier
once). Lets see where we land eventually.
Viresh Kumar May 31, 2022, 10:30 a.m. UTC | #12
On 19-05-22, 10:03, Krzysztof Kozlowski wrote:
> Yes, true. The clock frequencies are still changed with each gear, but
> in general the UFS indeed operates on gear concept.

Hi Krzysztof,

I have redesigned the OPP core a bit (two patchsets until now) to make
it easier to add multiple clock support going forward. I need some
inputs from you before moving forward with it now. Will this work for
your use case:

- Add support for multiple clocks, where none of them is primary.

- Which means you won't be able to use dev_pm_opp_set_rate() but will
  need something like dev_pm_opp_set_level(), will add it.

- That is, your OPP table will need to implement levels (I think of
  them as UFS gears) and then call dev_pm_opp_set_level() instead.

- This new API will work just like dev_pm_opp_set_rate(), except that
  it will find the target OPP based on level instead of freq and
  support configuration of multiple clock frequencies.

- Of course both these APIs will share most of the code.
Krzysztof Kozlowski June 1, 2022, 11:23 a.m. UTC | #13
On 31/05/2022 12:30, Viresh Kumar wrote:
> On 19-05-22, 10:03, Krzysztof Kozlowski wrote:
>> Yes, true. The clock frequencies are still changed with each gear, but
>> in general the UFS indeed operates on gear concept.
> 
> Hi Krzysztof,
> 
> I have redesigned the OPP core a bit (two patchsets until now) to make
> it easier to add multiple clock support going forward. I need some
> inputs from you before moving forward with it now. Will this work for
> your use case:
> 
> - Add support for multiple clocks, where none of them is primary.
> 
> - Which means you won't be able to use dev_pm_opp_set_rate() but will
>   need something like dev_pm_opp_set_level(), will add it.
> 
> - That is, your OPP table will need to implement levels (I think of
>   them as UFS gears) and then call dev_pm_opp_set_level() instead.
> 
> - This new API will work just like dev_pm_opp_set_rate(), except that
>   it will find the target OPP based on level instead of freq and
>   support configuration of multiple clock frequencies.
> 
> - Of course both these APIs will share most of the code.

Hi Viresh,

In general this looks reasonable and matches how the UFS gears should be
modeled. It does not match how UFS drivers implemented the clock
scaling, but that's the internal problem of UFS drivers. They scale the
clocks only max or min, even though there are multiple gears in between.
The new approach looks therefore appropriate.

Best regards,
Krzysztof
diff mbox series

Patch

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 2945f3c1ce09..5dcd7157f6ab 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -66,6 +66,21 @@  static struct opp_table *_find_opp_table_unlocked(struct device *dev)
 	return ERR_PTR(-ENODEV);
 }
 
+static void _put_clocks(struct opp_table *opp_table)
+{
+	int i;
+
+	if (!opp_table->clks)
+		return;
+
+	for (i = opp_table->clk_count - 1; i >= 0; i--)
+		clk_put(opp_table->clks[i]);
+
+	kfree(opp_table->clks);
+	opp_table->clks = NULL;
+	opp_table->clk_count = -1;
+};
+
 /**
  * _find_opp_table() - find opp_table struct using device pointer
  * @dev:	device pointer used to lookup OPP table
@@ -772,6 +787,30 @@  static inline int _generic_set_opp_clk_only(struct device *dev, struct clk *clk,
 	return ret;
 }
 
+static int _generic_set_opp_clks_only(struct device *dev,
+				      struct opp_table *opp_table,
+				      struct dev_pm_opp *opp)
+{
+	int i, ret;
+
+	if (!opp_table->clks)
+		return 0;
+
+	for (i = 0; i < opp_table->clk_count; i++) {
+		if (opp->rates[i]) {
+			ret = _generic_set_opp_clk_only(dev, opp_table->clks[i],
+							opp->rates[i]);
+			if (ret) {
+				dev_err(dev, "%s: failed to set clock %pC rate: %d\n",
+					__func__, opp_table->clks[i], ret);
+				return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
 static int _generic_set_opp_regulator(struct opp_table *opp_table,
 				      struct device *dev,
 				      struct dev_pm_opp *opp,
@@ -796,7 +835,7 @@  static int _generic_set_opp_regulator(struct opp_table *opp_table,
 	}
 
 	/* Change frequency */
-	ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq);
+	ret = _generic_set_opp_clks_only(dev, opp_table, opp);
 	if (ret)
 		goto restore_voltage;
 
@@ -820,7 +859,7 @@  static int _generic_set_opp_regulator(struct opp_table *opp_table,
 	return 0;
 
 restore_freq:
-	if (_generic_set_opp_clk_only(dev, opp_table->clk, old_opp->rate))
+	if (_generic_set_opp_clks_only(dev, opp_table, old_opp))
 		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
 			__func__, old_opp->rate);
 restore_voltage:
@@ -880,7 +919,7 @@  static int _set_opp_custom(const struct opp_table *opp_table,
 	}
 
 	data->regulators = opp_table->regulators;
-	data->clk = opp_table->clk;
+	data->clk = (opp_table->clks ? opp_table->clks[0] : NULL);
 	data->dev = dev;
 	data->old_opp.rate = old_opp->rate;
 	data->new_opp.rate = freq;
@@ -969,8 +1008,8 @@  static void _find_current_opp(struct device *dev, struct opp_table *opp_table)
 	struct dev_pm_opp *opp = ERR_PTR(-ENODEV);
 	unsigned long freq;
 
-	if (!IS_ERR(opp_table->clk)) {
-		freq = clk_get_rate(opp_table->clk);
+	if (opp_table->clks && !IS_ERR(opp_table->clks[0])) {
+		freq = clk_get_rate(opp_table->clks[0]);
 		opp = _find_freq_ceil(opp_table, &freq);
 	}
 
@@ -1070,7 +1109,7 @@  static int _set_opp(struct device *dev, struct opp_table *opp_table,
 						 scaling_down);
 	} else {
 		/* Only frequency scaling */
-		ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq);
+		ret = _generic_set_opp_clks_only(dev, opp_table, opp);
 	}
 
 	if (ret)
@@ -1135,11 +1174,15 @@  int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 		 * equivalent to a clk_set_rate()
 		 */
 		if (!_get_opp_count(opp_table)) {
-			ret = _generic_set_opp_clk_only(dev, opp_table->clk, target_freq);
+			if (opp_table->clks)
+				ret = _generic_set_opp_clk_only(dev,
+								opp_table->clks[0],
+								target_freq);
 			goto put_opp_table;
 		}
 
-		freq = clk_round_rate(opp_table->clk, target_freq);
+		if (opp_table->clks)
+			freq = clk_round_rate(opp_table->clks[0], target_freq);
 		if ((long)freq <= 0)
 			freq = target_freq;
 
@@ -1156,6 +1199,11 @@  int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 				__func__, freq, ret);
 			goto put_opp_table;
 		}
+		/*
+		 * opp->rates are used for scaling clocks, so be sure accurate
+		 * 'freq' is used, instead what was defined via e.g. Devicetree.
+		 */
+		opp->rates[0] = freq;
 	}
 
 	ret = _set_opp(dev, opp_table, opp, freq);
@@ -1246,7 +1294,8 @@  static struct opp_table *_allocate_opp_table(struct device *dev, int index)
 	INIT_LIST_HEAD(&opp_table->dev_list);
 	INIT_LIST_HEAD(&opp_table->lazy);
 
-	/* Mark regulator count uninitialized */
+	/* Mark regulator/clk count uninitialized */
+	opp_table->clk_count = -1;
 	opp_table->regulator_count = -1;
 
 	opp_dev = _add_opp_dev(dev, opp_table);
@@ -1295,21 +1344,32 @@  static struct opp_table *_update_opp_table_clk(struct device *dev,
 	 * Return early if we don't need to get clk or we have already tried it
 	 * earlier.
 	 */
-	if (!getclk || IS_ERR(opp_table) || opp_table->clk)
+	if (!getclk || IS_ERR(opp_table) || opp_table->clks)
 		return opp_table;
 
+	opp_table->clks = kmalloc_array(1, sizeof(*opp_table->clks),
+					GFP_KERNEL);
+	if (!opp_table->clks)
+		return ERR_PTR(-ENOMEM);
+
 	/* Find clk for the device */
-	opp_table->clk = clk_get(dev, NULL);
+	opp_table->clks[0] = clk_get(dev, NULL);
 
-	ret = PTR_ERR_OR_ZERO(opp_table->clk);
-	if (!ret)
+	ret = PTR_ERR_OR_ZERO(opp_table->clks[0]);
+	if (!ret) {
+		opp_table->clk_count = 1;
 		return opp_table;
+	}
 
 	if (ret == -ENOENT) {
+		opp_table->clk_count = 0;
 		dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, ret);
 		return opp_table;
 	}
 
+	kfree(opp_table->clks);
+	opp_table->clks = NULL;
+	opp_table->clk_count = -1;
 	dev_pm_opp_put_opp_table(opp_table);
 	dev_err_probe(dev, ret, "Couldn't find clock\n");
 
@@ -1408,9 +1468,7 @@  static void _opp_table_kref_release(struct kref *kref)
 
 	_of_clear_opp_table(opp_table);
 
-	/* Release clk */
-	if (!IS_ERR(opp_table->clk))
-		clk_put(opp_table->clk);
+	_put_clocks(opp_table);
 
 	if (opp_table->paths) {
 		for (i = 0; i < opp_table->path_count; i++)
@@ -2144,9 +2202,51 @@  EXPORT_SYMBOL_GPL(devm_pm_opp_set_regulators);
  * This must be called before any OPPs are initialized for the device.
  */
 struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
+{
+	return dev_pm_opp_set_clknames(dev, &name, 1);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_clkname);
+
+/**
+ * dev_pm_opp_put_clkname() - Releases resources blocked for clk.
+ * @opp_table: OPP table returned from dev_pm_opp_set_clkname().
+ */
+void dev_pm_opp_put_clkname(struct opp_table *opp_table)
+{
+	return dev_pm_opp_put_clknames(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_clkname);
+
+/**
+ * devm_pm_opp_set_clkname() - Set clk name for the device
+ * @dev: Device for which clk name is being set.
+ * @name: Clk name.
+ *
+ * This is a resource-managed variant of dev_pm_opp_set_clkname().
+ *
+ * Return: 0 on success and errorno otherwise.
+ */
+int devm_pm_opp_set_clkname(struct device *dev, const char *name)
+{
+	return devm_pm_opp_set_clknames(dev, &name, 1);
+}
+EXPORT_SYMBOL_GPL(devm_pm_opp_set_clkname);
+
+/**
+ * dev_pm_opp_set_clknames() - Set clk names for the device
+ * @dev: Device for which clock names are being set.
+ * @names: Array of pointers to the names of the clocks.
+ * @count: Number of clocks.
+ *
+ * See: dev_pm_opp_set_clkname()
+ */
+struct opp_table *dev_pm_opp_set_clknames(struct device *dev,
+					  const char * const names[],
+					  unsigned int count)
 {
 	struct opp_table *opp_table;
-	int ret;
+	struct clk *clk;
+	int ret, i;
 
 	opp_table = _add_opp_table(dev, false);
 	if (IS_ERR(opp_table))
@@ -2159,70 +2259,92 @@  struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
 	}
 
 	/* clk shouldn't be initialized at this point */
-	if (WARN_ON(opp_table->clk)) {
+	if (WARN_ON(opp_table->clks)) {
 		ret = -EBUSY;
 		goto err;
 	}
 
-	/* Find clk for the device */
-	opp_table->clk = clk_get(dev, name);
-	if (IS_ERR(opp_table->clk)) {
-		ret = dev_err_probe(dev, PTR_ERR(opp_table->clk),
-				    "%s: Couldn't find clock\n", __func__);
+	opp_table->clks = kmalloc_array(count, sizeof(*opp_table->clks),
+					GFP_KERNEL);
+	if (!opp_table->clks) {
+		ret = -ENOMEM;
 		goto err;
 	}
 
+	for (i = 0; i < count; i++) {
+		clk = clk_get(dev, names[i]);
+		if (IS_ERR(clk)) {
+			ret =  dev_err_probe(dev, PTR_ERR(clk),
+					     "%s: Couldn't find clock %s\n",
+					     __func__, names[i]);
+			goto free_clks;
+		}
+
+		opp_table->clks[i] = clk;
+	}
+
+	opp_table->clk_count = count;
+
 	return opp_table;
 
+free_clks:
+	while (i != 0)
+		clk_put(opp_table->clks[--i]);
+
+	kfree(opp_table->clks);
+	opp_table->clks = NULL;
+	opp_table->clk_count = -1;
 err:
 	dev_pm_opp_put_opp_table(opp_table);
 
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_clkname);
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_clknames);
 
 /**
- * dev_pm_opp_put_clkname() - Releases resources blocked for clk.
- * @opp_table: OPP table returned from dev_pm_opp_set_clkname().
+ * dev_pm_opp_put_clknames() - Releases resources blocked for clk.
+ * @opp_table: OPP table returned from dev_pm_opp_set_clknames().
  */
-void dev_pm_opp_put_clkname(struct opp_table *opp_table)
+void dev_pm_opp_put_clknames(struct opp_table *opp_table)
 {
 	if (unlikely(!opp_table))
 		return;
 
-	clk_put(opp_table->clk);
-	opp_table->clk = ERR_PTR(-EINVAL);
+	_put_clocks(opp_table);
 
 	dev_pm_opp_put_opp_table(opp_table);
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_clkname);
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_clknames);
 
-static void devm_pm_opp_clkname_release(void *data)
+static void devm_pm_opp_clknames_release(void *data)
 {
-	dev_pm_opp_put_clkname(data);
+	dev_pm_opp_put_clknames(data);
 }
 
 /**
- * devm_pm_opp_set_clkname() - Set clk name for the device
- * @dev: Device for which clk name is being set.
- * @name: Clk name.
+ * devm_pm_opp_set_clknames() - Set clock names for the device
+ * @dev: Device for which clock names are being set.
+ * @names: Array of pointers to the names of the clocks.
+ * @count: Number of clocks.
  *
- * This is a resource-managed variant of dev_pm_opp_set_clkname().
+ * This is a resource-managed variant of dev_pm_opp_set_clknames().
  *
  * Return: 0 on success and errorno otherwise.
  */
-int devm_pm_opp_set_clkname(struct device *dev, const char *name)
+int devm_pm_opp_set_clknames(struct device *dev,
+			     const char * const names[],
+			     unsigned int count)
 {
 	struct opp_table *opp_table;
 
-	opp_table = dev_pm_opp_set_clkname(dev, name);
+	opp_table = dev_pm_opp_set_clknames(dev, names, count);
 	if (IS_ERR(opp_table))
 		return PTR_ERR(opp_table);
 
-	return devm_add_action_or_reset(dev, devm_pm_opp_clkname_release,
+	return devm_add_action_or_reset(dev, devm_pm_opp_clknames_release,
 					opp_table);
 }
-EXPORT_SYMBOL_GPL(devm_pm_opp_set_clkname);
+EXPORT_SYMBOL_GPL(devm_pm_opp_set_clknames);
 
 /**
  * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
@@ -2637,7 +2759,8 @@  int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
 	if (IS_ERR(opp_table))
 		return PTR_ERR(opp_table);
 
-	/* Fix regulator count for dynamic OPPs */
+	/* Fix regulator/clk count for dynamic OPPs */
+	opp_table->clk_count = 1;
 	opp_table->regulator_count = 1;
 
 	ret = _opp_add_v1(opp_table, dev, freq, u_volt, true);
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 440ab5a03df9..26ab58b71c2d 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -767,6 +767,47 @@  void dev_pm_opp_of_remove_table(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
 
+static int _read_clocks(struct dev_pm_opp *opp, struct opp_table *opp_table,
+			struct device_node *np)
+{
+	int count, ret;
+	u64 *freq;
+
+	count = of_property_count_u64_elems(np, "opp-hz");
+	if (count < 0) {
+		pr_err("%s: Invalid %s property (%d)\n",
+			__func__, of_node_full_name(np), count);
+		return count;
+	}
+
+	if (count != opp_table->clk_count) {
+		pr_err("%s: number of rates %d does not match number of clocks %d in %s\n",
+		       __func__, count, opp_table->clk_count,
+		       of_node_full_name(np));
+		return -EINVAL;
+	}
+
+	freq = kmalloc_array(count, sizeof(*freq), GFP_KERNEL);
+	if (!freq)
+		return -ENOMEM;
+
+	ret = of_property_read_u64_array(np, "opp-hz", freq, count);
+	if (ret) {
+		pr_err("%s: error parsing %s: %d\n", __func__,
+		       of_node_full_name(np), ret);
+		ret = -EINVAL;
+		goto free_freq;
+	}
+
+	opp->rates = freq;
+	return 0;
+
+free_freq:
+	kfree(freq);
+
+	return ret;
+}
+
 static int _read_bw(struct dev_pm_opp *new_opp, struct opp_table *table,
 		    struct device_node *np, bool peak)
 {
@@ -827,6 +868,13 @@  static int _read_opp_key(struct dev_pm_opp *new_opp, struct opp_table *table,
 	}
 	*rate_not_available = !!ret;
 
+	if (!ret) {
+		ret = _read_clocks(new_opp, table, np);
+		/* The properties were found but we failed to parse them */
+		if (ret && ret != -ENODEV)
+			return ret;
+	}
+
 	/*
 	 * Bandwidth consists of peak and average (optional) values:
 	 * opp-peak-kBps = <path1_value path2_value>;
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 45e3a55239a1..2c4502cff3b2 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -60,6 +60,7 @@  extern struct list_head opp_tables, lazy_opp_tables;
  * @pstate: Device's power domain's performance state.
  * @rate:	Frequency in hertz
  * @level:	Performance level
+ * @rates:	Frequency rates for the clocks.
  * @supplies:	Power supplies voltage/current values
  * @bandwidth:	Interconnect bandwidth values
  * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
@@ -84,6 +85,7 @@  struct dev_pm_opp {
 	unsigned long rate;
 	unsigned int level;
 
+	u64 *rates;
 	struct dev_pm_opp_supply *supplies;
 	struct dev_pm_opp_icc_bw *bandwidth;
 
@@ -149,7 +151,9 @@  enum opp_table_access {
  * @supported_hw: Array of version number to support.
  * @supported_hw_count: Number of elements in supported_hw array.
  * @prop_name: A name to postfix to many DT properties, while parsing them.
- * @clk: Device's clock handle
+ * @clks: Device clocks handles
+ * @clk_count: Number of clocks. Its value can be -1 (uninitialized), 0 (no
+ * clock handle provided)  or > 0 (has clock handles).
  * @regulators: Supply regulators
  * @regulator_count: Number of power supply regulators. Its value can be -1
  * (uninitialized), 0 (no opp-microvolt property) or > 0 (has opp-microvolt
@@ -200,7 +204,8 @@  struct opp_table {
 	unsigned int *supported_hw;
 	unsigned int supported_hw_count;
 	const char *prop_name;
-	struct clk *clk;
+	struct clk **clks;
+	int clk_count;
 	struct regulator **regulators;
 	int regulator_count;
 	struct icc_path **paths;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 0d85a63a1f78..cb50ccf6f818 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -158,6 +158,13 @@  int devm_pm_opp_set_regulators(struct device *dev, const char * const names[], u
 struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
 void dev_pm_opp_put_clkname(struct opp_table *opp_table);
 int devm_pm_opp_set_clkname(struct device *dev, const char *name);
+struct opp_table *dev_pm_opp_set_clknames(struct device *dev,
+					  const char * const names[],
+					  unsigned int count);
+void dev_pm_opp_put_clknames(struct opp_table *opp_table);
+int devm_pm_opp_set_clknames(struct device *dev,
+			     const char * const names[],
+			     unsigned int count);
 struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
 int devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
@@ -386,6 +393,22 @@  static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name)
 	return -EOPNOTSUPP;
 }
 
+static inline struct opp_table *dev_pm_opp_set_clknames(struct device *dev,
+							const char * const names[],
+							unsigned int count)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void dev_pm_opp_put_clknames(struct opp_table *opp_table) {}
+
+static inline int devm_pm_opp_set_clknames(struct device *dev,
+					   const char * const names[],
+					   unsigned int count)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs)
 {
 	return ERR_PTR(-EOPNOTSUPP);