diff mbox series

[v3,3/6] cpufreq: qcom-cpufreq-hw: Add dcvs interrupt support

Message ID 20210708120656.663851-4-thara.gopinath@linaro.org
State New
Headers show
Series Introduce LMh driver for Qualcomm SoCs | expand

Commit Message

Thara Gopinath July 8, 2021, 12:06 p.m. UTC
Add interrupt support to notify the kernel of h/w initiated frequency
throttling by LMh. Convey this to the scheduler via the thermal pressure
interface.

Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>

---

v2->v3:
	- Cosmetic fixes from review comments on the list.
	- Moved all LMh initializations to qcom_cpufreq_hw_lmh_init.
	- Added freeing of LMh interrupt and cancelling the polling worker to
	  qcom_cpufreq_hw_cpu_exit as per Viresh's suggestion.
	- LMh interrupts are now tied to cpu dev and not cpufreq dev. This will be
	  useful for future generations of SoCs where the same interrupt signals
	  multiple cpu clusters.

v1->v2:
	- Introduced qcom_cpufreq_hw_lmh_init to consolidate LMh related initializations
	  as per Viresh's review comment.
	- Moved the code that restarts polling/re-enables the LMh interrupt to
	  qcom_lmh_dcvs_notify, thereby simplifying the isr and timer callback, as per
	  Viresh's suggestion.
	- Dropped cpus from qcom_cpufreq_data and instead used the cpus from cpufreq_policy in
	  qcom_lmh_dcvs_notify, as per Viresh's review comment.
	- Dropped dt property qcom,support-lmh as per Bjorn's suggestion.
	- Other minor/cosmetic fixes

 drivers/cpufreq/qcom-cpufreq-hw.c | 118 ++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)

-- 
2.25.1

Comments

Viresh Kumar July 9, 2021, 6:46 a.m. UTC | #1
On 08-07-21, 08:06, Thara Gopinath wrote:
>  static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>  {
>  	struct platform_device *pdev = cpufreq_get_driver_data();
> @@ -370,6 +480,10 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>  			dev_warn(cpu_dev, "failed to enable boost: %d\n", ret);
>  	}
>  
> +	ret = qcom_cpufreq_hw_lmh_init(policy, index);

You missed unregistering EM here (which is also missing from exit,
which you need to fix first in a separate patch).

> +	if (ret)
> +		goto error;
> +
>  	return 0;
>  error:
>  	kfree(data);
> @@ -389,6 +503,10 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
>  
>  	dev_pm_opp_remove_all_dynamic(cpu_dev);
>  	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
> +	if (data->lmh_dcvs_irq > 0) {
> +		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);

Why use devm variants here and when requesting the irq?

> +		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
> +	}

Please move this to qcom_cpufreq_hw_lmh_exit() or something.

Now, with the sequence of disabling the interrupt, etc., I see a potential
problem.

CPU0                                    CPU1

qcom_cpufreq_hw_cpu_exit()
-> devm_free_irq();
                                        qcom_lmh_dcvs_poll()
                                        -> qcom_lmh_dcvs_notify()
                                          -> enable_irq()

-> cancel_delayed_work_sync();


What will happen if enable_irq() gets called after freeing the irq?
Not sure, but it looks like you will hit this then from manage.c:

WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before
                                     setup/request_irq: irq %u\n", irq))

?

You got a chicken n egg problem :)

-- 
viresh
Thara Gopinath July 9, 2021, 3:37 p.m. UTC | #2
On 7/9/21 2:46 AM, Viresh Kumar wrote:
> On 08-07-21, 08:06, Thara Gopinath wrote:
>>   static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>>   {
>>   	struct platform_device *pdev = cpufreq_get_driver_data();
>> @@ -370,6 +480,10 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>>   			dev_warn(cpu_dev, "failed to enable boost: %d\n", ret);
>>   	}
>>   
>> +	ret = qcom_cpufreq_hw_lmh_init(policy, index);
> 
> You missed unregistering EM here (which is also missing from exit,
> which you need to fix first in a separate patch).

Hi!

So how exactly do you do this? I checked other users of the API and I do
not see any free. I would say that, if needed, it should be a separate
patch, outside of this series.
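
[ For illustration, a sketch of what the missing EM cleanup could look
  like -- this assumes em_dev_unregister_perf_domain() from
  <linux/energy_model.h> is the intended counterpart to the
  dev_pm_opp_of_register_em() call done at init time; hypothetical, not
  something settled in this thread: ]

	#include <linux/energy_model.h>

	/* Hypothetical: balance dev_pm_opp_of_register_em() done in init. */
	static void qcom_cpufreq_hw_em_exit(struct cpufreq_policy *policy)
	{
		struct device *cpu_dev = get_cpu_device(policy->cpu);

		/* Drop the perf domain before its underlying OPP table is removed. */
		em_dev_unregister_perf_domain(cpu_dev);
	}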

> 
>> +	if (ret)
>> +		goto error;
>> +
>>   	return 0;
>>   error:
>>   	kfree(data);
>> @@ -389,6 +503,10 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
>>   
>>   	dev_pm_opp_remove_all_dynamic(cpu_dev);
>>   	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
>> +	if (data->lmh_dcvs_irq > 0) {
>> +		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);
> 
> Why use devm variants here and when requesting the irq?
> 
>> +		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
>> +	}
> 
> Please move this to qcom_cpufreq_hw_lmh_exit() or something.

Ok.
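
[ A sketch of the suggested helper; plain free_irq() is assumed here, given
  the devm question above, and the free/cancel ordering shown is exactly
  what the rest of this thread goes on to debate: ]

	static void qcom_cpufreq_hw_lmh_exit(struct qcom_cpufreq_data *data)
	{
		if (data->lmh_dcvs_irq <= 0)
			return;

		/* Mirrors the two calls being moved out of cpu_exit. */
		free_irq(data->lmh_dcvs_irq, data);
		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
	}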

> 
> Now, with the sequence of disabling the interrupt, etc., I see a potential
> problem.
> 
> CPU0                                    CPU1
> 
> qcom_cpufreq_hw_cpu_exit()
> -> devm_free_irq();
>                                          qcom_lmh_dcvs_poll()
>                                          -> qcom_lmh_dcvs_notify()
>                                            -> enable_irq()
> 
> -> cancel_delayed_work_sync();
> 
> 
> What will happen if enable_irq() gets called after freeing the irq?
> Not sure, but it looks like you will hit this then from manage.c:
> 
> WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before
>                                       setup/request_irq: irq %u\n", irq))
> 
> ?
> 
> You got a chicken n egg problem :)

Yes indeed! But it is also a very rare chicken and egg problem.
The scenario here is that the cpus are busy running a load, causing a
thermal overrun, and lmh is engaged. At the same time, for this issue to
be hit, the cpu is trying to exit/disable cpufreq. Calling
cancel_delayed_work_sync first could solve this issue, right?
cancel_delayed_work_sync guarantees the work is not pending on return,
even if it requeues itself. So once the delayed work is cancelled, the
interrupts can be safely disabled. Thoughts?


>
Bjorn Andersson July 10, 2021, 4:57 a.m. UTC | #3
On Thu 08 Jul 07:06 CDT 2021, Thara Gopinath wrote:

> Add interrupt support to notify the kernel of h/w initiated frequency
> throttling by LMh. Convey this to the scheduler via the thermal pressure
> interface.
> 
> Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>
> ---
> 
> v2->v3:
> 	- Cosmetic fixes from review comments on the list.
> 	- Moved all LMh initializations to qcom_cpufreq_hw_lmh_init.
> 	- Added freeing of LMh interrupt and cancelling the polling worker to
> 	  qcom_cpufreq_hw_cpu_exit as per Viresh's suggestion.
> 	- LMh interrupts are now tied to cpu dev and not cpufreq dev. This will be
> 	  useful for future generations of SoCs where the same interrupt signals
> 	  multiple cpu clusters.
> 
> v1->v2:
> 	- Introduced qcom_cpufreq_hw_lmh_init to consolidate LMh related initializations
> 	  as per Viresh's review comment.
> 	- Moved the code that restarts polling/re-enables the LMh interrupt to
> 	  qcom_lmh_dcvs_notify, thereby simplifying the isr and timer callback, as per
> 	  Viresh's suggestion.
> 	- Dropped cpus from qcom_cpufreq_data and instead used the cpus from cpufreq_policy in
> 	  qcom_lmh_dcvs_notify, as per Viresh's review comment.
> 	- Dropped dt property qcom,support-lmh as per Bjorn's suggestion.
> 	- Other minor/cosmetic fixes
> 
>  drivers/cpufreq/qcom-cpufreq-hw.c | 118 ++++++++++++++++++++++++++++++
>  1 file changed, 118 insertions(+)
> 
> diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
> index f86859bf76f1..bb5fc700d913 100644
> --- a/drivers/cpufreq/qcom-cpufreq-hw.c
> +++ b/drivers/cpufreq/qcom-cpufreq-hw.c
> @@ -7,6 +7,7 @@
>  #include <linux/cpufreq.h>
>  #include <linux/init.h>
>  #include <linux/interconnect.h>
> +#include <linux/interrupt.h>
>  #include <linux/kernel.h>
>  #include <linux/module.h>
>  #include <linux/of_address.h>
> @@ -22,10 +23,13 @@
>  #define CLK_HW_DIV			2
>  #define LUT_TURBO_IND			1
>  
> +#define HZ_PER_KHZ			1000
> +
>  struct qcom_cpufreq_soc_data {
>  	u32 reg_enable;
>  	u32 reg_freq_lut;
>  	u32 reg_volt_lut;
> +	u32 reg_current_vote;
>  	u32 reg_perf_state;
>  	u8 lut_row_size;
>  };
> @@ -33,7 +37,10 @@ struct qcom_cpufreq_soc_data {
>  struct qcom_cpufreq_data {
>  	void __iomem *base;
>  	struct resource *res;
> +	struct delayed_work lmh_dcvs_poll_work;

How about dropping "lmh" from this variable name?

Perhaps "throttle_work" or something like that?

>  	const struct qcom_cpufreq_soc_data *soc_data;
> +	struct cpufreq_policy *policy;
> +	int lmh_dcvs_irq;

throttle_irq?

>  };
>  
>  static unsigned long cpu_hw_rate, xo_rate;
> @@ -251,10 +258,84 @@ static void qcom_get_related_cpus(int index, struct cpumask *m)
>  	}
>  }
>  
> +static inline unsigned long qcom_lmh_vote_to_freq(u32 val)
> +{
> +	return (val & 0x3FF) * 19200;
> +}
> +
> +static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
> +{
> +	struct cpufreq_policy *policy = data->policy;
> +	struct dev_pm_opp *opp;
> +	struct device *dev;
> +	unsigned long max_capacity, capacity, freq_hz, throttled_freq;
> +	unsigned int val, freq;
> +
> +	/*
> +	 * Get the h/w throttled frequency, normalize it using the
> +	 * registered opp table and use it to calculate thermal pressure.
> +	 */
> +	val = readl_relaxed(data->base + data->soc_data->reg_current_vote);

I would find it cleaner to move the readl() into the helper function, as
you don't care about the register value, only the resulting frequency.

> +	freq = qcom_lmh_vote_to_freq(val);
> +	freq_hz = freq * HZ_PER_KHZ;
> +
> +	dev = get_cpu_device(cpumask_first(policy->cpus));
> +	opp = dev_pm_opp_find_freq_floor(dev, &freq_hz);
> +	if (IS_ERR(opp) && PTR_ERR(opp) == -ERANGE)
> +		opp = dev_pm_opp_find_freq_ceil(dev, &freq_hz);
> +
> +	throttled_freq = freq_hz / HZ_PER_KHZ;
> +
> +	/* Update thermal pressure */
> +
> +	max_capacity = arch_scale_cpu_capacity(cpumask_first(policy->cpus));
> +	capacity = throttled_freq * max_capacity;
> +	capacity /= policy->cpuinfo.max_freq;

Perhaps, to avoid overflows if this is ever used on a 32-bit platform,
use:

	mult_frac(max_capacity, throttled_freq, policy->cpuinfo.max_freq)

> +
> +	/* Don't pass boost capacity to scheduler */
> +	if (capacity > max_capacity)
> +		capacity = max_capacity;
> +
> +	arch_set_thermal_pressure(policy->cpus, max_capacity - capacity);
> +
> +	/*
> +	 * If h/w throttled frequency is higher than what cpufreq has requested for, stop
> +	 * polling and switch back to interrupt mechanism
> +	 */
> +
> +	if (throttled_freq >= qcom_cpufreq_hw_get(cpumask_first(policy->cpus)))
> +		/* Clear the existing interrupts and enable it back */
> +		enable_irq(data->lmh_dcvs_irq);
> +	else
> +		mod_delayed_work(system_highpri_wq, &data->lmh_dcvs_poll_work,
> +				 msecs_to_jiffies(10));
> +}
> +
> +static void qcom_lmh_dcvs_poll(struct work_struct *work)
> +{
> +	struct qcom_cpufreq_data *data;
> +
> +	data = container_of(work, struct qcom_cpufreq_data, lmh_dcvs_poll_work.work);
> +
> +	qcom_lmh_dcvs_notify(data);
> +}
> +
> +static irqreturn_t qcom_lmh_dcvs_handle_irq(int irq, void *data)
> +{
> +	struct qcom_cpufreq_data *c_data = data;
> +
> +	/* Disable interrupt and enable polling */
> +	disable_irq_nosync(c_data->lmh_dcvs_irq);
> +	qcom_lmh_dcvs_notify(c_data);
> +
> +	return 0;
> +}
> +
>  static const struct qcom_cpufreq_soc_data qcom_soc_data = {
>  	.reg_enable = 0x0,
>  	.reg_freq_lut = 0x110,
>  	.reg_volt_lut = 0x114,
> +	.reg_current_vote = 0x704,
>  	.reg_perf_state = 0x920,
>  	.lut_row_size = 32,
>  };
> @@ -274,6 +355,35 @@ static const struct of_device_id qcom_cpufreq_hw_match[] = {
>  };
>  MODULE_DEVICE_TABLE(of, qcom_cpufreq_hw_match);
>  
> +static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index)
> +{
> +	struct qcom_cpufreq_data *data = policy->driver_data;
> +	struct platform_device *pdev = cpufreq_get_driver_data();
> +	struct device *cpu_dev = get_cpu_device(policy->cpu);
> +	char irq_name[15];
> +	int ret;
> +
> +	/*
> +	 * Look for LMh interrupt. If no interrupt line is specified /
> +	 * if there is an error, allow cpufreq to be enabled as usual.
> +	 */
> +	data->lmh_dcvs_irq = platform_get_irq(pdev, index);
> +	if (data->lmh_dcvs_irq <= 0)
> +		return data->lmh_dcvs_irq == -EPROBE_DEFER ? -EPROBE_DEFER : 0;
> +
> +	snprintf(irq_name, sizeof(irq_name), "dcvsh-irq-%u", policy->cpu);
> +	ret = devm_request_irq(cpu_dev, data->lmh_dcvs_irq, qcom_lmh_dcvs_handle_irq,
> +			       0, irq_name, data);
> +	if (ret) {
> +		dev_err(&pdev->dev, "Error %d registering irq %x\n", ret, data->lmh_dcvs_irq);

The irq number here won't have any meaning, and %x wouldn't be suitable.

How about ..."Error registering %s: %d\n", irq_name, ret); ?

> +		return 0;

This sounds like a problem, wouldn't it be suitable to treat it as a
problem?

> +	}
> +	data->policy = policy;

Afaict, no one is going to access data->policy unless devm_request_irq()
succeeds, and if it does and the interrupt fires immediately, it would be
too late to set it here. So better move it earlier.

> +	INIT_DEFERRABLE_WORK(&data->lmh_dcvs_poll_work, qcom_lmh_dcvs_poll);

What if the interrupt fires before you initialize the work? Better move
this higher up.

> +
> +	return 0;
> +}
> +
>  static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>  {
>  	struct platform_device *pdev = cpufreq_get_driver_data();
> @@ -370,6 +480,10 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>  			dev_warn(cpu_dev, "failed to enable boost: %d\n", ret);
>  	}
>  
> +	ret = qcom_cpufreq_hw_lmh_init(policy, index);
> +	if (ret)
> +		goto error;
> +
>  	return 0;
>  error:
>  	kfree(data);
> @@ -389,6 +503,10 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
>  
>  	dev_pm_opp_remove_all_dynamic(cpu_dev);
>  	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
> +	if (data->lmh_dcvs_irq > 0) {
> +		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);

As init/exit are called multiple times you should avoid the devm
variants.

Regards,
Bjorn

> +		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
> +	}
>  	kfree(policy->freq_table);
>  	kfree(data);
>  	iounmap(base);
> -- 
> 2.25.1
> 
Thara Gopinath July 13, 2021, 1:09 a.m. UTC | #4
On 7/10/21 12:57 AM, Bjorn Andersson wrote:
> On Thu 08 Jul 07:06 CDT 2021, Thara Gopinath wrote:
> 
>> Add interrupt support to notify the kernel of h/w initiated frequency
>> throttling by LMh. Convey this to the scheduler via the thermal pressure
>> interface.
>>
>> Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>
>> ---
>>
>> v2->v3:
>> 	- Cosmetic fixes from review comments on the list.
>> 	- Moved all LMh initializations to qcom_cpufreq_hw_lmh_init.
>> 	- Added freeing of LMh interrupt and cancelling the polling worker to
>> 	  qcom_cpufreq_hw_cpu_exit as per Viresh's suggestion.
>> 	- LMh interrupts are now tied to cpu dev and not cpufreq dev. This will be
>> 	  useful for future generations of SoCs where the same interrupt signals
>> 	  multiple cpu clusters.
>>
>> v1->v2:
>> 	- Introduced qcom_cpufreq_hw_lmh_init to consolidate LMh related initializations
>> 	  as per Viresh's review comment.
>> 	- Moved the code that restarts polling/re-enables the LMh interrupt to
>> 	  qcom_lmh_dcvs_notify, thereby simplifying the isr and timer callback, as per
>> 	  Viresh's suggestion.
>> 	- Dropped cpus from qcom_cpufreq_data and instead used the cpus from cpufreq_policy in
>> 	  qcom_lmh_dcvs_notify, as per Viresh's review comment.
>> 	- Dropped dt property qcom,support-lmh as per Bjorn's suggestion.
>> 	- Other minor/cosmetic fixes
>>
>>   drivers/cpufreq/qcom-cpufreq-hw.c | 118 ++++++++++++++++++++++++++++++
>>   1 file changed, 118 insertions(+)
>>
>> diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
>> index f86859bf76f1..bb5fc700d913 100644
>> --- a/drivers/cpufreq/qcom-cpufreq-hw.c
>> +++ b/drivers/cpufreq/qcom-cpufreq-hw.c
>> @@ -7,6 +7,7 @@
>>   #include <linux/cpufreq.h>
>>   #include <linux/init.h>
>>   #include <linux/interconnect.h>
>> +#include <linux/interrupt.h>
>>   #include <linux/kernel.h>
>>   #include <linux/module.h>
>>   #include <linux/of_address.h>
>> @@ -22,10 +23,13 @@
>>   #define CLK_HW_DIV			2
>>   #define LUT_TURBO_IND			1
>>   
>> +#define HZ_PER_KHZ			1000
>> +
>>   struct qcom_cpufreq_soc_data {
>>   	u32 reg_enable;
>>   	u32 reg_freq_lut;
>>   	u32 reg_volt_lut;
>> +	u32 reg_current_vote;
>>   	u32 reg_perf_state;
>>   	u8 lut_row_size;
>>   };
>> @@ -33,7 +37,10 @@ struct qcom_cpufreq_soc_data {
>>   struct qcom_cpufreq_data {
>>   	void __iomem *base;
>>   	struct resource *res;
>> +	struct delayed_work lmh_dcvs_poll_work;
> 
> How about dropping "lmh" from this variable name?
> 
> Perhaps "throttle_work" or something like that?
> 
>>   	const struct qcom_cpufreq_soc_data *soc_data;
>> +	struct cpufreq_policy *policy;
>> +	int lmh_dcvs_irq;
> 
> throttle_irq?

sounds good!

> 
>>   };
>>   
>>   static unsigned long cpu_hw_rate, xo_rate;
>> @@ -251,10 +258,84 @@ static void qcom_get_related_cpus(int index, struct cpumask *m)
>>   	}
>>   }
>>   
>> +static inline unsigned long qcom_lmh_vote_to_freq(u32 val)
>> +{
>> +	return (val & 0x3FF) * 19200;
>> +}
>> +
>> +static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
>> +{
>> +	struct cpufreq_policy *policy = data->policy;
>> +	struct dev_pm_opp *opp;
>> +	struct device *dev;
>> +	unsigned long max_capacity, capacity, freq_hz, throttled_freq;
>> +	unsigned int val, freq;
>> +
>> +	/*
>> +	 * Get the h/w throttled frequency, normalize it using the
>> +	 * registered opp table and use it to calculate thermal pressure.
>> +	 */
>> +	val = readl_relaxed(data->base + data->soc_data->reg_current_vote);
> 
> I would find it cleaner to move the readl() into the helper function, as
> you don't care about the register value, only the resulting frequency.

Ok.
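
[ For illustration, what folding the readl() into the helper could look
  like -- the function name is hypothetical: ]

	/* Read the current LMh vote and convert it straight to a frequency in kHz. */
	static unsigned long qcom_lmh_get_throttled_freq(struct qcom_cpufreq_data *data)
	{
		u32 lval = readl_relaxed(data->base + data->soc_data->reg_current_vote) & 0x3FF;

		/* The vote is in units of the 19.2 MHz XO clock, i.e. 19200 kHz. */
		return lval * 19200;
	}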

> 
>> +	freq = qcom_lmh_vote_to_freq(val);
>> +	freq_hz = freq * HZ_PER_KHZ;
>> +
>> +	dev = get_cpu_device(cpumask_first(policy->cpus));
>> +	opp = dev_pm_opp_find_freq_floor(dev, &freq_hz);
>> +	if (IS_ERR(opp) && PTR_ERR(opp) == -ERANGE)
>> +		opp = dev_pm_opp_find_freq_ceil(dev, &freq_hz);
>> +
>> +	throttled_freq = freq_hz / HZ_PER_KHZ;
>> +
>> +	/* Update thermal pressure */
>> +
>> +	max_capacity = arch_scale_cpu_capacity(cpumask_first(policy->cpus));
>> +	capacity = throttled_freq * max_capacity;
>> +	capacity /= policy->cpuinfo.max_freq;
> 
> Perhaps, to avoid overflows if this is ever used on a 32-bit platform,
> use:
> 
> 	mult_frac(max_capacity, throttled_freq, policy->cpuinfo.max_freq)

yep. sounds good.
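
[ i.e. the two statements above collapse into one; the kernel's mult_frac()
  helper computes x * n / d without the overflow-prone intermediate product: ]

	capacity = mult_frac(max_capacity, throttled_freq,
			     policy->cpuinfo.max_freq);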

> 
>> +
>> +	/* Don't pass boost capacity to scheduler */
>> +	if (capacity > max_capacity)
>> +		capacity = max_capacity;
>> +
>> +	arch_set_thermal_pressure(policy->cpus, max_capacity - capacity);
>> +
>> +	/*
>> +	 * If h/w throttled frequency is higher than what cpufreq has requested for, stop
>> +	 * polling and switch back to interrupt mechanism
>> +	 */
>> +
>> +	if (throttled_freq >= qcom_cpufreq_hw_get(cpumask_first(policy->cpus)))
>> +		/* Clear the existing interrupts and enable it back */
>> +		enable_irq(data->lmh_dcvs_irq);
>> +	else
>> +		mod_delayed_work(system_highpri_wq, &data->lmh_dcvs_poll_work,
>> +				 msecs_to_jiffies(10));
>> +}
>> +
>> +static void qcom_lmh_dcvs_poll(struct work_struct *work)
>> +{
>> +	struct qcom_cpufreq_data *data;
>> +
>> +	data = container_of(work, struct qcom_cpufreq_data, lmh_dcvs_poll_work.work);
>> +
>> +	qcom_lmh_dcvs_notify(data);
>> +}
>> +
>> +static irqreturn_t qcom_lmh_dcvs_handle_irq(int irq, void *data)
>> +{
>> +	struct qcom_cpufreq_data *c_data = data;
>> +
>> +	/* Disable interrupt and enable polling */
>> +	disable_irq_nosync(c_data->lmh_dcvs_irq);
>> +	qcom_lmh_dcvs_notify(c_data);
>> +
>> +	return 0;
>> +}
>> +
>>   static const struct qcom_cpufreq_soc_data qcom_soc_data = {
>>   	.reg_enable = 0x0,
>>   	.reg_freq_lut = 0x110,
>>   	.reg_volt_lut = 0x114,
>> +	.reg_current_vote = 0x704,
>>   	.reg_perf_state = 0x920,
>>   	.lut_row_size = 32,
>>   };
>> @@ -274,6 +355,35 @@ static const struct of_device_id qcom_cpufreq_hw_match[] = {
>>   };
>>   MODULE_DEVICE_TABLE(of, qcom_cpufreq_hw_match);
>>   
>> +static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index)
>> +{
>> +	struct qcom_cpufreq_data *data = policy->driver_data;
>> +	struct platform_device *pdev = cpufreq_get_driver_data();
>> +	struct device *cpu_dev = get_cpu_device(policy->cpu);
>> +	char irq_name[15];
>> +	int ret;
>> +
>> +	/*
>> +	 * Look for LMh interrupt. If no interrupt line is specified /
>> +	 * if there is an error, allow cpufreq to be enabled as usual.
>> +	 */
>> +	data->lmh_dcvs_irq = platform_get_irq(pdev, index);
>> +	if (data->lmh_dcvs_irq <= 0)
>> +		return data->lmh_dcvs_irq == -EPROBE_DEFER ? -EPROBE_DEFER : 0;
>> +
>> +	snprintf(irq_name, sizeof(irq_name), "dcvsh-irq-%u", policy->cpu);
>> +	ret = devm_request_irq(cpu_dev, data->lmh_dcvs_irq, qcom_lmh_dcvs_handle_irq,
>> +			       0, irq_name, data);
>> +	if (ret) {
>> +		dev_err(&pdev->dev, "Error %d registering irq %x\n", ret, data->lmh_dcvs_irq);
> 
> The irq number here won't have any meaning, and %x wouldn't be suitable.
> 
> How about ..."Error registering %s: %d\n", irq_name, ret); ?

ok.

> 
>> +		return 0;
> 
> This sounds like a problem, wouldn't it be suitable to treat it as a
> problem?

I thought a lot about this. My point is that even if LMh does not get
enabled for some reason, cpufreq should still be enabled. If I return an
error from here, cpufreq will be disabled.


> 
>> +	}
>> +	data->policy = policy;
> 
> Afaict, no one is going to access data->policy unless devm_request_irq()
> succeeds, and if it does and the interrupt fires immediately, it would be
> too late to set it here. So better move it earlier.
> 
>> +	INIT_DEFERRABLE_WORK(&data->lmh_dcvs_poll_work, qcom_lmh_dcvs_poll);
> 
> What if the interrupt fires before you initialize the work? Better move
> this higher up.

I will move this and the data->policy = policy assignment above, before
requesting the interrupt.
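
[ A sketch of the reordered init, per the two comments above; note that
  with a non-devm request_irq() the name buffer must also outlive the irq,
  so irq_name is assumed here to have moved into struct qcom_cpufreq_data: ]

	/* Set up everything the handler and worker may touch before the
	 * interrupt can possibly fire.
	 */
	data->policy = policy;
	INIT_DEFERRABLE_WORK(&data->lmh_dcvs_poll_work, qcom_lmh_dcvs_poll);

	/* irq_name assumed moved into qcom_cpufreq_data so it outlives the irq. */
	snprintf(data->irq_name, sizeof(data->irq_name), "dcvsh-irq-%u", policy->cpu);
	ret = request_irq(data->lmh_dcvs_irq, qcom_lmh_dcvs_handle_irq, 0,
			  data->irq_name, data);
	if (ret) {
		/* LMh is optional: warn, but let cpufreq come up as usual. */
		dev_err(&pdev->dev, "Error registering %s: %d\n", data->irq_name, ret);
		return 0;
	}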

> 
>> +
>> +	return 0;
>> +}
>> +
>>   static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>>   {
>>   	struct platform_device *pdev = cpufreq_get_driver_data();
>> @@ -370,6 +480,10 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>>   			dev_warn(cpu_dev, "failed to enable boost: %d\n", ret);
>>   	}
>>   
>> +	ret = qcom_cpufreq_hw_lmh_init(policy, index);
>> +	if (ret)
>> +		goto error;
>> +
>>   	return 0;
>>   error:
>>   	kfree(data);
>> @@ -389,6 +503,10 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
>>   
>>   	dev_pm_opp_remove_all_dynamic(cpu_dev);
>>   	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
>> +	if (data->lmh_dcvs_irq > 0) {
>> +		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);
> 
> As init/exit are called multiple times you should avoid the devm
> variants.

Yes. I think Viresh was also mentioning this. I will move to the
non-devm version.

> 
> Regards,
> Bjorn
> 
>> +		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
>> +	}
>>   	kfree(policy->freq_table);
>>   	kfree(data);
>>   	iounmap(base);
>> -- 
>> 2.25.1
>>
Thara Gopinath July 13, 2021, 1:18 a.m. UTC | #5
On 7/12/21 12:41 AM, Viresh Kumar wrote:
> On 09-07-21, 11:37, Thara Gopinath wrote:
>> On 7/9/21 2:46 AM, Viresh Kumar wrote:
>>>> @@ -389,6 +503,10 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
>>>>    	dev_pm_opp_remove_all_dynamic(cpu_dev);
>>>>    	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
>>>> +	if (data->lmh_dcvs_irq > 0) {
>>>> +		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);
>>>
>>> Why use devm variants here and when requesting the irq?
> 
> Missed this one?

Yep. I just replied to Bjorn's email on this. I will move to the
non-devm version.

> 
>>>
>>>> +		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
>>>> +	}
>>>
>>> Please move this to qcom_cpufreq_hw_lmh_exit() or something.
>>
>> Ok.
>>
>>>
>>> Now, with the sequence of disabling the interrupt, etc., I see a potential
>>> problem.
>>>
>>> CPU0                                    CPU1
>>>
>>> qcom_cpufreq_hw_cpu_exit()
>>> -> devm_free_irq();
>>>                                           qcom_lmh_dcvs_poll()
>>>                                           -> qcom_lmh_dcvs_notify()
>>>                                             -> enable_irq()
>>>
>>> -> cancel_delayed_work_sync();
>>>
>>>
>>> What will happen if enable_irq() gets called after freeing the irq?
>>> Not sure, but it looks like you will hit this then from manage.c:
>>>
>>> WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before
>>>                                        setup/request_irq: irq %u\n", irq))
>>>
>>> ?
>>>
>>> You got a chicken n egg problem :)
>>
>> Yes indeed! But it is also a very rare chicken and egg problem.
>> The scenario here is that the cpus are busy running a load, causing a
>> thermal overrun, and lmh is engaged. At the same time, for this issue to
>> be hit, the cpu is trying to exit/disable cpufreq.
> 
> Yes, it is a very specific case but it needs to be resolved anyway. You don't
> want to get this ever :)
> 
>> Calling
>> cancel_delayed_work_sync first could solve this issue, right?
>> cancel_delayed_work_sync guarantees the work is not pending on return,
>> even if it requeues itself. So once the delayed work is cancelled, the
>> interrupts can be safely disabled. Thoughts?
> 
> I don't think even that would provide such guarantees to you here, as there is
> a chance the work gets queued again because of an interrupt that triggers right
> after you cancel the work.
> 
> The basic way of solving such issues is that once you cancel something, you need
> to guarantee that it doesn't get triggered again, no matter what.
> 
> The problem I see here is with your design itself: both the delayed work
> and the irq can enable each other, so no matter which one you disable
> first, it won't be sufficient. You need to fix that design somehow.

So I really need the interrupt to fire and then the timer to kick in and 
take up the monitoring. I can think of introducing a variable 
is_disabled which is updated and read under a spinlock. 
qcom_cpufreq_hw_cpu_exit can hold the spinlock and set is_disabled to 
true prior to cancelling the work queue or disabling the interrupt. 
Before re-enabling the interrupt or re-queuing the work in 
qcom_lmh_dcvs_notify, is_disabled can be read and checked.

But does this problem not exist in target_index, fast_switch, etc. also?
One cpu can be disabling and the other one can be updating the target,
right?

>
Viresh Kumar July 13, 2021, 3:18 a.m. UTC | #6
On 12-07-21, 21:18, Thara Gopinath wrote:
> So I really need the interrupt to fire and then the timer to kick in and
> take up the monitoring. I can think of introducing a variable is_disabled
> which is updated and read under a spinlock. qcom_cpufreq_hw_cpu_exit can
> hold the spinlock and set is_disabled to true prior to cancelling the work
> queue or disabling the interrupt. Before re-enabling the interrupt or
> re-queuing the work in qcom_lmh_dcvs_notify, is_disabled can be read and
> checked.

Or you can make the lmh_dcvs_poll_work item a pointer and mark it NULL in exit,
with proper locking etc.

>> But does this problem not exist in target_index, fast_switch, etc. also? One
>> cpu can be disabling and the other one can be updating the target, right?

The race doesn't happen there as cpufreq_unregister_driver() takes care of
stopping everything before removing the policy. To be more precise, governor's
->stop() function is responsible for making sure that frequency won't be updated
any further.
Thara Gopinath July 14, 2021, 12:37 p.m. UTC | #7
On 7/12/21 11:18 PM, Viresh Kumar wrote:
> On 12-07-21, 21:18, Thara Gopinath wrote:
>> So I really need the interrupt to fire and then the timer to kick in and
>> take up the monitoring. I can think of introducing a variable is_disabled
>> which is updated and read under a spinlock. qcom_cpufreq_hw_cpu_exit can
>> hold the spinlock and set is_disabled to true prior to cancelling the work
>> queue or disabling the interrupt. Before re-enabling the interrupt or
>> re-queuing the work in qcom_lmh_dcvs_notify, is_disabled can be read and
>> checked.
> 
> Or you can make the lmh_dcvs_poll_work item a pointer and mark it NULL in exit,
> with proper locking etc.

Yes it could work. I will spin the next version with either this or 
introducing a new variable with locking.
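
[ A sketch of the flag variant -- a hypothetical cancel_throttle bool plus
  spinlock in qcom_cpufreq_data, so that neither the isr nor the worker can
  re-arm the other once exit has begun: ]

	/* Tail of qcom_lmh_dcvs_notify(): re-arm only if exit has not started. */
	spin_lock(&data->throttle_lock);
	if (data->cancel_throttle) {
		spin_unlock(&data->throttle_lock);
		return;
	}
	if (throttled_freq >= qcom_cpufreq_hw_get(cpumask_first(policy->cpus)))
		enable_irq(data->lmh_dcvs_irq);
	else
		mod_delayed_work(system_highpri_wq, &data->lmh_dcvs_poll_work,
				 msecs_to_jiffies(10));
	spin_unlock(&data->throttle_lock);

	static void qcom_cpufreq_hw_lmh_exit(struct qcom_cpufreq_data *data)
	{
		/* Block all future re-arming first ... */
		spin_lock(&data->throttle_lock);
		data->cancel_throttle = true;
		spin_unlock(&data->throttle_lock);

		/* ... after which the teardown order no longer matters. */
		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
		free_irq(data->lmh_dcvs_irq, data);
	}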

> 
>> But does this problem not exist in target_index, fast_switch, etc. also? One
>> cpu can be disabling and the other one can be updating the target, right?
> 
> The race doesn't happen there as cpufreq_unregister_driver() takes care of
> stopping everything before removing the policy. To be more precise, governor's
> ->stop() function is responsible for making sure that frequency won't be updated
> any further.
>
diff mbox series

Patch

diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index f86859bf76f1..bb5fc700d913 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -7,6 +7,7 @@ 
 #include <linux/cpufreq.h>
 #include <linux/init.h>
 #include <linux/interconnect.h>
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
@@ -22,10 +23,13 @@ 
 #define CLK_HW_DIV			2
 #define LUT_TURBO_IND			1
 
+#define HZ_PER_KHZ			1000
+
 struct qcom_cpufreq_soc_data {
 	u32 reg_enable;
 	u32 reg_freq_lut;
 	u32 reg_volt_lut;
+	u32 reg_current_vote;
 	u32 reg_perf_state;
 	u8 lut_row_size;
 };
@@ -33,7 +37,10 @@  struct qcom_cpufreq_soc_data {
 struct qcom_cpufreq_data {
 	void __iomem *base;
 	struct resource *res;
+	struct delayed_work lmh_dcvs_poll_work;
 	const struct qcom_cpufreq_soc_data *soc_data;
+	struct cpufreq_policy *policy;
+	int lmh_dcvs_irq;
 };
 
 static unsigned long cpu_hw_rate, xo_rate;
@@ -251,10 +258,84 @@  static void qcom_get_related_cpus(int index, struct cpumask *m)
 	}
 }
 
+static inline unsigned long qcom_lmh_vote_to_freq(u32 val)
+{
+	return (val & 0x3FF) * 19200;
+}
+
+static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
+{
+	struct cpufreq_policy *policy = data->policy;
+	struct dev_pm_opp *opp;
+	struct device *dev;
+	unsigned long max_capacity, capacity, freq_hz, throttled_freq;
+	unsigned int val, freq;
+
+	/*
+	 * Get the h/w throttled frequency, normalize it using the
+	 * registered opp table and use it to calculate thermal pressure.
+	 */
+	val = readl_relaxed(data->base + data->soc_data->reg_current_vote);
+	freq = qcom_lmh_vote_to_freq(val);
+	freq_hz = freq * HZ_PER_KHZ;
+
+	dev = get_cpu_device(cpumask_first(policy->cpus));
+	opp = dev_pm_opp_find_freq_floor(dev, &freq_hz);
+	if (IS_ERR(opp) && PTR_ERR(opp) == -ERANGE)
+		opp = dev_pm_opp_find_freq_ceil(dev, &freq_hz);
+
+	throttled_freq = freq_hz / HZ_PER_KHZ;
+
+	/* Update thermal pressure */
+
+	max_capacity = arch_scale_cpu_capacity(cpumask_first(policy->cpus));
+	capacity = throttled_freq * max_capacity;
+	capacity /= policy->cpuinfo.max_freq;
+
+	/* Don't pass boost capacity to scheduler */
+	if (capacity > max_capacity)
+		capacity = max_capacity;
+
+	arch_set_thermal_pressure(policy->cpus, max_capacity - capacity);
+
+	/*
+	 * If h/w throttled frequency is higher than what cpufreq has requested for, stop
+	 * polling and switch back to interrupt mechanism
+	 */
+
+	if (throttled_freq >= qcom_cpufreq_hw_get(cpumask_first(policy->cpus)))
+		/* Clear the existing interrupts and enable it back */
+		enable_irq(data->lmh_dcvs_irq);
+	else
+		mod_delayed_work(system_highpri_wq, &data->lmh_dcvs_poll_work,
+				 msecs_to_jiffies(10));
+}
+
+static void qcom_lmh_dcvs_poll(struct work_struct *work)
+{
+	struct qcom_cpufreq_data *data;
+
+	data = container_of(work, struct qcom_cpufreq_data, lmh_dcvs_poll_work.work);
+
+	qcom_lmh_dcvs_notify(data);
+}
+
+static irqreturn_t qcom_lmh_dcvs_handle_irq(int irq, void *data)
+{
+	struct qcom_cpufreq_data *c_data = data;
+
+	/* Disable interrupt and enable polling */
+	disable_irq_nosync(c_data->lmh_dcvs_irq);
+	qcom_lmh_dcvs_notify(c_data);
+
+	return 0;
+}
+
 static const struct qcom_cpufreq_soc_data qcom_soc_data = {
 	.reg_enable = 0x0,
 	.reg_freq_lut = 0x110,
 	.reg_volt_lut = 0x114,
+	.reg_current_vote = 0x704,
 	.reg_perf_state = 0x920,
 	.lut_row_size = 32,
 };
@@ -274,6 +355,35 @@  static const struct of_device_id qcom_cpufreq_hw_match[] = {
 };
 MODULE_DEVICE_TABLE(of, qcom_cpufreq_hw_match);
 
+static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index)
+{
+	struct qcom_cpufreq_data *data = policy->driver_data;
+	struct platform_device *pdev = cpufreq_get_driver_data();
+	struct device *cpu_dev = get_cpu_device(policy->cpu);
+	char irq_name[15];
+	int ret;
+
+	/*
+	 * Look for LMh interrupt. If no interrupt line is specified /
+	 * if there is an error, allow cpufreq to be enabled as usual.
+	 */
+	data->lmh_dcvs_irq = platform_get_irq(pdev, index);
+	if (data->lmh_dcvs_irq <= 0)
+		return data->lmh_dcvs_irq == -EPROBE_DEFER ? -EPROBE_DEFER : 0;
+
+	snprintf(irq_name, sizeof(irq_name), "dcvsh-irq-%u", policy->cpu);
+	ret = devm_request_irq(cpu_dev, data->lmh_dcvs_irq, qcom_lmh_dcvs_handle_irq,
+			       0, irq_name, data);
+	if (ret) {
+		dev_err(&pdev->dev, "Error %d registering irq %x\n", ret, data->lmh_dcvs_irq);
+		return 0;
+	}
+	data->policy = policy;
+	INIT_DEFERRABLE_WORK(&data->lmh_dcvs_poll_work, qcom_lmh_dcvs_poll);
+
+	return 0;
+}
+
 static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
 {
 	struct platform_device *pdev = cpufreq_get_driver_data();
@@ -370,6 +480,10 @@  static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
 			dev_warn(cpu_dev, "failed to enable boost: %d\n", ret);
 	}
 
+	ret = qcom_cpufreq_hw_lmh_init(policy, index);
+	if (ret)
+		goto error;
+
 	return 0;
 error:
 	kfree(data);
@@ -389,6 +503,10 @@  static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
 
 	dev_pm_opp_remove_all_dynamic(cpu_dev);
 	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
+	if (data->lmh_dcvs_irq > 0) {
+		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);
+		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
+	}
 	kfree(policy->freq_table);
 	kfree(data);
 	iounmap(base);