diff mbox series

[v2,2/5] thermal: devfreq_cooling: get a copy of device status

Message ID 20201118120358.17150-3-lukasz.luba@arm.com
State Superseded
Headers show
Series Thermal devfreq cooling improvements with Energy Model | expand

Commit Message

Lukasz Luba Nov. 18, 2020, 12:03 p.m. UTC
Devfreq cooling needs to now the correct status of the device in order
to operate. Do not rely on Devfreq last_status which might be a stale data
and get more up-to-date values of the load.

Devfreq framework can change the device status in the background. To
mitigate this situation make a copy of the status structure and use it
for internal calculations.

In addition this patch adds normalization function, which also makes sure
that whatever data comes from the device, it is in a sane range.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
---
 drivers/thermal/devfreq_cooling.c | 52 +++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 9 deletions(-)

Comments

Ionela Voinescu Dec. 2, 2020, 10:23 a.m. UTC | #1
On Wednesday 18 Nov 2020 at 12:03:55 (+0000), Lukasz Luba wrote:
> Devfreq cooling needs to now the correct status of the device in order

> to operate. Do not rely on Devfreq last_status which might be a stale data

> and get more up-to-date values of the load.

> 

> Devfreq framework can change the device status in the background. To

> mitigate this situation make a copy of the status structure and use it

> for internal calculations.

> 

> In addition this patch adds normalization function, which also makes sure

> that whatever data comes from the device, it is in a sane range.

> 

> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>

> ---

>  drivers/thermal/devfreq_cooling.c | 52 +++++++++++++++++++++++++------

>  1 file changed, 43 insertions(+), 9 deletions(-)

> 

> diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c

> index 659c0143c9f0..925523694462 100644

> --- a/drivers/thermal/devfreq_cooling.c

> +++ b/drivers/thermal/devfreq_cooling.c

> @@ -227,20 +227,46 @@ static inline unsigned long get_total_power(struct devfreq_cooling_device *dfc,

>  							       voltage);

>  }

>  

> +static void _normalize_load(struct devfreq_dev_status *status)

> +{

> +	/* Make some space if needed */

> +	if (status->busy_time > 0xffff) {

> +		status->busy_time >>= 10;

> +		status->total_time >>= 10;

> +	}

> +

> +	if (status->busy_time > status->total_time)

> +		status->busy_time = status->total_time;

> +

> +	status->busy_time *= 100;

> +	status->busy_time /= status->total_time ? : 1;

> +

> +	/* Avoid division by 0 */

> +	status->busy_time = status->busy_time ? : 1;

> +	status->total_time = 100;

> +}

>  

>  static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev,

>  					       u32 *power)

>  {

>  	struct devfreq_cooling_device *dfc = cdev->devdata;

>  	struct devfreq *df = dfc->devfreq;

> -	struct devfreq_dev_status *status = &df->last_status;

> +	struct devfreq_dev_status status;

>  	unsigned long state;

> -	unsigned long freq = status->current_frequency;

> +	unsigned long freq;

>  	unsigned long voltage;

>  	u32 dyn_power = 0;

>  	u32 static_power = 0;

>  	int res;

>  

> +	mutex_lock(&df->lock);

> +	res = df->profile->get_dev_status(df->dev.parent, &status);

> +	mutex_unlock(&df->lock);

> +	if (res)

> +		return res;

> +

> +	freq = status.current_frequency;

> +

>  	state = freq_get_state(dfc, freq);

>  	if (state == THERMAL_CSTATE_INVALID) {

>  		res = -EAGAIN;

> @@ -268,16 +294,18 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd

>  	} else {

>  		dyn_power = dfc->power_table[state];

>  

> +		_normalize_load(&status);

> +

>  		/* Scale dynamic power for utilization */

> -		dyn_power *= status->busy_time;

> -		dyn_power /= status->total_time;

> +		dyn_power *= status.busy_time;

> +		dyn_power /= status.total_time;

>  		/* Get static power */

>  		static_power = get_static_power(dfc, freq);

>  

>  		*power = dyn_power + static_power;

>  	}

>  

> -	trace_thermal_power_devfreq_get_power(cdev, status, freq, *power);

> +	trace_thermal_power_devfreq_get_power(cdev, &status, freq, *power);

>  

>  	return 0;

>  fail:

> @@ -309,14 +337,20 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,

>  {

>  	struct devfreq_cooling_device *dfc = cdev->devdata;

>  	struct devfreq *df = dfc->devfreq;

> -	struct devfreq_dev_status *status = &df->last_status;

> -	unsigned long freq = status->current_frequency;

> +	struct devfreq_dev_status status;

>  	unsigned long busy_time;

> +	unsigned long freq;

>  	s32 dyn_power;

>  	u32 static_power;

>  	s32 est_power;

>  	int i;

>  

> +	mutex_lock(&df->lock);

> +	status = df->last_status;

> +	mutex_unlock(&df->lock);

> +

> +	freq = status.current_frequency;

> +

>  	if (dfc->power_ops->get_real_power) {

>  		/* Scale for resource utilization */

>  		est_power = power * dfc->res_util;

> @@ -328,8 +362,8 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,

>  		dyn_power = dyn_power > 0 ? dyn_power : 0;

>  

>  		/* Scale dynamic power for utilization */

> -		busy_time = status->busy_time ?: 1;

> -		est_power = (dyn_power * status->total_time) / busy_time;

> +		busy_time = status.busy_time ?: 1;

> +		est_power = (dyn_power * status.total_time) / busy_time;

>  	}

>  

>  	/*

> -- 

> 2.17.1

> 


Reviewed-by: Ionela Voinescu <ionela.voinescu@arm.com>


Thanks,
Ionela.
Daniel Lezcano Dec. 3, 2020, 1:09 p.m. UTC | #2
On 18/11/2020 13:03, Lukasz Luba wrote:
> Devfreq cooling needs to now the correct status of the device in order

> to operate. Do not rely on Devfreq last_status which might be a stale data

> and get more up-to-date values of the load.

> 

> Devfreq framework can change the device status in the background. To

> mitigate this situation make a copy of the status structure and use it

> for internal calculations.

> 

> In addition this patch adds normalization function, which also makes sure

> that whatever data comes from the device, it is in a sane range.

> 

> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>

> ---

>  drivers/thermal/devfreq_cooling.c | 52 +++++++++++++++++++++++++------

>  1 file changed, 43 insertions(+), 9 deletions(-)

> 

> diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c

> index 659c0143c9f0..925523694462 100644

> --- a/drivers/thermal/devfreq_cooling.c

> +++ b/drivers/thermal/devfreq_cooling.c

> @@ -227,20 +227,46 @@ static inline unsigned long get_total_power(struct devfreq_cooling_device *dfc,

>  							       voltage);

>  }

>  

> +static void _normalize_load(struct devfreq_dev_status *status)

> +{

> +	/* Make some space if needed */

> +	if (status->busy_time > 0xffff) {

> +		status->busy_time >>= 10;

> +		status->total_time >>= 10;

> +	}

> +

> +	if (status->busy_time > status->total_time)

> +		status->busy_time = status->total_time;


How the condition above is possible?

> +	status->busy_time *= 100;

> +	status->busy_time /= status->total_time ? : 1;

> +

> +	/* Avoid division by 0 */

> +	status->busy_time = status->busy_time ? : 1;

> +	status->total_time = 100;


Why not base the normalization on 1024? and use an intermediate u64.

For example:

static u32 _normalize_load(struct devfreq_dev_status *status)
{
	u64 load = 0;

	/* Prevent divison by zero */
	if (!status->busy_time)
		return 0;

	/*
	 * Assuming status->total_time is always greater or equal
	 * to status->busy_time, it can not be equal to zero because
	 * of the test above
	 */
	load = status->busy_time * 1024;
	load /= status->total_time;

	/*
	 * load is always [1..1024[, so it can not be truncated by a
	 * u64 -> u32 coercive cast
	 */
	return (u32)load;
}


> +}

>  

>  static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev,

>  					       u32 *power)

>  {

>  	struct devfreq_cooling_device *dfc = cdev->devdata;

>  	struct devfreq *df = dfc->devfreq;

> -	struct devfreq_dev_status *status = &df->last_status;

> +	struct devfreq_dev_status status;

>  	unsigned long state;

> -	unsigned long freq = status->current_frequency;

> +	unsigned long freq;

>  	unsigned long voltage;

>  	u32 dyn_power = 0;

>  	u32 static_power = 0;

>  	int res;

>  

> +	mutex_lock(&df->lock);

> +	res = df->profile->get_dev_status(df->dev.parent, &status);

> +	mutex_unlock(&df->lock);

> +	if (res)

> +		return res;

> +

> +	freq = status.current_frequency;

> +

>  	state = freq_get_state(dfc, freq);

>  	if (state == THERMAL_CSTATE_INVALID) {

>  		res = -EAGAIN;

> @@ -268,16 +294,18 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd

>  	} else {

>  		dyn_power = dfc->power_table[state];

>  

> +		_normalize_load(&status);


		load = _normalize_load(&status);

> +

>  		/* Scale dynamic power for utilization */

> -		dyn_power *= status->busy_time;

> -		dyn_power /= status->total_time;

> +		dyn_power *= status.busy_time;

> +		dyn_power /= status.total_time;


		/*
		 * May be change dyn_power to a u64 to prevent overflow
		 * when multiplied by 'load'
		 */
		dyn_power = (dyn_power * load) / 1024;

>  		/* Get static power */

>  		static_power = get_static_power(dfc, freq);

>  

>  		*power = dyn_power + static_power;

>  	}

>  

> -	trace_thermal_power_devfreq_get_power(cdev, status, freq, *power);

> +	trace_thermal_power_devfreq_get_power(cdev, &status, freq, *power);

>  

>  	return 0;

>  fail:

> @@ -309,14 +337,20 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,

>  {

>  	struct devfreq_cooling_device *dfc = cdev->devdata;

>  	struct devfreq *df = dfc->devfreq;

> -	struct devfreq_dev_status *status = &df->last_status;

> -	unsigned long freq = status->current_frequency;

> +	struct devfreq_dev_status status;

>  	unsigned long busy_time;

> +	unsigned long freq;

>  	s32 dyn_power;

>  	u32 static_power;

>  	s32 est_power;

>  	int i;

>  

> +	mutex_lock(&df->lock);

> +	status = df->last_status;

> +	mutex_unlock(&df->lock);

> +

> +	freq = status.current_frequency;

> +

>  	if (dfc->power_ops->get_real_power) {

>  		/* Scale for resource utilization */

>  		est_power = power * dfc->res_util;

> @@ -328,8 +362,8 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,

>  		dyn_power = dyn_power > 0 ? dyn_power : 0;

>  

>  		/* Scale dynamic power for utilization */

> -		busy_time = status->busy_time ?: 1;

> -		est_power = (dyn_power * status->total_time) / busy_time;

> +		busy_time = status.busy_time ?: 1;

> +		est_power = (dyn_power * status.total_time) / busy_time;

>  	}

>  

>  	/*

> 



-- 
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
Lukasz Luba Dec. 3, 2020, 3:38 p.m. UTC | #3
On 12/3/20 1:09 PM, Daniel Lezcano wrote:
> On 18/11/2020 13:03, Lukasz Luba wrote:

>> Devfreq cooling needs to now the correct status of the device in order

>> to operate. Do not rely on Devfreq last_status which might be a stale data

>> and get more up-to-date values of the load.

>>

>> Devfreq framework can change the device status in the background. To

>> mitigate this situation make a copy of the status structure and use it

>> for internal calculations.

>>

>> In addition this patch adds normalization function, which also makes sure

>> that whatever data comes from the device, it is in a sane range.

>>

>> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>

>> ---

>>   drivers/thermal/devfreq_cooling.c | 52 +++++++++++++++++++++++++------

>>   1 file changed, 43 insertions(+), 9 deletions(-)

>>

>> diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c

>> index 659c0143c9f0..925523694462 100644

>> --- a/drivers/thermal/devfreq_cooling.c

>> +++ b/drivers/thermal/devfreq_cooling.c

>> @@ -227,20 +227,46 @@ static inline unsigned long get_total_power(struct devfreq_cooling_device *dfc,

>>   							       voltage);

>>   }

>>   

>> +static void _normalize_load(struct devfreq_dev_status *status)

>> +{

>> +	/* Make some space if needed */

>> +	if (status->busy_time > 0xffff) {

>> +		status->busy_time >>= 10;

>> +		status->total_time >>= 10;

>> +	}

>> +

>> +	if (status->busy_time > status->total_time)

>> +		status->busy_time = status->total_time;

> 

> How the condition above is possible?


They should, be checked by the driver, but I cannot trust
and have to check for all corner cases: (div by 0, overflow
one of them, etc). The busy_time and total_time are unsigned long,
which means 4B on 32bit machines.
If these values are coming from device counters, which count every
busy cycle and total cycles of a clock of a device running at e.g.
1GHz they would overflow every ~4s.

Normally IPA polling are 1s and 100ms, it's platform specific. But there
are also 'empty' periods when IPA sees temperature very low and does not
even call the .get_requested_power() callbacks for the cooling devices,
just grants max freq to all. This is problematic. I am investigating it
and will propose a solution for IPA soon.

I would avoid all of this if devfreq core would have default for all
devices a reliable polling timer... Let me check some possibilities also
for this case.

> 

>> +	status->busy_time *= 100;

>> +	status->busy_time /= status->total_time ? : 1;

>> +

>> +	/* Avoid division by 0 */

>> +	status->busy_time = status->busy_time ? : 1;

>> +	status->total_time = 100;

> 

> Why not base the normalization on 1024? and use an intermediate u64.


You are the 2nd reviewer who is asking this. I tried to keep 'load' as
in range [0, 100] since we also have 'load' in cpufreq cooling in this
range. Maybe I should switch to 1024 (Ionela was also asking for this).

> 

> For example:

> 

> static u32 _normalize_load(struct devfreq_dev_status *status)

> {

> 	u64 load = 0;

> 

> 	/* Prevent divison by zero */

> 	if (!status->busy_time)

> 		return 0;

> 

> 	/*

> 	 * Assuming status->total_time is always greater or equal

> 	 * to status->busy_time, it can not be equal to zero because

> 	 * of the test above

> 	 */

> 	load = status->busy_time * 1024;

> 	load /= status->total_time;


I wanted to avoid any divisions which involve 64bit var on 32bit
machine.

> 

> 	/*

> 	 * load is always [1..1024[, so it can not be truncated by a

> 	 * u64 -> u32 coercive cast

> 	 */

> 	return (u32)load;

> }

> 

> 

>> +}

>>   

>>   static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev,

>>   					       u32 *power)

>>   {

>>   	struct devfreq_cooling_device *dfc = cdev->devdata;

>>   	struct devfreq *df = dfc->devfreq;

>> -	struct devfreq_dev_status *status = &df->last_status;

>> +	struct devfreq_dev_status status;

>>   	unsigned long state;

>> -	unsigned long freq = status->current_frequency;

>> +	unsigned long freq;

>>   	unsigned long voltage;

>>   	u32 dyn_power = 0;

>>   	u32 static_power = 0;

>>   	int res;

>>   

>> +	mutex_lock(&df->lock);

>> +	res = df->profile->get_dev_status(df->dev.parent, &status);

>> +	mutex_unlock(&df->lock);

>> +	if (res)

>> +		return res;

>> +

>> +	freq = status.current_frequency;

>> +

>>   	state = freq_get_state(dfc, freq);

>>   	if (state == THERMAL_CSTATE_INVALID) {

>>   		res = -EAGAIN;

>> @@ -268,16 +294,18 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd

>>   	} else {

>>   		dyn_power = dfc->power_table[state];

>>   

>> +		_normalize_load(&status);

> 

> 		load = _normalize_load(&status);

> 

>> +

>>   		/* Scale dynamic power for utilization */

>> -		dyn_power *= status->busy_time;

>> -		dyn_power /= status->total_time;

>> +		dyn_power *= status.busy_time;

>> +		dyn_power /= status.total_time;

> 

> 		/*

> 		 * May be change dyn_power to a u64 to prevent overflow

> 		 * when multiplied by 'load'

> 		 */

> 		dyn_power = (dyn_power * load) / 1024;


dyn_power value from EM should fit in 16bit [1], so we should be safe.

I will experiment with the 1024 code and check some corner cases.

Thank you Daniel for the review!

Regards,
Lukasz

[1] 
https://elixir.bootlin.com/linux/v5.10-rc5/source/kernel/power/energy_model.c#L135
Daniel Lezcano Dec. 3, 2020, 4:09 p.m. UTC | #4
On 03/12/2020 16:38, Lukasz Luba wrote:
> 

> 

> On 12/3/20 1:09 PM, Daniel Lezcano wrote:

>> On 18/11/2020 13:03, Lukasz Luba wrote:

>>> Devfreq cooling needs to now the correct status of the device in order

>>> to operate. Do not rely on Devfreq last_status which might be a stale

>>> data

>>> and get more up-to-date values of the load.

>>>

>>> Devfreq framework can change the device status in the background. To

>>> mitigate this situation make a copy of the status structure and use it

>>> for internal calculations.

>>>

>>> In addition this patch adds normalization function, which also makes

>>> sure

>>> that whatever data comes from the device, it is in a sane range.

>>>

>>> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>

>>> ---

>>>   drivers/thermal/devfreq_cooling.c | 52 +++++++++++++++++++++++++------

>>>   1 file changed, 43 insertions(+), 9 deletions(-)

>>>

>>> diff --git a/drivers/thermal/devfreq_cooling.c

>>> b/drivers/thermal/devfreq_cooling.c

>>> index 659c0143c9f0..925523694462 100644

>>> --- a/drivers/thermal/devfreq_cooling.c

>>> +++ b/drivers/thermal/devfreq_cooling.c

>>> @@ -227,20 +227,46 @@ static inline unsigned long

>>> get_total_power(struct devfreq_cooling_device *dfc,

>>>                                      voltage);

>>>   }

>>>   +static void _normalize_load(struct devfreq_dev_status *status)

>>> +{

>>> +    /* Make some space if needed */

>>> +    if (status->busy_time > 0xffff) {

>>> +        status->busy_time >>= 10;

>>> +        status->total_time >>= 10;

>>> +    }

>>> +

>>> +    if (status->busy_time > status->total_time)

>>> +        status->busy_time = status->total_time;

>>

>> How the condition above is possible?

> 

> They should, be checked by the driver, but I cannot trust

> and have to check for all corner cases: (div by 0, overflow

> one of them, etc). The busy_time and total_time are unsigned long,

> which means 4B on 32bit machines.

> If these values are coming from device counters, which count every

> busy cycle and total cycles of a clock of a device running at e.g.

> 1GHz they would overflow every ~4s.


I don't think it is up to this routine to check the driver is correctly
implemented, especially at every call to get_requested_power.

If the normalization ends up by doing this kind of thing, there is
certainly something wrong in the 'status' computation to be fixed before
submitting this series.


> Normally IPA polling are 1s and 100ms, it's platform specific. But there

> are also 'empty' periods when IPA sees temperature very low and does not

> even call the .get_requested_power() callbacks for the cooling devices,

> just grants max freq to all. This is problematic. I am investigating it

> and will propose a solution for IPA soon.

> 

> I would avoid all of this if devfreq core would have default for all

> devices a reliable polling timer... Let me check some possibilities also

> for this case.


Ok, may be create an API to compute the 'idle,busy,total times' to be
used by the different the devfreq drivers and then fix the overflow in
this common place.

>>> +    status->busy_time *= 100;

>>> +    status->busy_time /= status->total_time ? : 1;

>>> +

>>> +    /* Avoid division by 0 */

>>> +    status->busy_time = status->busy_time ? : 1;

>>> +    status->total_time = 100;

>>

>> Why not base the normalization on 1024? and use an intermediate u64.

> 

> You are the 2nd reviewer who is asking this. I tried to keep 'load' as

> in range [0, 100] since we also have 'load' in cpufreq cooling in this

> range. Maybe I should switch to 1024 (Ionela was also asking for this).


Well it is common practice to compute normalization with 1024 because
the division is a bit shift and the compiler optimize the code very well
with that value.




-- 
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
Lukasz Luba Dec. 7, 2020, 12:41 p.m. UTC | #5
On 12/3/20 4:09 PM, Daniel Lezcano wrote:
> On 03/12/2020 16:38, Lukasz Luba wrote:

>>

>>

>> On 12/3/20 1:09 PM, Daniel Lezcano wrote:

>>> On 18/11/2020 13:03, Lukasz Luba wrote:

>>>> Devfreq cooling needs to now the correct status of the device in order

>>>> to operate. Do not rely on Devfreq last_status which might be a stale

>>>> data

>>>> and get more up-to-date values of the load.

>>>>

>>>> Devfreq framework can change the device status in the background. To

>>>> mitigate this situation make a copy of the status structure and use it

>>>> for internal calculations.

>>>>

>>>> In addition this patch adds normalization function, which also makes

>>>> sure

>>>> that whatever data comes from the device, it is in a sane range.

>>>>

>>>> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>

>>>> ---

>>>>    drivers/thermal/devfreq_cooling.c | 52 +++++++++++++++++++++++++------

>>>>    1 file changed, 43 insertions(+), 9 deletions(-)

>>>>

>>>> diff --git a/drivers/thermal/devfreq_cooling.c

>>>> b/drivers/thermal/devfreq_cooling.c

>>>> index 659c0143c9f0..925523694462 100644

>>>> --- a/drivers/thermal/devfreq_cooling.c

>>>> +++ b/drivers/thermal/devfreq_cooling.c

>>>> @@ -227,20 +227,46 @@ static inline unsigned long

>>>> get_total_power(struct devfreq_cooling_device *dfc,

>>>>                                       voltage);

>>>>    }

>>>>    +static void _normalize_load(struct devfreq_dev_status *status)

>>>> +{

>>>> +    /* Make some space if needed */

>>>> +    if (status->busy_time > 0xffff) {

>>>> +        status->busy_time >>= 10;

>>>> +        status->total_time >>= 10;

>>>> +    }

>>>> +

>>>> +    if (status->busy_time > status->total_time)

>>>> +        status->busy_time = status->total_time;

>>>

>>> How the condition above is possible?

>>

>> They should, be checked by the driver, but I cannot trust

>> and have to check for all corner cases: (div by 0, overflow

>> one of them, etc). The busy_time and total_time are unsigned long,

>> which means 4B on 32bit machines.

>> If these values are coming from device counters, which count every

>> busy cycle and total cycles of a clock of a device running at e.g.

>> 1GHz they would overflow every ~4s.

> 

> I don't think it is up to this routine to check the driver is correctly

> implemented, especially at every call to get_requested_power.

> 

> If the normalization ends up by doing this kind of thing, there is

> certainly something wrong in the 'status' computation to be fixed before

> submitting this series.

> 

> 

>> Normally IPA polling are 1s and 100ms, it's platform specific. But there

>> are also 'empty' periods when IPA sees temperature very low and does not

>> even call the .get_requested_power() callbacks for the cooling devices,

>> just grants max freq to all. This is problematic. I am investigating it

>> and will propose a solution for IPA soon.

>>

>> I would avoid all of this if devfreq core would have default for all

>> devices a reliable polling timer... Let me check some possibilities also

>> for this case.

> 

> Ok, may be create an API to compute the 'idle,busy,total times' to be

> used by the different the devfreq drivers and then fix the overflow in

> this common place.


Yes, I have this plan, but I have to close this patch series. To go
forward with this, I will drop the normalization function and will keep
only the code of safe copy of the 'status', so using busy_time and
total_time will be safe.

I will address this computation and normalization in different patch
series. There might be a need of a new API as you pointed out, which
is out-of-scope of this patch set.

> 

>>>> +    status->busy_time *= 100;

>>>> +    status->busy_time /= status->total_time ? : 1;

>>>> +

>>>> +    /* Avoid division by 0 */

>>>> +    status->busy_time = status->busy_time ? : 1;

>>>> +    status->total_time = 100;

>>>

>>> Why not base the normalization on 1024? and use an intermediate u64.

>>

>> You are the 2nd reviewer who is asking this. I tried to keep 'load' as

>> in range [0, 100] since we also have 'load' in cpufreq cooling in this

>> range. Maybe I should switch to 1024 (Ionela was also asking for this).

> 

> Well it is common practice to compute normalization with 1024 because

> the division is a bit shift and the compiler optimize the code very well

> with that value.

> 


I will keep this 1024 in mind for the next topic series.

Regards,
Lukasz
Lukasz Luba Dec. 8, 2020, 2:20 p.m. UTC | #6
Hi Daniel,

On 12/7/20 12:41 PM, Lukasz Luba wrote:
> 

> 

> On 12/3/20 4:09 PM, Daniel Lezcano wrote:

>> On 03/12/2020 16:38, Lukasz Luba wrote:

>>>

>>>

>>> On 12/3/20 1:09 PM, Daniel Lezcano wrote:

>>>> On 18/11/2020 13:03, Lukasz Luba wrote:

>>>>> Devfreq cooling needs to now the correct status of the device in order

>>>>> to operate. Do not rely on Devfreq last_status which might be a stale

>>>>> data

>>>>> and get more up-to-date values of the load.

>>>>>

>>>>> Devfreq framework can change the device status in the background. To

>>>>> mitigate this situation make a copy of the status structure and use it

>>>>> for internal calculations.

>>>>>

>>>>> In addition this patch adds normalization function, which also makes

>>>>> sure

>>>>> that whatever data comes from the device, it is in a sane range.

>>>>>

>>>>> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>

>>>>> ---

>>>>>    drivers/thermal/devfreq_cooling.c | 52 

>>>>> +++++++++++++++++++++++++------

>>>>>    1 file changed, 43 insertions(+), 9 deletions(-)

>>>>>

>>>>> diff --git a/drivers/thermal/devfreq_cooling.c

>>>>> b/drivers/thermal/devfreq_cooling.c

>>>>> index 659c0143c9f0..925523694462 100644

>>>>> --- a/drivers/thermal/devfreq_cooling.c

>>>>> +++ b/drivers/thermal/devfreq_cooling.c

>>>>> @@ -227,20 +227,46 @@ static inline unsigned long

>>>>> get_total_power(struct devfreq_cooling_device *dfc,

>>>>>                                       voltage);

>>>>>    }

>>>>>    +static void _normalize_load(struct devfreq_dev_status *status)

>>>>> +{

>>>>> +    /* Make some space if needed */

>>>>> +    if (status->busy_time > 0xffff) {

>>>>> +        status->busy_time >>= 10;

>>>>> +        status->total_time >>= 10;

>>>>> +    }

>>>>> +

>>>>> +    if (status->busy_time > status->total_time)

>>>>> +        status->busy_time = status->total_time;

>>>>

>>>> How the condition above is possible?

>>>

>>> They should, be checked by the driver, but I cannot trust

>>> and have to check for all corner cases: (div by 0, overflow

>>> one of them, etc). The busy_time and total_time are unsigned long,

>>> which means 4B on 32bit machines.

>>> If these values are coming from device counters, which count every

>>> busy cycle and total cycles of a clock of a device running at e.g.

>>> 1GHz they would overflow every ~4s.

>>

>> I don't think it is up to this routine to check the driver is correctly

>> implemented, especially at every call to get_requested_power.

>>

>> If the normalization ends up by doing this kind of thing, there is

>> certainly something wrong in the 'status' computation to be fixed before

>> submitting this series.

>>

>>

>>> Normally IPA polling are 1s and 100ms, it's platform specific. But there

>>> are also 'empty' periods when IPA sees temperature very low and does not

>>> even call the .get_requested_power() callbacks for the cooling devices,

>>> just grants max freq to all. This is problematic. I am investigating it

>>> and will propose a solution for IPA soon.

>>>

>>> I would avoid all of this if devfreq core would have default for all

>>> devices a reliable polling timer... Let me check some possibilities also

>>> for this case.

>>

>> Ok, may be create an API to compute the 'idle,busy,total times' to be

>> used by the different the devfreq drivers and then fix the overflow in

>> this common place.

> 

> Yes, I have this plan, but I have to close this patch series. To go

> forward with this, I will drop the normalization function and will keep

> only the code of safe copy of the 'status', so using busy_time and

> total_time will be safe.


I did experiments and actually I cannot drop this function. Drivers can
feed total_time and busy_time which are in nanoseconds, e.g. [1] 50ms =>
50.000.000ns which is then when multiplied by 1024  and exceed the u32.
I want to avoid 64bit variables and divisions, so shifting them earlier
would help. IMHO it does not harm this devfreq cooling to make that
check and handle ns values.

I am going to use the normalization into 0..1024 as you and Ionela
suggested.
I will also drop the direct device status check. That would be a
different patch series. In that patch set I will try to come with a
generic solution and some API.

Regards,
Lukasz

[1] 
https://elixir.bootlin.com/linux/v5.10-rc5/source/drivers/gpu/drm/panfrost/panfrost_devfreq.c#L66
diff mbox series

Patch

diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
index 659c0143c9f0..925523694462 100644
--- a/drivers/thermal/devfreq_cooling.c
+++ b/drivers/thermal/devfreq_cooling.c
@@ -227,20 +227,46 @@  static inline unsigned long get_total_power(struct devfreq_cooling_device *dfc,
 							       voltage);
 }
 
+static void _normalize_load(struct devfreq_dev_status *status)
+{
+	/* Make some space if needed */
+	if (status->busy_time > 0xffff) {
+		status->busy_time >>= 10;
+		status->total_time >>= 10;
+	}
+
+	if (status->busy_time > status->total_time)
+		status->busy_time = status->total_time;
+
+	status->busy_time *= 100;
+	status->busy_time /= status->total_time ? : 1;
+
+	/* Avoid division by 0 */
+	status->busy_time = status->busy_time ? : 1;
+	status->total_time = 100;
+}
 
 static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev,
 					       u32 *power)
 {
 	struct devfreq_cooling_device *dfc = cdev->devdata;
 	struct devfreq *df = dfc->devfreq;
-	struct devfreq_dev_status *status = &df->last_status;
+	struct devfreq_dev_status status;
 	unsigned long state;
-	unsigned long freq = status->current_frequency;
+	unsigned long freq;
 	unsigned long voltage;
 	u32 dyn_power = 0;
 	u32 static_power = 0;
 	int res;
 
+	mutex_lock(&df->lock);
+	res = df->profile->get_dev_status(df->dev.parent, &status);
+	mutex_unlock(&df->lock);
+	if (res)
+		return res;
+
+	freq = status.current_frequency;
+
 	state = freq_get_state(dfc, freq);
 	if (state == THERMAL_CSTATE_INVALID) {
 		res = -EAGAIN;
@@ -268,16 +294,18 @@  static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
 	} else {
 		dyn_power = dfc->power_table[state];
 
+		_normalize_load(&status);
+
 		/* Scale dynamic power for utilization */
-		dyn_power *= status->busy_time;
-		dyn_power /= status->total_time;
+		dyn_power *= status.busy_time;
+		dyn_power /= status.total_time;
 		/* Get static power */
 		static_power = get_static_power(dfc, freq);
 
 		*power = dyn_power + static_power;
 	}
 
-	trace_thermal_power_devfreq_get_power(cdev, status, freq, *power);
+	trace_thermal_power_devfreq_get_power(cdev, &status, freq, *power);
 
 	return 0;
 fail:
@@ -309,14 +337,20 @@  static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
 {
 	struct devfreq_cooling_device *dfc = cdev->devdata;
 	struct devfreq *df = dfc->devfreq;
-	struct devfreq_dev_status *status = &df->last_status;
-	unsigned long freq = status->current_frequency;
+	struct devfreq_dev_status status;
 	unsigned long busy_time;
+	unsigned long freq;
 	s32 dyn_power;
 	u32 static_power;
 	s32 est_power;
 	int i;
 
+	mutex_lock(&df->lock);
+	status = df->last_status;
+	mutex_unlock(&df->lock);
+
+	freq = status.current_frequency;
+
 	if (dfc->power_ops->get_real_power) {
 		/* Scale for resource utilization */
 		est_power = power * dfc->res_util;
@@ -328,8 +362,8 @@  static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
 		dyn_power = dyn_power > 0 ? dyn_power : 0;
 
 		/* Scale dynamic power for utilization */
-		busy_time = status->busy_time ?: 1;
-		est_power = (dyn_power * status->total_time) / busy_time;
+		busy_time = status.busy_time ?: 1;
+		est_power = (dyn_power * status.total_time) / busy_time;
 	}
 
 	/*