[2/2] thermal: cpufreq_cooling: Reuse effective_cpu_util()

Message ID b051b42f0c4f36d7177978e090c6a85df17922c6.1594707424.git.viresh.kumar@linaro.org
State New
Headers show
Series
  • cpufreq_cooling: Get effective CPU utilization from scheduler
Related show

Commit Message

Viresh Kumar July 14, 2020, 6:36 a.m.
Several parts of the kernel are already using the effective CPU
utilization to get the current load on the CPU, do the same here instead
of depending on the idle time of the CPU, which isn't that accurate
comparatively.

Note that, this (and CPU frequency scaling in general) doesn't work that
well with idle injection as that is done from rt threads and is counted
as load while it tries to do quite the opposite. That should be solved
separately though.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>

---
 drivers/thermal/cpufreq_cooling.c | 65 +++++++------------------------
 1 file changed, 15 insertions(+), 50 deletions(-)

-- 
2.25.0.rc1.19.g042ed3e048af

Comments

Peter Zijlstra July 14, 2020, 8:23 a.m. | #1
On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote:
> Several parts of the kernel are already using the effective CPU

> utilization to get the current load on the CPU, do the same here instead

> of depending on the idle time of the CPU, which isn't that accurate

> comparatively.

> 

> Note that, this (and CPU frequency scaling in general) doesn't work that

> well with idle injection as that is done from rt threads and is counted

> as load while it tries to do quite the opposite. That should be solved

> separately though.

> 

> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>

> ---

>  drivers/thermal/cpufreq_cooling.c | 65 +++++++------------------------

>  1 file changed, 15 insertions(+), 50 deletions(-)

> 

> diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c

> index 6c0e1b053126..74340b2b0da7 100644

> --- a/drivers/thermal/cpufreq_cooling.c

> +++ b/drivers/thermal/cpufreq_cooling.c

> @@ -23,6 +23,7 @@

>  #include <linux/thermal.h>

>  

>  #include <trace/events/thermal.h>

> +#include "../../kernel/sched/sched.h"


Hard NAK on that. Just writing it should've been a clue.
Rafael J. Wysocki July 14, 2020, 1:05 p.m. | #2
On Tue, Jul 14, 2020 at 8:37 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>

> Several parts of the kernel are already using the effective CPU

> utilization to get the current load on the CPU, do the same here instead

> of depending on the idle time of the CPU, which isn't that accurate

> comparatively.

>

> Note that, this (and CPU frequency scaling in general) doesn't work that

> well with idle injection as that is done from rt threads and is counted

> as load while it tries to do quite the opposite. That should be solved

> separately though.

>

> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>

> ---

>  drivers/thermal/cpufreq_cooling.c | 65 +++++++------------------------

>  1 file changed, 15 insertions(+), 50 deletions(-)

>

> diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c

> index 6c0e1b053126..74340b2b0da7 100644

> --- a/drivers/thermal/cpufreq_cooling.c

> +++ b/drivers/thermal/cpufreq_cooling.c

> @@ -23,6 +23,7 @@

>  #include <linux/thermal.h>

>

>  #include <trace/events/thermal.h>

> +#include "../../kernel/sched/sched.h"

>

>  /*

>   * Cooling state <-> CPUFreq frequency

> @@ -38,16 +39,6 @@

>   *     ...

>   */

>

> -/**

> - * struct time_in_idle - Idle time stats

> - * @time: previous reading of the absolute time that this cpu was idle

> - * @timestamp: wall time of the last invocation of get_cpu_idle_time_us()

> - */

> -struct time_in_idle {

> -       u64 time;

> -       u64 timestamp;

> -};

> -

>  /**

>   * struct cpufreq_cooling_device - data for cooling device with cpufreq

>   * @id: unique integer value corresponding to each cpufreq_cooling_device

> @@ -62,7 +53,6 @@ struct time_in_idle {

>   *     registered cooling device.

>   * @policy: cpufreq policy.

>   * @node: list_head to link all cpufreq_cooling_device together.

> - * @idle_time: idle time stats

>   * @qos_req: PM QoS contraint to apply

>   *

>   * This structure is required for keeping information of each registered

> @@ -76,7 +66,6 @@ struct cpufreq_cooling_device {

>         struct em_perf_domain *em;

>         struct cpufreq_policy *policy;

>         struct list_head node;

> -       struct time_in_idle *idle_time;

>         struct freq_qos_request qos_req;

>  };

>

> @@ -132,34 +121,21 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,

>  }

>

>  /**

> - * get_load() - get load for a cpu since last updated

> + * get_load() - get current load for a cpu

>   * @cpufreq_cdev:      &struct cpufreq_cooling_device for this cpu

>   * @cpu:       cpu number

> - * @cpu_idx:   index of the cpu in time_in_idle*

> + * @cpu_idx:   index of the cpu

>   *

> - * Return: The average load of cpu @cpu in percentage since this

> - * function was last called.

> + * Return: The current load of cpu @cpu in percentage.

>   */

>  static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

>                     int cpu_idx)

>  {

> -       u32 load;

> -       u64 now, now_idle, delta_time, delta_idle;

> -       struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx];

> -

> -       now_idle = get_cpu_idle_time(cpu, &now, 0);

> -       delta_idle = now_idle - idle_time->time;

> -       delta_time = now - idle_time->timestamp;

> +       unsigned long util = cpu_util_cfs(cpu_rq(cpu));

> +       unsigned long max = arch_scale_cpu_capacity(cpu);

>

> -       if (delta_time <= delta_idle)

> -               load = 0;

> -       else

> -               load = div64_u64(100 * (delta_time - delta_idle), delta_time);

> -

> -       idle_time->time = now_idle;

> -       idle_time->timestamp = now;

> -

> -       return load;

> +       util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);


Hmm.

It doesn't look like cpufreq_cdev and cpu_idx are needed any more in
this function, so maybe drop them from the arg list?  And then there
won't be anything specific to CPU cooling in this function, so maybe
move it to sched and export it from there properly?

Also it looks like max could be passed to it along with the CPU number
instead of being always taken as arch_scale_cpu_capacity(cpu).

> +       return (util * 100) / max;

>  }

>

>  /**
Viresh Kumar July 15, 2020, 7:32 a.m. | #3
On 14-07-20, 15:05, Rafael J. Wysocki wrote:
> On Tue, Jul 14, 2020 at 8:37 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:

> >  static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

> >                     int cpu_idx)

> >  {

> > -       u32 load;

> > -       u64 now, now_idle, delta_time, delta_idle;

> > -       struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx];

> > -

> > -       now_idle = get_cpu_idle_time(cpu, &now, 0);

> > -       delta_idle = now_idle - idle_time->time;

> > -       delta_time = now - idle_time->timestamp;

> > +       unsigned long util = cpu_util_cfs(cpu_rq(cpu));

> > +       unsigned long max = arch_scale_cpu_capacity(cpu);

> >

> > -       if (delta_time <= delta_idle)

> > -               load = 0;

> > -       else

> > -               load = div64_u64(100 * (delta_time - delta_idle), delta_time);

> > -

> > -       idle_time->time = now_idle;

> > -       idle_time->timestamp = now;

> > -

> > -       return load;

> > +       util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);

> 

> Hmm.

> 

> It doesn't look like cpufreq_cdev and cpu_idx are needed any more in

> this function, so maybe drop them from the arg list?


Right.

> And then there

> won't be anything specific to CPU cooling in this function, so maybe

> move it to sched and export it from there properly?


There isn't a lot happening in this routine right now TBH and am not
sure if it is really worth it to have a separate routine for this
(unless we can get rid of something for all the callers, like avoiding
a call to arch_scale_cpu_capacity() and then naming it
effective_cpu_load()).

> Also it looks like max could be passed to it along with the CPU number

> instead of being always taken as arch_scale_cpu_capacity(cpu).


I am not sure what you are suggesting here. What will be the value of
max if not arch_scale_cpu_capacity() ?

-- 
viresh
Rafael J. Wysocki July 15, 2020, 12:47 p.m. | #4
On Wed, Jul 15, 2020 at 9:32 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>

> On 14-07-20, 15:05, Rafael J. Wysocki wrote:

> > On Tue, Jul 14, 2020 at 8:37 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:

> > >  static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

> > >                     int cpu_idx)

> > >  {

> > > -       u32 load;

> > > -       u64 now, now_idle, delta_time, delta_idle;

> > > -       struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx];

> > > -

> > > -       now_idle = get_cpu_idle_time(cpu, &now, 0);

> > > -       delta_idle = now_idle - idle_time->time;

> > > -       delta_time = now - idle_time->timestamp;

> > > +       unsigned long util = cpu_util_cfs(cpu_rq(cpu));

> > > +       unsigned long max = arch_scale_cpu_capacity(cpu);

> > >

> > > -       if (delta_time <= delta_idle)

> > > -               load = 0;

> > > -       else

> > > -               load = div64_u64(100 * (delta_time - delta_idle), delta_time);

> > > -

> > > -       idle_time->time = now_idle;

> > > -       idle_time->timestamp = now;

> > > -

> > > -       return load;

> > > +       util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);

> >

> > Hmm.

> >

> > It doesn't look like cpufreq_cdev and cpu_idx are needed any more in

> > this function, so maybe drop them from the arg list?

>

> Right.

>

> > And then there

> > won't be anything specific to CPU cooling in this function, so maybe

> > move it to sched and export it from there properly?

>

> There isn't a lot happening in this routine right now TBH and am not

> sure if it is really worth it to have a separate routine for this

> (unless we can get rid of something for all the callers, like avoiding

> a call to arch_scale_cpu_capacity() and then naming it

> effective_cpu_load().


Maybe yes.  Or sched_cpu_load() to stand for "the effective CPU load
as seen by the scheduler".

But I'm not sure if percent is the best unit to return from it.  Maybe
make it return something like (util << SCHED_CAPACITY_SHIFT) /
arch_scale_cpu_capacity(cpu).

> > Also it looks like max could be passed to it along with the CPU number

> > instead of being always taken as arch_scale_cpu_capacity(cpu).

>

> I am not sure what you are suggesting here. What will be the value of

> max if not arch_scale_cpu_capacity() ?


I was thinking about a value supplied by the caller, eg.
sched_cpu_load(cpu, max), but if all callers would pass
arch_scale_cpu_capacity(cpu) as max anyway, then it's better to simply
call it from there.
Peter Zijlstra July 16, 2020, 11:56 a.m. | #5
On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote:
>  /**

> + * get_load() - get current load for a cpu

>   * @cpufreq_cdev:	&struct cpufreq_cooling_device for this cpu

>   * @cpu:	cpu number

> + * @cpu_idx:	index of the cpu

>   *

> + * Return: The current load of cpu @cpu in percentage.

>   */

>  static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

>  		    int cpu_idx)

>  {

> +	unsigned long util = cpu_util_cfs(cpu_rq(cpu));

> +	unsigned long max = arch_scale_cpu_capacity(cpu);

>  

> +	util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);

> +	return (util * 100) / max;

>  }


So there's a number of things... let me recap a bunch of things that
got mentioned on IRC earlier this week and then continue from there..

So IPA* (or any other thermal governor) needs energy estimates for the
various managed devices, cpufreq_cooling, being the driver for the CPU
device, needs to provide that and in return receives feedback on how
much energy it is allowed to consume, cpufreq_cooling then dynamically
enables/disables OPP states.

There are actually two methods the thermal governor will use:
get_real_power() and get_requested_power().

The first isn't used anywhere in mainline, but could be implemented on
hardware that has energy counters (like say x86 RAPL).

The second attempts to guesstimate power, and is the subject of this
patch.

Currently cpufreq_cooling appears to estimate the CPU energy usage by
calculating the percentage of idle time using the per-cpu cpustat stuff,
which is pretty horrific.

This patch then attempts to improve upon that by using the scheduler's
cpu_util(ENERGY_UTIL) estimate, which is also used to select OPP state
and improves upon avg idle. This should be a big improvement as higher
frequency consumes more energy, but should we not also consider that:

	E = C V^2 f

The EAS energy model has tables for the OPPs that contain this, but in
this case we seem to be assuming a linear energy/frequency curve, which
is just not the case.

I suppose we could do something like **:

	100 * util^3 / max^3

which assumes V~f.

Another point is that cpu_util() vs turbo is a bit iffy, and to that,
things like x86-APERF/MPERF and ARM-AMU got mentioned. Those might also
have the benefit of giving you values that match your own sampling
interval (100ms), where the sched stuff is PELT (64,32.. based).

So what I've been thinking is that cpufreq drivers ought to be able to
supply this method, and only when they lack, can the cpufreq-governor
(schedutil) install a fallback. And then cpufreq-cooling can use
whatever is provided (through the cpufreq interfaces).

That way, we:

 1) don't have to export anything
 2) get arch drivers to provide something 'better'


Does that sounds like something sensible?




[*] I always want a beer when I see that name :-)

[**] I despise code that uses percentages, computers suck at
/100 and there is no reason not to use any other random fraction, so why
pick a bad one.
Lukasz Luba July 16, 2020, 2:24 p.m. | #6
Hi Peter,

Thank you for summarizing this. I've put my comments below.

On 7/16/20 12:56 PM, Peter Zijlstra wrote:
> On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote:

>>   /**

>> + * get_load() - get current load for a cpu

>>    * @cpufreq_cdev:	&struct cpufreq_cooling_device for this cpu

>>    * @cpu:	cpu number

>> + * @cpu_idx:	index of the cpu

>>    *

>> + * Return: The current load of cpu @cpu in percentage.

>>    */

>>   static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

>>   		    int cpu_idx)

>>   {

>> +	unsigned long util = cpu_util_cfs(cpu_rq(cpu));

>> +	unsigned long max = arch_scale_cpu_capacity(cpu);

>>   

>> +	util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);

>> +	return (util * 100) / max;

>>   }

> 

> So there's a number of things... let me recap a bunch of things that

> got mentioned on IRC earlier this week and then continue from there..

> 

> So IPA* (or any other thermal governor) needs energy estimates for the

> various managed devices, cpufreq_cooling, being the driver for the CPU

> device, needs to provide that and in return receives feedback on how

> much energy it is allowed to consume, cpufreq_cooling then dynamically

> enables/disables OPP states.


Currently, only IPA uses the power estimation, other governors don't
use these API functions in cpufreq_cooling.

> 

> There are actually two methods the thermal governor will use:

> get_real_power() and get_requested_power().

> 

> The first isn't used anywhere in mainline, but could be implemented on

> hardware that has energy counters (like say x86 RAPL).


The first is only present as callback for registered devfreq cooling,
which is registered by devfreq driver. If that driver provides the
get_real_power(), it will be called from get_requested_power().
Thus, it's likely that IPA would get real power value from HW.

I was planning to add it also to cpufreq_cooling callbacks years
ago...

> 

> The second attempts to guesstimate power, and is the subject of this

> patch.

> 

> Currently cpufreq_cooling appears to estimate the CPU energy usage by

> calculating the percentage of idle time using the per-cpu cpustat stuff,

> which is pretty horrific.


Even worse, it then *samples* the *current* CPU frequency at that
particular point in time and assumes that when the CPU wasn't idle
during that period - it had *this* frequency...

> 

> This patch then attempts to improve upon that by using the scheduler's

> cpu_util(ENERGY_UTIL) estimate, which is also used to select OPP state

> and improves upon avg idle. This should be a big improvement as higher


IMHO this patch set doesn't address the real problem: 'sampling
freq problem' described above. There was no issue with getting idle
period. The avg freq was the problem, in that period when the
CPUs were running. The model implemented in alg was also a problem.

The whole period (e.g. CPU freqs which were used or idle state)

^(CPU freq)
|
|                            sampling the current freq
|                _______        |
|               |      |        |
|________       |      |        |
|       |       |      |        |
|       | idle  |      |________v________...
|_ _____|_______|__________________________> (time)
   start of period               end
   |<------- (typically 100ms)-->|



> frequency consumes more energy, but should we not also consider that:

> 

> 	E = C V^2 f

> 

> The EAS energy model has tables for the OPPs that contain this, but in

> this case we seem to be assuming a linear enery/frequency curve, which

> is just not the case.


I am not sure if I got your point. To understand your point better
I think some drawing would be required. I will skip this patch
and old mainline code and focus on your proposed solution
(because this patch set does not address 'sampling freq problem').

> 

> I suppose we could do something like **:

> 

> 	100 * util^3 / max^3

> 

> which assumes V~f.


In EM we keep power values in the array and these values grow
exponentially. Each OPP has it corresponding

P_x = C (V_x)^2 f_x    , where x is the OPP id thus corresponding V,f

so we have discrete power values, growing like:

^(power)
|
|
|                          *
|
|
|                       *
|                       |
|                   *   |
|                       | <----- power estimation function
|            *          |        should not use linear 'util/max_util'
|   *                   |        relation here *
|_______________________|_____________> (freq)
    opp0     opp1  opp2 opp3 opp4

What is the problem
First:
We need to pick the right Power from the array. I would suggest
to pick the max allowed frequency for that whole period, because
we don't know if the CPUs were using it (it's likely).
Second:
Then we have the utilization, which can be considered as:
'idle period & running period with various freq inside', lets
call it avg performance in that whole period.
Third:
Try to estimate the power used in that whole period having
the avg performance and max performance.

What you are suggesting is to travel that [*] line in
non-linear fashion, but in (util^3)/(max_util^3). Which means
it goes down faster when the utilization drops.
I think it is too aggressive, e.g.
500^3 / 1024^3 = 0.116  <--- very little, ~12%
200^3 / 300^3  = 0.296

Peter could you confirm if I understood you correct?
This is quite important bit for me.

> 

> Another point is that cpu_util() vs turbo is a bit iffy, and to that,

> things like x86-APERF/MPERF and ARM-AMU got mentioned. Those might also

> have the benefit of giving you values that match your own sampling

> interval (100ms), where the sched stuff is PELT (64,32.. based).

> 

> So what I've been thinking is that cpufreq drivers ought to be able to

> supply this method, and only when they lack, can the cpufreq-governor

> (schedutil) install a fallback. And then cpufreq-cooling can use

> whatever is provided (through the cpufreq interfaces).

> 

> That way, we:

> 

>   1) don't have to export anything

>   2) get arch drivers to provide something 'better'

> 

> 

> Does that sounds like something sensible?

> 


Yes, make sense. Please also keep in mind that this
utilization somehow must be mapped into power in a proper way.
I am currently working on addressing all of these problems
(including this correlation).

Thank you for your time spending on it and your suggestions.

Regards,
Lukasz

> 

> 

> 

> [*] I always want a beer when I see that name :-)

> 

> [**] I despise code that uses percentages, computers suck at

> /100 and there is no reason not to use any other random fraction, so why

> pick a bad one.

>
Peter Zijlstra July 16, 2020, 3:43 p.m. | #7
On Thu, Jul 16, 2020 at 03:24:37PM +0100, Lukasz Luba wrote:
> On 7/16/20 12:56 PM, Peter Zijlstra wrote:


> > The second attempts to guesstimate power, and is the subject of this

> > patch.

> > 

> > Currently cpufreq_cooling appears to estimate the CPU energy usage by

> > calculating the percentage of idle time using the per-cpu cpustat stuff,

> > which is pretty horrific.

> 

> Even worse, it then *samples* the *current* CPU frequency at that

> particular point in time and assumes that when the CPU wasn't idle

> during that period - it had *this* frequency...


*whee* :-)

...

> In EM we keep power values in the array and these values grow

> exponentially. Each OPP has it corresponding

> 

> P_x = C (V_x)^2 f_x    , where x is the OPP id thus corresponding V,f

> 

> so we have discrete power values, growing like:

> 

> ^(power)

> |

> |

> |                          *

> |

> |

> |                       *

> |                       |

> |                   *   |

> |                       | <----- power estimation function

> |            *          |        should not use linear 'util/max_util'

> |   *                   |        relation here *

> |_______________________|_____________> (freq)

>    opp0     opp1  opp2 opp3 opp4

> 

> What is the problem

> First:

> We need to pick the right Power from the array. I would suggest

> to pick the max allowed frequency for that whole period, because

> we don't know if the CPUs were using it (it's likely).

> Second:

> Then we have the utilization, which can be considered as:

> 'idle period & running period with various freq inside', lets

> call it avg performance in that whole period.

> Third:

> Try to estimate the power used in that whole period having

> the avg performance and max performance.

> 

> What you are suggesting is to travel that [*] line in

> non-linear fashion, but in (util^3)/(max_util^3). Which means

> it goes down faster when the utilization drops.

> I think it is too aggressive, e.g.

> 500^3 / 1024^3 = 0.116  <--- very little, ~12%

> 200^3 / 300^3  = 0.296

> 

> Peter could you confirm if I understood you correct?


Correct, with the caveat that we might try and regression fit a 3rd
order polynomial to a bunch of EM data to see if there's a 'better'
function to be had than a raw 'f(x) := x^3'.

> This is quite important bit for me.


So, if we assume schedutil + EM, we can actually have schedutil
calculate a running power sum. That is, something like: \Int P_x dt.
Because we know the points where OPP changes.

Although, thinking more, I suspect we need tighter integration with
cpuidle, because we don't actually have idle times here, but that should
be doable.

But for anything other than schedutil + EM, things become more
interesting, because then we need to guesstimate power usage without the
benefit of having actual power numbers.

We can of course still do that running power sum, with whatever P(u) or
P(f) end up with, I suppose.

> > Another point is that cpu_util() vs turbo is a bit iffy, and to that,

> > things like x86-APERF/MPERF and ARM-AMU got mentioned. Those might also

> > have the benefit of giving you values that match your own sampling

> > interval (100ms), where the sched stuff is PELT (64,32.. based).

> > 

> > So what I've been thinking is that cpufreq drivers ought to be able to

> > supply this method, and only when they lack, can the cpufreq-governor

> > (schedutil) install a fallback. And then cpufreq-cooling can use

> > whatever is provided (through the cpufreq interfaces).

> > 

> > That way, we:

> > 

> >   1) don't have to export anything

> >   2) get arch drivers to provide something 'better'

> > 

> > 

> > Does that sounds like something sensible?

> > 

> 

> Yes, make sense. Please also keep in mind that this

> utilization somehow must be mapped into power in a proper way.

> I am currently working on addressing all of these problems

> (including this correlation).


Right, so that mapping util to power was what I was missing and
suggesting we do. So for 'simple' hardware we have cpufreq events for
frequency change, and cpuidle events for idle, and with EM we can simply
sum the relevant power numbers.

For hardware lacking EM, or hardware managed DVFS, we'll have to fudge
things a little. How best to do that is up in the air a little, but
virtual power curves seem a useful tool to me.

The next problem for IPA is having all the devices report power in the
same virtual unit I suppose, but I'll leave that to others ;-)
Vincent Guittot July 17, 2020, 9:46 a.m. | #8
On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote:
>

> Hi Peter,

>

> Thank you for summarizing this. I've put my comments below.

>

> On 7/16/20 12:56 PM, Peter Zijlstra wrote:

> > On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote:

> >>   /**

> >> + * get_load() - get current load for a cpu

> >>    * @cpufreq_cdev:  &struct cpufreq_cooling_device for this cpu

> >>    * @cpu:   cpu number

> >> + * @cpu_idx:        index of the cpu

> >>    *

> >> + * Return: The current load of cpu @cpu in percentage.

> >>    */

> >>   static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

> >>                  int cpu_idx)

> >>   {

> >> +    unsigned long util = cpu_util_cfs(cpu_rq(cpu));

> >> +    unsigned long max = arch_scale_cpu_capacity(cpu);

> >>

> >> +    util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);

> >> +    return (util * 100) / max;

> >>   }

> >

> > So there's a number of things... let me recap a bunch of things that

> > got mentioned on IRC earlier this week and then continue from there..

> >

> > So IPA* (or any other thermal governor) needs energy estimates for the

> > various managed devices, cpufreq_cooling, being the driver for the CPU

> > device, needs to provide that and in return receives feedback on how

> > much energy it is allowed to consume, cpufreq_cooling then dynamically

> > enables/disables OPP states.

>

> Currently, only IPA uses the power estimation, other governors don't

> use these API functions in cpufreq_cooling.

>

> >

> > There are actually two methods the thermal governor will use:

> > get_real_power() and get_requested_power().

> >

> > The first isn't used anywhere in mainline, but could be implemented on

> > hardware that has energy counters (like say x86 RAPL).

>

> The first is only present as callback for registered devfreq cooling,

> which is registered by devfreq driver. If that driver provides the

> get_real_power(), it will be called from get_requested_power().

> Thus, it's likely that IPA would get real power value from HW.

>

> I was planning to add it also to cpufreq_cooling callbacks years

> ago...

>

> >

> > The second attempts to guesstimate power, and is the subject of this

> > patch.

> >

> > Currently cpufreq_cooling appears to estimate the CPU energy usage by

> > calculating the percentage of idle time using the per-cpu cpustat stuff,

> > which is pretty horrific.

>

> Even worse, it then *samples* the *current* CPU frequency at that

> particular point in time and assumes that when the CPU wasn't idle

> during that period - it had *this* frequency...


So there is 2 problems in the power calculation of cpufreq cooling device :
- How to get an accurate utilization level of the cpu which is what
this patch is trying to fix because using idle time is just wrong
whereas scheduler utilization is frequency invariant
- How to get power estimate from this utilization level. And as you
pointed out, using the current freq which is not accurate.

>

> >

> > This patch then attempts to improve upon that by using the scheduler's

> > cpu_util(ENERGY_UTIL) estimate, which is also used to select OPP state

> > and improves upon avg idle. This should be a big improvement as higher

>

> IMHO this patch set doesn't address the real problem: 'sampling

> freq problem' described above. There was no issue with getting idle

> period. The avg freq was the problem, in that period when the


Not sure that you can say that avg freq is a bigger problem than
getting the load because there is a real issue with tracking idle
period for estimating load because running slower reduces the idle
time and increases artificially the load. That's why we implemented
frequency invariance in PELT.

At the opposite when the thermal mitigation happens, the frequency
will be most probably capped by cpu cooling device and will most
probably stay at the capped value

> CPUs were running. The model implemented in alg was also a problem.

>

> The whole period (e.g. CPU freqs which were used or idle state)

>

> ^(CPU freq)

> |

> |                            sampling the current freq

> |                _______        |

> |               |      |        |

> |________       |      |        |

> |       |       |      |        |

> |       | idle  |      |________v________...

> |_ _____|_______|__________________________> (time)

>    start of period               end

>    |<------- (typically 100ms)-->|

>

>

>

> > frequency consumes more energy, but should we not also consider that:

> >

> >       E = C V^2 f

> >

> > The EAS energy model has tables for the OPPs that contain this, but in

> > this case we seem to be assuming a linear enery/frequency curve, which

> > is just not the case.

>

> I am not sure if I got your point. To understand your point better

> I think some drawing would be required. I will skip this patch

> and old mainline code and focus on your proposed solution

> (because this patch set does not address 'sampling freq problem').

>

> >

> > I suppose we could do something like **:

> >

> >       100 * util^3 / max^3

> >

> > which assumes V~f.

>

> In EM we keep power values in the array and these values grow

> exponentially. Each OPP has it corresponding

>

> P_x = C (V_x)^2 f_x    , where x is the OPP id thus corresponding V,f

>

> so we have discrete power values, growing like:

>

> ^(power)

> |

> |

> |                          *

> |

> |

> |                       *

> |                       |

> |                   *   |

> |                       | <----- power estimation function

> |            *          |        should not use linear 'util/max_util'

> |   *                   |        relation here *

> |_______________________|_____________> (freq)

>     opp0     opp1  opp2 opp3 opp4

>

> What is the problem

> First:

> We need to pick the right Power from the array. I would suggest

> to pick the max allowed frequency for that whole period, because

> we don't know if the CPUs were using it (it's likely).

> Second:

> Then we have the utilization, which can be considered as:

> 'idle period & running period with various freq inside', lets

> call it avg performance in that whole period.

> Third:

> Try to estimate the power used in that whole period having

> the avg performance and max performance.


We already have a function that is doing such kind of computation
based of the utilization of the CPU : em_pd_energy(). And we could
reuse some of this function if not exactly this one

>

> What you are suggesting is to travel that [*] line in

> non-linear fashion, but in (util^3)/(max_util^3). Which means

> it goes down faster when the utilization drops.

> I think it is too aggressive, e.g.

> 500^3 / 1024^3 = 0.116  <--- very little, ~12%

> 200^3 / 300^3  = 0.296

>

> Peter could you confirm if I understood you correct?

> This is quite important bit for me.

>

> >

> > Another point is that cpu_util() vs turbo is a bit iffy, and to that,

> > things like x86-APERF/MPERF and ARM-AMU got mentioned. Those might also

> > have the benefit of giving you values that match your own sampling

> > interval (100ms), where the sched stuff is PELT (64,32.. based).

> >

> > So what I've been thinking is that cpufreq drivers ought to be able to

> > supply this method, and only when they lack, can the cpufreq-governor

> > (schedutil) install a fallback. And then cpufreq-cooling can use

> > whatever is provided (through the cpufreq interfaces).

> >

> > That way, we:

> >

> >   1) don't have to export anything

> >   2) get arch drivers to provide something 'better'

> >

> >

> > Does that sounds like something sensible?

> >

>

> Yes, make sense. Please also keep in mind that this

> utilization somehow must be mapped into power in a proper way.

> I am currently working on addressing all of these problems

> (including this correlation).

>

> Thank you for your time spending on it and your suggestions.

>

> Regards,

> Lukasz

>

> >

> >

> >

> > [*] I always want a beer when I see that name :-)

> >

> > [**] I despise code that uses percentages, computers suck at

> > /100 and there is no reason not to use any other random fraction, so why

> > pick a bad one.

> >
Lukasz Luba July 17, 2020, 9:55 a.m. | #9
On 7/16/20 4:43 PM, Peter Zijlstra wrote:
> On Thu, Jul 16, 2020 at 03:24:37PM +0100, Lukasz Luba wrote:

>> On 7/16/20 12:56 PM, Peter Zijlstra wrote:

> 

>>> The second attempts to guesstimate power, and is the subject of this

>>> patch.

>>>

>>> Currently cpufreq_cooling appears to estimate the CPU energy usage by

>>> calculating the percentage of idle time using the per-cpu cpustat stuff,

>>> which is pretty horrific.

>>

>> Even worse, it then *samples* the *current* CPU frequency at that

>> particular point in time and assumes that when the CPU wasn't idle

>> during that period - it had *this* frequency...

> 

> *whee* :-)

> 

> ...

> 

>> In EM we keep power values in the array and these values grow

>> exponentially. Each OPP has its corresponding

>>

>> P_x = C (V_x)^2 f_x    , where x is the OPP id thus corresponding V,f

>>

>> so we have discrete power values, growing like:

>>

>> ^(power)

>> |

>> |

>> |                          *

>> |

>> |

>> |                       *

>> |                       |

>> |                   *   |

>> |                       | <----- power estimation function

>> |            *          |        should not use linear 'util/max_util'

>> |   *                   |        relation here *

>> |_______________________|_____________> (freq)

>>     opp0     opp1  opp2 opp3 opp4

>>

>> What is the problem

>> First:

>> We need to pick the right Power from the array. I would suggest

>> to pick the max allowed frequency for that whole period, because

>> we don't know if the CPUs were using it (it's likely).

>> Second:

>> Then we have the utilization, which can be considered as:

>> 'idle period & running period with various freq inside', lets

>> call it avg performance in that whole period.

>> Third:

>> Try to estimate the power used in that whole period having

>> the avg performance and max performance.

>>

>> What you are suggesting is to travel that [*] line in

>> non-linear fashion, but in (util^3)/(max_util^3). Which means

>> it goes down faster when the utilization drops.

>> I think it is too aggressive, e.g.

>> 500^3 / 1024^3 = 0.116  <--- very little, ~12%

>> 200^3 / 300^3  = 0.296

>>

>> Peter could you confirm if I understood you correct?

> 

> Correct, with the caveat that we might try and regression fit a 3rd

> order polynomial to a bunch of EM data to see if there's a 'better'

> function to be had than a raw 'f(x) := x^3'.


I agree, I think we are on the same wavelength now.

> 

>> This is quite important bit for me.

> 

> So, if we assume schedutil + EM, we can actually have schedutil

> calculate a running power sum. That is, something like: \Int P_x dt.

> Because we know the points where OPP changes.


Yes, that's why I was thinking about having this information stored as a
copy inside the EM, then just read it in other subsystem like: thermal,
powercap, etc.

> 

> Although, thinking more, I suspect we need tighter integration with

> cpuidle, because we don't actually have idle times here, but that should

> be doable.


I am scratching my head for while because of that idle issue. It opens
more dimensions to tackle.

> 

> But for anything other than schedutil + EM, things become more

> interesting, because then we need to guesstimate power usage without the

> benefit of having actual power numbers.


Yes, from the engineering/research perspective, platforms which do not
have EM in Linux (like Intel) are also interesting.

> 

> We can of course still do that running power sum, with whatever P(u) or

> P(f) end up with, I suppose.

> 

>>> Another point is that cpu_util() vs turbo is a bit iffy, and to that,

>>> things like x86-APERF/MPERF and ARM-AMU got mentioned. Those might also

>>> have the benefit of giving you values that match your own sampling

>>> interval (100ms), where the sched stuff is PELT (64,32.. based).

>>>

>>> So what I've been thinking is that cpufreq drivers ought to be able to

>>> supply this method, and only when they lack, can the cpufreq-governor

>>> (schedutil) install a fallback. And then cpufreq-cooling can use

>>> whatever is provided (through the cpufreq interfaces).

>>>

>>> That way, we:

>>>

>>>    1) don't have to export anything

>>>    2) get arch drivers to provide something 'better'

>>>

>>>

>>> Does that sounds like something sensible?

>>>

>>

>> Yes, make sense. Please also keep in mind that this

>> utilization somehow must be mapped into power in a proper way.

>> I am currently working on addressing all of these problems

>> (including this correlation).

> 

> Right, so that mapping util to power was what I was missing and

> suggesting we do. So for 'simple' hardware we have cpufreq events for

> frequency change, and cpuidle events for idle, and with EM we can simply

> sum the relevant power numbers.

> 

> For hardware lacking EM, or hardware managed DVFS, we'll have to fudge

> things a little. How best to do that is up in the air a little, but

> virtual power curves seem a useful tool to me.

> 

> The next problem for IPA is having all the devices report power in the

> same virtual unit I suppose, but I'll leave that to others ;-)

> 


True, there is more issues. There is also another movement with powercap
driven by Daniel Lezcano, which I am going to support. Maybe he would
be interested as well in having a copy of calculated energy stored
in EM. I must gather some requirements and align with him.

Thank you for your support!

Regards,
Lukasz
Quentin Perret July 17, 2020, 10:14 a.m. | #10
On Tuesday 14 Jul 2020 at 12:06:53 (+0530), Viresh Kumar wrote:
>  /**

> - * get_load() - get load for a cpu since last updated

> + * get_load() - get current load for a cpu

>   * @cpufreq_cdev:	&struct cpufreq_cooling_device for this cpu

>   * @cpu:	cpu number

> - * @cpu_idx:	index of the cpu in time_in_idle*

> + * @cpu_idx:	index of the cpu

>   *

> - * Return: The average load of cpu @cpu in percentage since this

> - * function was last called.

> + * Return: The current load of cpu @cpu in percentage.

>   */

>  static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

>  		    int cpu_idx)

>  {

> -	u32 load;

> -	u64 now, now_idle, delta_time, delta_idle;

> -	struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx];

> -

> -	now_idle = get_cpu_idle_time(cpu, &now, 0);

> -	delta_idle = now_idle - idle_time->time;

> -	delta_time = now - idle_time->timestamp;

> +	unsigned long util = cpu_util_cfs(cpu_rq(cpu));

> +	unsigned long max = arch_scale_cpu_capacity(cpu);


Should we subtract the thermal PELT signal from max? I'm worried that
if we don't do that, the load computed in this function will look
artificially small when IPA is capping the max freq, and that we'll
enter a weird oscillating state due to the cyclic dependency here.

Thoughts?

>  

> -	if (delta_time <= delta_idle)

> -		load = 0;

> -	else

> -		load = div64_u64(100 * (delta_time - delta_idle), delta_time);

> -

> -	idle_time->time = now_idle;

> -	idle_time->timestamp = now;

> -

> -	return load;

> +	util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);

> +	return (util * 100) / max;

>  }



Thanks,
Quentin
Lukasz Luba July 17, 2020, 10:30 a.m. | #11
On 7/17/20 10:46 AM, Vincent Guittot wrote:
> On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote:

>>

>> Hi Peter,

>>

>> Thank you for summarizing this. I've put my comments below.

>>

>> On 7/16/20 12:56 PM, Peter Zijlstra wrote:

>>> On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote:

>>>>    /**

>>>> + * get_load() - get current load for a cpu

>>>>     * @cpufreq_cdev:  &struct cpufreq_cooling_device for this cpu

>>>>     * @cpu:   cpu number

>>>> + * @cpu_idx:        index of the cpu

>>>>     *

>>>> + * Return: The current load of cpu @cpu in percentage.

>>>>     */

>>>>    static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

>>>>                   int cpu_idx)

>>>>    {

>>>> +    unsigned long util = cpu_util_cfs(cpu_rq(cpu));

>>>> +    unsigned long max = arch_scale_cpu_capacity(cpu);

>>>>

>>>> +    util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);

>>>> +    return (util * 100) / max;

>>>>    }

>>>

>>> So there's a number of things... let me recap a bunch of things that

>>> got mentioned on IRC earlier this week and then continue from there..

>>>

>>> So IPA* (or any other thermal governor) needs energy estimates for the

>>> various managed devices, cpufreq_cooling, being the driver for the CPU

>>> device, needs to provide that and in return receives feedback on how

>>> much energy it is allowed to consume, cpufreq_cooling then dynamically

>>> enables/disables OPP states.

>>

>> Currently, only IPA uses the power estimation, other governors don't

>> use these API functions in cpufreq_cooling.

>>

>>>

>>> There are actually two methods the thermal governor will use:

>>> get_real_power() and get_requested_power().

>>>

>>> The first isn't used anywhere in mainline, but could be implemented on

>>> hardware that has energy counters (like say x86 RAPL).

>>

>> The first is only present as callback for registered devfreq cooling,

>> which is registered by devfreq driver. If that driver provides the

>> get_real_power(), it will be called from get_requested_power().

>> Thus, it's likely that IPA would get real power value from HW.

>>

>> I was planning to add it also to cpufreq_cooling callbacks years

>> ago...

>>

>>>

>>> The second attempts to guesstimate power, and is the subject of this

>>> patch.

>>>

>>> Currently cpufreq_cooling appears to estimate the CPU energy usage by

>>> calculating the percentage of idle time using the per-cpu cpustat stuff,

>>> which is pretty horrific.

>>

>> Even worse, it then *samples* the *current* CPU frequency at that

>> particular point in time and assumes that when the CPU wasn't idle

>> during that period - it had *this* frequency...

> 

> So there is 2 problems in the power calculation of cpufreq cooling device :

> - How to get an accurate utilization level of the cpu which is what

> this patch is trying to fix because using idle time is just wrong

> whereas scheduler utilization is frequency invariant

> - How to get power estimate from this utilization level. And as you

> pointed out, using the current freq which is not accurate.





> 

>>

>>>

>>> This patch then attempts to improve upon that by using the scheduler's

>>> cpu_util(ENERGY_UTIL) estimate, which is also used to select OPP state

>>> and improves upon avg idle. This should be a big improvement as higher

>>

>> IMHO this patch set doesn't address the real problem: 'sampling

>> freq problem' described above. There was no issue with getting idle

>> period. The avg freq was the problem, in that period when the

> 

> Not sure that you can say that avg freq is a bigger problem than

> getting the load because there is a real issue with tracking idle

> period for estimating load because running slower reduces the idle

> time and increases artificially the load. That's why we implemented

> frequency invariance in PELT.


If you take a closer look into the code, you see that the wrong

freq picks the wrong power value from the EM, the line:
freq = cpufreq_quick_get(policy->cpu)
then check the function cpu_freq_to_power().
The power is calculated by:
  (raw_cpu_power * cpufreq_cdev->last_load) / 100

The load estimation error is also an issue, but the comprehensive
solution should address possibly all existing issues.

I don't know if you were approached also by the same vendor,
complaining about IPA issues. Do you have some requirements? Or deadlines,
for which kernel version you have to fix it?
We can discuss this out offline if you like.

> 

> At the opposite when the thermal mitigation happens, the frequency

> will be most probably capped by cpu cooling device and will most

> probably stay at the capped value


I don't think that we can rely on such an assumption. The whole
cpufreq_get_requested_power() should be changed.

> 

>> CPUs were running. The model implemented in alg was also a problem.

>>

>> The whole period (e.g. CPU freqs which were used or idle state)

>>

>> ^(CPU freq)

>> |

>> |                            sampling the current freq

>> |                _______        |

>> |               |      |        |

>> |________       |      |        |

>> |       |       |      |        |

>> |       | idle  |      |________v________...

>> |_ _____|_______|__________________________> (time)

>>     start of period               end

>>     |<------- (typically 100ms)-->|

>>

>>

>>

>>> frequency consumes more energy, but should we not also consider that:

>>>

>>>        E = C V^2 f

>>>

>>> The EAS energy model has tables for the OPPs that contain this, but in

>>> this case we seem to be assuming a linear energy/frequency curve, which

>>> is just not the case.

>>

>> I am not sure if I got your point. To understand your point better

>> I think some drawing would be required. I will skip this patch

>> and old mainline code and focus on your proposed solution

>> (because this patch set does not address 'sampling freq problem').

>>

>>>

>>> I suppose we could do something like **:

>>>

>>>        100 * util^3 / max^3

>>>

>>> which assumes V~f.

>>

>> In EM we keep power values in the array and these values grow

>> exponentially. Each OPP has its corresponding

>>

>> P_x = C (V_x)^2 f_x    , where x is the OPP id thus corresponding V,f

>>

>> so we have discrete power values, growing like:

>>

>> ^(power)

>> |

>> |

>> |                          *

>> |

>> |

>> |                       *

>> |                       |

>> |                   *   |

>> |                       | <----- power estimation function

>> |            *          |        should not use linear 'util/max_util'

>> |   *                   |        relation here *

>> |_______________________|_____________> (freq)

>>      opp0     opp1  opp2 opp3 opp4

>>

>> What is the problem

>> First:

>> We need to pick the right Power from the array. I would suggest

>> to pick the max allowed frequency for that whole period, because

>> we don't know if the CPUs were using it (it's likely).

>> Second:

>> Then we have the utilization, which can be considered as:

>> 'idle period & running period with various freq inside', lets

>> call it avg performance in that whole period.

>> Third:

>> Try to estimate the power used in that whole period having

>> the avg performance and max performance.

> 

> We already have a function that is doing such kind of computation

> based of the utilization of the CPU : em_pd_energy(). And we could

> reuse some of this function if not exactly this one


Yes and I think we should use this information. I am going to
talk with Daniel about EM evolution (this is one of the topics
from my side). Next, it is going to be a LPC event, where we
can also discuss with broader audience.

Regards,
Lukasz
Quentin Perret July 17, 2020, 10:33 a.m. | #12
On Friday 17 Jul 2020 at 11:14:38 (+0100), Quentin Perret wrote:
> On Tuesday 14 Jul 2020 at 12:06:53 (+0530), Viresh Kumar wrote:

> >  /**

> > - * get_load() - get load for a cpu since last updated

> > + * get_load() - get current load for a cpu

> >   * @cpufreq_cdev:	&struct cpufreq_cooling_device for this cpu

> >   * @cpu:	cpu number

> > - * @cpu_idx:	index of the cpu in time_in_idle*

> > + * @cpu_idx:	index of the cpu

> >   *

> > - * Return: The average load of cpu @cpu in percentage since this

> > - * function was last called.

> > + * Return: The current load of cpu @cpu in percentage.

> >   */

> >  static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

> >  		    int cpu_idx)

> >  {

> > -	u32 load;

> > -	u64 now, now_idle, delta_time, delta_idle;

> > -	struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx];

> > -

> > -	now_idle = get_cpu_idle_time(cpu, &now, 0);

> > -	delta_idle = now_idle - idle_time->time;

> > -	delta_time = now - idle_time->timestamp;

> > +	unsigned long util = cpu_util_cfs(cpu_rq(cpu));

> > +	unsigned long max = arch_scale_cpu_capacity(cpu);

> 

> Should we subtract the thermal PELT signal from max?


Actually, that or add it the ENERGY_UTIL case in effective_cpu_util() as
this appears to be missing for EAS too ...
Quentin Perret July 17, 2020, 10:43 a.m. | #13
On Friday 17 Jul 2020 at 11:33:05 (+0100), Quentin Perret wrote:
> On Friday 17 Jul 2020 at 11:14:38 (+0100), Quentin Perret wrote:

> > On Tuesday 14 Jul 2020 at 12:06:53 (+0530), Viresh Kumar wrote:

> > >  /**

> > > - * get_load() - get load for a cpu since last updated

> > > + * get_load() - get current load for a cpu

> > >   * @cpufreq_cdev:	&struct cpufreq_cooling_device for this cpu

> > >   * @cpu:	cpu number

> > > - * @cpu_idx:	index of the cpu in time_in_idle*

> > > + * @cpu_idx:	index of the cpu

> > >   *

> > > - * Return: The average load of cpu @cpu in percentage since this

> > > - * function was last called.

> > > + * Return: The current load of cpu @cpu in percentage.

> > >   */

> > >  static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

> > >  		    int cpu_idx)

> > >  {

> > > -	u32 load;

> > > -	u64 now, now_idle, delta_time, delta_idle;

> > > -	struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx];

> > > -

> > > -	now_idle = get_cpu_idle_time(cpu, &now, 0);

> > > -	delta_idle = now_idle - idle_time->time;

> > > -	delta_time = now - idle_time->timestamp;

> > > +	unsigned long util = cpu_util_cfs(cpu_rq(cpu));

> > > +	unsigned long max = arch_scale_cpu_capacity(cpu);

> > 

> > Should we subtract the thermal PELT signal from max?

> 

> Actually, that or add it the ENERGY_UTIL case in effective_cpu_util() as

> this appears to be missing for EAS too ...


Scratch that. I do think there is something missing in the EAS path, but
not sure that would fix it. I'll have a think and stop spamming
everybody in the meantime ...

The first question is still valid, though :)

Sorry for the noise,
Quentin
Vincent Guittot July 17, 2020, 12:13 p.m. | #14
On Fri, 17 Jul 2020 at 12:30, Lukasz Luba <lukasz.luba@arm.com> wrote:
>

>

>

> On 7/17/20 10:46 AM, Vincent Guittot wrote:

> > On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote:

> >>

> >> Hi Peter,

> >>

> >> Thank you for summarizing this. I've put my comments below.

> >>

> >> On 7/16/20 12:56 PM, Peter Zijlstra wrote:

> >>> On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote:

> >>>>    /**

> >>>> + * get_load() - get current load for a cpu

> >>>>     * @cpufreq_cdev:  &struct cpufreq_cooling_device for this cpu

> >>>>     * @cpu:   cpu number

> >>>> + * @cpu_idx:        index of the cpu

> >>>>     *

> >>>> + * Return: The current load of cpu @cpu in percentage.

> >>>>     */

> >>>>    static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

> >>>>                   int cpu_idx)

> >>>>    {

> >>>> +    unsigned long util = cpu_util_cfs(cpu_rq(cpu));

> >>>> +    unsigned long max = arch_scale_cpu_capacity(cpu);

> >>>>

> >>>> +    util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);

> >>>> +    return (util * 100) / max;

> >>>>    }

> >>>

> >>> So there's a number of things... let me recap a bunch of things that

> >>> got mentioned on IRC earlier this week and then continue from there..

> >>>

> >>> So IPA* (or any other thermal governor) needs energy estimates for the

> >>> various managed devices, cpufreq_cooling, being the driver for the CPU

> >>> device, needs to provide that and in return receives feedback on how

> >>> much energy it is allowed to consume, cpufreq_cooling then dynamically

> >>> enables/disables OPP states.

> >>

> >> Currently, only IPA uses the power estimation, other governors don't

> >> use these API functions in cpufreq_cooling.

> >>

> >>>

> >>> There are actually two methods the thermal governor will use:

> >>> get_real_power() and get_requested_power().

> >>>

> >>> The first isn't used anywhere in mainline, but could be implemented on

> >>> hardware that has energy counters (like say x86 RAPL).

> >>

> >> The first is only present as callback for registered devfreq cooling,

> >> which is registered by devfreq driver. If that driver provides the

> >> get_real_power(), it will be called from get_requested_power().

> >> Thus, it's likely that IPA would get real power value from HW.

> >>

> >> I was planning to add it also to cpufreq_cooling callbacks years

> >> ago...

> >>

> >>>

> >>> The second attempts to guesstimate power, and is the subject of this

> >>> patch.

> >>>

> >>> Currently cpufreq_cooling appears to estimate the CPU energy usage by

> >>> calculating the percentage of idle time using the per-cpu cpustat stuff,

> >>> which is pretty horrific.

> >>

> >> Even worse, it then *samples* the *current* CPU frequency at that

> >> particular point in time and assumes that when the CPU wasn't idle

> >> during that period - it had *this* frequency...

> >

> > So there is 2 problems in the power calculation of cpufreq cooling device :

> > - How to get an accurate utilization level of the cpu which is what

> > this patch is trying to fix because using idle time is just wrong

> > whereas scheduler utilization is frequency invariant

> > - How to get power estimate from this utilization level. And as you

> > pointed out, using the current freq which is not accurate.

>

>

>

>

> >

> >>

> >>>

> >>> This patch then attempts to improve upon that by using the scheduler's

> >>> cpu_util(ENERGY_UTIL) estimate, which is also used to select OPP state

> >>> and improves upon avg idle. This should be a big improvement as higher

> >>

> >> IMHO this patch set doesn't address the real problem: 'sampling

> >> freq problem' described above. There was no issue with getting idle

> >> period. The avg freq was the problem, in that period when the

> >

> > Not sure that you can say that avg freq is a bigger problem than

> > getting the load because there is a real issue with tracking idle

> > period for estimating load because running slower reduces the idle

> > time and increases artificially the load. That's why we implemented

> > frequency invariance in PELT.

>

> If you take a closer look into the code, you see that wrong

> freq picks the wrong power value from the EM, the line:

> freq = cpufreq_quick_get(policy->cpu)

> then check the function cpu_freq_to_power().

> The power is calculated by:

>   (raw_cpu_power * cpufreq_cdev->last_load) / 100

>

> The load estimation error is also an issue, but the comprehensive

> solution should address possibly all existing issues.

>

> I don't know if you were approached also by the same vendor,

> complaining on IPA issues. Do you have some requirements? Or deadlines,

> for which kernel version you have to fix it?


No, I don't have vendors complaining for IPA.
This patch is just there because tracking idle time is known
to be wrong whereas sched_util gives a better estimation of the
current/next utilization and as a result a better estimation of the
power that will be consumed

> We can discuss this out offline if you like.

>

> >

> > At the opposite when the thermal mitigation happens, the frequency

> > will be most probably capped by cpu cooling device and will most

> > probably stay at the capped value

>

> I don't think that we can rely on such an assumption. The whole


I'm not sure that it's that bad although I fully agree that it's not
perfect or maybe good enough.
The scheduler's utilization is already used to select the cpu
frequency. In an ideal world, it is not expected to change according
to current information so using scheduler utilization and current
freq, which has been also set based on the current knowledge of the
utilization of the cpu, should not be that bad. More than what
happened during the last period, we try to estimate what will happen
during the next one in this case.

Trying to track accurately the energy/power consumed over the last
period (with idle events and freq changes) is not that useful in this
case and especially because of task migration and other scheduler's
activities that will make previous period tracking obsolete where
scheduler utilization takes that into account. Such accurate
power/energy tracking is useful if you want to cap the power
consumption of the devices in framework like powercap when HW can't do
it by itself but in this case you are working at a different time scale

> cpufreq_get_requested_power() should be changed.

>

> >

> >> CPUs were running. The model implemented in alg was also a problem.

> >>

> >> The whole period (e.g. CPU freqs which were used or idle state)

> >>

> >> ^(CPU freq)

> >> |

> >> |                            sampling the current freq

> >> |                _______        |

> >> |               |      |        |

> >> |________       |      |        |

> >> |       |       |      |        |

> >> |       | idle  |      |________v________...

> >> |_ _____|_______|__________________________> (time)

> >>     start of period               end

> >>     |<------- (typically 100ms)-->|

> >>

> >>

> >>

> >>> frequency consumes more energy, but should we not also consider that:

> >>>

> >>>        E = C V^2 f

> >>>

> >>> The EAS energy model has tables for the OPPs that contain this, but in

> >>> this case we seem to be assuming a linear energy/frequency curve, which

> >>> is just not the case.

> >>

> >> I am not sure if I got your point. To understand your point better

> >> I think some drawing would be required. I will skip this patch

> >> and old mainline code and focus on your proposed solution

> >> (because this patch set does not address 'sampling freq problem').

> >>

> >>>

> >>> I suppose we could do something like **:

> >>>

> >>>        100 * util^3 / max^3

> >>>

> >>> which assumes V~f.

> >>

> >> In EM we keep power values in the array and these values grow

> >> exponentially. Each OPP has it corresponding

> >>

> >> P_x = C (V_x)^2 f_x    , where x is the OPP id thus corresponding V,f

> >>

> >> so we have discrete power values, growing like:

> >>

> >> ^(power)

> >> |

> >> |

> >> |                          *

> >> |

> >> |

> >> |                       *

> >> |                       |

> >> |                   *   |

> >> |                       | <----- power estimation function

> >> |            *          |        should not use linear 'util/max_util'

> >> |   *                   |        relation here *

> >> |_______________________|_____________> (freq)

> >>      opp0     opp1  opp2 opp3 opp4

> >>

> >> What is the problem

> >> First:

> >> We need to pick the right Power from the array. I would suggest

> >> to pick the max allowed frequency for that whole period, because

> >> we don't know if the CPUs were using it (it's likely).

> >> Second:

> >> Then we have the utilization, which can be considered as:

> >> 'idle period & running period with various freq inside', lets

> >> call it avg performance in that whole period.

> >> Third:

> >> Try to estimate the power used in that whole period having

> >> the avg performance and max performance.

> >

> > We already have a function that is doing such kind of computation

> > based of the utilization of the CPU : em_pd_energy(). And we could

> > reuse some of this function if not exactly this one

>

> Yes and I think we should use this information. I am going to

> talk with Daniel about EM evolution (this is one of the topics

> from my side). Next, it is going to be a LPC event, where we

> can also discuss with broader audience.

>

> Regards,

> Lukasz

>
Viresh Kumar July 22, 2020, 9:13 a.m. | #15
On 17-07-20, 11:43, Quentin Perret wrote:
> On Friday 17 Jul 2020 at 11:33:05 (+0100), Quentin Perret wrote:

> > On Friday 17 Jul 2020 at 11:14:38 (+0100), Quentin Perret wrote:

> > > On Tuesday 14 Jul 2020 at 12:06:53 (+0530), Viresh Kumar wrote:

> > > >  /**

> > > > - * get_load() - get load for a cpu since last updated

> > > > + * get_load() - get current load for a cpu

> > > >   * @cpufreq_cdev:	&struct cpufreq_cooling_device for this cpu

> > > >   * @cpu:	cpu number

> > > > - * @cpu_idx:	index of the cpu in time_in_idle*

> > > > + * @cpu_idx:	index of the cpu

> > > >   *

> > > > - * Return: The average load of cpu @cpu in percentage since this

> > > > - * function was last called.

> > > > + * Return: The current load of cpu @cpu in percentage.

> > > >   */

> > > >  static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,

> > > >  		    int cpu_idx)

> > > >  {

> > > > -	u32 load;

> > > > -	u64 now, now_idle, delta_time, delta_idle;

> > > > -	struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx];

> > > > -

> > > > -	now_idle = get_cpu_idle_time(cpu, &now, 0);

> > > > -	delta_idle = now_idle - idle_time->time;

> > > > -	delta_time = now - idle_time->timestamp;

> > > > +	unsigned long util = cpu_util_cfs(cpu_rq(cpu));

> > > > +	unsigned long max = arch_scale_cpu_capacity(cpu);

> > > 

> > > Should we subtract the thermal PELT signal from max?


Makes sense to me, but I am not really good with this util and load
stuff and so would leave that to you guys to decide :)

-- 
viresh
Viresh Kumar July 30, 2020, 6:24 a.m. | #16
On 17-07-20, 11:46, Vincent Guittot wrote:
> On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote:

> > On 7/16/20 12:56 PM, Peter Zijlstra wrote:

> > > Currently cpufreq_cooling appears to estimate the CPU energy usage by

> > > calculating the percentage of idle time using the per-cpu cpustat stuff,

> > > which is pretty horrific.

> >

> > Even worse, it then *samples* the *current* CPU frequency at that

> > particular point in time and assumes that when the CPU wasn't idle

> > during that period - it had *this* frequency...

> 

> So there is 2 problems in the power calculation of cpufreq cooling device :

> - How to get an accurate utilization level of the cpu which is what

> this patch is trying to fix because using idle time is just wrong

> whereas scheduler utilization is frequency invariant


Since this patch is targeted only towards fixing this particular
problem, should I change something in the patch to make it acceptable
?

> - How to get power estimate from this utilization level. And as you

> pointed out, using the current freq which is not accurate.


This should be tackled separately I believe.

-- 
viresh
Lukasz Luba July 30, 2020, 11:16 a.m. | #17
Hi Viresh,

On 7/30/20 7:24 AM, Viresh Kumar wrote:
> On 17-07-20, 11:46, Vincent Guittot wrote:

>> On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote:

>>> On 7/16/20 12:56 PM, Peter Zijlstra wrote:

>>>> Currently cpufreq_cooling appears to estimate the CPU energy usage by

>>>> calculating the percentage of idle time using the per-cpu cpustat stuff,

>>>> which is pretty horrific.

>>>

>>> Even worse, it then *samples* the *current* CPU frequency at that

>>> particular point in time and assumes that when the CPU wasn't idle

>>> during that period - it had *this* frequency...

>>

>> So there is 2 problems in the power calculation of cpufreq cooling device :

>> - How to get an accurate utilization level of the cpu which is what

>> this patch is trying to fix because using idle time is just wrong

>> whereas scheduler utilization is frequency invariant

> 

> Since this patch is targeted only towards fixing this particular

> problem, should I change something in the patch to make it acceptable

> ?

> 

>> - How to get power estimate from this utilization level. And as you

>> pointed out, using the current freq which is not accurate.

> 

> This should be tackled separately I believe.

> 


I don't think that these two are separate. Furthermore, I think we
would need this kind of information also in future in the powercap.
I've discussed with Daniel this possible scenario.

We have a vendor who presented issue with the IPA input power and
pointed out these issues. Unfortunately, I don't have this vendor's
phone but I assume it can last a few minutes without changing the
max allowed OPP. Based on their plots the frequency driven by the
governor is changing, also the idles are present during the IPA period.

Please give me a few days, because I am also plumbing this stuff
and would like to present it. These two interfaces: involving cpufreq
driver or fallback mode for utilization and EM.

Regards,
Lukasz

Patch

diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index 6c0e1b053126..74340b2b0da7 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -23,6 +23,7 @@ 
 #include <linux/thermal.h>
 
 #include <trace/events/thermal.h>
+#include "../../kernel/sched/sched.h"
 
 /*
  * Cooling state <-> CPUFreq frequency
@@ -38,16 +39,6 @@ 
  *	...
  */
 
-/**
- * struct time_in_idle - Idle time stats
- * @time: previous reading of the absolute time that this cpu was idle
- * @timestamp: wall time of the last invocation of get_cpu_idle_time_us()
- */
-struct time_in_idle {
-	u64 time;
-	u64 timestamp;
-};
-
 /**
  * struct cpufreq_cooling_device - data for cooling device with cpufreq
  * @id: unique integer value corresponding to each cpufreq_cooling_device
@@ -62,7 +53,6 @@  struct time_in_idle {
  *	registered cooling device.
  * @policy: cpufreq policy.
  * @node: list_head to link all cpufreq_cooling_device together.
- * @idle_time: idle time stats
  * @qos_req: PM QoS contraint to apply
  *
  * This structure is required for keeping information of each registered
@@ -76,7 +66,6 @@  struct cpufreq_cooling_device {
 	struct em_perf_domain *em;
 	struct cpufreq_policy *policy;
 	struct list_head node;
-	struct time_in_idle *idle_time;
 	struct freq_qos_request qos_req;
 };
 
@@ -132,34 +121,21 @@  static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
 }
 
 /**
- * get_load() - get load for a cpu since last updated
+ * get_load() - get current load for a cpu
  * @cpufreq_cdev:	&struct cpufreq_cooling_device for this cpu
  * @cpu:	cpu number
- * @cpu_idx:	index of the cpu in time_in_idle*
+ * @cpu_idx:	index of the cpu
  *
- * Return: The average load of cpu @cpu in percentage since this
- * function was last called.
+ * Return: The current load of cpu @cpu in percentage.
  */
 static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
 		    int cpu_idx)
 {
-	u32 load;
-	u64 now, now_idle, delta_time, delta_idle;
-	struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx];
-
-	now_idle = get_cpu_idle_time(cpu, &now, 0);
-	delta_idle = now_idle - idle_time->time;
-	delta_time = now - idle_time->timestamp;
+	unsigned long util = cpu_util_cfs(cpu_rq(cpu));
+	unsigned long max = arch_scale_cpu_capacity(cpu);
 
-	if (delta_time <= delta_idle)
-		load = 0;
-	else
-		load = div64_u64(100 * (delta_time - delta_idle), delta_time);
-
-	idle_time->time = now_idle;
-	idle_time->timestamp = now;
-
-	return load;
+	util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);
+	return (util * 100) / max;
 }
 
 /**
@@ -192,13 +168,12 @@  static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_cdev,
  * Instead, we calculate the current power on the assumption that the
  * immediate future will look like the immediate past.
  *
- * We use the current frequency and the average load since this
- * function was last called.  In reality, there could have been
- * multiple opps since this function was last called and that affects
- * the load calculation.  While it's not perfectly accurate, this
- * simplification is good enough and works.  REVISIT this, as more
- * complex code may be needed if experiments show that it's not
- * accurate enough.
+ * We use the current frequency and the current load.  In reality,
+ * there could have been multiple opps since this function was last
+ * called and that affects the load calculation.  While it's not
+ * perfectly accurate, this simplification is good enough and works.
+ * REVISIT this, as more complex code may be needed if experiments show
+ * that it's not accurate enough.
  *
  * Return: 0 on success, -E* if getting the static power failed.
  */
@@ -523,13 +498,6 @@  __cpufreq_cooling_register(struct device_node *np,
 
 	cpufreq_cdev->policy = policy;
 	num_cpus = cpumask_weight(policy->related_cpus);
-	cpufreq_cdev->idle_time = kcalloc(num_cpus,
-					 sizeof(*cpufreq_cdev->idle_time),
-					 GFP_KERNEL);
-	if (!cpufreq_cdev->idle_time) {
-		cdev = ERR_PTR(-ENOMEM);
-		goto free_cdev;
-	}
 
 	/* max_level is an index, not a counter */
 	cpufreq_cdev->max_level = i - 1;
@@ -537,7 +505,7 @@  __cpufreq_cooling_register(struct device_node *np,
 	ret = ida_simple_get(&cpufreq_ida, 0, 0, GFP_KERNEL);
 	if (ret < 0) {
 		cdev = ERR_PTR(ret);
-		goto free_idle_time;
+		goto free_cdev;
 	}
 	cpufreq_cdev->id = ret;
 
@@ -586,8 +554,6 @@  __cpufreq_cooling_register(struct device_node *np,
 	freq_qos_remove_request(&cpufreq_cdev->qos_req);
 remove_ida:
 	ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
-free_idle_time:
-	kfree(cpufreq_cdev->idle_time);
 free_cdev:
 	kfree(cpufreq_cdev);
 	return cdev;
@@ -680,7 +646,6 @@  void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 	thermal_cooling_device_unregister(cdev);
 	freq_qos_remove_request(&cpufreq_cdev->qos_req);
 	ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
-	kfree(cpufreq_cdev->idle_time);
 	kfree(cpufreq_cdev);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);