[V2,2/2] thermal: cpufreq_cooling: Reuse sched_cpu_util()

Message ID 11e7c7dcb07ae258fa02e187c9697252f3835466.1603448113.git.viresh.kumar@linaro.org
State New
Series cpufreq_cooling: Get effective CPU utilization from scheduler

Commit Message

Viresh Kumar Oct. 23, 2020, 10:20 a.m. UTC
Several parts of the kernel already use the effective CPU utilization
(as seen by the scheduler) to get the current load on the CPU. Do the
same here instead of depending on the idle time of the CPU, which is
comparatively less accurate.

Note that this (and CPU frequency scaling in general) doesn't work that
well with idle injection, as that is done from RT threads and is counted
as load while it tries to do quite the opposite. That should be solved
separately, though.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/thermal/cpufreq_cooling.c | 70 +++++++------------------------
 1 file changed, 16 insertions(+), 54 deletions(-)

Comments

Peter Zijlstra Oct. 23, 2020, 10:37 a.m. UTC | #1
On Fri, Oct 23, 2020 at 03:50:21PM +0530, Viresh Kumar wrote:
> Several parts of the kernel already use the effective CPU utilization
> (as seen by the scheduler) to get the current load on the CPU. Do the
> same here instead of depending on the idle time of the CPU, which is
> comparatively less accurate.
> 
> Note that this (and CPU frequency scaling in general) doesn't work that
> well with idle injection, as that is done from RT threads and is counted
> as load while it tries to do quite the opposite. That should be solved
> separately, though.

Actual numbers that show the goodness would be nice ;-) Because clearly
we're doing this to make it better.

Viresh Kumar Nov. 12, 2020, 9:41 a.m. UTC | #2
On 23-10-20, 12:37, Peter Zijlstra wrote:
> Actual numbers that show the goodness would be nice ;-) Because clearly
> we're doing this to make it better.

Hi Peter,

I tried the patchset with hackbench, sysbench and schbench. None of them showed
any regression or significant improvement. schbench was the one I was most
hopeful about, as it creates the scenario where the utilization numbers provide
a better estimate of the future.

Scenario 1: The CPUs were mostly idle in the previous polling window of the IPA
governor as the tasks were sleeping. Here are the details from the traces (load
is in %):

   thermal_power_cpu_get_power: Old: cpus=00000000,000000ff freq=1200000 total_load=203 load={{0x35,0x1,0x0,0x31,0x0,0x0,0x64,0x0}} dynamic_power=1339
   thermal_power_cpu_get_power: New: cpus=00000000,000000ff freq=1200000 total_load=600 load={{0x60,0x46,0x45,0x45,0x48,0x3b,0x61,0x44}} dynamic_power=3960

Here, the "Old" line gives the load and requested_power (dynamic_power here)
numbers calculated using the idle time based implementation. And "New" is based
on CPU utilization from this patchset.
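
(As a cross-check, total_load is simply the sum of the eight per-CPU load values,
which are printed in hex: for the "Old" line, 0x35 + 0x1 + 0x31 + 0x64 =
53 + 1 + 49 + 100 = 203, and for the "New" line the values
96 + 70 + 69 + 69 + 72 + 59 + 97 + 68 add up to 600.)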

As can be clearly seen, the load and requested_power numbers are simply
incorrect in the idle-time based approach, while the numbers derived from the
CPU utilization are much better and also match the expectations of the
schedutil governor.

Scenario 2: The CPUs were busy in the previous polling window of the IPA
governor:

   thermal_power_cpu_get_power: Old: cpus=00000000,000000ff freq=1200000 total_load=800 load={{0x64,0x64,0x64,0x64,0x64,0x64,0x64,0x64}} dynamic_power=5280
   thermal_power_cpu_get_power: New: cpus=00000000,000000ff freq=1200000 total_load=708 load={{0x4d,0x5c,0x5c,0x5b,0x5c,0x5c,0x51,0x5b}} dynamic_power=4672

As can be seen, the idle-time based load is 100% for all the CPUs, as it takes
only the last window into account, but in reality the CPUs aren't that loaded,
as shown by the utilization numbers.
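
To make that gap concrete (roughly 88% on average from the "New" line versus a
flat 100%), here is a minimal user-space style sketch of the two load formulas;
the function names and example values are mine, only the math comes from the
old and new get_load() in the patch below:

   /* Illustration only (not kernel code): the old and new load formulas. */
   #include <stdio.h>

   /* Old: busy fraction of the last polling window only. A CPU that was
    * busy for the whole window has delta_idle == 0 and always reports 100. */
   static unsigned int idle_time_load(unsigned long long delta_time,
                                      unsigned long long delta_idle)
   {
           if (delta_time <= delta_idle)
                   return 0;
           return 100 * (delta_time - delta_idle) / delta_time;
   }

   /* New: scheduler utilization as a percentage of CPU capacity, which PELT
    * averages over a longer horizon than one polling window. */
   static unsigned int util_load(unsigned long util, unsigned long max)
   {
           return util * 100 / max;
   }

   int main(void)
   {
           /* Example: no idle time in the window, but util at 944 of 1024. */
           printf("old: %u%%, new: %u%%\n",
                  idle_time_load(100000, 0), util_load(944, 1024));
           return 0;
   }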

Though this patchset improves the power estimation done by the cpufreq_cooling
driver (which now also matches the frequency scaling governor, schedutil), the
IPA governor doesn't necessarily benefit from that correctness, as it chooses
the next cooling state based on multiple factors, like the current temperature,
target temperature, requested_power and all the power players (who request
power from it), etc. The algorithm there is complex and I am afraid the
improved numbers here don't necessarily translate to better numbers for
benchmarks like schbench. Another factor can be the IPA's tuning for my
platform (Hikey6220).

Irrespective of the IPA governor, the estimates provided by the cpufreq_cooling
driver do improve a lot with this patchset and are better aligned with the
schedutil governor, and I believe it would be better to merge this nevertheless.
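
For reference on that alignment: schedutil maps the very same per-CPU
utilization to a target frequency with roughly 25% of headroom (this mirrors
map_util_freq()), so a load percentage derived from sched_cpu_util() reflects
the frequency the CPU is actually going to be asked to run at. A rough sketch
with illustrative values (the wrapper name below is mine, not the kernel's):

   /* Illustration only: schedutil-style frequency selection from the same
    * utilization value that the new get_load() consumes. */
   #include <stdio.h>

   /* Mirrors map_util_freq(): next_freq ~= 1.25 * max_freq * util / capacity */
   static unsigned long schedutil_next_freq(unsigned long util, unsigned long cap,
                                            unsigned long max_freq)
   {
           return (max_freq + (max_freq >> 2)) * util / cap;
   }

   int main(void)
   {
           /* util at ~50% of a 1024-capacity CPU, 1.2 GHz max frequency (kHz) */
           printf("target ~= %lu kHz\n", schedutil_next_freq(512, 1024, 1200000));
           return 0;
   }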

I have already prepared the next version, which takes care of the !SMP case; I
was just holding it off while trying to get some numbers out.

-- 
viresh

Patch

diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index cc2959f22f01..1315e4d4758b 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -19,6 +19,7 @@ 
 #include <linux/idr.h>
 #include <linux/pm_opp.h>
 #include <linux/pm_qos.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/thermal.h>
 
@@ -38,16 +39,6 @@ 
  *	...
  */
 
-/**
- * struct time_in_idle - Idle time stats
- * @time: previous reading of the absolute time that this cpu was idle
- * @timestamp: wall time of the last invocation of get_cpu_idle_time_us()
- */
-struct time_in_idle {
-	u64 time;
-	u64 timestamp;
-};
-
 /**
  * struct cpufreq_cooling_device - data for cooling device with cpufreq
  * @id: unique integer value corresponding to each cpufreq_cooling_device
@@ -62,7 +53,6 @@  struct time_in_idle {
  *	registered cooling device.
  * @policy: cpufreq policy.
  * @node: list_head to link all cpufreq_cooling_device together.
- * @idle_time: idle time stats
  * @qos_req: PM QoS contraint to apply
  *
  * This structure is required for keeping information of each registered
@@ -76,7 +66,6 @@  struct cpufreq_cooling_device {
 	struct em_perf_domain *em;
 	struct cpufreq_policy *policy;
 	struct list_head node;
-	struct time_in_idle *idle_time;
 	struct freq_qos_request qos_req;
 };
 
@@ -132,34 +121,18 @@  static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
 }
 
 /**
- * get_load() - get load for a cpu since last updated
- * @cpufreq_cdev:	&struct cpufreq_cooling_device for this cpu
+ * get_load() - get current load for a cpu
  * @cpu:	cpu number
- * @cpu_idx:	index of the cpu in time_in_idle*
  *
- * Return: The average load of cpu @cpu in percentage since this
- * function was last called.
+ * Return: The current load of cpu @cpu in percentage.
  */
-static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
-		    int cpu_idx)
+static u32 get_load(int cpu)
 {
-	u32 load;
-	u64 now, now_idle, delta_time, delta_idle;
-	struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx];
-
-	now_idle = get_cpu_idle_time(cpu, &now, 0);
-	delta_idle = now_idle - idle_time->time;
-	delta_time = now - idle_time->timestamp;
+	unsigned long max = arch_scale_cpu_capacity(cpu);
+	unsigned long util;
 
-	if (delta_time <= delta_idle)
-		load = 0;
-	else
-		load = div64_u64(100 * (delta_time - delta_idle), delta_time);
-
-	idle_time->time = now_idle;
-	idle_time->timestamp = now;
-
-	return load;
+	util = sched_cpu_util(cpu, ENERGY_UTIL, max);
+	return (util * 100) / max;
 }
 
 /**
@@ -191,13 +164,12 @@  static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_cdev,
  * Instead, we calculate the current power on the assumption that the
  * immediate future will look like the immediate past.
  *
- * We use the current frequency and the average load since this
- * function was last called.  In reality, there could have been
- * multiple opps since this function was last called and that affects
- * the load calculation.  While it's not perfectly accurate, this
- * simplification is good enough and works.  REVISIT this, as more
- * complex code may be needed if experiments show that it's not
- * accurate enough.
+ * We use the current frequency and the current load.  In reality,
+ * there could have been multiple opps since this function was last
+ * called and that affects the load calculation.  While it's not
+ * perfectly accurate, this simplification is good enough and works.
+ * REVISIT this, as more complex code may be needed if experiments show
+ * that it's not accurate enough.
  *
  * Return: 0 on success, -E* if getting the static power failed.
  */
@@ -223,7 +195,7 @@  static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
 		u32 load;
 
 		if (cpu_online(cpu))
-			load = get_load(cpufreq_cdev, cpu, i);
+			load = get_load(cpu);
 		else
 			load = 0;
 
@@ -517,13 +489,6 @@  __cpufreq_cooling_register(struct device_node *np,
 
 	cpufreq_cdev->policy = policy;
 	num_cpus = cpumask_weight(policy->related_cpus);
-	cpufreq_cdev->idle_time = kcalloc(num_cpus,
-					 sizeof(*cpufreq_cdev->idle_time),
-					 GFP_KERNEL);
-	if (!cpufreq_cdev->idle_time) {
-		cdev = ERR_PTR(-ENOMEM);
-		goto free_cdev;
-	}
 
 	/* max_level is an index, not a counter */
 	cpufreq_cdev->max_level = i - 1;
@@ -531,7 +496,7 @@  __cpufreq_cooling_register(struct device_node *np,
 	ret = ida_simple_get(&cpufreq_ida, 0, 0, GFP_KERNEL);
 	if (ret < 0) {
 		cdev = ERR_PTR(ret);
-		goto free_idle_time;
+		goto free_cdev;
 	}
 	cpufreq_cdev->id = ret;
 
@@ -580,8 +545,6 @@  __cpufreq_cooling_register(struct device_node *np,
 	freq_qos_remove_request(&cpufreq_cdev->qos_req);
 remove_ida:
 	ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
-free_idle_time:
-	kfree(cpufreq_cdev->idle_time);
 free_cdev:
 	kfree(cpufreq_cdev);
 	return cdev;
@@ -674,7 +637,6 @@  void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 	thermal_cooling_device_unregister(cdev);
 	freq_qos_remove_request(&cpufreq_cdev->qos_req);
 	ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
-	kfree(cpufreq_cdev->idle_time);
 	kfree(cpufreq_cdev);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);