diff mbox series

[V2,1/3] Calculate Thermal Pressure

Message ID 1555443521-579-2-git-send-email-thara.gopinath@linaro.org
State New
Headers show
Series Introduce Thermal Pressure | expand

Commit Message

Thara Gopinath April 16, 2019, 7:38 p.m. UTC
Add thermal.c and thermal.h files that provides interface
APIs to initialize, update/average, track, accumulate and decay
thermal pressure per cpu basis. A per cpu structure thermal_pressure is
introduced to keep track of instantaneous per cpu thermal pressure.
Per cpu timers are scheduled to accumulate and decay thermal pressure
periodically. Two interfaces are introduced: sched_update_thermal_pressure
to be called from any entity that caps the maximum frequency of a cpu
and sched_get_thermal_pressure to be called by scheduler to get the
thermal pressure of the cpu.

Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>

---
 include/linux/sched/thermal.h |  11 +++
 kernel/sched/Makefile         |   2 +-
 kernel/sched/thermal.c        | 220 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 232 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/thermal.h
 create mode 100644 kernel/sched/thermal.c

-- 
2.1.4

Comments

Quentin Perret April 18, 2019, 10:14 a.m. UTC | #1
On Tuesday 16 Apr 2019 at 15:38:39 (-0400), Thara Gopinath wrote:
> +/**

> + * Function to update thermal pressure from cooling device

> + * or any framework responsible for capping cpu maximum

> + * capacity.

> + */

> +void sched_update_thermal_pressure(struct cpumask *cpus,

> +				   unsigned long cap_max_freq,

> +				   unsigned long max_freq)

> +{

> +	int cpu;

> +	unsigned long flags = 0;

> +	struct thermal_pressure *cpu_thermal;

> +

> +	for_each_cpu(cpu, cpus) {


Is it actually required to do this for each CPU ? You could calculate
the whole thing once for the first CPU, and apply the result to all CPUs
in the policy no ? All CPUs in a policy are capped and uncapped
synchronously.

> +		cpu_thermal = per_cpu(thermal_pressure_cpu, cpu);

> +		if (!cpu_thermal)

> +			return;

> +		spin_lock_irqsave(&cpu_thermal->lock, flags);

> +		thermal_pressure_update(cpu_thermal, cap_max_freq, max_freq, 1);

> +	}

> +}
Thara Gopinath April 24, 2019, 4:13 a.m. UTC | #2
On 04/18/2019 06:14 AM, Quentin Perret wrote:
> On Tuesday 16 Apr 2019 at 15:38:39 (-0400), Thara Gopinath wrote:

>> +/**

>> + * Function to update thermal pressure from cooling device

>> + * or any framework responsible for capping cpu maximum

>> + * capacity.

>> + */

>> +void sched_update_thermal_pressure(struct cpumask *cpus,

>> +				   unsigned long cap_max_freq,

>> +				   unsigned long max_freq)

>> +{

>> +	int cpu;

>> +	unsigned long flags = 0;

>> +	struct thermal_pressure *cpu_thermal;

>> +

>> +	for_each_cpu(cpu, cpus) {

> 

> Is it actually required to do this for each CPU ? You could calculate

> the whole thing once for the first CPU, and apply the result to all CPUs

> in the policy no ? All CPUs in a policy are capped and uncapped

> synchronously.

Hmm. You are right that all cpus in a policy are capped and uncapped
synchronously from the thermal framework point of view. But the thermal
pressure decay can happen at different times for each cpu and hence the
update has to be done on a per cpu basis(especially to keep track of
other age and other variables in the averaging and accumulating
algorithm). It can be separated out but I think it will just make the
solution more complicated.
> 

>> +		cpu_thermal = per_cpu(thermal_pressure_cpu, cpu);

>> +		if (!cpu_thermal)

>> +			return;

>> +		spin_lock_irqsave(&cpu_thermal->lock, flags);

>> +		thermal_pressure_update(cpu_thermal, cap_max_freq, max_freq, 1);

>> +	}

>> +}



-- 
Regards
Thara
Peter Zijlstra April 24, 2019, 4:38 p.m. UTC | #3
On Tue, Apr 16, 2019 at 03:38:39PM -0400, Thara Gopinath wrote:

> +static void thermal_pressure_update(struct thermal_pressure *cpu_thermal,

> +				    unsigned long cap_max_freq,

> +				    unsigned long max_freq, bool change_scale)

> +{

> +	unsigned long flags = 0;

> +

> +	calculate_thermal_pressure(cpu_thermal);

> +	if (change_scale)

> +		cpu_thermal->raw_scale =

> +			(cap_max_freq << SCHED_CAPACITY_SHIFT) / max_freq;


That needs {} per coding style.

> +

> +	mod_timer(&cpu_thermal->timer, jiffies +

> +				usecs_to_jiffies(TICK_USEC));

> +

> +	spin_unlock_irqrestore(&cpu_thermal->lock, flags);


This is busted has heck, @flags should be the result of irqsave(.flags).

> +}

> +

> +/**

> + * Function for the tick update of the thermal pressure.

> + * The thermal pressure update is aborted if already an update is

> + * happening.

> + */

> +static void thermal_pressure_timeout(struct timer_list *timer)

> +{

> +	struct thermal_pressure *cpu_thermal = from_timer(cpu_thermal, timer,

> +							  timer);


If you split after the = the result is so very much easier to read.

> +	unsigned long flags = 0;

> +

> +	if (!cpu_thermal)

> +		return;

> +

> +	if (!spin_trylock_irqsave(&cpu_thermal->lock, flags))

> +		return;

> +

> +	thermal_pressure_update(cpu_thermal, 0, 0, 0);

> +}

> +

> +/**

> + * Function to update thermal pressure from cooling device

> + * or any framework responsible for capping cpu maximum

> + * capacity.


Would be useful to know how wide @cpus is, typically. Is that the power
culster?

> + */

> +void sched_update_thermal_pressure(struct cpumask *cpus,

> +				   unsigned long cap_max_freq,

> +				   unsigned long max_freq)

> +{

> +	int cpu;

> +	unsigned long flags = 0;

> +	struct thermal_pressure *cpu_thermal;


You got them in exactly the wrong order here.

> +

> +	for_each_cpu(cpu, cpus) {

> +		cpu_thermal = per_cpu(thermal_pressure_cpu, cpu);

> +		if (!cpu_thermal)

> +			return;

> +		spin_lock_irqsave(&cpu_thermal->lock, flags);

> +		thermal_pressure_update(cpu_thermal, cap_max_freq, max_freq, 1);

> +	}

> +}


That's just horrible style. Move the unlock out of
thermal_pressure_update() such that lock and unlock are in the same
function.
Peter Zijlstra April 24, 2019, 4:45 p.m. UTC | #4
On Tue, Apr 16, 2019 at 03:38:39PM -0400, Thara Gopinath wrote:
> +static void __init init_thermal_pressure(void)

> +{

> +	struct thermal_pressure *cpu_thermal;

> +	unsigned long scale;

> +	int cpu;

> +

> +	pr_debug("Init thermal pressure\n");

> +	for_each_possible_cpu(cpu) {

> +		cpu_thermal = per_cpu(thermal_pressure_cpu, cpu);

> +		if (cpu_thermal)

> +			continue;

> +

> +		cpu_thermal = kzalloc(sizeof(*cpu_thermal), GFP_KERNEL);

> +		if (!cpu_thermal)

> +			continue;


So if you unconditinoally allocate this memory, why not just have:

DEFINE_PER_CPU(struct thermal_pressure, cpu_thermal_pressure); ?

Or stick it in struct rq or whatever.

> +		scale = SCHED_CAPACITY_SCALE;

> +		cpu_thermal->scale = scale;

> +		cpu_thermal->old_scale = scale;

> +		cpu_thermal->raw_scale = scale;

> +		cpu_thermal->age_stamp = sched_clock_cpu(cpu);

> +		cpu_thermal->last_update = sched_clock_cpu(cpu);

> +		cpu_thermal->cpu = cpu;

> +		spin_lock_init(&cpu_thermal->lock);

> +		timer_setup(&cpu_thermal->timer, thermal_pressure_timeout,

> +			    TIMER_DEFERRABLE);

> +		per_cpu(thermal_pressure_cpu, cpu) = cpu_thermal;

> +		pr_debug("cpu %d thermal scale = %ld\n", cpu, cpu_thermal->scale);

> +	}

> +

> +	for_each_possible_cpu(cpu) {


You just done that, what gives?

> +		cpu_thermal = per_cpu(thermal_pressure_cpu, cpu);

> +		if (!cpu_thermal)

> +			continue;

> +		cpu_thermal->timer.expires = jiffies +

> +						usecs_to_jiffies(TICK_USEC);


That's like a very copmlicated way of writing: '1', right?

> +		add_timer(&cpu_thermal->timer);

> +	}

> +}


So really what you want is a hook into the tick?
Quentin Perret April 25, 2019, 10:57 a.m. UTC | #5
On Tuesday 16 Apr 2019 at 15:38:39 (-0400), Thara Gopinath wrote:
> +/* Per cpu structure to keep track of Thermal Pressure */

> +struct thermal_pressure {

> +	unsigned long scale; /* scale reflecting average cpu max capacity*/

> +	unsigned long acc_scale; /* Accumulated scale for this time window */

> +	unsigned long old_scale; /* Scale value for the previous window */

> +	unsigned long raw_scale; /* Raw max capacity */

> +	unsigned long age_stamp; /* Last time old_scale was updated */

> +	unsigned long last_update; /* Last time acc_scale was updated */

> +	spinlock_t lock; /* Lock for protecting from simultaneous access*/

> +	/* Timer for periodic update of thermal pressure */

> +	struct timer_list timer;


Do you actually need the periodic update ? You only really need to
update the 'scale' value when updating the LB stats no ? Nobody
accesses that value in between two LBs.

Thanks,
Quentin
Vincent Guittot April 25, 2019, 12:45 p.m. UTC | #6
On Thu, 25 Apr 2019 at 12:57, Quentin Perret <quentin.perret@arm.com> wrote:
>

> On Tuesday 16 Apr 2019 at 15:38:39 (-0400), Thara Gopinath wrote:

> > +/* Per cpu structure to keep track of Thermal Pressure */

> > +struct thermal_pressure {

> > +     unsigned long scale; /* scale reflecting average cpu max capacity*/

> > +     unsigned long acc_scale; /* Accumulated scale for this time window */

> > +     unsigned long old_scale; /* Scale value for the previous window */

> > +     unsigned long raw_scale; /* Raw max capacity */

> > +     unsigned long age_stamp; /* Last time old_scale was updated */

> > +     unsigned long last_update; /* Last time acc_scale was updated */

> > +     spinlock_t lock; /* Lock for protecting from simultaneous access*/

> > +     /* Timer for periodic update of thermal pressure */

> > +     struct timer_list timer;

>

> Do you actually need the periodic update ? You only really need to

> update the 'scale' value when updating the LB stats no ? Nobody

> accesses that value in between two LBs.


Do you mean calling a variant of sched_update_thermal_pressure() in
update_cpu_capacity() instead of periodic update ?
Yes , that should be enough

>

> Thanks,

> Quentin
Quentin Perret April 25, 2019, 12:47 p.m. UTC | #7
On Thursday 25 Apr 2019 at 14:45:57 (+0200), Vincent Guittot wrote:
> On Thu, 25 Apr 2019 at 12:57, Quentin Perret <quentin.perret@arm.com> wrote:

> >

> > On Tuesday 16 Apr 2019 at 15:38:39 (-0400), Thara Gopinath wrote:

> > > +/* Per cpu structure to keep track of Thermal Pressure */

> > > +struct thermal_pressure {

> > > +     unsigned long scale; /* scale reflecting average cpu max capacity*/

> > > +     unsigned long acc_scale; /* Accumulated scale for this time window */

> > > +     unsigned long old_scale; /* Scale value for the previous window */

> > > +     unsigned long raw_scale; /* Raw max capacity */

> > > +     unsigned long age_stamp; /* Last time old_scale was updated */

> > > +     unsigned long last_update; /* Last time acc_scale was updated */

> > > +     spinlock_t lock; /* Lock for protecting from simultaneous access*/

> > > +     /* Timer for periodic update of thermal pressure */

> > > +     struct timer_list timer;

> >

> > Do you actually need the periodic update ? You only really need to

> > update the 'scale' value when updating the LB stats no ? Nobody

> > accesses that value in between two LBs.

> 

> Do you mean calling a variant of sched_update_thermal_pressure() in

> update_cpu_capacity() instead of periodic update ?

> Yes , that should be enough


Right something like this, and remove all the timers. Should be a bit
cleaner I guess.

Thanks,
Quentin
Thara Gopinath April 26, 2019, 2:17 p.m. UTC | #8
On 04/25/2019 08:45 AM, Vincent Guittot wrote:
> On Thu, 25 Apr 2019 at 12:57, Quentin Perret <quentin.perret@arm.com> wrote:

>>

>> On Tuesday 16 Apr 2019 at 15:38:39 (-0400), Thara Gopinath wrote:

>>> +/* Per cpu structure to keep track of Thermal Pressure */

>>> +struct thermal_pressure {

>>> +     unsigned long scale; /* scale reflecting average cpu max capacity*/

>>> +     unsigned long acc_scale; /* Accumulated scale for this time window */

>>> +     unsigned long old_scale; /* Scale value for the previous window */

>>> +     unsigned long raw_scale; /* Raw max capacity */

>>> +     unsigned long age_stamp; /* Last time old_scale was updated */

>>> +     unsigned long last_update; /* Last time acc_scale was updated */

>>> +     spinlock_t lock; /* Lock for protecting from simultaneous access*/

>>> +     /* Timer for periodic update of thermal pressure */

>>> +     struct timer_list timer;

>>

>> Do you actually need the periodic update ? You only really need to

>> update the 'scale' value when updating the LB stats no ? Nobody

>> accesses that value in between two LBs.

> 

> Do you mean calling a variant of sched_update_thermal_pressure() in

> update_cpu_capacity() instead of periodic update ?

> Yes , that should be enough


Hi,

I do have some concerns in doing this.
1. Updating thermal pressure does involve some calculations for
accumulating, averaging, decaying etc which in turn could have some
finite and measurable time spent in the function. I am not sure if this
delay will be acceptable for all systems during load balancing (I have
not measured the time involved). We need to decide if this is something
we can live with.

2. More importantly, since update can happen from at least two paths (
thermal fw and periodic timer in case of this patch series)to ensure
mutual exclusion,  the update is done under a spin lock. Again calling
from update_cpu_capacity will involve holding the lock in the load
balance path which is possible not for the best.
For me, updating out of load balance minimizes the disruption to
scheduler on the whole.

But if there is an over whelming support for updating the statistics
from the LB , I can move the code.

Regards
Thara

> 

>>

>> Thanks,

>> Quentin



-- 
Regards
Thara
Quentin Perret May 8, 2019, 12:41 p.m. UTC | #9
Hi Thara,

Sorry for the delayed response.

On Friday 26 Apr 2019 at 10:17:56 (-0400), Thara Gopinath wrote:
> On 04/25/2019 08:45 AM, Vincent Guittot wrote:

> > Do you mean calling a variant of sched_update_thermal_pressure() in

> > update_cpu_capacity() instead of periodic update ?

> > Yes , that should be enough

> 

> Hi,

> 

> I do have some concerns in doing this.

> 1. Updating thermal pressure does involve some calculations for

> accumulating, averaging, decaying etc which in turn could have some

> finite and measurable time spent in the function. I am not sure if this

> delay will be acceptable for all systems during load balancing (I have

> not measured the time involved). We need to decide if this is something

> we can live with.

> 

> 2. More importantly, since update can happen from at least two paths (

> thermal fw and periodic timer in case of this patch series)to ensure

> mutual exclusion,  the update is done under a spin lock. Again calling

> from update_cpu_capacity will involve holding the lock in the load

> balance path which is possible not for the best.

> For me, updating out of load balance minimizes the disruption to

> scheduler on the whole.

> 

> But if there is an over whelming support for updating the statistics

> from the LB , I can move the code.


If I try to clarify my point a little bit, my observation is really that
it's a shame to update the thermal stats often, but to not reflect that
in capacity_of().

So in fact there are two alternatives: 1) do the update only during LB
(which is what I suggested first) to avoid 'useless' work; or 2) reflect
the thermal pressure in the CPU capacity every time the thermal stats
are updated.

And thinking more about it, perhaps 2) is actually a better option? With
this we could try smaller decay periods than the LB interval (which is
most likely useless otherwise) and make sure the capacity considered
during wake-up is up-to-date. This should be a good thing for latency
sensitive tasks I think. (If you consider a task in the Android display
pipeline for example, it needs to run within 16ms or the frame is
missed. So, on wake-up, we'd like to know where the task can run fast
_now_, not according to the capacities the CPUs had 200ms ago or so).

Thoughts ?
Quentin
diff mbox series

Patch

diff --git a/include/linux/sched/thermal.h b/include/linux/sched/thermal.h
new file mode 100644
index 0000000..cda158e
--- /dev/null
+++ b/include/linux/sched/thermal.h
@@ -0,0 +1,11 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_THERMAL_H
+#define _LINUX_SCHED_THERMAL_H
+
+void sched_update_thermal_pressure(struct cpumask *cpus,
+				   unsigned long cap_max_freq,
+				   unsigned long max_freq);
+
+unsigned long sched_get_thermal_pressure(int cpu);
+
+#endif /* _LINUX_SCHED_THERMAL_H */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 21fb5a5..4d3b820 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -20,7 +20,7 @@  obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle.o fair.o rt.o deadline.o
 obj-y += wait.o wait_bit.o swait.o completion.o
 
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o thermal.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/thermal.c b/kernel/sched/thermal.c
new file mode 100644
index 0000000..1acee52
--- /dev/null
+++ b/kernel/sched/thermal.c
@@ -0,0 +1,220 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Sceduler Thermal Interactions
+ *
+ *  Copyright (C) 2018 Linaro, Inc., Thara Gopinath <thara.gopinath@linaro.org>
+ */
+
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include "sched.h"
+
+/* Per cpu structure to keep track of Thermal Pressure */
+struct thermal_pressure {
+	unsigned long scale; /* scale reflecting average cpu max capacity*/
+	unsigned long acc_scale; /* Accumulated scale for this time window */
+	unsigned long old_scale; /* Scale value for the previous window */
+	unsigned long raw_scale; /* Raw max capacity */
+	unsigned long age_stamp; /* Last time old_scale was updated */
+	unsigned long last_update; /* Last time acc_scale was updated */
+	spinlock_t lock; /* Lock for protecting from simultaneous access*/
+	/* Timer for periodic update of thermal pressure */
+	struct timer_list timer;
+	int cpu;
+};
+
+DEFINE_PER_CPU(struct thermal_pressure *, thermal_pressure_cpu);
+
+#define THERMAL_PRESSURE_DECAY_PERIOD	(NSEC_PER_SEC / 2)
+
+static unsigned long calculate_simple(struct thermal_pressure *cpu_thermal,
+				      s64 delta, s64 period)
+{
+	unsigned long scale;
+	s64 decay_period = THERMAL_PRESSURE_DECAY_PERIOD;
+
+	cpu_thermal->acc_scale += delta * cpu_thermal->raw_scale;
+	scale = cpu_thermal->old_scale * decay_period;
+	scale += cpu_thermal->acc_scale;
+	scale /= (decay_period + period);
+	cpu_thermal->last_update += delta;
+
+	return scale;
+}
+
+/*
+ * Calculate thermal pressure.
+ * At the crux this is an averaging algorithm. Intially a tunable
+ * decay period(D) is defined. Thermal pressure at the end of a decay
+ * period D is the average of thermal pressure of period D-1 and D.
+ *
+ * Time             D-2            D-1             D
+ * ----------------------------------------------------------
+ * Raw Thermal  r1          r2            r3
+ * Pressure
+ *
+ * Average Thermal   r1         (r1+r2)/2       ((r1+r2)/2 + r3)/2
+ * Pressure.
+ */
+static void calculate_thermal_pressure(struct thermal_pressure *cpu_thermal)
+{
+	unsigned long scale;
+	s64 now, delta, decay_period, period;
+	int cpu;
+
+	if (!cpu_thermal)
+		return;
+
+	cpu = cpu_thermal->cpu;
+	now = sched_clock_cpu(cpu);
+	period = now - cpu_thermal->age_stamp;
+	decay_period = THERMAL_PRESSURE_DECAY_PERIOD;
+
+	if (period <= 0)
+		return;
+
+	/*
+	 * If period is less than decay_period,
+	 * just accumulate thermal pressure
+	 */
+	if (period < decay_period) {
+		delta = now - cpu_thermal->last_update;
+		scale = calculate_simple(cpu_thermal, delta, period);
+	} else {
+		/* delta here is the remaining time in the last time window */
+		delta = decay_period -
+			(cpu_thermal->last_update - cpu_thermal->age_stamp);
+		scale = calculate_simple(cpu_thermal, delta, decay_period);
+		cpu_thermal->acc_scale = 0;
+		cpu_thermal->age_stamp += decay_period;
+		/* Decay thermal pressure for every decay period remaining */
+		while ((sched_clock_cpu(cpu) - cpu_thermal->age_stamp)
+							> decay_period) {
+			scale += cpu_thermal->raw_scale;
+			scale /= 2;
+			cpu_thermal->age_stamp += decay_period;
+			cpu_thermal->last_update += decay_period;
+		}
+		cpu_thermal->old_scale = scale;
+		delta = sched_clock_cpu(cpu) - cpu_thermal->age_stamp;
+		if (delta > 0)
+			scale = calculate_simple(cpu_thermal, delta, delta);
+	}
+	cpu_thermal->scale = scale;
+}
+
+static void thermal_pressure_update(struct thermal_pressure *cpu_thermal,
+				    unsigned long cap_max_freq,
+				    unsigned long max_freq, bool change_scale)
+{
+	unsigned long flags = 0;
+
+	calculate_thermal_pressure(cpu_thermal);
+	if (change_scale)
+		cpu_thermal->raw_scale =
+			(cap_max_freq << SCHED_CAPACITY_SHIFT) / max_freq;
+
+	mod_timer(&cpu_thermal->timer, jiffies +
+				usecs_to_jiffies(TICK_USEC));
+
+	spin_unlock_irqrestore(&cpu_thermal->lock, flags);
+}
+
+/**
+ * Function for the tick update of the thermal pressure.
+ * The thermal pressure update is aborted if already an update is
+ * happening.
+ */
+static void thermal_pressure_timeout(struct timer_list *timer)
+{
+	struct thermal_pressure *cpu_thermal = from_timer(cpu_thermal, timer,
+							  timer);
+	unsigned long flags = 0;
+
+	if (!cpu_thermal)
+		return;
+
+	if (!spin_trylock_irqsave(&cpu_thermal->lock, flags))
+		return;
+
+	thermal_pressure_update(cpu_thermal, 0, 0, 0);
+}
+
+/**
+ * Function to update thermal pressure from cooling device
+ * or any framework responsible for capping cpu maximum
+ * capacity.
+ */
+void sched_update_thermal_pressure(struct cpumask *cpus,
+				   unsigned long cap_max_freq,
+				   unsigned long max_freq)
+{
+	int cpu;
+	unsigned long flags = 0;
+	struct thermal_pressure *cpu_thermal;
+
+	for_each_cpu(cpu, cpus) {
+		cpu_thermal = per_cpu(thermal_pressure_cpu, cpu);
+		if (!cpu_thermal)
+			return;
+		spin_lock_irqsave(&cpu_thermal->lock, flags);
+		thermal_pressure_update(cpu_thermal, cap_max_freq, max_freq, 1);
+	}
+}
+
+/**
+ * Function to be called from scheduler to get thermal pressure
+ * of a cpu
+ */
+unsigned long sched_get_thermal_pressure(int cpu)
+{
+	struct thermal_pressure *cpu_thermal = per_cpu(thermal_pressure_cpu,
+							cpu);
+
+	if (!cpu_thermal)
+		return SCHED_CAPACITY_SCALE;
+	else
+		return cpu_thermal->scale;
+}
+
+static void __init init_thermal_pressure(void)
+{
+	struct thermal_pressure *cpu_thermal;
+	unsigned long scale;
+	int cpu;
+
+	pr_debug("Init thermal pressure\n");
+	for_each_possible_cpu(cpu) {
+		cpu_thermal = per_cpu(thermal_pressure_cpu, cpu);
+		if (cpu_thermal)
+			continue;
+
+		cpu_thermal = kzalloc(sizeof(*cpu_thermal), GFP_KERNEL);
+		if (!cpu_thermal)
+			continue;
+		scale = SCHED_CAPACITY_SCALE;
+		cpu_thermal->scale = scale;
+		cpu_thermal->old_scale = scale;
+		cpu_thermal->raw_scale = scale;
+		cpu_thermal->age_stamp = sched_clock_cpu(cpu);
+		cpu_thermal->last_update = sched_clock_cpu(cpu);
+		cpu_thermal->cpu = cpu;
+		spin_lock_init(&cpu_thermal->lock);
+		timer_setup(&cpu_thermal->timer, thermal_pressure_timeout,
+			    TIMER_DEFERRABLE);
+		per_cpu(thermal_pressure_cpu, cpu) = cpu_thermal;
+		pr_debug("cpu %d thermal scale = %ld\n", cpu, cpu_thermal->scale);
+	}
+
+	for_each_possible_cpu(cpu) {
+		cpu_thermal = per_cpu(thermal_pressure_cpu, cpu);
+		if (!cpu_thermal)
+			continue;
+		cpu_thermal->timer.expires = jiffies +
+						usecs_to_jiffies(TICK_USEC);
+		add_timer(&cpu_thermal->timer);
+	}
+}
+
+late_initcall(init_thermal_pressure);