[v3,2/7] sched: Add infrastructure to store and update instantaneous thermal pressure

Message ID 1571014705-19646-3-git-send-email-thara.gopinath@linaro.org
State New
Headers show
Series
  • Introduce Thermal Pressure
Related show

Commit Message

Thara Gopinath Oct. 14, 2019, 12:58 a.m.
Add thermal.c and thermal.h files that provides interface
APIs to initialize, update/average, track, accumulate and decay
thermal pressure per cpu basis. A per cpu structure max_capacity_info is
introduced to keep track of instantaneous per cpu thermal pressure.
Thermal pressure is the delta between max_capacity and cap_capacity.
API update_periodic_maxcap is called for periodic accumulate and decay
of the thermal pressure. It is to to be called from a periodic tick
function. This API calculates the delta between max_capacity and
cap_capacity and passes on the delta to update_thermal_avg to do the
necessary accumulate, decay and average. API update_maxcap_capacity is for
the system to update the thermal pressure by updating cap_capacity.
Considering, update_periodic_maxcap reads cap_capacity and
update_maxcap_capacity writes into cap_capacity, one can argue for
some sort of locking mechanism to avoid a stale value.
But considering update_periodic_maxcap can be called from a system
critical path like scheduler tick function, a locking mechanism is not
ideal. This means that it is possible the value used to
calculate average thermal pressure for a cpu can be stale for upto 1
tick period.

Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>

---
 include/linux/sched.h  | 14 +++++++++++
 kernel/sched/Makefile  |  2 +-
 kernel/sched/thermal.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/thermal.h | 13 ++++++++++
 4 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 kernel/sched/thermal.c
 create mode 100644 kernel/sched/thermal.h

-- 
2.1.4

Comments

Vincent Guittot Oct. 14, 2019, 3:50 p.m. | #1
Hi Thara,

On Mon, 14 Oct 2019 at 02:58, Thara Gopinath <thara.gopinath@linaro.org> wrote:
>

> Add thermal.c and thermal.h files that provides interface

> APIs to initialize, update/average, track, accumulate and decay

> thermal pressure per cpu basis. A per cpu structure max_capacity_info is

> introduced to keep track of instantaneous per cpu thermal pressure.

> Thermal pressure is the delta between max_capacity and cap_capacity.

> API update_periodic_maxcap is called for periodic accumulate and decay

> of the thermal pressure. It is to to be called from a periodic tick

> function. This API calculates the delta between max_capacity and

> cap_capacity and passes on the delta to update_thermal_avg to do the

> necessary accumulate, decay and average. API update_maxcap_capacity is for

> the system to update the thermal pressure by updating cap_capacity.

> Considering, update_periodic_maxcap reads cap_capacity and

> update_maxcap_capacity writes into cap_capacity, one can argue for

> some sort of locking mechanism to avoid a stale value.

> But considering update_periodic_maxcap can be called from a system

> critical path like scheduler tick function, a locking mechanism is not

> ideal. This means that it is possible the value used to

> calculate average thermal pressure for a cpu can be stale for upto 1

> tick period.

>

> Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>

> ---

>  include/linux/sched.h  | 14 +++++++++++

>  kernel/sched/Makefile  |  2 +-

>  kernel/sched/thermal.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++

>  kernel/sched/thermal.h | 13 ++++++++++

>  4 files changed, 94 insertions(+), 1 deletion(-)

>  create mode 100644 kernel/sched/thermal.c

>  create mode 100644 kernel/sched/thermal.h

>

> diff --git a/include/linux/sched.h b/include/linux/sched.h

> index 2c2e56b..875ce2b 100644

> --- a/include/linux/sched.h

> +++ b/include/linux/sched.h

> @@ -1983,6 +1983,20 @@ static inline void rseq_syscall(struct pt_regs *regs)

>

>  #endif

>

> +#ifdef CONFIG_SMP

> +void update_maxcap_capacity(int cpu, u64 capacity);

> +

> +void populate_max_capacity_info(void);

> +#else

> +static inline void update_maxcap_capacity(int cpu, u64 capacity)

> +{

> +}

> +

> +static inline void populate_max_capacity_info(void)

> +{

> +}

> +#endif

> +

>  const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);

>  char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);

>  int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);

> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile

> index 21fb5a5..4d3b820 100644

> --- a/kernel/sched/Makefile

> +++ b/kernel/sched/Makefile

> @@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o

>  obj-y += idle.o fair.o rt.o deadline.o

>  obj-y += wait.o wait_bit.o swait.o completion.o

>

> -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o

> +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o thermal.o

>  obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o

>  obj-$(CONFIG_SCHEDSTATS) += stats.o

>  obj-$(CONFIG_SCHED_DEBUG) += debug.o

> diff --git a/kernel/sched/thermal.c b/kernel/sched/thermal.c

> new file mode 100644

> index 0000000..5f0b2d4

> --- /dev/null

> +++ b/kernel/sched/thermal.c

> @@ -0,0 +1,66 @@

> +// SPDX-License-Identifier: GPL-2.0

> +/*

> + * Sceduler Thermal Interactions

> + *

> + *  Copyright (C) 2018 Linaro, Inc., Thara Gopinath <thara.gopinath@linaro.org>

> + */

> +

> +#include <linux/sched.h>

> +#include "sched.h"

> +#include "pelt.h"

> +#include "thermal.h"

> +

> +struct max_capacity_info {

> +       unsigned long max_capacity;

> +       unsigned long cap_capacity;

> +};

> +

> +static DEFINE_PER_CPU(struct max_capacity_info, max_cap);

> +

> +void update_maxcap_capacity(int cpu, u64 capacity)

> +{

> +       struct max_capacity_info *__max_cap;

> +       unsigned long __capacity;

> +

> +       __max_cap = (&per_cpu(max_cap, cpu));

> +       if (!__max_cap) {

> +               pr_err("no max_capacity_info structure for cpu %d\n", cpu);

> +               return;

> +       }

> +

> +       /* Normalize the capacity */

> +       __capacity = (capacity * arch_scale_cpu_capacity(cpu)) >>

> +                                                       SCHED_CAPACITY_SHIFT;

> +       pr_debug("updating cpu%d capped capacity from %lu to %lu\n", cpu, __max_cap->cap_capacity, __capacity);

> +

> +       __max_cap->cap_capacity = __capacity;

> +}

> +

> +void populate_max_capacity_info(void)

> +{

> +       struct max_capacity_info *__max_cap;

> +       u64 capacity;

> +       int cpu;

> +

> +       for_each_possible_cpu(cpu) {

> +               __max_cap = (&per_cpu(max_cap, cpu));

> +               if (!__max_cap)

> +                       continue;

> +               capacity = arch_scale_cpu_capacity(cpu);

> +               __max_cap->max_capacity = capacity;

> +               __max_cap->cap_capacity = capacity;

> +               pr_debug("cpu %d max capacity set to %ld\n", cpu, __max_cap->max_capacity);

> +       }

> +}


everything above seems to be there for the cpu cooling device and
should be included in it instead. The scheduler only need the capacity
capping
The cpu cooling device should just set the delta capacity in the
per-cpu variable (see my comment below)

> +

> +void update_periodic_maxcap(struct rq *rq)

> +{

> +       struct max_capacity_info *__max_cap = (&per_cpu(max_cap, cpu_of(rq)));

> +       unsigned long delta;

> +

> +       if (!__max_cap)

> +               return;

> +

> +       delta = __max_cap->max_capacity - __max_cap->cap_capacity;


Why don't you just save the delta in the per_cpu variable instead of
the struct max_capacity_info ? You have to compute the delta every
tick whereas we can expect it to not change that much compared to the
number of tick

> +       update_thermal_avg(rq_clock_task(rq), rq, delta);

> +}

> diff --git a/kernel/sched/thermal.h b/kernel/sched/thermal.h

> new file mode 100644

> index 0000000..5657cb4

> --- /dev/null

> +++ b/kernel/sched/thermal.h

> @@ -0,0 +1,13 @@

> +/* SPDX-License-Identifier: GPL-2.0 */

> +/*

> + * Scheduler thermal interaction internal methods.

> + */

> +

> +#ifdef CONFIG_SMP

> +void update_periodic_maxcap(struct rq *rq);

> +

> +#else

> +static inline void update_periodic_maxcap(struct rq *rq)

> +{

> +}

> +#endif

> --

> 2.1.4

>
Thara Gopinath Oct. 16, 2019, 9:22 p.m. | #2
Hi Vincent,

Thanks for the review
On 10/14/2019 11:50 AM, Vincent Guittot wrote:
> Hi Thara,

> 

> On Mon, 14 Oct 2019 at 02:58, Thara Gopinath <thara.gopinath@linaro.org> wrote:

>>

>> Add thermal.c and thermal.h files that provides interface

>> APIs to initialize, update/average, track, accumulate and decay

>> thermal pressure per cpu basis. A per cpu structure max_capacity_info is

>> introduced to keep track of instantaneous per cpu thermal pressure.

>> Thermal pressure is the delta between max_capacity and cap_capacity.

>> API update_periodic_maxcap is called for periodic accumulate and decay

>> of the thermal pressure. It is to to be called from a periodic tick

>> function. This API calculates the delta between max_capacity and

>> cap_capacity and passes on the delta to update_thermal_avg to do the

>> necessary accumulate, decay and average. API update_maxcap_capacity is for

>> the system to update the thermal pressure by updating cap_capacity.

>> Considering, update_periodic_maxcap reads cap_capacity and

>> update_maxcap_capacity writes into cap_capacity, one can argue for

>> some sort of locking mechanism to avoid a stale value.

>> But considering update_periodic_maxcap can be called from a system

>> critical path like scheduler tick function, a locking mechanism is not

>> ideal. This means that it is possible the value used to

>> calculate average thermal pressure for a cpu can be stale for upto 1

>> tick period.

>>

>> Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>

>> ---

>>  include/linux/sched.h  | 14 +++++++++++

>>  kernel/sched/Makefile  |  2 +-

>>  kernel/sched/thermal.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++

>>  kernel/sched/thermal.h | 13 ++++++++++

>>  4 files changed, 94 insertions(+), 1 deletion(-)

>>  create mode 100644 kernel/sched/thermal.c

>>  create mode 100644 kernel/sched/thermal.h

>>

>> diff --git a/include/linux/sched.h b/include/linux/sched.h

>> index 2c2e56b..875ce2b 100644

>> --- a/include/linux/sched.h

>> +++ b/include/linux/sched.h

>> @@ -1983,6 +1983,20 @@ static inline void rseq_syscall(struct pt_regs *regs)

>>

>>  #endif

>>

>> +#ifdef CONFIG_SMP

>> +void update_maxcap_capacity(int cpu, u64 capacity);

>> +

>> +void populate_max_capacity_info(void);

>> +#else

>> +static inline void update_maxcap_capacity(int cpu, u64 capacity)

>> +{

>> +}

>> +

>> +static inline void populate_max_capacity_info(void)

>> +{

>> +}

>> +#endif

>> +

>>  const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);

>>  char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);

>>  int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);

>> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile

>> index 21fb5a5..4d3b820 100644

>> --- a/kernel/sched/Makefile

>> +++ b/kernel/sched/Makefile

>> @@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o

>>  obj-y += idle.o fair.o rt.o deadline.o

>>  obj-y += wait.o wait_bit.o swait.o completion.o

>>

>> -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o

>> +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o thermal.o

>>  obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o

>>  obj-$(CONFIG_SCHEDSTATS) += stats.o

>>  obj-$(CONFIG_SCHED_DEBUG) += debug.o

>> diff --git a/kernel/sched/thermal.c b/kernel/sched/thermal.c

>> new file mode 100644

>> index 0000000..5f0b2d4

>> --- /dev/null

>> +++ b/kernel/sched/thermal.c

>> @@ -0,0 +1,66 @@

>> +// SPDX-License-Identifier: GPL-2.0

>> +/*

>> + * Sceduler Thermal Interactions

>> + *

>> + *  Copyright (C) 2018 Linaro, Inc., Thara Gopinath <thara.gopinath@linaro.org>

>> + */

>> +

>> +#include <linux/sched.h>

>> +#include "sched.h"

>> +#include "pelt.h"

>> +#include "thermal.h"

>> +

>> +struct max_capacity_info {

>> +       unsigned long max_capacity;

>> +       unsigned long cap_capacity;

>> +};

>> +

>> +static DEFINE_PER_CPU(struct max_capacity_info, max_cap);

>> +

>> +void update_maxcap_capacity(int cpu, u64 capacity)

>> +{

>> +       struct max_capacity_info *__max_cap;

>> +       unsigned long __capacity;

>> +

>> +       __max_cap = (&per_cpu(max_cap, cpu));

>> +       if (!__max_cap) {

>> +               pr_err("no max_capacity_info structure for cpu %d\n", cpu);

>> +               return;

>> +       }

>> +

>> +       /* Normalize the capacity */

>> +       __capacity = (capacity * arch_scale_cpu_capacity(cpu)) >>

>> +                                                       SCHED_CAPACITY_SHIFT;

>> +       pr_debug("updating cpu%d capped capacity from %lu to %lu\n", cpu, __max_cap->cap_capacity, __capacity);

>> +

>> +       __max_cap->cap_capacity = __capacity;

>> +}

>> +

>> +void populate_max_capacity_info(void)

>> +{

>> +       struct max_capacity_info *__max_cap;

>> +       u64 capacity;

>> +       int cpu;

>> +

>> +       for_each_possible_cpu(cpu) {

>> +               __max_cap = (&per_cpu(max_cap, cpu));

>> +               if (!__max_cap)

>> +                       continue;

>> +               capacity = arch_scale_cpu_capacity(cpu);

>> +               __max_cap->max_capacity = capacity;

>> +               __max_cap->cap_capacity = capacity;

>> +               pr_debug("cpu %d max capacity set to %ld\n", cpu, __max_cap->max_capacity);

>> +       }

>> +}

> 

> everything above seems to be there for the cpu cooling device and

> should be included in it instead. The scheduler only need the capacity

> capping

> The cpu cooling device should just set the delta capacity in the

> per-cpu variable (see my comment below)

It can be a firmware  updating the thermal pressure instead of cpu
cooling device. Or may be some other entity .So instead of replicating
this code, isnt't it better to reside in one place ?
> 

>> +

>> +void update_periodic_maxcap(struct rq *rq)

>> +{

>> +       struct max_capacity_info *__max_cap = (&per_cpu(max_cap, cpu_of(rq)));

>> +       unsigned long delta;

>> +

>> +       if (!__max_cap)

>> +               return;

>> +

>> +       delta = __max_cap->max_capacity - __max_cap->cap_capacity;

> 

> Why don't you just save the delta in the per_cpu variable instead of

> the struct max_capacity_info ? You have to compute the delta every

> tick whereas we can expect it to not change that much compared to the

> number of tick.


Again I think thermal pressure can be applied from other entities like
firmware as well. But  I agree with your point that calculating delta
every tick is not a good idea. How about I move it to
update_maxcap_capacity so that delta is calculate every time an update
comes from cpu cooling device or anybody else ?

Warm Regards
Thara
Vincent Guittot Oct. 17, 2019, 8:44 a.m. | #3
Hi Thara,

On Wed, 16 Oct 2019 at 23:22, Thara Gopinath <thara.gopinath@linaro.org> wrote:
>

> Hi Vincent,

>

> Thanks for the review

> On 10/14/2019 11:50 AM, Vincent Guittot wrote:

> > Hi Thara,

> >

> > On Mon, 14 Oct 2019 at 02:58, Thara Gopinath <thara.gopinath@linaro.org> wrote:

> >>

> >> Add thermal.c and thermal.h files that provides interface

> >> APIs to initialize, update/average, track, accumulate and decay

> >> thermal pressure per cpu basis. A per cpu structure max_capacity_info is

> >> introduced to keep track of instantaneous per cpu thermal pressure.

> >> Thermal pressure is the delta between max_capacity and cap_capacity.

> >> API update_periodic_maxcap is called for periodic accumulate and decay

> >> of the thermal pressure. It is to to be called from a periodic tick

> >> function. This API calculates the delta between max_capacity and

> >> cap_capacity and passes on the delta to update_thermal_avg to do the

> >> necessary accumulate, decay and average. API update_maxcap_capacity is for

> >> the system to update the thermal pressure by updating cap_capacity.

> >> Considering, update_periodic_maxcap reads cap_capacity and

> >> update_maxcap_capacity writes into cap_capacity, one can argue for

> >> some sort of locking mechanism to avoid a stale value.

> >> But considering update_periodic_maxcap can be called from a system

> >> critical path like scheduler tick function, a locking mechanism is not

> >> ideal. This means that it is possible the value used to

> >> calculate average thermal pressure for a cpu can be stale for upto 1

> >> tick period.

> >>

> >> Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>

> >> ---

> >>  include/linux/sched.h  | 14 +++++++++++

> >>  kernel/sched/Makefile  |  2 +-

> >>  kernel/sched/thermal.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++

> >>  kernel/sched/thermal.h | 13 ++++++++++

> >>  4 files changed, 94 insertions(+), 1 deletion(-)

> >>  create mode 100644 kernel/sched/thermal.c

> >>  create mode 100644 kernel/sched/thermal.h

> >>

> >> diff --git a/include/linux/sched.h b/include/linux/sched.h

> >> index 2c2e56b..875ce2b 100644

> >> --- a/include/linux/sched.h

> >> +++ b/include/linux/sched.h

> >> @@ -1983,6 +1983,20 @@ static inline void rseq_syscall(struct pt_regs *regs)

> >>

> >>  #endif

> >>

> >> +#ifdef CONFIG_SMP

> >> +void update_maxcap_capacity(int cpu, u64 capacity);

> >> +

> >> +void populate_max_capacity_info(void);

> >> +#else

> >> +static inline void update_maxcap_capacity(int cpu, u64 capacity)

> >> +{

> >> +}

> >> +

> >> +static inline void populate_max_capacity_info(void)

> >> +{

> >> +}

> >> +#endif

> >> +

> >>  const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);

> >>  char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);

> >>  int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);

> >> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile

> >> index 21fb5a5..4d3b820 100644

> >> --- a/kernel/sched/Makefile

> >> +++ b/kernel/sched/Makefile

> >> @@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o

> >>  obj-y += idle.o fair.o rt.o deadline.o

> >>  obj-y += wait.o wait_bit.o swait.o completion.o

> >>

> >> -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o

> >> +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o thermal.o

> >>  obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o

> >>  obj-$(CONFIG_SCHEDSTATS) += stats.o

> >>  obj-$(CONFIG_SCHED_DEBUG) += debug.o

> >> diff --git a/kernel/sched/thermal.c b/kernel/sched/thermal.c

> >> new file mode 100644

> >> index 0000000..5f0b2d4

> >> --- /dev/null

> >> +++ b/kernel/sched/thermal.c

> >> @@ -0,0 +1,66 @@

> >> +// SPDX-License-Identifier: GPL-2.0

> >> +/*

> >> + * Sceduler Thermal Interactions

> >> + *

> >> + *  Copyright (C) 2018 Linaro, Inc., Thara Gopinath <thara.gopinath@linaro.org>

> >> + */

> >> +

> >> +#include <linux/sched.h>

> >> +#include "sched.h"

> >> +#include "pelt.h"

> >> +#include "thermal.h"

> >> +

> >> +struct max_capacity_info {

> >> +       unsigned long max_capacity;

> >> +       unsigned long cap_capacity;

> >> +};

> >> +

> >> +static DEFINE_PER_CPU(struct max_capacity_info, max_cap);

> >> +

> >> +void update_maxcap_capacity(int cpu, u64 capacity)

> >> +{

> >> +       struct max_capacity_info *__max_cap;

> >> +       unsigned long __capacity;

> >> +

> >> +       __max_cap = (&per_cpu(max_cap, cpu));

> >> +       if (!__max_cap) {

> >> +               pr_err("no max_capacity_info structure for cpu %d\n", cpu);

> >> +               return;

> >> +       }

> >> +

> >> +       /* Normalize the capacity */

> >> +       __capacity = (capacity * arch_scale_cpu_capacity(cpu)) >>

> >> +                                                       SCHED_CAPACITY_SHIFT;

> >> +       pr_debug("updating cpu%d capped capacity from %lu to %lu\n", cpu, __max_cap->cap_capacity, __capacity);

> >> +

> >> +       __max_cap->cap_capacity = __capacity;

> >> +}

> >> +

> >> +void populate_max_capacity_info(void)

> >> +{

> >> +       struct max_capacity_info *__max_cap;

> >> +       u64 capacity;

> >> +       int cpu;

> >> +

> >> +       for_each_possible_cpu(cpu) {

> >> +               __max_cap = (&per_cpu(max_cap, cpu));

> >> +               if (!__max_cap)

> >> +                       continue;

> >> +               capacity = arch_scale_cpu_capacity(cpu);

> >> +               __max_cap->max_capacity = capacity;

> >> +               __max_cap->cap_capacity = capacity;

> >> +               pr_debug("cpu %d max capacity set to %ld\n", cpu, __max_cap->max_capacity);

> >> +       }

> >> +}

> >

> > everything above seems to be there for the cpu cooling device and

> > should be included in it instead. The scheduler only need the capacity

> > capping

> > The cpu cooling device should just set the delta capacity in the

> > per-cpu variable (see my comment below)

> It can be a firmware  updating the thermal pressure instead of cpu

> cooling device. Or may be some other entity .So instead of replicating

> this code, isnt't it better to reside in one place ?


But you have now idea which kind of metrics will be provided by
firmware. It might not be a capped frequency and a max frequency but
other metrics.
Also, __max_cap->max_capacity just save in a new per cpu struct the
return of arch_scale_cpu_capacity but this doesn't give us real
benefit




> >

> >> +

> >> +void update_periodic_maxcap(struct rq *rq)

> >> +{

> >> +       struct max_capacity_info *__max_cap = (&per_cpu(max_cap, cpu_of(rq)));

> >> +       unsigned long delta;

> >> +

> >> +       if (!__max_cap)

> >> +               return;

> >> +

> >> +       delta = __max_cap->max_capacity - __max_cap->cap_capacity;

> >

> > Why don't you just save the delta in the per_cpu variable instead of

> > the struct max_capacity_info ? You have to compute the delta every

> > tick whereas we can expect it to not change that much compared to the

> > number of tick.

>

> Again I think thermal pressure can be applied from other entities like

> firmware as well. But  I agree with your point that calculating delta

> every tick is not a good idea. How about I move it to

> update_maxcap_capacity so that delta is calculate every time an update

> comes from cpu cooling device or anybody else ?

>

> Warm Regards

> Thara

>
Thara Gopinath Oct. 17, 2019, 4:40 p.m. | #4
On 10/17/2019 04:44 AM, Vincent Guittot wrote:
> Hi Thara,

> 

> On Wed, 16 Oct 2019 at 23:22, Thara Gopinath <thara.gopinath@linaro.org> wrote:

>>

>> Hi Vincent,

>>

>> Thanks for the review

>> On 10/14/2019 11:50 AM, Vincent Guittot wrote:

>>> Hi Thara,

>>>

>>> On Mon, 14 Oct 2019 at 02:58, Thara Gopinath <thara.gopinath@linaro.org> wrote:

>>>>

>>>> Add thermal.c and thermal.h files that provides interface

>>>> APIs to initialize, update/average, track, accumulate and decay

>>>> thermal pressure per cpu basis. A per cpu structure max_capacity_info is

>>>> introduced to keep track of instantaneous per cpu thermal pressure.

>>>> Thermal pressure is the delta between max_capacity and cap_capacity.

>>>> API update_periodic_maxcap is called for periodic accumulate and decay

>>>> of the thermal pressure. It is to to be called from a periodic tick

>>>> function. This API calculates the delta between max_capacity and

>>>> cap_capacity and passes on the delta to update_thermal_avg to do the

>>>> necessary accumulate, decay and average. API update_maxcap_capacity is for

>>>> the system to update the thermal pressure by updating cap_capacity.

>>>> Considering, update_periodic_maxcap reads cap_capacity and

>>>> update_maxcap_capacity writes into cap_capacity, one can argue for

>>>> some sort of locking mechanism to avoid a stale value.

>>>> But considering update_periodic_maxcap can be called from a system

>>>> critical path like scheduler tick function, a locking mechanism is not

>>>> ideal. This means that it is possible the value used to

>>>> calculate average thermal pressure for a cpu can be stale for upto 1

>>>> tick period.

>>>>

>>>> Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>

>>>> ---

>>>>  include/linux/sched.h  | 14 +++++++++++

>>>>  kernel/sched/Makefile  |  2 +-

>>>>  kernel/sched/thermal.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++

>>>>  kernel/sched/thermal.h | 13 ++++++++++

>>>>  4 files changed, 94 insertions(+), 1 deletion(-)

>>>>  create mode 100644 kernel/sched/thermal.c

>>>>  create mode 100644 kernel/sched/thermal.h

>>>>

>>>> diff --git a/include/linux/sched.h b/include/linux/sched.h

>>>> index 2c2e56b..875ce2b 100644

>>>> --- a/include/linux/sched.h

>>>> +++ b/include/linux/sched.h

>>>> @@ -1983,6 +1983,20 @@ static inline void rseq_syscall(struct pt_regs *regs)

>>>>

>>>>  #endif

>>>>

>>>> +#ifdef CONFIG_SMP

>>>> +void update_maxcap_capacity(int cpu, u64 capacity);

>>>> +

>>>> +void populate_max_capacity_info(void);

>>>> +#else

>>>> +static inline void update_maxcap_capacity(int cpu, u64 capacity)

>>>> +{

>>>> +}

>>>> +

>>>> +static inline void populate_max_capacity_info(void)

>>>> +{

>>>> +}

>>>> +#endif

>>>> +

>>>>  const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);

>>>>  char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);

>>>>  int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);

>>>> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile

>>>> index 21fb5a5..4d3b820 100644

>>>> --- a/kernel/sched/Makefile

>>>> +++ b/kernel/sched/Makefile

>>>> @@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o

>>>>  obj-y += idle.o fair.o rt.o deadline.o

>>>>  obj-y += wait.o wait_bit.o swait.o completion.o

>>>>

>>>> -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o

>>>> +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o thermal.o

>>>>  obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o

>>>>  obj-$(CONFIG_SCHEDSTATS) += stats.o

>>>>  obj-$(CONFIG_SCHED_DEBUG) += debug.o

>>>> diff --git a/kernel/sched/thermal.c b/kernel/sched/thermal.c

>>>> new file mode 100644

>>>> index 0000000..5f0b2d4

>>>> --- /dev/null

>>>> +++ b/kernel/sched/thermal.c

>>>> @@ -0,0 +1,66 @@

>>>> +// SPDX-License-Identifier: GPL-2.0

>>>> +/*

>>>> + * Sceduler Thermal Interactions

>>>> + *

>>>> + *  Copyright (C) 2018 Linaro, Inc., Thara Gopinath <thara.gopinath@linaro.org>

>>>> + */

>>>> +

>>>> +#include <linux/sched.h>

>>>> +#include "sched.h"

>>>> +#include "pelt.h"

>>>> +#include "thermal.h"

>>>> +

>>>> +struct max_capacity_info {

>>>> +       unsigned long max_capacity;

>>>> +       unsigned long cap_capacity;

>>>> +};

>>>> +

>>>> +static DEFINE_PER_CPU(struct max_capacity_info, max_cap);

>>>> +

>>>> +void update_maxcap_capacity(int cpu, u64 capacity)

>>>> +{

>>>> +       struct max_capacity_info *__max_cap;

>>>> +       unsigned long __capacity;

>>>> +

>>>> +       __max_cap = (&per_cpu(max_cap, cpu));

>>>> +       if (!__max_cap) {

>>>> +               pr_err("no max_capacity_info structure for cpu %d\n", cpu);

>>>> +               return;

>>>> +       }

>>>> +

>>>> +       /* Normalize the capacity */

>>>> +       __capacity = (capacity * arch_scale_cpu_capacity(cpu)) >>

>>>> +                                                       SCHED_CAPACITY_SHIFT;

>>>> +       pr_debug("updating cpu%d capped capacity from %lu to %lu\n", cpu, __max_cap->cap_capacity, __capacity);

>>>> +

>>>> +       __max_cap->cap_capacity = __capacity;

>>>> +}

>>>> +

>>>> +void populate_max_capacity_info(void)

>>>> +{

>>>> +       struct max_capacity_info *__max_cap;

>>>> +       u64 capacity;

>>>> +       int cpu;

>>>> +

>>>> +       for_each_possible_cpu(cpu) {

>>>> +               __max_cap = (&per_cpu(max_cap, cpu));

>>>> +               if (!__max_cap)

>>>> +                       continue;

>>>> +               capacity = arch_scale_cpu_capacity(cpu);

>>>> +               __max_cap->max_capacity = capacity;

>>>> +               __max_cap->cap_capacity = capacity;

>>>> +               pr_debug("cpu %d max capacity set to %ld\n", cpu, __max_cap->max_capacity);

>>>> +       }

>>>> +}

>>>

>>> everything above seems to be there for the cpu cooling device and

>>> should be included in it instead. The scheduler only need the capacity

>>> capping

>>> The cpu cooling device should just set the delta capacity in the

>>> per-cpu variable (see my comment below)

>> It can be a firmware  updating the thermal pressure instead of cpu

>> cooling device. Or may be some other entity .So instead of replicating

>> this code, isnt't it better to reside in one place ?

> 

> But you have now idea which kind of metrics will be provided by

> firmware. It might not be a capped frequency and a max frequency but

> other metrics.

hmm. Why ? It is thermal pressure. It is finally applied to the
scheduler metric cpu_capacity. It should not be anything other than
capacity reduction due to thermal events that should be provided.
> Also, __max_cap->max_capacity just save in a new per cpu struct the

> return of arch_scale_cpu_capacity but this doesn't give us real

> benefit

I agree. I will change the struct into a single variable delta and
initialize it to 0. populate_max_capacity_info need not be there. I will
have update_maxcap_capacity calculate the delta with
arch_scale_cpu_capacity and store it into a per cpu variable.


-- 
Warm Regards
Thara
Vincent Guittot Oct. 18, 2019, 8:05 a.m. | #5
Hi Thara,

On Thu, 17 Oct 2019 at 18:40, Thara Gopinath <thara.gopinath@linaro.org> wrote:
>

> On 10/17/2019 04:44 AM, Vincent Guittot wrote:

> > Hi Thara,

> >

> > On Wed, 16 Oct 2019 at 23:22, Thara Gopinath <thara.gopinath@linaro.org> wrote:

> >>

> >> Hi Vincent,

> >>

> >> Thanks for the review

> >> On 10/14/2019 11:50 AM, Vincent Guittot wrote:

> >>> Hi Thara,

> >>>

> >>> On Mon, 14 Oct 2019 at 02:58, Thara Gopinath <thara.gopinath@linaro.org> wrote:

> >>>>

> >>>> Add thermal.c and thermal.h files that provides interface

> >>>> APIs to initialize, update/average, track, accumulate and decay

> >>>> thermal pressure per cpu basis. A per cpu structure max_capacity_info is

> >>>> introduced to keep track of instantaneous per cpu thermal pressure.

> >>>> Thermal pressure is the delta between max_capacity and cap_capacity.

> >>>> API update_periodic_maxcap is called for periodic accumulate and decay

> >>>> of the thermal pressure. It is to to be called from a periodic tick

> >>>> function. This API calculates the delta between max_capacity and

> >>>> cap_capacity and passes on the delta to update_thermal_avg to do the

> >>>> necessary accumulate, decay and average. API update_maxcap_capacity is for

> >>>> the system to update the thermal pressure by updating cap_capacity.

> >>>> Considering, update_periodic_maxcap reads cap_capacity and

> >>>> update_maxcap_capacity writes into cap_capacity, one can argue for

> >>>> some sort of locking mechanism to avoid a stale value.

> >>>> But considering update_periodic_maxcap can be called from a system

> >>>> critical path like scheduler tick function, a locking mechanism is not

> >>>> ideal. This means that it is possible the value used to

> >>>> calculate average thermal pressure for a cpu can be stale for upto 1

> >>>> tick period.

> >>>>

> >>>> Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>

> >>>> ---

> >>>>  include/linux/sched.h  | 14 +++++++++++

> >>>>  kernel/sched/Makefile  |  2 +-

> >>>>  kernel/sched/thermal.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++

> >>>>  kernel/sched/thermal.h | 13 ++++++++++

> >>>>  4 files changed, 94 insertions(+), 1 deletion(-)

> >>>>  create mode 100644 kernel/sched/thermal.c

> >>>>  create mode 100644 kernel/sched/thermal.h

> >>>>

> >>>> diff --git a/include/linux/sched.h b/include/linux/sched.h

> >>>> index 2c2e56b..875ce2b 100644

> >>>> --- a/include/linux/sched.h

> >>>> +++ b/include/linux/sched.h

> >>>> @@ -1983,6 +1983,20 @@ static inline void rseq_syscall(struct pt_regs *regs)

> >>>>

> >>>>  #endif

> >>>>

> >>>> +#ifdef CONFIG_SMP

> >>>> +void update_maxcap_capacity(int cpu, u64 capacity);

> >>>> +

> >>>> +void populate_max_capacity_info(void);

> >>>> +#else

> >>>> +static inline void update_maxcap_capacity(int cpu, u64 capacity)

> >>>> +{

> >>>> +}

> >>>> +

> >>>> +static inline void populate_max_capacity_info(void)

> >>>> +{

> >>>> +}

> >>>> +#endif

> >>>> +

> >>>>  const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);

> >>>>  char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);

> >>>>  int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);

> >>>> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile

> >>>> index 21fb5a5..4d3b820 100644

> >>>> --- a/kernel/sched/Makefile

> >>>> +++ b/kernel/sched/Makefile

> >>>> @@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o

> >>>>  obj-y += idle.o fair.o rt.o deadline.o

> >>>>  obj-y += wait.o wait_bit.o swait.o completion.o

> >>>>

> >>>> -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o

> >>>> +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o thermal.o

> >>>>  obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o

> >>>>  obj-$(CONFIG_SCHEDSTATS) += stats.o

> >>>>  obj-$(CONFIG_SCHED_DEBUG) += debug.o

> >>>> diff --git a/kernel/sched/thermal.c b/kernel/sched/thermal.c

> >>>> new file mode 100644

> >>>> index 0000000..5f0b2d4

> >>>> --- /dev/null

> >>>> +++ b/kernel/sched/thermal.c

> >>>> @@ -0,0 +1,66 @@

> >>>> +// SPDX-License-Identifier: GPL-2.0

> >>>> +/*

> >>>> + * Sceduler Thermal Interactions

> >>>> + *

> >>>> + *  Copyright (C) 2018 Linaro, Inc., Thara Gopinath <thara.gopinath@linaro.org>

> >>>> + */

> >>>> +

> >>>> +#include <linux/sched.h>

> >>>> +#include "sched.h"

> >>>> +#include "pelt.h"

> >>>> +#include "thermal.h"

> >>>> +

> >>>> +struct max_capacity_info {

> >>>> +       unsigned long max_capacity;

> >>>> +       unsigned long cap_capacity;

> >>>> +};

> >>>> +

> >>>> +static DEFINE_PER_CPU(struct max_capacity_info, max_cap);

> >>>> +

> >>>> +void update_maxcap_capacity(int cpu, u64 capacity)

> >>>> +{

> >>>> +       struct max_capacity_info *__max_cap;

> >>>> +       unsigned long __capacity;

> >>>> +

> >>>> +       __max_cap = (&per_cpu(max_cap, cpu));

> >>>> +       if (!__max_cap) {

> >>>> +               pr_err("no max_capacity_info structure for cpu %d\n", cpu);

> >>>> +               return;

> >>>> +       }

> >>>> +

> >>>> +       /* Normalize the capacity */

> >>>> +       __capacity = (capacity * arch_scale_cpu_capacity(cpu)) >>

> >>>> +                                                       SCHED_CAPACITY_SHIFT;

> >>>> +       pr_debug("updating cpu%d capped capacity from %lu to %lu\n", cpu, __max_cap->cap_capacity, __capacity);

> >>>> +

> >>>> +       __max_cap->cap_capacity = __capacity;

> >>>> +}

> >>>> +

> >>>> +void populate_max_capacity_info(void)

> >>>> +{

> >>>> +       struct max_capacity_info *__max_cap;

> >>>> +       u64 capacity;

> >>>> +       int cpu;

> >>>> +

> >>>> +       for_each_possible_cpu(cpu) {

> >>>> +               __max_cap = (&per_cpu(max_cap, cpu));

> >>>> +               if (!__max_cap)

> >>>> +                       continue;

> >>>> +               capacity = arch_scale_cpu_capacity(cpu);

> >>>> +               __max_cap->max_capacity = capacity;

> >>>> +               __max_cap->cap_capacity = capacity;

> >>>> +               pr_debug("cpu %d max capacity set to %ld\n", cpu, __max_cap->max_capacity);

> >>>> +       }

> >>>> +}

> >>>

> >>> everything above seems to be there for the cpu cooling device and

> >>> should be included in it instead. The scheduler only need the capacity

> >>> capping

> >>> The cpu cooling device should just set the delta capacity in the

> >>> per-cpu variable (see my comment below)

> >> It can be a firmware  updating the thermal pressure instead of cpu

> >> cooling device. Or may be some other entity .So instead of replicating

> >> this code, isnt't it better to reside in one place ?

> >

> > But you have now idea which kind of metrics will be provided by

> > firmware. It might not be a capped frequency and a max frequency but

> > other metrics.

> hmm. Why ? It is thermal pressure. It is finally applied to the

> scheduler metric cpu_capacity. It should not be anything other than

> capacity reduction due to thermal events that should be provided.


What I mean is that you don't know how the value will be provided and
if you will need a cap and a max value. It might easily provide
directly the delta,  or a percent or even something else


> > Also, __max_cap->max_capacity just save in a new per cpu struct the

> > return of arch_scale_cpu_capacity but this doesn't give us real

> > benefit

> I agree. I will change the struct into a single variable delta and

> initialize it to 0. populate_max_capacity_info need not be there. I will

> have update_maxcap_capacity calculate the delta with

> arch_scale_cpu_capacity and store it into a per cpu variable.


Also could you rename this function. You use thermal for the pelt
function so could you align the function names ? either to use thermal
everywhere or another name but could you not mix the naming
If you want to use thermal why not using update_thermal_pressure ? at
least it's easy to understand what it does

>



>

> --

> Warm Regards

> Thara

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c2e56b..875ce2b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1983,6 +1983,20 @@  static inline void rseq_syscall(struct pt_regs *regs)
 
 #endif
 
+#ifdef CONFIG_SMP
+void update_maxcap_capacity(int cpu, u64 capacity);
+
+void populate_max_capacity_info(void);
+#else
+static inline void update_maxcap_capacity(int cpu, u64 capacity)
+{
+}
+
+static inline void populate_max_capacity_info(void)
+{
+}
+#endif
+
 const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
 char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
 int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 21fb5a5..4d3b820 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -20,7 +20,7 @@  obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle.o fair.o rt.o deadline.o
 obj-y += wait.o wait_bit.o swait.o completion.o
 
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o thermal.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/thermal.c b/kernel/sched/thermal.c
new file mode 100644
index 0000000..5f0b2d4
--- /dev/null
+++ b/kernel/sched/thermal.c
@@ -0,0 +1,66 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Sceduler Thermal Interactions
+ *
+ *  Copyright (C) 2018 Linaro, Inc., Thara Gopinath <thara.gopinath@linaro.org>
+ */
+
+#include <linux/sched.h>
+#include "sched.h"
+#include "pelt.h"
+#include "thermal.h"
+
+struct max_capacity_info {
+	unsigned long max_capacity;
+	unsigned long cap_capacity;
+};
+
+static DEFINE_PER_CPU(struct max_capacity_info, max_cap);
+
+void update_maxcap_capacity(int cpu, u64 capacity)
+{
+	struct max_capacity_info *__max_cap;
+	unsigned long __capacity;
+
+	__max_cap = (&per_cpu(max_cap, cpu));
+	if (!__max_cap) {
+		pr_err("no max_capacity_info structure for cpu %d\n", cpu);
+		return;
+	}
+
+	/* Normalize the capacity */
+	__capacity = (capacity * arch_scale_cpu_capacity(cpu)) >>
+							SCHED_CAPACITY_SHIFT;
+	pr_debug("updating cpu%d capped capacity from %lu to %lu\n", cpu, __max_cap->cap_capacity, __capacity);
+
+	__max_cap->cap_capacity = __capacity;
+}
+
+void populate_max_capacity_info(void)
+{
+	struct max_capacity_info *__max_cap;
+	u64 capacity;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		__max_cap = (&per_cpu(max_cap, cpu));
+		if (!__max_cap)
+			continue;
+		capacity = arch_scale_cpu_capacity(cpu);
+		__max_cap->max_capacity = capacity;
+		__max_cap->cap_capacity = capacity;
+		pr_debug("cpu %d max capacity set to %ld\n", cpu, __max_cap->max_capacity);
+	}
+}
+
+void update_periodic_maxcap(struct rq *rq)
+{
+	struct max_capacity_info *__max_cap = (&per_cpu(max_cap, cpu_of(rq)));
+	unsigned long delta;
+
+	if (!__max_cap)
+		return;
+
+	delta = __max_cap->max_capacity - __max_cap->cap_capacity;
+	update_thermal_avg(rq_clock_task(rq), rq, delta);
+}
diff --git a/kernel/sched/thermal.h b/kernel/sched/thermal.h
new file mode 100644
index 0000000..5657cb4
--- /dev/null
+++ b/kernel/sched/thermal.h
@@ -0,0 +1,13 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Scheduler thermal interaction internal methods.
+ */
+
+#ifdef CONFIG_SMP
+void update_periodic_maxcap(struct rq *rq);
+
+#else
+static inline void update_periodic_maxcap(struct rq *rq)
+{
+}
+#endif