diff mbox series

[v4,2/5] sched: Take cpufreq feedback into account

Message ID 20240109164655.626085-3-vincent.guittot@linaro.org
State New
Headers show
Series Rework system pressure interface to the scheduler | expand

Commit Message

Vincent Guittot Jan. 9, 2024, 4:46 p.m. UTC
Aggregate the different pressures applied on the capacity of CPUs and
create a new function that returns the actual capacity of the CPU:
  get_actual_cpu_capacity()

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
---
 kernel/sched/fair.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

Comments

Qais Yousef Jan. 30, 2024, 12:26 a.m. UTC | #1
On 01/09/24 17:46, Vincent Guittot wrote:
> Aggregate the different pressures applied on the capacity of CPUs and
> create a new function that returns the actual capacity of the CPU:
>   get_actual_cpu_capacity()
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
> ---
>  kernel/sched/fair.c | 45 +++++++++++++++++++++++++--------------------
>  1 file changed, 25 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9cc20855dc2b..e54bbf8b4936 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4910,13 +4910,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
>  	trace_sched_util_est_se_tp(&p->se);
>  }
>  
> +static inline unsigned long get_actual_cpu_capacity(int cpu)
> +{
> +	unsigned long capacity = arch_scale_cpu_capacity(cpu);
> +
> +	capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));

Does cpufreq_get_pressure() reflect thermally throttled frequency, or just the
policy->max being capped by user etc? I didn't see an update to cpufreq when we
topology_update_hw_pressure(). Not sure if it'll go through another path.

maxing with thermal_load_avg() will change the behavior below where we used to
compare against instantaneous pressure. The concern was that it not just can
appear quickly, but disappear quickly too. thermal_load_avg() will decay
slowly, no?  This means we'll lose a lot of opportunities for better task
placement until this decays which can take relatively long time.

So maxing handles the direction where a pressure suddenly appears. But it
doesn't handle where it disappears.

I suspect your thoughts are that if it was transient then thermal_load_avg()
should be small anyway - which I think makes sense.

I think we need a comment to explain these nuance differences.

> +
> +	return capacity;
> +}
> +
>  static inline int util_fits_cpu(unsigned long util,
>  				unsigned long uclamp_min,
>  				unsigned long uclamp_max,
>  				int cpu)
>  {
> -	unsigned long capacity_orig, capacity_orig_thermal;
>  	unsigned long capacity = capacity_of(cpu);
> +	unsigned long capacity_orig;
>  	bool fits, uclamp_max_fits;
>  
>  	/*
> @@ -4948,7 +4957,6 @@ static inline int util_fits_cpu(unsigned long util,
>  	 * goal is to cap the task. So it's okay if it's getting less.
>  	 */
>  	capacity_orig = arch_scale_cpu_capacity(cpu);
> -	capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
>  
>  	/*
>  	 * We want to force a task to fit a cpu as implied by uclamp_max.
> @@ -5023,7 +5031,8 @@ static inline int util_fits_cpu(unsigned long util,
>  	 * handle the case uclamp_min > uclamp_max.
>  	 */
>  	uclamp_min = min(uclamp_min, uclamp_max);
> -	if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> +	if (fits && (util < uclamp_min) &&
> +	    (uclamp_min > get_actual_cpu_capacity(cpu)))
>  		return -1;
>  
>  	return fits;
> @@ -7404,7 +7413,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  		 * Look for the CPU with best capacity.
>  		 */
>  		else if (fits < 0)
> -			cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
> +			cpu_cap = get_actual_cpu_capacity(cpu);
>  
>  		/*
>  		 * First, select CPU which fits better (-1 being better than 0).
> @@ -7897,8 +7906,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  	struct root_domain *rd = this_rq()->rd;
>  	int cpu, best_energy_cpu, target = -1;
>  	int prev_fits = -1, best_fits = -1;
> -	unsigned long best_thermal_cap = 0;
> -	unsigned long prev_thermal_cap = 0;
> +	unsigned long best_actual_cap = 0;
> +	unsigned long prev_actual_cap = 0;
>  	struct sched_domain *sd;
>  	struct perf_domain *pd;
>  	struct energy_env eenv;
> @@ -7928,7 +7937,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  
>  	for (; pd; pd = pd->next) {
>  		unsigned long util_min = p_util_min, util_max = p_util_max;
> -		unsigned long cpu_cap, cpu_thermal_cap, util;
> +		unsigned long cpu_cap, cpu_actual_cap, util;
>  		long prev_spare_cap = -1, max_spare_cap = -1;
>  		unsigned long rq_util_min, rq_util_max;
>  		unsigned long cur_delta, base_energy;
> @@ -7940,18 +7949,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  		if (cpumask_empty(cpus))
>  			continue;
>  
> -		/* Account thermal pressure for the energy estimation */
> +		/* Account external pressure for the energy estimation */
>  		cpu = cpumask_first(cpus);
> -		cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
> -		cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
> +		cpu_actual_cap = get_actual_cpu_capacity(cpu);
>  
> -		eenv.cpu_cap = cpu_thermal_cap;
> +		eenv.cpu_cap = cpu_actual_cap;
>  		eenv.pd_cap = 0;
>  
>  		for_each_cpu(cpu, cpus) {
>  			struct rq *rq = cpu_rq(cpu);
>  
> -			eenv.pd_cap += cpu_thermal_cap;
> +			eenv.pd_cap += cpu_actual_cap;
>  
>  			if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
>  				continue;
> @@ -8022,7 +8030,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  			if (prev_delta < base_energy)
>  				goto unlock;
>  			prev_delta -= base_energy;
> -			prev_thermal_cap = cpu_thermal_cap;
> +			prev_actual_cap = cpu_actual_cap;
>  			best_delta = min(best_delta, prev_delta);
>  		}
>  
> @@ -8037,7 +8045,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  			 * but best energy cpu has better capacity.
>  			 */
>  			if ((max_fits < 0) &&
> -			    (cpu_thermal_cap <= best_thermal_cap))
> +			    (cpu_actual_cap <= best_actual_cap))
>  				continue;
>  
>  			cur_delta = compute_energy(&eenv, pd, cpus, p,
> @@ -8058,14 +8066,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  			best_delta = cur_delta;
>  			best_energy_cpu = max_spare_cap_cpu;
>  			best_fits = max_fits;
> -			best_thermal_cap = cpu_thermal_cap;
> +			best_actual_cap = cpu_actual_cap;
>  		}
>  	}
>  	rcu_read_unlock();
>  
>  	if ((best_fits > prev_fits) ||
>  	    ((best_fits > 0) && (best_delta < prev_delta)) ||
> -	    ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> +	    ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
>  		target = best_energy_cpu;
>  
>  	return target;
> @@ -9441,8 +9449,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
>  
>  static unsigned long scale_rt_capacity(int cpu)
>  {
> +	unsigned long max = get_actual_cpu_capacity(cpu);
>  	struct rq *rq = cpu_rq(cpu);
> -	unsigned long max = arch_scale_cpu_capacity(cpu);
>  	unsigned long used, free;
>  	unsigned long irq;
>  
> @@ -9454,12 +9462,9 @@ static unsigned long scale_rt_capacity(int cpu)
>  	/*
>  	 * avg_rt.util_avg and avg_dl.util_avg track binary signals
>  	 * (running and not running) with weights 0 and 1024 respectively.
> -	 * avg_thermal.load_avg tracks thermal pressure and the weighted
> -	 * average uses the actual delta max capacity(load).
>  	 */
>  	used = READ_ONCE(rq->avg_rt.util_avg);
>  	used += READ_ONCE(rq->avg_dl.util_avg);
> -	used += thermal_load_avg(rq);
>  
>  	if (unlikely(used >= max))
>  		return 1;
> -- 
> 2.34.1
>
Qais Yousef Jan. 30, 2024, 12:50 a.m. UTC | #2
On 01/30/24 00:26, Qais Yousef wrote:
> On 01/09/24 17:46, Vincent Guittot wrote:
> > Aggregate the different pressures applied on the capacity of CPUs and
> > create a new function that returns the actual capacity of the CPU:
> >   get_actual_cpu_capacity()
> > 
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
> > ---
> >  kernel/sched/fair.c | 45 +++++++++++++++++++++++++--------------------
> >  1 file changed, 25 insertions(+), 20 deletions(-)
> > 
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 9cc20855dc2b..e54bbf8b4936 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4910,13 +4910,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> >  	trace_sched_util_est_se_tp(&p->se);
> >  }
> >  
> > +static inline unsigned long get_actual_cpu_capacity(int cpu)
> > +{
> > +	unsigned long capacity = arch_scale_cpu_capacity(cpu);
> > +
> > +	capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
> 
> Does cpufreq_get_pressure() reflect thermally throttled frequency, or just the
> policy->max being capped by user etc? I didn't see an update to cpufreq when we
> topology_update_hw_pressure(). Not sure if it'll go through another path.

It is done via the cooling device. And assume any limitations on freq due to
power etc are assumed to always to cause the policy->max to change.

(sorry if I missed earlier discussions about this)

> 
> maxing with thermal_load_avg() will change the behavior below where we used to
> compare against instantaneous pressure. The concern was that it not just can
> appear quickly, but disappear quickly too. thermal_load_avg() will decay
> slowly, no?  This means we'll lose a lot of opportunities for better task
> placement until this decays which can take relatively long time.
> 
> So maxing handles the direction where a pressure suddenly appears. But it
> doesn't handle where it disappears.
> 
> I suspect your thoughts are that if it was transient then thermal_load_avg()
> should be small anyway - which I think makes sense.
> 
> I think we need a comment to explain these nuance differences.
> 
> > +
> > +	return capacity;
> > +}
> > +
> >  static inline int util_fits_cpu(unsigned long util,
> >  				unsigned long uclamp_min,
> >  				unsigned long uclamp_max,
> >  				int cpu)
> >  {
> > -	unsigned long capacity_orig, capacity_orig_thermal;
> >  	unsigned long capacity = capacity_of(cpu);
> > +	unsigned long capacity_orig;
> >  	bool fits, uclamp_max_fits;
> >  
> >  	/*
> > @@ -4948,7 +4957,6 @@ static inline int util_fits_cpu(unsigned long util,
> >  	 * goal is to cap the task. So it's okay if it's getting less.
> >  	 */
> >  	capacity_orig = arch_scale_cpu_capacity(cpu);
> > -	capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> >  
> >  	/*
> >  	 * We want to force a task to fit a cpu as implied by uclamp_max.
> > @@ -5023,7 +5031,8 @@ static inline int util_fits_cpu(unsigned long util,
> >  	 * handle the case uclamp_min > uclamp_max.
> >  	 */
> >  	uclamp_min = min(uclamp_min, uclamp_max);
> > -	if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > +	if (fits && (util < uclamp_min) &&
> > +	    (uclamp_min > get_actual_cpu_capacity(cpu)))
> >  		return -1;
> >  
> >  	return fits;
> > @@ -7404,7 +7413,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> >  		 * Look for the CPU with best capacity.
> >  		 */
> >  		else if (fits < 0)
> > -			cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
> > +			cpu_cap = get_actual_cpu_capacity(cpu);
> >  
> >  		/*
> >  		 * First, select CPU which fits better (-1 being better than 0).
> > @@ -7897,8 +7906,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >  	struct root_domain *rd = this_rq()->rd;
> >  	int cpu, best_energy_cpu, target = -1;
> >  	int prev_fits = -1, best_fits = -1;
> > -	unsigned long best_thermal_cap = 0;
> > -	unsigned long prev_thermal_cap = 0;
> > +	unsigned long best_actual_cap = 0;
> > +	unsigned long prev_actual_cap = 0;
> >  	struct sched_domain *sd;
> >  	struct perf_domain *pd;
> >  	struct energy_env eenv;
> > @@ -7928,7 +7937,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >  
> >  	for (; pd; pd = pd->next) {
> >  		unsigned long util_min = p_util_min, util_max = p_util_max;
> > -		unsigned long cpu_cap, cpu_thermal_cap, util;
> > +		unsigned long cpu_cap, cpu_actual_cap, util;
> >  		long prev_spare_cap = -1, max_spare_cap = -1;
> >  		unsigned long rq_util_min, rq_util_max;
> >  		unsigned long cur_delta, base_energy;
> > @@ -7940,18 +7949,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >  		if (cpumask_empty(cpus))
> >  			continue;
> >  
> > -		/* Account thermal pressure for the energy estimation */
> > +		/* Account external pressure for the energy estimation */
> >  		cpu = cpumask_first(cpus);
> > -		cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
> > -		cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
> > +		cpu_actual_cap = get_actual_cpu_capacity(cpu);
> >  
> > -		eenv.cpu_cap = cpu_thermal_cap;
> > +		eenv.cpu_cap = cpu_actual_cap;
> >  		eenv.pd_cap = 0;
> >  
> >  		for_each_cpu(cpu, cpus) {
> >  			struct rq *rq = cpu_rq(cpu);
> >  
> > -			eenv.pd_cap += cpu_thermal_cap;
> > +			eenv.pd_cap += cpu_actual_cap;
> >  
> >  			if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> >  				continue;
> > @@ -8022,7 +8030,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >  			if (prev_delta < base_energy)
> >  				goto unlock;
> >  			prev_delta -= base_energy;
> > -			prev_thermal_cap = cpu_thermal_cap;
> > +			prev_actual_cap = cpu_actual_cap;
> >  			best_delta = min(best_delta, prev_delta);
> >  		}
> >  
> > @@ -8037,7 +8045,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >  			 * but best energy cpu has better capacity.
> >  			 */
> >  			if ((max_fits < 0) &&
> > -			    (cpu_thermal_cap <= best_thermal_cap))
> > +			    (cpu_actual_cap <= best_actual_cap))
> >  				continue;
> >  
> >  			cur_delta = compute_energy(&eenv, pd, cpus, p,
> > @@ -8058,14 +8066,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >  			best_delta = cur_delta;
> >  			best_energy_cpu = max_spare_cap_cpu;
> >  			best_fits = max_fits;
> > -			best_thermal_cap = cpu_thermal_cap;
> > +			best_actual_cap = cpu_actual_cap;
> >  		}
> >  	}
> >  	rcu_read_unlock();
> >  
> >  	if ((best_fits > prev_fits) ||
> >  	    ((best_fits > 0) && (best_delta < prev_delta)) ||
> > -	    ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> > +	    ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
> >  		target = best_energy_cpu;
> >  
> >  	return target;
> > @@ -9441,8 +9449,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
> >  
> >  static unsigned long scale_rt_capacity(int cpu)
> >  {
> > +	unsigned long max = get_actual_cpu_capacity(cpu);
> >  	struct rq *rq = cpu_rq(cpu);
> > -	unsigned long max = arch_scale_cpu_capacity(cpu);
> >  	unsigned long used, free;
> >  	unsigned long irq;
> >  
> > @@ -9454,12 +9462,9 @@ static unsigned long scale_rt_capacity(int cpu)
> >  	/*
> >  	 * avg_rt.util_avg and avg_dl.util_avg track binary signals
> >  	 * (running and not running) with weights 0 and 1024 respectively.
> > -	 * avg_thermal.load_avg tracks thermal pressure and the weighted
> > -	 * average uses the actual delta max capacity(load).
> >  	 */
> >  	used = READ_ONCE(rq->avg_rt.util_avg);
> >  	used += READ_ONCE(rq->avg_dl.util_avg);
> > -	used += thermal_load_avg(rq);
> >  
> >  	if (unlikely(used >= max))
> >  		return 1;
> > -- 
> > 2.34.1
> >
Vincent Guittot Jan. 30, 2024, 9:35 a.m. UTC | #3
On Tue, 30 Jan 2024 at 01:50, Qais Yousef <qyousef@layalina.io> wrote:
>
> On 01/30/24 00:26, Qais Yousef wrote:
> > On 01/09/24 17:46, Vincent Guittot wrote:
> > > Aggregate the different pressures applied on the capacity of CPUs and
> > > create a new function that returns the actual capacity of the CPU:
> > >   get_actual_cpu_capacity()
> > >
> > > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > > Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
> > > ---
> > >  kernel/sched/fair.c | 45 +++++++++++++++++++++++++--------------------
> > >  1 file changed, 25 insertions(+), 20 deletions(-)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 9cc20855dc2b..e54bbf8b4936 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4910,13 +4910,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > >     trace_sched_util_est_se_tp(&p->se);
> > >  }
> > >
> > > +static inline unsigned long get_actual_cpu_capacity(int cpu)
> > > +{
> > > +   unsigned long capacity = arch_scale_cpu_capacity(cpu);
> > > +
> > > +   capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
> >
> > Does cpufreq_get_pressure() reflect thermally throttled frequency, or just the
> > policy->max being capped by user etc? I didn't see an update to cpufreq when we
> > topology_update_hw_pressure(). Not sure if it'll go through another path.
>
> It is done via the cooling device. And assume any limitations on freq due to
> power etc are assumed to always to cause the policy->max to change.
>
> (sorry if I missed earlier discussions about this)

I assume that you have answered all your questions.

We have now 2 distinct signals:
- hw high freq update which is averaged with PELT and go through
topology_update_hw_pressure
- cpufreq pressure which is not averaged (including cpufreq cooling
device with patch 3)

>
> >
> > maxing with thermal_load_avg() will change the behavior below where we used to
> > compare against instantaneous pressure. The concern was that it not just can
> > appear quickly, but disappear quickly too. thermal_load_avg() will decay
> > slowly, no?  This means we'll lose a lot of opportunities for better task
> > placement until this decays which can take relatively long time.
> >
> > So maxing handles the direction where a pressure suddenly appears. But it
> > doesn't handle where it disappears.
> >
> > I suspect your thoughts are that if it was transient then thermal_load_avg()
> > should be small anyway - which I think makes sense.
> >
> > I think we need a comment to explain these nuance differences.
> >
> > > +
> > > +   return capacity;
> > > +}
> > > +
> > >  static inline int util_fits_cpu(unsigned long util,
> > >                             unsigned long uclamp_min,
> > >                             unsigned long uclamp_max,
> > >                             int cpu)
> > >  {
> > > -   unsigned long capacity_orig, capacity_orig_thermal;
> > >     unsigned long capacity = capacity_of(cpu);
> > > +   unsigned long capacity_orig;
> > >     bool fits, uclamp_max_fits;
> > >
> > >     /*
> > > @@ -4948,7 +4957,6 @@ static inline int util_fits_cpu(unsigned long util,
> > >      * goal is to cap the task. So it's okay if it's getting less.
> > >      */
> > >     capacity_orig = arch_scale_cpu_capacity(cpu);
> > > -   capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> > >
> > >     /*
> > >      * We want to force a task to fit a cpu as implied by uclamp_max.
> > > @@ -5023,7 +5031,8 @@ static inline int util_fits_cpu(unsigned long util,
> > >      * handle the case uclamp_min > uclamp_max.
> > >      */
> > >     uclamp_min = min(uclamp_min, uclamp_max);
> > > -   if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > > +   if (fits && (util < uclamp_min) &&
> > > +       (uclamp_min > get_actual_cpu_capacity(cpu)))
> > >             return -1;
> > >
> > >     return fits;
> > > @@ -7404,7 +7413,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > >              * Look for the CPU with best capacity.
> > >              */
> > >             else if (fits < 0)
> > > -                   cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
> > > +                   cpu_cap = get_actual_cpu_capacity(cpu);
> > >
> > >             /*
> > >              * First, select CPU which fits better (-1 being better than 0).
> > > @@ -7897,8 +7906,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >     struct root_domain *rd = this_rq()->rd;
> > >     int cpu, best_energy_cpu, target = -1;
> > >     int prev_fits = -1, best_fits = -1;
> > > -   unsigned long best_thermal_cap = 0;
> > > -   unsigned long prev_thermal_cap = 0;
> > > +   unsigned long best_actual_cap = 0;
> > > +   unsigned long prev_actual_cap = 0;
> > >     struct sched_domain *sd;
> > >     struct perf_domain *pd;
> > >     struct energy_env eenv;
> > > @@ -7928,7 +7937,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >
> > >     for (; pd; pd = pd->next) {
> > >             unsigned long util_min = p_util_min, util_max = p_util_max;
> > > -           unsigned long cpu_cap, cpu_thermal_cap, util;
> > > +           unsigned long cpu_cap, cpu_actual_cap, util;
> > >             long prev_spare_cap = -1, max_spare_cap = -1;
> > >             unsigned long rq_util_min, rq_util_max;
> > >             unsigned long cur_delta, base_energy;
> > > @@ -7940,18 +7949,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >             if (cpumask_empty(cpus))
> > >                     continue;
> > >
> > > -           /* Account thermal pressure for the energy estimation */
> > > +           /* Account external pressure for the energy estimation */
> > >             cpu = cpumask_first(cpus);
> > > -           cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
> > > -           cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
> > > +           cpu_actual_cap = get_actual_cpu_capacity(cpu);
> > >
> > > -           eenv.cpu_cap = cpu_thermal_cap;
> > > +           eenv.cpu_cap = cpu_actual_cap;
> > >             eenv.pd_cap = 0;
> > >
> > >             for_each_cpu(cpu, cpus) {
> > >                     struct rq *rq = cpu_rq(cpu);
> > >
> > > -                   eenv.pd_cap += cpu_thermal_cap;
> > > +                   eenv.pd_cap += cpu_actual_cap;
> > >
> > >                     if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> > >                             continue;
> > > @@ -8022,7 +8030,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >                     if (prev_delta < base_energy)
> > >                             goto unlock;
> > >                     prev_delta -= base_energy;
> > > -                   prev_thermal_cap = cpu_thermal_cap;
> > > +                   prev_actual_cap = cpu_actual_cap;
> > >                     best_delta = min(best_delta, prev_delta);
> > >             }
> > >
> > > @@ -8037,7 +8045,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >                      * but best energy cpu has better capacity.
> > >                      */
> > >                     if ((max_fits < 0) &&
> > > -                       (cpu_thermal_cap <= best_thermal_cap))
> > > +                       (cpu_actual_cap <= best_actual_cap))
> > >                             continue;
> > >
> > >                     cur_delta = compute_energy(&eenv, pd, cpus, p,
> > > @@ -8058,14 +8066,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >                     best_delta = cur_delta;
> > >                     best_energy_cpu = max_spare_cap_cpu;
> > >                     best_fits = max_fits;
> > > -                   best_thermal_cap = cpu_thermal_cap;
> > > +                   best_actual_cap = cpu_actual_cap;
> > >             }
> > >     }
> > >     rcu_read_unlock();
> > >
> > >     if ((best_fits > prev_fits) ||
> > >         ((best_fits > 0) && (best_delta < prev_delta)) ||
> > > -       ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> > > +       ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
> > >             target = best_energy_cpu;
> > >
> > >     return target;
> > > @@ -9441,8 +9449,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
> > >
> > >  static unsigned long scale_rt_capacity(int cpu)
> > >  {
> > > +   unsigned long max = get_actual_cpu_capacity(cpu);
> > >     struct rq *rq = cpu_rq(cpu);
> > > -   unsigned long max = arch_scale_cpu_capacity(cpu);
> > >     unsigned long used, free;
> > >     unsigned long irq;
> > >
> > > @@ -9454,12 +9462,9 @@ static unsigned long scale_rt_capacity(int cpu)
> > >     /*
> > >      * avg_rt.util_avg and avg_dl.util_avg track binary signals
> > >      * (running and not running) with weights 0 and 1024 respectively.
> > > -    * avg_thermal.load_avg tracks thermal pressure and the weighted
> > > -    * average uses the actual delta max capacity(load).
> > >      */
> > >     used = READ_ONCE(rq->avg_rt.util_avg);
> > >     used += READ_ONCE(rq->avg_dl.util_avg);
> > > -   used += thermal_load_avg(rq);
> > >
> > >     if (unlikely(used >= max))
> > >             return 1;
> > > --
> > > 2.34.1
> > >
Qais Yousef Jan. 30, 2024, 10:09 p.m. UTC | #4
On 01/30/24 10:35, Vincent Guittot wrote:
> On Tue, 30 Jan 2024 at 01:50, Qais Yousef <qyousef@layalina.io> wrote:
> >
> > On 01/30/24 00:26, Qais Yousef wrote:
> > > On 01/09/24 17:46, Vincent Guittot wrote:
> > > > Aggregate the different pressures applied on the capacity of CPUs and
> > > > create a new function that returns the actual capacity of the CPU:
> > > >   get_actual_cpu_capacity()
> > > >
> > > > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > > > Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
> > > > ---
> > > >  kernel/sched/fair.c | 45 +++++++++++++++++++++++++--------------------
> > > >  1 file changed, 25 insertions(+), 20 deletions(-)
> > > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index 9cc20855dc2b..e54bbf8b4936 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -4910,13 +4910,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > > >     trace_sched_util_est_se_tp(&p->se);
> > > >  }
> > > >
> > > > +static inline unsigned long get_actual_cpu_capacity(int cpu)
> > > > +{
> > > > +   unsigned long capacity = arch_scale_cpu_capacity(cpu);
> > > > +
> > > > +   capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
> > >
> > > Does cpufreq_get_pressure() reflect thermally throttled frequency, or just the
> > > policy->max being capped by user etc? I didn't see an update to cpufreq when we
> > > topology_update_hw_pressure(). Not sure if it'll go through another path.
> >
> > It is done via the cooling device. And assume any limitations on freq due to
> > power etc are assumed to always to cause the policy->max to change.
> >
> > (sorry if I missed earlier discussions about this)
> 
> I assume that you have answered all your questions.
> 
> We have now 2 distinct signals:
> - hw high freq update which is averaged with PELT and go through
> topology_update_hw_pressure
> - cpufreq pressure which is not averaged (including cpufreq cooling
> device with patch 3)

Yes. I think a comment like suggested below is useful to help keeping the code
understandable to new comers. But FWIW

Reviewed-by: Qais Yousef <qyousef@layalina.io>

> 
> >
> > >
> > > maxing with thermal_load_avg() will change the behavior below where we used to
> > > compare against instantaneous pressure. The concern was that it not just can
> > > appear quickly, but disappear quickly too. thermal_load_avg() will decay
> > > slowly, no?  This means we'll lose a lot of opportunities for better task
> > > placement until this decays which can take relatively long time.
> > >
> > > So maxing handles the direction where a pressure suddenly appears. But it
> > > doesn't handle where it disappears.
> > >
> > > I suspect your thoughts are that if it was transient then thermal_load_avg()
> > > should be small anyway - which I think makes sense.
> > >
> > > I think we need a comment to explain these nuance differences.
> > >
> > > > +
> > > > +   return capacity;
> > > > +}
> > > > +
> > > >  static inline int util_fits_cpu(unsigned long util,
> > > >                             unsigned long uclamp_min,
> > > >                             unsigned long uclamp_max,
> > > >                             int cpu)
> > > >  {
> > > > -   unsigned long capacity_orig, capacity_orig_thermal;
> > > >     unsigned long capacity = capacity_of(cpu);
> > > > +   unsigned long capacity_orig;
> > > >     bool fits, uclamp_max_fits;
> > > >
> > > >     /*
> > > > @@ -4948,7 +4957,6 @@ static inline int util_fits_cpu(unsigned long util,
> > > >      * goal is to cap the task. So it's okay if it's getting less.
> > > >      */
> > > >     capacity_orig = arch_scale_cpu_capacity(cpu);
> > > > -   capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> > > >
> > > >     /*
> > > >      * We want to force a task to fit a cpu as implied by uclamp_max.
> > > > @@ -5023,7 +5031,8 @@ static inline int util_fits_cpu(unsigned long util,
> > > >      * handle the case uclamp_min > uclamp_max.
> > > >      */
> > > >     uclamp_min = min(uclamp_min, uclamp_max);
> > > > -   if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > > > +   if (fits && (util < uclamp_min) &&
> > > > +       (uclamp_min > get_actual_cpu_capacity(cpu)))
> > > >             return -1;
> > > >
> > > >     return fits;
> > > > @@ -7404,7 +7413,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > > >              * Look for the CPU with best capacity.
> > > >              */
> > > >             else if (fits < 0)
> > > > -                   cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
> > > > +                   cpu_cap = get_actual_cpu_capacity(cpu);
> > > >
> > > >             /*
> > > >              * First, select CPU which fits better (-1 being better than 0).
> > > > @@ -7897,8 +7906,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >     struct root_domain *rd = this_rq()->rd;
> > > >     int cpu, best_energy_cpu, target = -1;
> > > >     int prev_fits = -1, best_fits = -1;
> > > > -   unsigned long best_thermal_cap = 0;
> > > > -   unsigned long prev_thermal_cap = 0;
> > > > +   unsigned long best_actual_cap = 0;
> > > > +   unsigned long prev_actual_cap = 0;
> > > >     struct sched_domain *sd;
> > > >     struct perf_domain *pd;
> > > >     struct energy_env eenv;
> > > > @@ -7928,7 +7937,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >
> > > >     for (; pd; pd = pd->next) {
> > > >             unsigned long util_min = p_util_min, util_max = p_util_max;
> > > > -           unsigned long cpu_cap, cpu_thermal_cap, util;
> > > > +           unsigned long cpu_cap, cpu_actual_cap, util;
> > > >             long prev_spare_cap = -1, max_spare_cap = -1;
> > > >             unsigned long rq_util_min, rq_util_max;
> > > >             unsigned long cur_delta, base_energy;
> > > > @@ -7940,18 +7949,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >             if (cpumask_empty(cpus))
> > > >                     continue;
> > > >
> > > > -           /* Account thermal pressure for the energy estimation */
> > > > +           /* Account external pressure for the energy estimation */
> > > >             cpu = cpumask_first(cpus);
> > > > -           cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
> > > > -           cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
> > > > +           cpu_actual_cap = get_actual_cpu_capacity(cpu);
> > > >
> > > > -           eenv.cpu_cap = cpu_thermal_cap;
> > > > +           eenv.cpu_cap = cpu_actual_cap;
> > > >             eenv.pd_cap = 0;
> > > >
> > > >             for_each_cpu(cpu, cpus) {
> > > >                     struct rq *rq = cpu_rq(cpu);
> > > >
> > > > -                   eenv.pd_cap += cpu_thermal_cap;
> > > > +                   eenv.pd_cap += cpu_actual_cap;
> > > >
> > > >                     if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> > > >                             continue;
> > > > @@ -8022,7 +8030,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >                     if (prev_delta < base_energy)
> > > >                             goto unlock;
> > > >                     prev_delta -= base_energy;
> > > > -                   prev_thermal_cap = cpu_thermal_cap;
> > > > +                   prev_actual_cap = cpu_actual_cap;
> > > >                     best_delta = min(best_delta, prev_delta);
> > > >             }
> > > >
> > > > @@ -8037,7 +8045,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >                      * but best energy cpu has better capacity.
> > > >                      */
> > > >                     if ((max_fits < 0) &&
> > > > -                       (cpu_thermal_cap <= best_thermal_cap))
> > > > +                       (cpu_actual_cap <= best_actual_cap))
> > > >                             continue;
> > > >
> > > >                     cur_delta = compute_energy(&eenv, pd, cpus, p,
> > > > @@ -8058,14 +8066,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >                     best_delta = cur_delta;
> > > >                     best_energy_cpu = max_spare_cap_cpu;
> > > >                     best_fits = max_fits;
> > > > -                   best_thermal_cap = cpu_thermal_cap;
> > > > +                   best_actual_cap = cpu_actual_cap;
> > > >             }
> > > >     }
> > > >     rcu_read_unlock();
> > > >
> > > >     if ((best_fits > prev_fits) ||
> > > >         ((best_fits > 0) && (best_delta < prev_delta)) ||
> > > > -       ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> > > > +       ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
> > > >             target = best_energy_cpu;
> > > >
> > > >     return target;
> > > > @@ -9441,8 +9449,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
> > > >
> > > >  static unsigned long scale_rt_capacity(int cpu)
> > > >  {
> > > > +   unsigned long max = get_actual_cpu_capacity(cpu);
> > > >     struct rq *rq = cpu_rq(cpu);
> > > > -   unsigned long max = arch_scale_cpu_capacity(cpu);
> > > >     unsigned long used, free;
> > > >     unsigned long irq;
> > > >
> > > > @@ -9454,12 +9462,9 @@ static unsigned long scale_rt_capacity(int cpu)
> > > >     /*
> > > >      * avg_rt.util_avg and avg_dl.util_avg track binary signals
> > > >      * (running and not running) with weights 0 and 1024 respectively.
> > > > -    * avg_thermal.load_avg tracks thermal pressure and the weighted
> > > > -    * average uses the actual delta max capacity(load).
> > > >      */
> > > >     used = READ_ONCE(rq->avg_rt.util_avg);
> > > >     used += READ_ONCE(rq->avg_dl.util_avg);
> > > > -   used += thermal_load_avg(rq);
> > > >
> > > >     if (unlikely(used >= max))
> > > >             return 1;
> > > > --
> > > > 2.34.1
> > > >
diff mbox series

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9cc20855dc2b..e54bbf8b4936 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4910,13 +4910,22 @@  static inline void util_est_update(struct cfs_rq *cfs_rq,
 	trace_sched_util_est_se_tp(&p->se);
 }
 
+static inline unsigned long get_actual_cpu_capacity(int cpu)
+{
+	unsigned long capacity = arch_scale_cpu_capacity(cpu);
+
+	capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+
+	return capacity;
+}
+
 static inline int util_fits_cpu(unsigned long util,
 				unsigned long uclamp_min,
 				unsigned long uclamp_max,
 				int cpu)
 {
-	unsigned long capacity_orig, capacity_orig_thermal;
 	unsigned long capacity = capacity_of(cpu);
+	unsigned long capacity_orig;
 	bool fits, uclamp_max_fits;
 
 	/*
@@ -4948,7 +4957,6 @@  static inline int util_fits_cpu(unsigned long util,
 	 * goal is to cap the task. So it's okay if it's getting less.
 	 */
 	capacity_orig = arch_scale_cpu_capacity(cpu);
-	capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
 
 	/*
 	 * We want to force a task to fit a cpu as implied by uclamp_max.
@@ -5023,7 +5031,8 @@  static inline int util_fits_cpu(unsigned long util,
 	 * handle the case uclamp_min > uclamp_max.
 	 */
 	uclamp_min = min(uclamp_min, uclamp_max);
-	if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+	if (fits && (util < uclamp_min) &&
+	    (uclamp_min > get_actual_cpu_capacity(cpu)))
 		return -1;
 
 	return fits;
@@ -7404,7 +7413,7 @@  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		 * Look for the CPU with best capacity.
 		 */
 		else if (fits < 0)
-			cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
+			cpu_cap = get_actual_cpu_capacity(cpu);
 
 		/*
 		 * First, select CPU which fits better (-1 being better than 0).
@@ -7897,8 +7906,8 @@  static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	struct root_domain *rd = this_rq()->rd;
 	int cpu, best_energy_cpu, target = -1;
 	int prev_fits = -1, best_fits = -1;
-	unsigned long best_thermal_cap = 0;
-	unsigned long prev_thermal_cap = 0;
+	unsigned long best_actual_cap = 0;
+	unsigned long prev_actual_cap = 0;
 	struct sched_domain *sd;
 	struct perf_domain *pd;
 	struct energy_env eenv;
@@ -7928,7 +7937,7 @@  static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 
 	for (; pd; pd = pd->next) {
 		unsigned long util_min = p_util_min, util_max = p_util_max;
-		unsigned long cpu_cap, cpu_thermal_cap, util;
+		unsigned long cpu_cap, cpu_actual_cap, util;
 		long prev_spare_cap = -1, max_spare_cap = -1;
 		unsigned long rq_util_min, rq_util_max;
 		unsigned long cur_delta, base_energy;
@@ -7940,18 +7949,17 @@  static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		if (cpumask_empty(cpus))
 			continue;
 
-		/* Account thermal pressure for the energy estimation */
+		/* Account external pressure for the energy estimation */
 		cpu = cpumask_first(cpus);
-		cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
-		cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+		cpu_actual_cap = get_actual_cpu_capacity(cpu);
 
-		eenv.cpu_cap = cpu_thermal_cap;
+		eenv.cpu_cap = cpu_actual_cap;
 		eenv.pd_cap = 0;
 
 		for_each_cpu(cpu, cpus) {
 			struct rq *rq = cpu_rq(cpu);
 
-			eenv.pd_cap += cpu_thermal_cap;
+			eenv.pd_cap += cpu_actual_cap;
 
 			if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
 				continue;
@@ -8022,7 +8030,7 @@  static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			if (prev_delta < base_energy)
 				goto unlock;
 			prev_delta -= base_energy;
-			prev_thermal_cap = cpu_thermal_cap;
+			prev_actual_cap = cpu_actual_cap;
 			best_delta = min(best_delta, prev_delta);
 		}
 
@@ -8037,7 +8045,7 @@  static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			 * but best energy cpu has better capacity.
 			 */
 			if ((max_fits < 0) &&
-			    (cpu_thermal_cap <= best_thermal_cap))
+			    (cpu_actual_cap <= best_actual_cap))
 				continue;
 
 			cur_delta = compute_energy(&eenv, pd, cpus, p,
@@ -8058,14 +8066,14 @@  static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			best_delta = cur_delta;
 			best_energy_cpu = max_spare_cap_cpu;
 			best_fits = max_fits;
-			best_thermal_cap = cpu_thermal_cap;
+			best_actual_cap = cpu_actual_cap;
 		}
 	}
 	rcu_read_unlock();
 
 	if ((best_fits > prev_fits) ||
 	    ((best_fits > 0) && (best_delta < prev_delta)) ||
-	    ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
+	    ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
 		target = best_energy_cpu;
 
 	return target;
@@ -9441,8 +9449,8 @@  static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 
 static unsigned long scale_rt_capacity(int cpu)
 {
+	unsigned long max = get_actual_cpu_capacity(cpu);
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long max = arch_scale_cpu_capacity(cpu);
 	unsigned long used, free;
 	unsigned long irq;
 
@@ -9454,12 +9462,9 @@  static unsigned long scale_rt_capacity(int cpu)
 	/*
 	 * avg_rt.util_avg and avg_dl.util_avg track binary signals
 	 * (running and not running) with weights 0 and 1024 respectively.
-	 * avg_thermal.load_avg tracks thermal pressure and the weighted
-	 * average uses the actual delta max capacity(load).
 	 */
 	used = READ_ONCE(rq->avg_rt.util_avg);
 	used += READ_ONCE(rq->avg_dl.util_avg);
-	used += thermal_load_avg(rq);
 
 	if (unlikely(used >= max))
 		return 1;