[RFC,v2,4/7] sched/fair: Use CFS util_avg_uclamp for utilization and frequency

Message ID 4f755ae12895bbc74a74bac56bf2ef0f30413a32.1706792708.git.hongyan.xia2@arm.com
State New

Commit Message

Hongyan Xia Feb. 1, 2024, 1:12 p.m. UTC
Switch to the new util_avg_uclamp for task and runqueue utilization.
Since task_util_est() calls task_util(), which now uses util_avg_uclamp,
util_est is now also a clamped value.

Now that we have the sum-aggregated CFS util value, we do not need to
consult uclamp buckets to know how the frequency should be clamped. We
simply look at the aggregated top-level root_cfs_util_uclamp to know
what frequency to choose.
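
For illustration, here is a toy userspace model of the idea (not kernel
code; the struct, helper names and numbers below are made up): each
task's utilization is clamped into its own uclamp window before being
summed at the root, and the frequency request then only needs that
single aggregate.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024

struct toy_task {
	unsigned long util_avg;		/* unclamped PELT utilization */
	unsigned long uclamp_min;	/* per-task minimum clamp */
	unsigned long uclamp_max;	/* per-task maximum clamp */
};

static unsigned long clamp_ul(unsigned long v, unsigned long lo,
			      unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	/* Made-up numbers: one uclamp_max-capped, one uclamp_min-boosted task. */
	struct toy_task tasks[] = {
		{ .util_avg = 300, .uclamp_min =   0, .uclamp_max =  100 },
		{ .util_avg =  50, .uclamp_min = 200, .uclamp_max = 1024 },
	};
	unsigned long root_util = 0, root_util_uclamp = 0;
	unsigned int i;

	for (i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
		/* Raw sum: what plain util_avg aggregation would give. */
		root_util += tasks[i].util_avg;
		/* Clamped sum: each task contributes its clamped utilization. */
		root_util_uclamp += clamp_ul(tasks[i].util_avg,
					     tasks[i].uclamp_min,
					     tasks[i].uclamp_max);
	}

	printf("raw sum: %lu, clamped sum driving frequency: %lu of %d\n",
	       root_util, root_util_uclamp, SCHED_CAPACITY_SCALE);
	return 0;
}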

TODO: Sum aggregation for RT tasks. I have already implemented RT sum
aggregation, which is only 49 lines of code, but I don't want RT to
distract from this series, which is mainly CFS-focused. RT will be sent
in a separate mini series.

Signed-off-by: Hongyan Xia <hongyan.xia2@arm.com>
---
 kernel/sched/core.c              | 17 ++++----------
 kernel/sched/cpufreq_schedutil.c | 10 ++------
 kernel/sched/fair.c              | 39 ++++++++++++++++----------------
 kernel/sched/sched.h             | 21 +++++++++++++----
 4 files changed, 42 insertions(+), 45 deletions(-)

Comments

Dietmar Eggemann March 15, 2024, 12:31 p.m. UTC | #1
On 01/02/2024 14:12, Hongyan Xia wrote:

[...]

> @@ -7685,11 +7697,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  static unsigned long
>  cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
>  {
> -	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
> -	unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
> +	struct rq *rq = cpu_rq(cpu);
> +	struct cfs_rq *cfs_rq = &rq->cfs;
> +	unsigned long util = root_cfs_util(rq);
> +	bool capped = uclamp_rq_is_capped(rq);

I tried to rerun the tests from your 2 ipynbs (cover letter), but this
makes the sum aggregation stack go sideways ...

If 'sched_uclamp_used' is set, then uclamp_rq_is_capped() will call
cpu_util_cfs()->cpu_util(), which then calls uclamp_rq_is_capped()
recursively, resulting in a stack overflow.

Do you have a fix for that which you can share? For the time being I
have removed the call to uclamp_rq_is_capped() in cpu_util().
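
For reference, here is a toy standalone program mirroring the shape of
that cycle (not kernel code; the toy_* names are made up and a depth
guard is added purely so the demo terminates instead of actually
overflowing the stack):

#include <stdio.h>

static int depth;

static unsigned long toy_cpu_util(int cpu);

static unsigned long toy_cpu_util_cfs(int cpu)
{
	/* Step 3: goes straight back into cpu_util(). */
	return toy_cpu_util(cpu);
}

static int toy_uclamp_rq_is_capped(int cpu)
{
	/* Step 2: needs the CFS utilization to decide whether the rq is capped. */
	return toy_cpu_util_cfs(cpu) < 1024;
}

static unsigned long toy_cpu_util(int cpu)
{
	if (++depth > 5) {
		printf("recursed %d times; without the guard this would overflow the stack\n",
		       depth);
		return 0;
	}
	/* Step 1: the v2 cpu_util() consults uclamp_rq_is_capped(). */
	(void)toy_uclamp_rq_is_capped(cpu);
	return 0;
}

int main(void)
{
	toy_cpu_util(0);
	return 0;
}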

[...]
Hongyan Xia March 15, 2024, 7:47 p.m. UTC | #2
On 15/03/2024 12:31, Dietmar Eggemann wrote:
> On 01/02/2024 14:12, Hongyan Xia wrote:
> 
> [...]
> 
>> @@ -7685,11 +7697,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>>   static unsigned long
>>   cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
>>   {
>> -	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
>> -	unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
>> +	struct rq *rq = cpu_rq(cpu);
>> +	struct cfs_rq *cfs_rq = &rq->cfs;
>> +	unsigned long util = root_cfs_util(rq);
>> +	bool capped = uclamp_rq_is_capped(rq);
> 
> I tried to rerun the tests from your 2 ipynbs (cover letter), but this
> makes the sum aggregation stack go sideways ...
> 
> If 'sched_uclamp_used' is set, then uclamp_rq_is_capped() will call
> cpu_util_cfs()->cpu_util(), which then calls uclamp_rq_is_capped()
> recursively, resulting in a stack overflow.
> 
> Do you have a fix for that which you can share? For the time being I
> have removed the call to uclamp_rq_is_capped() in cpu_util().

My apologies. This has long since been fixed; here is the diff:

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1ebdd0b9ebca..d5dcda036e0d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3018,9 +3018,8 @@ static inline bool uclamp_rq_is_capped(struct rq *rq)
         if (!static_branch_likely(&sched_uclamp_used))
                 return false;

-       rq_uclamp_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
-       rq_real_util = READ_ONCE(rq->cfs.avg.util_avg) +
-                      READ_ONCE(rq->avg_rt.util_avg);
+       rq_uclamp_util = READ_ONCE(rq->root_cfs_util_uclamp);
+       rq_real_util = READ_ONCE(rq->cfs.avg.util_avg);

         return rq_uclamp_util < SCHED_CAPACITY_SCALE &&
                rq_real_util > rq_uclamp_util;

> [...]
>

Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index db4be4921e7f..0bedc05c883f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7465,6 +7465,9 @@  int sched_core_idle_cpu(int cpu)
  * The DL bandwidth number otoh is not a measured metric but a value computed
  * based on the task model parameters and gives the minimal utilization
  * required to meet deadlines.
+ *
+ * The util_cfs parameter has already taken uclamp into account (unless uclamp
+ * support is not compiled in).
  */
 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 				 unsigned long *min,
@@ -7490,13 +7493,7 @@  unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 	}
 
 	if (min) {
-		/*
-		 * The minimum utilization returns the highest level between:
-		 * - the computed DL bandwidth needed with the IRQ pressure which
-		 *   steals time to the deadline task.
-		 * - The minimum performance requirement for CFS and/or RT.
-		 */
-		*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+		*min = irq + cpu_bw_dl(rq);
 
 		/*
 		 * When an RT task is runnable and uclamp is not used, we must
@@ -7515,12 +7512,8 @@  unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 	util = util_cfs + cpu_util_rt(rq);
 	util += cpu_util_dl(rq);
 
-	/*
-	 * The maximum hint is a soft bandwidth requirement, which can be lower
-	 * than the actual utilization because of uclamp_max requirements.
-	 */
 	if (max)
-		*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+		*max = scale;
 
 	if (util >= scale)
 		return scale;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 95c3c097083e..48a4e4a685d0 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -381,11 +381,8 @@  static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
 	/*
 	 * Do not reduce the frequency if the CPU has not been idle
 	 * recently, as the reduction is likely to be premature then.
-	 *
-	 * Except when the rq is capped by uclamp_max.
 	 */
-	if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
-	    sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
+	if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
 	    !sg_policy->need_freq_update) {
 		next_f = sg_policy->next_freq;
 
@@ -435,11 +432,8 @@  static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 	/*
 	 * Do not reduce the target performance level if the CPU has not been
 	 * idle recently, as the reduction is likely to be premature then.
-	 *
-	 * Except when the rq is capped by uclamp_max.
 	 */
-	if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
-	    sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
+	if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
 		sg_cpu->util = prev_util;
 
 	cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 36357cfaf48d..b92739e1c52f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4821,10 +4821,17 @@  static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 
 static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
 
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long task_util(struct task_struct *p)
+{
+	return READ_ONCE(p->se.avg.util_avg_uclamp);
+}
+#else
 static inline unsigned long task_util(struct task_struct *p)
 {
 	return READ_ONCE(p->se.avg.util_avg);
 }
+#endif
 
 static inline unsigned long task_runnable(struct task_struct *p)
 {
@@ -4932,8 +4939,13 @@  static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * To avoid underestimate of task utilization, skip updates of EWMA if
 	 * we cannot grant that thread got all CPU time it wanted.
 	 */
-	if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
+	if ((READ_ONCE(p->se.avg.util_avg) + UTIL_EST_MARGIN) <
+			task_runnable(p)) {
+		ewma = clamp(ewma,
+			     uclamp_eff_value(p, UCLAMP_MIN),
+			     uclamp_eff_value(p, UCLAMP_MAX));
 		goto done;
+	}
 
 
 	/*
@@ -7685,11 +7697,13 @@  static int select_idle_sibling(struct task_struct *p, int prev, int target)
 static unsigned long
 cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
 {
-	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
-	unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+	struct rq *rq = cpu_rq(cpu);
+	struct cfs_rq *cfs_rq = &rq->cfs;
+	unsigned long util = root_cfs_util(rq);
+	bool capped = uclamp_rq_is_capped(rq);
 	unsigned long runnable;
 
-	if (boost) {
+	if (boost && !capped) {
 		runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
 		util = max(util, runnable);
 	}
@@ -7867,7 +7881,6 @@  eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
 	int cpu;
 
 	for_each_cpu(cpu, pd_cpus) {
-		struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
 		unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
 		unsigned long eff_util, min, max;
 
@@ -7880,20 +7893,6 @@  eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
 		 */
 		eff_util = effective_cpu_util(cpu, util, &min, &max);
 
-		/* Task's uclamp can modify min and max value */
-		if (tsk && uclamp_is_used()) {
-			min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
-
-			/*
-			 * If there is no active max uclamp constraint,
-			 * directly use task's one, otherwise keep max.
-			 */
-			if (uclamp_rq_is_idle(cpu_rq(cpu)))
-				max = uclamp_eff_value(p, UCLAMP_MAX);
-			else
-				max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
-		}
-
 		eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
 		max_util = max(max_util, eff_util);
 	}
@@ -7996,7 +7995,7 @@  static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	target = prev_cpu;
 
 	sync_entity_load_avg(&p->se);
-	if (!task_util_est(p) && p_util_min == 0)
+	if (!task_util_est(p))
 		goto unlock;
 
 	eenv_task_busy_time(&eenv, p, prev_cpu);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce80b87b549b..3ee28822f48f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3062,16 +3062,17 @@  static inline bool uclamp_rq_is_idle(struct rq *rq)
 /* Is the rq being capped/throttled by uclamp_max? */
 static inline bool uclamp_rq_is_capped(struct rq *rq)
 {
-	unsigned long rq_util;
-	unsigned long max_util;
+	unsigned long rq_uclamp_util, rq_real_util;
 
 	if (!static_branch_likely(&sched_uclamp_used))
 		return false;
 
-	rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
-	max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+	rq_uclamp_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
+	rq_real_util = READ_ONCE(rq->cfs.avg.util_avg) +
+		       READ_ONCE(rq->avg_rt.util_avg);
 
-	return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
+	return rq_uclamp_util < SCHED_CAPACITY_SCALE &&
+	       rq_real_util > rq_uclamp_util;
 }
 
 /*
@@ -3087,6 +3088,11 @@  static inline bool uclamp_is_used(void)
 	return static_branch_likely(&sched_uclamp_used);
 }
 
+static inline unsigned long root_cfs_util(struct rq *rq)
+{
+	return READ_ONCE(rq->root_cfs_util_uclamp);
+}
+
 static inline void util_uclamp_enqueue(struct sched_avg *avg,
 				       struct task_struct *p)
 {
@@ -3160,6 +3166,11 @@  static inline bool uclamp_rq_is_idle(struct rq *rq)
 	return false;
 }
 
+static inline unsigned long root_cfs_util(struct rq *rq)
+{
+	return READ_ONCE(rq->cfs.avg.util_avg);
+}
+
 static inline void remove_root_cfs_util_uclamp(struct task_struct *p)
 {
 }