
[RFC,12/16] sched/pelt: Add new waiting_avg to record when runnable && !running

Message ID 20240820163512.1096301-13-qyousef@layalina.io
State New
Series sched/fair/schedutil: Better manage system response time

Commit Message

Qais Yousef Aug. 20, 2024, 4:35 p.m. UTC
This info is useful for understanding how long tasks end up waiting
behind other tasks. It is recorded for tasks only, and added to and
subtracted from the root cfs_rq in __update_load_avg_se().

It also helps to decouple util_avg, which indicates a task's
computational demand, from the fact that the CPU might need to run
faster to reduce waiting time. This has been a point of confusion in
the past when discussing uclamp and util_avg: not keeping the frequency
high means tasks take longer to run and incur delays. Isolating this
source of delay into its own signal is a better way to take it into
account when making decisions, independently of the task's/CPU's
computational demand.

It is not used yet, but will be used later to help drive DVFS
headroom. It could also become a helpful metric for managing waiting
latencies in general, for example in load balancing.
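
For illustration only (not part of this patch): a later consumer such
as the DVFS headroom logic could read the root cfs_rq signal roughly
along these lines, granting more headroom when tasks spend a lot of
time waiting. waiting_avg_of() and util_with_waiting() are invented
names for the sketch.

	static inline unsigned long waiting_avg_of(int cpu)
	{
		return READ_ONCE(cpu_rq(cpu)->cfs.avg.waiting_avg);
	}

	static inline unsigned long util_with_waiting(unsigned long util, int cpu)
	{
		/* More waiting => more headroom, capped at CPU capacity. */
		return min(util + waiting_avg_of(cpu),
			   arch_scale_cpu_capacity(cpu));
	}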

TODO: waiting_avg should use rq_clock_task() as it doesn't care about
invariance. Waiting time should reflect the actual wait in real time,
as this is the measure of latency that users care about.

Signed-off-by: Qais Yousef <qyousef@layalina.io>
---
 include/linux/sched.h |  2 ++
 kernel/sched/debug.c  |  5 +++++
 kernel/sched/fair.c   | 32 +++++++++++++++++++++++++++++-
 kernel/sched/pelt.c   | 45 ++++++++++++++++++++++++++++++-------------
 4 files changed, 70 insertions(+), 14 deletions(-)

Comments

Dietmar Eggemann Sept. 18, 2024, 7:01 a.m. UTC | #1
On 20/08/2024 18:35, Qais Yousef wrote:
> This info is useful for understanding how long tasks end up waiting
> behind other tasks. It is recorded for tasks only, and added to and
> subtracted from the root cfs_rq in __update_load_avg_se().
> 
> It also helps to decouple util_avg, which indicates a task's
> computational demand, from the fact that the CPU might need to run
> faster to reduce waiting time. This has been a point of confusion in
> the past when discussing uclamp and util_avg: not keeping the frequency
> high means tasks take longer to run and incur delays. Isolating this
> source of delay into its own signal is a better way to take it into
> account when making decisions, independently of the task's/CPU's
> computational demand.
> 
> It is not used yet, but will be used later to help drive DVFS
> headroom. It could also become a helpful metric for managing waiting
> latencies in general, for example in load balancing.
> 
> TODO: waiting_avg should use rq_clock_task() as it doesn't care about
> invariance. Waiting time should reflect the actual wait in real time,
> as this is the measure of latency that users care about.

Since you use PELT for the update, you're bound to use rq_clock_pelt().
If we could have PELT with two time values, then we could have
'util_avg' and 'invariant util_avg' to cure the slow ramp-up on tiny
CPUs and/or low OPPs, and we wouldn't have to add all of this extra code.
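
I.e. conceptually something like this (purely illustrative, none of
these fields exist):

	struct sched_avg {
		u64		last_update_time;	/* rq_clock_pelt() based */
		u64		last_update_time_wall;	/* rq_clock_task() based */
		...
		unsigned long	util_avg;		/* invariant, for OPP selection */
		unsigned long	util_avg_wall;		/* wall-clock based, fast ramp-up */
		...
	};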

[...]

> @@ -4744,8 +4760,15 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>  	 * Track task load average for carrying it to new CPU after migrated, and
>  	 * track group sched_entity load average for task_h_load calculation in migration
>  	 */
> -	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
> +	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
> +		bool update_rq_waiting_avg = entity_is_task(se) && se_runnable(se);
> +
> +		if (update_rq_waiting_avg)
> +			sub_waiting_avg(&rq_of(cfs_rq)->cfs, se);
>  		__update_load_avg_se(now, cfs_rq, se);
> +		if (update_rq_waiting_avg)
> +			add_waiting_avg(&rq_of(cfs_rq)->cfs, se);
> +	}

That's a pretty convoluted design: util_est-style attach/detach within
the PELT update, but only for tasks and not for all se's.

Doesn't 'p->se.avg.runnable_avg - p->se.avg.util_avg' give you what you
want? It's invariant but so is this here.

Commit 50181c0cff31 ("sched/pelt: Avoid underestimation of task
utilization") uses some of it already.

+	/*
+	 * To avoid underestimate of task utilization, skip updates of EWMA if
+	 * we cannot grant that thread got all CPU time it wanted.
+	 */
+	if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p))
+		goto done;
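
IOW, a per-task waiting estimate could be derived from what we already
track, something like (sketch only, task_waiting() is a made-up name):

	/*
	 * runnable_avg accrues while the task is runnable (running or
	 * waiting), util_avg only while it is actually running, so the
	 * difference approximates time spent waiting on the rq.
	 */
	static inline unsigned long task_waiting(struct task_struct *p)
	{
		unsigned long runnable = READ_ONCE(p->se.avg.runnable_avg);
		unsigned long util = READ_ONCE(p->se.avg.util_avg);

		return runnable > util ? runnable - util : 0;
	}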

[...]

> @@ -6786,6 +6814,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  	 * estimated utilization, before we update schedutil.
>  	 */
>  	util_est_enqueue(&rq->cfs, p);
> +	add_waiting_avg(&rq->cfs, se);

This would also have to be checked against the new p->se.sched_delayed
thing.
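
Something along these lines, perhaps (sketch only; how delayed-dequeue
tasks should be handled here is exactly the open question):

	util_est_enqueue(&rq->cfs, p);
	/* Skip tasks parked in the delayed-dequeue state (exact rule TBD). */
	if (!p->se.sched_delayed)
		add_waiting_avg(&rq->cfs, &p->se);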

>  	/*
>  	 * If in_iowait is set, the code below may not trigger any cpufreq
> @@ -6874,6 +6903,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  	bool was_sched_idle = sched_idle_rq(rq);
>  
>  	util_est_dequeue(&rq->cfs, p);
> +	sub_waiting_avg(&rq->cfs, se);
                                  ^^
This won't compile. se vs. &p->se
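
I.e. it would have to be something like:

	sub_waiting_avg(&rq->cfs, &p->se);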

[...]

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a30ee43a25fb..f332ce5e226f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -477,10 +477,12 @@  struct sched_avg {
 	u64				last_update_time;
 	u64				load_sum;
 	u64				runnable_sum;
+	u64				waiting_sum;
 	u32				util_sum;
 	u32				period_contrib;
 	unsigned long			load_avg;
 	unsigned long			runnable_avg;
+	unsigned long			waiting_avg;
 	unsigned long			util_avg;
 	unsigned int			util_est;
 } ____cacheline_aligned;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c1eb9a1afd13..5fa2662a4a50 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -528,6 +528,7 @@  static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	P(se->avg.load_avg);
 	P(se->avg.util_avg);
 	P(se->avg.runnable_avg);
+	P(se->avg.waiting_avg);
 #endif
 
 #undef PN_SCHEDSTAT
@@ -683,6 +684,8 @@  void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->avg.load_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "runnable_avg",
 			cfs_rq->avg.runnable_avg);
+	SEQ_printf(m, "  .%-30s: %lu\n", "waiting_avg",
+			cfs_rq->avg.waiting_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
 	SEQ_printf(m, "  .%-30s: %u\n", "util_est",
@@ -1071,9 +1074,11 @@  void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 #ifdef CONFIG_SMP
 	P(se.avg.load_sum);
 	P(se.avg.runnable_sum);
+	P(se.avg.waiting_sum);
 	P(se.avg.util_sum);
 	P(se.avg.load_avg);
 	P(se.avg.runnable_avg);
+	P(se.avg.waiting_avg);
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
 	PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3d9794db58e1..a8dbba0b755e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4726,6 +4726,22 @@  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	trace_pelt_cfs_tp(cfs_rq);
 }
 
+static inline void add_waiting_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long waiting_avg;
+	waiting_avg = READ_ONCE(cfs_rq->avg.waiting_avg);
+	waiting_avg += READ_ONCE(se->avg.waiting_avg);
+	WRITE_ONCE(cfs_rq->avg.waiting_avg, waiting_avg);
+}
+
+static inline void sub_waiting_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long waiting_avg;
+	waiting_avg = READ_ONCE(cfs_rq->avg.waiting_avg);
+	waiting_avg -= min(waiting_avg, READ_ONCE(se->avg.waiting_avg));
+	WRITE_ONCE(cfs_rq->avg.waiting_avg, waiting_avg);
+}
+
 /*
  * Optional action to be done while updating the load average
  */
@@ -4744,8 +4760,15 @@  static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	 * Track task load average for carrying it to new CPU after migrated, and
 	 * track group sched_entity load average for task_h_load calculation in migration
 	 */
-	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+		bool update_rq_waiting_avg = entity_is_task(se) && se_runnable(se);
+
+		if (update_rq_waiting_avg)
+			sub_waiting_avg(&rq_of(cfs_rq)->cfs, se);
 		__update_load_avg_se(now, cfs_rq, se);
+		if (update_rq_waiting_avg)
+			add_waiting_avg(&rq_of(cfs_rq)->cfs, se);
+	}
 
 	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
 	decayed |= propagate_entity_load_avg(se);
@@ -5182,6 +5205,11 @@  attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 
+static inline void
+add_waiting_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+sub_waiting_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+
 static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
 {
 	return 0;
@@ -6786,6 +6814,7 @@  enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 * estimated utilization, before we update schedutil.
 	 */
 	util_est_enqueue(&rq->cfs, p);
+	add_waiting_avg(&rq->cfs, se);
 
 	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
@@ -6874,6 +6903,7 @@  static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	bool was_sched_idle = sched_idle_rq(rq);
 
 	util_est_dequeue(&rq->cfs, p);
+	sub_waiting_avg(&rq->cfs, se);
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 536575757420..f0974abf8566 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -103,7 +103,8 @@  static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
  */
 static __always_inline u32
 accumulate_sum(u64 delta, struct sched_avg *sa,
-	       unsigned long load, unsigned long runnable, int running)
+	       unsigned long load, unsigned long runnable, int running,
+	       bool is_task)
 {
 	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
 	u64 periods;
@@ -118,6 +119,7 @@  accumulate_sum(u64 delta, struct sched_avg *sa,
 		sa->load_sum = decay_load(sa->load_sum, periods);
 		sa->runnable_sum =
 			decay_load(sa->runnable_sum, periods);
+		sa->waiting_sum = decay_load((u64)(sa->waiting_sum), periods);
 		sa->util_sum = decay_load((u64)(sa->util_sum), periods);
 
 		/*
@@ -147,6 +149,8 @@  accumulate_sum(u64 delta, struct sched_avg *sa,
 		sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;
 	if (running)
 		sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
+	if (is_task && runnable && !running)
+		sa->waiting_sum += contrib << SCHED_CAPACITY_SHIFT;
 
 	return periods;
 }
@@ -181,7 +185,8 @@  accumulate_sum(u64 delta, struct sched_avg *sa,
  */
 static __always_inline int
 ___update_load_sum(u64 now, struct sched_avg *sa,
-		  unsigned long load, unsigned long runnable, int running)
+		  unsigned long load, unsigned long runnable, int running,
+		  bool is_task)
 {
 	int time_shift;
 	u64 delta;
@@ -232,7 +237,7 @@  ___update_load_sum(u64 now, struct sched_avg *sa,
 	 * Step 1: accumulate *_sum since last_update_time. If we haven't
 	 * crossed period boundaries, finish.
 	 */
-	if (!accumulate_sum(delta, sa, load, runnable, running))
+	if (!accumulate_sum(delta, sa, load, runnable, running, is_task))
 		return 0;
 
 	return 1;
@@ -272,6 +277,7 @@  ___update_load_avg(struct sched_avg *sa, unsigned long load)
 	 */
 	sa->load_avg = div_u64(load * sa->load_sum, divider);
 	sa->runnable_avg = div_u64(sa->runnable_sum, divider);
+	sa->waiting_avg = div_u64(sa->waiting_sum, divider);
 	WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
 }
 
@@ -303,7 +309,7 @@  ___update_load_avg(struct sched_avg *sa, unsigned long load)
 
 int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
 {
-	if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
+	if (___update_load_sum(now, &se->avg, 0, 0, 0, false)) {
 		___update_load_avg(&se->avg, se_weight(se));
 		trace_pelt_se_tp(se);
 		return 1;
@@ -314,10 +320,17 @@  int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
 
 int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	bool is_task = entity_is_task(se);
+
+	if (is_task)
+		rq_of(cfs_rq)->cfs.avg.waiting_avg -= se->avg.waiting_avg;
+
 	if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
-				cfs_rq->curr == se)) {
+				cfs_rq->curr == se, is_task)) {
 
 		___update_load_avg(&se->avg, se_weight(se));
+		if (is_task)
+			rq_of(cfs_rq)->cfs.avg.waiting_avg += se->avg.waiting_avg;
 		cfs_se_util_change(&se->avg);
 		trace_pelt_se_tp(se);
 		return 1;
@@ -331,7 +344,8 @@  int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
 	if (___update_load_sum(now, &cfs_rq->avg,
 				scale_load_down(cfs_rq->load.weight),
 				cfs_rq->h_nr_running,
-				cfs_rq->curr != NULL)) {
+				cfs_rq->curr != NULL,
+				false)) {
 
 		___update_load_avg(&cfs_rq->avg, 1);
 		trace_pelt_cfs_tp(cfs_rq);
@@ -357,7 +371,8 @@  int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
 	if (___update_load_sum(now, &rq->avg_rt,
 				running,
 				running,
-				running)) {
+				running,
+				false)) {
 
 		___update_load_avg(&rq->avg_rt, 1);
 		trace_pelt_rt_tp(rq);
@@ -383,7 +398,8 @@  int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
 	if (___update_load_sum(now, &rq->avg_dl,
 				running,
 				running,
-				running)) {
+				running,
+				false)) {
 
 		___update_load_avg(&rq->avg_dl, 1);
 		trace_pelt_dl_tp(rq);
@@ -414,7 +430,8 @@  int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
 	if (___update_load_sum(now, &rq->avg_hw,
 			       capacity,
 			       capacity,
-			       capacity)) {
+			       capacity,
+			       false)) {
 		___update_load_avg(&rq->avg_hw, 1);
 		trace_pelt_hw_tp(rq);
 		return 1;
@@ -462,11 +479,13 @@  int update_irq_load_avg(struct rq *rq, u64 running)
 	ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
 				0,
 				0,
-				0);
+				0,
+				false);
 	ret += ___update_load_sum(rq->clock, &rq->avg_irq,
 				1,
 				1,
-				1);
+				1,
+				false);
 
 	if (ret) {
 		___update_load_avg(&rq->avg_irq, 1);
@@ -536,7 +555,7 @@  unsigned long approximate_util_avg(unsigned long util, u64 delta)
 	if (unlikely(!delta))
 		return util;
 
-	accumulate_sum(delta << sched_pelt_lshift, &sa, 1, 0, 1);
+	accumulate_sum(delta << sched_pelt_lshift, &sa, 1, 0, 1, false);
 	___update_load_avg(&sa, 0);
 
 	return sa.util_avg;
@@ -555,7 +574,7 @@  u64 approximate_runtime(unsigned long util)
 		return runtime;
 
 	while (sa.util_avg < util) {
-		accumulate_sum(delta, &sa, 1, 0, 1);
+		accumulate_sum(delta, &sa, 1, 0, 1, false);
 		___update_load_avg(&sa, 0);
 		runtime++;
 	}