@@ -477,10 +477,12 @@ struct sched_avg {
u64 last_update_time;
u64 load_sum;
u64 runnable_sum;
+ u64 waiting_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
unsigned long runnable_avg;
+ unsigned long waiting_avg;
unsigned long util_avg;
unsigned int util_est;
} ____cacheline_aligned;
@@ -528,6 +528,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->avg.load_avg);
P(se->avg.util_avg);
P(se->avg.runnable_avg);
+ P(se->avg.waiting_avg);
#endif
#undef PN_SCHEDSTAT
@@ -683,6 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
cfs_rq->avg.runnable_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "waiting_avg",
+ cfs_rq->avg.waiting_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %u\n", "util_est",
@@ -1071,9 +1074,11 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
#ifdef CONFIG_SMP
P(se.avg.load_sum);
P(se.avg.runnable_sum);
+ P(se.avg.waiting_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
P(se.avg.runnable_avg);
+ P(se.avg.waiting_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
@@ -4726,6 +4726,22 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
trace_pelt_cfs_tp(cfs_rq);
}
+static inline void add_waiting_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long waiting_avg;
+ waiting_avg = READ_ONCE(cfs_rq->avg.waiting_avg);
+ waiting_avg += READ_ONCE(se->avg.waiting_avg);
+ WRITE_ONCE(cfs_rq->avg.waiting_avg, waiting_avg);
+}
+
+static inline void sub_waiting_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long waiting_avg;
+ waiting_avg = READ_ONCE(cfs_rq->avg.waiting_avg);
+ waiting_avg -= min(waiting_avg, READ_ONCE(se->avg.waiting_avg));
+ WRITE_ONCE(cfs_rq->avg.waiting_avg, waiting_avg);
+}
+
/*
* Optional action to be done while updating the load average
*/
@@ -4744,8 +4760,15 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calculation in migration
*/
- if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+ bool update_rq_waiting_avg = entity_is_task(se) && se_runnable(se);
+
+ if (update_rq_waiting_avg)
+ sub_waiting_avg(&rq_of(cfs_rq)->cfs, se);
__update_load_avg_se(now, cfs_rq, se);
+ if (update_rq_waiting_avg)
+ add_waiting_avg(&rq_of(cfs_rq)->cfs, se);
+ }
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
@@ -5182,6 +5205,11 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+add_waiting_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+sub_waiting_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+
static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
{
return 0;
@@ -6786,6 +6814,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* estimated utilization, before we update schedutil.
*/
util_est_enqueue(&rq->cfs, p);
+ add_waiting_avg(&rq->cfs, se);
/*
* If in_iowait is set, the code below may not trigger any cpufreq
@@ -6874,6 +6903,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bool was_sched_idle = sched_idle_rq(rq);
util_est_dequeue(&rq->cfs, p);
+ sub_waiting_avg(&rq->cfs, se);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -103,7 +103,8 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
*/
static __always_inline u32
accumulate_sum(u64 delta, struct sched_avg *sa,
- unsigned long load, unsigned long runnable, int running)
+ unsigned long load, unsigned long runnable, int running,
+ bool is_task)
{
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
@@ -118,6 +119,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
sa->load_sum = decay_load(sa->load_sum, periods);
sa->runnable_sum =
decay_load(sa->runnable_sum, periods);
+ sa->waiting_sum = decay_load((u64)(sa->waiting_sum), periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
/*
@@ -147,6 +149,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;
if (running)
sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
+ if (is_task && runnable && !running)
+ sa->waiting_sum += contrib << SCHED_CAPACITY_SHIFT;
return periods;
}
@@ -181,7 +185,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
*/
static __always_inline int
___update_load_sum(u64 now, struct sched_avg *sa,
- unsigned long load, unsigned long runnable, int running)
+ unsigned long load, unsigned long runnable, int running,
+ bool is_task)
{
int time_shift;
u64 delta;
@@ -232,7 +237,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, sa, load, runnable, running))
+ if (!accumulate_sum(delta, sa, load, runnable, running, is_task))
return 0;
return 1;
@@ -272,6 +277,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
*/
sa->load_avg = div_u64(load * sa->load_sum, divider);
sa->runnable_avg = div_u64(sa->runnable_sum, divider);
+ sa->waiting_avg = div_u64(sa->waiting_sum, divider);
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}
@@ -303,7 +309,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
- if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
+ if (___update_load_sum(now, &se->avg, 0, 0, 0, false)) {
___update_load_avg(&se->avg, se_weight(se));
trace_pelt_se_tp(se);
return 1;
@@ -314,10 +320,17 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ bool is_task = entity_is_task(se);
+
+ if (is_task)
+ rq_of(cfs_rq)->cfs.avg.waiting_avg -= se->avg.waiting_avg;
+
if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
- cfs_rq->curr == se)) {
+ cfs_rq->curr == se, is_task)) {
___update_load_avg(&se->avg, se_weight(se));
+ if (is_task)
+ rq_of(cfs_rq)->cfs.avg.waiting_avg += se->avg.waiting_avg;
cfs_se_util_change(&se->avg);
trace_pelt_se_tp(se);
return 1;
@@ -331,7 +344,8 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
cfs_rq->h_nr_running,
- cfs_rq->curr != NULL)) {
+ cfs_rq->curr != NULL,
+ false)) {
___update_load_avg(&cfs_rq->avg, 1);
trace_pelt_cfs_tp(cfs_rq);
@@ -357,7 +371,8 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
if (___update_load_sum(now, &rq->avg_rt,
running,
running,
- running)) {
+ running,
+ false)) {
___update_load_avg(&rq->avg_rt, 1);
trace_pelt_rt_tp(rq);
@@ -383,7 +398,8 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
if (___update_load_sum(now, &rq->avg_dl,
running,
running,
- running)) {
+ running,
+ false)) {
___update_load_avg(&rq->avg_dl, 1);
trace_pelt_dl_tp(rq);
@@ -414,7 +430,8 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
if (___update_load_sum(now, &rq->avg_hw,
capacity,
capacity,
- capacity)) {
+ capacity,
+ false)) {
___update_load_avg(&rq->avg_hw, 1);
trace_pelt_hw_tp(rq);
return 1;
@@ -462,11 +479,13 @@ int update_irq_load_avg(struct rq *rq, u64 running)
ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
0,
0,
- 0);
+ 0,
+ false);
ret += ___update_load_sum(rq->clock, &rq->avg_irq,
1,
1,
- 1);
+ 1,
+ false);
if (ret) {
___update_load_avg(&rq->avg_irq, 1);
@@ -536,7 +555,7 @@ unsigned long approximate_util_avg(unsigned long util, u64 delta)
if (unlikely(!delta))
return util;
- accumulate_sum(delta << sched_pelt_lshift, &sa, 1, 0, 1);
+ accumulate_sum(delta << sched_pelt_lshift, &sa, 1, 0, 1, false);
___update_load_avg(&sa, 0);
return sa.util_avg;
@@ -555,7 +574,7 @@ u64 approximate_runtime(unsigned long util)
return runtime;
while (sa.util_avg < util) {
- accumulate_sum(delta, &sa, 1, 0, 1);
+ accumulate_sum(delta, &sa, 1, 0, 1, false);
___update_load_avg(&sa, 0);
runtime++;
}
This info is useful to understand how long tasks end up waiting behind
other tasks. It is recorded for tasks only, and added to/subtracted from
the root cfs_rq in __update_load_avg_se().

It also helps to decouple util_avg, which indicates a task's
computational demand, from the fact that the CPU might need to run
faster to reduce the waiting time. This has been a point of confusion in
past discussions of uclamp and util_avg: not keeping the frequency high
means tasks take longer to run and get delayed. Isolating this source of
delay into its own signal is a better way to take it into account when
making decisions, independently of the task's or CPU's computational
demand.

The signal is not used yet, but will later be used to help drive the
DVFS headroom. It could also become a helpful metric for managing
waiting latencies in general, for example in load balancing.

TODO: waiting_avg should use rq_clock_task() as it doesn't care about
invariance. Waiting time should reflect the actual wall-clock wait, as
this is the measure of latency that users care about.

Signed-off-by: Qais Yousef <qyousef@layalina.io>
---
 include/linux/sched.h |  2 ++
 kernel/sched/debug.c  |  5 +++++
 kernel/sched/fair.c   | 32 +++++++++++++++++++++++++++++++-
 kernel/sched/pelt.c   | 45 ++++++++++++++++++++++++++++++++-------------
 4 files changed, 70 insertions(+), 14 deletions(-)
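For reviewers who want a feel for the numbers, below is a rough,
self-contained userspace model of the new signal (an illustration under
the usual PELT assumptions, not the kernel implementation; the
alternating wait pattern is made up for the example). Per 1024us period
the sum decays by y, with y^32 == 0.5, and gains a full contribution
only while the task is runnable but not running. Since sum(y^i) is
1/(1 - y), a task that waits every period converges to 1024, and one
that waits every other period to roughly half of that.

#include <stdio.h>

/*
 * Toy model of waiting_avg: decay the sum by y each PELT period, add a
 * full contribution only for periods spent runnable but not running,
 * then scale so that an always-waiting task saturates at 1024.
 */
int main(void)
{
	const double y = 0.9785720621;	/* 32nd root of 0.5 */
	double sum = 0.0;
	int period;

	for (period = 1; period <= 1000; period++) {
		/* waiting == runnable but not running */
		int waiting = period & 1;

		sum = sum * y + (waiting ? 1.0 : 0.0);
		if (period % 250 == 0)
			printf("period %4d: waiting_avg ~= %.0f\n",
			       period, 1024.0 * sum * (1.0 - y));
	}

	return 0;
}

Built with a plain "gcc -O2", the printed value settles around ~500 for
the alternating pattern, which matches the intuition that waiting_avg
tracks the fraction of time spent runnable but not running, scaled to
SCHED_CAPACITY_SCALE.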