sched: fix group_entity's share update

Vincent Guittot Dec. 1, 2016, 4:38 p.m.
The update of the share of a cfs_rq is done when its load_avg is updated
but before the group_entity's load_avg has been updated for the past time
slot. This generates wrong load_avg accounting which can be significant
when small tasks are involved in the scheduling.

Let's take the example of a task TA that is dequeued from its task group TG1.
TA was the only task in TG1, which therefore becomes idle.

We have the sequence:

- dequeue_entity TA->se
    - update_load_avg(TA->se)
    - dequeue_entity_load_avg(TG1->cfs_rq, TA->se)
    - account_entity_dequeue(TG1->cfs_rq, TA->se)
          TG1->cfs_rq->load.weight = 0
    - update_cfs_shares(TG1->cfs_rq)
	        TG1->se->load.weight is updated with the new share of
		cfs_rq. TG1->se->load.weight = 0.
- dequeue_entity TG1->se
    - update_load_avg(TG1->se) but its weight is now zero, so the last time
slot (up to a tick) will be accounted with its new weight (0 in our case)
instead of its real weight. The last time slot is accounted as an idle one
whereas it was a running one.

If the running time of TA is short enough that no tick happens when it
runs, all running time of TG1->se will be accounted as idle time.

Instead, we should update the share of a cfs_rq (in fact the weight of its
group entity) only after having updated the load_avg of the group_entity.

update_cfs_shares() now takes the sched_entity as parameter instead of the
cfs_rq and the weight of the group_entity is updated only once its load_avg
has been synced with current time.

Cc: <>
Signed-off-by: Vincent Guittot <>


I have seen the problem on tip/sched/core, v4.8 and v4.7. Previous versions
might also have the problem but I haven't been able to test them yet.

 kernel/sched/fair.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)



diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 18d9e75..19092fa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2689,15 +2689,18 @@  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
 	struct task_group *tg;
-	struct sched_entity *se;
+	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 	long shares;
+	if (entity_is_task(se))
+		return;
 	tg = cfs_rq->tg;
-	se = tg->se[cpu_of(rq_of(cfs_rq))];
-	if (!se || throttled_hierarchy(cfs_rq))
+	if (throttled_hierarchy(cfs_rq))
 #ifndef CONFIG_SMP
 	if (likely(se->load.weight == tg->shares))
@@ -2707,8 +2710,10 @@  static void update_cfs_shares(struct cfs_rq *cfs_rq)
 	reweight_entity(cfs_rq_of(se), se, shares);
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
@@ -3583,9 +3588,9 @@  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		se->vruntime += cfs_rq->min_vruntime;
 	update_load_avg(se, UPDATE_TG);
+	update_cfs_shares(se);
 	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
-	update_cfs_shares(cfs_rq);
 	if (flags & ENQUEUE_WAKEUP)
 		place_entity(cfs_rq, se, 0);
@@ -3681,7 +3686,7 @@  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	/* return excess runtime on last dequeue */
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(se);
 	 * Now advance min_vruntime if @se was the entity holding it back,
@@ -3864,7 +3869,7 @@  entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * Ensure that runnable average is periodically updated.
 	update_load_avg(curr, UPDATE_TG);
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(curr);
@@ -4761,7 +4766,7 @@  enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_load_avg(se, UPDATE_TG);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(se);
 	if (!se)
@@ -4820,7 +4825,7 @@  static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_load_avg(se, UPDATE_TG);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(se);
 	if (!se)
@@ -9316,7 +9321,7 @@  int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		/* Possible calls to update_curr() need rq clock */
-			update_cfs_shares(group_cfs_rq(se));
+			update_cfs_shares(se);
 		raw_spin_unlock_irqrestore(&rq->lock, flags);