
[4/7,v3] sched: propagate load during synchronous attach/detach

Message ID 1473666472-13749-5-git-send-email-vincent.guittot@linaro.org
State New

Commit Message

Vincent Guittot Sept. 12, 2016, 7:47 a.m. UTC
When a task moves from/to a cfs_rq, we set a flag which is then used to
propagate the change at the parent level (sched_entity and cfs_rq) during
the next update. If the cfs_rq is throttled, the flag will stay pending until
the cfs_rq is unthrottled.

For propagating the utilization, we copy the utilization of child cfs_rq to
the sched_entity.

For propagating the load, we have to take into account the load of the
whole task group in order to evaluate the load of the sched_entity.
Similarly to what was done before the rewrite of PELT, we add a correction
factor in case the task group's load is less than its share so it will
contribute the same load as a task of equal weight.
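
Roughly, the scaling boils down to (a sketch of the math only; the exact
helpers and clamping are in the code below):

	se_load = gcfs_rq_load * tg->shares / tg_load

with tg_load clamped so that tg_load >= gcfs_rq_load, and with the result
corrected back to gcfs_rq_load when tg_load < tg->shares (the <1 CPU case).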

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

---
 kernel/sched/fair.c  | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |   1 +
 2 files changed, 170 insertions(+), 1 deletion(-)

-- 
1.9.1

Comments

Vincent Guittot Sept. 15, 2016, 1:01 p.m. UTC | #1
On 15 September 2016 at 14:55, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Sep 12, 2016 at 09:47:49AM +0200, Vincent Guittot wrote:

>> +/* Take into account change of utilization of a child task group */

>> +static inline void

>> +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)

>> +{

>> +     struct cfs_rq *gcfs_rq =  group_cfs_rq(se);

>> +     long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;

>> +

>> +     /* Nothing to update */

>> +     if (!delta)

>> +             return;

>> +

>> +     /* Set new sched_entity's utilization */

>> +     se->avg.util_avg = gcfs_rq->avg.util_avg;

>> +     se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;

>> +

>> +     /* Update parent cfs_rq utilization */

>> +     cfs_rq->avg.util_avg =  max_t(long, cfs_rq->avg.util_avg + delta, 0);

>

> This..

>

>> +     cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;

>> +}

>> +

>> +/* Take into account change of load of a child task group */

>> +static inline void

>> +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)

>> +{

>> +     struct cfs_rq *gcfs_rq = group_cfs_rq(se);

>> +     long delta, load = gcfs_rq->avg.load_avg;

>> +

>> +     /* If the load of group cfs_rq is null, the load of the

>> +      * sched_entity will also be null so we can skip the formula

>> +      */

>> +     if (load) {

>> +             long tg_load;

>> +

>> +             /* Get tg's load and ensure tg_load > 0 */

>> +             tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;

>> +

>> +             /* Ensure tg_load >= load and updated with current load*/

>> +             tg_load -= gcfs_rq->tg_load_avg_contrib;

>> +             tg_load += load;

>> +

>> +             /* scale gcfs_rq's load into tg's shares*/

>> +             load *= scale_load_down(gcfs_rq->tg->shares);

>> +             load /= tg_load;

>> +

>> +             /*

>> +              * we need to compute a correction term in the case that the

>> +              * task group is consuming <1 cpu so that we would contribute

>> +              * the same load as a task of equal weight.

>> +             */

>> +             if (tg_load < scale_load_down(gcfs_rq->tg->shares)) {

>> +                     load *= tg_load;

>> +                     load /= scale_load_down(gcfs_rq->tg->shares);

>> +             }

>> +     }

>> +

>> +     delta = load - se->avg.load_avg;

>> +

>> +     /* Nothing to update */

>> +     if (!delta)

>> +             return;

>> +

>> +     /* Set new sched_entity's load */

>> +     se->avg.load_avg = load;

>> +     se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;

>> +

>> +     /* Update parent cfs_rq load */

>> +     cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg + delta, 0);

>

> And this..

>

>> +     cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;

>> +}

>

> Re-introduce the issue from: 897418922215 ("sched/fair: Fix cfs_rq avg

> tracking underflow").



Yes, sorry, I forgot this point. I will use sub_positive instead.
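
For instance, the parent cfs_rq update could then look something like this
(just a sketch reusing the existing sub_positive() helper; the next version
may end up different):

	if (delta > 0)
		cfs_rq->avg.util_avg += delta;
	else
		sub_positive(&cfs_rq->avg.util_avg, -delta);
	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;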
Vincent Guittot Sept. 15, 2016, 1:11 p.m. UTC | #2
On 15 September 2016 at 14:59, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Sep 12, 2016 at 09:47:49AM +0200, Vincent Guittot wrote:

>> +     /* If the load of group cfs_rq is null, the load of the

>> +      * sched_entity will also be null so we can skip the formula

>> +      */

>

> https://lkml.kernel.org/r/CA+55aFyQYJerovMsSoSKS7PessZBr4vNp-3QUUwhqk4A4_jcbg@mail.gmail.com

>


I'm going to fix it right now.

I thought that checkpatch.pl would have raised a warning for this
comment style issue.

>
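
(i.e. the block comment should presumably become something like:

	/*
	 * If the load of the group cfs_rq is null, the load of the
	 * sched_entity will also be null so we can skip the formula.
	 */

rather than starting the text on the /* line.)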
Dietmar Eggemann Sept. 15, 2016, 1:11 p.m. UTC | #3
On 12/09/16 08:47, Vincent Guittot wrote:
> When a task moves from/to a cfs_rq, we set a flag which is then used to

> propagate the change at parent level (sched_entity and cfs_rq) during

> next update. If the cfs_rq is throttled, the flag will stay pending until

> the cfs_rq is unthrottled.

> 

> For propagating the utilization, we copy the utilization of child cfs_rq to


s/child/group ?

> the sched_entity.

> 

> For propagating the load, we have to take into account the load of the

> whole task group in order to evaluate the load of the sched_entity.

> Similarly to what was done before the rewrite of PELT, we add a correction

> factor in case the task group's load is less than its share so it will

> contribute the same load as a task of equal weight.


What about cfs_rq->runnable_load_avg?

[...]

> +/* Take into account change of load of a child task group */

> +static inline void

> +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)

> +{

> +	struct cfs_rq *gcfs_rq = group_cfs_rq(se);

> +	long delta, load = gcfs_rq->avg.load_avg;

> +

> +	/* If the load of group cfs_rq is null, the load of the

> +	 * sched_entity will also be null so we can skip the formula

> +	 */

> +	if (load) {

> +		long tg_load;

> +

> +		/* Get tg's load and ensure tg_load > 0 */

> +		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;

> +

> +		/* Ensure tg_load >= load and updated with current load*/

> +		tg_load -= gcfs_rq->tg_load_avg_contrib;

> +		tg_load += load;

> +

> +		/* scale gcfs_rq's load into tg's shares*/

> +		load *= scale_load_down(gcfs_rq->tg->shares);

> +		load /= tg_load;

> +

> +		/*

> +		 * we need to compute a correction term in the case that the

> +		 * task group is consuming <1 cpu so that we would contribute

> +		 * the same load as a task of equal weight.


Wasn't 'consuming <1' related to 'NICE_0_LOAD' and not
scale_load_down(gcfs_rq->tg->shares) before the rewrite of PELT (v4.2,
__update_group_entity_contrib())?

> +		*/

> +		if (tg_load < scale_load_down(gcfs_rq->tg->shares)) {

> +			load *= tg_load;

> +			load /= scale_load_down(gcfs_rq->tg->shares);

> +		}

> +	}


[...]
Vincent Guittot Sept. 15, 2016, 2:31 p.m. UTC | #4
On 15 September 2016 at 15:11, Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:
> On 12/09/16 08:47, Vincent Guittot wrote:

>> When a task moves from/to a cfs_rq, we set a flag which is then used to

>> propagate the change at parent level (sched_entity and cfs_rq) during

>> next update. If the cfs_rq is throttled, the flag will stay pending until

>> the cfs_rq is unthrottled.

>>

>> For propagating the utilization, we copy the utilization of child cfs_rq to

>

> s/child/group ?

>

>> the sched_entity.

>>

>> For propagating the load, we have to take into account the load of the

>> whole task group in order to evaluate the load of the sched_entity.

>> Similarly to what was done before the rewrite of PELT, we add a correction

>> factor in case the task group's load is less than its share so it will

>> contribute the same load as a task of equal weight.

>

> What about cfs_rq->runnable_load_avg?


sched_entity's load is updated before being enqueued, so the up-to-date
value will be added to cfs_rq->runnable_load_avg... unless the se is
already enqueued; in that case cfs_rq->runnable_load_avg should also be
updated directly. I'm going to add this case.
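
Concretely, the extra step inside update_tg_cfs_load() might look something
like this (just a sketch; the exact clamping is an assumption at this point):

	/* se already enqueued: mirror the load delta into the runnable sums */
	if (se->on_rq) {
		cfs_rq->runnable_load_avg =
			max_t(long, cfs_rq->runnable_load_avg + delta, 0);
		cfs_rq->runnable_load_sum =
			max_t(s64, cfs_rq->runnable_load_sum + delta * LOAD_AVG_MAX, 0);
	}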

Thanks for pointing out this case

>

> [...]

>

>> +/* Take into account change of load of a child task group */

>> +static inline void

>> +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)

>> +{

>> +     struct cfs_rq *gcfs_rq = group_cfs_rq(se);

>> +     long delta, load = gcfs_rq->avg.load_avg;

>> +

>> +     /* If the load of group cfs_rq is null, the load of the

>> +      * sched_entity will also be null so we can skip the formula

>> +      */

>> +     if (load) {

>> +             long tg_load;

>> +

>> +             /* Get tg's load and ensure tg_load > 0 */

>> +             tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;

>> +

>> +             /* Ensure tg_load >= load and updated with current load*/

>> +             tg_load -= gcfs_rq->tg_load_avg_contrib;

>> +             tg_load += load;

>> +

>> +             /* scale gcfs_rq's load into tg's shares*/

>> +             load *= scale_load_down(gcfs_rq->tg->shares);

>> +             load /= tg_load;

>> +

>> +             /*

>> +              * we need to compute a correction term in the case that the

>> +              * task group is consuming <1 cpu so that we would contribute

>> +              * the same load as a task of equal weight.

>

> Wasn't 'consuming <1' related to 'NICE_0_LOAD' and not

> scale_load_down(gcfs_rq->tg->shares) before the rewrite of PELT (v4.2,

> __update_group_entity_contrib())?


Yes, before the rewrite, the condition (tg->runnable_avg < NICE_0_LOAD) was used.

I have used the following examples to choose the condition:

A task group with only one always running task TA with a weight equal
to tg->shares will have a tg's load (cfs_rq->tg->load_avg) equal to
TA's weight == scale_load_down(tg->shares): the load of the CPU on
which the task runs will be scale_load_down(task's weight) ==
scale_load_down(tg->shares) and the load of the other CPUs will be null.
In this case, all shares will be given to the cfs_rq CFS1 on which TA runs
and the load of the sched_entity SB that represents CFS1 at parent
level will be scale_load_down(SB's weight) =
scale_load_down(tg->shares).

If TA is not an always running task, its load will be less than
its weight and less than scale_load_down(tg->shares), and as a result
tg->load_avg will be less than scale_load_down(tg->shares).
Nevertheless, the weight of SB is still scale_load_down(tg->shares)
and its load should be the same as TA's. But the 1st part of the
calculation gives a load of scale_load_down(gcfs_rq->tg->shares)
because tg_load == gcfs_rq->tg_load_avg_contrib == load. So if tg_load
< scale_load_down(gcfs_rq->tg->shares), we have to correct the load
that we set to SB.
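
As a made-up numeric illustration: with tg->shares = 1024 and TA running
50% of the time on one CPU, load == 512 and tg_load == 513 (because of the
+1), so the 1st part gives 512 * 1024 / 513 == 1022, i.e. almost the full
shares; the correction then scales it back by 513 / 1024 to 511, roughly the
512 that a standalone task of the same weight would contribute.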

>

>> +             */

>> +             if (tg_load < scale_load_down(gcfs_rq->tg->shares)) {

>> +                     load *= tg_load;

>> +                     load /= scale_load_down(gcfs_rq->tg->shares);

>> +             }

>> +     }

>

> [...]
Vincent Guittot Sept. 15, 2016, 2:51 p.m. UTC | #5
On 15 September 2016 at 16:43, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Sep 12, 2016 at 09:47:49AM +0200, Vincent Guittot wrote:

>> +static inline void

>> +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)

>> +{

>> +     struct cfs_rq *gcfs_rq = group_cfs_rq(se);

>> +     long delta, load = gcfs_rq->avg.load_avg;

>> +

>> +     /* If the load of group cfs_rq is null, the load of the

>> +      * sched_entity will also be null so we can skip the formula

>> +      */

>> +     if (load) {

>> +             long tg_load;

>> +

>> +             /* Get tg's load and ensure tg_load > 0 */

>> +             tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;

>> +

>> +             /* Ensure tg_load >= load and updated with current load*/

>> +             tg_load -= gcfs_rq->tg_load_avg_contrib;

>> +             tg_load += load;

>> +

>> +             /* scale gcfs_rq's load into tg's shares*/

>> +             load *= scale_load_down(gcfs_rq->tg->shares);

>> +             load /= tg_load;

>> +

>> +             /*

>> +              * we need to compute a correction term in the case that the

>> +              * task group is consuming <1 cpu so that we would contribute

>> +              * the same load as a task of equal weight.

>> +             */

>> +             if (tg_load < scale_load_down(gcfs_rq->tg->shares)) {

>> +                     load *= tg_load;

>> +                     load /= scale_load_down(gcfs_rq->tg->shares);

>> +             }

>

> Note that you're reversing the exact scaling you just applied.


Yes, indeed.

>

> That is:

>                shares    tg_load

>         load * ------- * ------- == load

>                tg_load   shares

>

>> +     }

>

> So something like:

>

>         shares = scale_load_down(gcfs_rq->tg->shares);

>

>         if (tg_load >= shares) {

>                 load *= shares;

>                 load /= tg_load;

>         }

>

> should be the same as the above and saves a bunch of math, no?


Yes
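
So the whole scaling block would reduce to something like this (a sketch for
the next version, untested):

	if (load) {
		long tg_load, shares = scale_load_down(gcfs_rq->tg->shares);

		/* Get tg's load and ensure tg_load > 0 */
		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;

		/* Ensure tg_load >= load and updated with current load */
		tg_load -= gcfs_rq->tg_load_avg_contrib;
		tg_load += load;

		/*
		 * Scale gcfs_rq's load into tg's shares. When the group is
		 * consuming <1 CPU (tg_load < shares), keep the load as is
		 * so the entity contributes the same load as a task of
		 * equal weight.
		 */
		if (tg_load >= shares) {
			load *= shares;
			load /= tg_load;
		}
	}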
Dietmar Eggemann Sept. 15, 2016, 5:20 p.m. UTC | #6
On 15/09/16 15:31, Vincent Guittot wrote:
> On 15 September 2016 at 15:11, Dietmar Eggemann

> <dietmar.eggemann@arm.com> wrote:


[...]

>> Wasn't 'consuming <1' related to 'NICE_0_LOAD' and not

>> scale_load_down(gcfs_rq->tg->shares) before the rewrite of PELT (v4.2,

>> __update_group_entity_contrib())?

> 

> Yes before the rewrite, the condition (tg->runnable_avg < NICE_0_LOAD) was used.

> 

> I have used the following examples to choose the condition:

> 

> A task group with only one always running task TA with a weight equals

> to tg->shares, will have a tg's load (cfs_rq->tg->load_avg) equals to

> TA's weight == scale_load_down(tg->shares): The load of the CPU on

> which the task runs, will be scale_load_down(task's weight) ==

> scale_load_down(tg->shares) and the load of others CPUs will be null.

> In this case, all shares will be given to cfs_rq CFS1 on which TA runs

> and the load of the sched_entity SB that represents CFS1 at parent

> level will be scale_load_down(SB's weight) =

> scale_load_down(tg->shares).

> 

> If the TA is not an always running task, its load will be less than

> its weight and less than scale_load_down(tg->shares) and as a result

> tg->load_avg will be less than scale_load_down(tg->shares).

> Nevertheless, the weight of SB is still scale_load_down(tg->shares)

> and its load should be the same as TA. But the 1st part of the

> calculation gives a load of scale_load_down(gcfs_rq->tg->shares)

> because tg_load == gcfs_rq->tg_load_avg_contrib == load. So if tg_load

> < scale_load_down(gcfs_rq->tg->shares), we have to correct the load

> that we set to SB


Makes sense to me now. Thanks. Peter already pointed out that this math
can be made easier, so you will probably 'scale gcfs_rq's load into tg's
shares' only if 'tg_load >= shares'.

[...]
Dietmar Eggemann Sept. 15, 2016, 5:36 p.m. UTC | #7
On 15/09/16 16:14, Peter Zijlstra wrote:
> On Thu, Sep 15, 2016 at 02:11:49PM +0100, Dietmar Eggemann wrote:

>> On 12/09/16 08:47, Vincent Guittot wrote:

> 

>>> +/* Take into account change of load of a child task group */

>>> +static inline void

>>> +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)

>>> +{

>>> +	struct cfs_rq *gcfs_rq = group_cfs_rq(se);

>>> +	long delta, load = gcfs_rq->avg.load_avg;

>>> +

>>> +	/* If the load of group cfs_rq is null, the load of the

>>> +	 * sched_entity will also be null so we can skip the formula

>>> +	 */

>>> +	if (load) {

>>> +		long tg_load;

>>> +

>>> +		/* Get tg's load and ensure tg_load > 0 */

>>> +		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;

>>> +

>>> +		/* Ensure tg_load >= load and updated with current load*/

>>> +		tg_load -= gcfs_rq->tg_load_avg_contrib;

>>> +		tg_load += load;

>>> +

>>> +		/* scale gcfs_rq's load into tg's shares*/

>>> +		load *= scale_load_down(gcfs_rq->tg->shares);

>>> +		load /= tg_load;

>>> +

>>> +		/*

>>> +		 * we need to compute a correction term in the case that the

>>> +		 * task group is consuming <1 cpu so that we would contribute

>>> +		 * the same load as a task of equal weight.

>>

>> Wasn't 'consuming <1' related to 'NICE_0_LOAD' and not

>> scale_load_down(gcfs_rq->tg->shares) before the rewrite of PELT (v4.2,

>> __update_group_entity_contrib())?

> 

> 

> So the approximation was: min(1, runnable_avg) * shares;

> 

> And it just so happened that we tracked runnable_avg in 10 bit fixed

> point, which then happened to be NICE_0_LOAD.

> 

> But here we have load_avg, which already includes a '* shares' factor.

> So that then becomes min(shares, load_avg).


Makes sense, understand it now.
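
In other words (as I read it, just for illustration):

	before the rewrite:  contrib = shares * min(1, runnable_avg)
	                     with runnable_avg in 10-bit fixed point, so
	                     "1" == NICE_0_LOAD
	after the rewrite:   load_avg already carries the '* shares' factor,
	                     so the equivalent clamp is min(shares, load_avg)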

> We did however lose a lot on why and how min(1, runnable_avg) is a

> sensible thing to do...


Do you refer to the big comment on top of this if condition in the old
code in __update_group_entity_contrib()? The last two subsections of it
I never understood ...

[...]

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0aa1d7d..e4015f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3017,6 +3017,132 @@  static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	}
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct cfs_rq *gcfs_rq =  group_cfs_rq(se);
+	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+	/* Nothing to update */
+	if (!delta)
+		return;
+
+	/* Set new sched_entity's utilization */
+	se->avg.util_avg = gcfs_rq->avg.util_avg;
+	se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+	/* Update parent cfs_rq utilization */
+	cfs_rq->avg.util_avg =  max_t(long, cfs_rq->avg.util_avg + delta, 0);
+	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+	long delta, load = gcfs_rq->avg.load_avg;
+
+	/* If the load of group cfs_rq is null, the load of the
+	 * sched_entity will also be null so we can skip the formula
+	 */
+	if (load) {
+		long tg_load;
+
+		/* Get tg's load and ensure tg_load > 0 */
+		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+		/* Ensure tg_load >= load and updated with current load*/
+		tg_load -= gcfs_rq->tg_load_avg_contrib;
+		tg_load += load;
+
+		/* scale gcfs_rq's load into tg's shares*/
+		load *= scale_load_down(gcfs_rq->tg->shares);
+		load /= tg_load;
+
+		/*
+		 * we need to compute a correction term in the case that the
+		 * task group is consuming <1 cpu so that we would contribute
+		 * the same load as a task of equal weight.
+		*/
+		if (tg_load < scale_load_down(gcfs_rq->tg->shares)) {
+			load *= tg_load;
+			load /= scale_load_down(gcfs_rq->tg->shares);
+		}
+	}
+
+	delta = load - se->avg.load_avg;
+
+	/* Nothing to update */
+	if (!delta)
+		return;
+
+	/* Set new sched_entity's load */
+	se->avg.load_avg = load;
+	se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+	/* Update parent cfs_rq load */
+	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg + delta, 0);
+	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+	/* set cfs_rq's flag */
+	cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+	/* Get my cfs_rq */
+	struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+	/* Nothing to propagate */
+	if (!cfs_rq->propagate_avg)
+		return 0;
+
+	/* Clear my cfs_rq's flag */
+	cfs_rq->propagate_avg = 0;
+
+	return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq;
+
+	if (entity_is_task(se))
+		return 0;
+
+	if (!test_and_clear_tg_cfs_propagate(se))
+		return 0;
+
+	/* Get parent cfs_rq */
+	cfs_rq = cfs_rq_of(se);
+
+	/* Propagate to parent */
+	set_tg_cfs_propagate(cfs_rq);
+
+	/* Update utilization */
+	update_tg_cfs_util(cfs_rq, se);
+
+	/* Update load */
+	update_tg_cfs_load(cfs_rq, se);
+
+	return 1;
+}
+#else
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+	return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+#endif
+
 /*
  * Unsigned subtract and clamp on underflow.
  *
@@ -3093,6 +3219,7 @@  static inline void update_load_avg(struct sched_entity *se, int update_tg,
 	u64 now = cfs_rq_clock_task(cfs_rq);
 	struct rq *rq = rq_of(cfs_rq);
 	int cpu = cpu_of(rq);
+	int decayed;
 
 	/*
 	 * Track task load average for carrying it to new CPU after migrated, and
@@ -3103,7 +3230,11 @@  static inline void update_load_avg(struct sched_entity *se, int update_tg,
 			  se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
+	decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+
+	decayed |= propagate_entity_load_avg(se);
+
+	if (decayed && update_tg)
 		update_tg_load_avg(cfs_rq, 0);
 }
 
@@ -3122,6 +3253,7 @@  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	cfs_rq->avg.load_sum += se->avg.load_sum;
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
+	set_tg_cfs_propagate(cfs_rq);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -3141,6 +3273,7 @@  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+	set_tg_cfs_propagate(cfs_rq);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -8499,6 +8632,22 @@  static void detach_task_cfs_rq(struct task_struct *p)
 	update_load_avg(se, 0, 0);
 	detach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * Propagate the detach across the tg tree to make it visible to the
+	 * root
+	 */
+	se = se->parent;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+
+		update_load_avg(se, 1, 0);
+	}
+#endif
 }
 
 static void attach_entity_cfs_rq(struct sched_entity *se)
@@ -8517,6 +8666,22 @@  static void attach_entity_cfs_rq(struct sched_entity *se)
 	update_load_avg(se, 0, !sched_feat(ATTACH_AGE_LOAD));
 	attach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * Propagate the attach across the tg tree to make it visible to the
+	 * root
+	 */
+	se = se->parent;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+
+		update_load_avg(se, 1, 0);
+	}
+#endif
 }
 
 static void attach_task_cfs_rq(struct task_struct *p)
@@ -8578,6 +8743,9 @@  void init_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->propagate_avg = 0;
+#endif
 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
 	atomic_long_set(&cfs_rq->removed_util_avg, 0);
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 483616a..0517a9e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -397,6 +397,7 @@  struct cfs_rq {
 	unsigned long runnable_load_avg;
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	unsigned long tg_load_avg_contrib;
+	unsigned long propagate_avg;
 #endif
 	atomic_long_t removed_load_avg, removed_util_avg;
 #ifndef CONFIG_64BIT