
[4/6,v5] sched: propagate load during synchronous attach/detach

Message ID 1476695653-12309-5-git-send-email-vincent.guittot@linaro.org
State New

Commit Message

Vincent Guittot Oct. 17, 2016, 9:14 a.m. UTC
When a task moves from/to a cfs_rq, we set a flag which is then used to
propagate the change at parent level (sched_entity and cfs_rq) during
next update. If the cfs_rq is throttled, the flag will stay pending until
the cfs_rw is unthrottled.

For propagating the utilization, we copy the utilization of group cfs_rq to
the sched_entity.

For propagating the load, we have to take into account the load of the
whole task group in order to evaluate the load of the sched_entity.
Similarly to what was done before the rewrite of PELT, we add a correction
factor for the case where the task group's load is greater than its shares,
so that it contributes the same load as a task of equal weight.
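
As a rough illustration of that correction factor, here is a standalone
sketch (simplified, hypothetical helper name; the real code is
update_tg_cfs_load() in the diff below):

/*
 * If the whole group's load exceeds its shares, scale the group
 * entity's load down so the group never contributes more than a
 * task of weight tg->shares would.
 */
static long scale_group_entity_load(long load, long tg_load, long shares)
{
	if (tg_load > shares) {
		load *= shares;
		load /= tg_load;	/* now load <= shares */
	}
	return load;
}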

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

---
 kernel/sched/fair.c  | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |   1 +
 2 files changed, 200 insertions(+), 1 deletion(-)

-- 
2.7.4

Comments

Dietmar Eggemann Oct. 21, 2016, 12:19 p.m. UTC | #1
On 10/17/2016 10:14 AM, Vincent Guittot wrote:
> When a task moves from/to a cfs_rq, we set a flag which is then used to
> propagate the change at parent level (sched_entity and cfs_rq) during
> next update. If the cfs_rq is throttled, the flag will stay pending until
> the cfs_rw is unthrottled.

minor nit:

s/cfs_rw/cfs_rq

[...]

> @@ -8704,6 +8867,22 @@ static void detach_task_cfs_rq(struct task_struct *p)
>  	update_load_avg(se, 0);
>  	detach_entity_load_avg(cfs_rq, se);
>  	update_tg_load_avg(cfs_rq, false);
> +
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +	/*
> +	 * Propagate the detach across the tg tree to make it visible to the
> +	 * root
> +	 */
> +	se = se->parent;
> +	for_each_sched_entity(se) {
> +		cfs_rq = cfs_rq_of(se);
> +
> +		if (cfs_rq_throttled(cfs_rq))
> +			break;
> +
> +		update_load_avg(se, UPDATE_TG);
> +	}
> +#endif
>  }
>
>  static void attach_entity_cfs_rq(struct sched_entity *se)
> @@ -8722,6 +8901,22 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
>  	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
>  	attach_entity_load_avg(cfs_rq, se);
>  	update_tg_load_avg(cfs_rq, false);
> +
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +	/*
> +	 * Propagate the attach across the tg tree to make it visible to the
> +	 * root
> +	 */
> +	se = se->parent;
> +	for_each_sched_entity(se) {
> +		cfs_rq = cfs_rq_of(se);
> +
> +		if (cfs_rq_throttled(cfs_rq))
> +			break;
> +
> +		update_load_avg(se, UPDATE_TG);
> +	}
> +#endif
>  }

The 'detach across' and 'attach across' in detach_task_cfs_rq() and 
attach_entity_cfs_rq() do the same so couldn't you not create a function 
propagate_foo() for it? This would avoid this ifdef as well.

You could further create in your '[PATCH 1/6 v5] sched: factorize attach 
entity':

detach_entity_cfs_rq() {
   update_load_avg()
   detach_entity_load_avg()
   update_tg_load_avg()
   propagate_load_avg()
}

and then we would have:

attach_task_cfs_rq() -> attach_entity_cfs_rq() -> propagate_foo()
detach_task_cfs_rq() -> detach_entity_cfs_rq() -> propagate_foo()

I guess you didn't because it would be only called one time but this 
symmetric approaches are easier to remember (at least for me).

[...]
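
(For readers skimming the archive: the helper Dietmar is asking for could
look roughly like the sketch below. It is just the loop from the patch
factored out; the name propagate_entity_cfs_rq() is hypothetical.)

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Propagate an attach/detach up the tg tree so it is visible to the root */
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq;

	se = se->parent;
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_load_avg(se, UPDATE_TG);
	}
}
#else
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
#endif

detach_task_cfs_rq() and attach_entity_cfs_rq() would then simply call
propagate_entity_cfs_rq(se), dropping the #ifdef at both call sites.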
Vincent Guittot Oct. 26, 2016, 7:05 a.m. UTC | #2
On 21 October 2016 at 14:19, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>
> On 10/17/2016 10:14 AM, Vincent Guittot wrote:
>>
>> When a task moves from/to a cfs_rq, we set a flag which is then used to
>> propagate the change at parent level (sched_entity and cfs_rq) during
>> next update. If the cfs_rq is throttled, the flag will stay pending until
>> the cfs_rw is unthrottled.
>
>
> minor nit:
>
> s/cfs_rw/cfs_rq

yes

>
> [...]
>
>
>> @@ -8704,6 +8867,22 @@ static void detach_task_cfs_rq(struct task_struct *p)
>>         update_load_avg(se, 0);
>>         detach_entity_load_avg(cfs_rq, se);
>>         update_tg_load_avg(cfs_rq, false);
>> +
>> +#ifdef CONFIG_FAIR_GROUP_SCHED
>> +       /*
>> +        * Propagate the detach across the tg tree to make it visible to the
>> +        * root
>> +        */
>> +       se = se->parent;
>> +       for_each_sched_entity(se) {
>> +               cfs_rq = cfs_rq_of(se);
>> +
>> +               if (cfs_rq_throttled(cfs_rq))
>> +                       break;
>> +
>> +               update_load_avg(se, UPDATE_TG);
>> +       }
>> +#endif
>>  }
>>
>>  static void attach_entity_cfs_rq(struct sched_entity *se)
>> @@ -8722,6 +8901,22 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
>>         update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
>>         attach_entity_load_avg(cfs_rq, se);
>>         update_tg_load_avg(cfs_rq, false);
>> +
>> +#ifdef CONFIG_FAIR_GROUP_SCHED
>> +       /*
>> +        * Propagate the attach across the tg tree to make it visible to the
>> +        * root
>> +        */
>> +       se = se->parent;
>> +       for_each_sched_entity(se) {
>> +               cfs_rq = cfs_rq_of(se);
>> +
>> +               if (cfs_rq_throttled(cfs_rq))
>> +                       break;
>> +
>> +               update_load_avg(se, UPDATE_TG);
>> +       }
>> +#endif
>>  }
>
>
> The 'detach across' and 'attach across' in detach_task_cfs_rq() and attach_entity_cfs_rq() do the same so couldn't you not create a function propagate_foo() for it? This would avoid this ifdef as well.
>
> You could further create in your '[PATCH 1/6 v5] sched: factorize attach entity':
>
> detach_entity_cfs_rq() {
>   update_load_avg()
>   detach_entity_load_avg()
>   update_tg_load_avg()
>   propagate_load_avg()
> }
>
> and then we would have:
>
> attach_task_cfs_rq() -> attach_entity_cfs_rq() -> propagate_foo()
> detach_task_cfs_rq() -> detach_entity_cfs_rq() -> propagate_foo()
>
> I guess you didn't because it would be only called one time but this symmetric approaches are easier to remember (at least for me).

Yes i haven't created attach_entity_cfs_rq because it would be used only once.
Regarding the creation of a propagate_foo function, i have just
followed a similar skeleton as what is done in
enqueue/dequeue_task_fair

I don't have strong opinion about creating this indirection for code
readability. Others, have you got a preference ?

>
> [...]
Peter Zijlstra Oct. 26, 2016, 10:54 a.m. UTC | #3
On Mon, Oct 17, 2016 at 11:14:11AM +0200, Vincent Guittot wrote:
>  /*
> + * Signed add and clamp on underflow.
> + *
> + * Explicitly do a load-store to ensure the intermediate value never hits
> + * memory. This allows lockless observations without ever seeing the negative
> + * values.
> + */
> +#define add_positive(_ptr, _val) do {				\
> +	typeof(_ptr) ptr = (_ptr);				\
> +	typeof(_val) res, val = (_val);				\
> +	typeof(*ptr) var = READ_ONCE(*ptr);			\
> +	res = var + val;					\
> +	if (res < 0)						\
> +		res = 0;					\

I think this is broken, and inconsistent with sub_positive().

The thing is, util_avg, on which you use this, is an unsigned type.
Checking for unsigned underflow can be done by comparing against either
one of the terms.

> +	WRITE_ONCE(*ptr, res);					\
> +} while (0)

> +	add_positive(&cfs_rq->avg.util_avg, delta);
Peter Zijlstra Oct. 26, 2016, 11:16 a.m. UTC | #4
On Wed, Oct 26, 2016 at 09:05:49AM +0200, Vincent Guittot wrote:
> >
> > The 'detach across' and 'attach across' in detach_task_cfs_rq() and
> > attach_entity_cfs_rq() do the same so couldn't you not create a
> > function propagate_foo() for it? This would avoid this ifdef as
> > well.
> >
> > You could further create in your '[PATCH 1/6 v5] sched: factorize attach entity':
> >
> > detach_entity_cfs_rq() {
> >   update_load_avg()
> >   detach_entity_load_avg()
> >   update_tg_load_avg()
> >   propagate_load_avg()
> > }
> >
> > and then we would have:
> >
> > attach_task_cfs_rq() -> attach_entity_cfs_rq() -> propagate_foo()
> > detach_task_cfs_rq() -> detach_entity_cfs_rq() -> propagate_foo()
> >
> > I guess you didn't because it would be only called one time but this
> > symmetric approaches are easier to remember (at least for me).
>
> Yes i haven't created attach_entity_cfs_rq because it would be used only once.
> Regarding the creation of a propagate_foo function, i have just
> followed a similar skeleton as what is done in
> enqueue/dequeue_task_fair
>
> I don't have strong opinion about creating this indirection for code
> readability. Others, have you got a preference ?

I think I agree with Dietmar. Duplicate code needs a helper function and
it would be nice to keep symmetry, even if there's only a single
call site.
Peter Zijlstra Oct. 26, 2016, 11:45 a.m. UTC | #5
On Mon, Oct 17, 2016 at 11:14:11AM +0200, Vincent Guittot wrote:
> +/* Take into account change of load of a child task group */
> +static inline void
> +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> +	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
> +	long delta, load = gcfs_rq->avg.load_avg;
> +
> +	/*
> +	 * If the load of group cfs_rq is null, the load of the
> +	 * sched_entity will also be null so we can skip the formula
> +	 */

Does it make sense to do:

	if (!load)
		goto no_load;

and avoid the indent?

> +	if (load) {
> +		long tg_load;
> +
> +		/* Get tg's load and ensure tg_load > 0 */
> +		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
> +
> +		/* Ensure tg_load >= load and updated with current load*/
> +		tg_load -= gcfs_rq->tg_load_avg_contrib;
> +		tg_load += load;
> +
> +		/*
> +		 * We need to compute a correction term in the case that the
> +		 * task group is consuming more cpu than a task of equal
> +		 * weight. A task with a weight equals to tg->shares will have
> +		 * a load less or equal to scale_load_down(tg->shares).
  +		 *
> +		 * Similarly, the sched_entities that represent the task group
> +		 * at parent level, can't have a load higher than
> +		 * scale_load_down(tg->shares). And the Sum of sched_entities'
> +		 * load must be <= scale_load_down(tg->shares).
> +		 */
> +		if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
> +			/* scale gcfs_rq's load into tg's shares*/
> +			load *= scale_load_down(gcfs_rq->tg->shares);
> +			load /= tg_load;
> +		}
> +	}
> +
no_load:
> +	delta = load - se->avg.load_avg;
> +
> +	/* Nothing to update */
> +	if (!delta)
> +		return;
> +
> +	/* Set new sched_entity's load */
> +	se->avg.load_avg = load;
> +	se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
> +
> +	/* Update parent cfs_rq load */
> +	add_positive(&cfs_rq->avg.load_avg, delta);
> +	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
> +
> +	/*
> +	 * If the sched_entity is already enqueued, we also have to update the
> +	 * runnable load avg.
> +	 */
> +	if (se->on_rq) {
> +		/* Update parent cfs_rq runnable_load_avg */
> +		add_positive(&cfs_rq->runnable_load_avg, delta);
> +		cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
> +	}
> +}
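
(Putting Peter's suggestion in place, the top of update_tg_cfs_load() would
read roughly as follows; an illustrative sketch based on the code as posted,
not a tested patch:)

static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
	long delta, tg_load, load = gcfs_rq->avg.load_avg;

	/*
	 * If the load of the group cfs_rq is null, the load of the
	 * sched_entity will also be null, so the correction can be
	 * skipped entirely.
	 */
	if (!load)
		goto no_load;

	/* Get tg's load and ensure tg_load > 0 */
	tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;

	/* Ensure tg_load >= load and updated with current load */
	tg_load -= gcfs_rq->tg_load_avg_contrib;
	tg_load += load;

	/* Correction term: see the comment in the posted patch */
	if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
		/* scale gcfs_rq's load into tg's shares */
		load *= scale_load_down(gcfs_rq->tg->shares);
		load /= tg_load;
	}

no_load:
	delta = load - se->avg.load_avg;

	/* Nothing to update */
	if (!delta)
		return;

	/* ... the rest is unchanged from the posted patch ... */
}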
Vincent Guittot Oct. 26, 2016, 12:31 p.m. UTC | #6
On 26 October 2016 at 12:54, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Oct 17, 2016 at 11:14:11AM +0200, Vincent Guittot wrote:
>>  /*
>> + * Signed add and clamp on underflow.
>> + *
>> + * Explicitly do a load-store to ensure the intermediate value never hits
>> + * memory. This allows lockless observations without ever seeing the negative
>> + * values.
>> + */
>> +#define add_positive(_ptr, _val) do {                                \
>> +     typeof(_ptr) ptr = (_ptr);                              \
>> +     typeof(_val) res, val = (_val);                         \
>> +     typeof(*ptr) var = READ_ONCE(*ptr);                     \
>> +     res = var + val;                                        \
>> +     if (res < 0)                                            \
>> +             res = 0;                                        \
>
> I think this is broken, and inconsistent with sub_positive().

I agree that the behavior is different from sub_positive which deals
with unsigned value, but i was not able to come with a short name that
highlight this signed/unsigned difference

>
> The thing is, util_avg, on which you use this, is an unsigned type.

The delta that is added to util_avg, is a signed value

> Checking for unsigned underflow can be done by comparing against either
> one of the terms.
>
>> +     WRITE_ONCE(*ptr, res);                                  \
>> +} while (0)
>
>> +     add_positive(&cfs_rq->avg.util_avg, delta);
Vincent Guittot Oct. 26, 2016, 12:31 p.m. UTC | #7
On 26 October 2016 at 13:16, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Oct 26, 2016 at 09:05:49AM +0200, Vincent Guittot wrote:
>> >
>> > The 'detach across' and 'attach across' in detach_task_cfs_rq() and
>> > attach_entity_cfs_rq() do the same so couldn't you not create a
>> > function propagate_foo() for it? This would avoid this ifdef as
>> > well.
>> >
>> > You could further create in your '[PATCH 1/6 v5] sched: factorize attach entity':
>> >
>> > detach_entity_cfs_rq() {
>> >   update_load_avg()
>> >   detach_entity_load_avg()
>> >   update_tg_load_avg()
>> >   propagate_load_avg()
>> > }
>> >
>> > and then we would have:
>> >
>> > attach_task_cfs_rq() -> attach_entity_cfs_rq() -> propagate_foo()
>> > detach_task_cfs_rq() -> detach_entity_cfs_rq() -> propagate_foo()
>> >
>> > I guess you didn't because it would be only called one time but this
>> > symmetric approaches are easier to remember (at least for me).
>>
>> Yes i haven't created attach_entity_cfs_rq because it would be used only once.
>> Regarding the creation of a propagate_foo function, i have just
>> followed a similar skeleton as what is done in
>> enqueue/dequeue_task_fair
>>
>> I don't have strong opinion about creating this indirection for code
>> readability. Others, have you got a preference ?
>
> I think I agree with Dietmar. Duplicate code needs a helper function and
> it would be nice to keep symmetry, even if there's only a single
> call site.

OK
Peter Zijlstra Oct. 26, 2016, 2:28 p.m. UTC | #8
On Wed, Oct 26, 2016 at 02:31:01PM +0200, Vincent Guittot wrote:
> On 26 October 2016 at 12:54, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Mon, Oct 17, 2016 at 11:14:11AM +0200, Vincent Guittot wrote:
> >>  /*
> >> + * Signed add and clamp on underflow.
> >> + *
> >> + * Explicitly do a load-store to ensure the intermediate value never hits
> >> + * memory. This allows lockless observations without ever seeing the negative
> >> + * values.
> >> + */
> >> +#define add_positive(_ptr, _val) do {                                \
> >> +     typeof(_ptr) ptr = (_ptr);                              \
> >> +     typeof(_val) res, val = (_val);                         \
> >> +     typeof(*ptr) var = READ_ONCE(*ptr);                     \
> >> +     res = var + val;                                        \
> >> +     if (res < 0)                                            \
> >> +             res = 0;                                        \
> >
> > I think this is broken, and inconsistent with sub_positive().
>
> I agree that the behavior is different from sub_positive which deals
> with unsigned value, but i was not able to come with a short name that
> highlight this signed/unsigned difference
>
> >
> > The thing is, util_avg, on which you use this, is an unsigned type.
>
> The delta that is added to util_avg, is a signed value

Doesn't matter, util_avg is unsigned, this means MSB set is a valid and
non-negative number, while the above will truncate it to 0.

So you really do need an alternative method of underflow. And yes, delta
being signed makes it slightly more complicated.

How about something like the below, that will, if val is negative and we
thus end up doing a subtraction (assumes 2s complement, which is fine,
we do all over anyway), check the result isn't larger than we started
out with.


#define add_positive(_ptr, _val) do {                           \
        typeof(_ptr) ptr = (_ptr);                              \
        typeof(_val) val = (_val);                              \
        typeof(*ptr) res, var = READ_ONCE(*ptr);                \
								\
	res = var + val;					\
								\
	if (val < 0 && res > var)				\
		res = 0;					\
								\
        WRITE_ONCE(*ptr, res);                                  \
} while (0)
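
(A small user-space illustration of the difference, for anyone following
along; GNU C for typeof, and plain loads/stores instead of
READ_ONCE/WRITE_ONCE, so this is not kernel code. With the posted macro the
intermediate 'res' takes the signed type of the delta, so a legitimately
large unsigned value gets clamped to 0, while the variant above keeps it
and still catches a real underflow:)

#include <limits.h>
#include <stdio.h>

/* as posted: res has the (signed) type of the delta */
#define add_positive_v1(_ptr, _val) do {			\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(_val) res, val = (_val);				\
	typeof(*ptr) var = *ptr;				\
	res = var + val;					\
	if (res < 0)						\
		res = 0;					\
	*ptr = res;						\
} while (0)

/* Peter's variant: res has the (unsigned) type of the target */
#define add_positive_v2(_ptr, _val) do {			\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(_val) val = (_val);				\
	typeof(*ptr) res, var = *ptr;				\
	res = var + val;					\
	if (val < 0 && res > var)				\
		res = 0;					\
	*ptr = res;						\
} while (0)

int main(void)
{
	unsigned long a = (unsigned long)LONG_MAX + 10;	/* MSB set: valid, huge */
	unsigned long b = a;

	add_positive_v1(&a, 5L);    /* signed res goes negative (gcc wraps) -> wrongly clamped to 0 */
	add_positive_v2(&b, 5L);    /* value is preserved */
	printf("v1: %lu  v2: %lu\n", a, b);

	b = 100;
	add_positive_v2(&b, -300L); /* genuine underflow wraps above var -> clamped to 0 */
	printf("v2 after underflow: %lu\n", b);

	return 0;
}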
Vincent Guittot Oct. 26, 2016, 6:40 p.m. UTC | #9
On 26 October 2016 at 16:28, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Oct 26, 2016 at 02:31:01PM +0200, Vincent Guittot wrote:
>> On 26 October 2016 at 12:54, Peter Zijlstra <peterz@infradead.org> wrote:
>> > On Mon, Oct 17, 2016 at 11:14:11AM +0200, Vincent Guittot wrote:
>> >>  /*
>> >> + * Signed add and clamp on underflow.
>> >> + *
>> >> + * Explicitly do a load-store to ensure the intermediate value never hits
>> >> + * memory. This allows lockless observations without ever seeing the negative
>> >> + * values.
>> >> + */
>> >> +#define add_positive(_ptr, _val) do {                                \
>> >> +     typeof(_ptr) ptr = (_ptr);                              \
>> >> +     typeof(_val) res, val = (_val);                         \
>> >> +     typeof(*ptr) var = READ_ONCE(*ptr);                     \
>> >> +     res = var + val;                                        \
>> >> +     if (res < 0)                                            \
>> >> +             res = 0;                                        \
>> >
>> > I think this is broken, and inconsistent with sub_positive().
>>
>> I agree that the behavior is different from sub_positive which deals
>> with unsigned value, but i was not able to come with a short name that
>> highlight this signed/unsigned difference
>>
>> >
>> > The thing is, util_avg, on which you use this, is an unsigned type.
>>
>> The delta that is added to util_avg, is a signed value
>
> Doesn't matter, util_avg is unsigned, this means MSB set is a valid and
> non-negative number, while the above will truncate it to 0.
>
> So you really do need an alternative method of underflow. And yes, delta
> being signed makes it slightly more complicated.
>
> How about something like the below, that will, if val is negative and we
> thus end up doing a subtraction (assumes 2s complement, which is fine,
> we do all over anyway), check the result isn't larger than we started
> out with.
>
>
> #define add_positive(_ptr, _val) do {                           \
>         typeof(_ptr) ptr = (_ptr);                              \
>         typeof(_val) val = (_val);                              \
>         typeof(*ptr) res, var = READ_ONCE(*ptr);                \
>                                                                 \
>         res = var + val;                                        \
>                                                                 \
>         if (val < 0 && res > var)                               \
>                 res = 0;                                        \
>                                                                 \
>         WRITE_ONCE(*ptr, res);                                  \
> } while (0)

Indeed, looks better like that

>
>

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 52cbc92..91fc949 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3031,6 +3031,162 @@  static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 }
 
 /*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(_val) res, val = (_val);				\
+	typeof(*ptr) var = READ_ONCE(*ptr);			\
+	res = var + val;					\
+	if (res < 0)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct cfs_rq *gcfs_rq =  group_cfs_rq(se);
+	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+	/* Nothing to update */
+	if (!delta)
+		return;
+
+	/* Set new sched_entity's utilization */
+	se->avg.util_avg = gcfs_rq->avg.util_avg;
+	se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+	/* Update parent cfs_rq utilization */
+	add_positive(&cfs_rq->avg.util_avg, delta);
+	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+	long delta, load = gcfs_rq->avg.load_avg;
+
+	/*
+	 * If the load of group cfs_rq is null, the load of the
+	 * sched_entity will also be null so we can skip the formula
+	 */
+	if (load) {
+		long tg_load;
+
+		/* Get tg's load and ensure tg_load > 0 */
+		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+		/* Ensure tg_load >= load and updated with current load*/
+		tg_load -= gcfs_rq->tg_load_avg_contrib;
+		tg_load += load;
+
+		/*
+		 * We need to compute a correction term in the case that the
+		 * task group is consuming more cpu than a task of equal
+		 * weight. A task with a weight equals to tg->shares will have
+		 * a load less or equal to scale_load_down(tg->shares).
+		 * Similarly, the sched_entities that represent the task group
+		 * at parent level, can't have a load higher than
+		 * scale_load_down(tg->shares). And the Sum of sched_entities'
+		 * load must be <= scale_load_down(tg->shares).
+		 */
+		if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+			/* scale gcfs_rq's load into tg's shares*/
+			load *= scale_load_down(gcfs_rq->tg->shares);
+			load /= tg_load;
+		}
+	}
+
+	delta = load - se->avg.load_avg;
+
+	/* Nothing to update */
+	if (!delta)
+		return;
+
+	/* Set new sched_entity's load */
+	se->avg.load_avg = load;
+	se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+	/* Update parent cfs_rq load */
+	add_positive(&cfs_rq->avg.load_avg, delta);
+	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+	/*
+	 * If the sched_entity is already enqueued, we also have to update the
+	 * runnable load avg.
+	 */
+	if (se->on_rq) {
+		/* Update parent cfs_rq runnable_load_avg */
+		add_positive(&cfs_rq->runnable_load_avg, delta);
+		cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+	}
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+	/* set cfs_rq's flag */
+	cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+	/* Get my cfs_rq */
+	struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+	/* Nothing to propagate */
+	if (!cfs_rq->propagate_avg)
+		return 0;
+
+	/* Clear my cfs_rq's flag */
+	cfs_rq->propagate_avg = 0;
+
+	return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq;
+
+	if (entity_is_task(se))
+		return 0;
+
+	if (!test_and_clear_tg_cfs_propagate(se))
+		return 0;
+
+	/* Get parent cfs_rq */
+	cfs_rq = cfs_rq_of(se);
+
+	/* Propagate to parent */
+	set_tg_cfs_propagate(cfs_rq);
+
+	/* Update utilization */
+	update_tg_cfs_util(cfs_rq, se);
+
+	/* Update load */
+	update_tg_cfs_load(cfs_rq, se);
+
+	return 1;
+}
+#else
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+	return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+#endif
+
+/*
  * Unsigned subtract and clamp on underflow.
  *
  * Explicitly do a load-store to ensure the intermediate value never hits
@@ -3111,6 +3267,7 @@  static inline void update_load_avg(struct sched_entity *se, int flags)
 	u64 now = cfs_rq_clock_task(cfs_rq);
 	struct rq *rq = rq_of(cfs_rq);
 	int cpu = cpu_of(rq);
+	int decayed;
 
 	/*
 	 * Track task load average for carrying it to new CPU after migrated, and
@@ -3121,7 +3278,11 @@  static inline void update_load_avg(struct sched_entity *se, int flags)
 			  se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG))
+	decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+
+	decayed |= propagate_entity_load_avg(se);
+
+	if (decayed && (flags & UPDATE_TG))
 		update_tg_load_avg(cfs_rq, 0);
 }
 
@@ -3140,6 +3301,7 @@  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	cfs_rq->avg.load_sum += se->avg.load_sum;
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
+	set_tg_cfs_propagate(cfs_rq);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -3159,6 +3321,7 @@  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+	set_tg_cfs_propagate(cfs_rq);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -8704,6 +8867,22 @@  static void detach_task_cfs_rq(struct task_struct *p)
 	update_load_avg(se, 0);
 	detach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * Propagate the detach across the tg tree to make it visible to the
+	 * root
+	 */
+	se = se->parent;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+
+		update_load_avg(se, UPDATE_TG);
+	}
+#endif
 }
 
 static void attach_entity_cfs_rq(struct sched_entity *se)
@@ -8722,6 +8901,22 @@  static void attach_entity_cfs_rq(struct sched_entity *se)
 	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
 	attach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * Propagate the attach across the tg tree to make it visible to the
+	 * root
+	 */
+	se = se->parent;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+
+		update_load_avg(se, UPDATE_TG);
+	}
+#endif
 }
 
 static void attach_task_cfs_rq(struct task_struct *p)
@@ -8783,6 +8978,9 @@  void init_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->propagate_avg = 0;
+#endif
 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
 	atomic_long_set(&cfs_rq->removed_util_avg, 0);
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f1185c3..47385e2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -404,6 +404,7 @@  struct cfs_rq {
 	unsigned long runnable_load_avg;
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	unsigned long tg_load_avg_contrib;
+	unsigned long propagate_avg;
 #endif
 	atomic_long_t removed_load_avg, removed_util_avg;
 #ifndef CONFIG_64BIT