
[1/2] sched/fair: optimization of update_blocked_averages()

Message ID 1549469662-13614-2-git-send-email-vincent.guittot@linaro.org
State Accepted
Commit 31bc6aeaab1d1de8959b67edbed5c7a4b3cdbe7c
Headers show
Series sched/fair: Fix O(nr_cgroups) in load balance path | expand

Commit Message

Vincent Guittot Feb. 6, 2019, 4:14 p.m. UTC
Removing a cfs_rq from rq->leaf_cfs_rq_list can break the parent/child
ordering of the list when it is added back. In order to remove an
empty and fully decayed cfs_rq, we must remove its children too, so they
will be added back in the right order next time.

With a normal decay of PELT, a parent will be empty and fully decayed
if all of its children are empty and fully decayed too. In such a case, we
just have to ensure that the whole branch will be added back when a new
task is enqueued. This has been the default behavior since:
  commit f6783319737f ("sched/fair: Fix insertion in rq->leaf_cfs_rq_list")

In case of throttling, the PELT signal of a throttled cfs_rq will not be
updated whereas the parent's will. This breaks the assumption made above
unless we remove the children of a cfs_rq that is throttled. They will
then be added back when the cfs_rq is unthrottled and a sched_entity is
enqueued.

As throttled cfs_rqs are now removed from the list, we can remove the
associated test in update_blocked_averages().

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

---
 kernel/sched/fair.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

-- 
2.7.4
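
To make the ordering argument concrete, here is a small self-contained toy
(plain userspace C, not kernel code; all structure and function names are made
up for illustration). A single bottom-up pass such as update_blocked_averages()
only sees fresh child contributions if every child appears in the list before
its parent; process the parent first and it runs one pass behind, which is the
kind of breakage the re-insertion ordering avoids.

#include <stdio.h>

/*
 * Toy model of the child-before-parent ordering that rq->leaf_cfs_rq_list
 * relies on: one forward pass over the list must visit every child before
 * its parent so the parent's view of child load is up to date in that pass.
 */
struct toy_cfs_rq {
	const char *name;
	struct toy_cfs_rq *parent;
	long own_load;		/* this group's own blocked load */
	long children_load;	/* sum propagated from children */
	long contrib;		/* what has been reported to the parent */
};

static void update_blocked(struct toy_cfs_rq **list, int n)
{
	for (int i = 0; i < n; i++) {
		struct toy_cfs_rq *cfs_rq = list[i];
		long total;

		cfs_rq->own_load /= 2;	/* stand-in for PELT decay */
		total = cfs_rq->own_load + cfs_rq->children_load;
		printf("%s sees total %ld\n", cfs_rq->name, total);

		if (cfs_rq->parent) {	/* propagate the delta upward */
			cfs_rq->parent->children_load += total - cfs_rq->contrib;
			cfs_rq->contrib = total;
		}
	}
}

int main(void)
{
	struct toy_cfs_rq root  = { "root",  NULL,  0,    0, 0 };
	struct toy_cfs_rq child = { "child", &root, 1024, 0, 0 };

	/* Correct order: child before parent -> root sees 512 this pass. */
	struct toy_cfs_rq *good[] = { &child, &root };
	/* Broken order: parent first -> root sees a stale 0 this pass. */
	struct toy_cfs_rq *bad[]  = { &root, &child };

	update_blocked(good, 2);

	child.own_load = 1024; child.children_load = 0; child.contrib = 0;
	root.own_load = 0; root.children_load = 0; root.contrib = 0;
	update_blocked(bad, 2);

	return 0;
}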

Comments

Peter Zijlstra Feb. 8, 2019, 3:40 p.m. UTC | #1
Argh head hurts!!

On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> @@ -4438,6 +4450,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
>  		/* adjust cfs_rq_clock_task() */
>  		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
>  					     cfs_rq->throttled_clock_task;
> +
> +		/* Add cfs_rq with already running entity in the list */
> +		if (cfs_rq->nr_running >= 1)
> +			list_add_leaf_cfs_rq(cfs_rq);
>  	}
>  
>  	return 0;

Do we want the below to go with the above change?

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 38d4669aa2ef..0bd80a891025 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4536,6 +4536,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	/* update hierarchical throttle state */
 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
 
+	assert_list_leaf_cfs_rq(rq);
+
 	if (!cfs_rq->load.weight)
 		return;
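
For context, the assert_list_leaf_cfs_rq() call Peter proposes checks that no
half-built branch is left dangling, i.e. that rq->tmp_alone_branch is back at
rq->leaf_cfs_rq_list once the path is done. At the time of this series the
helper looks roughly like this (paraphrased from kernel/sched/fair.c; treat
the exact body as approximate):

static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}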
Vincent Guittot Feb. 8, 2019, 3:44 p.m. UTC | #2
On Fri, 8 Feb 2019 at 16:40, Peter Zijlstra <peterz@infradead.org> wrote:
>
>
> Argh head hurts!!
>
> On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> > @@ -4438,6 +4450,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> >               /* adjust cfs_rq_clock_task() */
> >               cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
> >                                            cfs_rq->throttled_clock_task;
> > +
> > +             /* Add cfs_rq with already running entity in the list */
> > +             if (cfs_rq->nr_running >= 1)
> > +                     list_add_leaf_cfs_rq(cfs_rq);
> >       }
> >
> >       return 0;
>
> Do we want the below to go with the above change?
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 38d4669aa2ef..0bd80a891025 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4536,6 +4536,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>         /* update hierarchical throttle state */
>         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
>
> +       assert_list_leaf_cfs_rq(rq);
> +

Good point but this should go after the for_each_sched_entity() loop

>         if (!cfs_rq->load.weight)
>                 return;
>
Peter Zijlstra Feb. 8, 2019, 4:30 p.m. UTC | #3
On Fri, Feb 08, 2019 at 04:44:53PM +0100, Vincent Guittot wrote:
> On Fri, 8 Feb 2019 at 16:40, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> >
> > Argh head hurts!!
> >
> > On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> > > @@ -4438,6 +4450,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> > >               /* adjust cfs_rq_clock_task() */
> > >               cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
> > >                                            cfs_rq->throttled_clock_task;
> > > +
> > > +             /* Add cfs_rq with already running entity in the list */
> > > +             if (cfs_rq->nr_running >= 1)
> > > +                     list_add_leaf_cfs_rq(cfs_rq);
> > >       }
> > >
> > >       return 0;
> >
> > Do we want the below to go with the above change?
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 38d4669aa2ef..0bd80a891025 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4536,6 +4536,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> >         /* update hierarchical throttle state */
> >         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
> >
> > +       assert_list_leaf_cfs_rq(rq);
> > +
> 
> Good point but this should go after the for_each_sched_entity() loop

Indeed, but that loop does enqueue and can throttle again, should that
not also get that additional list_add_leaf_cfs_rq() loop we added to
enqueue_task_fair() to finish the add?
Peter Zijlstra Feb. 8, 2019, 4:30 p.m. UTC | #4
On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -346,6 +346,18 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
>  static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
>  {
>  	if (cfs_rq->on_list) {
> +		struct rq *rq = rq_of(cfs_rq);
> +
> +		/*
> +		 * With cfs_rq being unthrottled/throttled during an enqueue,
> +		 * it can happen the tmp_alone_branch points the a leaf that
> +		 * we finally want to del. In this case, tmp_alone_branch moves
> +		 * to the prev element but it will point to rq->leaf_cfs_rq_list
> +		 * at the end of the enqueue.
> +		 */
> +		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
> +			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
> +
>  		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
>  		cfs_rq->on_list = 0;
>  	}

So that is:

  enqueue_task_fair()
    enqueue_entity()
      list_add_leaf_cfs_rq()
      check_enqueue_throttle()
        throttle_cfs_rq()
          walk_tg_tree_from()
            tg_throttle_down()
              list_del_leaf_cfs_rq()

Which can try and remove a cfs_rq which we just added.

And because the list is a bottom-up order, and the deletion is a
downward operation, we must go back (prev) in the list.

So far so good I suppose.
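
The cursor backtracking described above can be sketched with a plain doubly
linked list (a userspace toy, not the kernel list API; names are illustrative).
The point is simply that when the element the cursor points at is deleted, the
cursor must first be moved to that element's predecessor:

#include <stdio.h>

/* Minimal circular doubly linked list standing in for rq->leaf_cfs_rq_list. */
struct node {
	const char *name;
	struct node *prev, *next;
};

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

static void list_add_tail_node(struct node *n, struct node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

static void list_del_node(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

int main(void)
{
	struct node head = { "head" }, a = { "A" }, b = { "B" };
	struct node *tmp_alone_branch;

	list_init(&head);
	list_add_tail_node(&a, &head);
	list_add_tail_node(&b, &head);	/* B was just added by the enqueue */

	/* The still-open branch points at the freshly added element. */
	tmp_alone_branch = &b;

	/*
	 * The throttle path now wants to delete B. Back the cursor up to
	 * B's predecessor first, as the patch does, so it never points at
	 * an element that is no longer on the list.
	 */
	if (tmp_alone_branch == &b)
		tmp_alone_branch = b.prev;
	list_del_node(&b);

	printf("cursor now points at %s\n", tmp_alone_branch->name);	/* "A" */
	return 0;
}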

> @@ -4449,8 +4465,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
>  	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>  
>  	/* group is entering throttled state, stop time */
> -	if (!cfs_rq->throttle_count)
> +	if (!cfs_rq->throttle_count) {
>  		cfs_rq->throttled_clock_task = rq_clock_task(rq);
> +		list_del_leaf_cfs_rq(cfs_rq);
> +	}
>  	cfs_rq->throttle_count++;
>  
>  	return 0;
Vincent Guittot Feb. 8, 2019, 4:47 p.m. UTC | #5
On Fri, 8 Feb 2019 at 17:30, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Feb 08, 2019 at 04:44:53PM +0100, Vincent Guittot wrote:
> > On Fri, 8 Feb 2019 at 16:40, Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > >
> > > Argh head hurts!!
> > >
> > > On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> > > > @@ -4438,6 +4450,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> > > >               /* adjust cfs_rq_clock_task() */
> > > >               cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
> > > >                                            cfs_rq->throttled_clock_task;
> > > > +
> > > > +             /* Add cfs_rq with already running entity in the list */
> > > > +             if (cfs_rq->nr_running >= 1)
> > > > +                     list_add_leaf_cfs_rq(cfs_rq);
> > > >       }
> > > >
> > > >       return 0;
> > >
> > > Do we want the below to go with the above change?
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 38d4669aa2ef..0bd80a891025 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4536,6 +4536,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> > >         /* update hierarchical throttle state */
> > >         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq)
> > >
> > > +       assert_list_leaf_cfs_rq(rq);
> > > +
> >
> > Good point but this should go after the for_each_sched_entity() loop
>
> Indeed, but that loop does enqueue and can throttle again, should that
> not also get that additional list_add_leaf_cfs_rq() loop we added to
> enqueue_task_fair() to finish the add?

Initially, I added this additional loop but finally removed it because
I didn't hit it during my tests. IIRC, we are protected by
throttle_count in such case, which is not the case when we enqueue a
task
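
For reference, the "additional loop" being discussed is the fix-up walk near
the end of enqueue_task_fair() introduced by the earlier series (around commit
f6783319737f). It is shaped roughly like the sketch below (a paraphrase, not
the exact upstream hunk): walk back up the hierarchy, re-link anything the
throttle path skipped, and stop at the first cfs_rq that is already properly
linked, since everything above it is too.

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		/* assumed: returns true once the branch up to the root is complete */
		if (list_add_leaf_cfs_rq(cfs_rq))
			break;
	}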
Vincent Guittot Feb. 8, 2019, 4:48 p.m. UTC | #6
On Fri, 8 Feb 2019 at 17:31, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -346,6 +346,18 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> >  static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> >  {
> >       if (cfs_rq->on_list) {
> > +             struct rq *rq = rq_of(cfs_rq);
> > +
> > +             /*
> > +              * With cfs_rq being unthrottled/throttled during an enqueue,
> > +              * it can happen the tmp_alone_branch points the a leaf that
> > +              * we finally want to del. In this case, tmp_alone_branch moves
> > +              * to the prev element but it will point to rq->leaf_cfs_rq_list
> > +              * at the end of the enqueue.
> > +              */
> > +             if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
> > +                     rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
> > +
> >               list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
> >               cfs_rq->on_list = 0;
> >       }
>
> So that is:
>
>   enqueue_task_fair()
>     enqueue_entity()
>       list_add_leaf_cfs_rq()
>       check_enqueue_throttle()
>         throttle_cfs_rq()
>           walk_tg_tree_from()
>             tg_throttle_down()
>               list_del_leaf_cfs_rq()
>
> Which can try and remove a cfs_rq which we just added.
>
> And because the list is a bottom-up order, and the deletion is a
> downward operation, we must go back (prev) in the list.

yes exactly

>
> So far so good I suppose.
>
> > @@ -4449,8 +4465,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
> >       struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> >
> >       /* group is entering throttled state, stop time */
> > -     if (!cfs_rq->throttle_count)
> > +     if (!cfs_rq->throttle_count) {
> >               cfs_rq->throttled_clock_task = rq_clock_task(rq);
> > +             list_del_leaf_cfs_rq(cfs_rq);
> > +     }
> >       cfs_rq->throttle_count++;
> >
> >       return 0;
>
>
Peter Zijlstra Feb. 8, 2019, 4:51 p.m. UTC | #7
On Fri, Feb 08, 2019 at 05:47:53PM +0100, Vincent Guittot wrote:
> On Fri, 8 Feb 2019 at 17:30, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Fri, Feb 08, 2019 at 04:44:53PM +0100, Vincent Guittot wrote:
> > > On Fri, 8 Feb 2019 at 16:40, Peter Zijlstra <peterz@infradead.org> wrote:

> > > Good point but this should go after the for_each_sched_entity() loop
> >
> > Indeed, but that loop does enqueue and can throttle again, should that
> > not also get that additional list_add_leaf_cfs_rq() loop we added to
> > enqueue_task_fair() to finish the add?
> 
> Initially, I added this additional loop but finally removed it because
> I didn't hit it during my tests. IIRC, we are protected by
> throttle_count in such case, which is not the case when we enqueue a
> task

Fair enough; and the to-be added assert will notify us if we got that
wrong :-)

I'll add the assert, no need to re-send.

Thanks!
Vincent Guittot Feb. 8, 2019, 4:51 p.m. UTC | #8
On Fri, 8 Feb 2019 at 17:51, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Feb 08, 2019 at 05:47:53PM +0100, Vincent Guittot wrote:
> > On Fri, 8 Feb 2019 at 17:30, Peter Zijlstra <peterz@infradead.org> wrote:
> > > On Fri, Feb 08, 2019 at 04:44:53PM +0100, Vincent Guittot wrote:
> > > > On Fri, 8 Feb 2019 at 16:40, Peter Zijlstra <peterz@infradead.org> wrote:
>
> > > > Good point but this should go after the for_each_sched_entity() loop
> > >
> > > Indeed, but that loop does enqueue and can throttle again, should that
> > > not also get that additional list_add_leaf_cfs_rq() loop we added to
> > > enqueue_task_fair() to finish the add?
> >
> > Initially, I added this additional loop but finally removed it because
> > I didn't hit it during my tests. IIRC, we are protected by
> > throttle_count in such case, which is not the case when we enqueue a
> > task
>
> Fair enough; and the to-be added assert will notify us if we got that
> wrong :-)
>
> I'll add the assert, no need to re-send.

Thanks

>
> Thanks!

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffd1ae7..badf8173 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -346,6 +346,18 @@  static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->on_list) {
+		struct rq *rq = rq_of(cfs_rq);
+
+		/*
+		 * With cfs_rq being unthrottled/throttled during an enqueue,
+		 * it can happen the tmp_alone_branch points the a leaf that
+		 * we finally want to del. In this case, tmp_alone_branch moves
+		 * to the prev element but it will point to rq->leaf_cfs_rq_list
+		 * at the end of the enqueue.
+		 */
+		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 		cfs_rq->on_list = 0;
 	}
@@ -4438,6 +4450,10 @@  static int tg_unthrottle_up(struct task_group *tg, void *data)
 		/* adjust cfs_rq_clock_task() */
 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
 					     cfs_rq->throttled_clock_task;
+
+		/* Add cfs_rq with already running entity in the list */
+		if (cfs_rq->nr_running >= 1)
+			list_add_leaf_cfs_rq(cfs_rq);
 	}
 
 	return 0;
@@ -4449,8 +4465,10 @@  static int tg_throttle_down(struct task_group *tg, void *data)
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 	/* group is entering throttled state, stop time */
-	if (!cfs_rq->throttle_count)
+	if (!cfs_rq->throttle_count) {
 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		list_del_leaf_cfs_rq(cfs_rq);
+	}
 	cfs_rq->throttle_count++;
 
 	return 0;
@@ -7699,10 +7717,6 @@  static void update_blocked_averages(int cpu)
 	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		struct sched_entity *se;
 
-		/* throttled entities do not contribute to load */
-		if (throttled_hierarchy(cfs_rq))
-			continue;
-
 		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
 			update_tg_load_avg(cfs_rq, 0);