
[RFC] sched/fair: let cpu's cfs_rq to reflect task migration

Message ID 57055B41.6000906@arm.com
State New

Commit Message

Dietmar Eggemann April 6, 2016, 6:53 p.m. UTC
On 06/04/16 09:37, Morten Rasmussen wrote:
> On Tue, Apr 05, 2016 at 06:00:40PM +0100, Dietmar Eggemann wrote:
>> @@ -2893,8 +2906,12 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>>         se->avg.last_update_time = cfs_rq->avg.last_update_time;
>>         cfs_rq->avg.load_avg += se->avg.load_avg;
>>         cfs_rq->avg.load_sum += se->avg.load_sum;
>> -       cfs_rq->avg.util_avg += se->avg.util_avg;
>> -       cfs_rq->avg.util_sum += se->avg.util_sum;
>> +
>> +       if (!entity_is_task(se))
>> +               return;
>> +
>> +       rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;
>> +       rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;
>
> To me it seems that you cannot be sure that the rq_of(cfs_rq)->cfs.avg
> time stamp is aligned with se->avg time stamp, which is necessary before
> you can add/subtract two geometric series without introducing an error.
>
> attach_entity_load_avg() is called (through a couple of other functions)
> from the for_each_sched_entity() loop in enqueue_task_fair() which works
> its way towards the root cfs_rq, i.e. rq_of(cfs_rq)->cfs. So in the loop
> iteration where you attach the task sched_entity, we haven't yet visited
> and updated rq_of(cfs_rq)->cfs.avg.
>
> If you just add the task contribution and discover later that there is a
> time delta when you update rq_of(cfs_rq)->cfs.avg you end up decaying
> the task contribution which was already up-to-date and its util
> contribution to rq_of(cfs_rq)->cfs.avg ends up being smaller than it
> should be.
>
> Am I missing something?


No, that's definitely wrong in the current patch. I could defer the attach into
the update_cfs_rq_load_avg() call for the root cfs_rq if
'&rq_of(cfs_rq)->cfs != cfs_rq' in attach_entity_load_avg():

Something like this (only lightly tested!):

But attach_entity_load_avg() is not only called from enqueue_entity_load_avg() for migrated
tasks but also from attach_task_cfs_rq(), which is called from switched_to_fair() and
task_move_group_fair(), where we can't assume that a call to update_cfs_rq_load_avg()
follows the enqueue_entity_load_avg() like it does in

enqueue_task_fair():

    for_each_sched_entity(se)
        enqueue_entity()
            enqueue_entity_load_avg()
                    update_cfs_rq_load_avg(now, cfs_rq)
                    if (migrated) attach_entity_load_avg()

    for_each_sched_entity(se)
        update_load_avg()
            update_cfs_rq_load_avg(now, cfs_rq)

Not sure if we can just update the root cfs_rq to se->avg.last_update_time before we add
se->avg.util_[avg/sum] to rq_of(cfs_rq)->cfs.avg.util_[avg/sum] in attach_entity_load_avg()?

cfs_rq throttling has to be considered as well ...
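
To make the time-stamp problem described above concrete, here is a toy user-space sketch
(not part of the patch and not kernel code; it only assumes the PELT decay rule of
y^32 = 0.5 and made-up utilization numbers). Attaching the fresh task utilization to a
stale root average and decaying everything afterwards underestimates the result compared
with decaying the stale part first:

/* Toy model of the attach ordering problem; build with: gcc pelt_toy.c -lm */
#include <stdio.h>
#include <math.h>

/* y^periods with y = 0.5^(1/32), i.e. the PELT half-life of 32 periods */
static double decay(double val, unsigned int periods)
{
	return val * pow(0.5, (double)periods / 32.0);
}

int main(void)
{
	double root_util = 300.0;  /* stale rq_of(cfs_rq)->cfs.avg.util_avg (assumed) */
	double task_util = 200.0;  /* freshly updated se->avg.util_avg (assumed)      */
	unsigned int delta = 16;   /* periods the root signal lags behind             */

	/* Attach first, decay later: the task part gets decayed although
	 * it was already up to date. */
	double attach_then_decay = decay(root_util + task_util, delta);

	/* Sync first (decay only the stale root part), then attach. */
	double decay_then_attach = decay(root_util, delta) + task_util;

	printf("attach first: %.1f\n", attach_then_decay); /* ~353.6 */
	printf("sync first  : %.1f\n", decay_then_attach); /* ~412.1 */
	return 0;
}

With these numbers the attach-first ordering loses roughly 59 units, which is exactly the
decayed share of the task contribution that should have stayed intact.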

Comments

Vincent Guittot April 7, 2016, 1:04 p.m. UTC | #1
Hi Dietmar,

On 6 April 2016 at 20:53, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> On 06/04/16 09:37, Morten Rasmussen wrote:
>> On Tue, Apr 05, 2016 at 06:00:40PM +0100, Dietmar Eggemann wrote:
>>> @@ -2893,8 +2906,12 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>>>         se->avg.last_update_time = cfs_rq->avg.last_update_time;
>>>         cfs_rq->avg.load_avg += se->avg.load_avg;
>>>         cfs_rq->avg.load_sum += se->avg.load_sum;
>>> -       cfs_rq->avg.util_avg += se->avg.util_avg;
>>> -       cfs_rq->avg.util_sum += se->avg.util_sum;
>>> +
>>> +       if (!entity_is_task(se))
>>> +               return;
>>> +
>>> +       rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;
>>> +       rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;
>>
>> To me it seems that you cannot be sure that the rq_of(cfs_rq)->cfs.avg
>> time stamp is aligned with se->avg time stamp, which is necessary before
>> you can add/subtract two geometric series without introducing an error.
>>
>> attach_entity_load_avg() is called (through a couple of other functions)
>> from the for_each_sched_entity() loop in enqueue_task_fair() which works
>> its way towards the root cfs_rq, i.e. rq_of(cfs_rq)->cfs. So in the loop
>> iteration where you attach the task sched_entity, we haven't yet visited
>> and updated rq_of(cfs_rq)->cfs.avg.
>>
>> If you just add the task contribution and discover later that there is a
>> time delta when you update rq_of(cfs_rq)->cfs.avg you end up decaying
>> the task contribution which was already up-to-date and its util
>> contribution to rq_of(cfs_rq)->cfs.avg ends up being smaller than it
>> should be.
>>
>> Am I missing something?
>
> No that's definitely wrong in the current patch. I could defer the attach into
> the update_cfs_rq_load_avg() call for the root cfs_rq if
> '&rq_of(cfs_rq)->cfs != cfs_rq' in attach_entity_load_avg():
>
> Something like this (only lightly tested!):
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 51d675715776..d31d9cd453a1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2856,6 +2856,16 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>         decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
>                 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
>
> +       if (cfs_rq->added_util_avg) {
> +               sa->util_avg += cfs_rq->added_util_avg;
> +               cfs_rq->added_util_avg = 0;
> +       }
> +
> +       if (cfs_rq->added_util_sum) {
> +               sa->util_sum += cfs_rq->added_util_sum;
> +               cfs_rq->added_util_sum = 0;
> +       }
> +
>  #ifndef CONFIG_64BIT
>         smp_wmb();
>         cfs_rq->load_last_update_time_copy = sa->last_update_time;
> @@ -2910,8 +2920,13 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>         if (!entity_is_task(se))
>                 return;
>
> -       rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;
> -       rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;
> +       if (&rq_of(cfs_rq)->cfs == cfs_rq) {
> +               rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;
> +               rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;
> +       } else {
> +               rq_of(cfs_rq)->cfs.added_util_avg = se->avg.util_avg;
> +               rq_of(cfs_rq)->cfs.added_util_sum = se->avg.util_sum;
> +       }
>  }


Don't you also need a similar thing for the detach?

>
>  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> @@ -4344,8 +4359,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>                 cfs_rq = cfs_rq_of(se);
>                 cfs_rq->h_nr_running++;
>
> -               if (cfs_rq_throttled(cfs_rq))
> +               if (cfs_rq_throttled(cfs_rq)) {
> +                       rq_of(cfs_rq)->cfs.added_util_avg = 0;
> +                       rq_of(cfs_rq)->cfs.added_util_sum = 0;
>                         break;
> +               }
>
>                 update_load_avg(se, 1);
>                 update_cfs_shares(cfs_rq);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index b2ff5a2bd6df..f0ea3a7eaf07 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -391,6 +391,8 @@ struct cfs_rq {
>         unsigned long tg_load_avg_contrib;
>  #endif
>         atomic_long_t removed_load_avg, removed_util_avg;
> +       u32 added_util_sum;
> +       unsigned long added_util_avg;
>  #ifndef CONFIG_64BIT
>         u64 load_last_update_time_copy;
>
> But attach_entity_load_avg() is not only called in enqueue_entity_load_avg() for migrated
> tasks but also in attach_task_cfs_rq() which is called from switched_to_fair() and
> task_move_group_fair() where we can't assume that after the enqueue_entity_load_avg() a
> call to update_cfs_rq_load_avg() follows like in
>
> enqueue_task_fair():
>
>     for_each_sched_entity(se)
>         enqueue_entity()
>             enqueue_entity_load_avg()
>                     update_cfs_rq_load_avg(now, cfs_rq)
>                     if (migrated) attach_entity_load_avg()
>
>     for_each_sched_entity(se)
>         update_load_avg()
>             update_cfs_rq_load_avg(now, cfs_rq)
>
>
> Not sure if we can just update the root cfs_rq to se->avg.last_update_time before we add
> se->avg.util_[avg/sum] to rq_of(cfs_rq)->cfs.avg.util_[avg/sum] in attach_entity_load_avg()?
>
> cfs_rq throttling has to be considered as well ...


IIUC this new proposal, the utilization of a task will be accounted in
the utilization of the root cfs_rq thanks to
tsk->se->cfs_rq->tg->se[cpu]->... down to the root cfs_rq. Then, you
directly add the utilization of the newly enqueued task to the root
cfs_rq.
Dietmar Eggemann April 7, 2016, 8:30 p.m. UTC | #2
Hi Vincent,

On 04/07/2016 02:04 PM, Vincent Guittot wrote:
> Hi Dietmar,

>

> On 6 April 2016 at 20:53, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:

>> On 06/04/16 09:37, Morten Rasmussen wrote:

>>> On Tue, Apr 05, 2016 at 06:00:40PM +0100, Dietmar Eggemann wrote:


[...]

>> @@ -2910,8 +2920,13 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s

>>          if (!entity_is_task(se))

>>                  return;

>>

>> -       rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;

>> -       rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;

>> +       if (&rq_of(cfs_rq)->cfs == cfs_rq) {

>> +               rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;

>> +               rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;

>> +       } else {

>> +               rq_of(cfs_rq)->cfs.added_util_avg = se->avg.util_avg;

>> +               rq_of(cfs_rq)->cfs.added_util_sum = se->avg.util_sum;

>> +       }

>>   }

>

> Don't you also need similar thing for the detach ?


Maybe? I ran workloads in tg's and checked the last_update_time of cfs_rq
and &rq_of(cfs_rq)->cfs, and they were always in sync. That obviously
only covers the call stack 'task_move_group_fair() -> detach_task_cfs_rq() ->
detach_entity_load_avg()' and not the one starting from
switched_from_fair().

[...]

>> But attach_entity_load_avg() is not only called in enqueue_entity_load_avg() for migrated
>> tasks but also in attach_task_cfs_rq() which is called from switched_to_fair() and
>> task_move_group_fair() where we can't assume that after the enqueue_entity_load_avg() a
>> call to update_cfs_rq_load_avg() follows like in
>>
>> enqueue_task_fair():
>>
>>      for_each_sched_entity(se)
>>          enqueue_entity()
>>              enqueue_entity_load_avg()
>>                      update_cfs_rq_load_avg(now, cfs_rq)
>>                      if (migrated) attach_entity_load_avg()
>>
>>      for_each_sched_entity(se)
>>          update_load_avg()
>>              update_cfs_rq_load_avg(now, cfs_rq)
>>
>>
>> Not sure if we can just update the root cfs_rq to se->avg.last_update_time before we add
>> se->avg.util_[avg/sum] to rq_of(cfs_rq)->cfs.avg.util_[avg/sum] in attach_entity_load_avg()?
>>
>> cfs_rq throttling has to be considered as well ...
>
>> IIUC this new proposal, the utilization of a task will be accounted on
>> the utilization of the root cfs_rq thanks  to
>> tsk->se->cfs_rq->tg->se[cpu]->... down to the root cfs_rq. Then, you
>> directly add the utilization of the newly enqueued task in the root
>> cfs_rq.


Not sure if you're referring to this, but in __update_load_avg() I
suppress the utilization update for se's w/ !entity_is_task(se) and for
cfs_rq's w/ &rq_of(cfs_rq)->cfs != cfs_rq, thus preventing the first case.
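
To put rough numbers on the concern discussed above (purely illustrative and assumed
values only; the real group-level signal is a PELT average tracked from the group se's
running time, not a straight copy of the task value): if the task utilization reached the
root cfs_rq both through the group-se chain and through the direct add in
attach_entity_load_avg(), it would be counted twice, which is what suppressing the
group-level utilization update avoids.

#include <stdio.h>

int main(void)
{
	double root_util = 300.0;  /* root cfs_rq util_avg before the enqueue (assumed) */
	double task_util = 200.0;  /* migrated task's util_avg (assumed)                */

	/* Hypothetical double accounting: the group hierarchy pushes the task
	 * utilization up to the root *and* the attach adds it directly. */
	double via_hierarchy  = task_util;  /* crude stand-in for the group path */
	double double_counted = root_util + via_hierarchy + task_util;

	/* With group-level utilization updates suppressed: direct add only. */
	double direct_only = root_util + task_util;

	printf("double counted: %.1f\n", double_counted); /* 700.0 */
	printf("direct only   : %.1f\n", direct_only);    /* 500.0 */
	return 0;
}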

Vincent Guittot April 8, 2016, 6:05 a.m. UTC | #3
On 7 April 2016 at 22:30, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> Hi  Vincent,
>
> On 04/07/2016 02:04 PM, Vincent Guittot wrote:
>>
>> Hi Dietmar,
>>
>> On 6 April 2016 at 20:53, Dietmar Eggemann <dietmar.eggemann@arm.com>
>> wrote:
>>>
>>> On 06/04/16 09:37, Morten Rasmussen wrote:
>>>>
>>>> On Tue, Apr 05, 2016 at 06:00:40PM +0100, Dietmar Eggemann wrote:
>
>
> [...]
>
>>> @@ -2910,8 +2920,13 @@ static void attach_entity_load_avg(struct cfs_rq
>>> *cfs_rq, struct sched_entity *s
>>>          if (!entity_is_task(se))
>>>                  return;
>>>
>>> -       rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;
>>> -       rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;
>>> +       if (&rq_of(cfs_rq)->cfs == cfs_rq) {
>>> +               rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;
>>> +               rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;
>>> +       } else {
>>> +               rq_of(cfs_rq)->cfs.added_util_avg = se->avg.util_avg;
>>> +               rq_of(cfs_rq)->cfs.added_util_sum = se->avg.util_sum;
>>> +       }
>>>   }
>>
>>
>> Don't you also need similar thing for the detach ?
>
>
> Maybe? I ran workloads in tg's and checked last_update_time of cfs_rq
> and &rq_of(cfs_rq)->cfs and they always were in sync. That's obviously
> only the call-stack 'task_move_group_fair() -> detach_task_cfs_rq() ->
> detach_entity_load_avg()' and not the one starting from
> switched_from_fair().
>
> [...]
>
>>> But attach_entity_load_avg() is not only called in
>>> enqueue_entity_load_avg() for migrated
>>> tasks but also in attach_task_cfs_rq() which is called from
>>> switched_to_fair() and
>>> task_move_group_fair() where we can't assume that after the
>>> enqueue_entity_load_avg() a
>>> call to update_cfs_rq_load_avg() follows like in
>>>
>>> enqueue_task_fair():
>>>
>>>      for_each_sched_entity(se)
>>>          enqueue_entity()
>>>              enqueue_entity_load_avg()
>>>                      update_cfs_rq_load_avg(now, cfs_rq)
>>>                      if (migrated) attach_entity_load_avg()
>>>
>>>      for_each_sched_entity(se)
>>>          update_load_avg()
>>>              update_cfs_rq_load_avg(now, cfs_rq)
>>>
>>>
>>> Not sure if we can just update the root cfs_rq to
>>> se->avg.last_update_time before we add
>>> se->avg.util_[avg/sum] to rq_of(cfs_rq)->cfs.avg.util_[avg/sum] in
>>> attach_entity_load_avg()?
>>>
>>> cfs_rq throttling has to be considered as well ...
>>
>>
>> IIUC this new proposal, the utilization of a task will be accounted on
>> the utilization of the root cfs_rq thanks  to
>> tsk->se->cfs_rq->tg->se[cpu]->... down to the root cfs_rq. Then, you
>> directly add the utilization of the newly enqueued task in the root
>> cfs_rq.
>
>
> Not sure if you're referring to this, but in __update_load_avg() I
> suppress the utilization update for se's w/ !entity_is_task(se) and
> cfs_rq's w/ &rq_of(cfs_rq)->cfs != cfs_rq  so preventing the first case.


OK, so you still need part of the previous patch; I thought you had
skipped it as it was wrong.


Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 51d675715776..d31d9cd453a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2856,6 +2856,16 @@  static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
        decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
                scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
 
+       if (cfs_rq->added_util_avg) {
+               sa->util_avg += cfs_rq->added_util_avg;
+               cfs_rq->added_util_avg = 0;
+       }
+
+       if (cfs_rq->added_util_sum) {
+               sa->util_sum += cfs_rq->added_util_sum;
+               cfs_rq->added_util_sum = 0;
+       }
+
 #ifndef CONFIG_64BIT
        smp_wmb();
        cfs_rq->load_last_update_time_copy = sa->last_update_time;
@@ -2910,8 +2920,13 @@  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
        if (!entity_is_task(se))
                return;
 
-       rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;
-       rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;
+       if (&rq_of(cfs_rq)->cfs == cfs_rq) {
+               rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;
+               rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;
+       } else {
+               rq_of(cfs_rq)->cfs.added_util_avg = se->avg.util_avg;
+               rq_of(cfs_rq)->cfs.added_util_sum = se->avg.util_sum;
+       }
 }
 
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4344,8 +4359,11 @@  enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running++;
 
-               if (cfs_rq_throttled(cfs_rq))
+               if (cfs_rq_throttled(cfs_rq)) {
+                       rq_of(cfs_rq)->cfs.added_util_avg = 0;
+                       rq_of(cfs_rq)->cfs.added_util_sum = 0;
                        break;
+               }

                update_load_avg(se, 1);
                update_cfs_shares(cfs_rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2ff5a2bd6df..f0ea3a7eaf07 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -391,6 +391,8 @@  struct cfs_rq {
        unsigned long tg_load_avg_contrib;
 #endif
        atomic_long_t removed_load_avg, removed_util_avg;
+       u32 added_util_sum;
+       unsigned long added_util_avg;
 #ifndef CONFIG_64BIT
        u64 load_last_update_time_copy;
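
Read as a user-space toy (same assumed numbers and decay helper as the sketch further up,
none of it kernel code), the added_util_avg/added_util_sum buffering in the hunks above
boils down to: leave the stale root average alone in attach_entity_load_avg(), stash the
task contribution, and fold it in only after update_cfs_rq_load_avg() has decayed the
root signal, which matches the sync-first result:

#include <stdio.h>
#include <math.h>

/* Same toy decay rule: y^periods with y^32 = 0.5 */
static double decay(double val, unsigned int periods)
{
	return val * pow(0.5, (double)periods / 32.0);
}

int main(void)
{
	double root_util = 300.0;   /* stale root cfs_rq util_avg (assumed) */
	double task_util = 200.0;   /* up-to-date task util_avg (assumed)   */
	double added_util = 0.0;    /* models cfs_rq->added_util_avg        */
	unsigned int delta = 16;    /* periods the root signal lags behind  */

	/* attach on a non-root cfs_rq: don't touch the stale average, only buffer */
	added_util = task_util;

	/* next update_cfs_rq_load_avg() on the root: decay first, then fold in */
	root_util = decay(root_util, delta);
	root_util += added_util;
	added_util = 0.0;

	printf("deferred attach: %.1f\n", root_util); /* ~412.1, same as sync first */
	return 0;
}

The zeroing of added_util_avg/added_util_sum in the throttled path of enqueue_task_fair()
then appears intended to discard a buffered contribution whose fold-in would never be
reached once the loop breaks out early.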