
[06/11] sched/irq: add irq utilization tracking

Message ID: 1530200714-4504-7-git-send-email-vincent.guittot@linaro.org
State: Accepted
Commit: 91c27493e78df6849baaa21a9d66e26de8b875c0
Series: track CPU utilization

Commit Message

Vincent Guittot June 28, 2018, 3:45 p.m. UTC
Interrupt and steal time are the only remaining activities tracked by
rt_avg. Like for the sched classes, we can use PELT to track their average
utilization of the CPU. But unlike the sched classes, we don't track when
entering/leaving interrupt; instead, we take into account the time spent
in interrupt context when we update the rq's clock (rq_clock_task).
This also means that we have to decay the normal context time and account
for the interrupt time during the update.
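
For illustration (hypothetical numbers): if 4ms of wall time have elapsed
since the last update, of which 1ms was spent in interrupt context, avg_irq
is first decayed over the leading 3ms as "not running", and the trailing
1ms is then accrued as "running", i.e. we pessimistically assume that the
interrupt happened just before the update.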

It is also important to note that, because
  rq_clock == rq_clock_task + interrupt time
and rq_clock_task is used by a sched class to compute its utilization, the
util_avg of a sched class only reflects the utilization of the time spent
in normal context and not of the whole CPU time. The utilization of
interrupt gives a more accurate level of utilization of the CPU.
The CPU utilization is:
  avg_irq + (1 - avg_irq / max capacity) * \Sum avg_rq

Most of the time, avg_irq is small and negligible, so using the
approximation CPU utilization = \Sum avg_rq was enough.
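
A minimal C sketch of that formula (illustration only, not part of the
patch; the helper name is made up and SCHED_CAPACITY_SCALE is assumed to
be the max capacity):

  static unsigned long total_cpu_util(unsigned long sum_avg_rq,
                                      unsigned long avg_irq)
  {
          unsigned long max = SCHED_CAPACITY_SCALE;

          /* avg_irq + (1 - avg_irq / max) * \Sum avg_rq */
          return avg_irq + ((max - avg_irq) * sum_avg_rq) / max;
  }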

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

---
 kernel/sched/core.c  |  4 +++-
 kernel/sched/fair.c  | 13 ++++++++++---
 kernel/sched/pelt.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 kernel/sched/pelt.h  | 16 ++++++++++++++++
 kernel/sched/sched.h |  3 +++
 5 files changed, 72 insertions(+), 4 deletions(-)

-- 
2.7.4

Comments

Wanpeng Li July 26, 2018, 3:09 a.m. UTC | #1
Hi Vincent,
On Fri, 29 Jun 2018 at 03:07, Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>
> [...]
>
>  #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
>         if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
> -               sched_rt_avg_update(rq, irq_delta + steal);
> +               update_irq_load_avg(rq, irq_delta + steal);

I think we should not add steal time into irq load tracking. Steal
time is always 0 on a native kernel, so it doesn't matter there, but
what will happen when a guest disables IRQ_TIME_ACCOUNTING and enables
PARAVIRT_TIME_ACCOUNTING? Steal time is not the real irq util_avg. In
addition, we haven't exposed power management for performance, which
means that e.g. the schedutil governor cannot cooperate with the
passive-mode intel_pstate driver to tune the OPP. Decaying the old
steal time avg and adding the new one just wastes CPU cycles.

Regards,
Wanpeng Li

Vincent Guittot July 30, 2018, 4:43 p.m. UTC | #2
Hi Wanpeng,

On Thu, 26 Jul 2018 at 05:09, Wanpeng Li <kernellwp@gmail.com> wrote:
>
> [...]
>
> >  #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
> >         if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
> > -               sched_rt_avg_update(rq, irq_delta + steal);
> > +               update_irq_load_avg(rq, irq_delta + steal);
>
> I think we should not add steal time into irq load tracking. Steal
> time is always 0 on a native kernel, so it doesn't matter there, but
> what will happen when a guest disables IRQ_TIME_ACCOUNTING and enables
> PARAVIRT_TIME_ACCOUNTING? Steal time is not the real irq util_avg. In
> addition, we haven't exposed power management for performance, which
> means that e.g. the schedutil governor cannot cooperate with the
> passive-mode intel_pstate driver to tune the OPP. Decaying the old
> steal time avg and adding the new one just wastes CPU cycles.

In fact, I have kept the same behavior as with rt_avg, which was
already adding steal time when computing scale_rt_capacity, which is
used to reflect the remaining capacity for FAIR tasks and is used in
load balance. I'm not sure that it's worth using different variables
for irq and steal.
That being said, I see a possible optimization in schedutil when
PARAVIRT_TIME_ACCOUNTING is enabled and IRQ_TIME_ACCOUNTING is disabled.
With this kind of config, scale_irq_capacity can be a nop for
schedutil but scales the utilization for scale_rt_capacity.

Regards,
Vincent
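
A minimal sketch of that possible optimization (illustration only, not
part of this patch; the helper name is made up and the scaling mirrors
the formula from the commit message):

  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
  /* avg_irq contains real irq time: let schedutil see it */
  static inline unsigned long
  schedutil_scale_irq(unsigned long util, unsigned long irq, unsigned long max)
  {
          util *= (max - irq);
          util /= max;
          return util + irq;
  }
  #else
  /* avg_irq is steal time only: of no use for OPP selection, skip it */
  static inline unsigned long
  schedutil_scale_irq(unsigned long util, unsigned long irq, unsigned long max)
  {
          return util;
  }
  #endif

scale_rt_capacity() would keep scaling by avg_irq in both cases, so steal
time would still reduce the capacity seen by the load balancer.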

Wanpeng Li July 31, 2018, 3:32 a.m. UTC | #3
On Tue, 31 Jul 2018 at 00:43, Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>
> [...]
>
> In fact, I have kept the same behavior as with rt_avg, which was
> already adding steal time when computing scale_rt_capacity, which is
> used to reflect the remaining capacity for FAIR tasks and is used in
> load balance. I'm not sure that it's worth using different variables
> for irq and steal.
> That being said, I see a possible optimization in schedutil when
> PARAVIRT_TIME_ACCOUNTING is enabled and IRQ_TIME_ACCOUNTING is disabled.
> With this kind of config, scale_irq_capacity can be a nop for
> schedutil but scales the utilization for scale_rt_capacity.

Yeah, this is what was in my mind before; you can make a patch for that. :)

Regards,
Wanpeng Li
Vincent Guittot July 31, 2018, 8:21 a.m. UTC | #4
On Tue, 31 Jul 2018 at 05:32, Wanpeng Li <kernellwp@gmail.com> wrote:

> [...]
>
> > That being said, I see a possible optimization in schedutil when
> > PARAVIRT_TIME_ACCOUNTING is enabled and IRQ_TIME_ACCOUNTING is disabled.
> > With this kind of config, scale_irq_capacity can be a nop for
> > schedutil but scales the utilization for scale_rt_capacity.
>
> Yeah, this is what was in my mind before; you can make a patch for that. :)

OK, I'm going to prepare a patch.

Thanks


Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78d8fac..e5263a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -18,6 +18,8 @@ 
 #include "../workqueue_internal.h"
 #include "../smpboot.h"
 
+#include "pelt.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
@@ -186,7 +188,7 @@  static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
-		sched_rt_avg_update(rq, irq_delta + steal);
+		update_irq_load_avg(rq, irq_delta + steal);
 #endif
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffce4b2..d2758e3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7289,7 +7289,7 @@  static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
 	return false;
 }
 
-static inline bool others_rqs_have_blocked(struct rq *rq)
+static inline bool others_have_blocked(struct rq *rq)
 {
 	if (READ_ONCE(rq->avg_rt.util_avg))
 		return true;
@@ -7297,6 +7297,11 @@  static inline bool others_rqs_have_blocked(struct rq *rq)
 	if (READ_ONCE(rq->avg_dl.util_avg))
 		return true;
 
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	if (READ_ONCE(rq->avg_irq.util_avg))
+		return true;
+#endif
+
 	return false;
 }
 
@@ -7361,8 +7366,9 @@  static void update_blocked_averages(int cpu)
 	}
 	update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
 	update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+	update_irq_load_avg(rq, 0);
 	/* Don't need periodic decay once load/util_avg are null */
-	if (others_rqs_have_blocked(rq))
+	if (others_have_blocked(rq))
 		done = false;
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -7431,9 +7437,10 @@  static inline void update_blocked_averages(int cpu)
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 	update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
 	update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+	update_irq_load_avg(rq, 0);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
-	if (!cfs_rq_has_blocked(cfs_rq) && !others_rqs_have_blocked(rq))
+	if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
 		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 8b78b63..ead6d8b 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -357,3 +357,43 @@  int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
 
 	return 0;
 }
+
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+/*
+ * irq:
+ *
+ *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
+ *   util_sum = cpu_scale * load_sum
+ *   runnable_load_sum = load_sum
+ *
+ */
+
+int update_irq_load_avg(struct rq *rq, u64 running)
+{
+	int ret = 0;
+	/*
+	 * We know the time that has been used by interrupt since last update
+	 * but we don't know when. Let's be pessimistic and assume that the
+	 * interrupt happened just before the update. This is not so far from
+	 * reality because the interrupt will most probably wake up a task and
+	 * trigger an update of the rq clock, during which the metric is updated.
+	 * We start to decay with normal context time and then we add the
+	 * interrupt context time.
+	 * We can safely remove running from rq->clock because
+	 * rq->clock += delta with delta >= running
+	 */
+	ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
+				0,
+				0,
+				0);
+	ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
+				1,
+				1,
+				1);
+
+	if (ret)
+		___update_load_avg(&rq->avg_irq, 1, 1);
+
+	return ret;
+}
+#endif
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 0e4f912..d2894db 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -6,6 +6,16 @@  int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
 int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
 int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
 
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+int update_irq_load_avg(struct rq *rq, u64 running);
+#else
+static inline int
+update_irq_load_avg(struct rq *rq, u64 running)
+{
+	return 0;
+}
+#endif
+
 /*
  * When a task is dequeued, its estimated utilization should not be updated if
  * its util_avg has not been updated at least once.
@@ -51,6 +61,12 @@  update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
 {
 	return 0;
 }
+
+static inline int
+update_irq_load_avg(struct rq *rq, u64 running)
+{
+	return 0;
+}
 #endif
 
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef5d6aa..377be2b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -850,6 +850,9 @@  struct rq {
 	u64			age_stamp;
 	struct sched_avg	avg_rt;
 	struct sched_avg	avg_dl;
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	struct sched_avg	avg_irq;
+#endif
 	u64			idle_stamp;
 	u64			avg_idle;