[v6,04/11] cpufreq/schedutil: use rt utilization tracking

Message ID 1528459794-13066-5-git-send-email-vincent.guittot@linaro.org
State Superseded
Headers show
Series
  • track CPU utilization
Related show

Commit Message

Vincent Guittot June 8, 2018, 12:09 p.m.
Take into account rt utilization when selecting an OPP for cfs tasks in order
to reflect the utilization of the CPU.

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

---
 kernel/sched/cpufreq_schedutil.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

-- 
2.7.4

Comments

Dietmar Eggemann June 18, 2018, 9 a.m. | #1
On 06/08/2018 02:09 PM, Vincent Guittot wrote:
> Take into account rt utilization when selecting an OPP for cfs tasks in order

> to reflect the utilization of the CPU.


The rt utilization signal is only tracked per-cpu, not per-entity. So it 
is not aware of PELT migrations (attach/detach).

IMHO, this patch deserves some explanation why the temporary 
inflation/deflation of the OPP driving utilization signal in case an 
rt-task migrates off/on (missing detach/attach for rt-signal) doesn't 
harm performance or energy consumption.

There was some talk (mainly on #sched irc) about ... (1) preempted cfs 
tasks (with reduced demand (utilization id only running) signals) using 
this remaining rt utilization of an rt task which migrated off and ... 
(2) going to max when an rt tasks runs ... but a summary of all of that 
in this patch would really help to understand.

> Cc: Ingo Molnar <mingo@redhat.com>

> Cc: Peter Zijlstra <peterz@infradead.org>

> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

> ---

>   kernel/sched/cpufreq_schedutil.c | 9 ++++++++-

>   1 file changed, 8 insertions(+), 1 deletion(-)

> 

> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c

> index 28592b6..32f97fb 100644

> --- a/kernel/sched/cpufreq_schedutil.c

> +++ b/kernel/sched/cpufreq_schedutil.c

> @@ -56,6 +56,7 @@ struct sugov_cpu {

>   	/* The fields below are only needed when sharing a policy: */

>   	unsigned long		util_cfs;

>   	unsigned long		util_dl;

> +	unsigned long		util_rt;

>   	unsigned long		max;

>   

>   	/* The field below is for single-CPU policies only: */

> @@ -178,15 +179,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)

>   	sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);

>   	sg_cpu->util_cfs = cpu_util_cfs(rq);

>   	sg_cpu->util_dl  = cpu_util_dl(rq);

> +	sg_cpu->util_rt  = cpu_util_rt(rq);

>   }

>   

>   static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

>   {

>   	struct rq *rq = cpu_rq(sg_cpu->cpu);

> +	unsigned long util;

>   

>   	if (rq->rt.rt_nr_running)

>   		return sg_cpu->max;

>   

> +	util = sg_cpu->util_dl;

> +	util += sg_cpu->util_cfs;

> +	util += sg_cpu->util_rt;

> +

>   	/*

>   	 * Utilization required by DEADLINE must always be granted while, for

>   	 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to

> @@ -197,7 +204,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

>   	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet

>   	 * ready for such an interface. So, we only do the latter for now.

>   	 */

> -	return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));

> +	return min(sg_cpu->max, util);

>   }

>   

>   static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)

>
Vincent Guittot June 18, 2018, 12:58 p.m. | #2
On Mon, 18 Jun 2018 at 11:00, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>

> On 06/08/2018 02:09 PM, Vincent Guittot wrote:

> > Take into account rt utilization when selecting an OPP for cfs tasks in order

> > to reflect the utilization of the CPU.

>

> The rt utilization signal is only tracked per-cpu, not per-entity. So it

> is not aware of PELT migrations (attach/detach).

>

> IMHO, this patch deserves some explanation why the temporary

> inflation/deflation of the OPP driving utilization signal in case an

> rt-task migrates off/on (missing detach/attach for rt-signal) doesn't

> harm performance or energy consumption.

>

> There was some talk (mainly on #sched irc) about ... (1) preempted cfs

> tasks (with reduced demand (utilization id only running) signals) using

> this remaining rt utilization of an rt task which migrated off and ...

> (2) going to max when an rt tasks runs ... but a summary of all of that

> in this patch would really help to understand.


Ok. I will add more comments in the next version. I just wait a bit to
let time to get more feedback before sending a new release

>

> > Cc: Ingo Molnar <mingo@redhat.com>

> > Cc: Peter Zijlstra <peterz@infradead.org>

> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

> > ---

> >   kernel/sched/cpufreq_schedutil.c | 9 ++++++++-

> >   1 file changed, 8 insertions(+), 1 deletion(-)

> >

> > diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c

> > index 28592b6..32f97fb 100644

> > --- a/kernel/sched/cpufreq_schedutil.c

> > +++ b/kernel/sched/cpufreq_schedutil.c

> > @@ -56,6 +56,7 @@ struct sugov_cpu {

> >       /* The fields below are only needed when sharing a policy: */

> >       unsigned long           util_cfs;

> >       unsigned long           util_dl;

> > +     unsigned long           util_rt;

> >       unsigned long           max;

> >

> >       /* The field below is for single-CPU policies only: */

> > @@ -178,15 +179,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)

> >       sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);

> >       sg_cpu->util_cfs = cpu_util_cfs(rq);

> >       sg_cpu->util_dl  = cpu_util_dl(rq);

> > +     sg_cpu->util_rt  = cpu_util_rt(rq);

> >   }

> >

> >   static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

> >   {

> >       struct rq *rq = cpu_rq(sg_cpu->cpu);

> > +     unsigned long util;

> >

> >       if (rq->rt.rt_nr_running)

> >               return sg_cpu->max;

> >

> > +     util = sg_cpu->util_dl;

> > +     util += sg_cpu->util_cfs;

> > +     util += sg_cpu->util_rt;

> > +

> >       /*

> >        * Utilization required by DEADLINE must always be granted while, for

> >        * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to

> > @@ -197,7 +204,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

> >        * util_cfs + util_dl as requested freq. However, cpufreq is not yet

> >        * ready for such an interface. So, we only do the latter for now.

> >        */

> > -     return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));

> > +     return min(sg_cpu->max, util);

> >   }

> >

> >   static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)

> >

>
Peter Zijlstra June 21, 2018, 6:45 p.m. | #3
On Fri, Jun 08, 2018 at 02:09:47PM +0200, Vincent Guittot wrote:
>  static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

>  {

>  	struct rq *rq = cpu_rq(sg_cpu->cpu);

> +	unsigned long util;

>  

>  	if (rq->rt.rt_nr_running)

>  		return sg_cpu->max;

>  

> +	util = sg_cpu->util_dl;

> +	util += sg_cpu->util_cfs;

> +	util += sg_cpu->util_rt;

> +

>  	/*

>  	 * Utilization required by DEADLINE must always be granted while, for

>  	 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to

> @@ -197,7 +204,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

>  	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet

>  	 * ready for such an interface. So, we only do the latter for now.

>  	 */

> -	return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));

> +	return min(sg_cpu->max, util);

>  }


So this (and the dl etc. equivalents) result in exactly the problems
complained about last time, no?

What I proposed was something along the lines of:

	util = 1024 * sg_cpu->util_cfs;
	util /= (1024 - (sg_cpu->util_rt + sg_cpu->util_dl + ...));

	return min(sg_cpu->max, util + sg_cpu->bw_dl);

Where we, instead of directly adding the various util signals.

I now see an email from Quentin asking if these things are not in fact
the same, but no, they are not. The difference is that the above only
affects the CFS signal and will re-normalize the utilization of an
'always' running task back to 1 by compensating for the stolen capacity.

But it will not, like these here patches, affect the OPP selection of
other classes. If there is no CFS utilization (or very little), then the
renormalization will not matter, and the existing DL bandwidth
compuation will be unaffected.
Peter Zijlstra June 21, 2018, 6:57 p.m. | #4
On Thu, Jun 21, 2018 at 08:45:24PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 08, 2018 at 02:09:47PM +0200, Vincent Guittot wrote:

> >  static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

> >  {

> >  	struct rq *rq = cpu_rq(sg_cpu->cpu);

> > +	unsigned long util;

> >  

> >  	if (rq->rt.rt_nr_running)

> >  		return sg_cpu->max;

> >  

> > +	util = sg_cpu->util_dl;

> > +	util += sg_cpu->util_cfs;

> > +	util += sg_cpu->util_rt;

> > +

> >  	/*

> >  	 * Utilization required by DEADLINE must always be granted while, for

> >  	 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to

> > @@ -197,7 +204,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

> >  	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet

> >  	 * ready for such an interface. So, we only do the latter for now.

> >  	 */

> > -	return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));

> > +	return min(sg_cpu->max, util);

> >  }

> 

> So this (and the dl etc. equivalents) result in exactly the problems

> complained about last time, no?

> 

> What I proposed was something along the lines of:

> 

> 	util = 1024 * sg_cpu->util_cfs;

> 	util /= (1024 - (sg_cpu->util_rt + sg_cpu->util_dl + ...));

> 

> 	return min(sg_cpu->max, util + sg_cpu->bw_dl);

> 

> Where we, instead of directly adding the various util signals.


That looks unfinished; I think that wants to include: "we renormalize
the CFS signal".

> I now see an email from Quentin asking if these things are not in fact

> the same, but no, they are not. The difference is that the above only

> affects the CFS signal and will re-normalize the utilization of an

> 'always' running task back to 1 by compensating for the stolen capacity.

> 

> But it will not, like these here patches, affect the OPP selection of

> other classes. If there is no CFS utilization (or very little), then the

> renormalization will not matter, and the existing DL bandwidth

> compuation will be unaffected.

>
Juri Lelli June 22, 2018, 7:58 a.m. | #5
On 21/06/18 20:45, Peter Zijlstra wrote:
> On Fri, Jun 08, 2018 at 02:09:47PM +0200, Vincent Guittot wrote:

> >  static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

> >  {

> >  	struct rq *rq = cpu_rq(sg_cpu->cpu);

> > +	unsigned long util;

> >  

> >  	if (rq->rt.rt_nr_running)

> >  		return sg_cpu->max;

> >  

> > +	util = sg_cpu->util_dl;

> > +	util += sg_cpu->util_cfs;

> > +	util += sg_cpu->util_rt;

> > +

> >  	/*

> >  	 * Utilization required by DEADLINE must always be granted while, for

> >  	 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to

> > @@ -197,7 +204,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

> >  	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet

> >  	 * ready for such an interface. So, we only do the latter for now.

> >  	 */

> > -	return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));

> > +	return min(sg_cpu->max, util);

> >  }

> 

> So this (and the dl etc. equivalents) result in exactly the problems

> complained about last time, no?

> 

> What I proposed was something along the lines of:

> 

> 	util = 1024 * sg_cpu->util_cfs;

> 	util /= (1024 - (sg_cpu->util_rt + sg_cpu->util_dl + ...));

> 

> 	return min(sg_cpu->max, util + sg_cpu->bw_dl);

> 

> Where we, instead of directly adding the various util signals.

> 

> I now see an email from Quentin asking if these things are not in fact

> the same, but no, they are not. The difference is that the above only

> affects the CFS signal and will re-normalize the utilization of an

> 'always' running task back to 1 by compensating for the stolen capacity.

> 

> But it will not, like these here patches, affect the OPP selection of

> other classes. If there is no CFS utilization (or very little), then the

> renormalization will not matter, and the existing DL bandwidth

> compuation will be unaffected.


IIUC, even with very little CFS utilization, the final OPP selection
will still be "inflated" w.r.t. bw_dl in case util_dl is big (like for a
DL task with a big period and not so big runtime). But I guess that's
OK, since we agreed that such DL tasks should be the exception anyway.
Quentin Perret June 22, 2018, 7:58 a.m. | #6
Hi Peter,

On Thursday 21 Jun 2018 at 20:45:24 (+0200), Peter Zijlstra wrote:
> On Fri, Jun 08, 2018 at 02:09:47PM +0200, Vincent Guittot wrote:

> >  static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

> >  {

> >  	struct rq *rq = cpu_rq(sg_cpu->cpu);

> > +	unsigned long util;

> >  

> >  	if (rq->rt.rt_nr_running)

> >  		return sg_cpu->max;

> >  

> > +	util = sg_cpu->util_dl;

> > +	util += sg_cpu->util_cfs;

> > +	util += sg_cpu->util_rt;

> > +

> >  	/*

> >  	 * Utilization required by DEADLINE must always be granted while, for

> >  	 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to

> > @@ -197,7 +204,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

> >  	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet

> >  	 * ready for such an interface. So, we only do the latter for now.

> >  	 */

> > -	return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));

> > +	return min(sg_cpu->max, util);

> >  }

> 

> So this (and the dl etc. equivalents) result in exactly the problems

> complained about last time, no?

> 

> What I proposed was something along the lines of:

> 

> 	util = 1024 * sg_cpu->util_cfs;

> 	util /= (1024 - (sg_cpu->util_rt + sg_cpu->util_dl + ...));

> 

> 	return min(sg_cpu->max, util + sg_cpu->bw_dl);

> 

> Where we, instead of directly adding the various util signals.

> 

> I now see an email from Quentin asking if these things are not in fact

> the same, but no, they are not. The difference is that the above only

> affects the CFS signal and will re-normalize the utilization of an

> 'always' running task back to 1 by compensating for the stolen capacity.

> 

> But it will not, like these here patches, affect the OPP selection of

> other classes. If there is no CFS utilization (or very little), then the

> renormalization will not matter, and the existing DL bandwidth

> compuation will be unaffected.


Right, thinking more carefully about this re-scaling, the two things are
indeed not the same, but I'm still not sure if this is what we want.

Say we have 50% of the capacity stolen by RT, and a 25% CFS task
running. If we re-scale, we'll end up with a 50% request for CFS
(util==512 for your code above). But if we want to see a little bit
of idle time in the system, we should really request an OPP for 75%+ of
capacity no ? Or am I missing something ?

And also, I think Juri had concerns when we use the util_dl (as a PELT
signal) for OPP selection since that kills the benefit of DL for long
running DL tasks. Or can we assume that DL tasks with very long
runtime/periods are a corner case we can ignore ?

Thanks,
Quentin
Vincent Guittot June 22, 2018, 8:10 a.m. | #7
On Thu, 21 Jun 2018 at 20:57, Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Thu, Jun 21, 2018 at 08:45:24PM +0200, Peter Zijlstra wrote:

> > On Fri, Jun 08, 2018 at 02:09:47PM +0200, Vincent Guittot wrote:

> > >  static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

> > >  {

> > >     struct rq *rq = cpu_rq(sg_cpu->cpu);

> > > +   unsigned long util;

> > >

> > >     if (rq->rt.rt_nr_running)

> > >             return sg_cpu->max;

> > >

> > > +   util = sg_cpu->util_dl;

> > > +   util += sg_cpu->util_cfs;

> > > +   util += sg_cpu->util_rt;

> > > +

> > >     /*

> > >      * Utilization required by DEADLINE must always be granted while, for

> > >      * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to

> > > @@ -197,7 +204,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

> > >      * util_cfs + util_dl as requested freq. However, cpufreq is not yet

> > >      * ready for such an interface. So, we only do the latter for now.

> > >      */

> > > -   return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));

> > > +   return min(sg_cpu->max, util);

> > >  }

> >

> > So this (and the dl etc. equivalents) result in exactly the problems

> > complained about last time, no?

> >

> > What I proposed was something along the lines of:

> >

> >       util = 1024 * sg_cpu->util_cfs;

> >       util /= (1024 - (sg_cpu->util_rt + sg_cpu->util_dl + ...));

> >

> >       return min(sg_cpu->max, util + sg_cpu->bw_dl);


I see that you use sg_cpu->util_dl and sg_cpu->bw_dl in your equation
above but this patch 04 only adds rt util_avg and the dl util_avg has
not been added yet.
 dl util_avg is added in patch 6
So for this patch, we are only using sg_cpu->bw_dl

> >

> > Where we, instead of directly adding the various util signals.

>

> That looks unfinished; I think that wants to include: "we renormalize

> the CFS signal".

>

> > I now see an email from Quentin asking if these things are not in fact

> > the same, but no, they are not. The difference is that the above only

> > affects the CFS signal and will re-normalize the utilization of an

> > 'always' running task back to 1 by compensating for the stolen capacity.

> >

> > But it will not, like these here patches, affect the OPP selection of

> > other classes. If there is no CFS utilization (or very little), then the

> > renormalization will not matter, and the existing DL bandwidth

> > compuation will be unaffected.

> >
Peter Zijlstra June 22, 2018, 11:37 a.m. | #8
On Fri, Jun 22, 2018 at 08:58:53AM +0100, Quentin Perret wrote:
> Say we have 50% of the capacity stolen by RT, and a 25% CFS task

> running. If we re-scale, we'll end up with a 50% request for CFS

> (util==512 for your code above). But if we want to see a little bit

> of idle time in the system, we should really request an OPP for 75%+ of

> capacity no ? Or am I missing something ?


That is true.. So we could limit the scaling to the case where there is
no idle time, something like:

	util = sg_cpu->util_cfs;

	cap_cfs = (1024 - (sg_cpu->util_rt + ...));
	if (util == cap_cfs)
		util = sg_cpu->max;

That specifically handles the '0% idle -> 100% freq' case, but I don't
realy like edge behaviour like that. If for some reason it all doesn't
quite align you're left with bits.

And the linear scaling is the next simplest thing that avoids the hard
boundary case.

I suppose we can make it more complicated, something like:

             u           u
  f := u + (--- - u) * (---)^n
            1-r         1-r

Where: u := cfs util
       r := \Sum !cfs util
       f := frequency request

That would still satisfy all criteria I think:

  r = 0      -> f := u
  u = (1-r)  -> f := 1

and in particular:

  u << (1-r) -> f ~= u

which casuses less inflation than the linear thing where there is idle
time.

In your specific example that ends up with:

             .25           .25
  f = .25 + (--- - .25) * (---)^n = .25 + .0625 (for n=2)
             .5            .5     = .25 + .125  (for n=1)

But is that needed complexity?
Peter Zijlstra June 22, 2018, 11:41 a.m. | #9
On Fri, Jun 22, 2018 at 10:10:32AM +0200, Vincent Guittot wrote:
> On Thu, 21 Jun 2018 at 20:57, Peter Zijlstra <peterz@infradead.org> wrote:

> >

> > > So this (and the dl etc. equivalents) result in exactly the problems

> > > complained about last time, no?

> > >

> > > What I proposed was something along the lines of:

> > >

> > >       util = 1024 * sg_cpu->util_cfs;

> > >       util /= (1024 - (sg_cpu->util_rt + sg_cpu->util_dl + ...));

> > >

> > >       return min(sg_cpu->max, util + sg_cpu->bw_dl);

> 

> I see that you use sg_cpu->util_dl and sg_cpu->bw_dl in your equation

> above but this patch 04 only adds rt util_avg and the dl util_avg has

> not been added yet.

>  dl util_avg is added in patch 6

> So for this patch, we are only using sg_cpu->bw_dl


Yeah, not the point really.

It is about how we're going to use the (rt,dl,irq etc..) util values,
more than which particular one was introduced here.

I'm just not a big fan of the whole: freq := cfs_util + rt_util thing
(as would be obvious by now).
Peter Zijlstra June 22, 2018, 11:44 a.m. | #10
On Fri, Jun 22, 2018 at 01:37:13PM +0200, Peter Zijlstra wrote:
> I suppose we can make it more complicated, something like:

> 

>              u           u

>   f := u + (--- - u) * (---)^n

>             1-r         1-r

> 

> Where: u := cfs util

>        r := \Sum !cfs util

>        f := frequency request

> 

> That would still satisfy all criteria I think:

> 

>   r = 0      -> f := u

>   u = (1-r)  -> f := 1

> 

> and in particular:

> 

>   u << (1-r) -> f ~= u

> 

> which casuses less inflation than the linear thing where there is idle

> time.


Note that for n=0 this last property is lost and we have the initial
linear case back.
Vincent Guittot June 22, 2018, 12:14 p.m. | #11
On Fri, 22 Jun 2018 at 13:41, Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Fri, Jun 22, 2018 at 10:10:32AM +0200, Vincent Guittot wrote:

> > On Thu, 21 Jun 2018 at 20:57, Peter Zijlstra <peterz@infradead.org> wrote:

> > >

> > > > So this (and the dl etc. equivalents) result in exactly the problems

> > > > complained about last time, no?

> > > >

> > > > What I proposed was something along the lines of:

> > > >

> > > >       util = 1024 * sg_cpu->util_cfs;

> > > >       util /= (1024 - (sg_cpu->util_rt + sg_cpu->util_dl + ...));

> > > >

> > > >       return min(sg_cpu->max, util + sg_cpu->bw_dl);

> >

> > I see that you use sg_cpu->util_dl and sg_cpu->bw_dl in your equation

> > above but this patch 04 only adds rt util_avg and the dl util_avg has

> > not been added yet.

> >  dl util_avg is added in patch 6

> > So for this patch, we are only using sg_cpu->bw_dl

>

> Yeah, not the point really.

>

> It is about how we're going to use the (rt,dl,irq etc..) util values,

> more than which particular one was introduced here.


ok

>

> I'm just not a big fan of the whole: freq := cfs_util + rt_util thing

> (as would be obvious by now).


so I'm not sure to catch what you don't like with the sum ? Is it the
special case for dl and how we switch between dl_bw and dl util_avg
which can generate a drop in frequency
Because the increase is linear regarding rt and cfs
Vincent Guittot June 22, 2018, 12:23 p.m. | #12
On Fri, 22 Jun 2018 at 13:37, Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Fri, Jun 22, 2018 at 08:58:53AM +0100, Quentin Perret wrote:

> > Say we have 50% of the capacity stolen by RT, and a 25% CFS task

> > running. If we re-scale, we'll end up with a 50% request for CFS

> > (util==512 for your code above). But if we want to see a little bit

> > of idle time in the system, we should really request an OPP for 75%+ of

> > capacity no ? Or am I missing something ?

>

> That is true.. So we could limit the scaling to the case where there is

> no idle time, something like:

>

>         util = sg_cpu->util_cfs;

>

>         cap_cfs = (1024 - (sg_cpu->util_rt + ...));

>         if (util == cap_cfs)

>                 util = sg_cpu->max;

>

> That specifically handles the '0% idle -> 100% freq' case, but I don't

> realy like edge behaviour like that. If for some reason it all doesn't

> quite align you're left with bits.

>

> And the linear scaling is the next simplest thing that avoids the hard

> boundary case.

>

> I suppose we can make it more complicated, something like:

>

>              u           u

>   f := u + (--- - u) * (---)^n

>             1-r         1-r

>

> Where: u := cfs util

>        r := \Sum !cfs util

>        f := frequency request

>

> That would still satisfy all criteria I think:

>

>   r = 0      -> f := u

>   u = (1-r)  -> f := 1

>

> and in particular:

>

>   u << (1-r) -> f ~= u

>

> which casuses less inflation than the linear thing where there is idle

> time.

>

> In your specific example that ends up with:

>

>              .25           .25

>   f = .25 + (--- - .25) * (---)^n = .25 + .0625 (for n=2)

>              .5            .5     = .25 + .125  (for n=1)

>

> But is that needed complexity?


And we are not yet at the right value for quentin's example as we need
something around 0.75 for is example
The non linearity only comes from dl so if we want to use the equation
above, u should be (cfs + rt) and r = dl
But this also means that we will start to inflate the utilization to
get higher OPP even if there is idle time and lost the interest of
using dl bw
Quentin Perret June 22, 2018, 12:54 p.m. | #13
On Friday 22 Jun 2018 at 13:37:13 (+0200), Peter Zijlstra wrote:
> That is true.. So we could limit the scaling to the case where there is

> no idle time, something like:

> 

> 	util = sg_cpu->util_cfs;

> 

> 	cap_cfs = (1024 - (sg_cpu->util_rt + ...));

> 	if (util == cap_cfs)

> 		util = sg_cpu->max;

> 

> That specifically handles the '0% idle -> 100% freq' case, but I don't

> realy like edge behaviour like that. If for some reason it all doesn't

> quite align you're left with bits.

> 

> And the linear scaling is the next simplest thing that avoids the hard

> boundary case.


Right, so maybe we'll get something smoother by just summing the signals
as Vincent is proposing ? You will still request max freq for the
(util == cap_cfs) case you described. By definition, you will have
(util_cfs + util_rt + ...) == 1024 in this case.

cap_cfs is the delta between RT+DL+... and 1024, and the only case where
util_cfs can be equal to cap_cfs is if util_cfs fills that delta
entirely.

I hope that makes sense

Thanks,
Quentin
Peter Zijlstra June 22, 2018, 1:26 p.m. | #14
On Fri, Jun 22, 2018 at 02:23:22PM +0200, Vincent Guittot wrote:
> On Fri, 22 Jun 2018 at 13:37, Peter Zijlstra <peterz@infradead.org> wrote:

> > I suppose we can make it more complicated, something like:

> >

> >              u           u

> >   f := u + (--- - u) * (---)^n

> >             1-r         1-r

> >

> > Where: u := cfs util

> >        r := \Sum !cfs util

> >        f := frequency request

> >

> > That would still satisfy all criteria I think:

> >

> >   r = 0      -> f := u

> >   u = (1-r)  -> f := 1

> >

> > and in particular:

> >

> >   u << (1-r) -> f ~= u

> >

> > which casuses less inflation than the linear thing where there is idle

> > time.


> And we are not yet at the right value for quentin's example as we need

> something around 0.75 for is example


$ bc -l
define f (u,r,n) { return u + ((u/(1-r)) - u) * (u/(1-r))^n; }
f(.2,.7,0)
.66666666666666666666
f(.2,.7,2)
.40740740740740740739
f(.2,.7,4)
.29218106995884773661

So at 10% idle time, we've only inflated what should be 20% to 40%, that
is entirely reasonable I think. The linear case gave us 66%.  But feel
free to increase @n if you feel that helps, 4 is only one mult more than
2 and gets us down to 29%.

> The non linearity only comes from dl so if we want to use the equation

> above, u should be (cfs + rt) and r = dl


Right until we allow RT to run at anything other than f=1. Once we allow
rt util capping, either through Patrick's thing or CBS servers or
whatever, we get:

  f = min(1, f_rt + f_dl + f_cfs)

And then u_rt does want to be part of r. And while we do run RT at f=1,
it doesn't matter either way around I think.

> But this also means that we will start to inflate the utilization to

> get higher OPP even if there is idle time and lost the interest of

> using dl bw


You get _some_ inflation, but only if there is actual cfs utilization to
begin with.

And that is my objection to that straight sum thing; there the dl util
distorts the computed dl bandwidth thing even if there is no cfs
utilization.
Peter Zijlstra June 22, 2018, 1:29 p.m. | #15
On Fri, Jun 22, 2018 at 01:54:34PM +0100, Quentin Perret wrote:
> On Friday 22 Jun 2018 at 13:37:13 (+0200), Peter Zijlstra wrote:

> > That is true.. So we could limit the scaling to the case where there is

> > no idle time, something like:

> > 

> > 	util = sg_cpu->util_cfs;

> > 

> > 	cap_cfs = (1024 - (sg_cpu->util_rt + ...));

> > 	if (util == cap_cfs)

> > 		util = sg_cpu->max;

> > 

> > That specifically handles the '0% idle -> 100% freq' case, but I don't

> > realy like edge behaviour like that. If for some reason it all doesn't

> > quite align you're left with bits.

> > 

> > And the linear scaling is the next simplest thing that avoids the hard

> > boundary case.

> 

> Right, so maybe we'll get something smoother by just summing the signals

> as Vincent is proposing ? 


Sure, but see my previous mail just now, that has the problem of
u_{rt,dl} distoting f_{rt,dl} even when there is no u_cfs.
Peter Zijlstra June 22, 2018, 1:52 p.m. | #16
On Fri, Jun 22, 2018 at 03:26:24PM +0200, Peter Zijlstra wrote:
> > But this also means that we will start to inflate the utilization to

> > get higher OPP even if there is idle time and lost the interest of

> > using dl bw

> 

> You get _some_ inflation, but only if there is actual cfs utilization to

> begin with.


Note that at high idle time, the inflation really is minimal:

f(.2,.2,2)
.20312500000000000000
Vincent Guittot June 22, 2018, 1:54 p.m. | #17
On Fri, 22 Jun 2018 at 15:26, Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Fri, Jun 22, 2018 at 02:23:22PM +0200, Vincent Guittot wrote:

> > On Fri, 22 Jun 2018 at 13:37, Peter Zijlstra <peterz@infradead.org> wrote:

> > > I suppose we can make it more complicated, something like:

> > >

> > >              u           u

> > >   f := u + (--- - u) * (---)^n

> > >             1-r         1-r

> > >

> > > Where: u := cfs util

> > >        r := \Sum !cfs util

> > >        f := frequency request

> > >

> > > That would still satisfy all criteria I think:

> > >

> > >   r = 0      -> f := u

> > >   u = (1-r)  -> f := 1

> > >

> > > and in particular:

> > >

> > >   u << (1-r) -> f ~= u

> > >

> > > which casuses less inflation than the linear thing where there is idle

> > > time.

>

> > And we are not yet at the right value for quentin's example as we need

> > something around 0.75 for is example

>

> $ bc -l

> define f (u,r,n) { return u + ((u/(1-r)) - u) * (u/(1-r))^n; }

> f(.2,.7,0)

> .66666666666666666666

> f(.2,.7,2)

> .40740740740740740739

> f(.2,.7,4)

> .29218106995884773661

>

> So at 10% idle time, we've only inflated what should be 20% to 40%, that

> is entirely reasonable I think. The linear case gave us 66%.  But feel

> free to increase @n if you feel that helps, 4 is only one mult more than

> 2 and gets us down to 29%.


I'm a bit lost with your example.
u = 0.2 (for cfs) and r=0.7 (let say for rt) in your example and idle is 0.1

For rt task, we run 0.7 of the time at f=1 then we will select f=0.4
for run cfs task with u=0.2 but u is the utilization at f=1 which
means that it will take 250% of normal time to execute at f=0.4 which
means 0.5  time instead of 0.2 at f=1 so we are going out of time. In
order to have enough time to run r and u we must run at least  f=0.666
for cfs = 0.2/(1-0.7). If rt task doesn't run at f=1 we would have to
run at f=0.9

>

> > The non linearity only comes from dl so if we want to use the equation

> > above, u should be (cfs + rt) and r = dl

>

> Right until we allow RT to run at anything other than f=1. Once we allow

> rt util capping, either through Patrick's thing or CBS servers or

> whatever, we get:

>

>   f = min(1, f_rt + f_dl + f_cfs)

>

> And then u_rt does want to be part of r. And while we do run RT at f=1,

> it doesn't matter either way around I think.

>

> > But this also means that we will start to inflate the utilization to

> > get higher OPP even if there is idle time and lost the interest of

> > using dl bw

>

> You get _some_ inflation, but only if there is actual cfs utilization to

> begin with.

>

> And that is my objection to that straight sum thing; there the dl util

> distorts the computed dl bandwidth thing even if there is no cfs

> utilization.


hmm,


>
Vincent Guittot June 22, 2018, 1:57 p.m. | #18
On Fri, 22 Jun 2018 at 15:54, Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>

> On Fri, 22 Jun 2018 at 15:26, Peter Zijlstra <peterz@infradead.org> wrote:

> >

> > On Fri, Jun 22, 2018 at 02:23:22PM +0200, Vincent Guittot wrote:

> > > On Fri, 22 Jun 2018 at 13:37, Peter Zijlstra <peterz@infradead.org> wrote:

> > > > I suppose we can make it more complicated, something like:

> > > >

> > > >              u           u

> > > >   f := u + (--- - u) * (---)^n

> > > >             1-r         1-r

> > > >

> > > > Where: u := cfs util

> > > >        r := \Sum !cfs util

> > > >        f := frequency request

> > > >

> > > > That would still satisfy all criteria I think:

> > > >

> > > >   r = 0      -> f := u

> > > >   u = (1-r)  -> f := 1

> > > >

> > > > and in particular:

> > > >

> > > >   u << (1-r) -> f ~= u

> > > >

> > > > which casuses less inflation than the linear thing where there is idle

> > > > time.

> >

> > > And we are not yet at the right value for quentin's example as we need

> > > something around 0.75 for is example

> >

> > $ bc -l

> > define f (u,r,n) { return u + ((u/(1-r)) - u) * (u/(1-r))^n; }

> > f(.2,.7,0)

> > .66666666666666666666

> > f(.2,.7,2)

> > .40740740740740740739

> > f(.2,.7,4)

> > .29218106995884773661

> >

> > So at 10% idle time, we've only inflated what should be 20% to 40%, that

> > is entirely reasonable I think. The linear case gave us 66%.  But feel

> > free to increase @n if you feel that helps, 4 is only one mult more than

> > 2 and gets us down to 29%.

>

> I'm a bit lost with your example.

> u = 0.2 (for cfs) and r=0.7 (let say for rt) in your example and idle is 0.1

>

> For rt task, we run 0.7 of the time at f=1 then we will select f=0.4

> for run cfs task with u=0.2 but u is the utilization at f=1 which

> means that it will take 250% of normal time to execute at f=0.4 which

> means 0.5  time instead of 0.2 at f=1 so we are going out of time. In

> order to have enough time to run r and u we must run at least  f=0.666

> for cfs = 0.2/(1-0.7). If rt task doesn't run at f=1 we would have to

> run at f=0.9

>

> >

> > > The non linearity only comes from dl so if we want to use the equation

> > > above, u should be (cfs + rt) and r = dl

> >

> > Right until we allow RT to run at anything other than f=1. Once we allow

> > rt util capping, either through Patrick's thing or CBS servers or

> > whatever, we get:

> >

> >   f = min(1, f_rt + f_dl + f_cfs)

> >

> > And then u_rt does want to be part of r. And while we do run RT at f=1,

> > it doesn't matter either way around I think.

> >

> > > But this also means that we will start to inflate the utilization to

> > > get higher OPP even if there is idle time and lost the interest of

> > > using dl bw

> >

> > You get _some_ inflation, but only if there is actual cfs utilization to

> > begin with.

> >

> > And that is my objection to that straight sum thing; there the dl util

> > distorts the computed dl bandwidth thing even if there is no cfs

> > utilization.

>

> hmm,


forgot to finish this sentence

hmm, dl util_avg is only used to detect is there is idle time so if
cfs util is nul we will not distort the  dl bw (for a use case where
there is no rt task running)

>

>

> >
Peter Zijlstra June 22, 2018, 2:11 p.m. | #19
On Fri, Jun 22, 2018 at 03:54:24PM +0200, Vincent Guittot wrote:
> On Fri, 22 Jun 2018 at 15:26, Peter Zijlstra <peterz@infradead.org> wrote:


> > $ bc -l

> > define f (u,r,n) { return u + ((u/(1-r)) - u) * (u/(1-r))^n; }

> > f(.2,.7,0)

> > .66666666666666666666

> > f(.2,.7,2)

> > .40740740740740740739

> > f(.2,.7,4)

> > .29218106995884773661

> >

> > So at 10% idle time, we've only inflated what should be 20% to 40%, that

> > is entirely reasonable I think. The linear case gave us 66%.  But feel

> > free to increase @n if you feel that helps, 4 is only one mult more than

> > 2 and gets us down to 29%.

> 

> I'm a bit lost with your example.

> u = 0.2 (for cfs) and r=0.7 (let say for rt) in your example and idle is 0.1

> 

> For rt task, we run 0.7 of the time at f=1 then we will select f=0.4

> for run cfs task with u=0.2 but u is the utilization at f=1 which

> means that it will take 250% of normal time to execute at f=0.4 which

> means 0.5  time instead of 0.2 at f=1 so we are going out of time. In

> order to have enough time to run r and u we must run at least  f=0.666

> for cfs = 0.2/(1-0.7). 


Argh.. that is n=0. So clearly I went off the rails somewhere.
Vincent Guittot June 22, 2018, 2:12 p.m. | #20
On Fri, 22 Jun 2018 at 15:54, Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>

> On Fri, 22 Jun 2018 at 15:26, Peter Zijlstra <peterz@infradead.org> wrote:

> >

> > On Fri, Jun 22, 2018 at 02:23:22PM +0200, Vincent Guittot wrote:

> > > On Fri, 22 Jun 2018 at 13:37, Peter Zijlstra <peterz@infradead.org> wrote:

> > > > I suppose we can make it more complicated, something like:

> > > >

> > > >              u           u

> > > >   f := u + (--- - u) * (---)^n

> > > >             1-r         1-r

> > > >

> > > > Where: u := cfs util

> > > >        r := \Sum !cfs util

> > > >        f := frequency request

> > > >

> > > > That would still satisfy all criteria I think:

> > > >

> > > >   r = 0      -> f := u

> > > >   u = (1-r)  -> f := 1

> > > >

> > > > and in particular:

> > > >

> > > >   u << (1-r) -> f ~= u

> > > >

> > > > which casuses less inflation than the linear thing where there is idle

> > > > time.

> >

> > > And we are not yet at the right value for quentin's example as we need

> > > something around 0.75 for is example

> >

> > $ bc -l

> > define f (u,r,n) { return u + ((u/(1-r)) - u) * (u/(1-r))^n; }

> > f(.2,.7,0)

> > .66666666666666666666

> > f(.2,.7,2)

> > .40740740740740740739

> > f(.2,.7,4)

> > .29218106995884773661

> >

> > So at 10% idle time, we've only inflated what should be 20% to 40%, that

> > is entirely reasonable I think. The linear case gave us 66%.  But feel

> > free to increase @n if you feel that helps, 4 is only one mult more than

> > 2 and gets us down to 29%.

>

> I'm a bit lost with your example.

> u = 0.2 (for cfs) and r=0.7 (let say for rt) in your example and idle is 0.1

>

> For rt task, we run 0.7 of the time at f=1 then we will select f=0.4

> for run cfs task with u=0.2 but u is the utilization at f=1 which

> means that it will take 250% of normal time to execute at f=0.4 which

> means 0.5  time instead of 0.2 at f=1 so we are going out of time. In

> order to have enough time to run r and u we must run at least  f=0.666

> for cfs = 0.2/(1-0.7). If rt task doesn't run at f=1 we would have to

> run at f=0.9


The current proposal keeps thing simple and doesn't take into account
the fact that rt runs at max freq which gives more margin when cfs is
running.
If we want to take into account the fact that rt task run at max freq
when computing frequency for dl and cfs we should use f = (cfs util +
dl bw) /(1 - rt util). Then this doesn't take into account the fact
that f=maw as soon as rt is runnable which means that dl can run at
max part of its time.

>

> >

> > > The non linearity only comes from dl so if we want to use the equation

> > > above, u should be (cfs + rt) and r = dl

> >

> > Right until we allow RT to run at anything other than f=1. Once we allow

> > rt util capping, either through Patrick's thing or CBS servers or

> > whatever, we get:

> >

> >   f = min(1, f_rt + f_dl + f_cfs)

> >

> > And then u_rt does want to be part of r. And while we do run RT at f=1,

> > it doesn't matter either way around I think.

> >

> > > But this also means that we will start to inflate the utilization to

> > > get higher OPP even if there is idle time and lost the interest of

> > > using dl bw

> >

> > You get _some_ inflation, but only if there is actual cfs utilization to

> > begin with.

> >

> > And that is my objection to that straight sum thing; there the dl util

> > distorts the computed dl bandwidth thing even if there is no cfs

> > utilization.

>

> hmm,

>

>

> >
Peter Zijlstra June 22, 2018, 2:46 p.m. | #21
On Fri, Jun 22, 2018 at 03:57:24PM +0200, Vincent Guittot wrote:
> On Fri, 22 Jun 2018 at 15:54, Vincent Guittot

> <vincent.guittot@linaro.org> wrote:

> > On Fri, 22 Jun 2018 at 15:26, Peter Zijlstra <peterz@infradead.org> wrote:


> > > And that is my objection to that straight sum thing; there the dl util

> > > distorts the computed dl bandwidth thing even if there is no cfs

> > > utilization.

> >

> > hmm,

> 

> forgot to finish this sentence

> 

> hmm, dl util_avg is only used to detect is there is idle time so if

> cfs util is nul we will not distort the  dl bw (for a use case where

> there is no rt task running)


Hurm.. let me apply those patches and read the result, because I think I
missed something.
Peter Zijlstra June 22, 2018, 2:48 p.m. | #22
On Fri, Jun 22, 2018 at 04:11:59PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 22, 2018 at 03:54:24PM +0200, Vincent Guittot wrote:

> > On Fri, 22 Jun 2018 at 15:26, Peter Zijlstra <peterz@infradead.org> wrote:

> 

> > > define f (u,r,n) { return u + ((u/(1-r)) - u) * (u/(1-r))^n; }


> > I'm a bit lost with your example.

> > u = 0.2 (for cfs) and r=0.7 (let say for rt) in your example and idle is 0.1

> > 

> > For rt task, we run 0.7 of the time at f=1 then we will select f=0.4

> > for run cfs task with u=0.2 but u is the utilization at f=1 which

> > means that it will take 250% of normal time to execute at f=0.4 which

> > means 0.5  time instead of 0.2 at f=1 so we are going out of time. In

> > order to have enough time to run r and u we must run at least  f=0.666

> > for cfs = 0.2/(1-0.7). 

> 

> Argh.. that is n=0. So clearly I went off the rails somewhere.


Aah, I think the number I've been computing is a 'corrected' u. Not an
f. It made sure that 0 idle got u=1, but no more.
Vincent Guittot June 22, 2018, 2:49 p.m. | #23
On Fri, 22 Jun 2018 at 16:46, Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Fri, Jun 22, 2018 at 03:57:24PM +0200, Vincent Guittot wrote:

> > On Fri, 22 Jun 2018 at 15:54, Vincent Guittot

> > <vincent.guittot@linaro.org> wrote:

> > > On Fri, 22 Jun 2018 at 15:26, Peter Zijlstra <peterz@infradead.org> wrote:

>

> > > > And that is my objection to that straight sum thing; there the dl util

> > > > distorts the computed dl bandwidth thing even if there is no cfs

> > > > utilization.

> > >

> > > hmm,

> >

> > forgot to finish this sentence

> >

> > hmm, dl util_avg is only used to detect is there is idle time so if

> > cfs util is nul we will not distort the  dl bw (for a use case where

> > there is no rt task running)

>

> Hurm.. let me apply those patches and read the result, because I think I

> missed something.


ok.

the patchset is available here if it can help :
https://git.linaro.org/people/vincent.guittot/kernel.git   branch:
sched-rt-utilization

>
Peter Zijlstra June 22, 2018, 3:22 p.m. | #24
On Fri, Jun 22, 2018 at 01:37:13PM +0200, Peter Zijlstra wrote:
> That is true.. So we could limit the scaling to the case where there is

> no idle time, something like:

> 

> 	util = sg_cpu->util_cfs;

> 

> 	cap_cfs = (1024 - (sg_cpu->util_rt + ...));

> 	if (util == cap_cfs)

> 		util = sg_cpu->max;

> 


OK, it appears this is more or less what the patches do. And I think
there's a small risk/hole with this where util ~= cap_cfs but very close
due to some unaccounted time.

FWIW, when looking, I saw no reason why sugov_get_util() and
sugov_aggregate_util() were in fact separate functions.

--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,11 +53,7 @@ struct sugov_cpu {
 	unsigned int		iowait_boost_max;
 	u64			last_update;
 
-	/* The fields below are only needed when sharing a policy: */
-	unsigned long		util_cfs;
 	unsigned long		util_dl;
-	unsigned long		bw_dl;
-	unsigned long		util_rt;
 	unsigned long		max;
 
 	/* The field below is for single-CPU policies only: */
@@ -181,44 +177,38 @@ static unsigned int get_next_freq(struct
 	return cpufreq_driver_resolve_freq(policy, freq);
 }
 
-static void sugov_get_util(struct sugov_cpu *sg_cpu)
+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
+	unsigned long util, max;
 
-	sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
-	sg_cpu->util_cfs = cpu_util_cfs(rq);
-	sg_cpu->util_dl  = cpu_util_dl(rq);
-	sg_cpu->bw_dl    = cpu_bw_dl(rq);
-	sg_cpu->util_rt  = cpu_util_rt(rq);
-}
-
-static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
-{
-	struct rq *rq = cpu_rq(sg_cpu->cpu);
-	unsigned long util;
+	sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+	sg_cpu->util_dl   = cpu_util_dl(rq);
 
 	if (rq->rt.rt_nr_running)
-		return sg_cpu->max;
+		return max;
 
-	util = sg_cpu->util_cfs;
-	util += sg_cpu->util_rt;
+	util  = cpu_util_cfs(rq);
+	util += cpu_util_rt(rq);
 
-	if ((util + sg_cpu->util_dl) >= sg_cpu->max)
-		return sg_cpu->max;
+	/*
+	 * If there is no idle time, we should run at max frequency.
+	 */
+	if ((util + cpu_util_dl(rq)) >= max)
+		return max;
 
 	/*
-	 * As there is still idle time on the CPU, we need to compute the
-	 * utilization level of the CPU.
 	 * Bandwidth required by DEADLINE must always be granted while, for
 	 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
 	 * to gracefully reduce the frequency when no tasks show up for longer
 	 * periods of time.
+	 *
+	 * Ideally we would like to set bw_dl as min/guaranteed freq and bw_dl
+	 * + util as requested freq. However, cpufreq is not yet ready for such
+	 * an interface. So, we only do the latter for now.
 	 */
 
-	/* Add DL bandwidth requirement */
-	util += sg_cpu->bw_dl;
-
-	return min(sg_cpu->max, util);
+	return min(max, cpu_bw_dl(rq) + util);
 }
 
 /**
@@ -396,9 +386,8 @@ static void sugov_update_single(struct u
 
 	busy = sugov_cpu_is_busy(sg_cpu);
 
-	sugov_get_util(sg_cpu);
+	util = sugov_get_util(sg_cpu);
 	max = sg_cpu->max;
-	util = sugov_aggregate_util(sg_cpu);
 	sugov_iowait_apply(sg_cpu, time, &util, &max);
 	next_f = get_next_freq(sg_policy, util, max);
 	/*
@@ -437,9 +426,8 @@ static unsigned int sugov_next_freq_shar
 		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
 		unsigned long j_util, j_max;
 
-		sugov_get_util(j_sg_cpu);
+		j_util = sugov_get_util(j_sg_cpu);
 		j_max = j_sg_cpu->max;
-		j_util = sugov_aggregate_util(j_sg_cpu);
 		sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
 
 		if (j_util * max > j_max * util) {
Quentin Perret June 22, 2018, 3:30 p.m. | #25
On Friday 22 Jun 2018 at 17:22:58 (+0200), Peter Zijlstra wrote:
> On Fri, Jun 22, 2018 at 01:37:13PM +0200, Peter Zijlstra wrote:

> > That is true.. So we could limit the scaling to the case where there is

> > no idle time, something like:

> > 

> > 	util = sg_cpu->util_cfs;

> > 

> > 	cap_cfs = (1024 - (sg_cpu->util_rt + ...));

> > 	if (util == cap_cfs)

> > 		util = sg_cpu->max;

> > 

> 

> OK, it appears this is more or less what the patches do. And I think

> there's a small risk/hole with this where util ~= cap_cfs but very close

> due to some unaccounted time.


So Vincent suggested at some point to add a margin to avoid that issue
IIRC. FWIW, this is what the overutilized flag of EAS does. It basically
says, if there isn't enough idle time in the system (cfs_util is too close
to cap_cfs), don't bother looking at the util signals because they'll be
kinda wrong.

So what about something like, go to max freq if overutilized ? Or
something similar on a per cpufreq policy basis ?

Thanks,
Quentin
Vincent Guittot June 22, 2018, 5:24 p.m. | #26
On Fri, 22 Jun 2018 at 17:23, Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Fri, Jun 22, 2018 at 01:37:13PM +0200, Peter Zijlstra wrote:

> > That is true.. So we could limit the scaling to the case where there is

> > no idle time, something like:

> >

> >       util = sg_cpu->util_cfs;

> >

> >       cap_cfs = (1024 - (sg_cpu->util_rt + ...));

> >       if (util == cap_cfs)

> >               util = sg_cpu->max;

> >

>

> OK, it appears this is more or less what the patches do. And I think

> there's a small risk/hole with this where util ~= cap_cfs but very close

> due to some unaccounted time.

>

> FWIW, when looking, I saw no reason why sugov_get_util() and

> sugov_aggregate_util() were in fact separate functions.


good point

>

> --- a/kernel/sched/cpufreq_schedutil.c

> +++ b/kernel/sched/cpufreq_schedutil.c

> @@ -53,11 +53,7 @@ struct sugov_cpu {

>         unsigned int            iowait_boost_max;

>         u64                     last_update;

>

> -       /* The fields below are only needed when sharing a policy: */

> -       unsigned long           util_cfs;

>         unsigned long           util_dl;

> -       unsigned long           bw_dl;

> -       unsigned long           util_rt;

>         unsigned long           max;

>

>         /* The field below is for single-CPU policies only: */

> @@ -181,44 +177,38 @@ static unsigned int get_next_freq(struct

>         return cpufreq_driver_resolve_freq(policy, freq);

>  }

>

> -static void sugov_get_util(struct sugov_cpu *sg_cpu)

> +static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)

>  {

>         struct rq *rq = cpu_rq(sg_cpu->cpu);

> +       unsigned long util, max;

>

> -       sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);

> -       sg_cpu->util_cfs = cpu_util_cfs(rq);

> -       sg_cpu->util_dl  = cpu_util_dl(rq);

> -       sg_cpu->bw_dl    = cpu_bw_dl(rq);

> -       sg_cpu->util_rt  = cpu_util_rt(rq);

> -}

> -

> -static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)

> -{

> -       struct rq *rq = cpu_rq(sg_cpu->cpu);

> -       unsigned long util;

> +       sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);

> +       sg_cpu->util_dl   = cpu_util_dl(rq);

>

>         if (rq->rt.rt_nr_running)

> -               return sg_cpu->max;

> +               return max;

>

> -       util = sg_cpu->util_cfs;

> -       util += sg_cpu->util_rt;

> +       util  = cpu_util_cfs(rq);

> +       util += cpu_util_rt(rq);

>

> -       if ((util + sg_cpu->util_dl) >= sg_cpu->max)

> -               return sg_cpu->max;

> +       /*

> +        * If there is no idle time, we should run at max frequency.

> +        */

> +       if ((util + cpu_util_dl(rq)) >= max)

> +               return max;

>

>         /*

> -        * As there is still idle time on the CPU, we need to compute the

> -        * utilization level of the CPU.

>          * Bandwidth required by DEADLINE must always be granted while, for

>          * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism

>          * to gracefully reduce the frequency when no tasks show up for longer

>          * periods of time.

> +        *

> +        * Ideally we would like to set bw_dl as min/guaranteed freq and bw_dl

> +        * + util as requested freq. However, cpufreq is not yet ready for such

> +        * an interface. So, we only do the latter for now.

>          */

>

> -       /* Add DL bandwidth requirement */

> -       util += sg_cpu->bw_dl;

> -

> -       return min(sg_cpu->max, util);

> +       return min(max, cpu_bw_dl(rq) + util);

>  }

>

>  /**

> @@ -396,9 +386,8 @@ static void sugov_update_single(struct u

>

>         busy = sugov_cpu_is_busy(sg_cpu);

>

> -       sugov_get_util(sg_cpu);

> +       util = sugov_get_util(sg_cpu);

>         max = sg_cpu->max;

> -       util = sugov_aggregate_util(sg_cpu);

>         sugov_iowait_apply(sg_cpu, time, &util, &max);

>         next_f = get_next_freq(sg_policy, util, max);

>         /*

> @@ -437,9 +426,8 @@ static unsigned int sugov_next_freq_shar

>                 struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);

>                 unsigned long j_util, j_max;

>

> -               sugov_get_util(j_sg_cpu);

> +               j_util = sugov_get_util(j_sg_cpu);

>                 j_max = j_sg_cpu->max;

> -               j_util = sugov_aggregate_util(j_sg_cpu);

>                 sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);

>

>                 if (j_util * max > j_max * util) {

>

Patch

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 28592b6..32f97fb 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -56,6 +56,7 @@  struct sugov_cpu {
 	/* The fields below are only needed when sharing a policy: */
 	unsigned long		util_cfs;
 	unsigned long		util_dl;
+	unsigned long		util_rt;
 	unsigned long		max;
 
 	/* The field below is for single-CPU policies only: */
@@ -178,15 +179,21 @@  static void sugov_get_util(struct sugov_cpu *sg_cpu)
 	sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
 	sg_cpu->util_cfs = cpu_util_cfs(rq);
 	sg_cpu->util_dl  = cpu_util_dl(rq);
+	sg_cpu->util_rt  = cpu_util_rt(rq);
 }
 
 static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
+	unsigned long util;
 
 	if (rq->rt.rt_nr_running)
 		return sg_cpu->max;
 
+	util = sg_cpu->util_dl;
+	util += sg_cpu->util_cfs;
+	util += sg_cpu->util_rt;
+
 	/*
 	 * Utilization required by DEADLINE must always be granted while, for
 	 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
@@ -197,7 +204,7 @@  static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet
 	 * ready for such an interface. So, we only do the latter for now.
 	 */
-	return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
+	return min(sg_cpu->max, util);
 }
 
 static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)