diff mbox

[v5,10/12] sched: get CPU's utilization statistic

Message ID 1409051215-16788-11-git-send-email-vincent.guittot@linaro.org
State New
Headers show

Commit Message

Vincent Guittot Aug. 26, 2014, 11:06 a.m. UTC
Monitor the utilization level of each group of each sched_domain level. The
utilization is the amount of cpu_capacity that is currently used on a CPU or
group of CPUs. We use the usage_load_avg to evaluate this utilization level.
In the special use case where the CPU is fully loaded by more than 1 task,
the activity level is set above the cpu_capacity in order to reflect the overload
of the CPU

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

Comments

Peter Zijlstra Sept. 11, 2014, 12:34 p.m. UTC | #1
On Tue, Aug 26, 2014 at 01:06:53PM +0200, Vincent Guittot wrote:
> Monitor the utilization level of each group of each sched_domain level. The
> utilization is the amount of cpu_capacity that is currently used on a CPU or
> group of CPUs. We use the usage_load_avg to evaluate this utilization level.
> In the special use case where the CPU is fully loaded by more than 1 task,
> the activity level is set above the cpu_capacity in order to reflect the overload
> of the CPU
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  kernel/sched/fair.c | 19 ++++++++++++++++++-
>  1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1fd2131..2f95d1c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4126,6 +4126,11 @@ static unsigned long capacity_of(int cpu)
>  	return cpu_rq(cpu)->cpu_capacity;
>  }
>  
> +static unsigned long capacity_orig_of(int cpu)
> +{
> +	return cpu_rq(cpu)->cpu_capacity_orig;
> +}
> +
>  static unsigned long cpu_avg_load_per_task(int cpu)
>  {
>  	struct rq *rq = cpu_rq(cpu);

This hunk should probably go into patch 6.

> @@ -4514,6 +4519,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
>  	return target;
>  }
>  
> +static int get_cpu_utilization(int cpu)
> +{
> +	unsigned long usage = cpu_rq(cpu)->cfs.usage_load_avg;
> +	unsigned long capacity = capacity_of(cpu);
> +
> +	if (usage >= SCHED_LOAD_SCALE)
> +		return capacity + 1;
> +
> +	return (usage * capacity) >> SCHED_LOAD_SHIFT;
> +}

So if I understood patch 9 correct, your changelog is iffy.
usage_load_avg should never get > 1 (of whatever unit), no matter how
many tasks are on the rq. You can only maximally run all the time.

Therefore I can only interpret the if (usage >= SCHED_LOAD_SCALE) as
numerical error handling, nothing more.

Also I'm not entirely sure I like the usage, utilization names/metrics.
I would suggest to reverse them. Call the pure running number
'utilization' and this scaled with capacity 'usage' or so.

> @@ -6188,7 +6206,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>  		/* Now, start updating sd_lb_stats */
>  		sds->total_load += sgs->group_load;
>  		sds->total_capacity += sgs->group_capacity;
> -
>  		sg = sg->next;
>  	} while (sg != env->sd->groups);
>  

I like that extra line of whitespace, it separates the body from the
loop itself.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Vincent Guittot Sept. 11, 2014, 1:07 p.m. UTC | #2
On 11 September 2014 14:34, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Aug 26, 2014 at 01:06:53PM +0200, Vincent Guittot wrote:
>> Monitor the utilization level of each group of each sched_domain level. The
>> utilization is the amount of cpu_capacity that is currently used on a CPU or
>> group of CPUs. We use the usage_load_avg to evaluate this utilization level.
>> In the special use case where the CPU is fully loaded by more than 1 task,
>> the activity level is set above the cpu_capacity in order to reflect the overload
>> of the CPU
>>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> ---
>>  kernel/sched/fair.c | 19 ++++++++++++++++++-
>>  1 file changed, 18 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 1fd2131..2f95d1c 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -4126,6 +4126,11 @@ static unsigned long capacity_of(int cpu)
>>       return cpu_rq(cpu)->cpu_capacity;
>>  }
>>
>> +static unsigned long capacity_orig_of(int cpu)
>> +{
>> +     return cpu_rq(cpu)->cpu_capacity_orig;
>> +}
>> +
>>  static unsigned long cpu_avg_load_per_task(int cpu)
>>  {
>>       struct rq *rq = cpu_rq(cpu);
>
> This hunk should probably go into patch 6.
>
>> @@ -4514,6 +4519,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
>>       return target;
>>  }
>>
>> +static int get_cpu_utilization(int cpu)
>> +{
>> +     unsigned long usage = cpu_rq(cpu)->cfs.usage_load_avg;
>> +     unsigned long capacity = capacity_of(cpu);
>> +
>> +     if (usage >= SCHED_LOAD_SCALE)
>> +             return capacity + 1;
>> +
>> +     return (usage * capacity) >> SCHED_LOAD_SHIFT;
>> +}
>
> So if I understood patch 9 correct, your changelog is iffy.
> usage_load_avg should never get > 1 (of whatever unit), no matter how
> many tasks are on the rq. You can only maximally run all the time.
>
> Therefore I can only interpret the if (usage >= SCHED_LOAD_SCALE) as
> numerical error handling, nothing more.

yes

>
> Also I'm not entirely sure I like the usage, utilization names/metrics.
> I would suggest to reverse them. Call the pure running number
> 'utilization' and this scaled with capacity 'usage' or so.

ok. i can invert 'usage' and 'utilization', which will give

s/get_cpu_utilization/get_cpu_usage/
s/sgs->group_utilization/sgs->group_usage/
s/cfs.usage_load_avg/cfs.utilization_load_avg/
s/se->avg.usage_avg_contrib/se->avg.utilization_avg_contrib
s/__update_task_entity_usage/__update_task_entity_utilization
s/__update_entity_usage_avg_contrib/__update_entity_utilization_avg_contrib

>
>> @@ -6188,7 +6206,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>>               /* Now, start updating sd_lb_stats */
>>               sds->total_load += sgs->group_load;
>>               sds->total_capacity += sgs->group_capacity;
>> -
>>               sg = sg->next;
>>       } while (sg != env->sd->groups);
>>
>
> I like that extra line of whitespace, it separates the body from the
> loop itself.

Sorry, i don't know why i remove it
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Peter Zijlstra Sept. 11, 2014, 2:04 p.m. UTC | #3
On Thu, Sep 11, 2014 at 03:07:52PM +0200, Vincent Guittot wrote:
> > Also I'm not entirely sure I like the usage, utilization names/metrics.
> > I would suggest to reverse them. Call the pure running number
> > 'utilization' and this scaled with capacity 'usage' or so.
> 
> ok. i can invert 'usage' and 'utilization', which will give
> 
> s/get_cpu_utilization/get_cpu_usage/
> s/sgs->group_utilization/sgs->group_usage/
> s/cfs.usage_load_avg/cfs.utilization_load_avg/
> s/se->avg.usage_avg_contrib/se->avg.utilization_avg_contrib
> s/__update_task_entity_usage/__update_task_entity_utilization
> s/__update_entity_usage_avg_contrib/__update_entity_utilization_avg_contrib
> 

Any other opinions before Vince goes and applies sed on patches? ;-)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Nicolas Pitre Sept. 11, 2014, 7:17 p.m. UTC | #4
On Thu, 11 Sep 2014, Peter Zijlstra wrote:

> On Thu, Sep 11, 2014 at 03:07:52PM +0200, Vincent Guittot wrote:
> > > Also I'm not entirely sure I like the usage, utilization names/metrics.
> > > I would suggest to reverse them. Call the pure running number
> > > 'utilization' and this scaled with capacity 'usage' or so.
> > 
> > ok. i can invert 'usage' and 'utilization', which will give
> > 
> > s/get_cpu_utilization/get_cpu_usage/
> > s/sgs->group_utilization/sgs->group_usage/
> > s/cfs.usage_load_avg/cfs.utilization_load_avg/
> > s/se->avg.usage_avg_contrib/se->avg.utilization_avg_contrib
> > s/__update_task_entity_usage/__update_task_entity_utilization
> > s/__update_entity_usage_avg_contrib/__update_entity_utilization_avg_contrib
> > 
> 
> Any other opinions before Vince goes and applies sed on patches? ;-)

I don't mind either way, but for sure someone (possibly me) is going to 
confuse the two soon enough.

Please include in the code some formal definition in the context of the 
scheduler.  A comment block right before the corresponding get_cpu_* 
accessors should be good enough.


Nicolas
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Vincent Guittot Sept. 12, 2014, 7:41 a.m. UTC | #5
On 11 September 2014 21:17, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:
>>
>> Any other opinions before Vince goes and applies sed on patches? ;-)
>
> I don't mind either way, but for sure someone (possibly me) is going to
> confuse the two soon enough.
>
> Please include in the code some formal definition in the context of the
> scheduler.  A comment block right before the corresponding get_cpu_*
> accessors should be good enough.

ok

>
>
> Nicolas
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Morten Rasmussen Sept. 15, 2014, 7:28 p.m. UTC | #6
On Thu, Sep 11, 2014 at 01:34:12PM +0100, Peter Zijlstra wrote:
> > @@ -4514,6 +4519,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
> >  	return target;
> >  }
> >  
> > +static int get_cpu_utilization(int cpu)
> > +{
> > +	unsigned long usage = cpu_rq(cpu)->cfs.usage_load_avg;
> > +	unsigned long capacity = capacity_of(cpu);
> > +
> > +	if (usage >= SCHED_LOAD_SCALE)
> > +		return capacity + 1;
> > +
> > +	return (usage * capacity) >> SCHED_LOAD_SHIFT;
> > +}
> 
> So if I understood patch 9 correct, your changelog is iffy.
> usage_load_avg should never get > 1 (of whatever unit), no matter how
> many tasks are on the rq. You can only maximally run all the time.
> 
> Therefore I can only interpret the if (usage >= SCHED_LOAD_SCALE) as
> numerical error handling, nothing more.

That is not entirely true unless you also classify transient usage
spikes due to task migrations as numerical errors as well.

Since each task sched_entity is carrying around 350ms worth of execution
history with it between different cpus and cpu utilization is based on
the sum of task entity usage_avg_contrib on the runqueue you may get
cfs.usage_load_avg > 1 temporarily after task migrations. It will
eventually converge to 1.

The same goes for new tasks which are initialized to have a
usage_avg_contrib of 1 and may be queued on cpu with tasks already
running. In that case cfs.usage_load_avg is temporarily unbounded.

> Also I'm not entirely sure I like the usage, utilization names/metrics.
> I would suggest to reverse them. Call the pure running number
> 'utilization' and this scaled with capacity 'usage' or so.

I can agree with calling running for utilization, but I'm not convienced
about capacity. What does it exactly cover here? I'm confused and
jetlagged.

Morten

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Morten Rasmussen Sept. 15, 2014, 7:45 p.m. UTC | #7
On Thu, Sep 11, 2014 at 03:04:44PM +0100, Peter Zijlstra wrote:
> On Thu, Sep 11, 2014 at 03:07:52PM +0200, Vincent Guittot wrote:
> > > Also I'm not entirely sure I like the usage, utilization names/metrics.
> > > I would suggest to reverse them. Call the pure running number
> > > 'utilization' and this scaled with capacity 'usage' or so.
> > 
> > ok. i can invert 'usage' and 'utilization', which will give
> > 
> > s/get_cpu_utilization/get_cpu_usage/
> > s/sgs->group_utilization/sgs->group_usage/

The confusion will have new dimensions added when we introduce
scale-invariance too. Then the running number is already scaled by the
current P-state compute capacity. But I don't have any better
suggestions. 

> > s/cfs.usage_load_avg/cfs.utilization_load_avg/

I don't like using "load" for unweighted metrics. I associate load with
something that may be weighted by priority like load_avg_contrib, and
utilization with pure cpu utilization as in how many cycles is spend on
a particular task. I called it "usage_util_avg" in my own patches, but
"util_avg" might be better if we agree that utilization == usage.

> > s/se->avg.usage_avg_contrib/se->avg.utilization_avg_contrib

util_avg_contrib maybe to keep it shorter.

> > s/__update_task_entity_usage/__update_task_entity_utilization
> > s/__update_entity_usage_avg_contrib/__update_entity_utilization_avg_contrib

Maybe use "util" here as well?

Morten

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Vincent Guittot Sept. 16, 2014, 10:43 p.m. UTC | #8
On 15 September 2014 12:45, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
> On Thu, Sep 11, 2014 at 03:04:44PM +0100, Peter Zijlstra wrote:
>> On Thu, Sep 11, 2014 at 03:07:52PM +0200, Vincent Guittot wrote:
>> > > Also I'm not entirely sure I like the usage, utilization names/metrics.
>> > > I would suggest to reverse them. Call the pure running number
>> > > 'utilization' and this scaled with capacity 'usage' or so.
>> >
>> > ok. i can invert 'usage' and 'utilization', which will give
>> >
>> > s/get_cpu_utilization/get_cpu_usage/
>> > s/sgs->group_utilization/sgs->group_usage/
>
> The confusion will have new dimensions added when we introduce
> scale-invariance too. Then the running number is already scaled by the
> current P-state compute capacity. But I don't have any better
> suggestions.
>
>> > s/cfs.usage_load_avg/cfs.utilization_load_avg/
>
> I don't like using "load" for unweighted metrics. I associate load with
> something that may be weighted by priority like load_avg_contrib, and
> utilization with pure cpu utilization as in how many cycles is spend on
> a particular task. I called it "usage_util_avg" in my own patches, but
> "util_avg" might be better if we agree that utilization == usage.

ok. so i don't have the same definition than you. IMHO, load should be
used for figures that have used the average of the geometric series
used in the per entity load tracking more than the fact that we weight
the figures with priority

>
>> > s/se->avg.usage_avg_contrib/se->avg.utilization_avg_contrib
>
> util_avg_contrib maybe to keep it shorter.
>
>> > s/__update_task_entity_usage/__update_task_entity_utilization
>> > s/__update_entity_usage_avg_contrib/__update_entity_utilization_avg_contrib
>
> Maybe use "util" here as well?

I agree that utilization can be a bit too long but util sounds a bit
too short ans we loose the utilization meaning. so we could use
activity instead of utilization

Nevertheless, the most important is that we find a common definition convention

Would the following proposal be ok ?

s/get_cpu_utilization/get_cpu_usage/
s/sgs->group_utilization/sgs->group_usage/
s/cfs.usage_load_avg/cfs.activity_load_avg/
s/se->avg.usage_avg_contrib/se->avg.activity_avg_contrib
s/__update_task_entity_usage/__update_task_entity_activity
s/__update_entity_usage_avg_contrib/__update_entity_activity_avg_contrib

Vincent
>
> Morten
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
diff mbox

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1fd2131..2f95d1c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4126,6 +4126,11 @@  static unsigned long capacity_of(int cpu)
 	return cpu_rq(cpu)->cpu_capacity;
 }
 
+static unsigned long capacity_orig_of(int cpu)
+{
+	return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -4514,6 +4519,17 @@  static int select_idle_sibling(struct task_struct *p, int target)
 	return target;
 }
 
+static int get_cpu_utilization(int cpu)
+{
+	unsigned long usage = cpu_rq(cpu)->cfs.usage_load_avg;
+	unsigned long capacity = capacity_of(cpu);
+
+	if (usage >= SCHED_LOAD_SCALE)
+		return capacity + 1;
+
+	return (usage * capacity) >> SCHED_LOAD_SHIFT;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5657,6 +5673,7 @@  struct sg_lb_stats {
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 	unsigned long load_per_task;
 	unsigned long group_capacity;
+	unsigned long group_utilization; /* Total utilization of the group */
 	unsigned int sum_nr_running; /* Nr tasks running in the group */
 	unsigned int group_capacity_factor;
 	unsigned int idle_cpus;
@@ -6011,6 +6028,7 @@  static inline void update_sg_lb_stats(struct lb_env *env,
 			load = source_load(i, load_idx);
 
 		sgs->group_load += load;
+		sgs->group_utilization += get_cpu_utilization(i);
 		sgs->sum_nr_running += rq->cfs.h_nr_running;
 
 		if (rq->nr_running > 1)
@@ -6188,7 +6206,6 @@  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		/* Now, start updating sd_lb_stats */
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
-
 		sg = sg->next;
 	} while (sg != env->sd->groups);