[RFC,2/5] sched/events: Introduce cfs_rq load tracking trace event

Message ID 20170328063541.12912-3-dietmar.eggemann@arm.com
State New
Series CFS load tracking trace events

Commit Message

Dietmar Eggemann March 28, 2017, 6:35 a.m. UTC
The trace event keys load and util (utilization) are mapped to:

 (1) load : cfs_rq->runnable_load_avg

 (2) util : cfs_rq->avg.util_avg

To let this trace event work for configurations w/ and w/o group
scheduling support for cfs (CONFIG_FAIR_GROUP_SCHED) the following
special handling is necessary for non-existent key=value pairs:

 path = "(null)" : In case of !CONFIG_FAIR_GROUP_SCHED.

 id   = -1       : In case of !CONFIG_FAIR_GROUP_SCHED.

The following list shows examples of the key=value pairs in different
configurations for:

 (1) a root task_group:

     cpu=4 path=/ id=1 load=6 util=331

 (2) a task_group:

     cpu=1 path=/tg1/tg11/tg111 id=4 load=538 util=522

 (3) an autogroup:

     cpu=3 path=/autogroup-18 id=0 load=997 util=517

 (4) w/o CONFIG_FAIR_GROUP_SCHED:

     cpu=0 path=(null) id=-1 load=314 util=289

The trace event is only defined for CONFIG_SMP.

The helper function __trace_sched_path() can be used to get the length
parameter of the dynamic array (path == NULL) and to copy the path into
it (path != NULL).

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/sched.h | 73 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |  9 ++++++
 2 files changed, 82 insertions(+)

-- 
2.11.0
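
[ The two-pass use of __trace_sched_path() described above appears in the
  patch at the bottom of this page roughly as follows (abridged excerpt,
  reproduced here only for easier reading): ]

	TP_STRUCT__entry(
		/* First call, path == NULL: returns the length to reserve. */
		__dynamic_array(char,		path,
				__trace_sched_path(cfs_rq, NULL, 0)	)
	),

	TP_fast_assign(
		/* Second call: copies the path into the reserved buffer. */
		__trace_sched_path(cfs_rq, __get_dynamic_array(path),
				   __get_dynamic_array_len(path));
	),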

Comments

Peter Zijlstra March 28, 2017, 7:56 a.m. UTC | #1
On Tue, Mar 28, 2017 at 07:35:38AM +0100, Dietmar Eggemann wrote:
> The trace event keys load and util (utilization) are mapped to:
> 
>  (1) load : cfs_rq->runnable_load_avg
> 
>  (2) util : cfs_rq->avg.util_avg
> 
> To let this trace event work for configurations w/ and w/o group
> scheduling support for cfs (CONFIG_FAIR_GROUP_SCHED) the following
> special handling is necessary for non-existent key=value pairs:
> 
>  path = "(null)" : In case of !CONFIG_FAIR_GROUP_SCHED.
> 
>  id   = -1       : In case of !CONFIG_FAIR_GROUP_SCHED.
> 
> The following list shows examples of the key=value pairs in different
> configurations for:
> 
>  (1) a root task_group:
> 
>      cpu=4 path=/ id=1 load=6 util=331

What's @id and why do we care?
Peter Zijlstra March 28, 2017, 8 a.m. UTC | #2
On Tue, Mar 28, 2017 at 07:35:38AM +0100, Dietmar Eggemann wrote:
>  
> +	if (cfs_rq)
> +		trace_sched_load_cfs_rq(cfs_rq);

You can do that with DEFINE_EVENT_CONDITION and TP_CONDITION.
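
[ For reference, a minimal sketch of the macro shape Peter refers to; the
  event class name "sched_load_template" is made up for illustration, the
  RFC as posted defines a standalone TRACE_EVENT instead: ]

/* Reuse an existing event class, but only fire the event when the
 * condition holds; the check sits behind the tracepoint's static key. */
DEFINE_EVENT_CONDITION(sched_load_template, sched_load_cfs_rq,
	TP_PROTO(struct cfs_rq *cfs_rq),
	TP_ARGS(cfs_rq),
	TP_CONDITION(cfs_rq != NULL));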
Dietmar Eggemann March 28, 2017, 1:30 p.m. UTC | #3
On 03/28/2017 09:56 AM, Peter Zijlstra wrote:
> On Tue, Mar 28, 2017 at 07:35:38AM +0100, Dietmar Eggemann wrote:

[...]

>>  (1) a root task_group:
>>
>>      cpu=4 path=/ id=1 load=6 util=331
>
> What's @id and why do we care?

It's a per cgroup/subsystem unique id for every task_group (cpu controller):

struct task_group {
	struct cgroup_subsys_state css {
		...
		int id;
		...
	}
	...
}

The root task group path=/ has id=1 and all autogroups have id=0.

I agree, this id is redundant in case we have the task_group path.
Steven Rostedt March 28, 2017, 2:46 p.m. UTC | #4
On Tue, 28 Mar 2017 07:35:38 +0100
Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:

>  /* This part must be outside protection */
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 03adf9fb48b1..ac19ab6ced8f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2950,6 +2950,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>  		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
>  	}
>  
> +	if (cfs_rq)
> +		trace_sched_load_cfs_rq(cfs_rq);
> +

Please use TRACE_EVENT_CONDITION(), and test for cfs_rq not NULL.

That way it moves the if (cfs_rq) out of the scheduler code and into
the jump label protected location. That is, the if is only tested when
tracing is enabled.

-- Steve


>  	return decayed;
>  }
>  
> @@ -3170,6 +3173,8 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
>  	update_tg_cfs_util(cfs_rq, se);
>  	update_tg_cfs_load(cfs_rq, se);
>  
> +	trace_sched_load_cfs_rq(cfs_rq);
> +
>  	return 1;
>  }
>  
> @@ -3359,6 +3364,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>  	set_tg_cfs_propagate(cfs_rq);
>  
>  	cfs_rq_util_change(cfs_rq);
> +
> +	trace_sched_load_cfs_rq(cfs_rq);
>  }
>  
>  /**
> @@ -3379,6 +3386,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>  	set_tg_cfs_propagate(cfs_rq);
>  
>  	cfs_rq_util_change(cfs_rq);
> +
> +	trace_sched_load_cfs_rq(cfs_rq);
>  }
>  
>  /* Add the load generated by se into cfs_rq's load average */
Steven Rostedt March 28, 2017, 2:47 p.m. UTC | #5
On Tue, 28 Mar 2017 10:00:50 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Mar 28, 2017 at 07:35:38AM +0100, Dietmar Eggemann wrote:
> >  
> > +	if (cfs_rq)
> > +		trace_sched_load_cfs_rq(cfs_rq);  
> 
> You can do that with DEFINE_EVENT_CONDITION and TP_CONDITION.

I just read this after I replied about using TRACE_EVENT_CONDITION. As
there is not a DEFINE_EVENT(), the TRACE_EVENT_CONDITION() should be
used. All the locations expect cfs_rq to not be NULL I assume.

-- Steve
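
[ A minimal sketch of the conditional form being discussed, abridged to the
  cpu/load/util fields of the posted TRACE_EVENT (the path/id fields and
  their helpers are omitted here); the TP_CONDITION() check only runs once
  the tracepoint's static key is enabled, which is the point Steven makes
  above: ]

TRACE_EVENT_CONDITION(sched_load_cfs_rq,

	TP_PROTO(struct cfs_rq *cfs_rq),

	TP_ARGS(cfs_rq),

	TP_CONDITION(cfs_rq != NULL),

	TP_STRUCT__entry(
		__field(	int,		cpu	)
		__field(	unsigned long,	load	)
		__field(	unsigned long,	util	)
	),

	TP_fast_assign(
		__entry->cpu	= __trace_sched_cpu(cfs_rq);
		__entry->load	= cfs_rq->runnable_load_avg;
		__entry->util	= cfs_rq->avg.util_avg;
	),

	TP_printk("cpu=%d load=%lu util=%lu",
		  __entry->cpu, __entry->load, __entry->util)
);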
Peter Zijlstra March 28, 2017, 4:44 p.m. UTC | #6
On Tue, Mar 28, 2017 at 10:46:00AM -0400, Steven Rostedt wrote:
> On Tue, 28 Mar 2017 07:35:38 +0100
> Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> 
> >  /* This part must be outside protection */
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 03adf9fb48b1..ac19ab6ced8f 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -2950,6 +2950,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> >  		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
> >  	}
> >  
> > +	if (cfs_rq)
> > +		trace_sched_load_cfs_rq(cfs_rq);
> > +
> 
> Please use TRACE_EVENT_CONDITION(), and test for cfs_rq not NULL.

I too suggested that; but then I looked again at that code and we can
actually do this. cfs_rq can be constant propagated and the if
determined at build time.

Its not immediately obvious from the current code; but if we do
something like the below, it should be clearer.

---
Subject: sched/fair: Explicitly generate __update_load_avg() instances
From: Peter Zijlstra <peterz@infradead.org>

Date: Tue Mar 28 11:08:20 CEST 2017

The __update_load_avg() function is an __always_inline because its
used with constant propagation to generate different variants of the
code without having to duplicate it (which would be prone to bugs).

Explicitly instantiate the 3 variants.

Note that most of this is called from rather hot paths, so reducing
branches is good.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2849,7 +2849,7 @@ static u32 __compute_runnable_contrib(u6
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
 static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
 	u64 delta, scaled_delta, periods;
@@ -2953,6 +2953,26 @@ __update_load_avg(u64 now, int cpu, stru
 	return decayed;
 }
 
+static int
+__update_load_avg_blocked_se(u64 now, int cpu, struct sched_avg *sa)
+{
+	return ___update_load_avg(now, cpu, sa, 0, 0, NULL);
+}
+
+static int
+__update_load_avg_se(u64 now, int cpu, struct sched_avg *sa,
+		     unsigned long weight, int running)
+{
+	return ___update_load_avg(now, cpu, sa, weight, running, NULL);
+}
+
+static int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+	return ___update_load_avg(now, cpu, sa, weight, running, cfs_rq);
+}
+
 /*
  * Signed add and clamp on underflow.
  *
@@ -3014,6 +3034,9 @@ static inline void update_tg_load_avg(st
 void set_task_rq_fair(struct sched_entity *se,
 		      struct cfs_rq *prev, struct cfs_rq *next)
 {
+	u64 p_last_update_time;
+	u64 n_last_update_time;
+
 	if (!sched_feat(ATTACH_AGE_LOAD))
 		return;
 
@@ -3024,11 +3047,11 @@ void set_task_rq_fair(struct sched_entit
 	 * time. This will result in the wakee task is less decayed, but giving
 	 * the wakee more load sounds not bad.
 	 */
-	if (se->avg.last_update_time && prev) {
-		u64 p_last_update_time;
-		u64 n_last_update_time;
+	if (!(se->avg.last_update_time && prev))
+		return;
 
 #ifndef CONFIG_64BIT
+	{
 		u64 p_last_update_time_copy;
 		u64 n_last_update_time_copy;
 
@@ -3043,14 +3066,15 @@ void set_task_rq_fair(struct sched_entit
 
 		} while (p_last_update_time != p_last_update_time_copy ||
 			 n_last_update_time != n_last_update_time_copy);
+	}
 #else
-		p_last_update_time = prev->avg.last_update_time;
-		n_last_update_time = next->avg.last_update_time;
+	p_last_update_time = prev->avg.last_update_time;
+	n_last_update_time = next->avg.last_update_time;
 #endif
-		__update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
-				  &se->avg, 0, 0, NULL);
-		se->avg.last_update_time = n_last_update_time;
-	}
+	__update_load_avg_blocked_se(p_last_update_time,
+				     cpu_of(rq_of(prev)),
+				     &se->avg);
+	se->avg.last_update_time = n_last_update_time;
 }
 
 /* Take into account change of utilization of a child task group */
@@ -3329,9 +3353,9 @@ static inline void update_load_avg(struc
 	 * track group sched_entity load average for task_h_load calc in migration
 	 */
 	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
-		__update_load_avg(now, cpu, &se->avg,
+		__update_load_avg_se(now, cpu, &se->avg,
 			  se->on_rq * scale_load_down(se->load.weight),
-			  cfs_rq->curr == se, NULL);
+			  cfs_rq->curr == se);
 	}
 
 	decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
@@ -3437,7 +3461,7 @@ void sync_entity_load_avg(struct sched_e
 	u64 last_update_time;
 
 	last_update_time = cfs_rq_last_update_time(cfs_rq);
-	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+	__update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg);
 }
 
 /*

Peter Zijlstra March 28, 2017, 4:57 p.m. UTC | #7
On Tue, Mar 28, 2017 at 06:44:59PM +0200, Peter Zijlstra wrote:
> ---
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2849,7 +2849,7 @@ static u32 __compute_runnable_contrib(u6
>   *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
>   */
>  static __always_inline int
> -__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> +___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>  		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
>  {
>  	u64 delta, scaled_delta, periods;
> @@ -2953,6 +2953,26 @@ __update_load_avg(u64 now, int cpu, stru
>  	return decayed;
>  }
>  
> +static int
> +__update_load_avg_blocked_se(u64 now, int cpu, struct sched_avg *sa)
> +{
> +	return ___update_load_avg(now, cpu, sa, 0, 0, NULL);
> +}
> +
> +static int
> +__update_load_avg_se(u64 now, int cpu, struct sched_avg *sa,
> +		     unsigned long weight, int running)
> +{
> +	return ___update_load_avg(now, cpu, sa, weight, running, NULL);
> +}
> +
> +static int
> +__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> +		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
> +{
> +	return ___update_load_avg(now, cpu, sa, weight, running, cfs_rq);

Although ideally we'd be able to tell the compiler that cfs_rq will not
be NULL here. Hurmph.. no __builtin for that I think :/

> +}
Patrick Bellasi March 28, 2017, 5:20 p.m. UTC | #8
On 28-Mar 18:57, Peter Zijlstra wrote:
> On Tue, Mar 28, 2017 at 06:44:59PM +0200, Peter Zijlstra wrote:
> > ---
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -2849,7 +2849,7 @@ static u32 __compute_runnable_contrib(u6
> >   *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
> >   */
> >  static __always_inline int
> > -__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> > +___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> >  		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
> >  {
> >  	u64 delta, scaled_delta, periods;
> > @@ -2953,6 +2953,26 @@ __update_load_avg(u64 now, int cpu, stru
> >  	return decayed;
> >  }
> >  
> > +static int
> > +__update_load_avg_blocked_se(u64 now, int cpu, struct sched_avg *sa)
> > +{
> > +	return ___update_load_avg(now, cpu, sa, 0, 0, NULL);
> > +}
> > +
> > +static int
> > +__update_load_avg_se(u64 now, int cpu, struct sched_avg *sa,
> > +		     unsigned long weight, int running)
> > +{
> > +	return ___update_load_avg(now, cpu, sa, weight, running, NULL);
> > +}
> > +
> > +static int
> > +__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> > +		  unsigned long weight, int running, struct cfs_rq *cfs_rq)

                  __attribute__((nonnull (6)));

> > +{
> > +	return ___update_load_avg(now, cpu, sa, weight, running, cfs_rq);
> 
> Although ideally we'd be able to tell the compiler that cfs_rq will not
> be NULL here. Hurmph.. no __builtin for that I think :/

What about the above attribute?

> 
> > +}

-- 
#include <best/regards.h>

Patrick Bellasi
Steven Rostedt March 28, 2017, 5:36 p.m. UTC | #9
On Tue, 28 Mar 2017 18:44:59 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Mar 28, 2017 at 10:46:00AM -0400, Steven Rostedt wrote:
> > On Tue, 28 Mar 2017 07:35:38 +0100
> > Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> >   
> > >  /* This part must be outside protection */
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 03adf9fb48b1..ac19ab6ced8f 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -2950,6 +2950,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> > >  		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
> > >  	}
> > >  
> > > +	if (cfs_rq)
> > > +		trace_sched_load_cfs_rq(cfs_rq);
> > > +  
> > 
> > Please use TRACE_EVENT_CONDITION(), and test for cfs_rq not NULL.  
> 
> I too suggested that; but then I looked again at that code and we can
> actually do this. cfs_rq can be constant propagated and the if
> determined at build time.
> 
> Its not immediately obvious from the current code; but if we do
> something like the below, it should be clearer.
> 

But why play games, and rely on the design of the code? A
TRACE_EVENT_CONDITION() is more robust and documents that this
tracepoint should not be called when cfs_rq is NULL.

-- Steve
Steven Rostedt March 28, 2017, 5:37 p.m. UTC | #10
On Tue, 28 Mar 2017 13:36:26 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> But why play games, and rely on the design of the code? A
> TRACE_EVENT_CONDITION() is more robust and documents that this
> tracepoint should not be called when cfs_rq is NULL.

In other words, what are you trying to save for not using the
TRACE_EVENT_CONDITION()?

-- Steve
Peter Zijlstra March 28, 2017, 6:18 p.m. UTC | #11
On Tue, Mar 28, 2017 at 06:20:05PM +0100, Patrick Bellasi wrote:
> On 28-Mar 18:57, Peter Zijlstra wrote:
> > On Tue, Mar 28, 2017 at 06:44:59PM +0200, Peter Zijlstra wrote:
> 
> > > +static int
> > > +__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> > > +		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
> 
>                   __attribute__((nonnull (6)));
> 
> > > +{
> > > +	return ___update_load_avg(now, cpu, sa, weight, running, cfs_rq);
> > 
> > Although ideally we'd be able to tell the compiler that cfs_rq will not
> > be NULL here. Hurmph.. no __builtin for that I think :/
> 
> What about the above attribute?

Ooh, shiny, thanks! My bad for failing to check the function attributes.
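
[ A small, self-contained illustration of the GCC function attribute being
  discussed; the function name and values below are made up for the
  example, only the attribute usage itself matters: ]

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/*
 * Parameter 1 is declared non-NULL: the compiler may drop NULL checks
 * on buf inside the function, and -Wnonnull warns about call sites
 * that pass an explicit NULL.
 */
static size_t copy_path(char *buf, size_t len, const char *path)
	__attribute__((nonnull(1)));

static size_t copy_path(char *buf, size_t len, const char *path)
{
	snprintf(buf, len, "%s", path ? path : "(null)");
	return strlen(buf);
}

int main(void)
{
	char buf[32];

	printf("%zu\n", copy_path(buf, sizeof(buf), "/tg1/tg11/tg111"));
	return 0;
}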
Dietmar Eggemann March 29, 2017, 8:37 p.m. UTC | #12
On 03/28/2017 07:37 PM, Steven Rostedt wrote:
> On Tue, 28 Mar 2017 13:36:26 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> But why play games, and rely on the design of the code? A
>> TRACE_EVENT_CONDITION() is more robust and documents that this
>> tracepoint should not be called when cfs_rq is NULL.
>
> In other words, what are you trying to save for not using the
> TRACE_EVENT_CONDITION()?

IMHO, if we could avoid this

   if(cfs_rq)
     trace_sched_load_cfs_rq(cfs_rq);
   else
     trace_sched_load_se(container_of(sa, struct sched_entity, avg));

in __update_load_avg(), then we can use 'unconditional' TRACE_EVENTs in 
all call-sites:

__update_load_avg{_cfs_rq}(), propagate_entity_load_avg(), 
attach_entity_load_avg(), detach_entity_load_avg() for cfs_rq and

__update_load_avg_blocked_se(), __update_load_avg_se(), 
propagate_entity_load_avg() for se.

[...]
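
[ A sketch of what that could look like on top of the explicit variants
  Peter proposes later in this thread (hypothetical, not a posted patch;
  trace_sched_load_se() is the per-entity event from elsewhere in this
  series): ]

static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
	int decayed = ___update_load_avg(now, cpu, &cfs_rq->avg,
				scale_load_down(cfs_rq->load.weight),
				cfs_rq->curr != NULL, cfs_rq);

	/* Unconditional: cfs_rq is known to be valid here. */
	trace_sched_load_cfs_rq(cfs_rq);

	return decayed;
}

static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
	int decayed = ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);

	/* Unconditional: this variant always traces the sched_entity. */
	trace_sched_load_se(se);

	return decayed;
}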
Dietmar Eggemann March 29, 2017, 9:03 p.m. UTC | #13
On 03/28/2017 06:44 PM, Peter Zijlstra wrote:
> On Tue, Mar 28, 2017 at 10:46:00AM -0400, Steven Rostedt wrote:
>> On Tue, 28 Mar 2017 07:35:38 +0100
>> Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:

[...]

> I too suggested that; but then I looked again at that code and we can
> actually do this. cfs_rq can be constant propagated and the if
> determined at build time.
>
> Its not immediately obvious from the current code; but if we do
> something like the below, it should be clearer.
>
> ---
> Subject: sched/fair: Explicitly generate __update_load_avg() instances
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue Mar 28 11:08:20 CEST 2017
>
> The __update_load_avg() function is an __always_inline because its
> used with constant propagation to generate different variants of the
> code without having to duplicate it (which would be prone to bugs).

Ah, so the if(cfs_rq)/else condition should stay in ___update_load_avg() 
and I shouldn't move the trace events into the 3 variants?

I tried to verify that the if is determined at build time but it's kind 
of hard with trace_events.

> Explicitly instantiate the 3 variants.
>
> Note that most of this is called from rather hot paths, so reducing
> branches is good.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2849,7 +2849,7 @@ static u32 __compute_runnable_contrib(u6
>   *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
>   */
>  static __always_inline int
> -__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> +___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>  		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
>  {
>  	u64 delta, scaled_delta, periods;
> @@ -2953,6 +2953,26 @@ __update_load_avg(u64 now, int cpu, stru
>  	return decayed;
>  }
>
> +static int
> +__update_load_avg_blocked_se(u64 now, int cpu, struct sched_avg *sa)
> +{
> +	return ___update_load_avg(now, cpu, sa, 0, 0, NULL);
> +}
> +
> +static int
> +__update_load_avg_se(u64 now, int cpu, struct sched_avg *sa,
> +		     unsigned long weight, int running)
> +{
> +	return ___update_load_avg(now, cpu, sa, weight, running, NULL);
> +}
> +
> +static int
> +__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> +		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
> +{
> +	return ___update_load_avg(now, cpu, sa, weight, running, cfs_rq);
> +}

Why not reduce the parameter list of these 3 incarnations to 'now, cpu, 
object'?

static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)

static int
__update_load_avg_se(u64 now, int cpu, struct sched_entity *se)

static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)

[...]
Peter Zijlstra March 30, 2017, 7:04 a.m. UTC | #14
On Wed, Mar 29, 2017 at 11:03:45PM +0200, Dietmar Eggemann wrote:

> > +static int
> > +__update_load_avg_blocked_se(u64 now, int cpu, struct sched_avg *sa)
> > +{
> > +	return ___update_load_avg(now, cpu, sa, 0, 0, NULL);
> > +}
> > +
> > +static int
> > +__update_load_avg_se(u64 now, int cpu, struct sched_avg *sa,
> > +		     unsigned long weight, int running)
> > +{
> > +	return ___update_load_avg(now, cpu, sa, weight, running, NULL);
> > +}
> > +
> > +static int
> > +__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> > +		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
> > +{
> > +	return ___update_load_avg(now, cpu, sa, weight, running, cfs_rq);
> > +}
> 
> Why not reduce the parameter list of these 3 incarnations to 'now, cpu,
> object'?
> 
> static int
> __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
> 
> static int
> __update_load_avg_se(u64 now, int cpu, struct sched_entity *se)
> 
> static int
> __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
> 
> [...]

doesn't quite work with se, but yes good idea.

And this way we don't need the nonnull attribute either, because it
should be clear from having dereferenced it that it cannot be null.

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2849,7 +2849,7 @@ static u32 __compute_runnable_contrib(u6
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
 static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
 	u64 delta, scaled_delta, periods;
@@ -2953,6 +2953,28 @@ __update_load_avg(u64 now, int cpu, stru
 	return decayed;
 }
 
+static int
+__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+{
+	return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+}
+
+static int
+__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	return ___update_load_avg(now, cpu, &se->avg,
+				  se->on_rq * scale_load_down(se->load.weight),
+				  cfs_rq->curr == se, NULL);
+}
+
+static int
+__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+{
+	return ___update_load_avg(now, cpu, &cfs_rq->avg,
+			scale_load_down(cfs_rq->load.weight),
+			cfs_rq->curr != NULL, cfs_rq);
+}
+
 /*
  * Signed add and clamp on underflow.
  *
@@ -3014,6 +3036,9 @@ static inline void update_tg_load_avg(st
 void set_task_rq_fair(struct sched_entity *se,
 		      struct cfs_rq *prev, struct cfs_rq *next)
 {
+	u64 p_last_update_time;
+	u64 n_last_update_time;
+
 	if (!sched_feat(ATTACH_AGE_LOAD))
 		return;
 
@@ -3024,11 +3049,11 @@ void set_task_rq_fair(struct sched_entit
 	 * time. This will result in the wakee task is less decayed, but giving
 	 * the wakee more load sounds not bad.
 	 */
-	if (se->avg.last_update_time && prev) {
-		u64 p_last_update_time;
-		u64 n_last_update_time;
+	if (!(se->avg.last_update_time && prev))
+		return;
 
 #ifndef CONFIG_64BIT
+	{
 		u64 p_last_update_time_copy;
 		u64 n_last_update_time_copy;
 
@@ -3043,14 +3068,13 @@ void set_task_rq_fair(struct sched_entit
 
 		} while (p_last_update_time != p_last_update_time_copy ||
 			 n_last_update_time != n_last_update_time_copy);
+	}
 #else
-		p_last_update_time = prev->avg.last_update_time;
-		n_last_update_time = next->avg.last_update_time;
+	p_last_update_time = prev->avg.last_update_time;
+	n_last_update_time = next->avg.last_update_time;
 #endif
-		__update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
-				  &se->avg, 0, 0, NULL);
-		se->avg.last_update_time = n_last_update_time;
-	}
+	__update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+	se->avg.last_update_time = n_last_update_time;
 }
 
 /* Take into account change of utilization of a child task group */
@@ -3295,8 +3319,7 @@ update_cfs_rq_load_avg(u64 now, struct c
 		set_tg_cfs_propagate(cfs_rq);
 	}
 
-	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-		scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
+	decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
 
 #ifndef CONFIG_64BIT
 	smp_wmb();
@@ -3328,11 +3351,8 @@ static inline void update_load_avg(struc
 	 * Track task load average for carrying it to new CPU after migrated, and
 	 * track group sched_entity load average for task_h_load calc in migration
 	 */
-	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
-		__update_load_avg(now, cpu, &se->avg,
-			  se->on_rq * scale_load_down(se->load.weight),
-			  cfs_rq->curr == se, NULL);
-	}
+	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+		__update_load_avg_se(now, cpu, cfs_rq, se);
 
 	decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
 	decayed |= propagate_entity_load_avg(se);
@@ -3437,7 +3457,7 @@ void sync_entity_load_avg(struct sched_e
 	u64 last_update_time;
 
 	last_update_time = cfs_rq_last_update_time(cfs_rq);
-	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+	__update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
 }
 
 /*

Dietmar Eggemann March 30, 2017, 7:46 a.m. UTC | #15
On 03/30/2017 09:04 AM, Peter Zijlstra wrote:
> On Wed, Mar 29, 2017 at 11:03:45PM +0200, Dietmar Eggemann wrote:

[...]

>> Why not reduce the parameter list of these 3 incarnations to 'now, cpu,
>> object'?
>>
>> static int
>> __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
>>
>> static int
>> __update_load_avg_se(u64 now, int cpu, struct sched_entity *se)
>>
>> static int
>> __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
>>
>> [...]
>
> doesn't quite work with se, but yes good idea.

Ah, OK, you don't like to use 'cfs_rq_of(se)->curr == se' in 
__update_load_avg_se(). The reason is that it's already fetched in 
update_load_avg()?

> And this way we don't need the nonnull attribute either, because it
> should be clear from having dereferenced it that it cannot be null.

Yes, this would be clearer now.

[...]

Patch

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9e3ef6c99e4b..51db8a90e45f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -562,6 +562,79 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
 
 	TP_printk("cpu=%d", __entry->cpu)
 );
+
+#ifdef CONFIG_SMP
+#ifdef CREATE_TRACE_POINTS
+static inline
+int __trace_sched_cpu(struct cfs_rq *cfs_rq)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rq *rq = cfs_rq->rq;
+#else
+	struct rq *rq = container_of(cfs_rq, struct rq, cfs);
+#endif
+	return cpu_of(rq);
+}
+
+static inline
+int __trace_sched_path(struct cfs_rq *cfs_rq, char *path, int len)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	int l = path ? len : 0;
+
+	if (task_group_is_autogroup(cfs_rq->tg))
+		return autogroup_path(cfs_rq->tg, path, l) + 1;
+	else
+		return cgroup_path(cfs_rq->tg->css.cgroup, path, l) + 1;
+#else
+	if (path)
+		strcpy(path, "(null)");
+
+	return strlen("(null)");
+#endif
+}
+
+static inline int __trace_sched_id(struct cfs_rq *cfs_rq)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	return cfs_rq->tg->css.id;
+#else
+	return -1;
+#endif
+}
+#endif /* CREATE_TRACE_POINTS */
+
+/*
+ * Tracepoint for cfs_rq load tracking:
+ */
+TRACE_EVENT(sched_load_cfs_rq,
+
+	TP_PROTO(struct cfs_rq *cfs_rq),
+
+	TP_ARGS(cfs_rq),
+
+	TP_STRUCT__entry(
+		__field(	int,		cpu			)
+		__dynamic_array(char,		path,
+				__trace_sched_path(cfs_rq, NULL, 0)	)
+		__field(	int,		id			)
+		__field(	unsigned long,	load			)
+		__field(	unsigned long,	util			)
+	),
+
+	TP_fast_assign(
+		__entry->cpu	= __trace_sched_cpu(cfs_rq);
+		__trace_sched_path(cfs_rq, __get_dynamic_array(path),
+				   __get_dynamic_array_len(path));
+		__entry->id	= __trace_sched_id(cfs_rq);
+		__entry->load	= cfs_rq->runnable_load_avg;
+		__entry->util	= cfs_rq->avg.util_avg;
+	),
+
+	TP_printk("cpu=%d path=%s id=%d load=%lu util=%lu",  __entry->cpu,
+		  __get_str(path), __entry->id, __entry->load, __entry->util)
+);
+#endif /* CONFIG_SMP */
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03adf9fb48b1..ac19ab6ced8f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2950,6 +2950,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 	}
 
+	if (cfs_rq)
+		trace_sched_load_cfs_rq(cfs_rq);
+
 	return decayed;
 }
 
@@ -3170,6 +3173,8 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
 	update_tg_cfs_util(cfs_rq, se);
 	update_tg_cfs_load(cfs_rq, se);
 
+	trace_sched_load_cfs_rq(cfs_rq);
+
 	return 1;
 }
 
@@ -3359,6 +3364,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	set_tg_cfs_propagate(cfs_rq);
 
 	cfs_rq_util_change(cfs_rq);
+
+	trace_sched_load_cfs_rq(cfs_rq);
 }
 
 /**
@@ -3379,6 +3386,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	set_tg_cfs_propagate(cfs_rq);
 
 	cfs_rq_util_change(cfs_rq);
+
+	trace_sched_load_cfs_rq(cfs_rq);
 }
 
 /* Add the load generated by se into cfs_rq's load average */