
[RFC,4/5] sched/events: Introduce sched_entity load tracking trace event

Message ID 20170328063541.12912-5-dietmar.eggemann@arm.com
State New
Series CFS load tracking trace events

Commit Message

Dietmar Eggemann March 28, 2017, 6:35 a.m. UTC
The trace event keys load and util (utilization) are mapped to:

 (1) load : se->avg.load_avg

 (2) util : se->avg.util_avg
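
For reference, a trimmed sketch of struct sched_avg as it looks around this
series (see include/linux/sched.h); the event only reads the two averaged
values:

struct sched_avg {
	u64		last_update_time, load_sum;
	u32		util_sum, period_contrib;
	unsigned long	load_avg;	/* -> load */
	unsigned long	util_avg;	/* -> util */
};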

To let this trace event work for configurations w/ and w/o group
scheduling support for cfs (CONFIG_FAIR_GROUP_SCHED), the following
special handling is necessary for non-existent key=value pairs:

 path = "(null)" : In case of !CONFIG_FAIR_GROUP_SCHED or the
                   sched_entity represents a task.

 id   = -1       : In case of !CONFIG_FAIR_GROUP_SCHED or the
                   sched_entity represents a task.

 comm = "(null)" : In case sched_entity represents a task_group.

 pid = -1        : In case sched_entity represents a task_group.

The following list shows examples of the key=value pairs in different
configurations for:

 (1) a task:

     cpu=0 path=(null) id=-1 comm=sshd pid=2206 load=102 util=102

 (2) a taskgroup:

     cpu=1 path=/tg1/tg11/tg111 id=4 comm=(null) pid=-1 load=882 util=510

 (3) an autogroup:

     cpu=0 path=/autogroup-13 id=0 comm=(null) pid=-1 load=49 util=48

 (4) w/o CONFIG_FAIR_GROUP_SCHED:

     cpu=0 path=(null) id=-1 comm=sshd pid=2211 load=301 util=265

The trace event is only defined for CONFIG_SMP.

The helper functions __trace_sched_cpu(), __trace_sched_path() and
__trace_sched_id() are extended to deal with sched_entities as well.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/sched.h | 63 +++++++++++++++++++++++++++++++++++---------
 kernel/sched/fair.c          |  3 +++
 2 files changed, 54 insertions(+), 12 deletions(-)

-- 
2.11.0

Comments

Peter Zijlstra March 28, 2017, 8:05 a.m. UTC | #1
On Tue, Mar 28, 2017 at 07:35:40AM +0100, Dietmar Eggemann wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 04d4f81b96ae..d1dcb19f5b55 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2940,6 +2940,8 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>  
>  	if (cfs_rq)
>  		trace_sched_load_cfs_rq(cfs_rq);
> +	else
> +		trace_sched_load_se(container_of(sa, struct sched_entity, avg));
>  
>  	return decayed;
>  }
> @@ -3162,6 +3164,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
>  	update_tg_cfs_load(cfs_rq, se);
>  
>  	trace_sched_load_cfs_rq(cfs_rq);
> +	trace_sched_load_se(se);
>  
>  	return 1;
>  }

Having back-to-back tracepoints is disgusting.
Peter Zijlstra March 28, 2017, 8:08 a.m. UTC | #2
On Tue, Mar 28, 2017 at 07:35:40AM +0100, Dietmar Eggemann wrote:
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index 51db8a90e45f..647cfaf528fd 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -566,14 +566,15 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
>  #ifdef CONFIG_SMP
>  #ifdef CREATE_TRACE_POINTS
>  static inline
> -int __trace_sched_cpu(struct cfs_rq *cfs_rq)
> +int __trace_sched_cpu(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> -	struct rq *rq = cfs_rq->rq;
> +	struct rq *rq = cfs_rq ? cfs_rq->rq : NULL;
>  #else
> -	struct rq *rq = container_of(cfs_rq, struct rq, cfs);
> +	struct rq *rq = cfs_rq ? container_of(cfs_rq, struct rq, cfs) : NULL;
>  #endif
> -	return cpu_of(rq);
> +	return rq ? cpu_of(rq)
> +		  : task_cpu((container_of(se, struct task_struct, se)));
>  }

So here you duplicate lots of FAIR_GROUP internals. So why do you then
have to expose group_cfs_rq() in the previous patch?
Dietmar Eggemann March 28, 2017, 2:01 p.m. UTC | #3
On 03/28/2017 10:05 AM, Peter Zijlstra wrote:
> On Tue, Mar 28, 2017 at 07:35:40AM +0100, Dietmar Eggemann wrote:
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 04d4f81b96ae..d1dcb19f5b55 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -2940,6 +2940,8 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>>
>>  	if (cfs_rq)
>>  		trace_sched_load_cfs_rq(cfs_rq);
>> +	else
>> +		trace_sched_load_se(container_of(sa, struct sched_entity, avg));
>>
>>  	return decayed;
>>  }
>> @@ -3162,6 +3164,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
>>  	update_tg_cfs_load(cfs_rq, se);
>>
>>  	trace_sched_load_cfs_rq(cfs_rq);
>> +	trace_sched_load_se(se);
>>
>>  	return 1;
>>  }
>
> Having back-to-back tracepoints is disgusting.
>

Yeah, avoiding putting them back to back like this is hard, since 
update_tg_cfs_util()/update_tg_cfs_load() refresh util (the former) and 
load/runnable_load (the latter) for both the cfs_rq and the se.
Dietmar Eggemann March 28, 2017, 2:03 p.m. UTC | #4
On 03/28/2017 10:05 AM, Peter Zijlstra wrote:
> On Tue, Mar 28, 2017 at 07:35:40AM +0100, Dietmar Eggemann wrote:
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 04d4f81b96ae..d1dcb19f5b55 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -2940,6 +2940,8 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>>
>>  	if (cfs_rq)
>>  		trace_sched_load_cfs_rq(cfs_rq);
>> +	else
>> +		trace_sched_load_se(container_of(sa, struct sched_entity, avg));
>>
>>  	return decayed;
>>  }
>> @@ -3162,6 +3164,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
>>  	update_tg_cfs_load(cfs_rq, se);
>>
>>  	trace_sched_load_cfs_rq(cfs_rq);
>> +	trace_sched_load_se(se);
>>
>>  	return 1;
>>  }
>
> Having back-to-back tracepoints is disgusting.

Yeah, but avoiding putting them back to back like this is hard, since the 
calls to update_tg_cfs_util() and update_tg_cfs_load() inside 
propagate_entity_load_avg() refresh util (the former) and load/runnable_load 
(the latter) for both the cfs_rq and the se.
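
A simplified sketch of the flow in question (paraphrased from
propagate_entity_load_avg() in kernel/sched/fair.c with this series applied,
not the literal code) shows why the two events end up next to each other:

static inline int propagate_entity_load_avg(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	if (entity_is_task(se))
		return 0;

	if (!test_and_clear_tg_cfs_propagate(se))
		return 0;

	set_tg_cfs_propagate(cfs_rq);

	update_tg_cfs_util(cfs_rq, se);	/* refreshes util for cfs_rq and se */
	update_tg_cfs_load(cfs_rq, se);	/* refreshes load/runnable_load for both */

	/* both the group cfs_rq and the se changed here, hence two events */
	trace_sched_load_cfs_rq(cfs_rq);
	trace_sched_load_se(se);

	return 1;
}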
Dietmar Eggemann March 28, 2017, 2:13 p.m. UTC | #5
On 03/28/2017 10:08 AM, Peter Zijlstra wrote:
> On Tue, Mar 28, 2017 at 07:35:40AM +0100, Dietmar Eggemann wrote:
>> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
>> index 51db8a90e45f..647cfaf528fd 100644
>> --- a/include/trace/events/sched.h
>> +++ b/include/trace/events/sched.h
>> @@ -566,14 +566,15 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
>>  #ifdef CONFIG_SMP
>>  #ifdef CREATE_TRACE_POINTS
>>  static inline
>> -int __trace_sched_cpu(struct cfs_rq *cfs_rq)
>> +int __trace_sched_cpu(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>  {
>>  #ifdef CONFIG_FAIR_GROUP_SCHED
>> -	struct rq *rq = cfs_rq->rq;
>> +	struct rq *rq = cfs_rq ? cfs_rq->rq : NULL;
>>  #else
>> -	struct rq *rq = container_of(cfs_rq, struct rq, cfs);
>> +	struct rq *rq = cfs_rq ? container_of(cfs_rq, struct rq, cfs) : NULL;
>>  #endif
>> -	return cpu_of(rq);
>> +	return rq ? cpu_of(rq)
>> +		  : task_cpu((container_of(se, struct task_struct, se)));
>>  }
>
> So here you duplicate lots of FAIR_GROUP internals. So why do you then
> have to expose group_cfs_rq() in the previous patch?
>

Not having group_cfs_rq() available made the trace event code too 
confusing.

But like I mentioned in the second-to-last paragraph of the cover 
letter, having all the necessary cfs accessor functions (rq_of(), task_of(), 
etc.) available would definitely streamline the coding of these trace events.

Do you think that making them public in include/linux/sched.h is the way 
to go? What about the namespace issue with other sched classes? Should 
they be exported with the names they have right now (since cfs was there 
first), or should they be renamed to cfs_task_of(), rq_of_cfs_rq(), etc.?

The RT and Deadline classes already have their own (private) accessor 
functions (e.g. dl_task_of() or rq_of_dl_rq()).
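
For reference, those deadline helpers are private to kernel/sched/deadline.c
and look roughly like this; the cfs counterparts task_of()/rq_of() in
kernel/sched/fair.c follow the same container_of()/pointer-chase pattern,
which is what raises the naming question above:

static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
{
	return container_of(dl_se, struct task_struct, dl);
}

static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
{
	return container_of(dl_rq, struct rq, dl);
}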
Peter Zijlstra March 28, 2017, 4:41 p.m. UTC | #6
On Tue, Mar 28, 2017 at 04:13:45PM +0200, Dietmar Eggemann wrote:
> Do you think that making them public in include/linux/sched.h is the way to
> go?

No; all that stuff should really stay private. tracepoints are a very
bad reason to leak this stuff.
Dietmar Eggemann March 29, 2017, 8:19 p.m. UTC | #7
On 03/28/2017 06:41 PM, Peter Zijlstra wrote:
> On Tue, Mar 28, 2017 at 04:13:45PM +0200, Dietmar Eggemann wrote:
>
>> Do you think that making them public in include/linux/sched.h is the way to
>> go?
>
> No; all that stuff should really stay private. tracepoints are a very
> bad reason to leak this stuff.

Understood & makes sense to me. In hindsight, it's not too complicated 
to code group_cfs_rq() in include/trace/events/sched.h.
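
Something along these lines should do, kept local to the trace header under
CREATE_TRACE_POINTS (just a sketch of the idea, not what this series posted;
the helper name is made up):

#ifdef CREATE_TRACE_POINTS
static inline struct cfs_rq *__trace_sched_group_cfs_rq(struct sched_entity *se)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	return se->my_q;	/* cfs_rq "owned" by a group se, NULL for a task */
#else
	return NULL;		/* w/o group scheduling every se is a task */
#endif
}
#endif /* CREATE_TRACE_POINTS */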

Patch

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 51db8a90e45f..647cfaf528fd 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -566,14 +566,15 @@  TRACE_EVENT(sched_wake_idle_without_ipi,
 #ifdef CONFIG_SMP
 #ifdef CREATE_TRACE_POINTS
 static inline
-int __trace_sched_cpu(struct cfs_rq *cfs_rq)
+int __trace_sched_cpu(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	struct rq *rq = cfs_rq->rq;
+	struct rq *rq = cfs_rq ? cfs_rq->rq : NULL;
 #else
-	struct rq *rq = container_of(cfs_rq, struct rq, cfs);
+	struct rq *rq = cfs_rq ? container_of(cfs_rq, struct rq, cfs) : NULL;
 #endif
-	return cpu_of(rq);
+	return rq ? cpu_of(rq)
+		  : task_cpu((container_of(se, struct task_struct, se)));
 }
 
 static inline
@@ -582,25 +583,24 @@  int __trace_sched_path(struct cfs_rq *cfs_rq, char *path, int len)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	int l = path ? len : 0;
 
-	if (task_group_is_autogroup(cfs_rq->tg))
+	if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
 		return autogroup_path(cfs_rq->tg, path, l) + 1;
-	else
+	else if (cfs_rq && cfs_rq->tg->css.cgroup)
 		return cgroup_path(cfs_rq->tg->css.cgroup, path, l) + 1;
-#else
+#endif
 	if (path)
 		strcpy(path, "(null)");
 
 	return strlen("(null)");
-#endif
 }
 
 static inline int __trace_sched_id(struct cfs_rq *cfs_rq)
 {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	return cfs_rq->tg->css.id;
-#else
-	return -1;
+	if (cfs_rq)
+		return cfs_rq->tg->css.id;
 #endif
+	return -1;
 }
 #endif /* CREATE_TRACE_POINTS */
 
@@ -623,7 +623,7 @@  TRACE_EVENT(sched_load_cfs_rq,
 	),
 
 	TP_fast_assign(
-		__entry->cpu	= __trace_sched_cpu(cfs_rq);
+		__entry->cpu	= __trace_sched_cpu(cfs_rq, NULL);
 		__trace_sched_path(cfs_rq, __get_dynamic_array(path),
 				   __get_dynamic_array_len(path));
 		__entry->id	= __trace_sched_id(cfs_rq);
@@ -634,6 +634,45 @@  TRACE_EVENT(sched_load_cfs_rq,
 	TP_printk("cpu=%d path=%s id=%d load=%lu util=%lu",  __entry->cpu,
 		  __get_str(path), __entry->id, __entry->load, __entry->util)
 );
+
+/*
+ * Tracepoint for sched_entity load tracking:
+ */
+TRACE_EVENT(sched_load_se,
+
+	TP_PROTO(struct sched_entity *se),
+
+	TP_ARGS(se),
+
+	TP_STRUCT__entry(
+		__field(	int,		cpu			      )
+		__dynamic_array(char,		path,
+				__trace_sched_path(group_cfs_rq(se), NULL, 0) )
+		__field(	int,		id			      )
+		__array(	char,		comm,	TASK_COMM_LEN	      )
+		__field(	pid_t,		pid			      )
+		__field(	unsigned long,	load			      )
+		__field(	unsigned long,	util			      )
+	),
+
+	TP_fast_assign(
+		struct task_struct *p = group_cfs_rq(se) ? NULL
+				    : container_of(se, struct task_struct, se);
+
+		__entry->cpu = __trace_sched_cpu(group_cfs_rq(se), se);
+		__trace_sched_path(group_cfs_rq(se), __get_dynamic_array(path),
+				   __get_dynamic_array_len(path));
+		__entry->id = __trace_sched_id(group_cfs_rq(se));
+		memcpy(__entry->comm, p ? p->comm : "(null)", TASK_COMM_LEN);
+		__entry->pid = p ? p->pid : -1;
+		__entry->load = se->avg.load_avg;
+		__entry->util = se->avg.util_avg;
+	),
+
+	TP_printk("cpu=%d path=%s id=%d comm=%s pid=%d load=%lu util=%lu",
+		  __entry->cpu, __get_str(path), __entry->id, __entry->comm,
+		  __entry->pid, __entry->load, __entry->util)
+);
 #endif /* CONFIG_SMP */
 #endif /* _TRACE_SCHED_H */
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04d4f81b96ae..d1dcb19f5b55 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2940,6 +2940,8 @@  __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 
 	if (cfs_rq)
 		trace_sched_load_cfs_rq(cfs_rq);
+	else
+		trace_sched_load_se(container_of(sa, struct sched_entity, avg));
 
 	return decayed;
 }
@@ -3162,6 +3164,7 @@  static inline int propagate_entity_load_avg(struct sched_entity *se)
 	update_tg_cfs_load(cfs_rq, se);
 
 	trace_sched_load_cfs_rq(cfs_rq);
+	trace_sched_load_se(se);
 
 	return 1;
 }