diff mbox

[v2,1/2] perf: Add sampling of the raw monotonic clock

Message ID 1411491787-25938-2-git-send-email-pawel.moll@arm.com
State New
Headers show

Commit Message

Pawel Moll Sept. 23, 2014, 5:03 p.m. UTC
This patch adds an option to sample raw monotonic clock
value with any perf event, with the the aim of allowing
time correlation between data coming from perf and
additional performance-related information generated in
userspace.

In order to correlate timestamps in perf data stream
with events happening in userspace (be it JITed debug
symbols or hwmon-originating environment data), user
requests a more or less periodic event (sched_switch
trace event of a hrtimer-based cpu-clock being the
most obvious examples) with PERF_SAMPLE_TIME *and*
PERF_SAMPLE_CLOCK_RAW_MONOTONIC and stamps
user-originating data with values obtained from
clock_gettime(CLOCK_MONOTONIC_RAW). Then, during
analysis, one looks at the perf events immediately
preceding and following (in terms of the
clock_raw_monotonic sample) the userspace event and
does simple linear approximation to get the equivalent
perf time.

        perf event     user event
       -----O--------------+-------------O------> t_mono
            :              |             :
            :              V             :
       -----O----------------------------O------> t_perf

Signed-off-by: Pawel Moll <pawel.moll@arm.com>
---
Changes since v1:

- none

 include/linux/perf_event.h      |  2 ++
 include/uapi/linux/perf_event.h |  4 +++-
 kernel/events/core.c            | 13 +++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

Comments

Namhyung Kim Sept. 24, 2014, 5:41 a.m. UTC | #1
Hi Pawel,

On Tue, 23 Sep 2014 18:03:06 +0100, Pawel Moll wrote:
> This patch adds an option to sample raw monotonic clock
> value with any perf event, with the the aim of allowing
> time correlation between data coming from perf and
> additional performance-related information generated in
> userspace.
>
> In order to correlate timestamps in perf data stream
> with events happening in userspace (be it JITed debug
> symbols or hwmon-originating environment data), user
> requests a more or less periodic event (sched_switch
> trace event of a hrtimer-based cpu-clock being the
> most obvious examples) with PERF_SAMPLE_TIME *and*
> PERF_SAMPLE_CLOCK_RAW_MONOTONIC and stamps
> user-originating data with values obtained from
> clock_gettime(CLOCK_MONOTONIC_RAW). Then, during
> analysis, one looks at the perf events immediately
> preceding and following (in terms of the
> clock_raw_monotonic sample) the userspace event and
> does simple linear approximation to get the equivalent
> perf time.
>
>         perf event     user event
>        -----O--------------+-------------O------> t_mono
>             :              |             :
>             :              V             :
>        -----O----------------------------O------> t_perf

Probably a dumb question: why not make PERF_SAMPLE_TIME being monotonic
clock instead of adding a new PERF_SAMPLE_CLOCK_XXX flag?  Maybe we can
add a new ioctl command like PERF_EVENT_IOC_SET_CLOCK so that one can
pass a clock id.

Thanks,
Namhyung


>
> Signed-off-by: Pawel Moll <pawel.moll@arm.com>
> ---
> Changes since v1:
>
> - none
>
>  include/linux/perf_event.h      |  2 ++
>  include/uapi/linux/perf_event.h |  4 +++-
>  kernel/events/core.c            | 13 +++++++++++++
>  3 files changed, 18 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 707617a..28b73b2 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -602,6 +602,8 @@ struct perf_sample_data {
>  	 * Transaction flags for abort events:
>  	 */
>  	u64				txn;
> +	/* Raw monotonic timestamp, for userspace time correlation */
> +	u64				clock_raw_monotonic;
>  };
>  
>  static inline void perf_sample_data_init(struct perf_sample_data *data,
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 9269de2..e5a75c5 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -137,8 +137,9 @@ enum perf_event_sample_format {
>  	PERF_SAMPLE_DATA_SRC			= 1U << 15,
>  	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
>  	PERF_SAMPLE_TRANSACTION			= 1U << 17,
> +	PERF_SAMPLE_CLOCK_RAW_MONOTONIC		= 1U << 18,
>  
> -	PERF_SAMPLE_MAX = 1U << 18,		/* non-ABI */
> +	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
>  };
>  
>  /*
> @@ -686,6 +687,7 @@ enum perf_event_type {
>  	 *	{ u64			weight;   } && PERF_SAMPLE_WEIGHT
>  	 *	{ u64			data_src; } && PERF_SAMPLE_DATA_SRC
>  	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
> +	 *	{ u64			clock_raw_monotonic; } && PERF_SAMPLE_CLOCK_RAW_MONOTONIC
>  	 * };
>  	 */
>  	PERF_RECORD_SAMPLE			= 9,
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index f9c1ed0..f6df547 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1216,6 +1216,9 @@ static void perf_event__header_size(struct perf_event *event)
>  	if (sample_type & PERF_SAMPLE_TRANSACTION)
>  		size += sizeof(data->txn);
>  
> +	if (sample_type & PERF_SAMPLE_CLOCK_RAW_MONOTONIC)
> +		size += sizeof(data->clock_raw_monotonic);
> +
>  	event->header_size = size;
>  }
>  
> @@ -4456,6 +4459,13 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
>  		data->cpu_entry.cpu	 = raw_smp_processor_id();
>  		data->cpu_entry.reserved = 0;
>  	}
> +
> +	if (sample_type & PERF_SAMPLE_CLOCK_RAW_MONOTONIC) {
> +		struct timespec now;
> +
> +		getrawmonotonic(&now);
> +		data->clock_raw_monotonic = timespec_to_ns(&now);
> +	}
>  }
>  
>  void perf_event_header__init_id(struct perf_event_header *header,
> @@ -4714,6 +4724,9 @@ void perf_output_sample(struct perf_output_handle *handle,
>  	if (sample_type & PERF_SAMPLE_TRANSACTION)
>  		perf_output_put(handle, data->txn);
>  
> +	if (sample_type & PERF_SAMPLE_CLOCK_RAW_MONOTONIC)
> +		perf_output_put(handle, data->clock_raw_monotonic);
> +
>  	if (!event->attr.watermark) {
>  		int wakeup_events = event->attr.wakeup_events;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Pawel Moll Sept. 25, 2014, 10:49 a.m. UTC | #2
On Wed, 2014-09-24 at 06:41 +0100, Namhyung Kim wrote:
> Hi Pawel,
> 
> On Tue, 23 Sep 2014 18:03:06 +0100, Pawel Moll wrote:
> > This patch adds an option to sample raw monotonic clock
> > value with any perf event, with the the aim of allowing
> > time correlation between data coming from perf and
> > additional performance-related information generated in
> > userspace.
> >
> > In order to correlate timestamps in perf data stream
> > with events happening in userspace (be it JITed debug
> > symbols or hwmon-originating environment data), user
> > requests a more or less periodic event (sched_switch
> > trace event of a hrtimer-based cpu-clock being the
> > most obvious examples) with PERF_SAMPLE_TIME *and*
> > PERF_SAMPLE_CLOCK_RAW_MONOTONIC and stamps
> > user-originating data with values obtained from
> > clock_gettime(CLOCK_MONOTONIC_RAW). Then, during
> > analysis, one looks at the perf events immediately
> > preceding and following (in terms of the
> > clock_raw_monotonic sample) the userspace event and
> > does simple linear approximation to get the equivalent
> > perf time.
> >
> >         perf event     user event
> >        -----O--------------+-------------O------> t_mono
> >             :              |             :
> >             :              V             :
> >        -----O----------------------------O------> t_perf
> 
> Probably a dumb question: why not make PERF_SAMPLE_TIME being monotonic
> clock instead of adding a new PERF_SAMPLE_CLOCK_XXX flag? 

It's a valid question. And it was asked before, in this thread:

http://thread.gmane.org/gmane.linux.kernel/1611683

A summary of the answer would be:

> On Wed, 2013-12-11 at 12:07 +0000, Ingo Molnar wrote: 
> > * John Stultz <john.stultz@linaro.org> wrote:
> > 
> > > [...]
> > > 
> > > I'd much rather see perf export CLOCK_MONOTONIC_RAW timestamps, 
> > > since that clockid is well defined. [...]
> > 
> > So the problem with that clock is that it does the following for every 
> > timestamp:
> > 
> >         cycle_now = clock->read(clock);
> > 
> > ... which is impossibly slow if something like the HPET is used, which 
> > is rather common - so this is a non-starter to timestamp perf events 
> > with. We use the scheduler clock as a reasonable compromise between 
> > scalability and clock globality.

Now, for your other comment:

>  Maybe we can
> add a new ioctl command like PERF_EVENT_IOC_SET_CLOCK so that one can
> pass a clock id.

Did you mean selecting a time source for PERF_SAMPLE_TIME (so we don't
need two timestamps in a sample)? It would be doable, I guess, but what
if someone *wants* to have sched clock as the timestamps source (because
it's cheap) but still be able to correlate them with userspace? In this
case two separate timestamps are required to do the approximation.

Now, if you meant it to select a time source for the "other timestamp",
let's call it in a more generic way: PERF_SAMPLE_CLOCK_VALUE this could
work, yes. I see ALSA doing something similar (see
SNDRV_PCM_TSTAMP_TYPE_* enum in include/uapi/sound/asound.h). One
question would be: how does it work with groups? Does setting a
timesource for the leader selects them for all members? I don't think
you want a bunch of samples with different timestamp sources in the same
buffer.

Anyway, great stuff, thanks!

Pawel

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Pawel Moll Sept. 26, 2014, 10:58 a.m. UTC | #3
On Fri, 2014-09-26 at 07:16 +0100, Namhyung Kim wrote:
> > It would be doable, I guess, but what
> > if someone *wants* to have sched clock as the timestamps source (because
> > it's cheap) but still be able to correlate them with userspace? In this
> > case two separate timestamps are required to do the approximation.
> 
> But by collecting two timestamps, you'll loose the win of the first
> timestamp, no?

But I can ask for both timestamps only being collected on "low
bandwidth" events, in particular context switches and/or periodic (eg.
10ms hrtimer) software events.

Then I have loads of normal normal samples, timestamped with sched clock
only, and every now and then one with both timestamps which then I can
use for time correlation. The whole point is that the frequency of such
"synchronisation" event can be much (much!) lower than of the normal
samples, but it still allows pretty good approximation (I was getting
accuracy of ~1 microsecond and better with sched_switch trace event
marked with additional raw monotonic timestamp).

Pawel

PS. Have you sent a couple of the messages via some kind of gmane's
proxy? All the mail addresses got rather messed up...

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Namhyung Kim Sept. 26, 2014, 2:38 p.m. UTC | #4
2014-09-26 (금), 11:58 +0100, Pawel Moll:
> On Fri, 2014-09-26 at 07:16 +0100, Namhyung Kim wrote:
> > > It would be doable, I guess, but what
> > > if someone *wants* to have sched clock as the timestamps source (because
> > > it's cheap) but still be able to correlate them with userspace? In this
> > > case two separate timestamps are required to do the approximation.
> > 
> > But by collecting two timestamps, you'll loose the win of the first
> > timestamp, no?
> 
> But I can ask for both timestamps only being collected on "low
> bandwidth" events, in particular context switches and/or periodic (eg.
> 10ms hrtimer) software events.
> 
> Then I have loads of normal normal samples, timestamped with sched clock
> only, and every now and then one with both timestamps which then I can
> use for time correlation. The whole point is that the frequency of such
> "synchronisation" event can be much (much!) lower than of the normal
> samples, but it still allows pretty good approximation (I was getting
> accuracy of ~1 microsecond and better with sched_switch trace event
> marked with additional raw monotonic timestamp).

Okay.  But in that case wouldn't it be enough to use just a single
timestamp for each event - sched_clock for cpu-cycles and monotonic raw
for sched_switch?


> 
> Pawel
> 
> PS. Have you sent a couple of the messages via some kind of gmane's
> proxy? All the mail addresses got rather messed up...
> 

Yes, I use gnus to communicate with lkml, but it seems it messed up mail
addresses if it contains some list (like linux-api ?).  Sorry about
that.

Thanks,
Namhyung


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Pawel Moll Sept. 26, 2014, 3:05 p.m. UTC | #5
On Fri, 2014-09-26 at 15:38 +0100, Namhyung Kim wrote:
> > Then I have loads of normal normal samples, timestamped with sched clock
> > only, and every now and then one with both timestamps which then I can
> > use for time correlation. The whole point is that the frequency of such
> > "synchronisation" event can be much (much!) lower than of the normal
> > samples, but it still allows pretty good approximation (I was getting
> > accuracy of ~1 microsecond and better with sched_switch trace event
> > marked with additional raw monotonic timestamp).
> 
> Okay.  But in that case wouldn't it be enough to use just a single
> timestamp for each event - sched_clock for cpu-cycles and monotonic raw
> for sched_switch?

To do the correlation you need both timestamps to be "taken"
simultaneously:

        perf event     user event
       -----O--------------+-------------O------> t_mono
            :              |             :
            :              V             :
       -----O----------------------------O------> t_perf

Of course it's not possible get both values literally at the same time,
but placing them in a atomic context a couple of instructions from each
other still gives pretty good results. The larger this distance is, the
lower the accuracy will be. I must admit I haven't done such
experiments, but let me remind that I in my test I was getting results
in the range of 1000ns, with a single cycle of a 2GHz taking 0.5ns, so
moving the t_mono/t_perf value sampling further aside will reduce it
significantly...

Pawel

Pawel

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
David Ahern Sept. 26, 2014, 7:25 p.m. UTC | #6
On 9/26/14, 9:05 AM, Pawel Moll wrote:
> To do the correlation you need both timestamps to be "taken"
> simultaneously:
>
>          perf event     user event
>         -----O--------------+-------------O------> t_mono
>              :              |             :
>              :              V             :
>         -----O----------------------------O------> t_perf
>
> Of course it's not possible get both values literally at the same time,
> but placing them in a atomic context a couple of instructions from each
> other still gives pretty good results. The larger this distance is, the

An early patchset on this topic added the realtime clock as an event and 
an ioctl was used to push a sample into the event stream. In that case 
you have wall clock and perf-clock samples taken in the same kernel 
context and about as close together as you can get.

https://lkml.org/lkml/2011/2/27/158
https://lkml.org/lkml/2011/2/27/159

David
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Pawel Moll Sept. 29, 2014, 2:47 p.m. UTC | #7
On Fri, 2014-09-26 at 20:25 +0100, David Ahern wrote:
> On 9/26/14, 9:05 AM, Pawel Moll wrote:
> > To do the correlation you need both timestamps to be "taken"
> > simultaneously:
> >
> >          perf event     user event
> >         -----O--------------+-------------O------> t_mono
> >              :              |             :
> >              :              V             :
> >         -----O----------------------------O------> t_perf
> >
> > Of course it's not possible get both values literally at the same time,
> > but placing them in a atomic context a couple of instructions from each
> > other still gives pretty good results. The larger this distance is, the
> 
> An early patchset on this topic added the realtime clock as an event and 
> an ioctl was used to push a sample into the event stream. 

Yeah, I remember. If I remember correctly correctly the pushback was on
a custom event type, right?

Generally speaking I don't mind any solution that we'll get us to the
place both you and I want to be (just being able to time stamp some
performance data in userspace, how difficult can this be! ;-) but I like
the flexibility of an extra sample - one can pick and mix events and
samples at one's leisure.

> In that case 
> you have wall clock and perf-clock samples taken in the same kernel 
> context and about as close together as you can get.

Yep, that's what I was saying - we can't quite get two timestamps at the
*same*, but getting them within a single atomic block of instructions
gives reasonable accuracy.

Thanks!

Pawel

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
diff mbox

Patch

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 707617a..28b73b2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -602,6 +602,8 @@  struct perf_sample_data {
 	 * Transaction flags for abort events:
 	 */
 	u64				txn;
+	/* Raw monotonic timestamp, for userspace time correlation */
+	u64				clock_raw_monotonic;
 };
 
 static inline void perf_sample_data_init(struct perf_sample_data *data,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9269de2..e5a75c5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -137,8 +137,9 @@  enum perf_event_sample_format {
 	PERF_SAMPLE_DATA_SRC			= 1U << 15,
 	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
+	PERF_SAMPLE_CLOCK_RAW_MONOTONIC		= 1U << 18,
 
-	PERF_SAMPLE_MAX = 1U << 18,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
 };
 
 /*
@@ -686,6 +687,7 @@  enum perf_event_type {
 	 *	{ u64			weight;   } && PERF_SAMPLE_WEIGHT
 	 *	{ u64			data_src; } && PERF_SAMPLE_DATA_SRC
 	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
+	 *	{ u64			clock_raw_monotonic; } && PERF_SAMPLE_CLOCK_RAW_MONOTONIC
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f9c1ed0..f6df547 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1216,6 +1216,9 @@  static void perf_event__header_size(struct perf_event *event)
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		size += sizeof(data->txn);
 
+	if (sample_type & PERF_SAMPLE_CLOCK_RAW_MONOTONIC)
+		size += sizeof(data->clock_raw_monotonic);
+
 	event->header_size = size;
 }
 
@@ -4456,6 +4459,13 @@  static void __perf_event_header__init_id(struct perf_event_header *header,
 		data->cpu_entry.cpu	 = raw_smp_processor_id();
 		data->cpu_entry.reserved = 0;
 	}
+
+	if (sample_type & PERF_SAMPLE_CLOCK_RAW_MONOTONIC) {
+		struct timespec now;
+
+		getrawmonotonic(&now);
+		data->clock_raw_monotonic = timespec_to_ns(&now);
+	}
 }
 
 void perf_event_header__init_id(struct perf_event_header *header,
@@ -4714,6 +4724,9 @@  void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		perf_output_put(handle, data->txn);
 
+	if (sample_type & PERF_SAMPLE_CLOCK_RAW_MONOTONIC)
+		perf_output_put(handle, data->clock_raw_monotonic);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;