diff mbox

[v6,4/4] tracing: Histogram for delayed hrtimer offsets

Message ID 1473246835-30075-5-git-send-email-binoy.jayan@linaro.org
State New
Headers show

Commit Message

Binoy Jayan Sept. 7, 2016, 11:13 a.m. UTC
Generate a histogram of the latencies of delayed timer offsets in
nanoseconds. It shows the latency captured due to a delayed timer expire
event. It happens for example when a timer misses its deadline due to
disabled interrupts. A process if scheduled as a result of the timer
expiration suffers this latency.

The following filter(s) may be used

'hist:key=common_pid.execname:val=toffset,hitcount'
'hist:key=cpu,tcomm:val=toffset:sort=tcomm'
'hist:key=comm,tcomm:sort=comm,tcomm'

Signed-off-by: Binoy Jayan <binoy.jayan@linaro.org>

---
 include/linux/hrtimer.h      |  3 +++
 include/trace/events/timer.h | 25 +++++++++++++++++++++
 kernel/time/Kconfig          |  7 ++++++
 kernel/time/hrtimer.c        | 52 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 87 insertions(+)

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project

Comments

Masami Hiramatsu Sept. 8, 2016, 6:56 a.m. UTC | #1
2016-09-07 20:13 GMT+09:00 Binoy Jayan <binoy.jayan@linaro.org>:
> Generate a histogram of the latencies of delayed timer offsets in

> nanoseconds. It shows the latency captured due to a delayed timer expire

> event. It happens for example when a timer misses its deadline due to

> disabled interrupts. A process if scheduled as a result of the timer

> expiration suffers this latency.

>

> The following filter(s) may be used

>

> 'hist:key=common_pid.execname:val=toffset,hitcount'

> 'hist:key=cpu,tcomm:val=toffset:sort=tcomm'

> 'hist:key=comm,tcomm:sort=comm,tcomm'

>


Ok, so this event is hit only when the hrtimer is delayed.

I just have a comment on the event name:

> +TRACE_EVENT(latency_hrtimer_interrupt,


Since this event will be hit only when the hrtimer is delayed,
would we better add "delayed" in the event name?
e.g. "latency_delayed_hrtimer"

Others are OK for me.

Thanks,

> Signed-off-by: Binoy Jayan <binoy.jayan@linaro.org>

> ---

>  include/linux/hrtimer.h      |  3 +++

>  include/trace/events/timer.h | 25 +++++++++++++++++++++

>  kernel/time/Kconfig          |  7 ++++++

>  kernel/time/hrtimer.c        | 52 ++++++++++++++++++++++++++++++++++++++++++++

>  4 files changed, 87 insertions(+)

>

> diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h

> index 5e00f80..9146842 100644

> --- a/include/linux/hrtimer.h

> +++ b/include/linux/hrtimer.h

> @@ -104,6 +104,9 @@ struct hrtimer {

>         struct hrtimer_clock_base       *base;

>         u8                              state;

>         u8                              is_rel;

> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

> +       ktime_t                         praecox;

> +#endif

>  #ifdef CONFIG_TIMER_STATS

>         int                             start_pid;

>         void                            *start_site;

> diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h

> index 28c5da6..ee45aed 100644

> --- a/include/trace/events/timer.h

> +++ b/include/trace/events/timer.h

> @@ -382,6 +382,31 @@ TRACE_EVENT(tick_stop,

>  );

>  #endif

>

> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

> +TRACE_EVENT(latency_hrtimer_interrupt,

> +

> +       TP_PROTO(long long toffset, struct task_struct *task),

> +

> +       TP_ARGS(toffset, task),

> +

> +       TP_STRUCT__entry(

> +               __field(long long,      toffset)

> +               __array(char,           tcomm,  TASK_COMM_LEN)

> +               __field(int,            tprio)

> +       ),

> +

> +       TP_fast_assign(

> +               __entry->toffset = toffset;

> +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",

> +                       task != NULL ? TASK_COMM_LEN : 7);

> +               __entry->tprio  = task != NULL ? task->prio : -1;

> +       ),

> +

> +       TP_printk("toffset=%lld thread=%s[%d]",

> +               __entry->toffset, __entry->tcomm, __entry->tprio)

> +);

> +#endif

> +

>  #endif /*  _TRACE_TIMER_H */

>

>  /* This part must be outside protection */

> diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig

> index 4008d9f..8ff19dd 100644

> --- a/kernel/time/Kconfig

> +++ b/kernel/time/Kconfig

> @@ -193,5 +193,12 @@ config HIGH_RES_TIMERS

>           hardware is not capable then this option only increases

>           the size of the kernel image.

>

> +config DELAYED_TIMER_OFFSETS_HIST

> +       depends on HIGH_RES_TIMERS

> +       select GENERIC_TRACER

> +       bool "Delayed Timer Offsets Histogram"

> +       help

> +         Generate a histogram of delayed timer offsets in nanoseconds.

> +

>  endmenu

>  endif

> diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c

> index 9ba7c82..432d49a 100644

> --- a/kernel/time/hrtimer.c

> +++ b/kernel/time/hrtimer.c

> @@ -56,6 +56,8 @@

>

>  #include "tick-internal.h"

>

> +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);

> +

>  /*

>   * The timer bases:

>   *

> @@ -960,6 +962,52 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,

>         return tim;

>  }

>

> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

> +static inline void latency_hrtimer_timing_start(struct hrtimer *timer,

> +                                        struct hrtimer_clock_base *new_base,

> +                                        ktime_t tim)

> +{

> +       if (unlikely(trace_latency_hrtimer_interrupt_enabled())) {

> +               ktime_t now = new_base->get_time();

> +

> +               if (ktime_to_ns(tim) < ktime_to_ns(now))

> +                       timer->praecox = now;

> +               else

> +                       timer->praecox = ktime_set(0, 0);

> +       }

> +}

> +

> +static inline void latency_hrtimer_timing_stop(struct hrtimer *timer,

> +                                               ktime_t basenow)

> +{

> +       long latency;

> +       struct task_struct *task;

> +

> +       if (likely(!trace_latency_hrtimer_interrupt_enabled()))

> +               return;

> +

> +       latency = ktime_to_ns(ktime_sub(basenow,

> +                             ktime_to_ns(timer->praecox) ?

> +                             timer->praecox : hrtimer_get_expires(timer)));

> +       task = timer->function == hrtimer_wakeup ?

> +                       container_of(timer, struct hrtimer_sleeper,

> +                                    timer)->task : NULL;

> +       if (latency > 0)

> +               trace_latency_hrtimer_interrupt((u64) latency, task);

> +}

> +#else

> +static inline void latency_hrtimer_timing_start(struct hrtimer *timer,

> +                                        struct hrtimer_clock_base *new_base,

> +                                        ktime_t tim)

> +{

> +}

> +static inline void latency_hrtimer_timing_stop(struct hrtimer *timer,

> +                                               ktime_t basenow)

> +{

> +}

> +

> +#endif

> +

>  /**

>   * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU

>   * @timer:     the timer to be added

> @@ -992,6 +1040,8 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,

>

>         timer_stats_hrtimer_set_start_info(timer);

>

> +       latency_hrtimer_timing_start(timer, new_base, tim);

> +

>         leftmost = enqueue_hrtimer(timer, new_base);

>         if (!leftmost)

>                 goto unlock;

> @@ -1284,6 +1334,8 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)

>

>                         timer = container_of(node, struct hrtimer, node);

>

> +                       latency_hrtimer_timing_stop(timer, basenow);

> +

>                         /*

>                          * The immediate goal for using the softexpires is

>                          * minimizing wakeups, not running timers at the

> --

> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,

> a Linux Foundation Collaborative Project

>




-- 
Masami Hiramatsu
Thomas Gleixner Sept. 8, 2016, 7:10 a.m. UTC | #2
On Wed, 7 Sep 2016, Binoy Jayan wrote:
> diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h

> index 5e00f80..9146842 100644

> --- a/include/linux/hrtimer.h

> +++ b/include/linux/hrtimer.h

> @@ -104,6 +104,9 @@ struct hrtimer {

>  	struct hrtimer_clock_base	*base;

>  	u8				state;

>  	u8				is_rel;

> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

> +	ktime_t				praecox;

> +#endif


Throwing a new struct member at some random place optimizes the struct
footprint, right?

And of course documenting new struct members is optional, correct? I'm
really looking forward for the explanation of that variable name.

> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

> +TRACE_EVENT(latency_hrtimer_interrupt,

> +

> +       TP_PROTO(long long toffset, struct task_struct *task),

> +

> +       TP_ARGS(toffset, task),

> +

> +       TP_STRUCT__entry(

> +               __field(long long,      toffset)

> +               __array(char,           tcomm,  TASK_COMM_LEN)

> +               __field(int,            tprio)

> +       ),

> +

> +       TP_fast_assign(

> +               __entry->toffset = toffset;

> +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",

> +                       task != NULL ? TASK_COMM_LEN : 7);

> +               __entry->tprio  = task != NULL ? task->prio : -1;

> +       ),


What's the value of storing prio? None, if the task is using the DL
scheduler.

> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

> +static inline void latency_hrtimer_timing_start(struct hrtimer *timer,

> +					 struct hrtimer_clock_base *new_base,

> +					 ktime_t tim)

> +{

> +	if (unlikely(trace_latency_hrtimer_interrupt_enabled())) {

> +		ktime_t now = new_base->get_time();

> +

> +		if (ktime_to_ns(tim) < ktime_to_ns(now))

> +			timer->praecox = now;

> +		else

> +			timer->praecox = ktime_set(0, 0);


What's the whole point of this? You're adding an extra get_time() call into
that path. What for? Comments in the code are overrated, right?

> +	}

> +}

> +

> +static inline void latency_hrtimer_timing_stop(struct hrtimer *timer,

> +						ktime_t basenow)

> +{

> +	long latency;

> +	struct task_struct *task;

> +

> +	if (likely(!trace_latency_hrtimer_interrupt_enabled()))

> +		return;

> +

> +	latency = ktime_to_ns(ktime_sub(basenow,

> +			      ktime_to_ns(timer->praecox) ?

> +			      timer->praecox : hrtimer_get_expires(timer)));

> +	task = timer->function == hrtimer_wakeup ?

> +			container_of(timer, struct hrtimer_sleeper,

> +				     timer)->task : NULL;


This is a complete horrible hack. You're tying the task evaluation into a
single instance of hrtimer users. What's the justification for this and why
do you need task at all?

> +	if (latency > 0)

> +		trace_latency_hrtimer_interrupt((u64) latency, task);


And how should latency become < 0 ever? hrtimer interrupt guarantees to
never expire timers prematurely.

Neither the changelog nor the code contain any information about how that
thing is to be used and what the actual value of the stored information
is.

No way that this ad hoc statistics hackery which we carry in RT for a well
known reason is going to go upstream without proper justification and a
weel thought out and documented functionality.

Thanks,

	tglx
Binoy Jayan Sept. 8, 2016, 9:39 a.m. UTC | #3
[Adding Carsten in cc ]

Thank you Thomas for reviewing this and providing insights.

On 8 September 2016 at 12:40, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Wed, 7 Sep 2016, Binoy Jayan wrote:

>> diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h

>> index 5e00f80..9146842 100644

>> --- a/include/linux/hrtimer.h

>> +++ b/include/linux/hrtimer.h

>> @@ -104,6 +104,9 @@ struct hrtimer {

>>       struct hrtimer_clock_base       *base;

>>       u8                              state;

>>       u8                              is_rel;

>> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

>> +     ktime_t                         praecox;

>> +#endif

>

> Throwing a new struct member at some random place optimizes the struct

> footprint, right?


My bad, I'll move the member two items up, just below the pointer 'base'.

> And of course documenting new struct members is optional, correct? I'm

> really looking forward for the explanation of that variable name.


It marks the start time when a process is scheduled to be woken up as
the result
of expiry of the hrtimer. Will be mentioning it in the comments.

>> +

>> +       TP_fast_assign(

>> +               __entry->toffset = toffset;

>> +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",

>> +                       task != NULL ? TASK_COMM_LEN : 7);

>> +               __entry->tprio  = task != NULL ? task->prio : -1;

>> +       ),

>

> What's the value of storing prio? None, if the task is using the DL

> scheduler.

>

>> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

>> +static inline void latency_hrtimer_timing_start(struct hrtimer *timer,

>> +                                      struct hrtimer_clock_base *new_base,

>> +                                      ktime_t tim)

>> +{

>> +     if (unlikely(trace_latency_hrtimer_interrupt_enabled())) {

>> +             ktime_t now = new_base->get_time();

>> +

>> +             if (ktime_to_ns(tim) < ktime_to_ns(now))

>> +                     timer->praecox = now;

>> +             else

>> +                     timer->praecox = ktime_set(0, 0);

>

> What's the whole point of this? You're adding an extra get_time() call into

> that path. What for? Comments in the code are overrated, right?


Will add comments here.

>> +     }

>> +}

>> +

>> +static inline void latency_hrtimer_timing_stop(struct hrtimer *timer,

>> +                                             ktime_t basenow)

>> +{

>> +     long latency;

>> +     struct task_struct *task;

>> +

>> +     if (likely(!trace_latency_hrtimer_interrupt_enabled()))

>> +             return;

>> +

>> +     latency = ktime_to_ns(ktime_sub(basenow,

>> +                           ktime_to_ns(timer->praecox) ?

>> +                           timer->praecox : hrtimer_get_expires(timer)));

>> +     task = timer->function == hrtimer_wakeup ?

>> +                     container_of(timer, struct hrtimer_sleeper,

>> +                                  timer)->task : NULL;

>

> This is a complete horrible hack. You're tying the task evaluation into a

> single instance of hrtimer users. What's the justification for this and why

> do you need task at all?


If I am understanding it not wrongly, I was trying to mark the time when
and hrtimer is started or restarted and the time when the same expires.
The expiry time is compared against the time now and the actual expiry.
The task indicates the task woken up as a result of the timer expiry.

>

>> +     if (latency > 0)

>> +             trace_latency_hrtimer_interrupt((u64) latency, task);

>

> And how should latency become < 0 ever? hrtimer interrupt guarantees to

> never expire timers prematurely.


Not sure why, but I have seen some negative values here.

> Neither the changelog nor the code contain any information about how that

> thing is to be used and what the actual value of the stored information

> is.

>

> No way that this ad hoc statistics hackery which we carry in RT for a well

> known reason is going to go upstream without proper justification and a

> weel thought out and documented functionality.

>


As Carsten has mentioned in his patch, this latency alone is not useful enough
without the process latency which occur due to the disabled interrupts/preemtion
or because of a timer missing its deadline. Since the process latency histogram
needed tracepoints in scheduler code which is discouraged, I haven't gotten
around to do it yet. I've been trying to calculate latencies by making
use of kprobes
events and tracing_maps but I was finding it little tricky.

Thanks,
Binoy
Thomas Gleixner Sept. 9, 2016, 1:30 p.m. UTC | #4
On Thu, 8 Sep 2016, Binoy Jayan wrote:
> On 8 September 2016 at 12:40, Thomas Gleixner <tglx@linutronix.de> wrote:

> >> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

> >> +     ktime_t                         praecox;

> >> +#endif

> > 

> > And of course documenting new struct members is optional, correct? I'm

> > really looking forward for the explanation of that variable name.

> 

> It marks the start time when a process is scheduled to be woken up as the

> result of expiry of the hrtimer. Will be mentioning it in the comments.


That's wrong. It only contains the start time when the timer was not
expired at the point where it is started. Otherwise it's set to 0.

Random explanations are worse than no explanation at all.
 
> >> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

> >> +static inline void latency_hrtimer_timing_start(struct hrtimer *timer,

> >> +                                      struct hrtimer_clock_base *new_base,

> >> +                                      ktime_t tim)

> >> +{

> >> +     if (unlikely(trace_latency_hrtimer_interrupt_enabled())) {

> >> +             ktime_t now = new_base->get_time();

> >> +

> >> +             if (ktime_to_ns(tim) < ktime_to_ns(now))

> >> +                     timer->praecox = now;

> >> +             else

> >> +                     timer->praecox = ktime_set(0, 0);

> >

> > What's the whole point of this? You're adding an extra get_time() call into

> > that path. What for? Comments in the code are overrated, right?

> 

> Will add comments here.


I rather prefer to get an explanation in this mail thread so it can be
discussed before you send out the next version.

> >> +     latency = ktime_to_ns(ktime_sub(basenow,

> >> +                           ktime_to_ns(timer->praecox) ?

> >> +                           timer->praecox : hrtimer_get_expires(timer)));

> >> +     task = timer->function == hrtimer_wakeup ?

> >> +                     container_of(timer, struct hrtimer_sleeper,

> >> +                                  timer)->task : NULL;

> >

> > This is a complete horrible hack. You're tying the task evaluation into a

> > single instance of hrtimer users. What's the justification for this and why

> > do you need task at all?

> 

> If I am understanding it not wrongly, I was trying to mark the time when

> and hrtimer is started or restarted and the time when the same expires.

> The expiry time is compared against the time now and the actual expiry.

> The task indicates the task woken up as a result of the timer expiry.


You are explaining what the code does and you are explaining it wrong. But
you don't answer my question at all.

> >

> >> +     if (latency > 0)

> >> +             trace_latency_hrtimer_interrupt((u64) latency, task);

> >

> > And how should latency become < 0 ever? hrtimer interrupt guarantees to

> > never expire timers prematurely.

> 

> Not sure why, but I have seen some negative values here.


The you better investigate that matter instead of adding band aids to the
code.
 
> > Neither the changelog nor the code contain any information about how that

> > thing is to be used and what the actual value of the stored information

> > is.

> >

> > No way that this ad hoc statistics hackery which we carry in RT for a well

> > known reason is going to go upstream without proper justification and a

> > weel thought out and documented functionality.

> >

> 

> As Carsten has mentioned in his patch, this latency alone is not useful enough

> without the process latency which occur due to the disabled interrupts/preemtion

> or because of a timer missing its deadline. Since the process latency histogram

> needed tracepoints in scheduler code which is discouraged, I haven't gotten

> around to do it yet. I've been trying to calculate latencies by making

> use of kprobes

> events and tracing_maps but I was finding it little tricky.


How is that related to my question?

If that thing has no value on its own, why are you trying to push it?

I don't care if you find it tricky. If you want this patch merged, then you
have to supply information

    - What it does

    - Why it is useful

    - How it can be used

    - What is the output and how can it be analyzed

    - What are the limitations

Thanks,

	tglx
Mark Brown Sept. 9, 2016, 5:10 p.m. UTC | #5
On Thu, Sep 08, 2016 at 03:09:36PM +0530, Binoy Jayan wrote:
> On 8 September 2016 at 12:40, Thomas Gleixner <tglx@linutronix.de> wrote:

> > On Wed, 7 Sep 2016, Binoy Jayan wrote:


> >> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

> >> +     ktime_t                         praecox;

> >> +#endif


> > And of course documenting new struct members is optional, correct? I'm

> > really looking forward for the explanation of that variable name.


> It marks the start time when a process is scheduled to be woken up as

> the result

> of expiry of the hrtimer. Will be mentioning it in the comments.


Even with an appropriate comment I'm not sure that the term praecox is
sufficiently widely understood to be a clear variable name.
Thomas Gleixner Sept. 10, 2016, 6:10 a.m. UTC | #6
On Fri, 9 Sep 2016, Mark Brown wrote:
> On Thu, Sep 08, 2016 at 03:09:36PM +0530, Binoy Jayan wrote:

> > On 8 September 2016 at 12:40, Thomas Gleixner <tglx@linutronix.de> wrote:

> > > On Wed, 7 Sep 2016, Binoy Jayan wrote:

> 

> > >> +#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST

> > >> +     ktime_t                         praecox;

> > >> +#endif

> 

> > > And of course documenting new struct members is optional, correct? I'm

> > > really looking forward for the explanation of that variable name.

> 

> > It marks the start time when a process is scheduled to be woken up as

> > the result

> > of expiry of the hrtimer. Will be mentioning it in the comments.

> 

> Even with an appropriate comment I'm not sure that the term praecox is

> sufficiently widely understood to be a clear variable name.


For those who understand it and know the main use case of this term it
certainly has entertainment value.

Thanks,

	tglx
Carsten Emde Sept. 23, 2016, 2:20 p.m. UTC | #7
On 09/09/2016 03:30 PM, Thomas Gleixner wrote:
> On Thu, 8 Sep 2016, Binoy Jayan wrote:

>> On 8 September 2016 at 12:40, Thomas Gleixner <tglx@linutronix.de> wrote:

> [..]

>>>> +     latency = ktime_to_ns(ktime_sub(basenow,

>>>> +                           ktime_to_ns(timer->praecox) ?

>>>> +                           timer->praecox : hrtimer_get_expires(timer)));

>>>> +     task = timer->function == hrtimer_wakeup ?

>>>> +                     container_of(timer, struct hrtimer_sleeper,

>>>> +                                  timer)->task : NULL;

>>>

>>> This is a complete horrible hack. You're tying the task evaluation into a

>>> single instance of hrtimer users. What's the justification for this and why

>>> do you need task at all?

Makes only sense, if any, if wakeup latency histograms that store the task with
the highest latency so far are available. Initially, I wanted to give a short
explanation here why such internal histograms may be useful, but I found it
easier in a format where pictures of example latency plots and recordings can
be shown. So I wrote a short article on "Preemption latency of real-time Linux
systems" -> https://www.osadl.org/Single-View.111+M5246ebc55e3.0.html.

Hope this helps,

	-Carsten.
diff mbox

Patch

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 5e00f80..9146842 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -104,6 +104,9 @@  struct hrtimer {
 	struct hrtimer_clock_base	*base;
 	u8				state;
 	u8				is_rel;
+#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST
+	ktime_t				praecox;
+#endif
 #ifdef CONFIG_TIMER_STATS
 	int				start_pid;
 	void				*start_site;
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 28c5da6..ee45aed 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -382,6 +382,31 @@  TRACE_EVENT(tick_stop,
 );
 #endif
 
+#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST
+TRACE_EVENT(latency_hrtimer_interrupt,
+
+	TP_PROTO(long long toffset, struct task_struct *task),
+
+	TP_ARGS(toffset, task),
+
+	TP_STRUCT__entry(
+		__field(long long,	toffset)
+		__array(char,		tcomm,	TASK_COMM_LEN)
+		__field(int,		tprio)
+	),
+
+	TP_fast_assign(
+		__entry->toffset = toffset;
+		memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
+			task != NULL ? TASK_COMM_LEN : 7);
+		__entry->tprio  = task != NULL ? task->prio : -1;
+	),
+
+	TP_printk("toffset=%lld thread=%s[%d]",
+		__entry->toffset, __entry->tcomm, __entry->tprio)
+);
+#endif
+
 #endif /*  _TRACE_TIMER_H */
 
 /* This part must be outside protection */
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 4008d9f..8ff19dd 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -193,5 +193,12 @@  config HIGH_RES_TIMERS
 	  hardware is not capable then this option only increases
 	  the size of the kernel image.
 
+config DELAYED_TIMER_OFFSETS_HIST
+	depends on HIGH_RES_TIMERS
+	select GENERIC_TRACER
+	bool "Delayed Timer Offsets Histogram"
+	help
+	  Generate a histogram of delayed timer offsets in nanoseconds.
+
 endmenu
 endif
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9ba7c82..432d49a 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -56,6 +56,8 @@ 
 
 #include "tick-internal.h"
 
+static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
+
 /*
  * The timer bases:
  *
@@ -960,6 +962,52 @@  static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
 	return tim;
 }
 
+#ifdef CONFIG_DELAYED_TIMER_OFFSETS_HIST
+static inline void latency_hrtimer_timing_start(struct hrtimer *timer,
+					 struct hrtimer_clock_base *new_base,
+					 ktime_t tim)
+{
+	if (unlikely(trace_latency_hrtimer_interrupt_enabled())) {
+		ktime_t now = new_base->get_time();
+
+		if (ktime_to_ns(tim) < ktime_to_ns(now))
+			timer->praecox = now;
+		else
+			timer->praecox = ktime_set(0, 0);
+	}
+}
+
+static inline void latency_hrtimer_timing_stop(struct hrtimer *timer,
+						ktime_t basenow)
+{
+	long latency;
+	struct task_struct *task;
+
+	if (likely(!trace_latency_hrtimer_interrupt_enabled()))
+		return;
+
+	latency = ktime_to_ns(ktime_sub(basenow,
+			      ktime_to_ns(timer->praecox) ?
+			      timer->praecox : hrtimer_get_expires(timer)));
+	task = timer->function == hrtimer_wakeup ?
+			container_of(timer, struct hrtimer_sleeper,
+				     timer)->task : NULL;
+	if (latency > 0)
+		trace_latency_hrtimer_interrupt((u64) latency, task);
+}
+#else
+static inline void latency_hrtimer_timing_start(struct hrtimer *timer,
+					 struct hrtimer_clock_base *new_base,
+					 ktime_t tim)
+{
+}
+static inline void latency_hrtimer_timing_stop(struct hrtimer *timer,
+						ktime_t basenow)
+{
+}
+
+#endif
+
 /**
  * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
  * @timer:	the timer to be added
@@ -992,6 +1040,8 @@  void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 
 	timer_stats_hrtimer_set_start_info(timer);
 
+	latency_hrtimer_timing_start(timer, new_base, tim);
+
 	leftmost = enqueue_hrtimer(timer, new_base);
 	if (!leftmost)
 		goto unlock;
@@ -1284,6 +1334,8 @@  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
 
 			timer = container_of(node, struct hrtimer, node);
 
+			latency_hrtimer_timing_stop(timer, basenow);
+
 			/*
 			 * The immediate goal for using the softexpires is
 			 * minimizing wakeups, not running timers at the