[RFC] sched_clock: Track monotonic raw clock

Message ID 1405705419-4194-1-git-send-email-pawel.moll@arm.com
State New
Headers show

Commit Message

Pawel Moll July 18, 2014, 5:43 p.m.
This change is trying to make the sched clock "similar" to the
monotonic raw one.

The main goal is to provide some kind of unification between time
flow in kernel and in user space, mainly to achieve correlation
between perf timestamps and clock_gettime(CLOCK_MONOTONIC_RAW).
This has been suggested by Ingo and John during the latest
discussion (of many, we tried custom ioctl, custom clock etc.)
about this:

http://thread.gmane.org/gmane.linux.kernel/1611683/focus=1612554

For now I focused on the generic sched clock implementation,
but similar approach can be applied elsewhere.

Initially I just wanted to copy epoch from monotonic to sched
clock at update_clock(), but this can cause the sched clock
going backwards in certain corner cases, eg. when the sched
clock "increases faster" than the monotonic one. I believe
it's a killer issue, but feel free to ridicule me if I worry
too much :-)

In the end I tried to employ some basic control theory technique
to tune the multiplier used to calculate ns from cycles and
it seems to be working in my system, with the average error
in the area of 2-3 clock cycles (I've got both clocks running
at 24MHz, which gives 41ns resolution).

/ # cat /sys/kernel/debug/sched_clock_error
min error: 0ns
max error: 200548913ns
100 samples moving average error: 117ns
/ # cat /sys/kernel/debug/tracing/trace
<...>
          <idle>-0     [000] d.h3  1195.102296: sched_clock_adjust: sched=1195102288457ns, mono=1195102288411ns, error=-46ns, mult_adj=65
          <idle>-0     [000] d.h3  1195.202290: sched_clock_adjust: sched=1195202282416ns, mono=1195202282485ns, error=69ns, mult_adj=38
          <idle>-0     [000] d.h3  1195.302286: sched_clock_adjust: sched=1195302278832ns, mono=1195302278861ns, error=29ns, mult_adj=47
          <idle>-0     [000] d.h3  1195.402278: sched_clock_adjust: sched=1195402271082ns, mono=1195402270872ns, error=-210ns, mult_adj=105
          <idle>-0     [000] d.h3  1195.502278: sched_clock_adjust: sched=1195502270832ns, mono=1195502270950ns, error=118ns, mult_adj=29
          <idle>-0     [000] d.h3  1195.602276: sched_clock_adjust: sched=1195602268707ns, mono=1195602268732ns, error=25ns, mult_adj=50
          <idle>-0     [000] d.h3  1195.702280: sched_clock_adjust: sched=1195702272999ns, mono=1195702272997ns, error=-2ns, mult_adj=55
          <idle>-0     [000] d.h3  1195.802276: sched_clock_adjust: sched=1195802268749ns, mono=1195802268684ns, error=-65ns, mult_adj=71
          <idle>-0     [000] d.h3  1195.902272: sched_clock_adjust: sched=1195902265207ns, mono=1195902265223ns, error=16ns, mult_adj=53
          <idle>-0     [000] d.h3  1196.002276: sched_clock_adjust: sched=1196002269374ns, mono=1196002269283ns, error=-91ns, mult_adj=78
<...>

This code definitely needs more work and testing (I'm not 100%
sure if the Kp and Ki I've picked for the proportional and
integral terms are universal), but for now wanted to see
if this approach makes any sense whatsoever.

All feedback more than appreciated!

Signed-off-by: Pawel Moll <pawel.moll@arm.com>
---
 include/trace/events/sched.h |  28 +++++++++
 kernel/time/sched_clock.c    | 142 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 162 insertions(+), 8 deletions(-)

Comments

John Stultz July 18, 2014, 5:51 p.m. | #1
On 07/18/2014 10:43 AM, Pawel Moll wrote:
> This change is trying to make the sched clock "similar" to the
> monotonic raw one.
>
> The main goal is to provide some kind of unification between time
> flow in kernel and in user space, mainly to achieve correlation
> between perf timestamps and clock_gettime(CLOCK_MONOTONIC_RAW).
> This has been suggested by Ingo and John during the latest
> discussion (of many, we tried custom ioctl, custom clock etc.)
> about this:
>
> http://thread.gmane.org/gmane.linux.kernel/1611683/focus=1612554
>
> For now I focused on the generic sched clock implementation,
> but similar approach can be applied elsewhere.
>
> Initially I just wanted to copy epoch from monotonic to sched
> clock at update_clock(), but this can cause the sched clock
> going backwards in certain corner cases, eg. when the sched
> clock "increases faster" than the monotonic one. I believe
> it's a killer issue, but feel free to ridicule me if I worry
> too much :-)
>
> In the end I tried to employ some basic control theory technique
> to tune the multiplier used to calculate ns from cycles and
> it seems to be be working in my system, with the average error
> in the area of 2-3 clock cycles (I've got both clocks running
> at 24MHz, which gives 41ns resolution).
>
> / # cat /sys/kernel/debug/sched_clock_error
> min error: 0ns
> max error: 200548913ns
> 100 samples moving average error: 117ns
> / # cat /sys/kernel/debug/tracing/trace
> <...>
>           <idle>-0     [000] d.h3  1195.102296: sched_clock_adjust: sched=1195102288457ns, mono=1195102288411ns, error=-46ns, mult_adj=65
>           <idle>-0     [000] d.h3  1195.202290: sched_clock_adjust: sched=1195202282416ns, mono=1195202282485ns, error=69ns, mult_adj=38
>           <idle>-0     [000] d.h3  1195.302286: sched_clock_adjust: sched=1195302278832ns, mono=1195302278861ns, error=29ns, mult_adj=47
>           <idle>-0     [000] d.h3  1195.402278: sched_clock_adjust: sched=1195402271082ns, mono=1195402270872ns, error=-210ns, mult_adj=105
>           <idle>-0     [000] d.h3  1195.502278: sched_clock_adjust: sched=1195502270832ns, mono=1195502270950ns, error=118ns, mult_adj=29
>           <idle>-0     [000] d.h3  1195.602276: sched_clock_adjust: sched=1195602268707ns, mono=1195602268732ns, error=25ns, mult_adj=50
>           <idle>-0     [000] d.h3  1195.702280: sched_clock_adjust: sched=1195702272999ns, mono=1195702272997ns, error=-2ns, mult_adj=55
>           <idle>-0     [000] d.h3  1195.802276: sched_clock_adjust: sched=1195802268749ns, mono=1195802268684ns, error=-65ns, mult_adj=71
>           <idle>-0     [000] d.h3  1195.902272: sched_clock_adjust: sched=1195902265207ns, mono=1195902265223ns, error=16ns, mult_adj=53
>           <idle>-0     [000] d.h3  1196.002276: sched_clock_adjust: sched=1196002269374ns, mono=1196002269283ns, error=-91ns, mult_adj=78
> <...>
>
> This code definitely needs more work and testing (I'm not 100%
> sure if the Kp and Ki I've picked for the proportional and
> integral terms are universal), but for now wanted to see
> if this approach makes any sense whatsoever.
>
> All feedback more than appreciated!

Very cool work! I've not been able to review it carefully, but one good
stress test would be to pick a system where the hardware used for
sched_clock is different from the hardware used for timekeeping.

Probably easily done on x86 hardware that normally uses the TSC, but has
HPET/ACPI PM hardware available. After the system boots, change the
clocksource via:
/sys/devices/system/clocksource/clocksource0/current_clocksource


Although, looking again, this looks like it only works on the "generic"
sched_clock (so ARM/ARM64?)...

thanks
-john

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Peter Zijlstra July 18, 2014, 7:06 p.m. | #2
On Fri, Jul 18, 2014 at 10:51:19AM -0700, John Stultz wrote:
> Very cool work! I've not been able to review it carefully, but one good
> stress test would be to pick a system where the hardware used for
> sched_clock is different from the hardware used for timekeeping.
> 
> Probably easily done on x86 hardware that normally uses the TSC, but has
> HPET/ACPI PM hardware available. After the system boots, change the
> clocksource via:
> /sys/devices/system/clocksource/clocksource0/current_clocksource

Note that x86 uses none of the code patched.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Peter Zijlstra July 18, 2014, 7:13 p.m. | #3
On Fri, Jul 18, 2014 at 06:43:39PM +0100, Pawel Moll wrote:
> This change is trying to make the sched clock "similar" to the
> monotonic raw one.
> 
> The main goal is to provide some kind of unification between time
> flow in kernel and in user space, mainly to achieve correlation
> between perf timestamps and clock_gettime(CLOCK_MONOTONIC_RAW).
> This has been suggested by Ingo and John during the latest
> discussion (of many, we tried custom ioctl, custom clock etc.)
> about this:
> 
> http://thread.gmane.org/gmane.linux.kernel/1611683/focus=1612554
> 
> For now I focused on the generic sched clock implementation,
> but similar approach can be applied elsewhere.
> 
> Initially I just wanted to copy epoch from monotonic to sched
> clock at update_clock(), but this can cause the sched clock
> going backwards in certain corner cases, eg. when the sched
> clock "increases faster" than the monotonic one. I believe
> it's a killer issue, but feel free to ridicule me if I worry
> too much :-)

But on hardware using generic sched_clock we use the exact same hardware
as the regular timekeeping, right?

So we could start off with the same offset/mult/shift and never deviate,
or is that a silly question?, I've never really looked at the generic
sched_clock stuff too closely.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
John Stultz July 18, 2014, 7:25 p.m. | #4
On Fri, Jul 18, 2014 at 12:13 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Jul 18, 2014 at 06:43:39PM +0100, Pawel Moll wrote:
>> This change is trying to make the sched clock "similar" to the
>> monotonic raw one.
>>
>> The main goal is to provide some kind of unification between time
>> flow in kernel and in user space, mainly to achieve correlation
>> between perf timestamps and clock_gettime(CLOCK_MONOTONIC_RAW).
>> This has been suggested by Ingo and John during the latest
>> discussion (of many, we tried custom ioctl, custom clock etc.)
>> about this:
>>
>> http://thread.gmane.org/gmane.linux.kernel/1611683/focus=1612554
>>
>> For now I focused on the generic sched clock implementation,
>> but similar approach can be applied elsewhere.
>>
>> Initially I just wanted to copy epoch from monotonic to sched
>> clock at update_clock(), but this can cause the sched clock
>> going backwards in certain corner cases, eg. when the sched
>> clock "increases faster" than the monotonic one. I believe
>> it's a killer issue, but feel free to ridicule me if I worry
>> too much :-)
>
> But on hardware using generic sched_clock we use the exact same hardware
> as the regular timekeeping, right?

Probably most likely, but not necessarily (one can register a
clocksource for sched_clock and then userspace could switch to a
different clocksource for timekeeping).

Also, assuming we someday will merge the x86 sched_clock logic into
the generic sched_clock code, we'll have to handle cases where they
aren't the same.

> So we could start off with the same offset/mult/shift and never deviate,
> or is that a silly question?, I've never really looked at the generic
> sched_clock stuff too closely.

Ideally I'd like to remove the mult/shift pair from clocksources all
together and allow the subsystems that use them to keep their own
mult/shift pair. Mostly because the fine frequency tuning tradeoffs we
want for timekeeping are different from the long-running intervals
without mult overflow we want for sched_clock.

With Thomas' change recently to get the cycle_last bit moved out of
the clocksource structure, we should be fairly close to doing this.

thanks
-john
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Peter Zijlstra July 18, 2014, 7:34 p.m. | #5
On Fri, Jul 18, 2014 at 12:25:48PM -0700, John Stultz wrote:
> Also, assuming we someday will merge the x86 sched_clock logic into
> the generic sched_clock code, we'll have to handle cases where they
> aren't the same.

I prefer that to not happen. I spent quite a bit of time and effort to
make the x86 code go fast, and that generic code doesn't look like fast
at all.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
John Stultz July 18, 2014, 7:46 p.m. | #6
On 07/18/2014 12:34 PM, Peter Zijlstra wrote:
> On Fri, Jul 18, 2014 at 12:25:48PM -0700, John Stultz wrote:
>> Also, assuming we someday will merge the x86 sched_clock logic into
>> the generic sched_clock code, we'll have to handle cases where they
>> aren't the same.
> I prefer that to not happen. I spend quite a bit of time and effort to
> make the x86 code go fast, and that generic code doesn't look like fast
> at all.

A stretch goal then :)

But yes, the generic sched_clock logic has really just started w/ ARM
and is hopefully moving out to pick up more architectures. I suspect it
will need to adapt many of your tricks from (if not a whole migration to
some of) the x86 code. And even if the x86 code stays separate for
optimization reasons, thats fine.

But as folks try to align things like perf timestamps with time domains
we expose to userspace, we'll have to keep some of the semantics in sync
between the various implementations, and having lots of separate
implementations will be a burden.

But yea, I don't have any plans to try to do a grand unification myself,
so don't fret.

thanks
-john

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Peter Zijlstra July 18, 2014, 8:22 p.m. | #7
On Fri, Jul 18, 2014 at 12:46:29PM -0700, John Stultz wrote:
> On 07/18/2014 12:34 PM, Peter Zijlstra wrote:
> > On Fri, Jul 18, 2014 at 12:25:48PM -0700, John Stultz wrote:
> >> Also, assuming we someday will merge the x86 sched_clock logic into
> >> the generic sched_clock code, we'll have to handle cases where they
> >> aren't the same.
> > I prefer that to not happen. I spend quite a bit of time and effort to
> > make the x86 code go fast, and that generic code doesn't look like fast
> > at all.
> 
> A stretch goal then :)
> 
> But yes, the generic sched_clock logic has really just started w/ ARM
> and is hopefully moving out to pick up more architectures. I suspect it
> will need to adapt many of your tricks from (if not a whole migration to
> some of) the x86 code. And even if the x86 code stays separate for
> optimization reasons, thats fine.

So the generic stuff seems optimized for 32bit arch, short clocks and
seems to hard assume the clock is globally consistent.

The x86 sched_clock code is optimized for 64bit, has a full 64bit clock
and cannot ever assume the thing is globally consistent (until Intel/AMD
stop making the TSC register writable -- including, and maybe
especially, for SMM).

There is just not much that overlaps there.

> But as folks try to align things like perf timestamps with time domains
> we expose to userspace, we'll have to keep some of the semantics in sync
> between the various implementations, and having lots of separate
> implementations will be a burden.

We're already there, there's about nr_arch - 5 implementations,
hopefully many trivial ones though.
Peter Zijlstra July 18, 2014, 10:41 p.m. | #8
On Fri, Jul 18, 2014 at 10:22:50PM +0200, Peter Zijlstra wrote:
> So the generic stuff seems optimized for 32bit arch, short clocks and
> seems to hard assume the clock is globally consistent.
> 
> The x86 sched_clock code is optimized for 64bit, has a full 64bit clock
> and cannot ever assume the thing is globally consistent (until Intel/AMD
> stop making the TSC register writable -- including, and maybe
> especially, for SMM).
> 
> There is just not much that overlaps there.

So something that might be usable for all of us would be the
'abstracted' control loop.

So the problem is, given a Set Point (CLOCK_MONOTONIC_RAW), a Process
Variable (sched_clock) compute a Control Output (multiplier).

If that were implemented with an interface like:


struct sched_clock_cl; /* Control Loop state */

/**
 * sched_clock_cl_init - initialize the control loop
 */
extern void sched_clock_cl_init(struct sched_clock_cl *sccl, u32 mult, u32 shift);

/**
 * sched_clock_cl_update - compute a new multiplier for sched_clock
 * @sccl - pointer to control loop state structure
 * @sp - current set-point, aka. CLOCK_MONOTONIC_RAW
 * @pv - current process-variable, aka. sched_clock()
 */
extern u32 sched_clock_cl_update(struct sched_clock_cl *sccl, u64 sp, u64 pv);


That way I can run one per-cpu and try and keep each individual CPU
synced up to CLOCK_MONOTONIC_RAW, thereby also providing some inter-cpu
guarantee (due to max error bounds on the control loop, etc..).

And you can run a single instance for the generic sched_clock code,
since that's globally consistent.

And we'd all still be using the same control loop logic.

Now, I already mentioned max error bounds, and I've not yet looked at
your actual control loop, but that is something to keep in mind, we want
something that's limited.

If we can do this (and I'm fairly sure we can) we can in fact kill some
rather ugly code.
Richard Cochran July 19, 2014, 5:02 a.m. | #9
On Fri, Jul 18, 2014 at 06:43:39PM +0100, Pawel Moll wrote:
> 
> This code definitely needs more work and testing (I'm not 100%
> sure if the Kp and Ki I've picked for the proportional and
> integral terms are universal),

I wouldn't bet on it.

> but for now wanted to see
> if this approach makes any sense whatsoever.

You are reading sched_clock and mono-raw together every so
often. Really stupid question: Why not just place that information
into the trace buffer and let user space do the clock correction?

...

> +		/* Tune the cyc_to_ns formula */
> +		mult_adj = sign * (error >> 2) + (cd.error_int >> 2);

So Kp = Ki = 0.25? And did you say that the sample rate is 10/second?

I guess that, while this works well on your machine, it might not
always do so, depending on the mono-raw clock. Probably Kp/i need to
be tunable to a particular system. Even better would be to leave this
out of the kernel altogether.

Thanks,
Richard
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Pawel Moll July 22, 2014, 4:17 p.m. | #10
On Fri, 2014-07-18 at 18:51 +0100, John Stultz wrote:
> Very cool work! 

Glad that it doesn't sound too ridiculous :-)

> I've not been able to review it carefully, but one good
> stress test would be to pick a system where the hardware used for
> sched_clock is different from the hardware used for timekeeping.

Actually I've got exactly this situation on my board. I've got two
sources, and actually the worse (narrower and more expensive) one is
getting eventually used (because it's registered later - separate patch
to follow).

> Although, looking again, this looks like it only works on the "generic"
> sched_clock (so ARM/ARM64?)...

... and microblaze and xtensa right now, yes. This was just the simplest
thing for me to start with, but I appreciate it doesn't cover all
possible cases. Thus the debugfs attribute to tell userspace what it can
expect from the sched_clock. A hack it is, yes.

Pawel

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Pawel Moll July 22, 2014, 4:17 p.m. | #11
On Fri, 2014-07-18 at 20:34 +0100, Peter Zijlstra wrote:
> On Fri, Jul 18, 2014 at 12:25:48PM -0700, John Stultz wrote:
> > Also, assuming we someday will merge the x86 sched_clock logic into
> > the generic sched_clock code, we'll have to handle cases where they
> > aren't the same.
> 
> I prefer that to not happen. I spend quite a bit of time and effort to
> make the x86 code go fast, and that generic code doesn't look like fast
> at all.

Actually one of my long-ish time goals is to speed it up (within the
generic framework) for arm64 as well, so we may get back to the
discussion then :-)

Pawel

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Pawel Moll July 22, 2014, 4:17 p.m. | #12
On Fri, 2014-07-18 at 23:41 +0100, Peter Zijlstra wrote:
> So something that might be usable for all of us would be the
> 'abstracted' control loop.
> 
> So the problem is, given a Set Point (CLOCK_MONOTONIC_RAW), a Process
> Variable (sched_clock) compute a Control Output (multiplier).
> 
> If that were implemented with an interface like:
> 
> 
> struct sched_clock_cl; /* Control Loop state */
> 
> /**
>  * sched_clock_cl_init - intialize the control loop
>  */
> extern void sched_clock_cl_init(struct sched_clock_cl *sccl, u32 mult, u32 shift);
> 
> /**
>  * sched_clock_cl_update - compute a new multiplier for sched_clock
>  * @sccl - pointer to control loop state structure
>  * @sp - current set-point, aka. CLOCK_MONOTONIC_RAW
>  * @pv - current process-variable, aka. sched_clock()
>  */
> extern u32 sched_clock_cl_update(struct sched_clock_cl *sccl, u64 sp, u64 pv);

Yeah, happy to provide this.

> That way I can run one per-cpu and try and keep each individual CPU
> synced up to CLOCK_MONOTONIC_RAW, thereby also providing some inter-cpu
> guarantee (due to max error bounds on the control loop, etc..).
> 
> And you can run a single instance for the generic sched_clock code,
> since that's globally consistent.
> 
> And we'd all still be using the same control loop logic.
> 
> Now, I already mentioned max error bounds, and I've not yet looked at
> your actual control loop, but that is something to keep in mind, we want
> something that's limited.

Indeed. Richard has already expressed concerns that my coefficients
won't work everywhere (as in with different clock rate ratios) and he's
most likely right. My money is on a more advanced, "self-tuning"
solution, where we definitely need an escape condition.

> If we can do this (and I'm fairly sure we can) we can in fact kill some
> rather ugly code.

I'm really glad we're more or less on the same page. Will do some more
research in different conditions. Of course all ideas and suggestions
are more than welcome!

Pawel

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Pawel Moll July 22, 2014, 4:17 p.m. | #13
On Sat, 2014-07-19 at 06:02 +0100, Richard Cochran wrote:
> On Fri, Jul 18, 2014 at 06:43:39PM +0100, Pawel Moll wrote:
> > 
> > This code definitely needs more work and testing (I'm not 100%
> > sure if the Kp and Ki I've picked for the proportional and
> > integral terms are universal),
> 
> I wouldn't bet on it.

Yeah, I should have said: I'm 100% sure they are not universal.

> > but for now wanted to see
> > if this approach makes any sense whatsoever.
> 
> You are reading sched_clock and mono-raw together every so
> often. Really stupid question: Why not just place that information
> into the trace buffer and let user space do the clock correction?

That approach has been also discussed, last time in the mentioned
thread:

http://thread.gmane.org/gmane.linux.kernel/1611683/focus=1612554
http://thread.gmane.org/gmane.linux.kernel/1611683/focus=1612554

With both Ingo and John showing preference towards the clock alignment,
so that's where I looked this time (I've already done custom perf
ioctls, posix clocks... don't really know how many different ways I've
tried).

> ...
> 
> > +		/* Tune the cyc_to_ns formula */
> > +		mult_adj = sign * (error >> 2) + (cd.error_int >> 2);
> 
> So Kp = Ki = 0.25? And did you say that the sample rate is 10/second?
> 
> I guess that, while this works well on your machine, it might not
> always do so, depending on the mono-raw clock. Probably Kp/i need to
> be tunable to a particular system. Even better would be to leave this
> out of the kernel altogether.

My hope is that, given that time correlation is pretty "static" process
(the clock skew should be reasonable, otherwise the time source are
plainly rubbish), there will be a way of implementing this in a
self-tuning way. I may be proven wrong, but it seems like a noble thing
to try.

I fully agree that there is no place for manual PI regulator tuning in
the kernel.

Pawel

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Peter Zijlstra July 22, 2014, 4:29 p.m. | #14
On Tue, Jul 22, 2014 at 05:17:23PM +0100, Pawel Moll wrote:
> > Now, I already mentioned max error bounds, and I've not yet looked at
> > your actual control loop, but that is something to keep in mind, we want
> > something that's limited.
> 
> Indeed. Richard has already expressed concerns that my coefficients
> won't work everywhere (as in with different clock rate ratios) and he's
> most likely right. My money is on a more advanced, "self-tuning"
> solution, where we definitely need an escape condition.

I was wanting to look up software PLLs, as I seem to remember that's
what NTP and PTP use, but Richard should know I think.

Its not really a subject I've looked at before.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Peter Zijlstra July 22, 2014, 4:34 p.m. | #15
On Tue, Jul 22, 2014 at 05:17:29PM +0100, Pawel Moll wrote:
> With both Ingo and John showing preference towards the clock alignment,
> so that's where I looked this time (I've already done custom perf
> ioctls, posix clocks... don't really know how many different ways I've
> tried).

So we should probably also talk about which clock to track, MONO has the
advantage of making it far easier to trace clusters but has the
disadvantage of stacked control loops with NTP adjusting MONO and us
adjusting sched_clock.

And I would really prefer to pick 1 and not make it configurable.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Pawel Moll July 22, 2014, 4:48 p.m. | #16
On Tue, 2014-07-22 at 17:34 +0100, Peter Zijlstra wrote:
> On Tue, Jul 22, 2014 at 05:17:29PM +0100, Pawel Moll wrote:
> > With both Ingo and John showing preference towards the clock alignment,
> > so that's where I looked this time (I've already done custom perf
> > ioctls, posix clocks... don't really know how many different ways I've
> > tried).
> 
> So we should probably also talk about which clock to track, MONO has the
> advantage of making it far easier to trace clusters but has the
> disadvantage of stacked control loops with NTP adjusting MONO and us
> adjusting sched_clock.

John suggested (and I fully agree with him) MONO_RAW, as it is not
getting NTP-d. 

> And I would really prefer to pick 1 and not make it configurable.

Same here. One thing I keep in mind is the fact that userspace must be
able to say whether it can expect the correlation or not. "Not" being
either an architecture which sched_clock is not using the generic
solution (I'm sure there will be some) or "not" because of the
synchronisation failure. My idea so far was a debugfs file saying this
(or missing, which is a message on its own).

Pawel


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Richard Cochran July 22, 2014, 7:39 p.m. | #17
On Tue, Jul 22, 2014 at 05:17:29PM +0100, Pawel Moll wrote:
> That approach has been also discussed, last time in the mentioned
> thread:
> 
> http://thread.gmane.org/gmane.linux.kernel/1611683/focus=1612554
> http://thread.gmane.org/gmane.linux.kernel/1611683/focus=1612554
> 
> With both Ingo and John showing preference towards the clock alignment,
> so that's where I looked this time (I've already done custom perf
> ioctls, posix clocks... don't really know how many different ways I've
> tried).

I re-read that thread, and it sounds like Ingo also suggested simply
tracing the system time updates, leaving the correlation to the perf
clock to user space. This is what I mean, too.

John and Ingo said that exporting MONO-RAW would be "ideal", and maybe
it would be (in the sense of easy-to-use), but only if it were always
correct.  However, hacking some half baked servo into the kernel will
fail, at least some of the time.  When it does fail, there is no way
to figure out what happened, because the servo code obliterates the
information in the original time stamps.

So the only reasonable way, IMHO, is to simply provide the needed
information in the traces, and then add some user space code to find
the relationship between the perf clock and the system clock. The
great advantage of this approach is that you don't have to create a
perfect servo from day one. The time stamps can be re-analyzed at any
time after the fact.

And if you are going to use this for clusters, then you really want
the global time and not MONO-RAW. I would suggest using CLOCK_TAI.

Thanks,
Richard
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Patch hide | download patch | download mbox

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 0a68d5a..4cca9bb 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -550,6 +550,34 @@  TRACE_EVENT(sched_wake_idle_without_ipi,
 
 	TP_printk("cpu=%d", __entry->cpu)
 );
+
+/*
+ * Tracepoint for sched clock adjustments
+ */
+TRACE_EVENT(sched_clock_adjust,
+
+	TP_PROTO(u64 sched, u64 mono, s32 mult_adj),
+
+	TP_ARGS(sched, mono, mult_adj),
+
+	TP_STRUCT__entry(
+		__field( u64,	sched		)
+		__field( u64,	mono		)
+		__field( s32,	mult_adj	)
+	),
+
+	TP_fast_assign(
+		__entry->sched		= sched;
+		__entry->mono		= mono;
+		__entry->mult_adj	= mult_adj;
+	),
+
+	TP_printk("sched=%lluns, mono=%lluns, error=%lldns, mult_adj=%d",
+		(unsigned long long)__entry->sched,
+		(unsigned long long)__entry->mono,
+		(long long)(__entry->mono - __entry->sched),
+		__entry->mult_adj)
+);
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 445106d..b9c9e04 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -17,18 +17,26 @@ 
 #include <linux/sched_clock.h>
 #include <linux/seqlock.h>
 #include <linux/bitops.h>
+#include <linux/debugfs.h>
+
+#include <trace/events/sched.h>
 
 struct clock_data {
 	ktime_t wrap_kt;
 	u64 epoch_ns;
+	u64 epoch_mono_ns;
 	u64 epoch_cyc;
 	seqcount_t seq;
 	unsigned long rate;
 	u32 mult;
+	s32 mult_adj;
+	s64 error_int;
 	u32 shift;
 	bool suspended;
 };
 
+#define REFRESH_PERIOD 100000000ULL /* 10^8 ns = 0.1 s */
+
 static struct hrtimer sched_clock_timer;
 static int irqtime = -1;
 
@@ -38,6 +46,45 @@  static struct clock_data cd = {
 	.mult	= NSEC_PER_SEC / HZ,
 };
 
+#ifdef DEBUG
+#define ERROR_LOG_LEN (NSEC_PER_SEC / REFRESH_PERIOD * 10) /* 10 s */
+static u64 sched_clock_error_log[ERROR_LOG_LEN];
+static int sched_clock_error_log_next;
+static int sched_clock_error_log_len;
+
+static u64 sched_clock_error_max;
+static u64 sched_clock_error_min = ~0;
+
+static int sched_clock_error_log_show(struct seq_file *m, void *p)
+{
+	u64 avg = 0;
+	int i;
+
+	for (i = 0; i < sched_clock_error_log_len; i++)
+		avg += sched_clock_error_log[i];
+	if (sched_clock_error_log_len)
+		do_div(avg, sched_clock_error_log_len);
+
+	seq_printf(m, "min error: %lluns\n", sched_clock_error_min);
+	seq_printf(m, "max error: %lluns\n", sched_clock_error_max);
+	seq_printf(m, "%d samples moving average error: %lluns\n",
+			sched_clock_error_log_len, avg);
+
+	return 0;
+}
+
+static int sched_clock_error_log_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, sched_clock_error_log_show, inode->i_private);
+}
+
+static const struct file_operations sched_clock_error_log_fops = {
+	.open		= sched_clock_error_log_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
 static u64 __read_mostly sched_clock_mask;
 
 static u64 notrace jiffy_sched_clock_read(void)
@@ -74,7 +121,7 @@  unsigned long long notrace sched_clock(void)
 
 	cyc = read_sched_clock();
 	cyc = (cyc - epoch_cyc) & sched_clock_mask;
-	return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
+	return epoch_ns + cyc_to_ns(cyc, cd.mult + cd.mult_adj, cd.shift);
 }
 
 /*
@@ -83,18 +130,72 @@  unsigned long long notrace sched_clock(void)
 static void notrace update_sched_clock(void)
 {
 	unsigned long flags;
-	u64 cyc;
+	u64 cyc, delta_cyc;
 	u64 ns;
+	u64 mono_ns = 0;
+	s64 error_ns = 0, error = 0, error_int = 0;
+	s32 mult_adj = 0;
+	struct timespec mono;
 
 	cyc = read_sched_clock();
-	ns = cd.epoch_ns +
-		cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-			  cd.mult, cd.shift);
+	delta_cyc = (cyc - cd.epoch_cyc) & sched_clock_mask;
+	ns = cd.epoch_ns + cyc_to_ns(delta_cyc, cd.mult + cd.mult_adj,
+			cd.shift);
+
+	/*
+	 * Sample the raw monotonic clock exactly once per update; the first
+	 * sample merely initializes cd.epoch_mono_ns (controller runs below
+	 * only once the epoch is non-zero).
+	 */
+	getrawmonotonic(&mono);
+	mono_ns = timespec_to_ns(&mono);
+
+	if (cd.epoch_mono_ns) {
+		int sign;
+
+		/* We have a simple PI controller here */
+		error_ns = mono_ns - ns;
+		sign = (error_ns > 0) - (error_ns < 0);
+		error_ns = abs(error_ns);
+
+		/* Convert error in ns into "mult units" */
+		error = error_ns;
+		if (delta_cyc >> cd.shift)
+			do_div(error, delta_cyc >> cd.shift);
+		else
+			error = 0;
+
+		/* Integral term of the controller */
+		error_int = error * (mono_ns - cd.epoch_mono_ns);
+		do_div(error_int, NSEC_PER_SEC);
+		error_int = sign * error_int + cd.error_int;
+
+		/* Tune the cyc_to_ns formula */
+		mult_adj = sign * (error >> 2) + (cd.error_int >> 2);
+
+#ifdef DEBUG
+		sched_clock_error_log[sched_clock_error_log_next++] = error_ns;
+		if (sched_clock_error_log_next == ERROR_LOG_LEN)
+			sched_clock_error_log_next = 0;
+		if (sched_clock_error_log_len < ERROR_LOG_LEN)
+			sched_clock_error_log_len++;
+
+		if (error_ns < sched_clock_error_min)
+			sched_clock_error_min = error_ns;
+		if (error_ns > sched_clock_error_max)
+			sched_clock_error_max = error_ns;
+#endif
+
+		trace_sched_clock_adjust(ns, mono_ns, mult_adj);
+	}
 
 	raw_local_irq_save(flags);
 	raw_write_seqcount_begin(&cd.seq);
 	cd.epoch_ns = ns;
+	cd.epoch_mono_ns = mono_ns;
 	cd.epoch_cyc = cyc;
+	cd.mult_adj = mult_adj;
+	cd.error_int = error_int;
 	raw_write_seqcount_end(&cd.seq);
 	raw_local_irq_restore(flags);
 }
@@ -127,13 +228,15 @@  void __init sched_clock_register(u64 (*read)(void), int bits,
 
 	/* calculate how many ns until we wrap */
 	wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
-	new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+
+	/* update epoch before we wrap and at least once per refresh period */
+	new_wrap_kt = ns_to_ktime(min(wrap - (wrap >> 3), REFRESH_PERIOD));
 
 	/* update epoch for new counter and update epoch_ns from old counter*/
 	new_epoch = read();
 	cyc = read_sched_clock();
 	ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-			  cd.mult, cd.shift);
+			  cd.mult + cd.mult_adj, cd.shift);
 
 	raw_write_seqcount_begin(&cd.seq);
 	read_sched_clock = read;
@@ -141,6 +244,7 @@  void __init sched_clock_register(u64 (*read)(void), int bits,
 	cd.rate = rate;
 	cd.wrap_kt = new_wrap_kt;
 	cd.mult = new_mult;
+	cd.mult_adj = 0;
 	cd.shift = new_shift;
 	cd.epoch_cyc = new_epoch;
 	cd.epoch_ns = ns;
@@ -209,7 +313,29 @@  static struct syscore_ops sched_clock_ops = {
 
 static int __init sched_clock_syscore_init(void)
 {
+	int err = 0;
+
 	register_syscore_ops(&sched_clock_ops);
-	return 0;
+
+	/*
+	 * As long as not everyone is using this generic implementation,
+	 * userspace must be able to tell what does the sched_clock values
+	 * relate to (if anything).
+	 */
+	if (read_sched_clock != jiffy_sched_clock_read) {
+		static struct debugfs_blob_wrapper blob;
+
+		blob.data = "CLOCK_MONOTONIC_RAW";
+		blob.size = strlen(blob.data) + 1;
+		err = PTR_ERR_OR_ZERO(debugfs_create_blob("sched_clock_base",
+				S_IRUGO, NULL, &blob));
+	}
+
+#ifdef DEBUG
+	if (!err)
+		err = PTR_ERR_OR_ZERO(debugfs_create_file("sched_clock_error",
+					S_IRUGO, NULL, NULL,
+					&sched_clock_error_log_fops));
+#endif
+
+	return err;
 }
 device_initcall(sched_clock_syscore_init);