diff mbox series

[RFC,2/7] sched/pelt: Add a new function to approximate runtime to reach given util

Message ID 20230827233203.1315953-3-qyousef@layalina.io
State Superseded
Headers show
Series sched: cpufreq: Remove magic margins | expand

Commit Message

Qais Yousef Aug. 27, 2023, 11:31 p.m. UTC
It is basically the ramp-up time from 0 to a given value. Will be used
later to implement new tunable to control response time  for schedutil.

Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
---
 kernel/sched/pelt.c  | 21 +++++++++++++++++++++
 kernel/sched/sched.h |  1 +
 2 files changed, 22 insertions(+)

Comments

Dietmar Eggemann Sept. 6, 2023, 12:56 p.m. UTC | #1
On 28/08/2023 01:31, Qais Yousef wrote:
> It is basically the ramp-up time from 0 to a given value. Will be used
> later to implement new tunable to control response time  for schedutil.
> 
> Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
> ---
>  kernel/sched/pelt.c  | 21 +++++++++++++++++++++
>  kernel/sched/sched.h |  1 +
>  2 files changed, 22 insertions(+)
> 
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index 50322005a0ae..f673b9ab92dc 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -487,3 +487,24 @@ unsigned long approximate_util_avg(unsigned long util, u64 delta)
>  
>  	return sa.util_avg;
>  }
> +
> +/*
> + * Approximate the required amount of runtime in ms required to reach @util.
> + */
> +u64 approximate_runtime(unsigned long util)
> +{
> +	struct sched_avg sa = {};
> +	u64 delta = 1024; // period = 1024 = ~1ms
> +	u64 runtime = 0;
> +
> +	if (unlikely(!util))
> +		return runtime;
> +
> +	while (sa.util_avg < util) {
> +		accumulate_sum(delta, &sa, 0, 0, 1);
> +		___update_load_avg(&sa, 0);
> +		runtime++;
> +	}
> +
> +	return runtime;
> +}

S_n = S_inv * (1 - 0.5^(t/hl))

t = hl * ln(1 - Sn/S_inv)/ln(0.5)

(1) for a little CPU (capacity_orig = 446)

t = 32ms * ln(1 - 446/1024)/ln(0.5)

t = 26ms

(2) for a big CPU (capacity = 1023 (*instead of 1024 since ln(0) not
    defined

t = 32ms * ln(1 - 1023/1024)/ln(0.5)

t = 320ms

[...]
Dietmar Eggemann Sept. 6, 2023, 8:44 p.m. UTC | #2
On 06/09/2023 14:56, Dietmar Eggemann wrote:
> On 28/08/2023 01:31, Qais Yousef wrote:
>> It is basically the ramp-up time from 0 to a given value. Will be used
>> later to implement new tunable to control response time  for schedutil.
>>
>> Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
>> ---
>>  kernel/sched/pelt.c  | 21 +++++++++++++++++++++
>>  kernel/sched/sched.h |  1 +
>>  2 files changed, 22 insertions(+)
>>
>> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
>> index 50322005a0ae..f673b9ab92dc 100644
>> --- a/kernel/sched/pelt.c
>> +++ b/kernel/sched/pelt.c
>> @@ -487,3 +487,24 @@ unsigned long approximate_util_avg(unsigned long util, u64 delta)
>>  
>>  	return sa.util_avg;
>>  }
>> +
>> +/*
>> + * Approximate the required amount of runtime in ms required to reach @util.
>> + */
>> +u64 approximate_runtime(unsigned long util)
>> +{
>> +	struct sched_avg sa = {};
>> +	u64 delta = 1024; // period = 1024 = ~1ms
>> +	u64 runtime = 0;
>> +
>> +	if (unlikely(!util))
>> +		return runtime;
>> +
>> +	while (sa.util_avg < util) {
>> +		accumulate_sum(delta, &sa, 0, 0, 1);
>> +		___update_load_avg(&sa, 0);
>> +		runtime++;
>> +	}
>> +
>> +	return runtime;
>> +}
> 
> S_n = S_inv * (1 - 0.5^(t/hl))
> 
> t = hl * ln(1 - Sn/S_inv)/ln(0.5)
> 
> (1) for a little CPU (capacity_orig = 446)
> 
> t = 32ms * ln(1 - 446/1024)/ln(0.5)
> 
> t = 26ms
> 
> (2) for a big CPU (capacity = 1023 (*instead of 1024 since ln(0) not
>     defined
> 
> t = 32ms * ln(1 - 1023/1024)/ln(0.5)
> 
> t = 320ms

Forgot half of what I wanted to ask:

And you want to be able to have a schedutil interface:

/sys/devices/system/cpu/cpufreq/policy*/schedutil/response_time_ms

in which by default we have 26ms for a CPU with the capacity_orig of 446.

I.e. you want to have a time-based interface there? Which the user can
overwrite, say with 52ms and this then will lower the return value of
get_next_freq() so the system will respond slower?

And the time based interface is more intuitive than staying in the
capacity world of [0-1024]?
Qais Yousef Sept. 6, 2023, 9:38 p.m. UTC | #3
On 09/06/23 22:44, Dietmar Eggemann wrote:
> On 06/09/2023 14:56, Dietmar Eggemann wrote:
> > On 28/08/2023 01:31, Qais Yousef wrote:
> >> It is basically the ramp-up time from 0 to a given value. Will be used
> >> later to implement new tunable to control response time  for schedutil.
> >>
> >> Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
> >> ---
> >>  kernel/sched/pelt.c  | 21 +++++++++++++++++++++
> >>  kernel/sched/sched.h |  1 +
> >>  2 files changed, 22 insertions(+)
> >>
> >> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> >> index 50322005a0ae..f673b9ab92dc 100644
> >> --- a/kernel/sched/pelt.c
> >> +++ b/kernel/sched/pelt.c
> >> @@ -487,3 +487,24 @@ unsigned long approximate_util_avg(unsigned long util, u64 delta)
> >>  
> >>  	return sa.util_avg;
> >>  }
> >> +
> >> +/*
> >> + * Approximate the required amount of runtime in ms required to reach @util.
> >> + */
> >> +u64 approximate_runtime(unsigned long util)
> >> +{
> >> +	struct sched_avg sa = {};
> >> +	u64 delta = 1024; // period = 1024 = ~1ms
> >> +	u64 runtime = 0;
> >> +
> >> +	if (unlikely(!util))
> >> +		return runtime;
> >> +
> >> +	while (sa.util_avg < util) {
> >> +		accumulate_sum(delta, &sa, 0, 0, 1);
> >> +		___update_load_avg(&sa, 0);
> >> +		runtime++;
> >> +	}
> >> +
> >> +	return runtime;
> >> +}
> > 
> > S_n = S_inv * (1 - 0.5^(t/hl))
> > 
> > t = hl * ln(1 - Sn/S_inv)/ln(0.5)
> > 
> > (1) for a little CPU (capacity_orig = 446)
> > 
> > t = 32ms * ln(1 - 446/1024)/ln(0.5)
> > 
> > t = 26ms
> > 
> > (2) for a big CPU (capacity = 1023 (*instead of 1024 since ln(0) not
> >     defined
> > 
> > t = 32ms * ln(1 - 1023/1024)/ln(0.5)
> > 
> > t = 320ms
> 
> Forgot half of what I wanted to ask:
> 
> And you want to be able to have a schedutil interface:
> 
> /sys/devices/system/cpu/cpufreq/policy*/schedutil/response_time_ms
> 
> in which by default we have 26ms for a CPU with the capacity_orig of 446.

Note that this *is* the default. I'm just exposing it not really changing it :)

It is actually much less than that if you take into account the current 25%
headroom.

> 
> I.e. you want to have a time-based interface there? Which the user can
> overwrite, say with 52ms and this then will lower the return value of
> get_next_freq() so the system will respond slower?
> 
> And the time based interface is more intuitive than staying in the
> capacity world of [0-1024]?

Yes this is exactly how I am defining the interface :-) I think this is generic
and will give users what they need and hopefully should stand the test of time.

The slow down aspect has a limitation though as I highlight in the cover
letter. I haven't figured out how to resolve it yet, or worth the effort.
If anyone has thoughts on that that'd be useful to learn about. It should be
fixable though.

Generally perf first is not always the desired outcome. Power and thermal play
bigger roles in a lot of systems today and I can see even sever market pays
more attention to them now.

Hence I didn't see why I should limit it to improving perf only and disregard
that there are situations where the system might be more concerned about power
or thermals and this could allow more fine tuning than limiting max frequencies
abruptly. They just get harder to reach so in average we get different
residencies (for same workload), but freqs are still reachable. There will be
a perf hit of course, but that's userspace problem to decide if it's worth it
or not.

Generally I am a big advocate of userspace to take the leap and be smarter
about what it needs and when. Fixing everything automagically has its appeal
but I don't think this is sustainable anymore.


Thanks!

--
Qais Yousef
Hongyan Xia Sept. 15, 2023, 9:15 a.m. UTC | #4
On 28/08/2023 00:31, Qais Yousef wrote:
> It is basically the ramp-up time from 0 to a given value. Will be used
> later to implement new tunable to control response time  for schedutil.
> 
> Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
> ---
>   kernel/sched/pelt.c  | 21 +++++++++++++++++++++
>   kernel/sched/sched.h |  1 +
>   2 files changed, 22 insertions(+)
> 
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index 50322005a0ae..f673b9ab92dc 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -487,3 +487,24 @@ unsigned long approximate_util_avg(unsigned long util, u64 delta)
>   
>   	return sa.util_avg;
>   }
> +
> +/*
> + * Approximate the required amount of runtime in ms required to reach @util.
> + */
> +u64 approximate_runtime(unsigned long util)
> +{
> +	struct sched_avg sa = {};
> +	u64 delta = 1024; // period = 1024 = ~1ms
> +	u64 runtime = 0;
> +
> +	if (unlikely(!util))
> +		return runtime;
> +
> +	while (sa.util_avg < util) {
> +		accumulate_sum(delta, &sa, 0, 0, 1);

This looks a bit uncomfortable as the existing comment says that we assume:

	if (!load)
		runnable = running = 0;

I haven't looked at the math in detail, but if this is okay, maybe a 
comment saying why this is okay despite the existing comment says otherwise?

> +		___update_load_avg(&sa, 0);
> +		runtime++;
> +	}
> +
> +	return runtime;
> +} > [...]
Qais Yousef Sept. 16, 2023, 7:56 p.m. UTC | #5
On 09/15/23 10:15, Hongyan Xia wrote:
> On 28/08/2023 00:31, Qais Yousef wrote:
> > It is basically the ramp-up time from 0 to a given value. Will be used
> > later to implement new tunable to control response time  for schedutil.
> > 
> > Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
> > ---
> >   kernel/sched/pelt.c  | 21 +++++++++++++++++++++
> >   kernel/sched/sched.h |  1 +
> >   2 files changed, 22 insertions(+)
> > 
> > diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> > index 50322005a0ae..f673b9ab92dc 100644
> > --- a/kernel/sched/pelt.c
> > +++ b/kernel/sched/pelt.c
> > @@ -487,3 +487,24 @@ unsigned long approximate_util_avg(unsigned long util, u64 delta)
> >   	return sa.util_avg;
> >   }
> > +
> > +/*
> > + * Approximate the required amount of runtime in ms required to reach @util.
> > + */
> > +u64 approximate_runtime(unsigned long util)
> > +{
> > +	struct sched_avg sa = {};
> > +	u64 delta = 1024; // period = 1024 = ~1ms
> > +	u64 runtime = 0;
> > +
> > +	if (unlikely(!util))
> > +		return runtime;
> > +
> > +	while (sa.util_avg < util) {
> > +		accumulate_sum(delta, &sa, 0, 0, 1);
> 
> This looks a bit uncomfortable as the existing comment says that we assume:
> 
> 	if (!load)
> 		runnable = running = 0;
> 
> I haven't looked at the math in detail, but if this is okay, maybe a comment
> saying why this is okay despite the existing comment says otherwise?

Yeah as Dietmar highlighted I should pass 1 for load and it was my bad
misreading the code.

So it should be

	accumulate_sum(delta, &sa, 1, 0, 1);

If that's what you meant, yes.


Thanks!

--
Qais Yousef
diff mbox series

Patch

diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 50322005a0ae..f673b9ab92dc 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -487,3 +487,24 @@  unsigned long approximate_util_avg(unsigned long util, u64 delta)
 
 	return sa.util_avg;
 }
+
+/*
+ * Approximate the required amount of runtime in ms required to reach @util.
+ */
+u64 approximate_runtime(unsigned long util)
+{
+	struct sched_avg sa = {};
+	u64 delta = 1024; // period = 1024 = ~1ms
+	u64 runtime = 0;
+
+	if (unlikely(!util))
+		return runtime;
+
+	while (sa.util_avg < util) {
+		accumulate_sum(delta, &sa, 0, 0, 1);
+		___update_load_avg(&sa, 0);
+		runtime++;
+	}
+
+	return runtime;
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5f76b8a75a9f..2b889ad399de 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2999,6 +2999,7 @@  unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 				 struct task_struct *p);
 
 unsigned long approximate_util_avg(unsigned long util, u64 delta);
+u64 approximate_runtime(unsigned long util);
 
 /*
  * DVFS decision are made at discrete points. If CPU stays busy, the util will