diff mbox series

[v2,3/6] cpufreq: schedutil: ensure max frequency while running RT/DL tasks

Message ID 1499189651-18797-4-git-send-email-patrick.bellasi@arm.com
State Superseded
Headers show
Series [v2,1/6] cpufreq: schedutil: ignore sugov kthreads | expand

Commit Message

Patrick Bellasi July 4, 2017, 5:34 p.m. UTC
The policy in use for RT/DL tasks sets the maximum frequency when a task
in these classes calls for a cpufreq_update_this_cpu().  However, the
current implementation might cause a frequency drop while a RT/DL task
is still running, just because for example a FAIR task wakes up and it's
enqueued in the same CPU.

This issue is due to the sg_cpu's flags being overwritten at each call
of sugov_update_*. Thus, the wakeup of a FAIR task resets the flags and
can trigger a frequency update thus affecting the currently running
RT/DL task.

This can be fixed, in shared frequency domains, by ORing (instead of
overwriting) the new flag before triggering a frequency update.  This
grants to stay at least at the frequency requested by the RT/DL class,
which is the maximum one for the time being.

This patch does the flags aggregation in the schedutil governor, where
it's easy to verify if we currently have RT/DL workload on a CPU.
This approach is aligned with the current schedutil API design where the
core scheduler does not interact directly with schedutil, while instead
are the scheduling classes which call directly into the policy via
cpufreq_update_{util,this_cpu}. Thus, it makes more sense to have flags
aggregation in the schedutil code instead of the core scheduler.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Steve Muckle <smuckle.linux@gmail.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-pm@vger.kernel.org

---
Changes from v1:
- use "current" to check for RT/DL tasks (PeterZ)
---
 kernel/sched/cpufreq_schedutil.c | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

-- 
2.7.4

Comments

Viresh Kumar July 5, 2017, 6:01 a.m. UTC | #1
On 04-07-17, 18:34, Patrick Bellasi wrote:
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c

> index 004ae18..98704d8 100644

> --- a/kernel/sched/cpufreq_schedutil.c

> +++ b/kernel/sched/cpufreq_schedutil.c

> @@ -216,6 +216,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,

>  	struct cpufreq_policy *policy = sg_policy->policy;

>  	unsigned long util, max;

>  	unsigned int next_f;

> +	bool rt_mode;

>  	bool busy;

>  

>  	/* Skip updates generated by sugov kthreads */

> @@ -230,7 +231,15 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,

>  

>  	busy = sugov_cpu_is_busy(sg_cpu);

>  

> -	if (flags & SCHED_CPUFREQ_RT_DL) {

> +	/*

> +	 * While RT/DL tasks are running we do not want FAIR tasks to

> +	 * overwrite this CPU's flags, still we can update utilization and

> +	 * frequency (if required/possible) to be fair with these tasks.

> +	 */

> +	rt_mode = task_has_dl_policy(current) ||

> +		  task_has_rt_policy(current) ||

> +		  (flags & SCHED_CPUFREQ_RT_DL);


We may want to create a separate inline function for above, as it is already
used twice in this patch.

But I was wondering if we can get some help from the scheduler to avoid such
code here. I understand that we don't want to do the aggregation in the
scheduler to keep it clean and keep such governor specific thing here.

But what about clearing the sched-class's flag from .pick_next_task() callback
when they return NULL ?

What about something like this instead (completely untested), with which we
don't need the 2/3 patch as well:

-- 
viresh

diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index d2be2ccbb372..e81a6b5591f5 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -11,6 +11,10 @@
 #define SCHED_CPUFREQ_DL       (1U << 1)
 #define SCHED_CPUFREQ_IOWAIT   (1U << 2)
 
+#define SCHED_CPUFREQ_CLEAR    (1U << 31)
+#define SCHED_CPUFREQ_CLEAR_RT (SCHED_CPUFREQ_CLEAR | SCHED_CPUFREQ_RT)
+#define SCHED_CPUFREQ_CLEAR_DL (SCHED_CPUFREQ_CLEAR | SCHED_CPUFREQ_DL)
+
 #define SCHED_CPUFREQ_RT_DL    (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL)
 
 #ifdef CONFIG_CPU_FREQ
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 076a2e31951c..f32e15d59d62 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -218,6 +218,9 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
        unsigned int next_f;
        bool busy;
 
+       if (flags & SCHED_CPUFREQ_CLEAR)
+               return;
+
        sugov_set_iowait_boost(sg_cpu, time, flags);
        sg_cpu->last_update = time;
 
@@ -296,7 +299,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
 
        sg_cpu->util = util;
        sg_cpu->max = max;
-       sg_cpu->flags = flags;
+
+       if (unlikely(flags & SCHED_CPUFREQ_CLEAR)) {
+               sg_cpu->flags &= ~(flags & ~SCHED_CPUFREQ_CLEAR);
+               return;
+       }
+
+       sg_cpu->flags |= flags;
 
        sugov_set_iowait_boost(sg_cpu, time, flags);
        sg_cpu->last_update = time;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a2ce59015642..441d6153d654 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1203,8 +1203,10 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        if (prev->sched_class == &dl_sched_class)
                update_curr_dl(rq);
 
-       if (unlikely(!dl_rq->dl_nr_running))
+       if (unlikely(!dl_rq->dl_nr_running)) {
+               cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_CLEAR_DL);
                return NULL;
+       }
 
        put_prev_task(rq, prev);
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 979b7341008a..bca9e4bb7ec4 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1556,8 +1556,10 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        if (prev->sched_class == &rt_sched_class)
                update_curr_rt(rq);
 
-       if (!rt_rq->rt_queued)
+       if (!rt_rq->rt_queued) {
+               cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_CLEAR_RT);
                return NULL;
+       }
 
        put_prev_task(rq, prev);


Viresh Kumar July 6, 2017, 5:56 a.m. UTC | #2
On 05-07-17, 14:41, Patrick Bellasi wrote:
> On 05-Jul 11:31, Viresh Kumar wrote:


> Just had a fast check but I think something like that can work.

> We had an internal discussion with Brendan (in CC now) which had a

> similar proposal.

> 

> Main counter arguments for me was:

> 1. we want to reduce the pollution of scheduling classes code with

>    schedutil specific code, unless strictly necessary


s/schedutil/cpufreq, as the util hooks are getting called for some other stuff
as well.

> 2. we never had a "clear bit" semantics for flags updates

> 

> Thus this proposal seemed to me less of a discontinuity wrt the

> current interface. However, something similar to what you propose

> below should also work.


With the kind of problems we have in hand now, it seems that it would be good
for the governors to know what kind of stuff is queued on the CPU (i.e. the
aggregation of all the flags) and the only sane way of doing that is by clearing
the flag once a class is done with it.

Else we would be required to have code that tries to find the same information
in an indirect way, like what this patch does with the current task.

> Let's collect some more feedback...


Sure.
 
> > diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h

> > index d2be2ccbb372..e81a6b5591f5 100644

> > --- a/include/linux/sched/cpufreq.h

> > +++ b/include/linux/sched/cpufreq.h

> > @@ -11,6 +11,10 @@

> >  #define SCHED_CPUFREQ_DL       (1U << 1)

> >  #define SCHED_CPUFREQ_IOWAIT   (1U << 2)

> >  

> > +#define SCHED_CPUFREQ_CLEAR    (1U << 31)

> > +#define SCHED_CPUFREQ_CLEAR_RT (SCHED_CPUFREQ_CLEAR | SCHED_CPUFREQ_RT)

> > +#define SCHED_CPUFREQ_CLEAR_DL (SCHED_CPUFREQ_CLEAR | SCHED_CPUFREQ_DL)

> > +

> >  #define SCHED_CPUFREQ_RT_DL    (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL)

> >  

> >  #ifdef CONFIG_CPU_FREQ

> > diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c

> > index 076a2e31951c..f32e15d59d62 100644

> > --- a/kernel/sched/cpufreq_schedutil.c

> > +++ b/kernel/sched/cpufreq_schedutil.c

> > @@ -218,6 +218,9 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,

> >         unsigned int next_f;

> >         bool busy;

> >  

> > +       if (flags & SCHED_CPUFREQ_CLEAR)

> > +               return;

> 

> Here we should still clear the flags, like what we do for the shared

> case... just to keep the internal status consistent with the

> notifications we have got from the scheduling classes.


The sg_cpu->flags field isn't used currently for the single CPU per policy case,
but only for shared policies. But yes, we need to maintain that here now as
well to know what all is queued on a CPU.

-- 
viresh
Joel Fernandes July 7, 2017, 5:26 a.m. UTC | #3
Hi,

On Wed, Jul 5, 2017 at 6:41 AM, Patrick Bellasi <patrick.bellasi@arm.com> wrote:
[..]
>

>> But what about clearing the sched-class's flag from .pick_next_task() callback

>> when they return NULL ?

>>

>> What about something like this instead (completely untested), with which we

>> don't need the 2/3 patch as well:

>

> Just had a fast check but I think something like that can work.

> We had an internal discussion with Brendan (in CC now) which had a

> similar proposal.

>

> Main counter arguments for me was:

> 1. we want to reduce the pollution of scheduling classes code with

>    schedutil specific code, unless strictly necessary

> 2. we never had a "clear bit" semantics for flags updates

>

> Thus this proposal seemed to me less of a discontinuity wrt the

> current interface. However, something similar to what you propose

> below should also work. Let's collect some more feedback...

>


I was going to say something similar. I feel its much cleaner if the
scheduler clears the flags that it set, thus taking "ownership" of the
flag. I feel that will avoid complication like this where the governor
has to peek into what's currently running and such (and also help with
the suggestion I made for patch 2/3.

Maybe the interface needs an extension for "clear flag" semantics?

thanks,

-Joel
diff mbox series

Patch

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 004ae18..98704d8 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -216,6 +216,7 @@  static void sugov_update_single(struct update_util_data *hook, u64 time,
 	struct cpufreq_policy *policy = sg_policy->policy;
 	unsigned long util, max;
 	unsigned int next_f;
+	bool rt_mode;
 	bool busy;
 
 	/* Skip updates generated by sugov kthreads */
@@ -230,7 +231,15 @@  static void sugov_update_single(struct update_util_data *hook, u64 time,
 
 	busy = sugov_cpu_is_busy(sg_cpu);
 
-	if (flags & SCHED_CPUFREQ_RT_DL) {
+	/*
+	 * While RT/DL tasks are running we do not want FAIR tasks to
+	 * overwrite this CPU's flags, still we can update utilization and
+	 * frequency (if required/possible) to be fair with these tasks.
+	 */
+	rt_mode = task_has_dl_policy(current) ||
+		  task_has_rt_policy(current) ||
+		  (flags & SCHED_CPUFREQ_RT_DL);
+	if (rt_mode) {
 		next_f = policy->cpuinfo.max_freq;
 	} else {
 		sugov_get_util(&util, &max);
@@ -293,6 +302,7 @@  static void sugov_update_shared(struct update_util_data *hook, u64 time,
 	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	unsigned long util, max;
 	unsigned int next_f;
+	bool rt_mode;
 
 	/* Skip updates generated by sugov kthreads */
 	if (unlikely(current == sg_policy->thread))
@@ -310,17 +320,27 @@  static void sugov_update_shared(struct update_util_data *hook, u64 time,
 		sg_cpu->flags = 0;
 		goto done;
 	}
-	sg_cpu->flags = flags;
+
+	/*
+	 * While RT/DL tasks are running we do not want FAIR tasks to
+	 * overwrite this CPU's flags, still we can update utilization and
+	 * frequency (if required/possible) to be fair with these tasks.
+	 */
+	rt_mode = task_has_dl_policy(current) ||
+		  task_has_rt_policy(current) ||
+		  (flags & SCHED_CPUFREQ_RT_DL);
+	if (rt_mode)
+		sg_cpu->flags |= flags;
+	else
+		sg_cpu->flags = flags;
 
 	sugov_set_iowait_boost(sg_cpu, time, flags);
 	sg_cpu->last_update = time;
 
 	if (sugov_should_update_freq(sg_policy, time)) {
-		if (flags & SCHED_CPUFREQ_RT_DL)
-			next_f = sg_policy->policy->cpuinfo.max_freq;
-		else
-			next_f = sugov_next_freq_shared(sg_cpu, time);
-
+		next_f = rt_mode
+			? sg_policy->policy->cpuinfo.max_freq
+			: sugov_next_freq_shared(sg_cpu, time);
 		sugov_update_commit(sg_policy, time, next_f);
 	}